/[pcre]/code/trunk/pcre.c
ViewVC logotype

Diff of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 13 by nigel, Sat Feb 24 21:38:21 2007 UTC revision 49 by nigel, Sat Feb 24 21:39:33 2007 UTC
# Line 9  the file Tech.Notes for some information Line 9  the file Tech.Notes for some information
9    
10  Written by: Philip Hazel <ph10@cam.ac.uk>  Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12             Copyright (c) 1997 University of Cambridge             Copyright (c) 1997-2000 University of Cambridge
13    
14  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
15  Permission is granted to anyone to use this software for any purpose on any  Permission is granted to anyone to use this software for any purpose on any
# Line 25  restrictions: Line 25  restrictions:
25    
26  3. Altered versions must be plainly marked as such, and must not be  3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.     misrepresented as being the original software.
28    
29    4. If PCRE is embedded in any software that is released under the GNU
30       General Purpose Licence (GPL), then the terms of that licence shall
31       supersede any condition above with which it is incompatible.
32  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
33  */  */
34    
# Line 33  restrictions: Line 37  restrictions:
37    
38  /* #define DEBUG */  /* #define DEBUG */
39    
40  /* Use a macro for debugging printing, 'cause that eliminates the the use  /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
41  of #ifdef inline, and there are *still* stupid compilers about that don't like  inline, and there are *still* stupid compilers about that don't like indented
42  indented pre-processor statements. I suppose it's only been 10 years... */  pre-processor statements. I suppose it's only been 10 years... */
43    
44  #ifdef DEBUG  #ifdef DEBUG
45  #define DPRINTF(p) printf p  #define DPRINTF(p) printf p
# Line 49  the external pcre header. */ Line 53  the external pcre header. */
53  #include "internal.h"  #include "internal.h"
54    
55    
56    /* Allow compilation as C++ source code, should anybody want to do that. */
57    
58    #ifdef __cplusplus
59    #define class pcre_class
60    #endif
61    
62    
63    /* Number of items on the nested bracket stacks at compile time. This should
64    not be set greater than 200. */
65    
66    #define BRASTACK_SIZE 200
67    
68    
69    /* The number of bytes in a literal character string above which we can't add
70    any more is different when UTF-8 characters may be encountered. */
71    
72    #ifdef SUPPORT_UTF8
73    #define MAXLIT 250
74    #else
75    #define MAXLIT 255
76    #endif
77    
78    
79  /* Min and max values for the common repeats; for the maxima, 0 => infinity */  /* Min and max values for the common repeats; for the maxima, 0 => infinity */
80    
81  static char rep_min[] = { 0, 0, 1, 1, 0, 0 };  static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
82  static char rep_max[] = { 0, 0, 0, 0, 1, 1 };  static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
83    
84  /* Text forms of OP_ values and things, for debugging (not all used) */  /* Text forms of OP_ values and things, for debugging (not all used) */
85    
86  #ifdef DEBUG  #ifdef DEBUG
87  static const char *OP_names[] = {  static const char *OP_names[] = {
88    "End", "\\A", "\\B", "\\b", "\\D", "\\d",    "End", "\\A", "\\B", "\\b", "\\D", "\\d",
89    "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z", "^", "$", "Any", "chars",    "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
90    "not",    "Opt", "^", "$", "Any", "chars", "not",
91    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
92    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
93    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
94    "*", "*?", "+", "+?", "?", "??", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{",
95    "class", "negclass", "Ref",    "class", "Ref", "Recurse",
96    "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",    "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
97      "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
98    "Brazero", "Braminzero", "Bra"    "Brazero", "Braminzero", "Bra"
99  };  };
100  #endif  #endif
# Line 76  are simple data values; negative values Line 104  are simple data values; negative values
104  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
105  is invalid. */  is invalid. */
106    
107  static short int escapes[] = {  static const short int escapes[] = {
108      0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */      0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
109      0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */      0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
110    '@', -ESC_A, -ESC_B,      0, -ESC_D,      0,      0,      0,   /* @ - G */    '@', -ESC_A, -ESC_B,      0, -ESC_D,      0,      0,      0,   /* @ - G */
# Line 86  static short int escapes[] = { Line 114  static short int escapes[] = {
114    '`',      7, -ESC_b,      0, -ESC_d,     27,   '\f',      0,   /* ` - g */    '`',      7, -ESC_b,      0, -ESC_d,     27,   '\f',      0,   /* ` - g */
115      0,      0,      0,      0,      0,      0,   '\n',      0,   /* h - o */      0,      0,      0,      0,      0,      0,   '\n',      0,   /* h - o */
116      0,      0,   '\r', -ESC_s,   '\t',      0,      0, -ESC_w,   /* p - w */      0,      0,   '\r', -ESC_s,   '\t',      0,      0, -ESC_w,   /* p - w */
117      0,      0,      0                                            /* x - z */      0,      0, -ESC_z                                            /* x - z */
118    };
119    
120    /* Tables of names of POSIX character classes and their lengths. The list is
121    terminated by a zero length entry. The first three must be alpha, lower, upper,
122    as this is assumed for handling case independence. */
123    
124    static const char *posix_names[] = {
125      "alpha", "lower", "upper",
126      "alnum", "ascii", "cntrl", "digit", "graph",
127      "print", "punct", "space", "word",  "xdigit" };
128    
129    static const uschar posix_name_lengths[] = {
130      5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
131    
132    /* Table of class bit maps for each POSIX class; up to three may be combined
133    to form the class. */
134    
135    static const int posix_class_maps[] = {
136      cbit_lower, cbit_upper, -1,             /* alpha */
137      cbit_lower, -1,         -1,             /* lower */
138      cbit_upper, -1,         -1,             /* upper */
139      cbit_digit, cbit_lower, cbit_upper,     /* alnum */
140      cbit_print, cbit_cntrl, -1,             /* ascii */
141      cbit_cntrl, -1,         -1,             /* cntrl */
142      cbit_digit, -1,         -1,             /* digit */
143      cbit_graph, -1,         -1,             /* graph */
144      cbit_print, -1,         -1,             /* print */
145      cbit_punct, -1,         -1,             /* punct */
146      cbit_space, -1,         -1,             /* space */
147      cbit_word,  -1,         -1,             /* word */
148      cbit_xdigit,-1,         -1              /* xdigit */
149  };  };
150    
151    
152  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
153    
154  static BOOL  static BOOL
155    compile_regex(int, int *, uschar **, const uschar **, const char **);    compile_regex(int, int, int *, uschar **, const uschar **, const char **,
156        BOOL, int, int *, int *, compile_data *);
157    
158    /* Structure for building a chain of data that actually lives on the
159    stack, for holding the values of the subject pointer at the start of each
160    subpattern, so as to detect when an empty string has been matched by a
161    subpattern - to break infinite loops. */
162    
163    typedef struct eptrblock {
164      struct eptrblock *prev;
165      const uschar *saved_eptr;
166    } eptrblock;
167    
168  /* Structure for passing "static" information around between the functions  /* Flag bits for the match() function */
 doing the matching, so that they are thread-safe. */  
169    
170  typedef struct match_data {  #define match_condassert   0x01    /* Called to check a condition assertion */
171    int    errorcode;             /* As it says */  #define match_isgroup      0x02    /* Set if start of bracketed group */
   int   *offset_vector;         /* Offset vector */  
   int    offset_end;            /* One past the end */  
   BOOL   offset_overflow;       /* Set if too many extractions */  
   BOOL   caseless;              /* Case-independent flag */  
   BOOL   runtime_caseless;      /* Caseless forced at run time */  
   BOOL   multiline;             /* Multiline flag */  
   BOOL   notbol;                /* NOTBOL flag */  
   BOOL   noteol;                /* NOTEOL flag */  
   BOOL   dotall;                /* Dot matches any char */  
   BOOL   endonly;               /* Dollar not before final \n */  
   const uschar *start_subject;  /* Start of the subject string */  
   const uschar *end_subject;    /* End of the subject string */  
   jmp_buf fail_env;             /* Environment for longjump() break out */  
   const uschar *end_match_ptr;  /* Subject position at end match */  
   int     end_offset_top;       /* Highwater mark at end of match */  
 } match_data;  
172    
173    
174    
# Line 132  void  (*pcre_free)(void *) = free; Line 186  void  (*pcre_free)(void *) = free;
186    
187    
188    
189    /*************************************************
190    *    Macros and tables for character handling    *
191    *************************************************/
192    
193    /* When UTF-8 encoding is being used, a character is no longer just a single
194    byte. The macros for character handling generate simple sequences when used in
195    byte-mode, and more complicated ones for UTF-8 characters. */
196    
197    #ifndef SUPPORT_UTF8
198    #define GETCHARINC(c, eptr) c = *eptr++;
199    #define GETCHARLEN(c, eptr, len) c = *eptr;
200    #define BACKCHAR(eptr)
201    
202    #else   /* SUPPORT_UTF8 */
203    
204    /* Get the next UTF-8 character, advancing the pointer */
205    
206    #define GETCHARINC(c, eptr) \
207      c = *eptr++; \
208      if (md->utf8 && (c & 0xc0) == 0xc0) \
209        { \
210        int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
211        int s = 6 - a;                  /* Amount to shift next byte */  \
212        c &= utf8_table3[a];            /* Low order bits from first byte */ \
213        while (a-- > 0) \
214          { \
215          c |= (*eptr++ & 0x3f) << s; \
216          s += 6; \
217          } \
218        }
219    
220    /* Get the next UTF-8 character, not advancing the pointer, setting length */
221    
222    #define GETCHARLEN(c, eptr, len) \
223      c = *eptr; \
224      len = 1; \
225      if (md->utf8 && (c & 0xc0) == 0xc0) \
226        { \
227        int i; \
228        int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
229        int s = 6 - a;                  /* Amount to shift next byte */  \
230        c &= utf8_table3[a];            /* Low order bits from first byte */ \
231        for (i = 1; i <= a; i++) \
232          { \
233          c |= (eptr[i] & 0x3f) << s; \
234          s += 6; \
235          } \
236        len += a; \
237        }
238    
239    /* If the pointer is not at the start of a character, move it back until
240    it is. */
241    
242    #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
243    
244    #endif
245    
246    
247    
248    /*************************************************
249    *             Default character tables           *
250    *************************************************/
251    
252    /* A default set of character tables is included in the PCRE binary. Its source
253    is built by the maketables auxiliary program, which uses the default C ctypes
254    functions, and put in the file chartables.c. These tables are used by PCRE
255    whenever the caller of pcre_compile() does not provide an alternate set of
256    tables. */
257    
258    #include "chartables.c"
259    
260    
261    
262    #ifdef SUPPORT_UTF8
263    /*************************************************
264    *           Tables for UTF-8 support             *
265    *************************************************/
266    
267    /* These are the breakpoints for different numbers of bytes in a UTF-8
268    character. */
269    
270    static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
271    
272    /* These are the indicator bits and the mask for the data bits to set in the
273    first byte of a character, indexed by the number of additional bytes. */
274    
275    static int utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
276    static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
277    
278    /* Table of the number of extra bytes, indexed by the first byte of a character
279    masked with 0x3f. The highest number for a valid UTF-8 character is in fact
280    0x3d. */
281    
282    static uschar utf8_table4[] = {
283      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
284      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
285      2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
286      3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
287    
288    
289    /*************************************************
290    *       Convert character value to UTF-8         *
291    *************************************************/
292    
293    /* This function takes an integer value in the range 0 - 0x7fffffff
294    and encodes it as a UTF-8 character in 1 to 6 bytes.
295    
296    Arguments:
297      cvalue     the character value
298      buffer     pointer to buffer for result - at least 6 bytes long
299    
300    Returns:     number of bytes placed in the buffer
301    */
302    
303    static int
304    ord2utf8(int cvalue, uschar *buffer)
305    {
306    register int i, j;
307    for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
308      if (cvalue <= utf8_table1[i]) break;
309    *buffer++ = utf8_table2[i] | (cvalue & utf8_table3[i]);
310    cvalue >>= 6 - i;
311    for (j = 0; j < i; j++)
312      {
313      *buffer++ = 0x80 | (cvalue & 0x3f);
314      cvalue >>= 6;
315      }
316    return i + 1;
317    }
318    #endif
319    
320    
321    
322  /*************************************************  /*************************************************
323  *          Return version string                 *  *          Return version string                 *
324  *************************************************/  *************************************************/
325    
326    #define STRING(a)  # a
327    #define XSTRING(s) STRING(s)
328    
329  const char *  const char *
330  pcre_version(void)  pcre_version(void)
331  {  {
332  return PCRE_VERSION;  return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
333  }  }
334    
335    
336    
337    
338  /*************************************************  /*************************************************
339  *       Return info about a compiled pattern     *  * (Obsolete) Return info about compiled pattern  *
340  *************************************************/  *************************************************/
341    
342  /* This function picks potentially useful data out of the private  /* This is the original "info" function. It picks potentially useful data out
343  structure.  of the private structure, but its interface was too rigid. It remains for
344    backwards compatibility. The public options are passed back in an int - though
345    the re->options field has been expanded to a long int, all the public options
346    are at the low end of it, and so even on 16-bit systems this will still be OK.
347    Therefore, I haven't changed the API for pcre_info().
348    
349  Arguments:  Arguments:
350    external_re   points to compiled code    external_re   points to compiled code
# Line 160  Arguments: Line 353  Arguments:
353                  or -1 if multiline and all branches start ^,                  or -1 if multiline and all branches start ^,
354                  or -2 otherwise                  or -2 otherwise
355    
356  Returns:        number of identifying extraction brackets  Returns:        number of capturing subpatterns
357                  or negative values on error                  or negative values on error
358  */  */
359    
# Line 170  pcre_info(const pcre *external_re, int * Line 363  pcre_info(const pcre *external_re, int *
363  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
364  if (re == NULL) return PCRE_ERROR_NULL;  if (re == NULL) return PCRE_ERROR_NULL;
365  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
366  if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);  if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
367  if (first_char != NULL)  if (first_char != NULL)
368    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
369       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
# Line 179  return re->top_bracket; Line 372  return re->top_bracket;
372    
373    
374    
375    /*************************************************
376    *        Return info about compiled pattern      *
377    *************************************************/
378    
379    /* This is a newer "info" function which has an extensible interface so
380    that additional items can be added compatibly.
381    
382    Arguments:
383      external_re      points to compiled code
384      external_study   points to study data, or NULL
385      what             what information is required
386      where            where to put the information
387    
388    Returns:           0 if data returned, negative on error
389    */
390    
391    int
392    pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
393      void *where)
394    {
395    const real_pcre *re = (const real_pcre *)external_re;
396    const real_pcre_extra *study = (const real_pcre_extra *)study_data;
397    
398    if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
399    if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
400    
401    switch (what)
402      {
403      case PCRE_INFO_OPTIONS:
404      *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
405      break;
406    
407      case PCRE_INFO_SIZE:
408      *((size_t *)where) = re->size;
409      break;
410    
411      case PCRE_INFO_CAPTURECOUNT:
412      *((int *)where) = re->top_bracket;
413      break;
414    
415      case PCRE_INFO_BACKREFMAX:
416      *((int *)where) = re->top_backref;
417      break;
418    
419      case PCRE_INFO_FIRSTCHAR:
420      *((int *)where) =
421        ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
422        ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
423      break;
424    
425      case PCRE_INFO_FIRSTTABLE:
426      *((const uschar **)where) =
427        (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
428          study->start_bits : NULL;
429      break;
430    
431      case PCRE_INFO_LASTLITERAL:
432      *((int *)where) =
433        ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
434      break;
435    
436      default: return PCRE_ERROR_BADOPTION;
437      }
438    
439    return 0;
440    }
441    
442    
443    
444  #ifdef DEBUG  #ifdef DEBUG
445  /*************************************************  /*************************************************
# Line 211  while (length-- > 0) Line 472  while (length-- > 0)
472    
473    
474  /*************************************************  /*************************************************
 *         Check subpattern for empty operand     *  
 *************************************************/  
   
 /* This function checks a bracketed subpattern to see if any of the paths  
 through it could match an empty string. This is used to diagnose an error if  
 such a subpattern is followed by a quantifier with an unlimited upper bound.  
   
 Argument:  
   code      points to the opening bracket  
   
 Returns:    TRUE or FALSE  
 */  
   
 static BOOL  
 could_be_empty(uschar *code)  
 {  
 do {  
   uschar *cc = code + 3;  
   
   /* Scan along the opcodes for this branch; as soon as we find something  
   that matches a non-empty string, break out and advance to test the next  
   branch. If we get to the end of the branch, return TRUE for the whole  
   sub-expression. */  
   
   for (;;)  
     {  
     /* Test an embedded subpattern; if it could not be empty, break the  
     loop. Otherwise carry on in the branch. */  
   
     if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)  
       {  
       if (!could_be_empty(cc)) break;  
       do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);  
       cc += 3;  
       }  
   
     else switch (*cc)  
       {  
       /* Reached end of a branch: the subpattern may match the empty string */  
   
       case OP_ALT:  
       case OP_KET:  
       case OP_KETRMAX:  
       case OP_KETRMIN:  
       return TRUE;  
   
       /* Skip over assertive subpatterns */  
   
       case OP_ASSERT:  
       case OP_ASSERT_NOT:  
       do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);  
       cc += 3;  
       break;  
   
       /* Skip over things that don't match chars */  
   
       case OP_SOD:  
       case OP_EOD:  
       case OP_CIRC:  
       case OP_DOLL:  
       case OP_BRAZERO:  
       case OP_BRAMINZERO:  
       case OP_NOT_WORD_BOUNDARY:  
       case OP_WORD_BOUNDARY:  
       cc++;  
       break;  
   
       /* Skip over simple repeats with zero lower bound */  
   
       case OP_STAR:  
       case OP_MINSTAR:  
       case OP_QUERY:  
       case OP_MINQUERY:  
       case OP_NOTSTAR:  
       case OP_NOTMINSTAR:  
       case OP_NOTQUERY:  
       case OP_NOTMINQUERY:  
       case OP_TYPESTAR:  
       case OP_TYPEMINSTAR:  
       case OP_TYPEQUERY:  
       case OP_TYPEMINQUERY:  
       cc += 2;  
       break;  
   
       /* Skip over UPTOs (lower bound is zero) */  
   
       case OP_UPTO:  
       case OP_MINUPTO:  
       case OP_TYPEUPTO:  
       case OP_TYPEMINUPTO:  
       cc += 4;  
       break;  
   
       /* Check a class or a back reference for a zero minimum */  
   
       case OP_CLASS:  
       case OP_NEGCLASS:  
       case OP_REF:  
       cc += (*cc == OP_REF)? 2 : 33;  
   
       switch (*cc)  
         {  
         case OP_CRSTAR:  
         case OP_CRMINSTAR:  
         case OP_CRQUERY:  
         case OP_CRMINQUERY:  
         cc++;  
         break;  
   
         case OP_CRRANGE:  
         case OP_CRMINRANGE:  
         if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;  
         cc += 3;  
         break;  
   
         default:  
         goto NEXT_BRANCH;  
         }  
       break;  
   
       /* Anything else matches at least one character */  
   
       default:  
       goto NEXT_BRANCH;  
       }  
     }  
   
   NEXT_BRANCH:  
   code += (code[1] << 8) + code[2];  
   }  
 while (*code == OP_ALT);  
   
 /* No branches match the empty string */  
   
 return FALSE;  
 }  
   
   
   
 /*************************************************  
475  *            Handle escapes                      *  *            Handle escapes                      *
476  *************************************************/  *************************************************/
477    
478  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
479  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
480  encodes one of the more complicated things such as \d. On entry, ptr is  encodes one of the more complicated things such as \d. When UTF-8 is enabled,
481  pointing at the \. On exit, it is on the final character of the escape  a positive value greater than 255 may be returned. On entry, ptr is pointing at
482  sequence.  the \. On exit, it is on the final character of the escape sequence.
483    
484  Arguments:  Arguments:
485    ptrptr     points to the pattern position pointer    ptrptr     points to the pattern position pointer
# Line 366  Arguments: Line 487  Arguments:
487    bracount   number of previous extracting brackets    bracount   number of previous extracting brackets
488    options    the options bits    options    the options bits
489    isclass    TRUE if inside a character class    isclass    TRUE if inside a character class
490      cd         pointer to char tables block
491    
492  Returns:     zero or positive => a data character  Returns:     zero or positive => a data character
493               negative => a special escape sequence               negative => a special escape sequence
# Line 374  Returns:     zero or positive => a data Line 496  Returns:     zero or positive => a data
496    
497  static int  static int
498  check_escape(const uschar **ptrptr, const char **errorptr, int bracount,  check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
499    int options, BOOL isclass)    int options, BOOL isclass, compile_data *cd)
500  {  {
501  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
502  int c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */  int c, i;
 int i;  
503    
504    /* If backslash is at the end of the pattern, it's an error. */
505    
506    c = *(++ptr);
507  if (c == 0) *errorptr = ERR1;  if (c == 0) *errorptr = ERR1;
508    
509  /* Digits or letters may have special meaning; all others are literals. */  /* Digits or letters may have special meaning; all others are literals. */
# Line 417  else Line 541  else
541        {        {
542        oldptr = ptr;        oldptr = ptr;
543        c -= '0';        c -= '0';
544        while ((pcre_ctypes[ptr[1]] & ctype_digit) != 0)        while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
545          c = c * 10 + *(++ptr) - '0';          c = c * 10 + *(++ptr) - '0';
546        if (c < 10 || c <= bracount)        if (c < 10 || c <= bracount)
547          {          {
# Line 439  else Line 563  else
563        }        }
564    
565      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
566      larger first octal digit */      larger first octal digit. */
567    
568      case '0':      case '0':
569      c -= '0';      c -= '0';
570      while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&      while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
571        ptr[1] != '8' && ptr[1] != '9')        ptr[1] != '8' && ptr[1] != '9')
572          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
573        c &= 255;     /* Take least significant 8 bits */
574      break;      break;
575    
576      /* Special escapes not starting with a digit are straightforward */      /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
577        which can be greater than 0xff, but only if the ddd are hex digits. */
578    
579      case 'x':      case 'x':
580    #ifdef SUPPORT_UTF8
581        if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
582          {
583          const uschar *pt = ptr + 2;
584          register int count = 0;
585          c = 0;
586          while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
587            {
588            count++;
589            c = c * 16 + cd->lcc[*pt] -
590              (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
591            pt++;
592            }
593          if (*pt == '}')
594            {
595            if (c < 0 || count > 8) *errorptr = ERR34;
596            ptr = pt;
597            break;
598            }
599          /* If the sequence of hex digits does not end with '}', then we don't
600          recognize this construct; fall through to the normal \x handling. */
601          }
602    #endif
603    
604        /* Read just a single hex char */
605    
606      c = 0;      c = 0;
607      while (i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
608        {        {
609        ptr++;        ptr++;
610        c = c * 16 + pcre_lcc[*ptr] -        c = c * 16 + cd->lcc[*ptr] -
611          (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');          (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
612        }        }
613      break;      break;
614    
615        /* Other special escapes not starting with a digit are straightforward */
616    
617      case 'c':      case 'c':
618      c = *(++ptr);      c = *(++ptr);
619      if (c == 0)      if (c == 0)
# Line 470  else Line 624  else
624    
625      /* A letter is upper-cased; then the 0x40 bit is flipped */      /* A letter is upper-cased; then the 0x40 bit is flipped */
626    
627      if (c >= 'a' && c <= 'z') c = pcre_fcc[c];      if (c >= 'a' && c <= 'z') c = cd->fcc[c];
628      c ^= 0x40;      c ^= 0x40;
629      break;      break;
630    
631      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
632      other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,      other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
633      for Perl compatibility, it is a literal. */      for Perl compatibility, it is a literal. This code looks a bit odd, but
634        there used to be some cases other than the default, and there may be again
635        in future, so I haven't "optimized" it. */
636    
637      default:      default:
638      if ((options & PCRE_EXTRA) != 0) switch(c)      if ((options & PCRE_EXTRA) != 0) switch(c)
639        {        {
       case 'X':  
       c = -ESC_X;      /* This could be a lookup if it ever got into Perl */  
       break;  
   
640        default:        default:
641        *errorptr = ERR3;        *errorptr = ERR3;
642        break;        break;
# Line 510  where the ddds are digits. Line 662  where the ddds are digits.
662    
663  Arguments:  Arguments:
664    p         pointer to the first char after '{'    p         pointer to the first char after '{'
665      cd        pointer to char tables block
666    
667  Returns:    TRUE or FALSE  Returns:    TRUE or FALSE
668  */  */
669    
670  static BOOL  static BOOL
671  is_counted_repeat(const uschar *p)  is_counted_repeat(const uschar *p, compile_data *cd)
672  {  {
673  if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;  if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
674  while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;  while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
675  if (*p == '}') return TRUE;  if (*p == '}') return TRUE;
676    
677  if (*p++ != ',') return FALSE;  if (*p++ != ',') return FALSE;
678  if (*p == '}') return TRUE;  if (*p == '}') return TRUE;
679    
680  if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;  if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
681  while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;  while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
682  return (*p == '}');  return (*p == '}');
683  }  }
684    
# Line 545  Arguments: Line 698  Arguments:
698    maxp       pointer to int for max    maxp       pointer to int for max
699               returned as -1 if no max               returned as -1 if no max
700    errorptr   points to pointer to error message    errorptr   points to pointer to error message
701      cd         pointer to character tables block
702    
703  Returns:     pointer to '}' on success;  Returns:     pointer to '}' on success;
704               current ptr on error, with errorptr set               current ptr on error, with errorptr set
705  */  */
706    
707  static const uschar *  static const uschar *
708  read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)  read_repeat_counts(const uschar *p, int *minp, int *maxp,
709      const char **errorptr, compile_data *cd)
710  {  {
711  int min = 0;  int min = 0;
712  int max = -1;  int max = -1;
713    
714  while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';  while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
715    
716  if (*p == '}') max = min; else  if (*p == '}') max = min; else
717    {    {
718    if (*(++p) != '}')    if (*(++p) != '}')
719      {      {
720      max = 0;      max = 0;
721      while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';      while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
722      if (max < min)      if (max < min)
723        {        {
724        *errorptr = ERR4;        *errorptr = ERR4;
# Line 588  return p; Line 743  return p;
743    
744    
745  /*************************************************  /*************************************************
746    *        Find the fixed length of a pattern      *
747    *************************************************/
748    
749    /* Scan a pattern and compute the fixed length of subject that will match it,
750    if the length is fixed. This is needed for dealing with backward assertions.
751    
752    Arguments:
753      code     points to the start of the pattern (the bracket)
754      options  the compiling options
755    
756    Returns:   the fixed length, or -1 if there is no fixed length
757    */
758    
759    static int
760    find_fixedlength(uschar *code, int options)
761    {
762    int length = -1;
763    
764    register int branchlength = 0;
765    register uschar *cc = code + 3;
766    
767    /* Scan along the opcodes for this branch. If we get to the end of the
768    branch, check the length against that of the other branches. */
769    
770    for (;;)
771      {
772      int d;
773      register int op = *cc;
774      if (op >= OP_BRA) op = OP_BRA;
775    
776      switch (op)
777        {
778        case OP_BRA:
779        case OP_ONCE:
780        case OP_COND:
781        d = find_fixedlength(cc, options);
782        if (d < 0) return -1;
783        branchlength += d;
784        do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
785        cc += 3;
786        break;
787    
788        /* Reached end of a branch; if it's a ket it is the end of a nested
789        call. If it's ALT it is an alternation in a nested call. If it is
790        END it's the end of the outer call. All can be handled by the same code. */
791    
792        case OP_ALT:
793        case OP_KET:
794        case OP_KETRMAX:
795        case OP_KETRMIN:
796        case OP_END:
797        if (length < 0) length = branchlength;
798          else if (length != branchlength) return -1;
799        if (*cc != OP_ALT) return length;
800        cc += 3;
801        branchlength = 0;
802        break;
803    
804        /* Skip over assertive subpatterns */
805    
806        case OP_ASSERT:
807        case OP_ASSERT_NOT:
808        case OP_ASSERTBACK:
809        case OP_ASSERTBACK_NOT:
810        do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
811        cc += 3;
812        break;
813    
814        /* Skip over things that don't match chars */
815    
816        case OP_REVERSE:
817        cc++;
818        /* Fall through */
819    
820        case OP_CREF:
821        case OP_OPT:
822        cc++;
823        /* Fall through */
824    
825        case OP_SOD:
826        case OP_EOD:
827        case OP_EODN:
828        case OP_CIRC:
829        case OP_DOLL:
830        case OP_NOT_WORD_BOUNDARY:
831        case OP_WORD_BOUNDARY:
832        cc++;
833        break;
834    
835        /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
836        This requires a scan of the string, unfortunately. We assume valid UTF-8
837        strings, so all we do is reduce the length by one for each byte whose bits are
838        10xxxxxx. */
839    
840        case OP_CHARS:
841        branchlength += *(++cc);
842    #ifdef SUPPORT_UTF8
843        for (d = 1; d <= *cc; d++)
844          if ((cc[d] & 0xc0) == 0x80) branchlength--;
845    #endif
846        cc += *cc + 1;
847        break;
848    
849        /* Handle exact repetitions */
850    
851        case OP_EXACT:
852        case OP_TYPEEXACT:
853        branchlength += (cc[1] << 8) + cc[2];
854        cc += 4;
855        break;
856    
857        /* Handle single-char matchers */
858    
859        case OP_NOT_DIGIT:
860        case OP_DIGIT:
861        case OP_NOT_WHITESPACE:
862        case OP_WHITESPACE:
863        case OP_NOT_WORDCHAR:
864        case OP_WORDCHAR:
865        case OP_ANY:
866        branchlength++;
867        cc++;
868        break;
869    
870    
871        /* Check a class for variable quantification */
872    
873        case OP_CLASS:
874        cc += (*cc == OP_REF)? 2 : 33;
875    
876        switch (*cc)
877          {
878          case OP_CRSTAR:
879          case OP_CRMINSTAR:
880          case OP_CRQUERY:
881          case OP_CRMINQUERY:
882          return -1;
883    
884          case OP_CRRANGE:
885          case OP_CRMINRANGE:
886          if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
887          branchlength += (cc[1] << 8) + cc[2];
888          cc += 5;
889          break;
890    
891          default:
892          branchlength++;
893          }
894        break;
895    
896        /* Anything else is variable length */
897    
898        default:
899        return -1;
900        }
901      }
902    /* Control never gets here */
903    }
904    
905    
906    
907    
908    /*************************************************
909    *           Check for POSIX class syntax         *
910    *************************************************/
911    
912    /* This function is called when the sequence "[:" or "[." or "[=" is
913    encountered in a character class. It checks whether this is followed by an
914    optional ^ and then a sequence of letters, terminated by a matching ":]" or
915    ".]" or "=]".
916    
917    Arguments:
918      ptr      pointer to the initial [
919      endptr   where to return the end pointer
920      cd       pointer to compile data
921    
922    Returns:   TRUE or FALSE
923    */
924    
925    static BOOL
926    check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
927    {
928    int terminator;          /* Don't combine these lines; the Solaris cc */
929    terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
930    if (*(++ptr) == '^') ptr++;
931    while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
932    if (*ptr == terminator && ptr[1] == ']')
933      {
934      *endptr = ptr;
935      return TRUE;
936      }
937    return FALSE;
938    }
939    
940    
941    
942    
943    /*************************************************
944    *          Check POSIX class name                *
945    *************************************************/
946    
947    /* This function is called to check the name given in a POSIX-style class entry
948    such as [:alnum:].
949    
950    Arguments:
951      ptr        points to the first letter
952      len        the length of the name
953    
954    Returns:     a value representing the name, or -1 if unknown
955    */
956    
957    static int
958    check_posix_name(const uschar *ptr, int len)
959    {
960    register int yield = 0;
961    while (posix_name_lengths[yield] != 0)
962      {
963      if (len == posix_name_lengths[yield] &&
964        strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
965      yield++;
966      }
967    return -1;
968    }
969    
970    
971    
972    
973    /*************************************************
974  *           Compile one branch                   *  *           Compile one branch                   *
975  *************************************************/  *************************************************/
976    
977  /* Scan the pattern, compiling it into the code vector.  /* Scan the pattern, compiling it into the code vector.
978    
979  Arguments:  Arguments:
980    options    the option bits    options      the option bits
981    bracket    points to number of brackets used    brackets     points to number of brackets used
982    code       points to the pointer to the current code point    code         points to the pointer to the current code point
983    ptrptr     points to the current pattern pointer    ptrptr       points to the current pattern pointer
984    errorptr   points to pointer to error message    errorptr     points to pointer to error message
985      optchanged   set to the value of the last OP_OPT item compiled
986      reqchar      set to the last literal character required, else -1
987      countlits    set to count of mandatory literal characters
988      cd           contains pointers to tables
989    
990  Returns:     TRUE on success  Returns:       TRUE on success
991               FALSE, with *errorptr set on error                 FALSE, with *errorptr set on error
992  */  */
993    
994  static BOOL  static BOOL
995  compile_branch(int options, int *brackets, uschar **codeptr,  compile_branch(int options, int *brackets, uschar **codeptr,
996    const uschar **ptrptr, const char **errorptr)    const uschar **ptrptr, const char **errorptr, int *optchanged,
997      int *reqchar, int *countlits, compile_data *cd)
998  {  {
999  int repeat_type, op_type;  int repeat_type, op_type;
1000  int repeat_min, repeat_max;  int repeat_min, repeat_max;
1001  int bravalue, length;  int bravalue, length;
1002    int greedy_default, greedy_non_default;
1003    int prevreqchar;
1004    int condcount = 0;
1005    int subcountlits = 0;
1006  register int c;  register int c;
1007  register uschar *code = *codeptr;  register uschar *code = *codeptr;
1008    uschar *tempcode;
1009  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
1010  const uschar *oldptr;  const uschar *tempptr;
1011  uschar *previous = NULL;  uschar *previous = NULL;
1012  uschar class[32];  uschar class[32];
1013    
1014    /* Set up the default and non-default settings for greediness */
1015    
1016    greedy_default = ((options & PCRE_UNGREEDY) != 0);
1017    greedy_non_default = greedy_default ^ 1;
1018    
1019    /* Initialize no required char, and count of literals */
1020    
1021    *reqchar = prevreqchar = -1;
1022    *countlits = 0;
1023    
1024  /* Switch on next character until the end of the branch */  /* Switch on next character until the end of the branch */
1025    
1026  for (;; ptr++)  for (;; ptr++)
1027    {    {
1028    BOOL negate_class;    BOOL negate_class;
1029    int  class_charcount;    int class_charcount;
1030    int  class_lastchar;    int class_lastchar;
1031      int newoptions;
1032      int condref;
1033      int subreqchar;
1034    
1035    c = *ptr;    c = *ptr;
1036    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
1037      {      {
1038      if ((pcre_ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
1039      if (c == '#')      if (c == '#')
1040        {        {
1041        while ((c = *(++ptr)) != 0 && c != '\n');        /* The space before the ; is to avoid a warning on a silly compiler
1042          on the Macintosh. */
1043          while ((c = *(++ptr)) != 0 && c != '\n') ;
1044        continue;        continue;
1045        }        }
1046      }      }
# Line 672  for (;; ptr++) Line 1080  for (;; ptr++)
1080    
1081      case '[':      case '[':
1082      previous = code;      previous = code;
1083        *code++ = OP_CLASS;
1084    
1085      /* If the first character is '^', set the negation flag, and use a      /* If the first character is '^', set the negation flag and skip it. */
     different opcode. This only matters if caseless matching is specified at  
     runtime. */  
1086    
1087      if ((c = *(++ptr)) == '^')      if ((c = *(++ptr)) == '^')
1088        {        {
1089        negate_class = TRUE;        negate_class = TRUE;
       *code++ = OP_NEGCLASS;  
1090        c = *(++ptr);        c = *(++ptr);
1091        }        }
1092      else      else negate_class = FALSE;
       {  
       negate_class = FALSE;  
       *code++ = OP_CLASS;  
       }  
1093    
1094      /* Keep a count of chars so that we can optimize the case of just a single      /* Keep a count of chars so that we can optimize the case of just a single
1095      character. */      character. */
# Line 713  for (;; ptr++) Line 1115  for (;; ptr++)
1115          goto FAILED;          goto FAILED;
1116          }          }
1117    
1118          /* Handle POSIX class names. Perl allows a negation extension of the
1119          form [:^name]. A square bracket that doesn't match the syntax is
1120          treated as a literal. We also recognize the POSIX constructions
1121          [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1122          5.6 does. */
1123    
1124          if (c == '[' &&
1125              (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1126              check_posix_syntax(ptr, &tempptr, cd))
1127            {
1128            BOOL local_negate = FALSE;
1129            int posix_class, i;
1130            register const uschar *cbits = cd->cbits;
1131    
1132            if (ptr[1] != ':')
1133              {
1134              *errorptr = ERR31;
1135              goto FAILED;
1136              }
1137    
1138            ptr += 2;
1139            if (*ptr == '^')
1140              {
1141              local_negate = TRUE;
1142              ptr++;
1143              }
1144    
1145            posix_class = check_posix_name(ptr, tempptr - ptr);
1146            if (posix_class < 0)
1147              {
1148              *errorptr = ERR30;
1149              goto FAILED;
1150              }
1151    
1152            /* If matching is caseless, upper and lower are converted to
1153            alpha. This relies on the fact that the class table starts with
1154            alpha, lower, upper as the first 3 entries. */
1155    
1156            if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1157              posix_class = 0;
1158    
1159            /* Or into the map we are building up to 3 of the static class
1160            tables, or their negations. */
1161    
1162            posix_class *= 3;
1163            for (i = 0; i < 3; i++)
1164              {
1165              int taboffset = posix_class_maps[posix_class + i];
1166              if (taboffset < 0) break;
1167              if (local_negate)
1168                for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1169              else
1170                for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1171              }
1172    
1173            ptr = tempptr + 1;
1174            class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
1175            continue;
1176            }
1177    
1178        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
1179        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. Escaped items are checked for
1180        validity in the pre-compiling pass. The sequence \b is a special case.        validity in the pre-compiling pass. The sequence \b is a special case.
# Line 723  for (;; ptr++) Line 1185  for (;; ptr++)
1185    
1186        if (c == '\\')        if (c == '\\')
1187          {          {
1188          c = check_escape(&ptr, errorptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1189          if (-c == ESC_b) c = '\b';          if (-c == ESC_b) c = '\b';
1190          else if (c < 0)          else if (c < 0)
1191            {            {
1192              register const uschar *cbits = cd->cbits;
1193            class_charcount = 10;            class_charcount = 10;
1194            switch (-c)            switch (-c)
1195              {              {
1196              case ESC_d:              case ESC_d:
1197              for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1198              continue;              continue;
1199    
1200              case ESC_D:              case ESC_D:
1201              for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1202              continue;              continue;
1203    
1204              case ESC_w:              case ESC_w:
1205              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
               class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);  
1206              continue;              continue;
1207    
1208              case ESC_W:              case ESC_W:
1209              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
               class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);  
1210              continue;              continue;
1211    
1212              case ESC_s:              case ESC_s:
1213              for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1214              continue;              continue;
1215    
1216              case ESC_S:              case ESC_S:
1217              for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1218              continue;              continue;
1219    
1220              default:              default:
# Line 761  for (;; ptr++) Line 1222  for (;; ptr++)
1222              goto FAILED;              goto FAILED;
1223              }              }
1224            }            }
1225          /* Fall through if single character */  
1226            /* Fall through if single character, but don't at present allow
1227            chars > 255 in UTF-8 mode. */
1228    
1229    #ifdef SUPPORT_UTF8
1230            if (c > 255)
1231              {
1232              *errorptr = ERR33;
1233              goto FAILED;
1234              }
1235    #endif
1236          }          }
1237    
1238        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
# Line 781  for (;; ptr++) Line 1252  for (;; ptr++)
1252            }            }
1253    
1254          /* The second part of a range can be a single-character escape, but          /* The second part of a range can be a single-character escape, but
1255          not any of the other escapes. */          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1256            in such circumstances. */
1257    
1258          if (d == '\\')          if (d == '\\')
1259            {            {
1260            d = check_escape(&ptr, errorptr, *brackets, options, TRUE);            const uschar *oldptr = ptr;
1261              d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1262    
1263    #ifdef SUPPORT_UTF8
1264              if (d > 255)
1265                {
1266                *errorptr = ERR33;
1267                goto FAILED;
1268                }
1269    #endif
1270              /* \b is backspace; any other special means the '-' was literal */
1271    
1272            if (d < 0)            if (d < 0)
1273              {              {
1274              if (d == -ESC_b) d = '\b'; else              if (d == -ESC_b) d = '\b'; else
1275                {                {
1276                *errorptr = ERR7;                ptr = oldptr - 2;
1277                goto FAILED;                goto SINGLE_CHARACTER;  /* A few lines below */
1278                }                }
1279              }              }
1280            }            }
# Line 807  for (;; ptr++) Line 1290  for (;; ptr++)
1290            class[c/8] |= (1 << (c&7));            class[c/8] |= (1 << (c&7));
1291            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
1292              {              {
1293              int uc = pcre_fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
1294              class[uc/8] |= (1 << (uc&7));              class[uc/8] |= (1 << (uc&7));
1295              }              }
1296            class_charcount++;                /* in case a one-char range */            class_charcount++;                /* in case a one-char range */
# Line 819  for (;; ptr++) Line 1302  for (;; ptr++)
1302        /* Handle a lone single character - we can get here for a normal        /* Handle a lone single character - we can get here for a normal
1303        non-escape char, or after \ that introduces a single character. */        non-escape char, or after \ that introduces a single character. */
1304    
1305          SINGLE_CHARACTER:
1306    
1307        class [c/8] |= (1 << (c&7));        class [c/8] |= (1 << (c&7));
1308        if ((options & PCRE_CASELESS) != 0)        if ((options & PCRE_CASELESS) != 0)
1309          {          {
1310          c = pcre_fcc[c];   /* flip case */          c = cd->fcc[c];   /* flip case */
1311          class[c/8] |= (1 << (c&7));          class[c/8] |= (1 << (c&7));
1312          }          }
1313        class_charcount++;        class_charcount++;
# Line 869  for (;; ptr++) Line 1354  for (;; ptr++)
1354      /* Various kinds of repeat */      /* Various kinds of repeat */
1355    
1356      case '{':      case '{':
1357      if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;      if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
1358      ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);      ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
1359      if (*errorptr != NULL) goto FAILED;      if (*errorptr != NULL) goto FAILED;
1360      goto REPEAT;      goto REPEAT;
1361    
# Line 895  for (;; ptr++) Line 1380  for (;; ptr++)
1380        goto FAILED;        goto FAILED;
1381        }        }
1382    
1383      /* If the next character is '?' this is a minimizing repeat. Advance to the      /* If the next character is '?' this is a minimizing repeat, by default,
1384        but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1385      next character. */      next character. */
1386    
1387      if (ptr[1] == '?') { repeat_type = 1; ptr++; } else repeat_type = 0;      if (ptr[1] == '?')
1388          { repeat_type = greedy_non_default; ptr++; }
1389      /* If the maximum is zero then the minimum must also be zero; Perl allows      else repeat_type = greedy_default;
     this case, so we do too - by simply omitting the item altogether. */  
   
     if (repeat_max == 0) code = previous;  
1390    
1391      /* If previous was a string of characters, chop off the last one and use it      /* If previous was a string of characters, chop off the last one and use it
1392      as the subject of the repeat. If there was only one character, we can      as the subject of the repeat. If there was only one character, we can
1393      abolish the previous item altogether. */      abolish the previous item altogether. A repeat with a zero minimum wipes
1394        out any reqchar setting, backing up to the previous value. We must also
1395        adjust the countlits value. */
1396    
1397      else if (*previous == OP_CHARS)      if (*previous == OP_CHARS)
1398        {        {
1399        int len = previous[1];        int len = previous[1];
1400    
1401          if (repeat_min == 0) *reqchar = prevreqchar;
1402          *countlits += repeat_min - 1;
1403    
1404        if (len == 1)        if (len == 1)
1405          {          {
1406          c = previous[2];          c = previous[2];
# Line 943  for (;; ptr++) Line 1432  for (;; ptr++)
1432      create a suitable repeat item. The code is shared with single-character      create a suitable repeat item. The code is shared with single-character
1433      repeats by adding a suitable offset into repeat_type. */      repeats by adding a suitable offset into repeat_type. */
1434    
1435      else if ((int)*previous < OP_EOD || *previous == OP_ANY)      else if ((int)*previous < OP_EODN || *previous == OP_ANY)
1436        {        {
1437        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
1438        c = *previous;        c = *previous;
1439        code = previous;        code = previous;
1440    
1441        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
1442        repeat_type += op_type;      /* Combine both values for many cases */  
1443          /* If the maximum is zero then the minimum must also be zero; Perl allows
1444          this case, so we do too - by simply omitting the item altogether. */
1445    
1446          if (repeat_max == 0) goto END_REPEAT;
1447    
1448          /* Combine the op_type with the repeat_type */
1449    
1450          repeat_type += op_type;
1451    
1452        /* A minimum of zero is handled either as the special case * or ?, or as        /* A minimum of zero is handled either as the special case * or ?, or as
1453        an UPTO, with the maximum given. */        an UPTO, with the maximum given. */
# Line 987  for (;; ptr++) Line 1484  for (;; ptr++)
1484          /* If the minimum is 1 and the previous item was a character string,          /* If the minimum is 1 and the previous item was a character string,
1485          we either have to put back the item that got cancelled if the string          we either have to put back the item that got cancelled if the string
1486          length was 1, or add the character back onto the end of a longer          length was 1, or add the character back onto the end of a longer
1487          string. For a character type nothing need be done; it will just get put          string. For a character type nothing need be done; it will just get
1488          back naturally. */          put back naturally. Note that the final character is always going to
1489            get added below. */
1490    
1491          else if (*previous == OP_CHARS)          else if (*previous == OP_CHARS)
1492            {            {
1493            if (code == previous) code += 2; else previous[1]++;            if (code == previous) code += 2; else previous[1]++;
1494            }            }
1495    
1496            /*  For a single negated character we also have to put back the
1497            item that got cancelled. */
1498    
1499            else if (*previous == OP_NOT) code++;
1500    
1501          /* If the maximum is unlimited, insert an OP_STAR. */          /* If the maximum is unlimited, insert an OP_STAR. */
1502    
1503          if (repeat_max < 0)          if (repeat_max < 0)
# Line 1021  for (;; ptr++) Line 1524  for (;; ptr++)
1524        }        }
1525    
1526      /* If previous was a character class or a back reference, we put the repeat      /* If previous was a character class or a back reference, we put the repeat
1527      stuff after it. */      stuff after it, but just skip the item if the repeat was {0,0}. */
1528    
1529      else if (*previous == OP_CLASS || *previous == OP_NEGCLASS ||      else if (*previous == OP_CLASS || *previous == OP_REF)
              *previous == OP_REF)  
1530        {        {
1531          if (repeat_max == 0)
1532            {
1533            code = previous;
1534            goto END_REPEAT;
1535            }
1536        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
1537          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
1538        else if (repeat_min == 1 && repeat_max == -1)        else if (repeat_min == 1 && repeat_max == -1)
# Line 1044  for (;; ptr++) Line 1551  for (;; ptr++)
1551        }        }
1552    
1553      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
1554      cases. If the maximum repeat count is unlimited, check that the bracket      cases. */
     group cannot match the empty string, and diagnose an error if it can. */  
1555    
1556      else if ((int)*previous >= OP_BRA)      else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1557                 (int)*previous == OP_COND)
1558        {        {
1559        int i;        register int i;
1560          int ketoffset = 0;
1561        int len = code - previous;        int len = code - previous;
1562          uschar *bralink = NULL;
1563    
1564          /* If the maximum repeat count is unlimited, find the end of the bracket
1565          by scanning through from the start, and compute the offset back to it
1566          from the current code pointer. There may be an OP_OPT setting following
1567          the final KET, so we can't find the end just by going back from the code
1568          pointer. */
1569    
1570          if (repeat_max == -1)
1571            {
1572            register uschar *ket = previous;
1573            do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1574            ketoffset = code - ket;
1575            }
1576    
1577          /* The case of a zero minimum is special because of the need to stick
1578          OP_BRAZERO in front of it, and because the group appears once in the
1579          data, whereas in other cases it appears the minimum number of times. For
1580          this reason, it is simplest to treat this case separately, as otherwise
1581        the code gets far too messy. There are several special subcases when the
1582          minimum is zero. */
1583    
1584        if (repeat_max == -1 && could_be_empty(previous))        if (repeat_min == 0)
1585          {          {
1586          *errorptr = ERR10;          /* If we set up a required char from the bracket, we must back off
1587          goto FAILED;          to the previous value and reset the countlits value too. */
         }  
1588    
1589        /* If the minimum is greater than zero, and the maximum is unlimited or          if (subcountlits > 0)
1590        equal to the minimum, the first copy remains where it is, and is            {
1591        replicated up to the minimum number of times. This case includes the +            *reqchar = prevreqchar;
1592        repeat, but of course no replication is needed in that case. */            *countlits -= subcountlits;
1593              }
1594    
1595        if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))          /* If the maximum is also zero, we just omit the group from the output
1596          {          altogether. */
1597          for (i = 1; i < repeat_min; i++)  
1598            if (repeat_max == 0)
1599            {            {
1600            memcpy(code, previous, len);            code = previous;
1601            code += len;            goto END_REPEAT;
1602            }            }
         }  
1603    
1604        /* If the minimum is zero, stick BRAZERO in front of the first copy.          /* If the maximum is 1 or unlimited, we just have to stick in the
1605        Then, if there is a fixed upper limit, replicated up to that many times,          BRAZERO and do no more at this point. */
       sticking BRAZERO in front of all the optional ones. */  
1606    
1607        else          if (repeat_max <= 1)
         {  
         if (repeat_min == 0)  
1608            {            {
1609            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
1610            code++;            code++;
1611            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
1612            }            }
1613    
1614            /* If the maximum is greater than 1 and limited, we have to replicate
1615            in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1616            The first one has to be handled carefully because it's the original
1617            copy, which has to be moved up. The remainder can be handled by code
1618            that is common with the non-zero minimum case below. We just have to
1619            adjust the value of repeat_max, since one less copy is required. */
1620    
1621            else
1622              {
1623              int offset;
1624              memmove(previous+4, previous, len);
1625              code += 4;
1626              *previous++ = OP_BRAZERO + repeat_type;
1627              *previous++ = OP_BRA;
1628    
1629              /* We chain together the bracket offset fields that have to be
1630              filled in later when the ends of the brackets are reached. */
1631    
1632              offset = (bralink == NULL)? 0 : previous - bralink;
1633              bralink = previous;
1634              *previous++ = offset >> 8;
1635              *previous++ = offset & 255;
1636              }
1637    
1638            repeat_max--;
1639            }
1640    
1641          /* If the minimum is greater than zero, replicate the group as many
1642          times as necessary, and adjust the maximum to the number of subsequent
1643          copies that we need. */
1644    
1645          else
1646            {
1647          for (i = 1; i < repeat_min; i++)          for (i = 1; i < repeat_min; i++)
1648            {            {
1649            memcpy(code, previous, len);            memcpy(code, previous, len);
1650            code += len;            code += len;
1651            }            }
1652            if (repeat_max > 0) repeat_max -= repeat_min;
1653            }
1654    
1655          for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)        /* This code is common to both the zero and non-zero minimum cases. If
1656          the maximum is limited, it replicates the group in a nested fashion,
1657          remembering the bracket starts on a stack. In the case of a zero minimum,
1658          the first one was set up above. In all cases the repeat_max now specifies
1659          the number of additional copies needed. */
1660    
1661          if (repeat_max >= 0)
1662            {
1663            for (i = repeat_max - 1; i >= 0; i--)
1664            {            {
1665            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
1666    
1667              /* All but the final copy start a new nesting, maintaining the
1668              chain of brackets outstanding. */
1669    
1670              if (i != 0)
1671                {
1672                int offset;
1673                *code++ = OP_BRA;
1674                offset = (bralink == NULL)? 0 : code - bralink;
1675                bralink = code;
1676                *code++ = offset >> 8;
1677                *code++ = offset & 255;
1678                }
1679    
1680            memcpy(code, previous, len);            memcpy(code, previous, len);
1681            code += len;            code += len;
1682            }            }
1683    
1684            /* Now chain through the pending brackets, and fill in their length
1685            fields (which are holding the chain links pro tem). */
1686    
1687            while (bralink != NULL)
1688              {
1689              int oldlinkoffset;
1690              int offset = code - bralink + 1;
1691              uschar *bra = code - offset;
1692              oldlinkoffset = (bra[1] << 8) + bra[2];
1693              bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1694              *code++ = OP_KET;
1695              *code++ = bra[1] = offset >> 8;
1696              *code++ = bra[2] = (offset & 255);
1697              }
1698          }          }
1699    
1700        /* If the maximum is unlimited, set a repeater in the final copy. */        /* If the maximum is unlimited, set a repeater in the final copy. We
1701          can't just offset backwards from the current code point, because we
1702          don't know if there's been an options resetting after the ket. The
1703          correct offset was computed above. */
1704    
1705        if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;        else code[-ketoffset] = OP_KETRMAX + repeat_type;
1706        }        }
1707    
1708      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 1114  for (;; ptr++) Line 1715  for (;; ptr++)
1715    
1716      /* In all cases we no longer have a previous item. */      /* In all cases we no longer have a previous item. */
1717    
1718        END_REPEAT:
1719      previous = NULL;      previous = NULL;
1720      break;      break;
1721    
1722    
1723      /* Start of nested bracket sub-expression, or comment or lookahead.      /* Start of nested bracket sub-expression, or comment or lookahead or
1724      First deal with special things that can come after a bracket; all are      lookbehind or option setting or condition. First deal with special things
1725      introduced by ?, and the appearance of any of them means that this is not a      that can come after a bracket; all are introduced by ?, and the appearance
1726      referencing group. They were checked for validity in the first pass over      of any of them means that this is not a referencing group. They were
1727      the string, so we don't have to check for syntax errors here.  */      checked for validity in the first pass over the string, so we don't have to
1728        check for syntax errors here.  */
1729    
1730      case '(':      case '(':
1731      previous = code;              /* Only real brackets can be repeated */      newoptions = options;
1732        condref = -1;
1733    
1734      if (*(++ptr) == '?')      if (*(++ptr) == '?')
1735        {        {
1736        bravalue = OP_BRA;        int set, unset;
1737          int *optset;
1738    
1739        switch (*(++ptr))        switch (*(++ptr))
1740          {          {
1741          case '#':          case '#':                 /* Comment; skip to ket */
         case 'i':  
         case 'm':  
         case 's':  
         case 'x':  
1742          ptr++;          ptr++;
1743          while (*ptr != ')') ptr++;          while (*ptr != ')') ptr++;
         previous = NULL;  
1744          continue;          continue;
1745    
1746          case ':':                 /* Non-extracting bracket */          case ':':                 /* Non-extracting bracket */
1747            bravalue = OP_BRA;
1748          ptr++;          ptr++;
1749          break;          break;
1750    
1751          case '=':                 /* Assertions can't be repeated */          case '(':
1752            bravalue = OP_COND;       /* Conditional group */
1753            if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1754              {
1755              condref = *ptr - '0';
1756              while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1757              ptr++;
1758              }
1759            else ptr--;
1760            break;
1761    
1762            case '=':                 /* Positive lookahead */
1763          bravalue = OP_ASSERT;          bravalue = OP_ASSERT;
1764          ptr++;          ptr++;
         previous = NULL;  
1765          break;          break;
1766    
1767          case '!':          case '!':                 /* Negative lookahead */
1768          bravalue = OP_ASSERT_NOT;          bravalue = OP_ASSERT_NOT;
1769          ptr++;          ptr++;
         previous = NULL;  
1770          break;          break;
1771    
1772          case '>':                         /* "Match once" brackets */          case '<':                 /* Lookbehinds */
1773          if ((options & PCRE_EXTRA) != 0)  /* Not yet standard */          switch (*(++ptr))
1774            {            {
1775            bravalue = OP_ONCE;            case '=':               /* Positive lookbehind */
1776              bravalue = OP_ASSERTBACK;
1777              ptr++;
1778              break;
1779    
1780              case '!':               /* Negative lookbehind */
1781              bravalue = OP_ASSERTBACK_NOT;
1782            ptr++;            ptr++;
           previous = NULL;  
1783            break;            break;
1784    
1785              default:                /* Syntax error */
1786              *errorptr = ERR24;
1787              goto FAILED;
1788            }            }
1789          /* Else fall through */          break;
1790    
1791          default:          case '>':                 /* One-time brackets */
1792          *errorptr = ERR12;          bravalue = OP_ONCE;
1793          goto FAILED;          ptr++;
1794            break;
1795    
1796            case 'R':                 /* Pattern recursion */
1797            *code++ = OP_RECURSE;
1798            ptr++;
1799            continue;
1800    
1801            default:                  /* Option setting */
1802            set = unset = 0;
1803            optset = &set;
1804    
1805            while (*ptr != ')' && *ptr != ':')
1806              {
1807              switch (*ptr++)
1808                {
1809                case '-': optset = &unset; break;
1810    
1811                case 'i': *optset |= PCRE_CASELESS; break;
1812                case 'm': *optset |= PCRE_MULTILINE; break;
1813                case 's': *optset |= PCRE_DOTALL; break;
1814                case 'x': *optset |= PCRE_EXTENDED; break;
1815                case 'U': *optset |= PCRE_UNGREEDY; break;
1816                case 'X': *optset |= PCRE_EXTRA; break;
1817    
1818                default:
1819                *errorptr = ERR12;
1820                goto FAILED;
1821                }
1822              }
1823    
1824            /* Set up the changed option bits, but don't change anything yet. */
1825    
1826            newoptions = (options | set) & (~unset);
1827    
1828            /* If the options ended with ')' this is not the start of a nested
1829            group with option changes, so the options change at this level. At top
1830            level there is nothing else to be done (the options will in fact have
1831            been set from the start of compiling as a result of the first pass) but
1832            at an inner level we must compile code to change the ims options if
1833            necessary, and pass the new setting back so that it can be put at the
1834            start of any following branches, and when this group ends, a resetting
1835            item can be compiled. */
1836    
1837            if (*ptr == ')')
1838              {
1839              if ((options & PCRE_INGROUP) != 0 &&
1840                  (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1841                {
1842                *code++ = OP_OPT;
1843                *code++ = *optchanged = newoptions & PCRE_IMS;
1844                }
1845              options = newoptions;  /* Change options at this level */
1846              previous = NULL;       /* This item can't be repeated */
1847              continue;              /* It is complete */
1848              }
1849    
1850            /* If the options ended with ':' we are heading into a nested group
1851            with possible change of options. Such groups are non-capturing and are
1852            not assertions of any kind. All we need to do is skip over the ':';
1853            the newoptions value is handled below. */
1854    
1855            bravalue = OP_BRA;
1856            ptr++;
1857          }          }
1858        }        }
1859    
1860      /* Else we have a referencing group */      /* Else we have a referencing group; adjust the opcode. */
1861    
1862      else      else
1863        {        {
# Line 1186  for (;; ptr++) Line 1869  for (;; ptr++)
1869        bravalue = OP_BRA + *brackets;        bravalue = OP_BRA + *brackets;
1870        }        }
1871    
1872      /* Process nested bracketed re; at end pointer is on the bracket. We copy      /* Process nested bracketed re. Assertions may not be repeated, but other
1873      code into a non-register variable in order to be able to pass its address      kinds can be. We copy code into a non-register variable in order to be able
1874      because some compilers complain otherwise. */      to pass its address because some compilers complain otherwise. Pass in a
1875        new setting for the ims options if they have changed. */
1876    
1877        previous = (bravalue >= OP_ONCE)? code : NULL;
1878      *code = bravalue;      *code = bravalue;
1879        tempcode = code;
1880    
1881        if (!compile_regex(
1882             options | PCRE_INGROUP,       /* Set for all nested groups */
1883             ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1884               newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1885             brackets,                     /* Bracket level */
1886             &tempcode,                    /* Where to put code (updated) */
1887             &ptr,                         /* Input pointer (updated) */
1888             errorptr,                     /* Where to put an error message */
1889             (bravalue == OP_ASSERTBACK ||
1890              bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1891             condref,                      /* Condition reference number */
1892             &subreqchar,                  /* For possible last char */
1893             &subcountlits,                /* For literal count */
1894             cd))                          /* Tables block */
1895          goto FAILED;
1896    
1897        /* At the end of compiling, code is still pointing to the start of the
1898        group, while tempcode has been updated to point past the end of the group
1899        and any option resetting that may follow it. The pattern pointer (ptr)
1900        is on the bracket. */
1901    
1902        /* If this is a conditional bracket, check that there are no more than
1903        two branches in the group. */
1904    
1905        if (bravalue == OP_COND)
1906        {        {
1907        uschar *mcode = code;        uschar *tc = code;
1908        if (!compile_regex(options, brackets, &mcode, &ptr, errorptr))        condcount = 0;
1909    
1910          do {
1911             condcount++;
1912             tc += (tc[1] << 8) | tc[2];
1913             }
1914          while (*tc != OP_KET);
1915    
1916          if (condcount > 2)
1917            {
1918            *errorptr = ERR27;
1919          goto FAILED;          goto FAILED;
1920        code = mcode;          }
1921        }        }
1922    
1923        /* Handle updating of the required character. If the subpattern didn't
1924        set one, leave it as it was. Otherwise, update it for normal brackets of
1925        all kinds, forward assertions, and conditions with two branches. Don't
1926        update the literal count for forward assertions, however. If the bracket
1927        is followed by a quantifier with zero repeat, we have to back off. Hence
1928        the definition of prevreqchar and subcountlits outside the main loop so
1929        that they can be accessed for the back off. */
1930    
1931        if (subreqchar > 0 &&
1932             (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1933             (bravalue == OP_COND && condcount == 2)))
1934          {
1935          prevreqchar = *reqchar;
1936          *reqchar = subreqchar;
1937          if (bravalue != OP_ASSERT) *countlits += subcountlits;
1938          }
1939    
1940        /* Now update the main code pointer to the end of the group. */
1941    
1942        code = tempcode;
1943    
1944        /* Error if hit end of pattern */
1945    
1946      if (*ptr != ')')      if (*ptr != ')')
1947        {        {
1948        *errorptr = ERR14;        *errorptr = ERR14;
# Line 1210  for (;; ptr++) Line 1955  for (;; ptr++)
1955      for validity in the pre-compiling pass. */      for validity in the pre-compiling pass. */
1956    
1957      case '\\':      case '\\':
1958      oldptr = ptr;      tempptr = ptr;
1959      c = check_escape(&ptr, errorptr, *brackets, options, FALSE);      c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1960    
1961      /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values      /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1962      are arranged to be the negation of the corresponding OP_values. For the      are arranged to be the negation of the corresponding OP_values. For the
# Line 1224  for (;; ptr++) Line 1969  for (;; ptr++)
1969        {        {
1970        if (-c >= ESC_REF)        if (-c >= ESC_REF)
1971          {          {
         int refnum = -c - ESC_REF;  
         if (*brackets < refnum)  
           {  
           *errorptr = ERR15;  
           goto FAILED;  
           }  
1972          previous = code;          previous = code;
1973          *code++ = OP_REF;          *code++ = OP_REF;
1974          *code++ = refnum;          *code++ = -c - ESC_REF;
1975          }          }
1976        else        else
1977          {          {
1978          previous = (-c > ESC_b && -c < ESC_X)? code : NULL;          previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1979          *code++ = -c;          *code++ = -c;
1980          }          }
1981        continue;        continue;
# Line 1244  for (;; ptr++) Line 1983  for (;; ptr++)
1983    
1984      /* Data character: reset and fall through */      /* Data character: reset and fall through */
1985    
1986      ptr = oldptr;      ptr = tempptr;
1987      c = '\\';      c = '\\';
1988    
1989      /* Handle a run of data characters until a metacharacter is encountered.      /* Handle a run of data characters until a metacharacter is encountered.
# Line 1262  for (;; ptr++) Line 2001  for (;; ptr++)
2001        {        {
2002        if ((options & PCRE_EXTENDED) != 0)        if ((options & PCRE_EXTENDED) != 0)
2003          {          {
2004          if ((pcre_ctypes[c] & ctype_space) != 0) continue;          if ((cd->ctypes[c] & ctype_space) != 0) continue;
2005          if (c == '#')          if (c == '#')
2006            {            {
2007            while ((c = *(++ptr)) != 0 && c != '\n');            /* The space before the ; is to avoid a warning on a silly compiler
2008              on the Macintosh. */
2009              while ((c = *(++ptr)) != 0 && c != '\n') ;
2010            if (c == 0) break;            if (c == 0) break;
2011            continue;            continue;
2012            }            }
# Line 1277  for (;; ptr++) Line 2018  for (;; ptr++)
2018    
2019        if (c == '\\')        if (c == '\\')
2020          {          {
2021          oldptr = ptr;          tempptr = ptr;
2022          c = check_escape(&ptr, errorptr, *brackets, options, FALSE);          c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
2023          if (c < 0) { ptr = oldptr; break; }          if (c < 0) { ptr = tempptr; break; }
2024    
2025            /* If a character is > 127 in UTF-8 mode, we have to turn it into
2026            two or more characters in the UTF-8 encoding. */
2027    
2028    #ifdef SUPPORT_UTF8
2029            if (c > 127 && (options & PCRE_UTF8) != 0)
2030              {
2031              uschar buffer[8];
2032              int len = ord2utf8(c, buffer);
2033              for (c = 0; c < len; c++) *code++ = buffer[c];
2034              length += len;
2035              continue;
2036              }
2037    #endif
2038          }          }
2039    
2040        /* Ordinary character or single-char escape */        /* Ordinary character or single-char escape */
# Line 1290  for (;; ptr++) Line 2045  for (;; ptr++)
2045    
2046      /* This "while" is the end of the "do" above. */      /* This "while" is the end of the "do" above. */
2047    
2048      while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);      while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
2049    
2050        /* Update the last character and the count of literals */
2051    
2052        prevreqchar = (length > 1)? code[-2] : *reqchar;
2053        *reqchar = code[-1];
2054        *countlits += length;
2055    
2056      /* Compute the length and set it in the data vector, and advance to      /* Compute the length and set it in the data vector, and advance to
2057      the next state. */      the next state. */
2058    
2059      previous[1] = length;      previous[1] = length;
2060      ptr--;      if (length < MAXLIT) ptr--;
2061      break;      break;
2062      }      }
2063    }                   /* end of big loop */    }                   /* end of big loop */
# Line 1320  return FALSE; Line 2081  return FALSE;
2081  /* On entry, ptr is pointing past the bracket character, but on return  /* On entry, ptr is pointing past the bracket character, but on return
2082  it points to the closing bracket, or vertical bar, or end of string.  it points to the closing bracket, or vertical bar, or end of string.
2083  The code variable is pointing at the byte into which the BRA operator has been  The code variable is pointing at the byte into which the BRA operator has been
2084  stored.  stored. If the ims options are changed at the start (for a (?ims: group) or
2085    during any branch, we need to insert an OP_OPT item at the start of every
2086    following branch to ensure they get set correctly at run time, and also pass
2087    the new options into every subsequent branch compile.
2088    
2089  Argument:  Argument:
2090    options   the option bits    options     the option bits
2091    brackets  -> int containing the number of extracting brackets used    optchanged  new ims options to set as if (?ims) were at the start, or -1
2092    codeptr   -> the address of the current code pointer                 for no change
2093    ptrptr    -> the address of the current pattern pointer    brackets    -> int containing the number of extracting brackets used
2094    errorptr  -> pointer to error message    codeptr     -> the address of the current code pointer
2095      ptrptr      -> the address of the current pattern pointer
2096      errorptr    -> pointer to error message
2097      lookbehind  TRUE if this is a lookbehind assertion
2098      condref     > 0 for OP_CREF setting at start of conditional group
2099      reqchar     -> place to put the last required character, or a negative number
2100      countlits   -> place to put the shortest literal count of any branch
2101      cd          points to the data block with tables pointers
2102    
2103  Returns:    TRUE on success  Returns:      TRUE on success
2104  */  */
2105    
2106  static BOOL  static BOOL
2107  compile_regex(int options, int *brackets, uschar **codeptr,  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
2108    const uschar **ptrptr, const char **errorptr)    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
2109      int *reqchar, int *countlits, compile_data *cd)
2110  {  {
2111  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
2112  uschar *code = *codeptr;  uschar *code = *codeptr;
2113    uschar *last_branch = code;
2114  uschar *start_bracket = code;  uschar *start_bracket = code;
2115    uschar *reverse_count = NULL;
2116    int oldoptions = options & PCRE_IMS;
2117    int branchreqchar, branchcountlits;
2118    
2119    *reqchar = -1;
2120    *countlits = INT_MAX;
2121    code += 3;
2122    
2123    /* At the start of a reference-based conditional group, insert the reference
2124    number as an OP_CREF item. */
2125    
2126    if (condref > 0)
2127      {
2128      *code++ = OP_CREF;
2129      *code++ = condref;
2130      }
2131    
2132    /* Loop for each alternative branch */
2133    
2134  for (;;)  for (;;)
2135    {    {
2136    int length;    int length;
   uschar *last_branch = code;  
2137    
2138    code += 3;    /* Handle change of options */
2139    if (!compile_branch(options, brackets, &code, &ptr, errorptr))  
2140      if (optchanged >= 0)
2141        {
2142        *code++ = OP_OPT;
2143        *code++ = optchanged;
2144        options = (options & ~PCRE_IMS) | optchanged;
2145        }
2146    
2147      /* Set up dummy OP_REVERSE if lookbehind assertion */
2148    
2149      if (lookbehind)
2150        {
2151        *code++ = OP_REVERSE;
2152        reverse_count = code;
2153        *code++ = 0;
2154        *code++ = 0;
2155        }
2156    
2157      /* Now compile the branch */
2158    
2159      if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
2160          &branchreqchar, &branchcountlits, cd))
2161      {      {
2162      *ptrptr = ptr;      *ptrptr = ptr;
2163      return FALSE;      return FALSE;
2164      }      }
2165    
2166    /* Fill in the length of the last branch */    /* Fill in the length of the last branch */
2167    
2168      length = code - last_branch;
2169      last_branch[1] = length >> 8;
2170      last_branch[2] = length & 255;
2171    
2172      /* Save the last required character if all branches have the same; a current
2173      value of -1 means unset, while -2 means "previous branch had no last required
2174      char".  */
2175    
2176      if (*reqchar != -2)
2177        {
2178        if (branchreqchar >= 0)
2179          {
2180          if (*reqchar == -1) *reqchar = branchreqchar;
2181          else if (*reqchar != branchreqchar) *reqchar = -2;
2182          }
2183        else *reqchar = -2;
2184        }
2185    
2186      /* Keep the shortest literal count */
2187    
2188      if (branchcountlits < *countlits) *countlits = branchcountlits;
2189      DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
2190    
2191    length = code - last_branch;    /* If lookbehind, check that this branch matches a fixed-length string,
2192    last_branch[1] = length >> 8;    and put the length into the OP_REVERSE item. Temporarily mark the end of
2193    last_branch[2] = length & 255;    the branch with OP_END. */
2194    
2195      if (lookbehind)
2196        {
2197        *code = OP_END;
2198        length = find_fixedlength(last_branch, options);
2199        DPRINTF(("fixed length = %d\n", length));
2200        if (length < 0)
2201          {
2202          *errorptr = ERR25;
2203          *ptrptr = ptr;
2204          return FALSE;
2205          }
2206        reverse_count[0] = (length >> 8);
2207        reverse_count[1] = length & 255;
2208        }
2209    
2210    /* Reached end of expression, either ')' or end of pattern. Insert a    /* Reached end of expression, either ')' or end of pattern. Insert a
2211    terminating ket and the length of the whole bracketed item, and return,    terminating ket and the length of the whole bracketed item, and return,
2212    leaving the pointer at the terminating char. */    leaving the pointer at the terminating char. If any of the ims options
2213      were changed inside the group, compile a resetting op-code following. */
2214    
2215    if (*ptr != '|')    if (*ptr != '|')
2216      {      {
# Line 1368  for (;;) Line 2218  for (;;)
2218      *code++ = OP_KET;      *code++ = OP_KET;
2219      *code++ = length >> 8;      *code++ = length >> 8;
2220      *code++ = length & 255;      *code++ = length & 255;
2221        if (optchanged >= 0)
2222          {
2223          *code++ = OP_OPT;
2224          *code++ = oldoptions;
2225          }
2226      *codeptr = code;      *codeptr = code;
2227      *ptrptr = ptr;      *ptrptr = ptr;
2228      return TRUE;      return TRUE;
# Line 1376  for (;;) Line 2231  for (;;)
2231    /* Another branch follows; insert an "or" node and advance the pointer. */    /* Another branch follows; insert an "or" node and advance the pointer. */
2232    
2233    *code = OP_ALT;    *code = OP_ALT;
2234      last_branch = code;
2235      code += 3;
2236    ptr++;    ptr++;
2237    }    }
2238  /* Control never reaches here */  /* Control never reaches here */
# Line 1383  for (;;) Line 2240  for (;;)
2240    
2241    
2242    
2243    
2244    /*************************************************
2245    *      Find first significant op code            *
2246    *************************************************/
2247    
2248    /* This is called by several functions that scan a compiled expression looking
2249    for a fixed first character, or an anchoring op code etc. It skips over things
2250    that do not influence this. For one application, a change of caseless option is
2251    important.
2252    
2253    Arguments:
2254      code       pointer to the start of the group
2255      options    pointer to external options
2256      optbit     the option bit whose changing is significant, or
2257                 zero if none are
2258      optstop    TRUE to return on option change, otherwise change the options
2259                   value and continue
2260    
2261    Returns:     pointer to the first significant opcode
2262    */
2263    
2264    static const uschar*
2265    first_significant_code(const uschar *code, int *options, int optbit,
2266      BOOL optstop)
2267    {
2268    for (;;)
2269      {
2270      switch ((int)*code)
2271        {
2272        case OP_OPT:
2273        if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
2274          {
2275          if (optstop) return code;
2276          *options = (int)code[1];
2277          }
2278        code += 2;
2279        break;
2280    
2281        case OP_CREF:
2282        code += 2;
2283        break;
2284    
2285        case OP_WORD_BOUNDARY:
2286        case OP_NOT_WORD_BOUNDARY:
2287        code++;
2288        break;
2289    
2290        case OP_ASSERT_NOT:
2291        case OP_ASSERTBACK:
2292        case OP_ASSERTBACK_NOT:
2293        do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
2294        code += 3;
2295        break;
2296    
2297        default:
2298        return code;
2299        }
2300      }
2301    /* Control never reaches here */
2302    }
2303    
2304    
2305    
2306    
2307  /*************************************************  /*************************************************
2308  *          Check for anchored expression         *  *          Check for anchored expression         *
2309  *************************************************/  *************************************************/
# Line 1393  all of whose alternatives start with OP_ Line 2314  all of whose alternatives start with OP_
2314  it's anchored. However, if this is a multiline pattern, then only OP_SOD  it's anchored. However, if this is a multiline pattern, then only OP_SOD
2315  counts, since OP_CIRC can match in the middle.  counts, since OP_CIRC can match in the middle.
2316    
2317  A branch is also implicitly anchored if it starts with .* because that will try  A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2318  the rest of the pattern at all possible matching points, so there is no point  because that will try the rest of the pattern at all possible matching points,
2319  trying them again.  so there is no point trying them again.
2320    
2321  Argument:  points to start of expression (the bracket)  Arguments:
2322  Returns:   TRUE or FALSE    code       points to start of expression (the bracket)
2323      options    points to the options setting
2324    
2325    Returns:     TRUE or FALSE
2326  */  */
2327    
2328  static BOOL  static BOOL
2329  is_anchored(register const uschar *code, BOOL multiline)  is_anchored(register const uschar *code, int *options)
2330  {  {
2331  do {  do {
2332     int op = (int)code[3];     const uschar *scode = first_significant_code(code + 3, options,
2333     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)       PCRE_MULTILINE, FALSE);
2334       { if (!is_anchored(code+3, multiline)) return FALSE; }     register int op = *scode;
2335     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2336       { if (code[4] != OP_ANY) return FALSE; }       { if (!is_anchored(scode, options)) return FALSE; }
2337     else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
2338                (*options & PCRE_DOTALL) != 0)
2339         { if (scode[1] != OP_ANY) return FALSE; }
2340       else if (op != OP_SOD &&
2341               ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
2342         return FALSE;
2343     code += (code[1] << 8) + code[2];     code += (code[1] << 8) + code[2];
2344     }     }
2345  while (*code == OP_ALT);  while (*code == OP_ALT);
# Line 1420  return TRUE; Line 2349  return TRUE;
2349    
2350    
2351  /*************************************************  /*************************************************
2352  *     Check for start with \n line expression    *  *         Check for starting with ^ or .*        *
2353  *************************************************/  *************************************************/
2354    
2355  /* This is called for multiline expressions to try to find out if every branch  /* This is called to find out if every branch starts with ^ or .* so that
2356  starts with ^ so that "first char" processing can be done to speed things up.  "first char" processing can be done to speed things up in multiline
2357    matching and for non-DOTALL patterns that start with .* (which must start at
2358    the beginning or after \n).
2359    
2360  Argument:  points to start of expression (the bracket)  Argument:  points to start of expression (the bracket)
2361  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
# Line 1434  static BOOL Line 2365  static BOOL
2365  is_startline(const uschar *code)  is_startline(const uschar *code)
2366  {  {
2367  do {  do {
2368     if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)     const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
2369       { if (!is_startline(code+3)) return FALSE; }     register int op = *scode;
2370     else if (code[3] != OP_CIRC) return FALSE;     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2371         { if (!is_startline(scode)) return FALSE; }
2372       else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2373         { if (scode[1] != OP_ANY) return FALSE; }
2374       else if (op != OP_CIRC) return FALSE;
2375     code += (code[1] << 8) + code[2];     code += (code[1] << 8) + code[2];
2376     }     }
2377  while (*code == OP_ALT);  while (*code == OP_ALT);
# Line 1455  Consider each alternative branch. If the Line 2390  Consider each alternative branch. If the
2390  a bracket all of whose alternatives start with the same char (recurse ad lib),  a bracket all of whose alternatives start with the same char (recurse ad lib),
2391  then we return that char, otherwise -1.  then we return that char, otherwise -1.
2392    
2393  Argument:  points to start of expression (the bracket)  Arguments:
2394  Returns:   -1 or the fixed first char    code       points to start of expression (the bracket)
2395      options    pointer to the options (used to check casing changes)
2396    
2397    Returns:     -1 or the fixed first char
2398  */  */
2399    
2400  static int  static int
2401  find_firstchar(uschar *code)  find_firstchar(const uschar *code, int *options)
2402  {  {
2403  register int c = -1;  register int c = -1;
2404  do  do {
2405    {     int d;
2406    register int charoffset = 4;     const uschar *scode = first_significant_code(code + 3, options,
2407         PCRE_CASELESS, TRUE);
2408    if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)     register int op = *scode;
2409      {  
2410      register int d;     if (op >= OP_BRA) op = OP_BRA;
2411      if ((d = find_firstchar(code+3)) < 0) return -1;  
2412      if (c < 0) c = d; else if (c != d) return -1;     switch(op)
2413      }       {
2414         default:
2415    else switch(code[3])       return -1;
2416      {  
2417      default:       case OP_BRA:
2418      return -1;       case OP_ASSERT:
2419         case OP_ONCE:
2420      case OP_EXACT:       /* Fall through */       case OP_COND:
2421      charoffset++;       if ((d = find_firstchar(scode, options)) < 0) return -1;
2422         if (c < 0) c = d; else if (c != d) return -1;
2423      case OP_CHARS:       /* Fall through */       break;
2424      charoffset++;  
2425         case OP_EXACT:       /* Fall through */
2426         scode++;
2427    
2428         case OP_CHARS:       /* Fall through */
2429         scode++;
2430    
2431         case OP_PLUS:
2432         case OP_MINPLUS:
2433         if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
2434         break;
2435         }
2436    
2437      case OP_PLUS:     code += (code[1] << 8) + code[2];
2438      case OP_MINPLUS:     }
     if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;  
     break;  
     }  
   code += (code[1] << 8) + code[2];  
   }  
2439  while (*code == OP_ALT);  while (*code == OP_ALT);
2440  return c;  return c;
2441  }  }
2442    
2443    
2444    
2445    
2446    
2447  /*************************************************  /*************************************************
2448  *        Compile a Regular Expression            *  *        Compile a Regular Expression            *
2449  *************************************************/  *************************************************/
# Line 1510  Arguments: Line 2456  Arguments:
2456    options      various option bits    options      various option bits
2457    errorptr     pointer to pointer to error text    errorptr     pointer to pointer to error text
2458    erroroffset  ptr offset in pattern where error was detected    erroroffset  ptr offset in pattern where error was detected
2459      tables       pointer to character tables or NULL
2460    
2461  Returns:       pointer to compiled data block, or NULL on error,  Returns:       pointer to compiled data block, or NULL on error,
2462                 with errorptr and erroroffset set                 with errorptr and erroroffset set
# Line 1517  Returns:       pointer to compiled data Line 2464  Returns:       pointer to compiled data
2464    
2465  pcre *  pcre *
2466  pcre_compile(const char *pattern, int options, const char **errorptr,  pcre_compile(const char *pattern, int options, const char **errorptr,
2467    int *erroroffset)    int *erroroffset, const unsigned char *tables)
2468  {  {
2469  real_pcre *re;  real_pcre *re;
 int spaces = 0;  
2470  int length = 3;      /* For initial BRA plus length */  int length = 3;      /* For initial BRA plus length */
2471  int runlength;  int runlength;
2472  int c, size;  int c, reqchar, countlits;
2473  int bracount = 0;  int bracount = 0;
 int brastack[200];  
2474  int top_backref = 0;  int top_backref = 0;
2475    int branch_extra = 0;
2476    int branch_newextra;
2477  unsigned int brastackptr = 0;  unsigned int brastackptr = 0;
2478    size_t size;
2479  uschar *code;  uschar *code;
2480  const uschar *ptr;  const uschar *ptr;
2481    compile_data compile_block;
2482    int brastack[BRASTACK_SIZE];
2483    uschar bralenstack[BRASTACK_SIZE];
2484    
2485  #ifdef DEBUG  #ifdef DEBUG
2486  uschar *code_base, *code_end;  uschar *code_base, *code_end;
2487  #endif  #endif
2488    
2489    /* Can't support UTF8 unless PCRE has been compiled to include the code. */
2490    
2491    #ifndef SUPPORT_UTF8
2492    if ((options & PCRE_UTF8) != 0)
2493      {
2494      *errorptr = ERR32;
2495      return NULL;
2496      }
2497    #endif
2498    
2499  /* We can't pass back an error message if errorptr is NULL; I guess the best we  /* We can't pass back an error message if errorptr is NULL; I guess the best we
2500  can do is just return NULL. */  can do is just return NULL. */
2501    
# Line 1556  if ((options & ~PUBLIC_OPTIONS) != 0) Line 2517  if ((options & ~PUBLIC_OPTIONS) != 0)
2517    return NULL;    return NULL;
2518    }    }
2519    
2520    /* Set up pointers to the individual character tables */
2521    
2522    if (tables == NULL) tables = pcre_default_tables;
2523    compile_block.lcc = tables + lcc_offset;
2524    compile_block.fcc = tables + fcc_offset;
2525    compile_block.cbits = tables + cbits_offset;
2526    compile_block.ctypes = tables + ctypes_offset;
2527    
2528    /* Reflect pattern for debugging output */
2529    
2530  DPRINTF(("------------------------------------------------------------------\n"));  DPRINTF(("------------------------------------------------------------------\n"));
2531  DPRINTF(("%s\n", pattern));  DPRINTF(("%s\n", pattern));
2532    
# Line 1572  while ((c = *(++ptr)) != 0) Line 2543  while ((c = *(++ptr)) != 0)
2543    int min, max;    int min, max;
2544    int class_charcount;    int class_charcount;
2545    
2546    if ((pcre_ctypes[c] & ctype_space) != 0)    if ((options & PCRE_EXTENDED) != 0)
     {  
     if ((options & PCRE_EXTENDED) != 0) continue;  
     spaces++;  
     }  
   
   if (c == '#' && (options & PCRE_EXTENDED) != 0)  
2547      {      {
2548      while ((c = *(++ptr)) != 0 && c != '\n');      if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2549      continue;      if (c == '#')
2550          {
2551          /* The space before the ; is to avoid a warning on a silly compiler
2552          on the Macintosh. */
2553          while ((c = *(++ptr)) != 0 && c != '\n') ;
2554          continue;
2555          }
2556      }      }
2557    
2558    switch(c)    switch(c)
# Line 1594  while ((c = *(++ptr)) != 0) Line 2565  while ((c = *(++ptr)) != 0)
2565      case '\\':      case '\\':
2566        {        {
2567        const uschar *save_ptr = ptr;        const uschar *save_ptr = ptr;
2568        c = check_escape(&ptr, errorptr, bracount, options, FALSE);        c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2569        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2570        if (c >= 0)        if (c >= 0)
2571          {          {
# Line 1614  while ((c = *(++ptr)) != 0) Line 2585  while ((c = *(++ptr)) != 0)
2585        int refnum = -c - ESC_REF;        int refnum = -c - ESC_REF;
2586        if (refnum > top_backref) top_backref = refnum;        if (refnum > top_backref) top_backref = refnum;
2587        length++;   /* For single back reference */        length++;   /* For single back reference */
2588        if (ptr[1] == '{' && is_counted_repeat(ptr+2))        if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2589          {          {
2590          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2591          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2592          if ((min == 0 && (max == 1 || max == -1)) ||          if ((min == 0 && (max == 1 || max == -1)) ||
2593            (min == 1 && max == -1))            (min == 1 && max == -1))
# Line 1640  while ((c = *(++ptr)) != 0) Line 2611  while ((c = *(++ptr)) != 0)
2611      or back reference. */      or back reference. */
2612    
2613      case '{':      case '{':
2614      if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;      if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2615      ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);      ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2616      if (*errorptr != NULL) goto PCRE_ERROR_RETURN;      if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2617      if ((min == 0 && (max == 1 || max == -1)) ||      if ((min == 0 && (max == 1 || max == -1)) ||
2618        (min == 1 && max == -1))        (min == 1 && max == -1))
# Line 1655  while ((c = *(++ptr)) != 0) Line 2626  while ((c = *(++ptr)) != 0)
2626      if (ptr[1] == '?') ptr++;      if (ptr[1] == '?') ptr++;
2627      continue;      continue;
2628    
2629      /* An alternation contains an offset to the next branch or ket. */      /* An alternation contains an offset to the next branch or ket. If any ims
2630        options changed in the previous branch(es), and/or if we are in a
2631        lookbehind assertion, extra space will be needed at the start of the
2632        branch. This is handled by branch_extra. */
2633    
2634      case '|':      case '|':
2635      length += 3;      length += 3 + branch_extra;
2636      continue;      continue;
2637    
2638      /* A character class uses 33 characters. Don't worry about character types      /* A character class uses 33 characters. Don't worry about character types
# Line 1672  while ((c = *(++ptr)) != 0) Line 2647  while ((c = *(++ptr)) != 0)
2647        {        {
2648        if (*ptr == '\\')        if (*ptr == '\\')
2649          {          {
2650          int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);          int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2651              &compile_block);
2652          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2653          if (-ch == ESC_b) class_charcount++; else class_charcount = 10;          if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2654          }          }
# Line 1689  while ((c = *(++ptr)) != 0) Line 2665  while ((c = *(++ptr)) != 0)
2665    
2666        /* A repeat needs either 1 or 5 bytes. */        /* A repeat needs either 1 or 5 bytes. */
2667    
2668        if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))        if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2669          {          {
2670          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2671          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2672          if ((min == 0 && (max == 1 || max == -1)) ||          if ((min == 0 && (max == 1 || max == -1)) ||
2673            (min == 1 && max == -1))            (min == 1 && max == -1))
# Line 1705  while ((c = *(++ptr)) != 0) Line 2681  while ((c = *(++ptr)) != 0)
2681      /* Brackets may be genuine groups or special things */      /* Brackets may be genuine groups or special things */
2682    
2683      case '(':      case '(':
2684        branch_newextra = 0;
2685    
2686      /* Handle special forms of bracket, which all start (? */      /* Handle special forms of bracket, which all start (? */
2687    
2688      if (ptr[1] == '?') switch (c = ptr[2])      if (ptr[1] == '?')
2689        {        {
2690        /* Skip over comments entirely */        int set, unset;
2691        case '#':        int *optset;
       ptr += 3;  
       while (*ptr != 0 && *ptr != ')') ptr++;  
       if (*ptr == 0)  
         {  
         *errorptr = ERR18;  
         goto PCRE_ERROR_RETURN;  
         }  
       continue;  
2692    
2693        /* Non-referencing groups and lookaheads just move the pointer on, and        switch (c = ptr[2])
2694        then behave like a non-special bracket, except that they don't increment          {
2695        the count of extracting brackets. */          /* Skip over comments entirely */
2696            case '#':
2697        case ':':          ptr += 3;
2698        case '=':          while (*ptr != 0 && *ptr != ')') ptr++;
2699        case '!':          if (*ptr == 0)
2700        ptr += 2;            {
2701        break;            *errorptr = ERR18;
2702              goto PCRE_ERROR_RETURN;
2703              }
2704            continue;
2705    
2706        /* Ditto for the "once only" bracket, allowed only if the extra bit          /* Non-referencing groups and lookaheads just move the pointer on, and
2707        is set. */          then behave like a non-special bracket, except that they don't increment
2708            the count of extracting brackets. Ditto for the "once only" bracket,
2709            which is in Perl from version 5.005. */
2710    
2711        case '>':          case ':':
2712        if ((options & PCRE_EXTRA) != 0)          case '=':
2713          {          case '!':
2714            case '>':
2715          ptr += 2;          ptr += 2;
2716          break;          break;
         }  
       /* Else fall thourh */  
2717    
2718        /* Else loop setting valid options until ) is met. Anything else is an          /* A recursive call to the regex is an extension, to provide the
2719        error. */          facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2720    
2721        default:          case 'R':
2722        ptr += 2;          if (ptr[3] != ')')
       for (;; ptr++)  
         {  
         if ((c = *ptr) == 'i')  
2723            {            {
2724            options |= PCRE_CASELESS;            *errorptr = ERR29;
2725            continue;            goto PCRE_ERROR_RETURN;
2726            }            }
2727          else if ((c = *ptr) == 'm')          ptr += 3;
2728            length += 1;
2729            break;
2730    
2731            /* Lookbehinds are in Perl from version 5.005 */
2732    
2733            case '<':
2734            if (ptr[3] == '=' || ptr[3] == '!')
2735            {            {
2736            options |= PCRE_MULTILINE;            ptr += 3;
2737            continue;            branch_newextra = 3;
2738              length += 3;         /* For the first branch */
2739              break;
2740              }
2741            *errorptr = ERR24;
2742            goto PCRE_ERROR_RETURN;
2743    
2744            /* Conditionals are in Perl from version 5.005. The bracket must either
2745            be followed by a number (for bracket reference) or by an assertion
2746            group. */
2747    
2748            case '(':
2749            if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2750              {
2751              ptr += 4;
2752              length += 2;
2753              while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2754              if (*ptr != ')')
2755                {
2756                *errorptr = ERR26;
2757                goto PCRE_ERROR_RETURN;
2758                }
2759            }            }
2760          else if (c == 's')          else   /* An assertion must follow */
2761            {            {
2762            options |= PCRE_DOTALL;            ptr++;   /* Can treat like ':' as far as spacing is concerned */
2763            continue;            if (ptr[2] != '?' ||
2764                 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
2765                {
2766                ptr += 2;    /* To get right offset in message */
2767                *errorptr = ERR28;
2768                goto PCRE_ERROR_RETURN;
2769                }
2770              }
2771            break;
2772    
2773            /* Else loop checking valid options until ) is met. Anything else is an
2774            error. If we are without any brackets, i.e. at top level, the settings
2775            act as if specified in the options, so massage the options immediately.
2776            This is for backward compatibility with Perl 5.004. */
2777    
2778            default:
2779            set = unset = 0;
2780            optset = &set;
2781            ptr += 2;
2782    
2783            for (;; ptr++)
2784              {
2785              c = *ptr;
2786              switch (c)
2787                {
2788                case 'i':
2789                *optset |= PCRE_CASELESS;
2790                continue;
2791    
2792                case 'm':
2793                *optset |= PCRE_MULTILINE;
2794                continue;
2795    
2796                case 's':
2797                *optset |= PCRE_DOTALL;
2798                continue;
2799    
2800                case 'x':
2801                *optset |= PCRE_EXTENDED;
2802                continue;
2803    
2804                case 'X':
2805                *optset |= PCRE_EXTRA;
2806                continue;
2807    
2808                case 'U':
2809                *optset |= PCRE_UNGREEDY;
2810                continue;
2811    
2812                case '-':
2813                optset = &unset;
2814                continue;
2815    
2816                /* A termination by ')' indicates an options-setting-only item;
2817                this is global at top level; otherwise nothing is done here and
2818                it is handled during the compiling process on a per-bracket-group
2819                basis. */
2820    
2821                case ')':
2822                if (brastackptr == 0)
2823                  {
2824                  options = (options | set) & (~unset);
2825                  set = unset = 0;     /* To save length */
2826                  }
2827                /* Fall through */
2828    
2829                /* A termination by ':' indicates the start of a nested group with
2830                the given options set. This is again handled at compile time, but
2831                we must allow for compiled space if any of the ims options are
2832                set. We also have to allow for resetting space at the end of
2833                the group, which is why 4 is added to the length and not just 2.
2834                If there are several changes of options within the same group, this
2835                will lead to an over-estimate on the length, but this shouldn't
2836                matter very much. We also have to allow for resetting options at
2837                the start of any alternations, which we do by setting
2838                branch_newextra to 2. Finally, we record whether the case-dependent
2839                flag ever changes within the regex. This is used by the "required
2840                character" code. */
2841    
2842                case ':':
2843                if (((set|unset) & PCRE_IMS) != 0)
2844                  {
2845                  length += 4;
2846                  branch_newextra = 2;
2847                  if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2848                  }
2849                goto END_OPTIONS;
2850    
2851                /* Unrecognized option character */
2852    
2853                default:
2854                *errorptr = ERR12;
2855                goto PCRE_ERROR_RETURN;
2856                }
2857            }            }
2858          else if (c == 'x')  
2859            /* If we hit a closing bracket, that's it - this is a freestanding
2860            option-setting. We need to ensure that branch_extra is updated if
2861            necessary. The only values branch_newextra can have here are 0 or 2.
2862            If the value is 2, then branch_extra must either be 2 or 5, depending
2863            on whether this is a lookbehind group or not. */
2864    
2865            END_OPTIONS:
2866            if (c == ')')
2867            {            {
2868            options |= PCRE_EXTENDED;            if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2869            length -= spaces;          /* Already counted spaces */              branch_extra += branch_newextra;
2870            continue;            continue;
2871            }            }
         else if (c == ')') break;  
2872    
2873          *errorptr = ERR12;          /* If options were terminated by ':' control comes here. Fall through
2874          goto PCRE_ERROR_RETURN;          to handle the group below. */
2875          }          }
       continue;                      /* End of this bracket handling */  
2876        }        }
2877    
2878      /* Extracting brackets must be counted so we can process escapes in a      /* Extracting brackets must be counted so we can process escapes in a
# Line 1784  while ((c = *(++ptr)) != 0) Line 2881  while ((c = *(++ptr)) != 0)
2881      else bracount++;      else bracount++;
2882    
2883      /* Non-special forms of bracket. Save length for computing whole length      /* Non-special forms of bracket. Save length for computing whole length
2884      at end if there's a repeat that requires duplication of the group. */      at end if there's a repeat that requires duplication of the group. Also
2885        save the current value of branch_extra, and start the new group with
2886        the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
2887        for a lookbehind assertion. */
2888    
2889      if (brastackptr >= sizeof(brastack)/sizeof(int))      if (brastackptr >= sizeof(brastack)/sizeof(int))
2890        {        {
# Line 1792  while ((c = *(++ptr)) != 0) Line 2892  while ((c = *(++ptr)) != 0)
2892        goto PCRE_ERROR_RETURN;        goto PCRE_ERROR_RETURN;
2893        }        }
2894    
2895        bralenstack[brastackptr] = branch_extra;
2896        branch_extra = branch_newextra;
2897    
2898      brastack[brastackptr++] = length;      brastack[brastackptr++] = length;
2899      length += 3;      length += 3;
2900      continue;      continue;
# Line 1799  while ((c = *(++ptr)) != 0) Line 2902  while ((c = *(++ptr)) != 0)
2902      /* Handle ket. Look for subsequent max/min; for certain sets of values we      /* Handle ket. Look for subsequent max/min; for certain sets of values we
2903      have to replicate this bracket up to that many times. If brastackptr is      have to replicate this bracket up to that many times. If brastackptr is
2904      0 this is an unmatched bracket which will generate an error, but take care      0 this is an unmatched bracket which will generate an error, but take care
2905      not to try to access brastack[-1]. */      not to try to access brastack[-1] when computing the length and restoring
2906        the branch_extra value. */
2907    
2908      case ')':      case ')':
2909      length += 3;      length += 3;
2910        {        {
2911        int minval = 1;        int minval = 1;
2912        int maxval = 1;        int maxval = 1;
2913        int duplength = (brastackptr > 0)? length - brastack[--brastackptr] : 0;        int duplength;
2914    
2915          if (brastackptr > 0)
2916            {
2917            duplength = length - brastack[--brastackptr];
2918            branch_extra = bralenstack[brastackptr];
2919            }
2920          else duplength = 0;
2921    
2922        /* Leave ptr at the final char; for read_repeat_counts this happens        /* Leave ptr at the final char; for read_repeat_counts this happens
2923        automatically; for the others we need an increment. */        automatically; for the others we need an increment. */
2924    
2925        if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))        if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2926          {          {
2927          ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);          ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2928              &compile_block);
2929          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2930          }          }
2931        else if (c == '*') { minval = 0; maxval = -1; ptr++; }        else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2932        else if (c == '+') { maxval = -1; ptr++; }        else if (c == '+') { maxval = -1; ptr++; }
2933        else if (c == '?') { minval = 0; ptr++; }        else if (c == '?') { minval = 0; ptr++; }
2934    
2935        /* If there is a minimum > 1 we have to replicate up to minval-1 times;        /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2936        if there is a limited maximum we have to replicate up to maxval-1 times        group, and if the maximum is greater than zero, we have to replicate
2937        and allow for a BRAZERO item before each optional copy, as we also have        maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2938        to do before the first copy if the minimum is zero. */        bracket set - hence the 7. */
2939    
2940        if (minval == 0) length++;        if (minval == 0)
2941          else if (minval > 1) length += (minval - 1) * duplength;          {
2942        if (maxval > minval) length += (maxval - minval) * (duplength + 1);          length++;
2943            if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2944            }
2945    
2946          /* When the minimum is greater than zero, 1 we have to replicate up to
2947          minval-1 times, with no additions required in the copies. Then, if
2948          there is a limited maximum we have to replicate up to maxval-1 times
2949          allowing for a BRAZERO item before each optional copy and nesting
2950          brackets for all but one of the optional copies. */
2951    
2952          else
2953            {
2954            length += (minval - 1) * duplength;
2955            if (maxval > minval)   /* Need this test as maxval=-1 means no limit */
2956              length += (maxval - minval) * (duplength + 7) - 6;
2957            }
2958        }        }
2959      continue;      continue;
2960    
# Line 1842  while ((c = *(++ptr)) != 0) Line 2969  while ((c = *(++ptr)) != 0)
2969      runlength = 0;      runlength = 0;
2970      do      do
2971        {        {
2972        if ((pcre_ctypes[c] & ctype_space) != 0)        if ((options & PCRE_EXTENDED) != 0)
         {  
         if ((options & PCRE_EXTENDED) != 0) continue;  
         spaces++;  
         }  
   
       if (c == '#' && (options & PCRE_EXTENDED) != 0)  
2973          {          {
2974          while ((c = *(++ptr)) != 0 && c != '\n');          if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2975          continue;          if (c == '#')
2976              {
2977              /* The space before the ; is to avoid a warning on a silly compiler
2978              on the Macintosh. */
2979              while ((c = *(++ptr)) != 0 && c != '\n') ;
2980              continue;
2981              }
2982          }          }
2983    
2984        /* Backslash may introduce a data char or a metacharacter; stop the        /* Backslash may introduce a data char or a metacharacter; stop the
# Line 1860  while ((c = *(++ptr)) != 0) Line 2987  while ((c = *(++ptr)) != 0)
2987        if (c == '\\')        if (c == '\\')
2988          {          {
2989          const uschar *saveptr = ptr;          const uschar *saveptr = ptr;
2990          c = check_escape(&ptr, errorptr, bracount, options, FALSE);          c = check_escape(&ptr, errorptr, bracount, options, FALSE,
2991              &compile_block);
2992          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2993          if (c < 0) { ptr = saveptr; break; }          if (c < 0) { ptr = saveptr; break; }
2994    
2995    #ifdef SUPPORT_UTF8
2996            if (c > 127 && (options & PCRE_UTF8) != 0)
2997              {
2998              int i;
2999              for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
3000                if (c <= utf8_table1[i]) break;
3001              runlength += i;
3002              }
3003    #endif
3004          }          }
3005    
3006        /* Ordinary character or single-char escape */        /* Ordinary character or single-char escape */
# Line 1872  while ((c = *(++ptr)) != 0) Line 3010  while ((c = *(++ptr)) != 0)
3010    
3011      /* This "while" is the end of the "do" above. */      /* This "while" is the end of the "do" above. */
3012    
3013      while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);      while (runlength < MAXLIT &&
3014          (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
3015    
3016      ptr--;      ptr--;
3017      length += runlength;      length += runlength;
# Line 1903  if (re == NULL) Line 3042  if (re == NULL)
3042    return NULL;    return NULL;
3043    }    }
3044    
3045  /* Put in the magic number and the options. */  /* Put in the magic number, and save the size, options, and table pointer */
3046    
3047  re->magic_number = MAGIC_NUMBER;  re->magic_number = MAGIC_NUMBER;
3048    re->size = size;
3049  re->options = options;  re->options = options;
3050    re->tables = tables;
3051    
3052  /* Set up a starting, non-extracting bracket, then compile the expression. On  /* Set up a starting, non-extracting bracket, then compile the expression. On
3053  error, *errorptr will be set non-NULL, so we don't need to look at the result  error, *errorptr will be set non-NULL, so we don't need to look at the result
# Line 1916  ptr = (const uschar *)pattern; Line 3057  ptr = (const uschar *)pattern;
3057  code = re->code;  code = re->code;
3058  *code = OP_BRA;  *code = OP_BRA;
3059  bracount = 0;  bracount = 0;
3060  (void)compile_regex(options, &bracount, &code, &ptr, errorptr);  (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
3061      &reqchar, &countlits, &compile_block);
3062  re->top_bracket = bracount;  re->top_bracket = bracount;
3063  re->top_backref = top_backref;  re->top_backref = top_backref;
3064    
# Line 1933  if debugging, leave the test till after Line 3075  if debugging, leave the test till after
3075  if (code - re->code > length) *errorptr = ERR23;  if (code - re->code > length) *errorptr = ERR23;
3076  #endif  #endif
3077    
3078    /* Give an error if there's a back reference to a non-existent capturing
3079    subpattern. */
3080    
3081    if (top_backref > re->top_bracket) *errorptr = ERR15;
3082    
3083  /* Failed to compile */  /* Failed to compile */
3084    
3085  if (*errorptr != NULL)  if (*errorptr != NULL)
# Line 1943  if (*errorptr != NULL) Line 3090  if (*errorptr != NULL)
3090    return NULL;    return NULL;
3091    }    }
3092    
3093  /* If the anchored option was not passed, set flag if we can determine that it  /* If the anchored option was not passed, set flag if we can determine that the
3094  is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if  pattern is anchored by virtue of ^ characters or \A or anything else (such as
3095  we can determine what the first character has to be, because that speeds up  starting with .* when DOTALL is set).
3096  unanchored matches no end. In the case of multiline matches, an alternative is  
3097  to set the PCRE_STARTLINE flag if all branches start with ^. */  Otherwise, see if we can determine what the first character has to be, because
3098    that speeds up unanchored matches no end. If not, see if we can set the
3099    PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
3100    start with ^, and also when all branches start with .* for non-DOTALL matches.
3101    */
3102    
3103  if ((options & PCRE_ANCHORED) == 0)  if ((options & PCRE_ANCHORED) == 0)
3104    {    {
3105    if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))    int temp_options = options;
3106      if (is_anchored(re->code, &temp_options))
3107      re->options |= PCRE_ANCHORED;      re->options |= PCRE_ANCHORED;
3108    else    else
3109      {      {
3110      int ch = find_firstchar(re->code);      int ch = find_firstchar(re->code, &temp_options);
3111      if (ch >= 0)      if (ch >= 0)
3112        {        {
3113        re->first_char = ch;        re->first_char = ch;
# Line 1966  if ((options & PCRE_ANCHORED) == 0) Line 3118  if ((options & PCRE_ANCHORED) == 0)
3118      }      }
3119    }    }
3120    
3121    /* Save the last required character if there are at least two literal
3122    characters on all paths, or if there is no first character setting. */
3123    
3124    if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
3125      {
3126      re->req_char = reqchar;
3127      re->options |= PCRE_REQCHSET;
3128      }
3129    
3130  /* Print out the compiled data for debugging */  /* Print out the compiled data for debugging */
3131    
3132  #ifdef DEBUG  #ifdef DEBUG
3133    
3134  printf("Length = %d top_bracket = %d top_backref=%d\n",  printf("Length = %d top_bracket = %d top_backref = %d\n",
3135    length, re->top_bracket, re->top_backref);    length, re->top_bracket, re->top_backref);
3136    
3137  if (re->options != 0)  if (re->options != 0)
3138    {    {
3139    printf("%s%s%s%s%s%s%s\n",    printf("%s%s%s%s%s%s%s%s%s\n",
3140      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
3141      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
3142        ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
3143      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
3144      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
3145      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
3146      ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",      ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
3147      ((re->options & PCRE_EXTRA) != 0)? "extra " : "");      ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
3148        ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
3149    }    }
3150    
3151  if ((re->options & PCRE_FIRSTSET) != 0)  if ((re->options & PCRE_FIRSTSET) != 0)
# Line 1991  if ((re->options & PCRE_FIRSTSET) != 0) Line 3154  if ((re->options & PCRE_FIRSTSET) != 0)
3154      else printf("First char = \\x%02x\n", re->first_char);      else printf("First char = \\x%02x\n", re->first_char);
3155    }    }
3156    
3157    if ((re->options & PCRE_REQCHSET) != 0)
3158      {
3159      if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
3160        else printf("Req char = \\x%02x\n", re->req_char);
3161      }
3162    
3163  code_end = code;  code_end = code;
3164  code_base = code = re->code;  code_base = code = re->code;
3165    
# Line 2008  while (code < code_end) Line 3177  while (code < code_end)
3177    
3178    else switch(*code)    else switch(*code)
3179      {      {
3180        case OP_OPT:
3181        printf(" %.2x %s", code[1], OP_names[*code]);
3182        code++;
3183        break;
3184    
3185        case OP_COND:
3186        printf("%3d Cond", (code[1] << 8) + code[2]);
3187        code += 2;
3188        break;
3189    
3190        case OP_CREF:
3191        printf(" %.2d %s", code[1], OP_names[*code]);
3192        code++;
3193        break;
3194    
3195      case OP_CHARS:      case OP_CHARS:
3196      charlength = *(++code);      charlength = *(++code);
3197      printf("%3d ", charlength);      printf("%3d ", charlength);
# Line 2021  while (code < code_end) Line 3205  while (code < code_end)
3205      case OP_KET:      case OP_KET:
3206      case OP_ASSERT:      case OP_ASSERT:
3207      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
3208        case OP_ASSERTBACK:
3209        case OP_ASSERTBACK_NOT:
3210      case OP_ONCE:      case OP_ONCE:
3211      printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);      printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
3212      code += 2;      code += 2;
3213      break;      break;
3214    
3215        case OP_REVERSE:
3216        printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
3217        code += 2;
3218        break;
3219    
3220      case OP_STAR:      case OP_STAR:
3221      case OP_MINSTAR:      case OP_MINSTAR:
3222      case OP_PLUS:      case OP_PLUS:
# Line 2099  while (code < code_end) Line 3290  while (code < code_end)
3290      goto CLASS_REF_REPEAT;      goto CLASS_REF_REPEAT;
3291    
3292      case OP_CLASS:      case OP_CLASS:
     case OP_NEGCLASS:  
3293        {        {
3294        int i, min, max;        int i, min, max;
3295          code++;
3296        if (*code++ == OP_CLASS) printf("    [");        printf("    [");
         else printf("   ^[");  
3297    
3298        for (i = 0; i < 256; i++)        for (i = 0; i < 256; i++)
3299          {          {
# Line 2186  return (pcre *)re; Line 3375  return (pcre *)re;
3375    
3376    
3377  /*************************************************  /*************************************************
 *        Match a character type                  *  
 *************************************************/  
   
 /* Not used in all the places it might be as it's sometimes faster  
 to put the code inline.  
   
 Arguments:  
   type        the character type  
   c           the character  
   dotall      the dotall flag  
   
 Returns:      TRUE if character is of the type  
 */  
   
 static BOOL  
 match_type(int type, int c, BOOL dotall)  
 {  
   
 #ifdef DEBUG  
 if (isprint(c)) printf("matching subject %c against ", c);  
   else printf("matching subject \\x%02x against ", c);  
 printf("%s\n", OP_names[type]);  
 #endif  
   
 switch(type)  
   {  
   case OP_ANY:            return dotall || c != '\n';  
   case OP_NOT_DIGIT:      return (pcre_ctypes[c] & ctype_digit) == 0;  
   case OP_DIGIT:          return (pcre_ctypes[c] & ctype_digit) != 0;  
   case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;  
   case OP_WHITESPACE:     return (pcre_ctypes[c] & ctype_space) != 0;  
   case OP_NOT_WORDCHAR:   return (pcre_ctypes[c] & ctype_word) == 0;  
   case OP_WORDCHAR:       return (pcre_ctypes[c] & ctype_word) != 0;  
   }  
 return FALSE;  
 }  
   
   
   
 /*************************************************  
3378  *          Match a back-reference                *  *          Match a back-reference                *
3379  *************************************************/  *************************************************/
3380    
3381  /* If a back reference hasn't been set, the match fails.  /* If a back reference hasn't been set, the length that is passed is greater
3382    than the number of characters left in the string, so the match fails.
3383    
3384  Arguments:  Arguments:
3385    number      reference number    offset      index into the offset vector
3386    eptr        points into the subject    eptr        points into the subject
3387    length      length to be matched    length      length to be matched
3388    md          points to match data block    md          points to match data block
3389      ims         the ims flags
3390    
3391  Returns:      TRUE if matched  Returns:      TRUE if matched
3392  */  */
3393    
3394  static BOOL  static BOOL
3395  match_ref(int number, register const uschar *eptr, int length, match_data *md)  match_ref(int offset, register const uschar *eptr, int length, match_data *md,
3396      unsigned long int ims)
3397  {  {
3398  const uschar *p = md->start_subject + md->offset_vector[number];  const uschar *p = md->start_subject + md->offset_vector[offset];
3399    
3400  #ifdef DEBUG  #ifdef DEBUG
3401  if (eptr >= md->end_subject)  if (eptr >= md->end_subject)
# Line 2260  printf("\n"); Line 3412  printf("\n");
3412    
3413  /* Always fail if not enough characters left */  /* Always fail if not enough characters left */
3414    
3415  if (length > md->end_subject - p) return FALSE;  if (length > md->end_subject - eptr) return FALSE;
3416    
3417  /* Separate the caseless case for speed */  /* Separate the caseless case for speed */
3418    
3419  if (md->caseless)  if ((ims & PCRE_CASELESS) != 0)
3420    { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }    {
3421      while (length-- > 0)
3422        if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
3423      }
3424  else  else
3425    { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }    { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3426    
# Line 2278  return TRUE; Line 3433  return TRUE;
3433  *         Match from current position            *  *         Match from current position            *
3434  *************************************************/  *************************************************/
3435    
3436  /* On entry ecode points to the first opcode, and eptr to the first character.  /* On entry ecode points to the first opcode, and eptr to the first character
3437    in the subject string, while eptrb holds the value of eptr at the start of the
3438    last bracketed group - used for breaking infinite loops matching zero-length
3439    strings.
3440    
3441  Arguments:  Arguments:
3442     eptr        pointer in subject     eptr        pointer in subject
3443     ecode       position in code     ecode       position in code
3444     offset_top  current top pointer     offset_top  current top pointer
3445     md          pointer to "static" info for the match     md          pointer to "static" info for the match
3446       ims         current /i, /m, and /s options
3447       eptrb       pointer to chain of blocks containing eptr at start of
3448                     brackets - for testing for empty matches
3449       flags       can contain
3450                     match_condassert - this is an assertion condition
3451                     match_isgroup - this is the start of a bracketed group
3452    
3453  Returns:       TRUE if matched  Returns:       TRUE if matched
3454  */  */
3455    
3456  static BOOL  static BOOL
3457  match(register const uschar *eptr, register const uschar *ecode, int offset_top,  match(register const uschar *eptr, register const uschar *ecode,
3458    match_data *md)    int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
3459      int flags)
3460  {  {
3461    unsigned long int original_ims = ims;   /* Save for resetting on ')' */
3462    eptrblock newptrb;
3463    
3464    /* At the start of a bracketed group, add the current subject pointer to the
3465    stack of such pointers, to be re-instated at the end of the group when we hit
3466    the closing ket. When match() is called in other circumstances, we don't add to
3467    the stack. */
3468    
3469    if ((flags & match_isgroup) != 0)
3470      {
3471      newptrb.prev = eptrb;
3472      newptrb.saved_eptr = eptr;
3473      eptrb = &newptrb;
3474      }
3475    
3476    /* Now start processing the operations. */
3477    
3478  for (;;)  for (;;)
3479    {    {
3480      int op = (int)*ecode;
3481    int min, max, ctype;    int min, max, ctype;
3482    register int i;    register int i;
3483    register int c;    register int c;
3484    BOOL minimize = FALSE;    BOOL minimize = FALSE;
3485    
3486    /* Opening bracket. Check the alternative branches in turn, failing if none    /* Opening capturing bracket. If there is space in the offset vector, save
3487    match. We have to set the start offset if required and there is space    the current subject position in the working slot at the top of the vector. We
3488    in the offset vector so that it is available for subsequent back references    mustn't change the current values of the data slot, because they may be set
3489    if the bracket matches. However, if the bracket fails, we must put back the    from a previous iteration of this group, and be referred to by a reference
3490    previous value of both offsets in case they were set by a previous copy of    inside the group.
3491    the same bracket. Don't worry about setting the flag for the error case here;  
3492    that is handled in the code for KET. */    If the bracket fails to match, we need to restore this value and also the
3493      values of the final offsets, in case they were set by a previous iteration of
3494      the same bracket.
3495    
3496      If there isn't enough space in the offset vector, treat this as if it were a
3497      non-capturing bracket. Don't worry about setting the flag for the error case
3498      here; that is handled in the code for KET. */
3499    
3500    if ((int)*ecode >= OP_BRA)    if (op > OP_BRA)
3501      {      {
3502      int number = (*ecode - OP_BRA) << 1;      int number = op - OP_BRA;
3503      int save_offset1 = 0, save_offset2 = 0;      int offset = number << 1;
3504    
3505      DPRINTF(("start bracket %d\n", number/2));  #ifdef DEBUG
3506        printf("start bracket %d subject=", number);
3507        pchars(eptr, 16, TRUE, md);
3508        printf("\n");
3509    #endif
3510    
3511      if (number > 0 && number < md->offset_end)      if (offset < md->offset_max)
3512        {        {
3513        save_offset1 = md->offset_vector[number];        int save_offset1 = md->offset_vector[offset];
3514        save_offset2 = md->offset_vector[number+1];        int save_offset2 = md->offset_vector[offset+1];
3515        md->offset_vector[number] = eptr - md->start_subject;        int save_offset3 = md->offset_vector[md->offset_end - number];
3516    
3517          DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
3518          md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
3519    
3520          do
3521            {
3522            if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3523              return TRUE;
3524            ecode += (ecode[1] << 8) + ecode[2];
3525            }
3526          while (*ecode == OP_ALT);
3527    
3528          DPRINTF(("bracket %d failed\n", number));
3529    
3530        DPRINTF(("saving %d %d\n", save_offset1, save_offset2));        md->offset_vector[offset] = save_offset1;
3531          md->offset_vector[offset+1] = save_offset2;
3532          md->offset_vector[md->offset_end - number] = save_offset3;
3533          return FALSE;
3534        }        }
3535    
3536      /* Recurse for all the alternatives. */      /* Insufficient room for saving captured contents */
3537    
3538        else op = OP_BRA;
3539        }
3540    
3541      /* Other types of node can be handled by a switch */
3542    
3543      switch(op)
3544        {
3545        case OP_BRA:     /* Non-capturing bracket: optimized */
3546        DPRINTF(("start bracket 0\n"));
3547      do      do
3548        {        {
3549        if (match(eptr, ecode+3, offset_top, md)) return TRUE;        if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3550            return TRUE;
3551        ecode += (ecode[1] << 8) + ecode[2];        ecode += (ecode[1] << 8) + ecode[2];
3552        }        }
3553      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
3554        DPRINTF(("bracket 0 failed\n"));
3555        return FALSE;
3556    
3557        /* Conditional group: compilation checked that there are no more than
3558        two branches. If the condition is false, skipping the first branch takes us
3559        past the end if there is only one branch, but that's OK because that is
3560        exactly what going to the ket would do. */
3561    
3562        case OP_COND:
3563        if (ecode[3] == OP_CREF)         /* Condition is extraction test */
3564          {
3565          int offset = ecode[4] << 1;    /* Doubled reference number */
3566          return match(eptr,
3567            ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3568              5 : 3 + (ecode[1] << 8) + ecode[2]),
3569            offset_top, md, ims, eptrb, match_isgroup);
3570          }
3571    
3572      DPRINTF(("bracket %d failed\n", number/2));      /* The condition is an assertion. Call match() to evaluate it - setting
3573        the match_condassert flag causes it to stop at the end of an assertion. */
3574    
3575      if (number > 0 && number < md->offset_end)      else
3576        {        {
3577        md->offset_vector[number] = save_offset1;        if (match(eptr, ecode+3, offset_top, md, ims, NULL,
3578        md->offset_vector[number+1] = save_offset2;            match_condassert | match_isgroup))
3579            {
3580            ecode += 3 + (ecode[4] << 8) + ecode[5];
3581            while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3582            }
3583          else ecode += (ecode[1] << 8) + ecode[2];
3584          return match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup);
3585        }        }
3586        /* Control never reaches here */
3587    
3588      return FALSE;      /* Skip over conditional reference data if encountered (should not be) */
     }  
3589    
3590    /* Other types of node can be handled by a switch */      case OP_CREF:
3591        ecode += 2;
3592        break;
3593    
3594        /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3595        an empty string - recursion will then try other alternatives, if any. */
3596    
   switch(*ecode)  
     {  
3597      case OP_END:      case OP_END:
3598        if (md->notempty && eptr == md->start_match) return FALSE;
3599      md->end_match_ptr = eptr;          /* Record where we ended */      md->end_match_ptr = eptr;          /* Record where we ended */
3600      md->end_offset_top = offset_top;   /* and how many extracts were taken */      md->end_offset_top = offset_top;   /* and how many extracts were taken */
3601      return TRUE;      return TRUE;
3602    
3603      /* The equivalent of Prolog's "cut" - if the rest doesn't match, the      /* Change option settings */
     whole thing doesn't match, so we have to get out via a longjmp(). */  
3604    
3605      case OP_CUT:      case OP_OPT:
3606      if (match(eptr, ecode+1, offset_top, md)) return TRUE;      ims = ecode[1];
3607      longjmp(md->fail_env, 1);      ecode += 2;
3608        DPRINTF(("ims set to %02lx\n", ims));
3609        break;
3610    
3611      /* Assertion brackets. Check the alternative branches in turn - the      /* Assertion brackets. Check the alternative branches in turn - the
3612      matching won't pass the KET for an assertion. If any one branch matches,      matching won't pass the KET for an assertion. If any one branch matches,
3613      the assertion is true. */      the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
3614        start of each branch to move the current point backwards, so the code at
3615        this level is identical to the lookahead case. */
3616    
3617      case OP_ASSERT:      case OP_ASSERT:
3618        case OP_ASSERTBACK:
3619      do      do
3620        {        {
3621        if (match(eptr, ecode+3, offset_top, md)) break;        if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup)) break;
3622        ecode += (ecode[1] << 8) + ecode[2];        ecode += (ecode[1] << 8) + ecode[2];
3623        }        }
3624      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
3625      if (*ecode == OP_KET) return FALSE;      if (*ecode == OP_KET) return FALSE;
3626    
3627        /* If checking an assertion for a condition, return TRUE. */
3628    
3629        if ((flags & match_condassert) != 0) return TRUE;
3630    
3631      /* Continue from after the assertion, updating the offsets high water      /* Continue from after the assertion, updating the offsets high water
3632      mark, since extracts may have been taken during the assertion. */      mark, since extracts may have been taken during the assertion. */
3633    
# Line 2384  for (;;) Line 3639  for (;;)
3639      /* Negative assertion: all branches must fail to match */      /* Negative assertion: all branches must fail to match */
3640    
3641      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
3642        case OP_ASSERTBACK_NOT:
3643      do      do
3644        {        {
3645        if (match(eptr, ecode+3, offset_top, md)) return FALSE;        if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup))
3646            return FALSE;
3647        ecode += (ecode[1] << 8) + ecode[2];        ecode += (ecode[1] << 8) + ecode[2];
3648        }        }
3649      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
3650    
3651        if ((flags & match_condassert) != 0) return TRUE;
3652    
3653      ecode += 3;      ecode += 3;
3654      continue;      continue;
3655    
3656        /* Move the subject pointer back. This occurs only at the start of
3657        each branch of a lookbehind assertion. If we are too close to the start to
3658        move back, this match function fails. When working with UTF-8 we move
3659        back a number of characters, not bytes. */
3660    
3661        case OP_REVERSE:
3662    #ifdef SUPPORT_UTF8
3663        c = (ecode[1] << 8) + ecode[2];
3664        for (i = 0; i < c; i++)
3665          {
3666          eptr--;
3667          BACKCHAR(eptr)
3668          }
3669    #else
3670        eptr -= (ecode[1] << 8) + ecode[2];
3671    #endif
3672    
3673        if (eptr < md->start_subject) return FALSE;
3674        ecode += 3;
3675        break;
3676    
3677        /* Recursion matches the current regex, nested. If there are any capturing
3678        brackets started but not finished, we have to save their starting points
3679        and reinstate them after the recursion. However, we don't know how many
3680        such there are (offset_top records the completed total) so we just have
3681        to save all the potential data. There may be up to 99 such values, which
3682        is a bit large to put on the stack, but using malloc for small numbers
3683        seems expensive. As a compromise, the stack is used when there are fewer
3684        than 16 values to store; otherwise malloc is used. A problem is what to do
3685        if the malloc fails ... there is no way of returning to the top level with
3686        an error. Save the top 15 values on the stack, and accept that the rest
3687        may be wrong. */
3688    
3689        case OP_RECURSE:
3690          {
3691          BOOL rc;
3692          int *save;
3693          int stacksave[15];
3694    
3695          c = md->offset_max;
3696    
3697          if (c < 16) save = stacksave; else
3698            {
3699            save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3700            if (save == NULL)
3701              {
3702              save = stacksave;
3703              c = 15;
3704              }
3705            }
3706    
3707          for (i = 1; i <= c; i++)
3708            save[i] = md->offset_vector[md->offset_end - i];
3709          rc = match(eptr, md->start_pattern, offset_top, md, ims, eptrb,
3710            match_isgroup);
3711          for (i = 1; i <= c; i++)
3712            md->offset_vector[md->offset_end - i] = save[i];
3713          if (save != stacksave) (pcre_free)(save);
3714          if (!rc) return FALSE;
3715    
3716          /* In case the recursion has set more capturing values, save the final
3717          number, then move along the subject till after the recursive match,
3718          and advance one byte in the pattern code. */
3719    
3720          offset_top = md->end_offset_top;
3721          eptr = md->end_match_ptr;
3722          ecode++;
3723          }
3724        break;
3725    
3726      /* "Once" brackets are like assertion brackets except that after a match,      /* "Once" brackets are like assertion brackets except that after a match,
3727      the point in the subject string is not moved back. Thus there can never be      the point in the subject string is not moved back. Thus there can never be
3728      a move back into the brackets. Check the alternative branches in turn - the      a move back into the brackets. Check the alternative branches in turn - the
3729      matching won't pass the KET for this kind of subpattern. If any one branch      matching won't pass the KET for this kind of subpattern. If any one branch
3730      matches, we carry on, leaving the subject pointer. */      matches, we carry on as at the end of a normal bracket, leaving the subject
3731        pointer. */
3732    
3733      case OP_ONCE:      case OP_ONCE:
     do  
3734        {        {
3735        if (match(eptr, ecode+3, offset_top, md)) break;        const uschar *prev = ecode;
3736        ecode += (ecode[1] << 8) + ecode[2];        const uschar *saved_eptr = eptr;
       }  
     while (*ecode == OP_ALT);  
     if (*ecode == OP_KET) return FALSE;  
3737    
3738      /* Continue as from after the assertion, updating the offsets high water        do
3739      mark, since extracts may have been taken. */          {
3740            if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3741              break;
3742            ecode += (ecode[1] << 8) + ecode[2];
3743            }
3744          while (*ecode == OP_ALT);
3745    
3746      do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);        /* If hit the end of the group (which could be repeated), fail */
3747      ecode += 3;  
3748      offset_top = md->end_offset_top;        if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
3749      eptr = md->end_match_ptr;  
3750      continue;        /* Continue as from after the assertion, updating the offsets high water
3751          mark, since extracts may have been taken. */
3752    
3753          do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3754    
3755          offset_top = md->end_offset_top;
3756          eptr = md->end_match_ptr;
3757    
3758          /* For a non-repeating ket, just continue at this level. This also
3759          happens for a repeating ket if no characters were matched in the group.
3760          This is the forcible breaking of infinite loops as implemented in Perl
3761          5.005. If there is an options reset, it will get obeyed in the normal
3762          course of events. */
3763    
3764          if (*ecode == OP_KET || eptr == saved_eptr)
3765            {
3766            ecode += 3;
3767            break;
3768            }
3769    
3770          /* The repeating kets try the rest of the pattern or restart from the
3771          preceding bracket, in the appropriate order. We need to reset any options
3772          that changed within the bracket before re-running it, so check the next
3773          opcode. */
3774    
3775          if (ecode[3] == OP_OPT)
3776            {
3777            ims = (ims & ~PCRE_IMS) | ecode[4];
3778            DPRINTF(("ims set to %02lx at group repeat\n", ims));
3779            }
3780    
3781          if (*ecode == OP_KETRMIN)
3782            {
3783            if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3784                match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3785                  return TRUE;
3786            }
3787          else  /* OP_KETRMAX */
3788            {
3789            if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3790                match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3791            }
3792          }
3793        return FALSE;
3794    
3795      /* An alternation is the end of a branch; scan along to find the end of the      /* An alternation is the end of a branch; scan along to find the end of the
3796      bracketed group and go to there. */      bracketed group and go to there. */
# Line 2433  for (;;) Line 3808  for (;;)
3808      case OP_BRAZERO:      case OP_BRAZERO:
3809        {        {
3810        const uschar *next = ecode+1;        const uschar *next = ecode+1;
3811        if (match(eptr, next, offset_top, md)) return TRUE;        if (match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
3812            return TRUE;
3813        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3814        ecode = next + 3;        ecode = next + 3;
3815        }        }
# Line 2443  for (;;) Line 3819  for (;;)
3819        {        {
3820        const uschar *next = ecode+1;        const uschar *next = ecode+1;
3821        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3822        if (match(eptr, next+3, offset_top, md)) return TRUE;        if (match(eptr, next+3, offset_top, md, ims, eptrb, match_isgroup))
3823            return TRUE;
3824        ecode++;        ecode++;
3825        }        }
3826      break;;      break;
3827    
3828      /* End of a group, repeated or non-repeating. If we are at the end of      /* End of a group, repeated or non-repeating. If we are at the end of
3829      an assertion "group", stop matching and return TRUE, but record the      an assertion "group", stop matching and return TRUE, but record the
3830      current high water mark for use by positive assertions. */      current high water mark for use by positive assertions. Do this also
3831        for the "once" (not-backup up) groups. */
3832    
3833      case OP_KET:      case OP_KET:
3834      case OP_KETRMIN:      case OP_KETRMIN:
3835      case OP_KETRMAX:      case OP_KETRMAX:
3836        {        {
       int number;  
3837        const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];        const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3838          const uschar *saved_eptr = eptrb->saved_eptr;
3839    
3840        if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)        eptrb = eptrb->prev;    /* Back up the stack of bracket start pointers */
3841    
3842          if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3843              *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3844              *prev == OP_ONCE)
3845          {          {
3846          md->end_match_ptr = eptr;      /* For ONCE */          md->end_match_ptr = eptr;      /* For ONCE */
3847          md->end_offset_top = offset_top;          md->end_offset_top = offset_top;
3848          return TRUE;          return TRUE;
3849          }          }
3850    
3851        /* In all other cases we have to check the group number back at the        /* In all other cases except a conditional group we have to check the
3852        start and if necessary complete handling an extraction by setting the        group number back at the start and if necessary complete handling an
3853        final offset and bumping the high water mark. */        extraction by setting the offsets and bumping the high water mark. */
3854    
3855        number = (*prev - OP_BRA) << 1;        if (*prev != OP_COND)
3856            {
3857            int number = *prev - OP_BRA;
3858            int offset = number << 1;
3859    
3860        DPRINTF(("end bracket %d\n", number/2));  #ifdef DEBUG
3861            printf("end bracket %d", number);
3862            printf("\n");
3863    #endif
3864    
3865        if (number > 0)          if (number > 0)
         {  
         if (number >= md->offset_end) md->offset_overflow = TRUE; else  
3866            {            {
3867            md->offset_vector[number+1] = eptr - md->start_subject;            if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3868            if (offset_top <= number) offset_top = number + 2;              {
3869                md->offset_vector[offset] =
3870                  md->offset_vector[md->offset_end - number];
3871                md->offset_vector[offset+1] = eptr - md->start_subject;
3872                if (offset_top <= offset) offset_top = offset + 2;
3873                }
3874            }            }
3875          }          }
3876    
3877        /* For a non-repeating ket, just advance to the next node and continue at        /* Reset the value of the ims flags, in case they got changed during
3878        this level. */        the group. */
3879    
3880          ims = original_ims;
3881          DPRINTF(("ims reset to %02lx\n", ims));
3882    
3883        if (*ecode == OP_KET)        /* For a non-repeating ket, just continue at this level. This also
3884          happens for a repeating ket if no characters were matched in the group.
3885          This is the forcible breaking of infinite loops as implemented in Perl
3886          5.005. If there is an options reset, it will get obeyed in the normal
3887          course of events. */
3888    
3889          if (*ecode == OP_KET || eptr == saved_eptr)
3890          {          {
3891          ecode += 3;          ecode += 3;
3892          break;          break;
# Line 2497  for (;;) Line 3897  for (;;)
3897    
3898        if (*ecode == OP_KETRMIN)        if (*ecode == OP_KETRMIN)
3899          {          {
3900          if (match(eptr, ecode+3, offset_top, md) ||          if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3901              match(eptr, prev, offset_top, md)) return TRUE;              match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3902                  return TRUE;
3903          }          }
3904        else  /* OP_KETRMAX */        else  /* OP_KETRMAX */
3905          {          {
3906          if (match(eptr, prev, offset_top, md) ||          if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3907              match(eptr, ecode+3, offset_top, md)) return TRUE;              match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3908          }          }
3909        }        }
3910      return FALSE;      return FALSE;
# Line 2512  for (;;) Line 3913  for (;;)
3913    
3914      case OP_CIRC:      case OP_CIRC:
3915      if (md->notbol && eptr == md->start_subject) return FALSE;      if (md->notbol && eptr == md->start_subject) return FALSE;
3916