/[pcre]/code/trunk/pcre.c
ViewVC logotype

Diff of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 15 by nigel, Sat Feb 24 21:38:25 2007 UTC revision 51 by nigel, Sat Feb 24 21:39:37 2007 UTC
# Line 9  the file Tech.Notes for some information Line 9  the file Tech.Notes for some information
9    
10  Written by: Philip Hazel <ph10@cam.ac.uk>  Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12             Copyright (c) 1998 University of Cambridge             Copyright (c) 1997-2000 University of Cambridge
13    
14  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
15  Permission is granted to anyone to use this software for any purpose on any  Permission is granted to anyone to use this software for any purpose on any
# Line 25  restrictions: Line 25  restrictions:
25    
26  3. Altered versions must be plainly marked as such, and must not be  3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.     misrepresented as being the original software.
28    
29    4. If PCRE is embedded in any software that is released under the GNU
30       General Purpose Licence (GPL), then the terms of that licence shall
31       supersede any condition above with which it is incompatible.
32  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
33  */  */
34    
# Line 33  restrictions: Line 37  restrictions:
37    
38  /* #define DEBUG */  /* #define DEBUG */
39    
40  /* Use a macro for debugging printing, 'cause that eliminates the the use  /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
41  of #ifdef inline, and there are *still* stupid compilers about that don't like  inline, and there are *still* stupid compilers about that don't like indented
42  indented pre-processor statements. I suppose it's only been 10 years... */  pre-processor statements. I suppose it's only been 10 years... */
43    
44  #ifdef DEBUG  #ifdef DEBUG
45  #define DPRINTF(p) printf p  #define DPRINTF(p) printf p
# Line 56  the external pcre header. */ Line 60  the external pcre header. */
60  #endif  #endif
61    
62    
63    /* Number of items on the nested bracket stacks at compile time. This should
64    not be set greater than 200. */
65    
66    #define BRASTACK_SIZE 200
67    
68    
69    /* The number of bytes in a literal character string above which we can't add
70    any more is different when UTF-8 characters may be encountered. */
71    
72    #ifdef SUPPORT_UTF8
73    #define MAXLIT 250
74    #else
75    #define MAXLIT 255
76    #endif
77    
78    
79  /* Min and max values for the common repeats; for the maxima, 0 => infinity */  /* Min and max values for the common repeats; for the maxima, 0 => infinity */
80    
81  static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };  static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
# Line 66  static const char rep_max[] = { 0, 0, 0, Line 86  static const char rep_max[] = { 0, 0, 0,
86  #ifdef DEBUG  #ifdef DEBUG
87  static const char *OP_names[] = {  static const char *OP_names[] = {
88    "End", "\\A", "\\B", "\\b", "\\D", "\\d",    "End", "\\A", "\\B", "\\b", "\\D", "\\d",
89    "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z", "^", "$", "Any", "chars",    "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
90    "not",    "Opt", "^", "$", "Any", "chars", "not",
91    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
92    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
93    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
94    "*", "*?", "+", "+?", "?", "??", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{",
95    "class", "negclass", "Ref",    "class", "Ref", "Recurse",
96    "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",    "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
97      "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
98    "Brazero", "Braminzero", "Bra"    "Brazero", "Braminzero", "Bra"
99  };  };
100  #endif  #endif
# Line 93  static const short int escapes[] = { Line 114  static const short int escapes[] = {
114    '`',      7, -ESC_b,      0, -ESC_d,     27,   '\f',      0,   /* ` - g */    '`',      7, -ESC_b,      0, -ESC_d,     27,   '\f',      0,   /* ` - g */
115      0,      0,      0,      0,      0,      0,   '\n',      0,   /* h - o */      0,      0,      0,      0,      0,      0,   '\n',      0,   /* h - o */
116      0,      0,   '\r', -ESC_s,   '\t',      0,      0, -ESC_w,   /* p - w */      0,      0,   '\r', -ESC_s,   '\t',      0,      0, -ESC_w,   /* p - w */
117      0,      0,      0                                            /* x - z */      0,      0, -ESC_z                                            /* x - z */
118    };
119    
120    /* Tables of names of POSIX character classes and their lengths. The list is
121    terminated by a zero length entry. The first three must be alpha, upper, lower,
122    as this is assumed for handling case independence. */
123    
124    static const char *posix_names[] = {
125      "alpha", "lower", "upper",
126      "alnum", "ascii", "cntrl", "digit", "graph",
127      "print", "punct", "space", "word",  "xdigit" };
128    
129    static const uschar posix_name_lengths[] = {
130      5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
131    
132    /* Table of class bit maps for each POSIX class; up to three may be combined
133    to form the class. */
134    
135    static const int posix_class_maps[] = {
136      cbit_lower, cbit_upper, -1,             /* alpha */
137      cbit_lower, -1,         -1,             /* lower */
138      cbit_upper, -1,         -1,             /* upper */
139      cbit_digit, cbit_lower, cbit_upper,     /* alnum */
140      cbit_print, cbit_cntrl, -1,             /* ascii */
141      cbit_cntrl, -1,         -1,             /* cntrl */
142      cbit_digit, -1,         -1,             /* digit */
143      cbit_graph, -1,         -1,             /* graph */
144      cbit_print, -1,         -1,             /* print */
145      cbit_punct, -1,         -1,             /* punct */
146      cbit_space, -1,         -1,             /* space */
147      cbit_word,  -1,         -1,             /* word */
148      cbit_xdigit,-1,         -1              /* xdigit */
149  };  };
150    
151    
152  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
153    
154  static BOOL  static BOOL
155    compile_regex(int, int *, uschar **, const uschar **, const char **);    compile_regex(int, int, int *, uschar **, const uschar **, const char **,
156        BOOL, int, int *, int *, compile_data *);
157    
158  /* Structure for passing "static" information around between the functions  /* Structure for building a chain of data that actually lives on the
159  doing the matching, so that they are thread-safe. */  stack, for holding the values of the subject pointer at the start of each
160    subpattern, so as to detect when an empty string has been matched by a
161    subpattern - to break infinite loops. */
162    
163    typedef struct eptrblock {
164      struct eptrblock *prev;
165      const uschar *saved_eptr;
166    } eptrblock;
167    
168  typedef struct match_data {  /* Flag bits for the match() function */
169    int    errorcode;             /* As it says */  
170    int   *offset_vector;         /* Offset vector */  #define match_condassert   0x01    /* Called to check a condition assertion */
171    int    offset_end;            /* One past the end */  #define match_isgroup      0x02    /* Set if start of bracketed group */
   BOOL   offset_overflow;       /* Set if too many extractions */  
   BOOL   caseless;              /* Case-independent flag */  
   BOOL   runtime_caseless;      /* Caseless forced at run time */  
   BOOL   multiline;             /* Multiline flag */  
   BOOL   notbol;                /* NOTBOL flag */  
   BOOL   noteol;                /* NOTEOL flag */  
   BOOL   dotall;                /* Dot matches any char */  
   BOOL   endonly;               /* Dollar not before final \n */  
   const uschar *start_subject;  /* Start of the subject string */  
   const uschar *end_subject;    /* End of the subject string */  
   jmp_buf fail_env;             /* Environment for longjump() break out */  
   const uschar *end_match_ptr;  /* Subject position at end match */  
   int     end_offset_top;       /* Highwater mark at end of match */  
 } match_data;  
172    
173    
174    
# Line 139  void  (*pcre_free)(void *) = free; Line 186  void  (*pcre_free)(void *) = free;
186    
187    
188    
189    /*************************************************
190    *    Macros and tables for character handling    *
191    *************************************************/
192    
193    /* When UTF-8 encoding is being used, a character is no longer just a single
194    byte. The macros for character handling generate simple sequences when used in
195    byte-mode, and more complicated ones for UTF-8 characters. */
196    
197    #ifndef SUPPORT_UTF8
198    #define GETCHARINC(c, eptr) c = *eptr++;
199    #define GETCHARLEN(c, eptr, len) c = *eptr;
200    #define BACKCHAR(eptr)
201    
202    #else   /* SUPPORT_UTF8 */
203    
204    /* Get the next UTF-8 character, advancing the pointer */
205    
206    #define GETCHARINC(c, eptr) \
207      c = *eptr++; \
208      if (md->utf8 && (c & 0xc0) == 0xc0) \
209        { \
210        int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
211        int s = 6 - a;                  /* Amount to shift next byte */  \
212        c &= utf8_table3[a];            /* Low order bits from first byte */ \
213        while (a-- > 0) \
214          { \
215          c |= (*eptr++ & 0x3f) << s; \
216          s += 6; \
217          } \
218        }
219    
220    /* Get the next UTF-8 character, not advancing the pointer, setting length */
221    
222    #define GETCHARLEN(c, eptr, len) \
223      c = *eptr; \
224      len = 1; \
225      if (md->utf8 && (c & 0xc0) == 0xc0) \
226        { \
227        int i; \
228        int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
229        int s = 6 - a;                  /* Amount to shift next byte */  \
230        c &= utf8_table3[a];            /* Low order bits from first byte */ \
231        for (i = 1; i <= a; i++) \
232          { \
233          c |= (eptr[i] & 0x3f) << s; \
234          s += 6; \
235          } \
236        len += a; \
237        }
238    
239    /* If the pointer is not at the start of a character, move it back until
240    it is. */
241    
242    #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
243    
244    #endif
245    
246    
247    
248    /*************************************************
249    *             Default character tables           *
250    *************************************************/
251    
252    /* A default set of character tables is included in the PCRE binary. Its source
253    is built by the maketables auxiliary program, which uses the default C ctypes
254    functions, and put in the file chartables.c. These tables are used by PCRE
255    whenever the caller of pcre_compile() does not provide an alternate set of
256    tables. */
257    
258    #include "chartables.c"
259    
260    
261    
262    #ifdef SUPPORT_UTF8
263    /*************************************************
264    *           Tables for UTF-8 support             *
265    *************************************************/
266    
267    /* These are the breakpoints for different numbers of bytes in a UTF-8
268    character. */
269    
270    static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
271    
272    /* These are the indicator bits and the mask for the data bits to set in the
273    first byte of a character, indexed by the number of additional bytes. */
274    
275    static int utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
276    static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
277    
278    /* Table of the number of extra characters, indexed by the first character
279    masked with 0x3f. The highest number for a valid UTF-8 character is in fact
280    0x3d. */
281    
282    static uschar utf8_table4[] = {
283      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
284      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
285      2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
286      3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
287    
288    
289    /*************************************************
290    *       Convert character value to UTF-8         *
291    *************************************************/
292    
293    /* This function takes an integer value in the range 0 - 0x7fffffff
294    and encodes it as a UTF-8 character in 0 to 6 bytes.
295    
296    Arguments:
297      cvalue     the character value
298      buffer     pointer to buffer for result - at least 6 bytes long
299    
300    Returns:     number of characters placed in the buffer
301    */
302    
303    static int
304    ord2utf8(int cvalue, uschar *buffer)
305    {
306    register int i, j;
307    for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
308      if (cvalue <= utf8_table1[i]) break;
309    *buffer++ = utf8_table2[i] | (cvalue & utf8_table3[i]);
310    cvalue >>= 6 - i;
311    for (j = 0; j < i; j++)
312      {
313      *buffer++ = 0x80 | (cvalue & 0x3f);
314      cvalue >>= 6;
315      }
316    return i + 1;
317    }
318    #endif
319    
320    
321    
322  /*************************************************  /*************************************************
323  *          Return version string                 *  *          Return version string                 *
324  *************************************************/  *************************************************/
325    
326    #define STRING(a)  # a
327    #define XSTRING(s) STRING(s)
328    
329  const char *  const char *
330  pcre_version(void)  pcre_version(void)
331  {  {
332  return PCRE_VERSION;  return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
333  }  }
334    
335    
336    
337    
338  /*************************************************  /*************************************************
339  *       Return info about a compiled pattern     *  * (Obsolete) Return info about compiled pattern  *
340  *************************************************/  *************************************************/
341    
342  /* This function picks potentially useful data out of the private  /* This is the original "info" function. It picks potentially useful data out
343  structure.  of the private structure, but its interface was too rigid. It remains for
344    backwards compatibility. The public options are passed back in an int - though
345    the re->options field has been expanded to a long int, all the public options
346    at the low end of it, and so even on 16-bit systems this will still be OK.
347    Therefore, I haven't changed the API for pcre_info().
348    
349  Arguments:  Arguments:
350    external_re   points to compiled code    external_re   points to compiled code
# Line 167  Arguments: Line 353  Arguments:
353                  or -1 if multiline and all branches start ^,                  or -1 if multiline and all branches start ^,
354                  or -2 otherwise                  or -2 otherwise
355    
356  Returns:        number of identifying extraction brackets  Returns:        number of capturing subpatterns
357                  or negative values on error                  or negative values on error
358  */  */
359    
# Line 177  pcre_info(const pcre *external_re, int * Line 363  pcre_info(const pcre *external_re, int *
363  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
364  if (re == NULL) return PCRE_ERROR_NULL;  if (re == NULL) return PCRE_ERROR_NULL;
365  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
366  if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);  if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
367  if (first_char != NULL)  if (first_char != NULL)
368    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
369       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
# Line 186  return re->top_bracket; Line 372  return re->top_bracket;
372    
373    
374    
375    /*************************************************
376    *        Return info about compiled pattern      *
377    *************************************************/
378    
379    /* This is a newer "info" function which has an extensible interface so
380    that additional items can be added compatibly.
381    
382    Arguments:
383      external_re      points to compiled code
384      external_study   points to study data, or NULL
385      what             what information is required
386      where            where to put the information
387    
388    Returns:           0 if data returned, negative on error
389    */
390    
391    int
392    pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
393      void *where)
394    {
395    const real_pcre *re = (const real_pcre *)external_re;
396    const real_pcre_extra *study = (const real_pcre_extra *)study_data;
397    
398    if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
399    if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
400    
401    switch (what)
402      {
403      case PCRE_INFO_OPTIONS:
404      *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
405      break;
406    
407      case PCRE_INFO_SIZE:
408      *((size_t *)where) = re->size;
409      break;
410    
411      case PCRE_INFO_CAPTURECOUNT:
412      *((int *)where) = re->top_bracket;
413      break;
414    
415      case PCRE_INFO_BACKREFMAX:
416      *((int *)where) = re->top_backref;
417      break;
418    
419      case PCRE_INFO_FIRSTCHAR:
420      *((int *)where) =
421        ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
422        ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
423      break;
424    
425      case PCRE_INFO_FIRSTTABLE:
426      *((const uschar **)where) =
427        (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
428          study->start_bits : NULL;
429      break;
430    
431      case PCRE_INFO_LASTLITERAL:
432      *((int *)where) =
433        ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
434      break;
435    
436      default: return PCRE_ERROR_BADOPTION;
437      }
438    
439    return 0;
440    }
441    
442    
443    
444  #ifdef DEBUG  #ifdef DEBUG
445  /*************************************************  /*************************************************
# Line 218  while (length-- > 0) Line 472  while (length-- > 0)
472    
473    
474  /*************************************************  /*************************************************
 *         Check subpattern for empty operand     *  
 *************************************************/  
   
 /* This function checks a bracketed subpattern to see if any of the paths  
 through it could match an empty string. This is used to diagnose an error if  
 such a subpattern is followed by a quantifier with an unlimited upper bound.  
   
 Argument:  
   code      points to the opening bracket  
   
 Returns:    TRUE or FALSE  
 */  
   
 static BOOL  
 could_be_empty(uschar *code)  
 {  
 do {  
   uschar *cc = code + 3;  
   
   /* Scan along the opcodes for this branch; as soon as we find something  
   that matches a non-empty string, break out and advance to test the next  
   branch. If we get to the end of the branch, return TRUE for the whole  
   sub-expression. */  
   
   for (;;)  
     {  
     /* Test an embedded subpattern; if it could not be empty, break the  
     loop. Otherwise carry on in the branch. */  
   
     if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)  
       {  
       if (!could_be_empty(cc)) break;  
       do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);  
       cc += 3;  
       }  
   
     else switch (*cc)  
       {  
       /* Reached end of a branch: the subpattern may match the empty string */  
   
       case OP_ALT:  
       case OP_KET:  
       case OP_KETRMAX:  
       case OP_KETRMIN:  
       return TRUE;  
   
       /* Skip over assertive subpatterns */  
   
       case OP_ASSERT:  
       case OP_ASSERT_NOT:  
       do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);  
       cc += 3;  
       break;  
   
       /* Skip over things that don't match chars */  
   
       case OP_SOD:  
       case OP_EOD:  
       case OP_CIRC:  
       case OP_DOLL:  
       case OP_BRAZERO:  
       case OP_BRAMINZERO:  
       case OP_NOT_WORD_BOUNDARY:  
       case OP_WORD_BOUNDARY:  
       cc++;  
       break;  
   
       /* Skip over simple repeats with zero lower bound */  
   
       case OP_STAR:  
       case OP_MINSTAR:  
       case OP_QUERY:  
       case OP_MINQUERY:  
       case OP_NOTSTAR:  
       case OP_NOTMINSTAR:  
       case OP_NOTQUERY:  
       case OP_NOTMINQUERY:  
       case OP_TYPESTAR:  
       case OP_TYPEMINSTAR:  
       case OP_TYPEQUERY:  
       case OP_TYPEMINQUERY:  
       cc += 2;  
       break;  
   
       /* Skip over UPTOs (lower bound is zero) */  
   
       case OP_UPTO:  
       case OP_MINUPTO:  
       case OP_TYPEUPTO:  
       case OP_TYPEMINUPTO:  
       cc += 4;  
       break;  
   
       /* Check a class or a back reference for a zero minimum */  
   
       case OP_CLASS:  
       case OP_NEGCLASS:  
       case OP_REF:  
       cc += (*cc == OP_REF)? 2 : 33;  
   
       switch (*cc)  
         {  
         case OP_CRSTAR:  
         case OP_CRMINSTAR:  
         case OP_CRQUERY:  
         case OP_CRMINQUERY:  
         cc++;  
         break;  
   
         case OP_CRRANGE:  
         case OP_CRMINRANGE:  
         if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;  
         cc += 3;  
         break;  
   
         default:  
         goto NEXT_BRANCH;  
         }  
       break;  
   
       /* Anything else matches at least one character */  
   
       default:  
       goto NEXT_BRANCH;  
       }  
     }  
   
   NEXT_BRANCH:  
   code += (code[1] << 8) + code[2];  
   }  
 while (*code == OP_ALT);  
   
 /* No branches match the empty string */  
   
 return FALSE;  
 }  
   
   
   
 /*************************************************  
475  *            Handle escapes                      *  *            Handle escapes                      *
476  *************************************************/  *************************************************/
477    
478  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
479  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
480  encodes one of the more complicated things such as \d. On entry, ptr is  encodes one of the more complicated things such as \d. When UTF-8 is enabled,
481  pointing at the \. On exit, it is on the final character of the escape  a positive value greater than 255 may be returned. On entry, ptr is pointing at
482  sequence.  the \. On exit, it is on the final character of the escape sequence.
483    
484  Arguments:  Arguments:
485    ptrptr     points to the pattern position pointer    ptrptr     points to the pattern position pointer
# Line 373  Arguments: Line 487  Arguments:
487    bracount   number of previous extracting brackets    bracount   number of previous extracting brackets
488    options    the options bits    options    the options bits
489    isclass    TRUE if inside a character class    isclass    TRUE if inside a character class
490      cd         pointer to char tables block
491    
492  Returns:     zero or positive => a data character  Returns:     zero or positive => a data character
493               negative => a special escape sequence               negative => a special escape sequence
# Line 381  Returns:     zero or positive => a data Line 496  Returns:     zero or positive => a data
496    
497  static int  static int
498  check_escape(const uschar **ptrptr, const char **errorptr, int bracount,  check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
499    int options, BOOL isclass)    int options, BOOL isclass, compile_data *cd)
500  {  {
501  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
502  int c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */  int c, i;
 int i;  
503    
504    /* If backslash is at the end of the pattern, it's an error. */
505    
506    c = *(++ptr);
507  if (c == 0) *errorptr = ERR1;  if (c == 0) *errorptr = ERR1;
508    
509  /* Digits or letters may have special meaning; all others are literals. */  /* Digits or letters may have special meaning; all others are literals. */
# Line 424  else Line 541  else
541        {        {
542        oldptr = ptr;        oldptr = ptr;
543        c -= '0';        c -= '0';
544        while ((pcre_ctypes[ptr[1]] & ctype_digit) != 0)        while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
545          c = c * 10 + *(++ptr) - '0';          c = c * 10 + *(++ptr) - '0';
546        if (c < 10 || c <= bracount)        if (c < 10 || c <= bracount)
547          {          {
# Line 446  else Line 563  else
563        }        }
564    
565      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
566      larger first octal digit */      larger first octal digit. */
567    
568      case '0':      case '0':
569      c -= '0';      c -= '0';
570      while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&      while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
571        ptr[1] != '8' && ptr[1] != '9')        ptr[1] != '8' && ptr[1] != '9')
572          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
573        c &= 255;     /* Take least significant 8 bits */
574      break;      break;
575    
576      /* Special escapes not starting with a digit are straightforward */      /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
577        which can be greater than 0xff, but only if the ddd are hex digits. */
578    
579      case 'x':      case 'x':
580    #ifdef SUPPORT_UTF8
581        if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
582          {
583          const uschar *pt = ptr + 2;
584          register int count = 0;
585          c = 0;
586          while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
587            {
588            count++;
589            c = c * 16 + cd->lcc[*pt] -
590              (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
591            pt++;
592            }
593          if (*pt == '}')
594            {
595            if (c < 0 || count > 8) *errorptr = ERR34;
596            ptr = pt;
597            break;
598            }
599          /* If the sequence of hex digits does not end with '}', then we don't
600          recognize this construct; fall through to the normal \x handling. */
601          }
602    #endif
603    
604        /* Read just a single hex char */
605    
606      c = 0;      c = 0;
607      while (i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
608        {        {
609        ptr++;        ptr++;
610        c = c * 16 + pcre_lcc[*ptr] -        c = c * 16 + cd->lcc[*ptr] -
611          (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');          (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
612        }        }
613      break;      break;
614    
615        /* Other special escapes not starting with a digit are straightforward */
616    
617      case 'c':      case 'c':
618      c = *(++ptr);      c = *(++ptr);
619      if (c == 0)      if (c == 0)
# Line 477  else Line 624  else
624    
625      /* A letter is upper-cased; then the 0x40 bit is flipped */      /* A letter is upper-cased; then the 0x40 bit is flipped */
626    
627      if (c >= 'a' && c <= 'z') c = pcre_fcc[c];      if (c >= 'a' && c <= 'z') c = cd->fcc[c];
628      c ^= 0x40;      c ^= 0x40;
629      break;      break;
630    
631      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
632      other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,      other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
633      for Perl compatibility, it is a literal. */      for Perl compatibility, it is a literal. This code looks a bit odd, but
634        there used to be some cases other than the default, and there may be again
635        in future, so I haven't "optimized" it. */
636    
637      default:      default:
638      if ((options & PCRE_EXTRA) != 0) switch(c)      if ((options & PCRE_EXTRA) != 0) switch(c)
639        {        {
       case 'X':  
       c = -ESC_X;      /* This could be a lookup if it ever got into Perl */  
       break;  
   
640        default:        default:
641        *errorptr = ERR3;        *errorptr = ERR3;
642        break;        break;
# Line 517  where the ddds are digits. Line 662  where the ddds are digits.
662    
663  Arguments:  Arguments:
664    p         pointer to the first char after '{'    p         pointer to the first char after '{'
665      cd        pointer to char tables block
666    
667  Returns:    TRUE or FALSE  Returns:    TRUE or FALSE
668  */  */
669    
670  static BOOL  static BOOL
671  is_counted_repeat(const uschar *p)  is_counted_repeat(const uschar *p, compile_data *cd)
672  {  {
673  if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;  if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
674  while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;  while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
675  if (*p == '}') return TRUE;  if (*p == '}') return TRUE;
676    
677  if (*p++ != ',') return FALSE;  if (*p++ != ',') return FALSE;
678  if (*p == '}') return TRUE;  if (*p == '}') return TRUE;
679    
680  if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;  if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
681  while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;  while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
682  return (*p == '}');  return (*p == '}');
683  }  }
684    
# Line 552  Arguments: Line 698  Arguments:
698    maxp       pointer to int for max    maxp       pointer to int for max
699               returned as -1 if no max               returned as -1 if no max
700    errorptr   points to pointer to error message    errorptr   points to pointer to error message
701      cd         pointer to character tables clock
702    
703  Returns:     pointer to '}' on success;  Returns:     pointer to '}' on success;
704               current ptr on error, with errorptr set               current ptr on error, with errorptr set
705  */  */
706    
707  static const uschar *  static const uschar *
708  read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)  read_repeat_counts(const uschar *p, int *minp, int *maxp,
709      const char **errorptr, compile_data *cd)
710  {  {
711  int min = 0;  int min = 0;
712  int max = -1;  int max = -1;
713    
714  while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';  while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
715    
716  if (*p == '}') max = min; else  if (*p == '}') max = min; else
717    {    {
718    if (*(++p) != '}')    if (*(++p) != '}')
719      {      {
720      max = 0;      max = 0;
721      while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';      while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
722      if (max < min)      if (max < min)
723        {        {
724        *errorptr = ERR4;        *errorptr = ERR4;
# Line 595  return p; Line 743  return p;
743    
744    
745  /*************************************************  /*************************************************
746    *        Find the fixed length of a pattern      *
747    *************************************************/
748    
749    /* Scan a pattern and compute the fixed length of subject that will match it,
750    if the length is fixed. This is needed for dealing with backward assertions.
751    
752    Arguments:
753      code     points to the start of the pattern (the bracket)
754      options  the compiling options
755    
756    Returns:   the fixed length, or -1 if there is no fixed length
757    */
758    
759    static int
760    find_fixedlength(uschar *code, int options)
761    {
762    int length = -1;
763    
764    register int branchlength = 0;
765    register uschar *cc = code + 3;
766    
767    /* Scan along the opcodes for this branch. If we get to the end of the
768    branch, check the length against that of the other branches. */
769    
770    for (;;)
771      {
772      int d;
773      register int op = *cc;
774      if (op >= OP_BRA) op = OP_BRA;
775    
776      switch (op)
777        {
778        case OP_BRA:
779        case OP_ONCE:
780        case OP_COND:
781        d = find_fixedlength(cc, options);
782        if (d < 0) return -1;
783        branchlength += d;
784        do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
785        cc += 3;
786        break;
787    
788        /* Reached end of a branch; if it's a ket it is the end of a nested
789        call. If it's ALT it is an alternation in a nested call. If it is
790        END it's the end of the outer call. All can be handled by the same code. */
791    
792        case OP_ALT:
793        case OP_KET:
794        case OP_KETRMAX:
795        case OP_KETRMIN:
796        case OP_END:
797        if (length < 0) length = branchlength;
798          else if (length != branchlength) return -1;
799        if (*cc != OP_ALT) return length;
800        cc += 3;
801        branchlength = 0;
802        break;
803    
804        /* Skip over assertive subpatterns */
805    
806        case OP_ASSERT:
807        case OP_ASSERT_NOT:
808        case OP_ASSERTBACK:
809        case OP_ASSERTBACK_NOT:
810        do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
811        cc += 3;
812        break;
813    
814        /* Skip over things that don't match chars */
815    
816        case OP_REVERSE:
817        cc++;
818        /* Fall through */
819    
820        case OP_CREF:
821        case OP_OPT:
822        cc++;
823        /* Fall through */
824    
825        case OP_SOD:
826        case OP_EOD:
827        case OP_EODN:
828        case OP_CIRC:
829        case OP_DOLL:
830        case OP_NOT_WORD_BOUNDARY:
831        case OP_WORD_BOUNDARY:
832        cc++;
833        break;
834    
835        /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
836        This requires a scan of the string, unfortunately. We assume valid UTF-8
837        strings, so all we do is reduce the length by one for byte whose bits are
838        10xxxxxx. */
839    
840        case OP_CHARS:
841        branchlength += *(++cc);
842    #ifdef SUPPORT_UTF8
843        for (d = 1; d <= *cc; d++)
844          if ((cc[d] & 0xc0) == 0x80) branchlength--;
845    #endif
846        cc += *cc + 1;
847        break;
848    
849        /* Handle exact repetitions */
850    
851        case OP_EXACT:
852        case OP_TYPEEXACT:
853        branchlength += (cc[1] << 8) + cc[2];
854        cc += 4;
855        break;
856    
857        /* Handle single-char matchers */
858    
859        case OP_NOT_DIGIT:
860        case OP_DIGIT:
861        case OP_NOT_WHITESPACE:
862        case OP_WHITESPACE:
863        case OP_NOT_WORDCHAR:
864        case OP_WORDCHAR:
865        case OP_ANY:
866        branchlength++;
867        cc++;
868        break;
869    
870    
871        /* Check a class for variable quantification */
872    
873        case OP_CLASS:
874        cc += (*cc == OP_REF)? 2 : 33;
875    
876        switch (*cc)
877          {
878          case OP_CRSTAR:
879          case OP_CRMINSTAR:
880          case OP_CRQUERY:
881          case OP_CRMINQUERY:
882          return -1;
883    
884          case OP_CRRANGE:
885          case OP_CRMINRANGE:
886          if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
887          branchlength += (cc[1] << 8) + cc[2];
888          cc += 5;
889          break;
890    
891          default:
892          branchlength++;
893          }
894        break;
895    
896        /* Anything else is variable length */
897    
898        default:
899        return -1;
900        }
901      }
902    /* Control never gets here */
903    }
904    
905    
906    
907    
908    /*************************************************
909    *           Check for POSIX class syntax         *
910    *************************************************/
911    
912    /* This function is called when the sequence "[:" or "[." or "[=" is
913    encountered in a character class. It checks whether this is followed by an
914    optional ^ and then a sequence of letters, terminated by a matching ":]" or
915    ".]" or "=]".
916    
917    Argument:
918      ptr      pointer to the initial [
919      endptr   where to return the end pointer
920      cd       pointer to compile data
921    
922    Returns:   TRUE or FALSE
923    */
924    
925    static BOOL
926    check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
927    {
928    int terminator;          /* Don't combine these lines; the Solaris cc */
929    terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
930    if (*(++ptr) == '^') ptr++;
931    while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
932    if (*ptr == terminator && ptr[1] == ']')
933      {
934      *endptr = ptr;
935      return TRUE;
936      }
937    return FALSE;
938    }
939    
940    
941    
942    
943    /*************************************************
944    *          Check POSIX class name                *
945    *************************************************/
946    
947    /* This function is called to check the name given in a POSIX-style class entry
948    such as [:alnum:].
949    
950    Arguments:
951      ptr        points to the first letter
952      len        the length of the name
953    
954    Returns:     a value representing the name, or -1 if unknown
955    */
956    
957    static int
958    check_posix_name(const uschar *ptr, int len)
959    {
960    register int yield = 0;
961    while (posix_name_lengths[yield] != 0)
962      {
963      if (len == posix_name_lengths[yield] &&
964        strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
965      yield++;
966      }
967    return -1;
968    }
969    
970    
971    
972    
973    /*************************************************
974  *           Compile one branch                   *  *           Compile one branch                   *
975  *************************************************/  *************************************************/
976    
977  /* Scan the pattern, compiling it into the code vector.  /* Scan the pattern, compiling it into the code vector.
978    
979  Arguments:  Arguments:
980    options    the option bits    options      the option bits
981    bracket    points to number of brackets used    brackets     points to number of brackets used
982    code       points to the pointer to the current code point    code         points to the pointer to the current code point
983    ptrptr     points to the current pattern pointer    ptrptr       points to the current pattern pointer
984    errorptr   points to pointer to error message    errorptr     points to pointer to error message
985      optchanged   set to the value of the last OP_OPT item compiled
986      reqchar      set to the last literal character required, else -1
987      countlits    set to count of mandatory literal characters
988      cd           contains pointers to tables
989    
990  Returns:     TRUE on success  Returns:       TRUE on success
991               FALSE, with *errorptr set on error                 FALSE, with *errorptr set on error
992  */  */
993    
994  static BOOL  static BOOL
995  compile_branch(int options, int *brackets, uschar **codeptr,  compile_branch(int options, int *brackets, uschar **codeptr,
996    const uschar **ptrptr, const char **errorptr)    const uschar **ptrptr, const char **errorptr, int *optchanged,
997      int *reqchar, int *countlits, compile_data *cd)
998  {  {
999  int repeat_type, op_type;  int repeat_type, op_type;
1000  int repeat_min, repeat_max;  int repeat_min, repeat_max;
1001  int bravalue, length;  int bravalue, length;
1002    int greedy_default, greedy_non_default;
1003    int prevreqchar;
1004    int condcount = 0;
1005    int subcountlits = 0;
1006  register int c;  register int c;
1007  register uschar *code = *codeptr;  register uschar *code = *codeptr;
1008    uschar *tempcode;
1009  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
1010  const uschar *oldptr;  const uschar *tempptr;
1011  uschar *previous = NULL;  uschar *previous = NULL;
1012  uschar class[32];  uschar class[32];
1013    
1014    /* Set up the default and non-default settings for greediness */
1015    
1016    greedy_default = ((options & PCRE_UNGREEDY) != 0);
1017    greedy_non_default = greedy_default ^ 1;
1018    
1019    /* Initialize no required char, and count of literals */
1020    
1021    *reqchar = prevreqchar = -1;
1022    *countlits = 0;
1023    
1024  /* Switch on next character until the end of the branch */  /* Switch on next character until the end of the branch */
1025    
1026  for (;; ptr++)  for (;; ptr++)
1027    {    {
1028    BOOL negate_class;    BOOL negate_class;
1029    int  class_charcount;    int class_charcount;
1030    int  class_lastchar;    int class_lastchar;
1031      int newoptions;
1032      int condref;
1033      int subreqchar;
1034    
1035    c = *ptr;    c = *ptr;
1036    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
1037      {      {
1038      if ((pcre_ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
1039      if (c == '#')      if (c == '#')
1040        {        {
1041        while ((c = *(++ptr)) != 0 && c != '\n');        /* The space before the ; is to avoid a warning on a silly compiler
1042          on the Macintosh. */
1043          while ((c = *(++ptr)) != 0 && c != '\n') ;
1044        continue;        continue;
1045        }        }
1046      }      }
# Line 679  for (;; ptr++) Line 1080  for (;; ptr++)
1080    
1081      case '[':      case '[':
1082      previous = code;      previous = code;
1083        *code++ = OP_CLASS;
1084    
1085      /* If the first character is '^', set the negation flag, and use a      /* If the first character is '^', set the negation flag and skip it. */
     different opcode. This only matters if caseless matching is specified at  
     runtime. */  
1086    
1087      if ((c = *(++ptr)) == '^')      if ((c = *(++ptr)) == '^')
1088        {        {
1089        negate_class = TRUE;        negate_class = TRUE;
       *code++ = OP_NEGCLASS;  
1090        c = *(++ptr);        c = *(++ptr);
1091        }        }
1092      else      else negate_class = FALSE;
       {  
       negate_class = FALSE;  
       *code++ = OP_CLASS;  
       }  
1093    
1094      /* Keep a count of chars so that we can optimize the case of just a single      /* Keep a count of chars so that we can optimize the case of just a single
1095      character. */      character. */
# Line 720  for (;; ptr++) Line 1115  for (;; ptr++)
1115          goto FAILED;          goto FAILED;
1116          }          }
1117    
1118          /* Handle POSIX class names. Perl allows a negation extension of the
1119          form [:^name]. A square bracket that doesn't match the syntax is
1120          treated as a literal. We also recognize the POSIX constructions
1121          [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1122          5.6 does. */
1123    
1124          if (c == '[' &&
1125              (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1126              check_posix_syntax(ptr, &tempptr, cd))
1127            {
1128            BOOL local_negate = FALSE;
1129            int posix_class, i;
1130            register const uschar *cbits = cd->cbits;
1131    
1132            if (ptr[1] != ':')
1133              {
1134              *errorptr = ERR31;
1135              goto FAILED;
1136              }
1137    
1138            ptr += 2;
1139            if (*ptr == '^')
1140              {
1141              local_negate = TRUE;
1142              ptr++;
1143              }
1144    
1145            posix_class = check_posix_name(ptr, tempptr - ptr);
1146            if (posix_class < 0)
1147              {
1148              *errorptr = ERR30;
1149              goto FAILED;
1150              }
1151    
1152            /* If matching is caseless, upper and lower are converted to
1153            alpha. This relies on the fact that the class table starts with
1154            alpha, lower, upper as the first 3 entries. */
1155    
1156            if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1157              posix_class = 0;
1158    
1159            /* Or into the map we are building up to 3 of the static class
1160            tables, or their negations. */
1161    
1162            posix_class *= 3;
1163            for (i = 0; i < 3; i++)
1164              {
1165              int taboffset = posix_class_maps[posix_class + i];
1166              if (taboffset < 0) break;
1167              if (local_negate)
1168                for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1169              else
1170                for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1171              }
1172    
1173            ptr = tempptr + 1;
1174            class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
1175            continue;
1176            }
1177    
1178        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
1179        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. Escaped items are checked for
1180        validity in the pre-compiling pass. The sequence \b is a special case.        validity in the pre-compiling pass. The sequence \b is a special case.
# Line 730  for (;; ptr++) Line 1185  for (;; ptr++)
1185    
1186        if (c == '\\')        if (c == '\\')
1187          {          {
1188          c = check_escape(&ptr, errorptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1189          if (-c == ESC_b) c = '\b';          if (-c == ESC_b) c = '\b';
1190          else if (c < 0)          else if (c < 0)
1191            {            {
1192              register const uschar *cbits = cd->cbits;
1193            class_charcount = 10;            class_charcount = 10;
1194            switch (-c)            switch (-c)
1195              {              {
1196              case ESC_d:              case ESC_d:
1197              for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1198              continue;              continue;
1199    
1200              case ESC_D:              case ESC_D:
1201              for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1202              continue;              continue;
1203    
1204              case ESC_w:              case ESC_w:
1205              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
               class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);  
1206              continue;              continue;
1207    
1208              case ESC_W:              case ESC_W:
1209              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
               class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);  
1210              continue;              continue;
1211    
1212              case ESC_s:              case ESC_s:
1213              for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1214              continue;              continue;
1215    
1216              case ESC_S:              case ESC_S:
1217              for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1218              continue;              continue;
1219    
1220              default:              default:
# Line 768  for (;; ptr++) Line 1222  for (;; ptr++)
1222              goto FAILED;              goto FAILED;
1223              }              }
1224            }            }
1225          /* Fall through if single character */  
1226            /* Fall through if single character, but don't at present allow
1227            chars > 255 in UTF-8 mode. */
1228    
1229    #ifdef SUPPORT_UTF8
1230            if (c > 255)
1231              {
1232              *errorptr = ERR33;
1233              goto FAILED;
1234              }
1235    #endif
1236          }          }
1237    
1238        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
# Line 788  for (;; ptr++) Line 1252  for (;; ptr++)
1252            }            }
1253    
1254          /* The second part of a range can be a single-character escape, but          /* The second part of a range can be a single-character escape, but
1255          not any of the other escapes. */          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1256            in such circumstances. */
1257    
1258          if (d == '\\')          if (d == '\\')
1259            {            {
1260            d = check_escape(&ptr, errorptr, *brackets, options, TRUE);            const uschar *oldptr = ptr;
1261              d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1262    
1263    #ifdef SUPPORT_UTF8
1264              if (d > 255)
1265                {
1266                *errorptr = ERR33;
1267                goto FAILED;
1268                }
1269    #endif
1270              /* \b is backslash; any other special means the '-' was literal */
1271    
1272            if (d < 0)            if (d < 0)
1273              {              {
1274              if (d == -ESC_b) d = '\b'; else              if (d == -ESC_b) d = '\b'; else
1275                {                {
1276                *errorptr = ERR7;                ptr = oldptr - 2;
1277                goto FAILED;                goto SINGLE_CHARACTER;  /* A few lines below */
1278                }                }
1279              }              }
1280            }            }
# Line 814  for (;; ptr++) Line 1290  for (;; ptr++)
1290            class[c/8] |= (1 << (c&7));            class[c/8] |= (1 << (c&7));
1291            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
1292              {              {
1293              int uc = pcre_fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
1294              class[uc/8] |= (1 << (uc&7));              class[uc/8] |= (1 << (uc&7));
1295              }              }
1296            class_charcount++;                /* in case a one-char range */            class_charcount++;                /* in case a one-char range */
# Line 826  for (;; ptr++) Line 1302  for (;; ptr++)
1302        /* Handle a lone single character - we can get here for a normal        /* Handle a lone single character - we can get here for a normal
1303        non-escape char, or after \ that introduces a single character. */        non-escape char, or after \ that introduces a single character. */
1304    
1305          SINGLE_CHARACTER:
1306    
1307        class [c/8] |= (1 << (c&7));        class [c/8] |= (1 << (c&7));
1308        if ((options & PCRE_CASELESS) != 0)        if ((options & PCRE_CASELESS) != 0)
1309          {          {
1310          c = pcre_fcc[c];   /* flip case */          c = cd->fcc[c];   /* flip case */
1311          class[c/8] |= (1 << (c&7));          class[c/8] |= (1 << (c&7));
1312          }          }
1313        class_charcount++;        class_charcount++;
# Line 876  for (;; ptr++) Line 1354  for (;; ptr++)
1354      /* Various kinds of repeat */      /* Various kinds of repeat */
1355    
1356      case '{':      case '{':
1357      if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;      if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
1358      ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);      ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
1359      if (*errorptr != NULL) goto FAILED;      if (*errorptr != NULL) goto FAILED;
1360      goto REPEAT;      goto REPEAT;
1361    
# Line 902  for (;; ptr++) Line 1380  for (;; ptr++)
1380        goto FAILED;        goto FAILED;
1381        }        }
1382    
1383      /* If the next character is '?' this is a minimizing repeat. Advance to the      /* If the next character is '?' this is a minimizing repeat, by default,
1384        but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1385      next character. */      next character. */
1386    
1387      if (ptr[1] == '?') { repeat_type = 1; ptr++; } else repeat_type = 0;      if (ptr[1] == '?')
1388          { repeat_type = greedy_non_default; ptr++; }
1389      /* If the maximum is zero then the minimum must also be zero; Perl allows      else repeat_type = greedy_default;
     this case, so we do too - by simply omitting the item altogether. */  
   
     if (repeat_max == 0) code = previous;  
1390    
1391      /* If previous was a string of characters, chop off the last one and use it      /* If previous was a string of characters, chop off the last one and use it
1392      as the subject of the repeat. If there was only one character, we can      as the subject of the repeat. If there was only one character, we can
1393      abolish the previous item altogether. */      abolish the previous item altogether. A repeat with a zero minimum wipes
1394        out any reqchar setting, backing up to the previous value. We must also
1395        adjust the countlits value. */
1396    
1397      else if (*previous == OP_CHARS)      if (*previous == OP_CHARS)
1398        {        {
1399        int len = previous[1];        int len = previous[1];
1400    
1401          if (repeat_min == 0) *reqchar = prevreqchar;
1402          *countlits += repeat_min - 1;
1403    
1404        if (len == 1)        if (len == 1)
1405          {          {
1406          c = previous[2];          c = previous[2];
# Line 950  for (;; ptr++) Line 1432  for (;; ptr++)
1432      create a suitable repeat item. The code is shared with single-character      create a suitable repeat item. The code is shared with single-character
1433      repeats by adding a suitable offset into repeat_type. */      repeats by adding a suitable offset into repeat_type. */
1434    
1435      else if ((int)*previous < OP_EOD || *previous == OP_ANY)      else if ((int)*previous < OP_EODN || *previous == OP_ANY)
1436        {        {
1437        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
1438        c = *previous;        c = *previous;
1439        code = previous;        code = previous;
1440    
1441        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
1442        repeat_type += op_type;      /* Combine both values for many cases */  
1443          /* If the maximum is zero then the minimum must also be zero; Perl allows
1444          this case, so we do too - by simply omitting the item altogether. */
1445    
1446          if (repeat_max == 0) goto END_REPEAT;
1447    
1448          /* Combine the op_type with the repeat_type */
1449    
1450          repeat_type += op_type;
1451    
1452        /* A minimum of zero is handled either as the special case * or ?, or as        /* A minimum of zero is handled either as the special case * or ?, or as
1453        an UPTO, with the maximum given. */        an UPTO, with the maximum given. */
# Line 994  for (;; ptr++) Line 1484  for (;; ptr++)
1484          /* If the mininum is 1 and the previous item was a character string,          /* If the mininum is 1 and the previous item was a character string,
1485          we either have to put back the item that got cancelled if the string          we either have to put back the item that got cancelled if the string
1486          length was 1, or add the character back onto the end of a longer          length was 1, or add the character back onto the end of a longer
1487          string. For a character type nothing need be done; it will just get put          string. For a character type nothing need be done; it will just get
1488          back naturally. */          put back naturally. Note that the final character is always going to
1489            get added below. */
1490    
1491          else if (*previous == OP_CHARS)          else if (*previous == OP_CHARS)
1492            {            {
1493            if (code == previous) code += 2; else previous[1]++;            if (code == previous) code += 2; else previous[1]++;
1494            }            }
1495    
1496            /*  For a single negated character we also have to put back the
1497            item that got cancelled. */
1498    
1499            else if (*previous == OP_NOT) code++;
1500    
1501          /* If the maximum is unlimited, insert an OP_STAR. */          /* If the maximum is unlimited, insert an OP_STAR. */
1502    
1503          if (repeat_max < 0)          if (repeat_max < 0)
# Line 1028  for (;; ptr++) Line 1524  for (;; ptr++)
1524        }        }
1525    
1526      /* If previous was a character class or a back reference, we put the repeat      /* If previous was a character class or a back reference, we put the repeat
1527      stuff after it. */      stuff after it, but just skip the item if the repeat was {0,0}. */
1528    
1529      else if (*previous == OP_CLASS || *previous == OP_NEGCLASS ||      else if (*previous == OP_CLASS || *previous == OP_REF)
              *previous == OP_REF)  
1530        {        {
1531          if (repeat_max == 0)
1532            {
1533            code = previous;
1534            goto END_REPEAT;
1535            }
1536        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
1537          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
1538        else if (repeat_min == 1 && repeat_max == -1)        else if (repeat_min == 1 && repeat_max == -1)
# Line 1051  for (;; ptr++) Line 1551  for (;; ptr++)
1551        }        }
1552    
1553      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
1554      cases. If the maximum repeat count is unlimited, check that the bracket      cases. */
     group cannot match the empty string, and diagnose an error if it can. */  
1555    
1556      else if ((int)*previous >= OP_BRA)      else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1557                 (int)*previous == OP_COND)
1558        {        {
1559        int i;        register int i;
1560          int ketoffset = 0;
1561        int len = code - previous;        int len = code - previous;
1562          uschar *bralink = NULL;
1563    
1564          /* If the maximum repeat count is unlimited, find the end of the bracket
1565          by scanning through from the start, and compute the offset back to it
1566          from the current code pointer. There may be an OP_OPT setting following
1567          the final KET, so we can't find the end just by going back from the code
1568          pointer. */
1569    
1570          if (repeat_max == -1)
1571            {
1572            register uschar *ket = previous;
1573            do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1574            ketoffset = code - ket;
1575            }
1576    
1577          /* The case of a zero minimum is special because of the need to stick
1578          OP_BRAZERO in front of it, and because the group appears once in the
1579          data, whereas in other cases it appears the minimum number of times. For
1580          this reason, it is simplest to treat this case separately, as otherwise
1581          the code gets far too mess. There are several special subcases when the
1582          minimum is zero. */
1583    
1584        if (repeat_max == -1 && could_be_empty(previous))        if (repeat_min == 0)
1585          {          {
1586          *errorptr = ERR10;          /* If we set up a required char from the bracket, we must back off
1587          goto FAILED;          to the previous value and reset the countlits value too. */
         }  
1588    
1589        /* If the minimum is greater than zero, and the maximum is unlimited or          if (subcountlits > 0)
1590        equal to the minimum, the first copy remains where it is, and is            {
1591        replicated up to the minimum number of times. This case includes the +            *reqchar = prevreqchar;
1592        repeat, but of course no replication is needed in that case. */            *countlits -= subcountlits;
1593              }
1594    
1595        if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))          /* If the maximum is also zero, we just omit the group from the output
1596          {          altogether. */
1597          for (i = 1; i < repeat_min; i++)  
1598            if (repeat_max == 0)
1599            {            {
1600            memcpy(code, previous, len);            code = previous;
1601            code += len;            goto END_REPEAT;
1602            }            }
         }  
1603    
1604        /* If the minimum is zero, stick BRAZERO in front of the first copy.          /* If the maximum is 1 or unlimited, we just have to stick in the
1605        Then, if there is a fixed upper limit, replicated up to that many times,          BRAZERO and do no more at this point. */
       sticking BRAZERO in front of all the optional ones. */  
1606    
1607        else          if (repeat_max <= 1)
         {  
         if (repeat_min == 0)  
1608            {            {
1609            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
1610            code++;            code++;
1611            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
1612            }            }
1613    
1614            /* If the maximum is greater than 1 and limited, we have to replicate
1615            in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1616            The first one has to be handled carefully because it's the original
1617            copy, which has to be moved up. The remainder can be handled by code
1618            that is common with the non-zero minimum case below. We just have to
1619            adjust the value or repeat_max, since one less copy is required. */
1620    
1621            else
1622              {
1623              int offset;
1624              memmove(previous+4, previous, len);
1625              code += 4;
1626              *previous++ = OP_BRAZERO + repeat_type;
1627              *previous++ = OP_BRA;
1628    
1629              /* We chain together the bracket offset fields that have to be
1630              filled in later when the ends of the brackets are reached. */
1631    
1632              offset = (bralink == NULL)? 0 : previous - bralink;
1633              bralink = previous;
1634              *previous++ = offset >> 8;
1635              *previous++ = offset & 255;
1636              }
1637    
1638            repeat_max--;
1639            }
1640    
1641          /* If the minimum is greater than zero, replicate the group as many
1642          times as necessary, and adjust the maximum to the number of subsequent
1643          copies that we need. */
1644    
1645          else
1646            {
1647          for (i = 1; i < repeat_min; i++)          for (i = 1; i < repeat_min; i++)
1648            {            {
1649            memcpy(code, previous, len);            memcpy(code, previous, len);
1650            code += len;            code += len;
1651            }            }
1652            if (repeat_max > 0) repeat_max -= repeat_min;
1653            }
1654    
1655          for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)        /* This code is common to both the zero and non-zero minimum cases. If
1656          the maximum is limited, it replicates the group in a nested fashion,
1657          remembering the bracket starts on a stack. In the case of a zero minimum,
1658          the first one was set up above. In all cases the repeat_max now specifies
1659          the number of additional copies needed. */
1660    
1661          if (repeat_max >= 0)
1662            {
1663            for (i = repeat_max - 1; i >= 0; i--)
1664            {            {
1665            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
1666    
1667              /* All but the final copy start a new nesting, maintaining the
1668              chain of brackets outstanding. */
1669    
1670              if (i != 0)
1671                {
1672                int offset;
1673                *code++ = OP_BRA;
1674                offset = (bralink == NULL)? 0 : code - bralink;
1675                bralink = code;
1676                *code++ = offset >> 8;
1677                *code++ = offset & 255;
1678                }
1679    
1680            memcpy(code, previous, len);            memcpy(code, previous, len);
1681            code += len;            code += len;
1682            }            }
1683    
1684            /* Now chain through the pending brackets, and fill in their length
1685            fields (which are holding the chain links pro tem). */
1686    
1687            while (bralink != NULL)
1688              {
1689              int oldlinkoffset;
1690              int offset = code - bralink + 1;
1691              uschar *bra = code - offset;
1692              oldlinkoffset = (bra[1] << 8) + bra[2];
1693              bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1694              *code++ = OP_KET;
1695              *code++ = bra[1] = offset >> 8;
1696              *code++ = bra[2] = (offset & 255);
1697              }
1698          }          }
1699    
1700        /* If the maximum is unlimited, set a repeater in the final copy. */        /* If the maximum is unlimited, set a repeater in the final copy. We
1701          can't just offset backwards from the current code point, because we
1702          don't know if there's been an options resetting after the ket. The
1703          correct offset was computed above. */
1704    
1705        if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;        else code[-ketoffset] = OP_KETRMAX + repeat_type;
1706        }        }
1707    
1708      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 1121  for (;; ptr++) Line 1715  for (;; ptr++)
1715    
1716      /* In all case we no longer have a previous item. */      /* In all case we no longer have a previous item. */
1717    
1718        END_REPEAT:
1719      previous = NULL;      previous = NULL;
1720      break;      break;
1721    
1722    
1723      /* Start of nested bracket sub-expression, or comment or lookahead.      /* Start of nested bracket sub-expression, or comment or lookahead or
1724      First deal with special things that can come after a bracket; all are      lookbehind or option setting or condition. First deal with special things
1725      introduced by ?, and the appearance of any of them means that this is not a      that can come after a bracket; all are introduced by ?, and the appearance
1726      referencing group. They were checked for validity in the first pass over      of any of them means that this is not a referencing group. They were
1727      the string, so we don't have to check for syntax errors here.  */      checked for validity in the first pass over the string, so we don't have to
1728        check for syntax errors here.  */
1729    
1730      case '(':      case '(':
1731      previous = code;              /* Only real brackets can be repeated */      newoptions = options;
1732        condref = -1;
1733    
1734      if (*(++ptr) == '?')      if (*(++ptr) == '?')
1735        {        {
1736        bravalue = OP_BRA;        int set, unset;
1737          int *optset;
1738    
1739        switch (*(++ptr))        switch (*(++ptr))
1740          {          {
1741          case '#':          case '#':                 /* Comment; skip to ket */
         case 'i':  
         case 'm':  
         case 's':  
         case 'x':  
1742          ptr++;          ptr++;
1743          while (*ptr != ')') ptr++;          while (*ptr != ')') ptr++;
         previous = NULL;  
1744          continue;          continue;
1745    
1746          case ':':                 /* Non-extracting bracket */          case ':':                 /* Non-extracting bracket */
1747            bravalue = OP_BRA;
1748          ptr++;          ptr++;
1749          break;          break;
1750    
1751          case '=':                 /* Assertions can't be repeated */          case '(':
1752            bravalue = OP_COND;       /* Conditional group */
1753            if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1754              {
1755              condref = *ptr - '0';
1756              while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1757              if (condref == 0)
1758                {
1759                *errorptr = ERR35;
1760                goto FAILED;
1761                }
1762              ptr++;
1763              }
1764            else ptr--;
1765            break;
1766    
1767            case '=':                 /* Positive lookahead */
1768          bravalue = OP_ASSERT;          bravalue = OP_ASSERT;
1769          ptr++;          ptr++;
         previous = NULL;  
1770          break;          break;
1771    
1772          case '!':          case '!':                 /* Negative lookahead */
1773          bravalue = OP_ASSERT_NOT;          bravalue = OP_ASSERT_NOT;
1774          ptr++;          ptr++;
         previous = NULL;  
1775          break;          break;
1776    
1777          case '>':                         /* "Match once" brackets */          case '<':                 /* Lookbehinds */
1778          if ((options & PCRE_EXTRA) != 0)  /* Not yet standard */          switch (*(++ptr))
1779            {            {
1780            bravalue = OP_ONCE;            case '=':               /* Positive lookbehind */
1781              bravalue = OP_ASSERTBACK;
1782              ptr++;
1783              break;
1784    
1785              case '!':               /* Negative lookbehind */
1786              bravalue = OP_ASSERTBACK_NOT;
1787            ptr++;            ptr++;
           previous = NULL;  
1788            break;            break;
1789    
1790              default:                /* Syntax error */
1791              *errorptr = ERR24;
1792              goto FAILED;
1793            }            }
1794          /* Else fall through */          break;
1795    
1796          default:          case '>':                 /* One-time brackets */
1797          *errorptr = ERR12;          bravalue = OP_ONCE;
1798          goto FAILED;          ptr++;
1799            break;
1800    
1801            case 'R':                 /* Pattern recursion */
1802            *code++ = OP_RECURSE;
1803            ptr++;
1804            continue;
1805    
1806            default:                  /* Option setting */
1807            set = unset = 0;
1808            optset = &set;
1809    
1810            while (*ptr != ')' && *ptr != ':')
1811              {
1812              switch (*ptr++)
1813                {
1814                case '-': optset = &unset; break;
1815    
1816                case 'i': *optset |= PCRE_CASELESS; break;
1817                case 'm': *optset |= PCRE_MULTILINE; break;
1818                case 's': *optset |= PCRE_DOTALL; break;
1819                case 'x': *optset |= PCRE_EXTENDED; break;
1820                case 'U': *optset |= PCRE_UNGREEDY; break;
1821                case 'X': *optset |= PCRE_EXTRA; break;
1822    
1823                default:
1824                *errorptr = ERR12;
1825                goto FAILED;
1826                }
1827              }
1828    
1829            /* Set up the changed option bits, but don't change anything yet. */
1830    
1831            newoptions = (options | set) & (~unset);
1832    
1833            /* If the options ended with ')' this is not the start of a nested
1834            group with option changes, so the options change at this level. At top
1835            level there is nothing else to be done (the options will in fact have
1836            been set from the start of compiling as a result of the first pass) but
1837            at an inner level we must compile code to change the ims options if
1838            necessary, and pass the new setting back so that it can be put at the
1839            start of any following branches, and when this group ends, a resetting
1840            item can be compiled. */
1841    
1842            if (*ptr == ')')
1843              {
1844              if ((options & PCRE_INGROUP) != 0 &&
1845                  (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1846                {
1847                *code++ = OP_OPT;
1848                *code++ = *optchanged = newoptions & PCRE_IMS;
1849                }
1850              options = newoptions;  /* Change options at this level */
1851              previous = NULL;       /* This item can't be repeated */
1852              continue;              /* It is complete */
1853              }
1854    
1855            /* If the options ended with ':' we are heading into a nested group
1856            with possible change of options. Such groups are non-capturing and are
1857            not assertions of any kind. All we need to do is skip over the ':';
1858            the newoptions value is handled below. */
1859    
1860            bravalue = OP_BRA;
1861            ptr++;
1862          }          }
1863        }        }
1864    
1865      /* Else we have a referencing group */      /* Else we have a referencing group; adjust the opcode. */
1866    
1867      else      else
1868        {        {
# Line 1193  for (;; ptr++) Line 1874  for (;; ptr++)
1874        bravalue = OP_BRA + *brackets;        bravalue = OP_BRA + *brackets;
1875        }        }
1876    
1877      /* Process nested bracketed re; at end pointer is on the bracket. We copy      /* Process nested bracketed re. Assertions may not be repeated, but other
1878      code into a non-register variable in order to be able to pass its address      kinds can be. We copy code into a non-register variable in order to be able
1879      because some compilers complain otherwise. */      to pass its address because some compilers complain otherwise. Pass in a
1880        new setting for the ims options if they have changed. */
1881    
1882        previous = (bravalue >= OP_ONCE)? code : NULL;
1883      *code = bravalue;      *code = bravalue;
1884        tempcode = code;
1885    
1886        if (!compile_regex(
1887             options | PCRE_INGROUP,       /* Set for all nested groups */
1888             ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1889               newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1890             brackets,                     /* Bracket level */
1891             &tempcode,                    /* Where to put code (updated) */
1892             &ptr,                         /* Input pointer (updated) */
1893             errorptr,                     /* Where to put an error message */
1894             (bravalue == OP_ASSERTBACK ||
1895              bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1896             condref,                      /* Condition reference number */
1897             &subreqchar,                  /* For possible last char */
1898             &subcountlits,                /* For literal count */
1899             cd))                          /* Tables block */
1900          goto FAILED;
1901    
1902        /* At the end of compiling, code is still pointing to the start of the
1903        group, while tempcode has been updated to point past the end of the group
1904        and any option resetting that may follow it. The pattern pointer (ptr)
1905        is on the bracket. */
1906    
1907        /* If this is a conditional bracket, check that there are no more than
1908        two branches in the group. */
1909    
1910        if (bravalue == OP_COND)
1911        {        {
1912        uschar *mcode = code;        uschar *tc = code;
1913        if (!compile_regex(options, brackets, &mcode, &ptr, errorptr))        condcount = 0;
1914    
1915          do {
1916             condcount++;
1917             tc += (tc[1] << 8) | tc[2];
1918             }
1919          while (*tc != OP_KET);
1920    
1921          if (condcount > 2)
1922            {
1923            *errorptr = ERR27;
1924          goto FAILED;          goto FAILED;
1925        code = mcode;          }
1926        }        }
1927    
1928        /* Handle updating of the required character. If the subpattern didn't
1929        set one, leave it as it was. Otherwise, update it for normal brackets of
1930        all kinds, forward assertions, and conditions with two branches. Don't
1931        update the literal count for forward assertions, however. If the bracket
1932        is followed by a quantifier with zero repeat, we have to back off. Hence
1933        the definition of prevreqchar and subcountlits outside the main loop so
1934        that they can be accessed for the back off. */
1935    
1936        if (subreqchar > 0 &&
1937             (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1938             (bravalue == OP_COND && condcount == 2)))
1939          {
1940          prevreqchar = *reqchar;
1941          *reqchar = subreqchar;
1942          if (bravalue != OP_ASSERT) *countlits += subcountlits;
1943          }
1944    
1945        /* Now update the main code pointer to the end of the group. */
1946    
1947        code = tempcode;
1948    
1949        /* Error if hit end of pattern */
1950    
1951      if (*ptr != ')')      if (*ptr != ')')
1952        {        {
1953        *errorptr = ERR14;        *errorptr = ERR14;
# Line 1217  for (;; ptr++) Line 1960  for (;; ptr++)
1960      for validity in the pre-compiling pass. */      for validity in the pre-compiling pass. */
1961    
1962      case '\\':      case '\\':
1963      oldptr = ptr;      tempptr = ptr;
1964      c = check_escape(&ptr, errorptr, *brackets, options, FALSE);      c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1965    
1966      /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values      /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1967      are arranged to be the negation of the corresponding OP_values. For the      are arranged to be the negation of the corresponding OP_values. For the
# Line 1231  for (;; ptr++) Line 1974  for (;; ptr++)
1974        {        {
1975        if (-c >= ESC_REF)        if (-c >= ESC_REF)
1976          {          {
         int refnum = -c - ESC_REF;  
         if (*brackets < refnum)  
           {  
           *errorptr = ERR15;  
           goto FAILED;  
           }  
1977          previous = code;          previous = code;
1978          *code++ = OP_REF;          *code++ = OP_REF;
1979          *code++ = refnum;          *code++ = -c - ESC_REF;
1980          }          }
1981        else        else
1982          {          {
1983          previous = (-c > ESC_b && -c < ESC_X)? code : NULL;          previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1984          *code++ = -c;          *code++ = -c;
1985          }          }
1986        continue;        continue;
# Line 1251  for (;; ptr++) Line 1988  for (;; ptr++)
1988    
1989      /* Data character: reset and fall through */      /* Data character: reset and fall through */
1990    
1991      ptr = oldptr;      ptr = tempptr;
1992      c = '\\';      c = '\\';
1993    
1994      /* Handle a run of data characters until a metacharacter is encountered.      /* Handle a run of data characters until a metacharacter is encountered.
# Line 1269  for (;; ptr++) Line 2006  for (;; ptr++)
2006        {        {
2007        if ((options & PCRE_EXTENDED) != 0)        if ((options & PCRE_EXTENDED) != 0)
2008          {          {
2009          if ((pcre_ctypes[c] & ctype_space) != 0) continue;          if ((cd->ctypes[c] & ctype_space) != 0) continue;
2010          if (c == '#')          if (c == '#')
2011            {            {
2012            while ((c = *(++ptr)) != 0 && c != '\n');            /* The space before the ; is to avoid a warning on a silly compiler
2013              on the Macintosh. */
2014              while ((c = *(++ptr)) != 0 && c != '\n') ;
2015            if (c == 0) break;            if (c == 0) break;
2016            continue;            continue;
2017            }            }
# Line 1284  for (;; ptr++) Line 2023  for (;; ptr++)
2023    
2024        if (c == '\\')        if (c == '\\')
2025          {          {
2026          oldptr = ptr;          tempptr = ptr;
2027          c = check_escape(&ptr, errorptr, *brackets, options, FALSE);          c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
2028          if (c < 0) { ptr = oldptr; break; }          if (c < 0) { ptr = tempptr; break; }
2029    
2030            /* If a character is > 127 in UTF-8 mode, we have to turn it into
2031            two or more characters in the UTF-8 encoding. */
2032    
2033    #ifdef SUPPORT_UTF8
2034            if (c > 127 && (options & PCRE_UTF8) != 0)
2035              {
2036              uschar buffer[8];
2037              int len = ord2utf8(c, buffer);
2038              for (c = 0; c < len; c++) *code++ = buffer[c];
2039              length += len;
2040              continue;
2041              }
2042    #endif
2043          }          }
2044    
2045        /* Ordinary character or single-char escape */        /* Ordinary character or single-char escape */
# Line 1297  for (;; ptr++) Line 2050  for (;; ptr++)
2050    
2051      /* This "while" is the end of the "do" above. */      /* This "while" is the end of the "do" above. */
2052    
2053      while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);      while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
2054    
2055        /* Update the last character and the count of literals */
2056    
2057        prevreqchar = (length > 1)? code[-2] : *reqchar;
2058        *reqchar = code[-1];
2059        *countlits += length;
2060    
2061      /* Compute the length and set it in the data vector, and advance to      /* Compute the length and set it in the data vector, and advance to
2062      the next state. */      the next state. */
2063    
2064      previous[1] = length;      previous[1] = length;
2065      if (length < 255) ptr--;      if (length < MAXLIT) ptr--;
2066      break;      break;
2067      }      }
2068    }                   /* end of big loop */    }                   /* end of big loop */
# Line 1327  return FALSE; Line 2086  return FALSE;
2086  /* On entry, ptr is pointing past the bracket character, but on return  /* On entry, ptr is pointing past the bracket character, but on return
2087  it points to the closing bracket, or vertical bar, or end of string.  it points to the closing bracket, or vertical bar, or end of string.
2088  The code variable is pointing at the byte into which the BRA operator has been  The code variable is pointing at the byte into which the BRA operator has been
2089  stored.  stored. If the ims options are changed at the start (for a (?ims: group) or
2090    during any branch, we need to insert an OP_OPT item at the start of every
2091    following branch to ensure they get set correctly at run time, and also pass
2092    the new options into every subsequent branch compile.
2093    
2094  Argument:  Argument:
2095    options   the option bits    options     the option bits
2096    brackets  -> int containing the number of extracting brackets used    optchanged  new ims options to set as if (?ims) were at the start, or -1
2097    codeptr   -> the address of the current code pointer                 for no change
2098    ptrptr    -> the address of the current pattern pointer    brackets    -> int containing the number of extracting brackets used
2099    errorptr  -> pointer to error message    codeptr     -> the address of the current code pointer
2100      ptrptr      -> the address of the current pattern pointer
2101      errorptr    -> pointer to error message
2102      lookbehind  TRUE if this is a lookbehind assertion
2103      condref     >= 0 for OPT_CREF setting at start of conditional group
2104      reqchar     -> place to put the last required character, or a negative number
2105      countlits   -> place to put the shortest literal count of any branch
2106      cd          points to the data block with tables pointers
2107    
2108  Returns:    TRUE on success  Returns:      TRUE on success
2109  */  */
2110    
2111  static BOOL  static BOOL
2112  compile_regex(int options, int *brackets, uschar **codeptr,  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
2113    const uschar **ptrptr, const char **errorptr)    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
2114      int *reqchar, int *countlits, compile_data *cd)
2115  {  {
2116  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
2117  uschar *code = *codeptr;  uschar *code = *codeptr;
2118    uschar *last_branch = code;
2119  uschar *start_bracket = code;  uschar *start_bracket = code;
2120    uschar *reverse_count = NULL;
2121    int oldoptions = options & PCRE_IMS;
2122    int branchreqchar, branchcountlits;
2123    
2124    *reqchar = -1;
2125    *countlits = INT_MAX;
2126    code += 3;
2127    
2128    /* At the start of a reference-based conditional group, insert the reference
2129    number as an OP_CREF item. */
2130    
2131    if (condref >= 0)
2132      {
2133      *code++ = OP_CREF;
2134      *code++ = condref;
2135      }
2136    
2137    /* Loop for each alternative branch */
2138    
2139  for (;;)  for (;;)
2140    {    {
2141    int length;    int length;
   uschar *last_branch = code;  
2142    
2143    code += 3;    /* Handle change of options */
2144    if (!compile_branch(options, brackets, &code, &ptr, errorptr))  
2145      if (optchanged >= 0)
2146        {
2147        *code++ = OP_OPT;
2148        *code++ = optchanged;
2149        options = (options & ~PCRE_IMS) | optchanged;
2150        }
2151    
2152      /* Set up dummy OP_REVERSE if lookbehind assertion */
2153    
2154      if (lookbehind)
2155        {
2156        *code++ = OP_REVERSE;
2157        reverse_count = code;
2158        *code++ = 0;
2159        *code++ = 0;
2160        }
2161    
2162      /* Now compile the branch */
2163    
2164      if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
2165          &branchreqchar, &branchcountlits, cd))
2166      {      {
2167      *ptrptr = ptr;      *ptrptr = ptr;
2168      return FALSE;      return FALSE;
2169      }      }
2170    
2171    /* Fill in the length of the last branch */    /* Fill in the length of the last branch */
2172    
2173      length = code - last_branch;
2174      last_branch[1] = length >> 8;
2175      last_branch[2] = length & 255;
2176    
2177      /* Save the last required character if all branches have the same; a current
2178      value of -1 means unset, while -2 means "previous branch had no last required
2179      char".  */
2180    
2181      if (*reqchar != -2)
2182        {
2183        if (branchreqchar >= 0)
2184          {
2185          if (*reqchar == -1) *reqchar = branchreqchar;
2186          else if (*reqchar != branchreqchar) *reqchar = -2;
2187          }
2188        else *reqchar = -2;
2189        }
2190    
2191      /* Keep the shortest literal count */
2192    
2193      if (branchcountlits < *countlits) *countlits = branchcountlits;
2194      DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
2195    
2196    length = code - last_branch;    /* If lookbehind, check that this branch matches a fixed-length string,
2197    last_branch[1] = length >> 8;    and put the length into the OP_REVERSE item. Temporarily mark the end of
2198    last_branch[2] = length & 255;    the branch with OP_END. */
2199    
2200      if (lookbehind)
2201        {
2202        *code = OP_END;
2203        length = find_fixedlength(last_branch, options);
2204        DPRINTF(("fixed length = %d\n", length));
2205        if (length < 0)
2206          {
2207          *errorptr = ERR25;
2208          *ptrptr = ptr;
2209          return FALSE;
2210          }
2211        reverse_count[0] = (length >> 8);
2212        reverse_count[1] = length & 255;
2213        }
2214    
2215    /* Reached end of expression, either ')' or end of pattern. Insert a    /* Reached end of expression, either ')' or end of pattern. Insert a
2216    terminating ket and the length of the whole bracketed item, and return,    terminating ket and the length of the whole bracketed item, and return,
2217    leaving the pointer at the terminating char. */    leaving the pointer at the terminating char. If any of the ims options
2218      were changed inside the group, compile a resetting op-code following. */
2219    
2220    if (*ptr != '|')    if (*ptr != '|')
2221      {      {
# Line 1375  for (;;) Line 2223  for (;;)
2223      *code++ = OP_KET;      *code++ = OP_KET;
2224      *code++ = length >> 8;      *code++ = length >> 8;
2225      *code++ = length & 255;      *code++ = length & 255;
2226        if (optchanged >= 0)
2227          {
2228          *code++ = OP_OPT;
2229          *code++ = oldoptions;
2230          }
2231      *codeptr = code;      *codeptr = code;
2232      *ptrptr = ptr;      *ptrptr = ptr;
2233      return TRUE;      return TRUE;
# Line 1383  for (;;) Line 2236  for (;;)
2236    /* Another branch follows; insert an "or" node and advance the pointer. */    /* Another branch follows; insert an "or" node and advance the pointer. */
2237    
2238    *code = OP_ALT;    *code = OP_ALT;
2239      last_branch = code;
2240      code += 3;
2241    ptr++;    ptr++;
2242    }    }
2243  /* Control never reaches here */  /* Control never reaches here */
# Line 1390  for (;;) Line 2245  for (;;)
2245    
2246    
2247    
2248    
2249    /*************************************************
2250    *      Find first significant op code            *
2251    *************************************************/
2252    
2253    /* This is called by several functions that scan a compiled expression looking
2254    for a fixed first character, or an anchoring op code etc. It skips over things
2255    that do not influence this. For one application, a change of caseless option is
2256    important.
2257    
2258    Arguments:
2259      code       pointer to the start of the group
2260      options    pointer to external options
2261      optbit     the option bit whose changing is significant, or
2262                 zero if none are
2263      optstop    TRUE to return on option change, otherwise change the options
2264                   value and continue
2265    
2266    Returns:     pointer to the first significant opcode
2267    */
2268    
2269    static const uschar*
2270    first_significant_code(const uschar *code, int *options, int optbit,
2271      BOOL optstop)
2272    {
2273    for (;;)
2274      {
2275      switch ((int)*code)
2276        {
2277        case OP_OPT:
2278        if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
2279          {
2280          if (optstop) return code;
2281          *options = (int)code[1];
2282          }
2283        code += 2;
2284        break;
2285    
2286        case OP_CREF:
2287        code += 2;
2288        break;
2289    
2290        case OP_WORD_BOUNDARY:
2291        case OP_NOT_WORD_BOUNDARY:
2292        code++;
2293        break;
2294    
2295        case OP_ASSERT_NOT:
2296        case OP_ASSERTBACK:
2297        case OP_ASSERTBACK_NOT:
2298        do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
2299        code += 3;
2300        break;
2301    
2302        default:
2303        return code;
2304        }
2305      }
2306    /* Control never reaches here */
2307    }
2308    
2309    
2310    
2311    
2312  /*************************************************  /*************************************************
2313  *          Check for anchored expression         *  *          Check for anchored expression         *
2314  *************************************************/  *************************************************/
# Line 1400  all of whose alternatives start with OP_ Line 2319  all of whose alternatives start with OP_
2319  it's anchored. However, if this is a multiline pattern, then only OP_SOD  it's anchored. However, if this is a multiline pattern, then only OP_SOD
2320  counts, since OP_CIRC can match in the middle.  counts, since OP_CIRC can match in the middle.
2321    
2322  A branch is also implicitly anchored if it starts with .* because that will try  A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2323  the rest of the pattern at all possible matching points, so there is no point  because that will try the rest of the pattern at all possible matching points,
2324  trying them again.  so there is no point trying them again.
2325    
2326  Argument:  points to start of expression (the bracket)  Arguments:
2327  Returns:   TRUE or FALSE    code       points to start of expression (the bracket)
2328      options    points to the options setting
2329    
2330    Returns:     TRUE or FALSE
2331  */  */
2332    
2333  static BOOL  static BOOL
2334  is_anchored(register const uschar *code, BOOL multiline)  is_anchored(register const uschar *code, int *options)
2335  {  {
2336  do {  do {
2337     int op = (int)code[3];     const uschar *scode = first_significant_code(code + 3, options,
2338     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)       PCRE_MULTILINE, FALSE);
2339       { if (!is_anchored(code+3, multiline)) return FALSE; }     register int op = *scode;
2340     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2341       { if (code[4] != OP_ANY) return FALSE; }       { if (!is_anchored(scode, options)) return FALSE; }
2342     else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
2343                (*options & PCRE_DOTALL) != 0)
2344         { if (scode[1] != OP_ANY) return FALSE; }
2345       else if (op != OP_SOD &&
2346               ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
2347         return FALSE;
2348     code += (code[1] << 8) + code[2];     code += (code[1] << 8) + code[2];
2349     }     }
2350  while (*code == OP_ALT);  while (*code == OP_ALT);
# Line 1427  return TRUE; Line 2354  return TRUE;
2354    
2355    
2356  /*************************************************  /*************************************************
2357  *     Check for start with \n line expression    *  *         Check for starting with ^ or .*        *
2358  *************************************************/  *************************************************/
2359    
2360  /* This is called for multiline expressions to try to find out if every branch  /* This is called to find out if every branch starts with ^ or .* so that
2361  starts with ^ so that "first char" processing can be done to speed things up.  "first char" processing can be done to speed things up in multiline
2362    matching and for non-DOTALL patterns that start with .* (which must start at
2363    the beginning or after \n).
2364    
2365  Argument:  points to start of expression (the bracket)  Argument:  points to start of expression (the bracket)
2366  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
# Line 1441  static BOOL Line 2370  static BOOL
2370  is_startline(const uschar *code)  is_startline(const uschar *code)
2371  {  {
2372  do {  do {
2373     if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)     const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
2374       { if (!is_startline(code+3)) return FALSE; }     register int op = *scode;
2375     else if (code[3] != OP_CIRC) return FALSE;     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2376         { if (!is_startline(scode)) return FALSE; }
2377       else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2378         { if (scode[1] != OP_ANY) return FALSE; }
2379       else if (op != OP_CIRC) return FALSE;
2380     code += (code[1] << 8) + code[2];     code += (code[1] << 8) + code[2];
2381     }     }
2382  while (*code == OP_ALT);  while (*code == OP_ALT);
# Line 1462  Consider each alternative branch. If the Line 2395  Consider each alternative branch. If the
2395  a bracket all of whose alternatives start with the same char (recurse ad lib),  a bracket all of whose alternatives start with the same char (recurse ad lib),
2396  then we return that char, otherwise -1.  then we return that char, otherwise -1.
2397    
2398  Argument:  points to start of expression (the bracket)  Arguments:
2399  Returns:   -1 or the fixed first char    code       points to start of expression (the bracket)
2400      options    pointer to the options (used to check casing changes)
2401    
2402    Returns:     -1 or the fixed first char
2403  */  */
2404    
2405  static int  static int
2406  find_firstchar(uschar *code)  find_firstchar(const uschar *code, int *options)
2407  {  {
2408  register int c = -1;  register int c = -1;
2409  do  do {
2410    {     int d;
2411    register int charoffset = 4;     const uschar *scode = first_significant_code(code + 3, options,
2412         PCRE_CASELESS, TRUE);
2413    if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)     register int op = *scode;
2414      {  
2415      register int d;     if (op >= OP_BRA) op = OP_BRA;
2416      if ((d = find_firstchar(code+3)) < 0) return -1;  
2417      if (c < 0) c = d; else if (c != d) return -1;     switch(op)
2418      }       {
2419         default:
2420    else switch(code[3])       return -1;
2421      {  
2422      default:       case OP_BRA:
2423      return -1;       case OP_ASSERT:
2424         case OP_ONCE:
2425      case OP_EXACT:       /* Fall through */       case OP_COND:
2426      charoffset++;       if ((d = find_firstchar(scode, options)) < 0) return -1;
2427         if (c < 0) c = d; else if (c != d) return -1;
2428      case OP_CHARS:       /* Fall through */       break;
2429      charoffset++;  
2430         case OP_EXACT:       /* Fall through */
2431         scode++;
2432    
2433         case OP_CHARS:       /* Fall through */
2434         scode++;
2435    
2436         case OP_PLUS:
2437         case OP_MINPLUS:
2438         if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
2439         break;
2440         }
2441    
2442      case OP_PLUS:     code += (code[1] << 8) + code[2];
2443      case OP_MINPLUS:     }
     if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;  
     break;  
     }  
   code += (code[1] << 8) + code[2];  
   }  
2444  while (*code == OP_ALT);  while (*code == OP_ALT);
2445  return c;  return c;
2446  }  }
2447    
2448    
2449    
2450    
2451    
2452  /*************************************************  /*************************************************
2453  *        Compile a Regular Expression            *  *        Compile a Regular Expression            *
2454  *************************************************/  *************************************************/
# Line 1517  Arguments: Line 2461  Arguments:
2461    options      various option bits    options      various option bits
2462    errorptr     pointer to pointer to error text    errorptr     pointer to pointer to error text
2463    erroroffset  ptr offset in pattern where error was detected    erroroffset  ptr offset in pattern where error was detected
2464      tables       pointer to character tables or NULL
2465    
2466  Returns:       pointer to compiled data block, or NULL on error,  Returns:       pointer to compiled data block, or NULL on error,
2467                 with errorptr and erroroffset set                 with errorptr and erroroffset set
# Line 1524  Returns:       pointer to compiled data Line 2469  Returns:       pointer to compiled data
2469    
2470  pcre *  pcre *
2471  pcre_compile(const char *pattern, int options, const char **errorptr,  pcre_compile(const char *pattern, int options, const char **errorptr,
2472    int *erroroffset)    int *erroroffset, const unsigned char *tables)
2473  {  {
2474  real_pcre *re;  real_pcre *re;
 int spaces = 0;  
2475  int length = 3;      /* For initial BRA plus length */  int length = 3;      /* For initial BRA plus length */
2476  int runlength;  int runlength;
2477  int c, size;  int c, reqchar, countlits;
2478  int bracount = 0;  int bracount = 0;
 int brastack[200];  
2479  int top_backref = 0;  int top_backref = 0;
2480    int branch_extra = 0;
2481    int branch_newextra;
2482  unsigned int brastackptr = 0;  unsigned int brastackptr = 0;
2483    size_t size;
2484  uschar *code;  uschar *code;
2485  const uschar *ptr;  const uschar *ptr;
2486    compile_data compile_block;
2487    int brastack[BRASTACK_SIZE];
2488    uschar bralenstack[BRASTACK_SIZE];
2489    
2490  #ifdef DEBUG  #ifdef DEBUG
2491  uschar *code_base, *code_end;  uschar *code_base, *code_end;
2492  #endif  #endif
2493    
2494    /* Can't support UTF8 unless PCRE has been compiled to include the code. */
2495    
2496    #ifndef SUPPORT_UTF8
2497    if ((options & PCRE_UTF8) != 0)
2498      {
2499      *errorptr = ERR32;
2500      return NULL;
2501      }
2502    #endif
2503    
2504  /* We can't pass back an error message if errorptr is NULL; I guess the best we  /* We can't pass back an error message if errorptr is NULL; I guess the best we
2505  can do is just return NULL. */  can do is just return NULL. */
2506    
# Line 1563  if ((options & ~PUBLIC_OPTIONS) != 0) Line 2522  if ((options & ~PUBLIC_OPTIONS) != 0)
2522    return NULL;    return NULL;
2523    }    }
2524    
2525    /* Set up pointers to the individual character tables */
2526    
2527    if (tables == NULL) tables = pcre_default_tables;
2528    compile_block.lcc = tables + lcc_offset;
2529    compile_block.fcc = tables + fcc_offset;
2530    compile_block.cbits = tables + cbits_offset;
2531    compile_block.ctypes = tables + ctypes_offset;
2532    
2533    /* Reflect pattern for debugging output */
2534    
2535  DPRINTF(("------------------------------------------------------------------\n"));  DPRINTF(("------------------------------------------------------------------\n"));
2536  DPRINTF(("%s\n", pattern));  DPRINTF(("%s\n", pattern));
2537    
# Line 1579  while ((c = *(++ptr)) != 0) Line 2548  while ((c = *(++ptr)) != 0)
2548    int min, max;    int min, max;
2549    int class_charcount;    int class_charcount;
2550    
2551    if ((pcre_ctypes[c] & ctype_space) != 0)    if ((options & PCRE_EXTENDED) != 0)
     {  
     if ((options & PCRE_EXTENDED) != 0) continue;  
     spaces++;  
     }  
   
   if (c == '#' && (options & PCRE_EXTENDED) != 0)  
2552      {      {
2553      while ((c = *(++ptr)) != 0 && c != '\n');      if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2554      continue;      if (c == '#')
2555          {
2556          /* The space before the ; is to avoid a warning on a silly compiler
2557          on the Macintosh. */
2558          while ((c = *(++ptr)) != 0 && c != '\n') ;
2559          continue;
2560          }
2561      }      }
2562    
2563    switch(c)    switch(c)
# Line 1601  while ((c = *(++ptr)) != 0) Line 2570  while ((c = *(++ptr)) != 0)
2570      case '\\':      case '\\':
2571        {        {
2572        const uschar *save_ptr = ptr;        const uschar *save_ptr = ptr;
2573        c = check_escape(&ptr, errorptr, bracount, options, FALSE);        c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2574        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2575        if (c >= 0)        if (c >= 0)
2576          {          {
# Line 1621  while ((c = *(++ptr)) != 0) Line 2590  while ((c = *(++ptr)) != 0)
2590        int refnum = -c - ESC_REF;        int refnum = -c - ESC_REF;
2591        if (refnum > top_backref) top_backref = refnum;        if (refnum > top_backref) top_backref = refnum;
2592        length++;   /* For single back reference */        length++;   /* For single back reference */
2593        if (ptr[1] == '{' && is_counted_repeat(ptr+2))        if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2594          {          {
2595          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2596          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2597          if ((min == 0 && (max == 1 || max == -1)) ||          if ((min == 0 && (max == 1 || max == -1)) ||
2598            (min == 1 && max == -1))            (min == 1 && max == -1))
# Line 1647  while ((c = *(++ptr)) != 0) Line 2616  while ((c = *(++ptr)) != 0)
2616      or back reference. */      or back reference. */
2617    
2618      case '{':      case '{':
2619      if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;      if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2620      ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);      ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2621      if (*errorptr != NULL) goto PCRE_ERROR_RETURN;      if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2622      if ((min == 0 && (max == 1 || max == -1)) ||      if ((min == 0 && (max == 1 || max == -1)) ||
2623        (min == 1 && max == -1))        (min == 1 && max == -1))
# Line 1662  while ((c = *(++ptr)) != 0) Line 2631  while ((c = *(++ptr)) != 0)
2631      if (ptr[1] == '?') ptr++;      if (ptr[1] == '?') ptr++;
2632      continue;      continue;
2633    
2634      /* An alternation contains an offset to the next branch or ket. */      /* An alternation contains an offset to the next branch or ket. If any ims
2635        options changed in the previous branch(es), and/or if we are in a
2636        lookbehind assertion, extra space will be needed at the start of the
2637        branch. This is handled by branch_extra. */
2638    
2639      case '|':      case '|':
2640      length += 3;      length += 3 + branch_extra;
2641      continue;      continue;
2642    
2643      /* A character class uses 33 characters. Don't worry about character types      /* A character class uses 33 characters. Don't worry about character types
# Line 1679  while ((c = *(++ptr)) != 0) Line 2652  while ((c = *(++ptr)) != 0)
2652        {        {
2653        if (*ptr == '\\')        if (*ptr == '\\')
2654          {          {
2655          int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);          int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2656              &compile_block);
2657          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2658          if (-ch == ESC_b) class_charcount++; else class_charcount = 10;          if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2659          }          }
# Line 1696  while ((c = *(++ptr)) != 0) Line 2670  while ((c = *(++ptr)) != 0)
2670    
2671        /* A repeat needs either 1 or 5 bytes. */        /* A repeat needs either 1 or 5 bytes. */
2672    
2673        if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))        if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2674          {          {
2675          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2676          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2677          if ((min == 0 && (max == 1 || max == -1)) ||          if ((min == 0 && (max == 1 || max == -1)) ||
2678            (min == 1 && max == -1))            (min == 1 && max == -1))
# Line 1712  while ((c = *(++ptr)) != 0) Line 2686  while ((c = *(++ptr)) != 0)
2686      /* Brackets may be genuine groups or special things */      /* Brackets may be genuine groups or special things */
2687    
2688      case '(':      case '(':
2689        branch_newextra = 0;
2690    
2691      /* Handle special forms of bracket, which all start (? */      /* Handle special forms of bracket, which all start (? */
2692    
2693      if (ptr[1] == '?') switch (c = ptr[2])      if (ptr[1] == '?')
2694        {        {
2695        /* Skip over comments entirely */        int set, unset;
2696        case '#':        int *optset;
       ptr += 3;  
       while (*ptr != 0 && *ptr != ')') ptr++;  
       if (*ptr == 0)  
         {  
         *errorptr = ERR18;  
         goto PCRE_ERROR_RETURN;  
         }  
       continue;  
2697    
2698        /* Non-referencing groups and lookaheads just move the pointer on, and        switch (c = ptr[2])
2699        then behave like a non-special bracket, except that they don't increment          {
2700        the count of extracting brackets. */          /* Skip over comments entirely */
2701            case '#':
2702        case ':':          ptr += 3;
2703        case '=':          while (*ptr != 0 && *ptr != ')') ptr++;
2704        case '!':          if (*ptr == 0)
2705        ptr += 2;            {
2706        break;            *errorptr = ERR18;
2707              goto PCRE_ERROR_RETURN;
2708              }
2709            continue;
2710    
2711        /* Ditto for the "once only" bracket, allowed only if the extra bit          /* Non-referencing groups and lookaheads just move the pointer on, and
2712        is set. */          then behave like a non-special bracket, except that they don't increment
2713            the count of extracting brackets. Ditto for the "once only" bracket,
2714            which is in Perl from version 5.005. */
2715    
2716        case '>':          case ':':
2717        if ((options & PCRE_EXTRA) != 0)          case '=':
2718          {          case '!':
2719            case '>':
2720          ptr += 2;          ptr += 2;
2721          break;          break;
         }  
       /* Else fall thourh */  
2722    
2723        /* Else loop setting valid options until ) is met. Anything else is an          /* A recursive call to the regex is an extension, to provide the
2724        error. */          facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2725    
2726        default:          case 'R':
2727        ptr += 2;          if (ptr[3] != ')')
       for (;; ptr++)  
         {  
         if ((c = *ptr) == 'i')  
2728            {            {
2729            options |= PCRE_CASELESS;            *errorptr = ERR29;
2730            continue;            goto PCRE_ERROR_RETURN;
2731            }            }
2732          else if ((c = *ptr) == 'm')          ptr += 3;
2733            length += 1;
2734            break;
2735    
2736            /* Lookbehinds are in Perl from version 5.005 */
2737    
2738            case '<':
2739            if (ptr[3] == '=' || ptr[3] == '!')
2740            {            {
2741            options |= PCRE_MULTILINE;            ptr += 3;
2742            continue;            branch_newextra = 3;
2743              length += 3;         /* For the first branch */
2744              break;
2745              }
2746            *errorptr = ERR24;
2747            goto PCRE_ERROR_RETURN;
2748    
2749            /* Conditionals are in Perl from version 5.005. The bracket must either
2750            be followed by a number (for bracket reference) or by an assertion
2751            group. */
2752    
2753            case '(':
2754            if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2755              {
2756              ptr += 4;
2757              length += 2;
2758              while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2759              if (*ptr != ')')
2760                {
2761                *errorptr = ERR26;
2762                goto PCRE_ERROR_RETURN;
2763                }
2764            }            }
2765          else if (c == 's')          else   /* An assertion must follow */
2766            {            {
2767            options |= PCRE_DOTALL;            ptr++;   /* Can treat like ':' as far as spacing is concerned */
2768            continue;            if (ptr[2] != '?' ||
2769                 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
2770                {
2771                ptr += 2;    /* To get right offset in message */
2772                *errorptr = ERR28;
2773                goto PCRE_ERROR_RETURN;
2774                }
2775              }
2776            break;
2777    
2778            /* Else loop checking valid options until ) is met. Anything else is an
2779            error. If we are without any brackets, i.e. at top level, the settings
2780            act as if specified in the options, so massage the options immediately.
2781            This is for backward compatibility with Perl 5.004. */
2782    
2783            default:
2784            set = unset = 0;
2785            optset = &set;
2786            ptr += 2;
2787    
2788            for (;; ptr++)
2789              {
2790              c = *ptr;
2791              switch (c)
2792                {
2793                case 'i':
2794                *optset |= PCRE_CASELESS;
2795                continue;
2796    
2797                case 'm':
2798                *optset |= PCRE_MULTILINE;
2799                continue;
2800    
2801                case 's':
2802                *optset |= PCRE_DOTALL;
2803                continue;
2804    
2805                case 'x':
2806                *optset |= PCRE_EXTENDED;
2807                continue;
2808    
2809                case 'X':
2810                *optset |= PCRE_EXTRA;
2811                continue;
2812    
2813                case 'U':
2814                *optset |= PCRE_UNGREEDY;
2815                continue;
2816    
2817                case '-':
2818                optset = &unset;
2819                continue;
2820    
2821                /* A termination by ')' indicates an options-setting-only item;
2822                this is global at top level; otherwise nothing is done here and
2823                it is handled during the compiling process on a per-bracket-group
2824                basis. */
2825    
2826                case ')':
2827                if (brastackptr == 0)
2828                  {
2829                  options = (options | set) & (~unset);
2830                  set = unset = 0;     /* To save length */
2831                  }
2832                /* Fall through */
2833    
2834                /* A termination by ':' indicates the start of a nested group with
2835                the given options set. This is again handled at compile time, but
2836                we must allow for compiled space if any of the ims options are
2837                set. We also have to allow for resetting space at the end of
2838                the group, which is why 4 is added to the length and not just 2.
2839                If there are several changes of options within the same group, this
2840                will lead to an over-estimate on the length, but this shouldn't
2841                matter very much. We also have to allow for resetting options at
2842                the start of any alternations, which we do by setting
2843                branch_newextra to 2. Finally, we record whether the case-dependent
2844                flag ever changes within the regex. This is used by the "required
2845                character" code. */
2846    
2847                case ':':
2848                if (((set|unset) & PCRE_IMS) != 0)
2849                  {
2850                  length += 4;
2851                  branch_newextra = 2;
2852                  if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2853                  }
2854                goto END_OPTIONS;
2855    
2856                /* Unrecognized option character */
2857    
2858                default:
2859                *errorptr = ERR12;
2860                goto PCRE_ERROR_RETURN;
2861                }
2862            }            }
2863          else if (c == 'x')  
2864            /* If we hit a closing bracket, that's it - this is a freestanding
2865            option-setting. We need to ensure that branch_extra is updated if
2866            necessary. The only values branch_newextra can have here are 0 or 2.
2867            If the value is 2, then branch_extra must either be 2 or 5, depending
2868            on whether this is a lookbehind group or not. */
2869    
2870            END_OPTIONS:
2871            if (c == ')')
2872            {            {
2873            options |= PCRE_EXTENDED;            if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2874            length -= spaces;          /* Already counted spaces */              branch_extra += branch_newextra;
2875            continue;            continue;
2876            }            }
         else if (c == ')') break;  
2877    
2878          *errorptr = ERR12;          /* If options were terminated by ':' control comes here. Fall through
2879          goto PCRE_ERROR_RETURN;          to handle the group below. */
2880          }          }
       continue;                      /* End of this bracket handling */  
2881        }        }
2882    
2883      /* Extracting brackets must be counted so we can process escapes in a      /* Extracting brackets must be counted so we can process escapes in a
# Line 1791  while ((c = *(++ptr)) != 0) Line 2886  while ((c = *(++ptr)) != 0)
2886      else bracount++;      else bracount++;
2887    
2888      /* Non-special forms of bracket. Save length for computing whole length      /* Non-special forms of bracket. Save length for computing whole length
2889      at end if there's a repeat that requires duplication of the group. */      at end if there's a repeat that requires duplication of the group. Also
2890        save the current value of branch_extra, and start the new group with
2891        the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
2892        for a lookbehind assertion. */
2893    
2894      if (brastackptr >= sizeof(brastack)/sizeof(int))      if (brastackptr >= sizeof(brastack)/sizeof(int))
2895        {        {
# Line 1799  while ((c = *(++ptr)) != 0) Line 2897  while ((c = *(++ptr)) != 0)
2897        goto PCRE_ERROR_RETURN;        goto PCRE_ERROR_RETURN;
2898        }        }
2899    
2900        bralenstack[brastackptr] = branch_extra;
2901        branch_extra = branch_newextra;
2902    
2903      brastack[brastackptr++] = length;      brastack[brastackptr++] = length;
2904      length += 3;      length += 3;
2905      continue;      continue;
# Line 1806  while ((c = *(++ptr)) != 0) Line 2907  while ((c = *(++ptr)) != 0)
2907      /* Handle ket. Look for subsequent max/min; for certain sets of values we      /* Handle ket. Look for subsequent max/min; for certain sets of values we
2908      have to replicate this bracket up to that many times. If brastackptr is      have to replicate this bracket up to that many times. If brastackptr is
2909      0 this is an unmatched bracket which will generate an error, but take care      0 this is an unmatched bracket which will generate an error, but take care
2910      not to try to access brastack[-1]. */      not to try to access brastack[-1] when computing the length and restoring
2911        the branch_extra value. */
2912    
2913      case ')':      case ')':
2914      length += 3;      length += 3;
2915        {        {
2916        int minval = 1;        int minval = 1;
2917        int maxval = 1;        int maxval = 1;
2918        int duplength = (brastackptr > 0)? length - brastack[--brastackptr] : 0;        int duplength;
2919    
2920          if (brastackptr > 0)
2921            {
2922            duplength = length - brastack[--brastackptr];
2923            branch_extra = bralenstack[brastackptr];
2924            }
2925          else duplength = 0;
2926    
2927        /* Leave ptr at the final char; for read_repeat_counts this happens        /* Leave ptr at the final char; for read_repeat_counts this happens
2928        automatically; for the others we need an increment. */        automatically; for the others we need an increment. */
2929    
2930        if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))        if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2931          {          {
2932          ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);          ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2933              &compile_block);
2934          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2935          }          }
2936        else if (c == '*') { minval = 0; maxval = -1; ptr++; }        else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2937        else if (c == '+') { maxval = -1; ptr++; }        else if (c == '+') { maxval = -1; ptr++; }
2938        else if (c == '?') { minval = 0; ptr++; }        else if (c == '?') { minval = 0; ptr++; }
2939    
2940        /* If there is a minimum > 1 we have to replicate up to minval-1 times;        /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2941        if there is a limited maximum we have to replicate up to maxval-1 times        group, and if the maximum is greater than zero, we have to replicate
2942        and allow for a BRAZERO item before each optional copy, as we also have        maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2943        to do before the first copy if the minimum is zero. */        bracket set - hence the 7. */
2944    
2945        if (minval == 0) length++;        if (minval == 0)
2946          else if (minval > 1) length += (minval - 1) * duplength;          {
2947        if (maxval > minval) length += (maxval - minval) * (duplength + 1);          length++;
2948            if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2949            }
2950    
2951          /* When the minimum is greater than zero, 1 we have to replicate up to
2952          minval-1 times, with no additions required in the copies. Then, if
2953          there is a limited maximum we have to replicate up to maxval-1 times
2954          allowing for a BRAZERO item before each optional copy and nesting
2955          brackets for all but one of the optional copies. */
2956    
2957          else
2958            {
2959            length += (minval - 1) * duplength;
2960            if (maxval > minval)   /* Need this test as maxval=-1 means no limit */
2961              length += (maxval - minval) * (duplength + 7) - 6;
2962            }
2963        }        }
2964      continue;      continue;
2965    
# Line 1849  while ((c = *(++ptr)) != 0) Line 2974  while ((c = *(++ptr)) != 0)
2974      runlength = 0;      runlength = 0;
2975      do      do
2976        {        {
2977        if ((pcre_ctypes[c] & ctype_space) != 0)        if ((options & PCRE_EXTENDED) != 0)
         {  
         if ((options & PCRE_EXTENDED) != 0) continue;  
         spaces++;  
         }  
   
       if (c == '#' && (options & PCRE_EXTENDED) != 0)  
2978          {          {
2979          while ((c = *(++ptr)) != 0 && c != '\n');          if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2980          continue;          if (c == '#')
2981              {
2982              /* The space before the ; is to avoid a warning on a silly compiler
2983              on the Macintosh. */
2984              while ((c = *(++ptr)) != 0 && c != '\n') ;
2985              continue;
2986              }
2987          }          }
2988    
2989        /* Backslash may introduce a data char or a metacharacter; stop the        /* Backslash may introduce a data char or a metacharacter; stop the
# Line 1867  while ((c = *(++ptr)) != 0) Line 2992  while ((c = *(++ptr)) != 0)
2992        if (c == '\\')        if (c == '\\')
2993          {          {
2994          const uschar *saveptr = ptr;          const uschar *saveptr = ptr;
2995          c = check_escape(&ptr, errorptr, bracount, options, FALSE);          c = check_escape(&ptr, errorptr, bracount, options, FALSE,
2996              &compile_block);
2997          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2998          if (c < 0) { ptr = saveptr; break; }          if (c < 0) { ptr = saveptr; break; }
2999    
3000    #ifdef SUPPORT_UTF8
3001            if (c > 127 && (options & PCRE_UTF8) != 0)
3002              {
3003              int i;
3004              for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
3005                if (c <= utf8_table1[i]) break;
3006              runlength += i;
3007              }
3008    #endif
3009          }          }
3010    
3011        /* Ordinary character or single-char escape */        /* Ordinary character or single-char escape */
# Line 1879  while ((c = *(++ptr)) != 0) Line 3015  while ((c = *(++ptr)) != 0)
3015    
3016      /* This "while" is the end of the "do" above. */      /* This "while" is the end of the "do" above. */
3017    
3018      while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);      while (runlength < MAXLIT &&
3019          (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
3020    
3021      ptr--;      ptr--;
3022      length += runlength;      length += runlength;
# Line 1910  if (re == NULL) Line 3047  if (re == NULL)
3047    return NULL;    return NULL;
3048    }    }
3049    
3050  /* Put in the magic number and the options. */  /* Put in the magic number, and save the size, options, and table pointer */
3051    
3052  re->magic_number = MAGIC_NUMBER;  re->magic_number = MAGIC_NUMBER;
3053    re->size = size;
3054  re->options = options;  re->options = options;
3055    re->tables = tables;
3056    
3057  /* Set up a starting, non-extracting bracket, then compile the expression. On  /* Set up a starting, non-extracting bracket, then compile the expression. On
3058  error, *errorptr will be set non-NULL, so we don't need to look at the result  error, *errorptr will be set non-NULL, so we don't need to look at the result
# Line 1923  ptr = (const uschar *)pattern; Line 3062  ptr = (const uschar *)pattern;
3062  code = re->code;  code = re->code;
3063  *code = OP_BRA;  *code = OP_BRA;
3064  bracount = 0;  bracount = 0;
3065  (void)compile_regex(options, &bracount, &code, &ptr, errorptr);  (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
3066      &reqchar, &countlits, &compile_block);
3067  re->top_bracket = bracount;  re->top_bracket = bracount;
3068  re->top_backref = top_backref;  re->top_backref = top_backref;
3069    
# Line 1940  if debugging, leave the test till after Line 3080  if debugging, leave the test till after
3080  if (code - re->code > length) *errorptr = ERR23;  if (code - re->code > length) *errorptr = ERR23;
3081  #endif  #endif
3082    
3083    /* Give an error if there's back reference to a non-existent capturing
3084    subpattern. */
3085    
3086    if (top_backref > re->top_bracket) *errorptr = ERR15;
3087    
3088  /* Failed to compile */  /* Failed to compile */
3089    
3090  if (*errorptr != NULL)  if (*errorptr != NULL)
# Line 1950  if (*errorptr != NULL) Line 3095  if (*errorptr != NULL)
3095    return NULL;    return NULL;
3096    }    }
3097    
3098  /* If the anchored option was not passed, set flag if we can determine that it  /* If the anchored option was not passed, set flag if we can determine that the
3099  is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if  pattern is anchored by virtue of ^ characters or \A or anything else (such as
3100  we can determine what the first character has to be, because that speeds up  starting with .* when DOTALL is set).
3101  unanchored matches no end. In the case of multiline matches, an alternative is  
3102  to set the PCRE_STARTLINE flag if all branches start with ^. */  Otherwise, see if we can determine what the first character has to be, because
3103    that speeds up unanchored matches no end. If not, see if we can set the
3104    PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
3105    start with ^. and also when all branches start with .* for non-DOTALL matches.
3106    */
3107    
3108  if ((options & PCRE_ANCHORED) == 0)  if ((options & PCRE_ANCHORED) == 0)
3109    {    {
3110    if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))    int temp_options = options;
3111      if (is_anchored(re->code, &temp_options))
3112      re->options |= PCRE_ANCHORED;      re->options |= PCRE_ANCHORED;
3113    else    else
3114      {      {
3115      int ch = find_firstchar(re->code);      int ch = find_firstchar(re->code, &temp_options);
3116      if (ch >= 0)      if (ch >= 0)
3117        {        {
3118        re->first_char = ch;        re->first_char = ch;
# Line 1973  if ((options & PCRE_ANCHORED) == 0) Line 3123  if ((options & PCRE_ANCHORED) == 0)
3123      }      }
3124    }    }
3125    
3126    /* Save the last required character if there are at least two literal
3127    characters on all paths, or if there is no first character setting. */
3128    
3129    if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
3130      {
3131      re->req_char = reqchar;
3132      re->options |= PCRE_REQCHSET;
3133      }
3134    
3135  /* Print out the compiled data for debugging */  /* Print out the compiled data for debugging */
3136    
3137  #ifdef DEBUG  #ifdef DEBUG
3138    
3139  printf("Length = %d top_bracket = %d top_backref=%d\n",  printf("Length = %d top_bracket = %d top_backref = %d\n",
3140    length, re->top_bracket, re->top_backref);    length, re->top_bracket, re->top_backref);
3141    
3142  if (re->options != 0)  if (re->options != 0)
3143    {    {
3144    printf("%s%s%s%s%s%s%s\n",    printf("%s%s%s%s%s%s%s%s%s\n",
3145      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
3146      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
3147        ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
3148      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
3149      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
3150      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
3151      ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",      ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
3152      ((re->options & PCRE_EXTRA) != 0)? "extra " : "");      ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
3153        ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
3154    }    }
3155    
3156  if ((re->options & PCRE_FIRSTSET) != 0)  if ((re->options & PCRE_FIRSTSET) != 0)
# Line 1998  if ((re->options & PCRE_FIRSTSET) != 0) Line 3159  if ((re->options & PCRE_FIRSTSET) != 0)
3159      else printf("First char = \\x%02x\n", re->first_char);      else printf("First char = \\x%02x\n", re->first_char);
3160    }    }
3161    
3162    if ((re->options & PCRE_REQCHSET) != 0)
3163      {
3164      if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
3165        else printf("Req char = \\x%02x\n", re->req_char);
3166      }
3167    
3168  code_end = code;  code_end = code;
3169  code_base = code = re->code;  code_base = code = re->code;
3170    
# Line 2015  while (code < code_end) Line 3182  while (code < code_end)
3182    
3183    else switch(*code)    else switch(*code)
3184      {      {
3185        case OP_OPT:
3186        printf(" %.2x %s", code[1], OP_names[*code]);
3187        code++;
3188        break;
3189    
3190        case OP_COND:
3191        printf("%3d Cond", (code[1] << 8) + code[2]);
3192        code += 2;
3193        break;
3194    
3195        case OP_CREF:
3196        printf(" %.2d %s", code[1], OP_names[*code]);
3197        code++;
3198        break;
3199    
3200      case OP_CHARS:      case OP_CHARS:
3201      charlength = *(++code);      charlength = *(++code);
3202      printf("%3d ", charlength);      printf("%3d ", charlength);
# Line 2028  while (code < code_end) Line 3210  while (code < code_end)
3210      case OP_KET:      case OP_KET:
3211      case OP_ASSERT:      case OP_ASSERT:
3212      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
3213        case OP_ASSERTBACK:
3214        case OP_ASSERTBACK_NOT:
3215      case OP_ONCE:      case OP_ONCE:
3216      printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);      printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
3217      code += 2;      code += 2;
3218      break;      break;
3219    
3220        case OP_REVERSE:
3221        printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
3222        code += 2;
3223        break;
3224    
3225      case OP_STAR:      case OP_STAR:
3226      case OP_MINSTAR:      case OP_MINSTAR:
3227      case OP_PLUS:      case OP_PLUS:
# Line 2106  while (code < code_end) Line 3295  while (code < code_end)
3295      goto CLASS_REF_REPEAT;      goto CLASS_REF_REPEAT;
3296    
3297      case OP_CLASS:      case OP_CLASS:
     case OP_NEGCLASS:  
3298        {        {
3299        int i, min, max;        int i, min, max;
3300          code++;
3301        if (*code++ == OP_CLASS) printf("    [");        printf("    [");
         else printf("   ^[");  
3302    
3303        for (i = 0; i < 256; i++)        for (i = 0; i < 256; i++)
3304          {          {
# Line 2193  return (pcre *)re; Line 3380  return (pcre *)re;
3380    
3381    
3382  /*************************************************  /*************************************************
 *        Match a character type                  *  
 *************************************************/  
   
 /* Not used in all the places it might be as it's sometimes faster  
 to put the code inline.  
   
 Arguments:  
   type        the character type  
   c           the character  
   dotall      the dotall flag  
   
 Returns:      TRUE if character is of the type  
 */  
   
 static BOOL  
 match_type(int type, int c, BOOL dotall)  
 {  
   
 #ifdef DEBUG  
 if (isprint(c)) printf("matching subject %c against ", c);  
   else printf("matching subject \\x%02x against ", c);  
 printf("%s\n", OP_names[type]);  
 #endif  
   
 switch(type)  
   {  
   case OP_ANY:            return dotall || c != '\n';  
   case OP_NOT_DIGIT:      return (pcre_ctypes[c] & ctype_digit) == 0;  
   case OP_DIGIT:          return (pcre_ctypes[c] & ctype_digit) != 0;  
   case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;  
   case OP_WHITESPACE:     return (pcre_ctypes[c] & ctype_space) != 0;  
   case OP_NOT_WORDCHAR:   return (pcre_ctypes[c] & ctype_word) == 0;  
   case OP_WORDCHAR:       return (pcre_ctypes[c] & ctype_word) != 0;  
   }  
 return FALSE;  
 }  
   
   
   
 /*************************************************  
3383  *          Match a back-reference                *  *          Match a back-reference                *
3384  *************************************************/  *************************************************/
3385    
3386  /* If a back reference hasn't been set, the match fails.  /* If a back reference hasn't been set, the length that is passed is greater
3387    than the number of characters left in the string, so the match fails.
3388    
3389  Arguments:  Arguments:
3390    number      reference number    offset      index into the offset vector
3391    eptr        points into the subject    eptr        points into the subject
3392    length      length to be matched    length      length to be matched
3393    md          points to match data block    md          points to match data block
3394      ims         the ims flags
3395    
3396  Returns:      TRUE if matched  Returns:      TRUE if matched
3397  */  */
3398    
3399  static BOOL  static BOOL
3400  match_ref(int number, register const uschar *eptr, int length, match_data *md)  match_ref(int offset, register const uschar *eptr, int length, match_data *md,
3401      unsigned long int ims)
3402  {  {
3403  const uschar *p = md->start_subject + md->offset_vector[number];  const uschar *p = md->start_subject + md->offset_vector[offset];
3404    
3405  #ifdef DEBUG  #ifdef DEBUG
3406  if (eptr >= md->end_subject)  if (eptr >= md->end_subject)
# Line 2267  printf("\n"); Line 3417  printf("\n");
3417    
3418  /* Always fail if not enough characters left */  /* Always fail if not enough characters left */
3419    
3420  if (length > md->end_subject - p) return FALSE;  if (length > md->end_subject - eptr) return FALSE;
3421    
3422  /* Separate the caselesss case for speed */  /* Separate the caselesss case for speed */
3423    
3424  if (md->caseless)  if ((ims & PCRE_CASELESS) != 0)
3425    { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }    {
3426      while (length-- > 0)
3427        if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
3428      }
3429  else  else
3430    { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }    { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3431    
# Line 2285  return TRUE; Line 3438  return TRUE;
3438  *         Match from current position            *  *         Match from current position            *
3439  *************************************************/  *************************************************/
3440    
3441  /* On entry ecode points to the first opcode, and eptr to the first character.  /* On entry ecode points to the first opcode, and eptr to the first character
3442    in the subject string, while eptrb holds the value of eptr at the start of the
3443    last bracketed group - used for breaking infinite loops matching zero-length
3444    strings.
3445    
3446  Arguments:  Arguments:
3447     eptr        pointer in subject     eptr        pointer in subject
3448     ecode       position in code     ecode       position in code
3449     offset_top  current top pointer     offset_top  current top pointer
3450     md          pointer to "static" info for the match     md          pointer to "static" info for the match
3451       ims         current /i, /m, and /s options
3452       eptrb       pointer to chain of blocks containing eptr at start of
3453                     brackets - for testing for empty matches
3454       flags       can contain
3455                     match_condassert - this is an assertion condition
3456                     match_isgroup - this is the start of a bracketed group
3457    
3458  Returns:       TRUE if matched  Returns:       TRUE if matched
3459  */  */
3460    
3461  static BOOL  static BOOL
3462  match(register const uschar *eptr, register const uschar *ecode, int offset_top,  match(register const uschar *eptr, register const uschar *ecode,
3463    match_data *md)    int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
3464      int flags)
3465  {  {
3466    unsigned long int original_ims = ims;   /* Save for resetting on ')' */
3467    eptrblock newptrb;
3468    
3469    /* At the start of a bracketed group, add the current subject pointer to the
3470    stack of such pointers, to be re-instated at the end of the group when we hit
3471    the closing ket. When match() is called in other circumstances, we don't add to
3472    the stack. */
3473    
3474    if ((flags & match_isgroup) != 0)
3475      {
3476      newptrb.prev = eptrb;
3477      newptrb.saved_eptr = eptr;
3478      eptrb = &newptrb;
3479      }
3480    
3481    /* Now start processing the operations. */
3482    
3483  for (;;)  for (;;)
3484    {    {
3485      int op = (int)*ecode;
3486    int min, max, ctype;    int min, max, ctype;
3487    register int i;    register int i;
3488    register int c;    register int c;
3489    BOOL minimize = FALSE;    BOOL minimize = FALSE;
3490    
3491    /* Opening bracket. Check the alternative branches in turn, failing if none    /* Opening capturing bracket. If there is space in the offset vector, save
3492    match. We have to set the start offset if required and there is space    the current subject position in the working slot at the top of the vector. We
3493    in the offset vector so that it is available for subsequent back references    mustn't change the current values of the data slot, because they may be set
3494    if the bracket matches. However, if the bracket fails, we must put back the    from a previous iteration of this group, and be referred to by a reference
3495    previous value of both offsets in case they were set by a previous copy of    inside the group.
3496    the same bracket. Don't worry about setting the flag for the error case here;  
3497    that is handled in the code for KET. */    If the bracket fails to match, we need to restore this value and also the
3498      values of the final offsets, in case they were set by a previous iteration of
3499      the same bracket.
3500    
3501      If there isn't enough space in the offset vector, treat this as if it were a
3502      non-capturing bracket. Don't worry about setting the flag for the error case
3503      here; that is handled in the code for KET. */
3504    
3505    if ((int)*ecode >= OP_BRA)    if (op > OP_BRA)
3506      {      {
3507      int number = (*ecode - OP_BRA) << 1;      int number = op - OP_BRA;
3508      int save_offset1 = 0, save_offset2 = 0;      int offset = number << 1;
3509    
3510      DPRINTF(("start bracket %d\n", number/2));  #ifdef DEBUG
3511        printf("start bracket %d subject=", number);
3512        pchars(eptr, 16, TRUE, md);
3513        printf("\n");
3514    #endif
3515    
3516      if (number > 0 && number < md->offset_end)      if (offset < md->offset_max)
3517        {        {
3518        save_offset1 = md->offset_vector[number];        int save_offset1 = md->offset_vector[offset];
3519        save_offset2 = md->offset_vector[number+1];        int save_offset2 = md->offset_vector[offset+1];
3520        md->offset_vector[number] = eptr - md->start_subject;        int save_offset3 = md->offset_vector[md->offset_end - number];
3521    
3522          DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
3523          md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
3524    
3525          do
3526            {
3527            if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3528              return TRUE;
3529            ecode += (ecode[1] << 8) + ecode[2];
3530            }
3531          while (*ecode == OP_ALT);
3532    
3533          DPRINTF(("bracket %d failed\n", number));
3534    
3535        DPRINTF(("saving %d %d\n", save_offset1, save_offset2));        md->offset_vector[offset] = save_offset1;
3536          md->offset_vector[offset+1] = save_offset2;
3537          md->offset_vector[md->offset_end - number] = save_offset3;
3538          return FALSE;
3539        }        }
3540    
3541      /* Recurse for all the alternatives. */      /* Insufficient room for saving captured contents */
3542    
3543        else op = OP_BRA;
3544        }
3545    
3546      /* Other types of node can be handled by a switch */
3547    
3548      switch(op)
3549        {
3550        case OP_BRA:     /* Non-capturing bracket: optimized */
3551        DPRINTF(("start bracket 0\n"));
3552      do      do
3553        {        {
3554        if (match(eptr, ecode+3, offset_top, md)) return TRUE;        if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3555            return TRUE;
3556        ecode += (ecode[1] << 8) + ecode[2];        ecode += (ecode[1] << 8) + ecode[2];
3557        }        }
3558      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
3559        DPRINTF(("bracket 0 failed\n"));
3560        return FALSE;
3561    
3562        /* Conditional group: compilation checked that there are no more than
3563        two branches. If the condition is false, skipping the first branch takes us
3564        past the end if there is only one branch, but that's OK because that is
3565        exactly what going to the ket would do. */
3566    
3567        case OP_COND:
3568        if (ecode[3] == OP_CREF)         /* Condition is extraction test */
3569          {
3570          int offset = ecode[4] << 1;    /* Doubled reference number */
3571          return match(eptr,
3572            ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3573              5 : 3 + (ecode[1] << 8) + ecode[2]),
3574            offset_top, md, ims, eptrb, match_isgroup);
3575          }
3576    
3577      DPRINTF(("bracket %d failed\n", number/2));      /* The condition is an assertion. Call match() to evaluate it - setting
3578        the final argument TRUE causes it to stop at the end of an assertion. */
3579    
3580      if (number > 0 && number < md->offset_end)      else
3581        {        {
3582        md->offset_vector[number] = save_offset1;        if (match(eptr, ecode+3, offset_top, md, ims, NULL,
3583        md->offset_vector[number+1] = save_offset2;            match_condassert | match_isgroup))
3584            {
3585            ecode += 3 + (ecode[4] << 8) + ecode[5];
3586            while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3587            }
3588          else ecode += (ecode[1] << 8) + ecode[2];
3589          return match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup);
3590        }        }
3591        /* Control never reaches here */
3592    
3593      return FALSE;      /* Skip over conditional reference data if encountered (should not be) */
     }  
3594    
3595    /* Other types of node can be handled by a switch */      case OP_CREF:
3596        ecode += 2;
3597        break;
3598    
3599        /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3600        an empty string - recursion will then try other alternatives, if any. */
3601    
   switch(*ecode)  
     {  
3602      case OP_END:      case OP_END:
3603        if (md->notempty && eptr == md->start_match) return FALSE;
3604      md->end_match_ptr = eptr;          /* Record where we ended */      md->end_match_ptr = eptr;          /* Record where we ended */
3605      md->end_offset_top = offset_top;   /* and how many extracts were taken */      md->end_offset_top = offset_top;   /* and how many extracts were taken */
3606      return TRUE;      return TRUE;
3607    
3608      /* The equivalent of Prolog's "cut" - if the rest doesn't match, the      /* Change option settings */
     whole thing doesn't match, so we have to get out via a longjmp(). */  
3609    
3610      case OP_CUT:      case OP_OPT:
3611      if (match(eptr, ecode+1, offset_top, md)) return TRUE;      ims = ecode[1];
3612      longjmp(md->fail_env, 1);      ecode += 2;
3613        DPRINTF(("ims set to %02lx\n", ims));
3614        break;
3615    
3616      /* Assertion brackets. Check the alternative branches in turn - the      /* Assertion brackets. Check the alternative branches in turn - the
3617      matching won't pass the KET for an assertion. If any one branch matches,      matching won't pass the KET for an assertion. If any one branch matches,
3618      the assertion is true. */      the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
3619        start of each branch to move the current point backwards, so the code at
3620        this level is identical to the lookahead case. */
3621    
3622      case OP_ASSERT:      case OP_ASSERT:
3623        case OP_ASSERTBACK:
3624      do      do
3625        {        {
3626        if (match(eptr, ecode+3, offset_top, md)) break;        if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup)) break;
3627        ecode += (ecode[1] << 8) + ecode[2];        ecode += (ecode[1] << 8) + ecode[2];
3628        }        }
3629      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
3630      if (*ecode == OP_KET) return FALSE;      if (*ecode == OP_KET) return FALSE;
3631    
3632        /* If checking an assertion for a condition, return TRUE. */
3633    
3634        if ((flags & match_condassert) != 0) return TRUE;
3635    
3636      /* Continue from after the assertion, updating the offsets high water      /* Continue from after the assertion, updating the offsets high water
3637      mark, since extracts may have been taken during the assertion. */      mark, since extracts may have been taken during the assertion. */
3638    
# Line 2391  for (;;) Line 3644  for (;;)
3644      /* Negative assertion: all branches must fail to match */      /* Negative assertion: all branches must fail to match */
3645    
3646      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
3647        case OP_ASSERTBACK_NOT:
3648      do      do
3649        {        {
3650        if (match(eptr, ecode+3, offset_top, md)) return FALSE;        if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup))
3651            return FALSE;
3652        ecode += (ecode[1] << 8) + ecode[2];        ecode += (ecode[1] << 8) + ecode[2];
3653        }        }
3654      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
3655    
3656        if ((flags & match_condassert) != 0) return TRUE;
3657    
3658      ecode += 3;      ecode += 3;
3659      continue;      continue;
3660    
3661        /* Move the subject pointer back. This occurs only at the start of
3662        each branch of a lookbehind assertion. If we are too close to the start to
3663        move back, this match function fails. When working with UTF-8 we move
3664        back a number of characters, not bytes. */
3665    
3666        case OP_REVERSE:
3667    #ifdef SUPPORT_UTF8
3668        c = (ecode[1] << 8) + ecode[2];
3669        for (i = 0; i < c; i++)
3670          {
3671          eptr--;
3672          BACKCHAR(eptr)
3673          }
3674    #else
3675        eptr -= (ecode[1] << 8) + ecode[2];
3676    #endif
3677    
3678        if (eptr < md->start_subject) return FALSE;
3679        ecode += 3;
3680        break;
3681    
3682        /* Recursion matches the current regex, nested. If there are any capturing
3683        brackets started but not finished, we have to save their starting points
3684        and reinstate them after the recursion. However, we don't know how many
3685        such there are (offset_top records the completed total) so we just have
3686        to save all the potential data. There may be up to 99 such values, which
3687        is a bit large to put on the stack, but using malloc for small numbers
3688        seems expensive. As a compromise, the stack is used when there are fewer
3689        than 16 values to store; otherwise malloc is used. A problem is what to do
3690        if the malloc fails ... there is no way of returning to the top level with
3691        an error. Save the top 15 values on the stack, and accept that the rest
3692        may be wrong. */
3693    
3694        case OP_RECURSE:
3695          {
3696          BOOL rc;
3697          int *save;
3698          int stacksave[15];
3699    
3700          c = md->offset_max;
3701    
3702          if (c < 16) save = stacksave; else
3703            {
3704            save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3705            if (save == NULL)
3706              {
3707              save = stacksave;
3708              c = 15;
3709              }
3710            }
3711    
3712          for (i = 1; i <= c; i++)
3713            save[i] = md->offset_vector[md->offset_end - i];
3714          rc = match(eptr, md->start_pattern, offset_top, md, ims, eptrb,
3715            match_isgroup);
3716          for (i = 1; i <= c; i++)
3717            md->offset_vector[md->offset_end - i] = save[i];
3718          if (save != stacksave) (pcre_free)(save);
3719          if (!rc) return FALSE;
3720    
3721          /* In case the recursion has set more capturing values, save the final
3722          number, then move along the subject till after the recursive match,
3723          and advance one byte in the pattern code. */
3724    
3725          offset_top = md->end_offset_top;
3726          eptr = md->end_match_ptr;
3727          ecode++;
3728          }
3729        break;
3730    
3731      /* "Once" brackets are like assertion brackets except that after a match,      /* "Once" brackets are like assertion brackets except that after a match,
3732      the point in the subject string is not moved back. Thus there can never be      the point in the subject string is not moved back. Thus there can never be
3733      a move back into the brackets. Check the alternative branches in turn - the      a move back into the brackets. Check the alternative branches in turn - the
3734      matching won't pass the KET for this kind of subpattern. If any one branch      matching won't pass the KET for this kind of subpattern. If any one branch
3735      matches, we carry on, leaving the subject pointer. */      matches, we carry on as at the end of a normal bracket, leaving the subject
3736        pointer. */
3737    
3738      case OP_ONCE:      case OP_ONCE:
     do  
3739        {        {
3740        if (match(eptr, ecode+3, offset_top, md)) break;        const uschar *prev = ecode;
3741        ecode += (ecode[1] << 8) + ecode[2];        const uschar *saved_eptr = eptr;
       }  
     while (*ecode == OP_ALT);  
     if (*ecode == OP_KET) return FALSE;  
3742    
3743      /* Continue as from after the assertion, updating the offsets high water        do
3744      mark, since extracts may have been taken. */          {
3745            if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3746              break;
3747            ecode += (ecode[1] << 8) + ecode[2];
3748            }
3749          while (*ecode == OP_ALT);
3750    
3751      do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);        /* If hit the end of the group (which could be repeated), fail */
3752      ecode += 3;  
3753      offset_top = md->end_offset_top;        if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
3754      eptr = md->end_match_ptr;  
3755      continue;        /* Continue as from after the assertion, updating the offsets high water
3756          mark, since extracts may have been taken. */
3757    
3758          do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3759    
3760          offset_top = md->end_offset_top;
3761          eptr = md->end_match_ptr;
3762    
3763          /* For a non-repeating ket, just continue at this level. This also
3764          happens for a repeating ket if no characters were matched in the group.
3765          This is the forcible breaking of infinite loops as implemented in Perl
3766          5.005. If there is an options reset, it will get obeyed in the normal
3767          course of events. */
3768    
3769          if (*ecode == OP_KET || eptr == saved_eptr)
3770            {
3771            ecode += 3;
3772            break;
3773            }
3774    
3775          /* The repeating kets try the rest of the pattern or restart from the
3776          preceding bracket, in the appropriate order. We need to reset any options
3777          that changed within the bracket before re-running it, so check the next
3778          opcode. */
3779    
3780          if (ecode[3] == OP_OPT)
3781            {
3782            ims = (ims & ~PCRE_IMS) | ecode[4];
3783            DPRINTF(("ims set to %02lx at group repeat\n", ims));
3784            }
3785    
3786          if (*ecode == OP_KETRMIN)
3787            {
3788            if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3789                match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3790                  return TRUE;
3791            }
3792          else  /* OP_KETRMAX */
3793            {
3794            if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3795                match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3796            }
3797          }
3798        return FALSE;
3799    
3800      /* An alternation is the end of a branch; scan along to find the end of the      /* An alternation is the end of a branch; scan along to find the end of the
3801      bracketed group and go to there. */      bracketed group and go to there. */
# Line 2440  for (;;) Line 3813  for (;;)
3813      case OP_BRAZERO:      case OP_BRAZERO:
3814        {        {
3815        const uschar *next = ecode+1;        const uschar *next = ecode+1;
3816        if (match(eptr, next, offset_top, md)) return TRUE;        if (match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
3817            return TRUE;
3818        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3819        ecode = next + 3;        ecode = next + 3;
3820        }        }
# Line 2450  for (;;) Line 3824  for (;;)
3824        {        {
3825        const uschar *next = ecode+1;        const uschar *next = ecode+1;
3826        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3827        if (match(eptr, next+3, offset_top, md)) return TRUE;        if (match(eptr, next+3, offset_top, md, ims, eptrb, match_isgroup))
3828            return TRUE;
3829        ecode++;        ecode++;
3830        }        }
3831      break;;      break;
3832    
3833      /* End of a group, repeated or non-repeating. If we are at the end of      /* End of a group, repeated or non-repeating. If we are at the end of
3834      an assertion "group", stop matching and return TRUE, but record the      an assertion "group", stop matching and return TRUE, but record the
3835      current high water mark for use by positive assertions. */      current high water mark for use by positive assertions. Do this also
3836        for the "once" (not-backup up) groups. */
3837    
3838      case OP_KET:      case OP_KET:
3839      case OP_KETRMIN:      case OP_KETRMIN:
3840      case OP_KETRMAX:      case OP_KETRMAX:
3841        {        {
       int number;  
3842        const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];        const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3843          const uschar *saved_eptr = eptrb->saved_eptr;
3844    
3845        if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)        eptrb = eptrb->prev;    /* Back up the stack of bracket start pointers */
3846    
3847          if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3848              *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3849              *prev == OP_ONCE)
3850          {          {
3851          md->end_match_ptr = eptr;      /* For ONCE */          md->end_match_ptr = eptr;      /* For ONCE */
3852          md->end_offset_top = offset_top;          md->end_offset_top = offset_top;
3853          return TRUE;          return TRUE;
3854          }          }
3855    
3856        /* In all other cases we have to check the group number back at the        /* In all other cases except a conditional group we have to check the
3857        start and if necessary complete handling an extraction by setting the        group number back at the start and if necessary complete handling an
3858        final offset and bumping the high water mark. */        extraction by setting the offsets and bumping the high water mark. */
3859    
3860        number = (*prev - OP_BRA) << 1;        if (*prev != OP_COND)
3861            {
3862            int number = *prev - OP_BRA;
3863            int offset = number << 1;
3864    
3865        DPRINTF(("end bracket %d\n", number/2));  #ifdef DEBUG
3866            printf("end bracket %d", number);
3867            printf("\n");
3868    #endif
3869    
3870        if (number > 0)          if (number > 0)
         {  
         if (number >= md->offset_end) md->offset_overflow = TRUE; else  
3871            {            {
3872            md->offset_vector[number+1] = eptr - md->start_subject;            if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3873            if (offset_top <= number) offset_top = number + 2;              {
3874                md->offset_vector[offset] =
3875                  md->offset_vector[md->offset_end - number];
3876                md->offset_vector[offset+1] = eptr - md->start_subject;
3877                if (offset_top <= offset) offset_top = offset + 2;
3878                }
3879            }            }
3880          }          }
3881    
3882        /* For a non-repeating ket, just advance to the next node and continue at        /* Reset the value of the ims flags, in case they got changed during
3883        this level. */        the group. */
3884    
3885          ims = original_ims;
3886          DPRINTF(("ims reset to %02lx\n", ims));
3887    
3888        if (*ecode == OP_KET)        /* For a non-repeating ket, just continue at this level. This also
3889          happens for a repeating ket if no characters were matched in the group.
3890          This is the forcible breaking of infinite loops as implemented in Perl
3891          5.005. If there is an options reset, it will get obeyed in the normal
3892          course of events. */
3893    
3894          if (*ecode == OP_KET || eptr == saved_eptr)
3895          {          {
3896          ecode += 3;          ecode += 3;
3897          break;          break;
# Line 2504  for (;;) Line 3902  for (;;)
3902    
3903        if (*ecode == OP_KETRMIN)        if (*ecode == OP_KETRMIN)
3904          {          {
3905          if (match(eptr, ecode+3, offset_top, md) ||          if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3906              match(eptr, prev, offset_top, md)) return TRUE;              match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3907                  return TRUE;
3908          }          }
3909        else  /* OP_KETRMAX */        else  /* OP_KETRMAX */
3910          {          {
3911          if (match(eptr, prev, offset_top, md) ||          if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3912              match(eptr, ecode+3, offset_top, md)) return TRUE;              match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3913          }          }
3914        }        }
3915      return FALSE;      return FALSE;
# Line 2519  for (;;) Line 3918  for (;;)
3918    
3919      case OP_CIRC:      case OP_CIRC:
3920      if (md->notbol && eptr == md->start_subject) return FALSE;      if (md->notbol && eptr == md->start_subject) return FALSE;
3921      if (md->multiline)      if ((ims & PCRE_MULTILINE) != 0)
3922        {        {
3923        if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;        if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
3924        ecode++;        ecode++;
# Line 2534  for (;;) Line 3933  for (;;)
3933      ecode++;      ecode++;
3934      break;      break;
3935    
3936      /* Assert before internal newline if multiline, or before      /* Assert before internal newline if multiline, or before a terminating
3937      a terminating newline unless endonly is set, else end of subject unless      newline unless endonly is set, else end of subject unless noteol is set. */
     noteol is set. */  
3938    
3939      case OP_DOLL:      case OP_DOLL:
3940      if (md->noteol && eptr >= md->end_subject) return FALSE;      if ((ims & PCRE_MULTILINE) != 0)
     if (md->multiline)  
3941        {        {