/[pcre]/code/trunk/pcre.c
ViewVC logotype

Diff of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 17 by nigel, Sat Feb 24 21:38:29 2007 UTC revision 53 by nigel, Sat Feb 24 21:39:42 2007 UTC
# Line 9  the file Tech.Notes for some information Line 9  the file Tech.Notes for some information
9    
10  Written by: Philip Hazel <ph10@cam.ac.uk>  Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12             Copyright (c) 1998 University of Cambridge             Copyright (c) 1997-2001 University of Cambridge
13    
14  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
15  Permission is granted to anyone to use this software for any purpose on any  Permission is granted to anyone to use this software for any purpose on any
# Line 25  restrictions: Line 25  restrictions:
25    
26  3. Altered versions must be plainly marked as such, and must not be  3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.     misrepresented as being the original software.
28    
29    4. If PCRE is embedded in any software that is released under the GNU
30       General Purpose Licence (GPL), then the terms of that licence shall
31       supersede any condition above with which it is incompatible.
32  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
33  */  */
34    
# Line 33  restrictions: Line 37  restrictions:
37    
38  /* #define DEBUG */  /* #define DEBUG */
39    
40  /* Use a macro for debugging printing, 'cause that eliminates the the use  /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
41  of #ifdef inline, and there are *still* stupid compilers about that don't like  inline, and there are *still* stupid compilers about that don't like indented
42  indented pre-processor statements. I suppose it's only been 10 years... */  pre-processor statements. I suppose it's only been 10 years... */
43    
44  #ifdef DEBUG  #ifdef DEBUG
45  #define DPRINTF(p) printf p  #define DPRINTF(p) printf p
# Line 56  the external pcre header. */ Line 60  the external pcre header. */
60  #endif  #endif
61    
62    
63    /* Maximum number of items on the nested bracket stacks at compile time. This
64    applies to the nesting of all kinds of parentheses. It does not limit
65    un-nested, non-capturing parentheses. This number can be made bigger if
66    necessary - it is used to dimension one int and one unsigned char vector at
67    compile time. */
68    
69    #define BRASTACK_SIZE 200
70    
71    
72    /* The number of bytes in a literal character string above which we can't add
73    any more is different when UTF-8 characters may be encountered. */
74    
75    #ifdef SUPPORT_UTF8
76    #define MAXLIT 250
77    #else
78    #define MAXLIT 255
79    #endif
80    
81    
82  /* Min and max values for the common repeats; for the maxima, 0 => infinity */  /* Min and max values for the common repeats; for the maxima, 0 => infinity */
83    
84  static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };  static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
# Line 66  static const char rep_max[] = { 0, 0, 0, Line 89  static const char rep_max[] = { 0, 0, 0,
89  #ifdef DEBUG  #ifdef DEBUG
90  static const char *OP_names[] = {  static const char *OP_names[] = {
91    "End", "\\A", "\\B", "\\b", "\\D", "\\d",    "End", "\\A", "\\B", "\\b", "\\D", "\\d",
92    "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z", "^", "$", "Any", "chars",    "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
93    "not",    "Opt", "^", "$", "Any", "chars", "not",
94    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
95    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
96    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
97    "*", "*?", "+", "+?", "?", "??", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{",
98    "class", "negclass", "Ref",    "class", "Ref", "Recurse",
99    "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",    "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
100    "Brazero", "Braminzero", "Bra"    "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
101      "Brazero", "Braminzero", "Branumber", "Bra"
102  };  };
103  #endif  #endif
104    
# Line 90  static const short int escapes[] = { Line 114  static const short int escapes[] = {
114      0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */      0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
115      0,      0,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */      0,      0,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
116      0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */      0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
117    '`',      7, -ESC_b,      0, -ESC_d,     27,   '\f',      0,   /* ` - g */    '`',      7, -ESC_b,      0, -ESC_d,  ESC_E,  ESC_F,      0,   /* ` - g */
118      0,      0,      0,      0,      0,      0,   '\n',      0,   /* h - o */      0,      0,      0,      0,      0,      0,  ESC_N,      0,   /* h - o */
119      0,      0,   '\r', -ESC_s,   '\t',      0,      0, -ESC_w,   /* p - w */      0,      0,  ESC_R, -ESC_s,  ESC_T,      0,      0, -ESC_w,   /* p - w */
120      0,      0,      0                                            /* x - z */      0,      0, -ESC_z                                            /* x - z */
121    };
122    
123    /* Tables of names of POSIX character classes and their lengths. The list is
124    terminated by a zero length entry. The first three must be alpha, lower, upper,
125    as this is assumed for handling case independence. */
126    
127    static const char *posix_names[] = {
128      "alpha", "lower", "upper",
129      "alnum", "ascii", "cntrl", "digit", "graph",
130      "print", "punct", "space", "word",  "xdigit" };
131    
132    static const uschar posix_name_lengths[] = {
133      5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
134    
135    /* Table of class bit maps for each POSIX class; up to three may be combined
136    to form the class. */
137    
138    static const int posix_class_maps[] = {
139      cbit_lower, cbit_upper, -1,             /* alpha */
140      cbit_lower, -1,         -1,             /* lower */
141      cbit_upper, -1,         -1,             /* upper */
142      cbit_digit, cbit_lower, cbit_upper,     /* alnum */
143      cbit_print, cbit_cntrl, -1,             /* ascii */
144      cbit_cntrl, -1,         -1,             /* cntrl */
145      cbit_digit, -1,         -1,             /* digit */
146      cbit_graph, -1,         -1,             /* graph */
147      cbit_print, -1,         -1,             /* print */
148      cbit_punct, -1,         -1,             /* punct */
149      cbit_space, -1,         -1,             /* space */
150      cbit_word,  -1,         -1,             /* word */
151      cbit_xdigit,-1,         -1              /* xdigit */
152  };  };
153    
154    
155  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
156    
157  static BOOL  static BOOL
158    compile_regex(int, int *, uschar **, const uschar **, const char **);    compile_regex(int, int, int *, uschar **, const uschar **, const char **,
159        BOOL, int, int *, int *, compile_data *);
160    
161  /* Structure for passing "static" information around between the functions  /* Structure for building a chain of data that actually lives on the
162  doing the matching, so that they are thread-safe. */  stack, for holding the values of the subject pointer at the start of each
163    subpattern, so as to detect when an empty string has been matched by a
164    subpattern - to break infinite loops. */
165    
166    typedef struct eptrblock {
167      struct eptrblock *prev;
168      const uschar *saved_eptr;
169    } eptrblock;
170    
171  typedef struct match_data {  /* Flag bits for the match() function */
172    int    errorcode;             /* As it says */  
173    int   *offset_vector;         /* Offset vector */  #define match_condassert   0x01    /* Called to check a condition assertion */
174    int    offset_end;            /* One past the end */  #define match_isgroup      0x02    /* Set if start of bracketed group */
   BOOL   offset_overflow;       /* Set if too many extractions */  
   BOOL   caseless;              /* Case-independent flag */  
   BOOL   runtime_caseless;      /* Caseless forced at run time */  
   BOOL   multiline;             /* Multiline flag */  
   BOOL   notbol;                /* NOTBOL flag */  
   BOOL   noteol;                /* NOTEOL flag */  
   BOOL   dotall;                /* Dot matches any char */  
   BOOL   endonly;               /* Dollar not before final \n */  
   const uschar *start_subject;  /* Start of the subject string */  
   const uschar *end_subject;    /* End of the subject string */  
   jmp_buf fail_env;             /* Environment for longjump() break out */  
   const uschar *end_match_ptr;  /* Subject position at end match */  
   int     end_offset_top;       /* Highwater mark at end of match */  
 } match_data;  
175    
176    
177    
# Line 139  void  (*pcre_free)(void *) = free; Line 189  void  (*pcre_free)(void *) = free;
189    
190    
191    
192    /*************************************************
193    *    Macros and tables for character handling    *
194    *************************************************/
195    
196    /* When UTF-8 encoding is being used, a character is no longer just a single
197    byte. The macros for character handling generate simple sequences when used in
198    byte-mode, and more complicated ones for UTF-8 characters. */
199    
200    #ifndef SUPPORT_UTF8
201    #define GETCHARINC(c, eptr) c = *eptr++;
202    #define GETCHARLEN(c, eptr, len) c = *eptr;
203    #define BACKCHAR(eptr)
204    
205    #else   /* SUPPORT_UTF8 */
206    
207    /* Get the next UTF-8 character, advancing the pointer */
208    
209    #define GETCHARINC(c, eptr) \
210      c = *eptr++; \
211      if (md->utf8 && (c & 0xc0) == 0xc0) \
212        { \
213        int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
214        int s = 6 - a;                  /* Amount to shift next byte */  \
215        c &= utf8_table3[a];            /* Low order bits from first byte */ \
216        while (a-- > 0) \
217          { \
218          c |= (*eptr++ & 0x3f) << s; \
219          s += 6; \
220          } \
221        }
222    
223    /* Get the next UTF-8 character, not advancing the pointer, setting length */
224    
225    #define GETCHARLEN(c, eptr, len) \
226      c = *eptr; \
227      len = 1; \
228      if (md->utf8 && (c & 0xc0) == 0xc0) \
229        { \
230        int i; \
231        int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
232        int s = 6 - a;                  /* Amount to shift next byte */  \
233        c &= utf8_table3[a];            /* Low order bits from first byte */ \
234        for (i = 1; i <= a; i++) \
235          { \
236          c |= (eptr[i] & 0x3f) << s; \
237          s += 6; \
238          } \
239        len += a; \
240        }
241    
242    /* If the pointer is not at the start of a character, move it back until
243    it is. */
244    
245    #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
246    
247    #endif
248    
249    
250    
251    /*************************************************
252    *             Default character tables           *
253    *************************************************/
254    
255    /* A default set of character tables is included in the PCRE binary. Its source
256    is built by the maketables auxiliary program, which uses the default C ctypes
257    functions, and put in the file chartables.c. These tables are used by PCRE
258    whenever the caller of pcre_compile() does not provide an alternate set of
259    tables. */
260    
261    #include "chartables.c"
262    
263    
264    
265    #ifdef SUPPORT_UTF8
266    /*************************************************
267    *           Tables for UTF-8 support             *
268    *************************************************/
269    
270    /* These are the breakpoints for different numbers of bytes in a UTF-8
271    character. */
272    
273    static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
274    
275    /* These are the indicator bits and the mask for the data bits to set in the
276    first byte of a character, indexed by the number of additional bytes. */
277    
278    static int utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
279    static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
280    
281    /* Table of the number of extra bytes, indexed by the first byte of a character
282    masked with 0x3f. The highest number for a valid UTF-8 character is in fact
283    0x3d. */
284    
285    static uschar utf8_table4[] = {
286      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
287      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
288      2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
289      3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
290    
291    
292    /*************************************************
293    *       Convert character value to UTF-8         *
294    *************************************************/
295    
296    /* This function takes an integer value in the range 0 - 0x7fffffff
297    and encodes it as a UTF-8 character in 1 to 6 bytes.
298    
299    Arguments:
300      cvalue     the character value
301      buffer     pointer to buffer for result - at least 6 bytes long
302    
303    Returns:     number of bytes placed in the buffer
304    */
305    
306    static int
307    ord2utf8(int cvalue, uschar *buffer)
308    {
309    register int i, j;
310    for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
311      if (cvalue <= utf8_table1[i]) break;
312    *buffer++ = utf8_table2[i] | (cvalue & utf8_table3[i]);
313    cvalue >>= 6 - i;
314    for (j = 0; j < i; j++)
315      {
316      *buffer++ = 0x80 | (cvalue & 0x3f);
317      cvalue >>= 6;
318      }
319    return i + 1;
320    }
321    #endif
322    
323    
324    
325  /*************************************************  /*************************************************
326  *          Return version string                 *  *          Return version string                 *
327  *************************************************/  *************************************************/
328    
329    #define STRING(a)  # a
330    #define XSTRING(s) STRING(s)
331    
332  const char *  const char *
333  pcre_version(void)  pcre_version(void)
334  {  {
335  return PCRE_VERSION;  return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
336  }  }
337    
338    
339    
340    
341  /*************************************************  /*************************************************
342  *       Return info about a compiled pattern     *  * (Obsolete) Return info about compiled pattern  *
343  *************************************************/  *************************************************/
344    
345  /* This function picks potentially useful data out of the private  /* This is the original "info" function. It picks potentially useful data out
346  structure.  of the private structure, but its interface was too rigid. It remains for
347    backwards compatibility. The public options are passed back in an int - though
348    the re->options field has been expanded to a long int, all the public options
349    are at the low end of it, and so even on 16-bit systems this will still be OK.
350    Therefore, I haven't changed the API for pcre_info().
351    
352  Arguments:  Arguments:
353    external_re   points to compiled code    external_re   points to compiled code
# Line 167  Arguments: Line 356  Arguments:
356                  or -1 if multiline and all branches start ^,                  or -1 if multiline and all branches start ^,
357                  or -2 otherwise                  or -2 otherwise
358    
359  Returns:        number of identifying extraction brackets  Returns:        number of capturing subpatterns
360                  or negative values on error                  or negative values on error
361  */  */
362    
# Line 177  pcre_info(const pcre *external_re, int * Line 366  pcre_info(const pcre *external_re, int *
366  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
367  if (re == NULL) return PCRE_ERROR_NULL;  if (re == NULL) return PCRE_ERROR_NULL;
368  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
369  if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);  if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
370  if (first_char != NULL)  if (first_char != NULL)
371    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
372       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
# Line 186  return re->top_bracket; Line 375  return re->top_bracket;
375    
376    
377    
378    /*************************************************
379    *        Return info about compiled pattern      *
380    *************************************************/
381    
382    /* This is a newer "info" function which has an extensible interface so
383    that additional items can be added compatibly.
384    
385    Arguments:
386      external_re      points to compiled code
387      study_data       points to study data, or NULL
388      what             what information is required
389      where            where to put the information
390    
391    Returns:           0 if data returned, negative on error
392    */
393    
394    int
395    pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
396      void *where)
397    {
398    const real_pcre *re = (const real_pcre *)external_re;
399    const real_pcre_extra *study = (const real_pcre_extra *)study_data;
400    
401    if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
402    if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
403    
404    switch (what)
405      {
406      case PCRE_INFO_OPTIONS:
407      *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
408      break;
409    
410      case PCRE_INFO_SIZE:
411      *((size_t *)where) = re->size;
412      break;
413    
414      case PCRE_INFO_CAPTURECOUNT:
415      *((int *)where) = re->top_bracket;
416      break;
417    
418      case PCRE_INFO_BACKREFMAX:
419      *((int *)where) = re->top_backref;
420      break;
421    
422      case PCRE_INFO_FIRSTCHAR:
423      *((int *)where) =
424        ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
425        ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
426      break;
427    
428      case PCRE_INFO_FIRSTTABLE:
429      *((const uschar **)where) =
430        (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
431          study->start_bits : NULL;
432      break;
433    
434      case PCRE_INFO_LASTLITERAL:
435      *((int *)where) =
436        ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
437      break;
438    
439      default: return PCRE_ERROR_BADOPTION;
440      }
441    
442    return 0;
443    }
444    
445    
446    
447  #ifdef DEBUG  #ifdef DEBUG
448  /*************************************************  /*************************************************
# Line 218  while (length-- > 0) Line 475  while (length-- > 0)
475    
476    
477  /*************************************************  /*************************************************
 *         Check subpattern for empty operand     *  
 *************************************************/  
   
 /* This function checks a bracketed subpattern to see if any of the paths  
 through it could match an empty string. This is used to diagnose an error if  
 such a subpattern is followed by a quantifier with an unlimited upper bound.  
   
 Argument:  
   code      points to the opening bracket  
   
 Returns:    TRUE or FALSE  
 */  
   
 static BOOL  
 could_be_empty(uschar *code)  
 {  
 do {  
   uschar *cc = code + 3;  
   
   /* Scan along the opcodes for this branch; as soon as we find something  
   that matches a non-empty string, break out and advance to test the next  
   branch. If we get to the end of the branch, return TRUE for the whole  
   sub-expression. */  
   
   for (;;)  
     {  
     /* Test an embedded subpattern; if it could not be empty, break the  
     loop. Otherwise carry on in the branch. */  
   
     if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)  
       {  
       if (!could_be_empty(cc)) break;  
       do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);  
       cc += 3;  
       }  
   
     else switch (*cc)  
       {  
       /* Reached end of a branch: the subpattern may match the empty string */  
   
       case OP_ALT:  
       case OP_KET:  
       case OP_KETRMAX:  
       case OP_KETRMIN:  
       return TRUE;  
   
       /* Skip over entire bracket groups with zero lower bound */  
   
       case OP_BRAZERO:  
       case OP_BRAMINZERO:  
       cc++;  
       /* Fall through */  
   
       /* Skip over assertive subpatterns */  
   
       case OP_ASSERT:  
       case OP_ASSERT_NOT:  
       do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);  
       cc += 3;  
       break;  
   
       /* Skip over things that don't match chars */  
   
       case OP_SOD:  
       case OP_EOD:  
       case OP_CIRC:  
       case OP_DOLL:  
       case OP_NOT_WORD_BOUNDARY:  
       case OP_WORD_BOUNDARY:  
       cc++;  
       break;  
   
       /* Skip over simple repeats with zero lower bound */  
   
       case OP_STAR:  
       case OP_MINSTAR:  
       case OP_QUERY:  
       case OP_MINQUERY:  
       case OP_NOTSTAR:  
       case OP_NOTMINSTAR:  
       case OP_NOTQUERY:  
       case OP_NOTMINQUERY:  
       case OP_TYPESTAR:  
       case OP_TYPEMINSTAR:  
       case OP_TYPEQUERY:  
       case OP_TYPEMINQUERY:  
       cc += 2;  
       break;  
   
       /* Skip over UPTOs (lower bound is zero) */  
   
       case OP_UPTO:  
       case OP_MINUPTO:  
       case OP_TYPEUPTO:  
       case OP_TYPEMINUPTO:  
       cc += 4;  
       break;  
   
       /* Check a class or a back reference for a zero minimum */  
   
       case OP_CLASS:  
       case OP_NEGCLASS:  
       case OP_REF:  
       cc += (*cc == OP_REF)? 2 : 33;  
   
       switch (*cc)  
         {  
         case OP_CRSTAR:  
         case OP_CRMINSTAR:  
         case OP_CRQUERY:  
         case OP_CRMINQUERY:  
         cc++;  
         break;  
   
         case OP_CRRANGE:  
         case OP_CRMINRANGE:  
         if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;  
         cc += 3;  
         break;  
   
         default:  
         goto NEXT_BRANCH;  
         }  
       break;  
   
       /* Anything else matches at least one character */  
   
       default:  
       goto NEXT_BRANCH;  
       }  
     }  
   
   NEXT_BRANCH:  
   code += (code[1] << 8) + code[2];  
   }  
 while (*code == OP_ALT);  
   
 /* No branches match the empty string */  
   
 return FALSE;  
 }  
   
   
   
 /*************************************************  
478  *            Handle escapes                      *  *            Handle escapes                      *
479  *************************************************/  *************************************************/
480    
481  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
482  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
483  encodes one of the more complicated things such as \d. On entry, ptr is  encodes one of the more complicated things such as \d. When UTF-8 is enabled,
484  pointing at the \. On exit, it is on the final character of the escape  a positive value greater than 255 may be returned. On entry, ptr is pointing at
485  sequence.  the \. On exit, it is on the final character of the escape sequence.
486    
487  Arguments:  Arguments:
488    ptrptr     points to the pattern position pointer    ptrptr     points to the pattern position pointer
# Line 378  Arguments: Line 490  Arguments:
490    bracount   number of previous extracting brackets    bracount   number of previous extracting brackets
491    options    the options bits    options    the options bits
492    isclass    TRUE if inside a character class    isclass    TRUE if inside a character class
493      cd         pointer to char tables block
494    
495  Returns:     zero or positive => a data character  Returns:     zero or positive => a data character
496               negative => a special escape sequence               negative => a special escape sequence
# Line 386  Returns:     zero or positive => a data Line 499  Returns:     zero or positive => a data
499    
500  static int  static int
501  check_escape(const uschar **ptrptr, const char **errorptr, int bracount,  check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
502    int options, BOOL isclass)    int options, BOOL isclass, compile_data *cd)
503  {  {
504  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
505  int c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */  int c, i;
 int i;  
506    
507    /* If backslash is at the end of the pattern, it's an error. */
508    
509    c = *(++ptr);
510  if (c == 0) *errorptr = ERR1;  if (c == 0) *errorptr = ERR1;
511    
512  /* Digits or letters may have special meaning; all others are literals. */  /* Digits or letters may have special meaning; all others are literals. */
# Line 429  else Line 544  else
544        {        {
545        oldptr = ptr;        oldptr = ptr;
546        c -= '0';        c -= '0';
547        while ((pcre_ctypes[ptr[1]] & ctype_digit) != 0)        while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
548          c = c * 10 + *(++ptr) - '0';          c = c * 10 + *(++ptr) - '0';
549        if (c < 10 || c <= bracount)        if (c < 10 || c <= bracount)
550          {          {
# Line 451  else Line 566  else
566        }        }
567    
568      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
569      larger first octal digit */      larger first octal digit. */
570    
571      case '0':      case '0':
572      c -= '0';      c -= '0';
573      while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&      while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
574        ptr[1] != '8' && ptr[1] != '9')        ptr[1] != '8' && ptr[1] != '9')
575          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
576        c &= 255;     /* Take least significant 8 bits */
577      break;      break;
578    
579      /* Special escapes not starting with a digit are straightforward */      /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
580        which can be greater than 0xff, but only if the ddd are hex digits. */
581    
582      case 'x':      case 'x':
583    #ifdef SUPPORT_UTF8
584        if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
585          {
586          const uschar *pt = ptr + 2;
587          register int count = 0;
588          c = 0;
589          while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
590            {
591            count++;
592            c = c * 16 + cd->lcc[*pt] -
593              (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
594            pt++;
595            }
596          if (*pt == '}')
597            {
598            if (c < 0 || count > 8) *errorptr = ERR34;
599            ptr = pt;
600            break;
601            }
602          /* If the sequence of hex digits does not end with '}', then we don't
603          recognize this construct; fall through to the normal \x handling. */
604          }
605    #endif
606    
607        /* Read just a single hex char */
608    
609      c = 0;      c = 0;
610      while (i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
611        {        {
612        ptr++;        ptr++;
613        c = c * 16 + pcre_lcc[*ptr] -        c = c * 16 + cd->lcc[*ptr] -
614          (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');          (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
615        }        }
616      break;      break;
617    
618        /* Other special escapes not starting with a digit are straightforward */
619    
620      case 'c':      case 'c':
621      c = *(++ptr);      c = *(++ptr);
622      if (c == 0)      if (c == 0)
# Line 482  else Line 627  else
627    
628      /* A letter is upper-cased; then the 0x40 bit is flipped */      /* A letter is upper-cased; then the 0x40 bit is flipped */
629    
630      if (c >= 'a' && c <= 'z') c = pcre_fcc[c];      if (c >= 'a' && c <= 'z') c = cd->fcc[c];
631      c ^= 0x40;      c ^= 0x40;
632      break;      break;
633    
634      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
635      other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,      other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
636      for Perl compatibility, it is a literal. */      for Perl compatibility, it is a literal. This code looks a bit odd, but
637        there used to be some cases other than the default, and there may be again
638        in future, so I haven't "optimized" it. */
639    
640      default:      default:
641      if ((options & PCRE_EXTRA) != 0) switch(c)      if ((options & PCRE_EXTRA) != 0) switch(c)
642        {        {
       case 'X':  
       c = -ESC_X;      /* This could be a lookup if it ever got into Perl */  
       break;  
   
643        default:        default:
644        *errorptr = ERR3;        *errorptr = ERR3;
645        break;        break;
# Line 522  where the ddds are digits. Line 665  where the ddds are digits.
665    
666  Arguments:  Arguments:
667    p         pointer to the first char after '{'    p         pointer to the first char after '{'
668      cd        pointer to char tables block
669    
670  Returns:    TRUE or FALSE  Returns:    TRUE or FALSE
671  */  */
672    
673  static BOOL  static BOOL
674  is_counted_repeat(const uschar *p)  is_counted_repeat(const uschar *p, compile_data *cd)
675  {  {
676  if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;  if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
677  while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;  while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
678  if (*p == '}') return TRUE;  if (*p == '}') return TRUE;
679    
680  if (*p++ != ',') return FALSE;  if (*p++ != ',') return FALSE;
681  if (*p == '}') return TRUE;  if (*p == '}') return TRUE;
682    
683  if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;  if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
684  while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;  while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
685  return (*p == '}');  return (*p == '}');
686  }  }
687    
# Line 557  Arguments: Line 701  Arguments:
701    maxp       pointer to int for max    maxp       pointer to int for max
702               returned as -1 if no max               returned as -1 if no max
703    errorptr   points to pointer to error message    errorptr   points to pointer to error message
704      cd         pointer to character tables block
705    
706  Returns:     pointer to '}' on success;  Returns:     pointer to '}' on success;
707               current ptr on error, with errorptr set               current ptr on error, with errorptr set
708  */  */
709    
710  static const uschar *  static const uschar *
711  read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)  read_repeat_counts(const uschar *p, int *minp, int *maxp,
712      const char **errorptr, compile_data *cd)
713  {  {
714  int min = 0;  int min = 0;
715  int max = -1;  int max = -1;
716    
717  while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';  while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
718    
719  if (*p == '}') max = min; else  if (*p == '}') max = min; else
720    {    {
721    if (*(++p) != '}')    if (*(++p) != '}')
722      {      {
723      max = 0;      max = 0;
724      while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';      while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
725      if (max < min)      if (max < min)
726        {        {
727        *errorptr = ERR4;        *errorptr = ERR4;
# Line 600  return p; Line 746  return p;
746    
747    
748  /*************************************************  /*************************************************
749    *        Find the fixed length of a pattern      *
750    *************************************************/
751    
752    /* Scan a pattern and compute the fixed length of subject that will match it,
753    if the length is fixed. This is needed for dealing with backward assertions.
754    
755    Arguments:
756      code     points to the start of the pattern (the bracket)
757      options  the compiling options
758    
759    Returns:   the fixed length, or -1 if there is no fixed length
760    */
761    
762    static int
763    find_fixedlength(uschar *code, int options)
764    {
765    int length = -1;
766    
767    register int branchlength = 0;
768    register uschar *cc = code + 3;
769    
770    /* Scan along the opcodes for this branch. If we get to the end of the
771    branch, check the length against that of the other branches. */
772    
773    for (;;)
774      {
775      int d;
776      register int op = *cc;
777      if (op >= OP_BRA) op = OP_BRA;
778    
779      switch (op)
780        {
781        case OP_BRA:
782        case OP_ONCE:
783        case OP_COND:
784        d = find_fixedlength(cc, options);
785        if (d < 0) return -1;
786        branchlength += d;
787        do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
788        cc += 3;
789        break;
790    
791        /* Reached end of a branch; if it's a ket it is the end of a nested
792        call. If it's ALT it is an alternation in a nested call. If it is
793        END it's the end of the outer call. All can be handled by the same code. */
794    
795        case OP_ALT:
796        case OP_KET:
797        case OP_KETRMAX:
798        case OP_KETRMIN:
799        case OP_END:
800        if (length < 0) length = branchlength;
801          else if (length != branchlength) return -1;
802        if (*cc != OP_ALT) return length;
803        cc += 3;
804        branchlength = 0;
805        break;
806    
807        /* Skip over assertive subpatterns */
808    
809        case OP_ASSERT:
810        case OP_ASSERT_NOT:
811        case OP_ASSERTBACK:
812        case OP_ASSERTBACK_NOT:
813        do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
814        cc += 3;
815        break;
816    
817        /* Skip over things that don't match chars */
818    
819        case OP_REVERSE:
820        case OP_BRANUMBER:
821        case OP_CREF:
822        cc++;
823        /* Fall through */
824    
825        case OP_OPT:
826        cc++;
827        /* Fall through */
828    
829        case OP_SOD:
830        case OP_EOD:
831        case OP_EODN:
832        case OP_CIRC:
833        case OP_DOLL:
834        case OP_NOT_WORD_BOUNDARY:
835        case OP_WORD_BOUNDARY:
836        cc++;
837        break;
838    
839        /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
840        This requires a scan of the string, unfortunately. We assume valid UTF-8
841        strings, so all we do is reduce the length by one for byte whose bits are
842        10xxxxxx. */
843    
844        case OP_CHARS:
845        branchlength += *(++cc);
846    #ifdef SUPPORT_UTF8
847        for (d = 1; d <= *cc; d++)
848          if ((cc[d] & 0xc0) == 0x80) branchlength--;
849    #endif
850        cc += *cc + 1;
851        break;
852    
853        /* Handle exact repetitions */
854    
855        case OP_EXACT:
856        case OP_TYPEEXACT:
857        branchlength += (cc[1] << 8) + cc[2];
858        cc += 4;
859        break;
860    
861        /* Handle single-char matchers */
862    
863        case OP_NOT_DIGIT:
864        case OP_DIGIT:
865        case OP_NOT_WHITESPACE:
866        case OP_WHITESPACE:
867        case OP_NOT_WORDCHAR:
868        case OP_WORDCHAR:
869        case OP_ANY:
870        branchlength++;
871        cc++;
872        break;
873    
874    
875        /* Check a class for variable quantification */
876    
877        case OP_CLASS:
878        cc += 33;
879    
880        switch (*cc)
881          {
882          case OP_CRSTAR:
883          case OP_CRMINSTAR:
884          case OP_CRQUERY:
885          case OP_CRMINQUERY:
886          return -1;
887    
888          case OP_CRRANGE:
889          case OP_CRMINRANGE:
890          if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
891          branchlength += (cc[1] << 8) + cc[2];
892          cc += 5;
893          break;
894    
895          default:
896          branchlength++;
897          }
898        break;
899    
900        /* Anything else is variable length */
901    
902        default:
903        return -1;
904        }
905      }
906    /* Control never gets here */
907    }
908    
909    
910    
911    
912    /*************************************************
913    *           Check for POSIX class syntax         *
914    *************************************************/
915    
916    /* This function is called when the sequence "[:" or "[." or "[=" is
917    encountered in a character class. It checks whether this is followed by an
918    optional ^ and then a sequence of letters, terminated by a matching ":]" or
919    ".]" or "=]".
920    
921    Argument:
922      ptr      pointer to the initial [
923      endptr   where to return the end pointer
924      cd       pointer to compile data
925    
926    Returns:   TRUE or FALSE
927    */
928    
929    static BOOL
930    check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
931    {
932    int terminator;          /* Don't combine these lines; the Solaris cc */
933    terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
934    if (*(++ptr) == '^') ptr++;
935    while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
936    if (*ptr == terminator && ptr[1] == ']')
937      {
938      *endptr = ptr;
939      return TRUE;
940      }
941    return FALSE;
942    }
943    
944    
945    
946    
947    /*************************************************
948    *          Check POSIX class name                *
949    *************************************************/
950    
951    /* This function is called to check the name given in a POSIX-style class entry
952    such as [:alnum:].
953    
954    Arguments:
955      ptr        points to the first letter
956      len        the length of the name
957    
958    Returns:     a value representing the name, or -1 if unknown
959    */
960    
961    static int
962    check_posix_name(const uschar *ptr, int len)
963    {
964    register int yield = 0;
965    while (posix_name_lengths[yield] != 0)
966      {
967      if (len == posix_name_lengths[yield] &&
968        strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
969      yield++;
970      }
971    return -1;
972    }
973    
974    
975    
976    
977    /*************************************************
978  *           Compile one branch                   *  *           Compile one branch                   *
979  *************************************************/  *************************************************/
980    
981  /* Scan the pattern, compiling it into the code vector.  /* Scan the pattern, compiling it into the code vector.
982    
983  Arguments:  Arguments:
984    options    the option bits    options      the option bits
985    bracket    points to number of brackets used    brackets     points to number of extracting brackets used
986    code       points to the pointer to the current code point    code         points to the pointer to the current code point
987    ptrptr     points to the current pattern pointer    ptrptr       points to the current pattern pointer
988    errorptr   points to pointer to error message    errorptr     points to pointer to error message
989      optchanged   set to the value of the last OP_OPT item compiled
990      reqchar      set to the last literal character required, else -1
991      countlits    set to count of mandatory literal characters
992      cd           contains pointers to tables
993    
994  Returns:     TRUE on success  Returns:       TRUE on success
995               FALSE, with *errorptr set on error                 FALSE, with *errorptr set on error
996  */  */
997    
998  static BOOL  static BOOL
999  compile_branch(int options, int *brackets, uschar **codeptr,  compile_branch(int options, int *brackets, uschar **codeptr,
1000    const uschar **ptrptr, const char **errorptr)    const uschar **ptrptr, const char **errorptr, int *optchanged,
1001      int *reqchar, int *countlits, compile_data *cd)
1002  {  {
1003  int repeat_type, op_type;  int repeat_type, op_type;
1004  int repeat_min, repeat_max;  int repeat_min, repeat_max;
1005  int bravalue, length;  int bravalue, length;
1006    int greedy_default, greedy_non_default;
1007    int prevreqchar;
1008    int condcount = 0;
1009    int subcountlits = 0;
1010  register int c;  register int c;
1011  register uschar *code = *codeptr;  register uschar *code = *codeptr;
1012    uschar *tempcode;
1013  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
1014  const uschar *oldptr;  const uschar *tempptr;
1015  uschar *previous = NULL;  uschar *previous = NULL;
1016  uschar class[32];  uschar class[32];
1017    
1018    /* Set up the default and non-default settings for greediness */
1019    
1020    greedy_default = ((options & PCRE_UNGREEDY) != 0);
1021    greedy_non_default = greedy_default ^ 1;
1022    
1023    /* Initialize no required char, and count of literals */
1024    
1025    *reqchar = prevreqchar = -1;
1026    *countlits = 0;
1027    
1028  /* Switch on next character until the end of the branch */  /* Switch on next character until the end of the branch */
1029    
1030  for (;; ptr++)  for (;; ptr++)
1031    {    {
1032    BOOL negate_class;    BOOL negate_class;
1033    int  class_charcount;    int class_charcount;
1034    int  class_lastchar;    int class_lastchar;
1035      int newoptions;
1036      int skipbytes;
1037      int subreqchar;
1038    
1039    c = *ptr;    c = *ptr;
1040    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
1041      {      {
1042      if ((pcre_ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
1043      if (c == '#')      if (c == '#')
1044        {        {
1045        while ((c = *(++ptr)) != 0 && c != '\n');        /* The space before the ; is to avoid a warning on a silly compiler
1046          on the Macintosh. */
1047          while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1048        continue;        continue;
1049        }        }
1050      }      }
# Line 684  for (;; ptr++) Line 1084  for (;; ptr++)
1084    
1085      case '[':      case '[':
1086      previous = code;      previous = code;
1087        *code++ = OP_CLASS;
1088    
1089      /* If the first character is '^', set the negation flag, and use a      /* If the first character is '^', set the negation flag and skip it. */
     different opcode. This only matters if caseless matching is specified at  
     runtime. */  
1090    
1091      if ((c = *(++ptr)) == '^')      if ((c = *(++ptr)) == '^')
1092        {        {
1093        negate_class = TRUE;        negate_class = TRUE;
       *code++ = OP_NEGCLASS;  
1094        c = *(++ptr);        c = *(++ptr);
1095        }        }
1096      else      else negate_class = FALSE;
       {  
       negate_class = FALSE;  
       *code++ = OP_CLASS;  
       }  
1097    
1098      /* Keep a count of chars so that we can optimize the case of just a single      /* Keep a count of chars so that we can optimize the case of just a single
1099      character. */      character. */
# Line 725  for (;; ptr++) Line 1119  for (;; ptr++)
1119          goto FAILED;          goto FAILED;
1120          }          }
1121    
1122          /* Handle POSIX class names. Perl allows a negation extension of the
1123          form [:^name]. A square bracket that doesn't match the syntax is
1124          treated as a literal. We also recognize the POSIX constructions
1125          [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1126          5.6 does. */
1127    
1128          if (c == '[' &&
1129              (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1130              check_posix_syntax(ptr, &tempptr, cd))
1131            {
1132            BOOL local_negate = FALSE;
1133            int posix_class, i;
1134            register const uschar *cbits = cd->cbits;
1135    
1136            if (ptr[1] != ':')
1137              {
1138              *errorptr = ERR31;
1139              goto FAILED;
1140              }
1141    
1142            ptr += 2;
1143            if (*ptr == '^')
1144              {
1145              local_negate = TRUE;
1146              ptr++;
1147              }
1148    
1149            posix_class = check_posix_name(ptr, tempptr - ptr);
1150            if (posix_class < 0)
1151              {
1152              *errorptr = ERR30;
1153              goto FAILED;
1154              }
1155    
1156            /* If matching is caseless, upper and lower are converted to
1157            alpha. This relies on the fact that the class table starts with
1158            alpha, lower, upper as the first 3 entries. */
1159    
1160            if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1161              posix_class = 0;
1162    
1163            /* Or into the map we are building up to 3 of the static class
1164            tables, or their negations. */
1165    
1166            posix_class *= 3;
1167            for (i = 0; i < 3; i++)
1168              {
1169              int taboffset = posix_class_maps[posix_class + i];
1170              if (taboffset < 0) break;
1171              if (local_negate)
1172                for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1173              else
1174                for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1175              }
1176    
1177            ptr = tempptr + 1;
1178            class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
1179            continue;
1180            }
1181    
1182        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
1183        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. Escaped items are checked for
1184        validity in the pre-compiling pass. The sequence \b is a special case.        validity in the pre-compiling pass. The sequence \b is a special case.
# Line 735  for (;; ptr++) Line 1189  for (;; ptr++)
1189    
1190        if (c == '\\')        if (c == '\\')
1191          {          {
1192          c = check_escape(&ptr, errorptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1193          if (-c == ESC_b) c = '\b';          if (-c == ESC_b) c = '\b';
1194          else if (c < 0)          else if (c < 0)
1195            {            {
1196              register const uschar *cbits = cd->cbits;
1197            class_charcount = 10;            class_charcount = 10;
1198            switch (-c)            switch (-c)
1199              {              {
1200              case ESC_d:              case ESC_d:
1201              for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1202              continue;              continue;
1203    
1204              case ESC_D:              case ESC_D:
1205              for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1206              continue;              continue;
1207    
1208              case ESC_w:              case ESC_w:
1209              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
               class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);  
1210              continue;              continue;
1211    
1212              case ESC_W:              case ESC_W:
1213              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
               class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);  
1214              continue;              continue;
1215    
1216              case ESC_s:              case ESC_s:
1217              for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1218              continue;              continue;
1219    
1220              case ESC_S:              case ESC_S:
1221              for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1222              continue;              continue;
1223    
1224              default:              default:
# Line 773  for (;; ptr++) Line 1226  for (;; ptr++)
1226              goto FAILED;              goto FAILED;
1227              }              }
1228            }            }
1229          /* Fall through if single character */  
1230            /* Fall through if single character, but don't at present allow
1231            chars > 255 in UTF-8 mode. */
1232    
1233    #ifdef SUPPORT_UTF8
1234            if (c > 255)
1235              {
1236              *errorptr = ERR33;
1237              goto FAILED;
1238              }
1239    #endif
1240          }          }
1241    
1242        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
# Line 793  for (;; ptr++) Line 1256  for (;; ptr++)
1256            }            }
1257    
1258          /* The second part of a range can be a single-character escape, but          /* The second part of a range can be a single-character escape, but
1259          not any of the other escapes. */          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1260            in such circumstances. */
1261    
1262          if (d == '\\')          if (d == '\\')
1263            {            {
1264            d = check_escape(&ptr, errorptr, *brackets, options, TRUE);            const uschar *oldptr = ptr;
1265              d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1266    
1267    #ifdef SUPPORT_UTF8
1268              if (d > 255)
1269                {
1270                *errorptr = ERR33;
1271                goto FAILED;
1272                }
1273    #endif
1274              /* \b is backslash; any other special means the '-' was literal */
1275    
1276            if (d < 0)            if (d < 0)
1277              {              {
1278              if (d == -ESC_b) d = '\b'; else              if (d == -ESC_b) d = '\b'; else
1279                {                {
1280                *errorptr = ERR7;                ptr = oldptr - 2;
1281                goto FAILED;                goto SINGLE_CHARACTER;  /* A few lines below */
1282                }                }
1283              }              }
1284            }            }
# Line 819  for (;; ptr++) Line 1294  for (;; ptr++)
1294            class[c/8] |= (1 << (c&7));            class[c/8] |= (1 << (c&7));
1295            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
1296              {              {
1297              int uc = pcre_fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
1298              class[uc/8] |= (1 << (uc&7));              class[uc/8] |= (1 << (uc&7));
1299              }              }
1300            class_charcount++;                /* in case a one-char range */            class_charcount++;                /* in case a one-char range */
# Line 831  for (;; ptr++) Line 1306  for (;; ptr++)
1306        /* Handle a lone single character - we can get here for a normal        /* Handle a lone single character - we can get here for a normal
1307        non-escape char, or after \ that introduces a single character. */        non-escape char, or after \ that introduces a single character. */
1308    
1309          SINGLE_CHARACTER:
1310    
1311        class [c/8] |= (1 << (c&7));        class [c/8] |= (1 << (c&7));
1312        if ((options & PCRE_CASELESS) != 0)        if ((options & PCRE_CASELESS) != 0)
1313          {          {
1314          c = pcre_fcc[c];   /* flip case */          c = cd->fcc[c];   /* flip case */
1315          class[c/8] |= (1 << (c&7));          class[c/8] |= (1 << (c&7));
1316          }          }
1317        class_charcount++;        class_charcount++;
# Line 881  for (;; ptr++) Line 1358  for (;; ptr++)
1358      /* Various kinds of repeat */      /* Various kinds of repeat */
1359    
1360      case '{':      case '{':
1361      if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;      if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
1362      ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);      ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
1363      if (*errorptr != NULL) goto FAILED;      if (*errorptr != NULL) goto FAILED;
1364      goto REPEAT;      goto REPEAT;
1365    
# Line 907  for (;; ptr++) Line 1384  for (;; ptr++)
1384        goto FAILED;        goto FAILED;
1385        }        }
1386    
1387      /* If the next character is '?' this is a minimizing repeat. Advance to the      /* If the next character is '?' this is a minimizing repeat, by default,
1388        but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1389      next character. */      next character. */
1390    
1391      if (ptr[1] == '?') { repeat_type = 1; ptr++; } else repeat_type = 0;      if (ptr[1] == '?')
1392          { repeat_type = greedy_non_default; ptr++; }
1393      /* If the maximum is zero then the minimum must also be zero; Perl allows      else repeat_type = greedy_default;
     this case, so we do too - by simply omitting the item altogether. */  
   
     if (repeat_max == 0) code = previous;  
1394    
1395      /* If previous was a string of characters, chop off the last one and use it      /* If previous was a string of characters, chop off the last one and use it
1396      as the subject of the repeat. If there was only one character, we can      as the subject of the repeat. If there was only one character, we can
1397      abolish the previous item altogether. */      abolish the previous item altogether. A repeat with a zero minimum wipes
1398        out any reqchar setting, backing up to the previous value. We must also
1399        adjust the countlits value. */
1400    
1401      else if (*previous == OP_CHARS)      if (*previous == OP_CHARS)
1402        {        {
1403        int len = previous[1];        int len = previous[1];
1404    
1405          if (repeat_min == 0) *reqchar = prevreqchar;
1406          *countlits += repeat_min - 1;
1407    
1408        if (len == 1)        if (len == 1)
1409          {          {
1410          c = previous[2];          c = previous[2];
# Line 955  for (;; ptr++) Line 1436  for (;; ptr++)
1436      create a suitable repeat item. The code is shared with single-character      create a suitable repeat item. The code is shared with single-character
1437      repeats by adding a suitable offset into repeat_type. */      repeats by adding a suitable offset into repeat_type. */
1438    
1439      else if ((int)*previous < OP_EOD || *previous == OP_ANY)      else if ((int)*previous < OP_EODN || *previous == OP_ANY)
1440        {        {
1441        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
1442        c = *previous;        c = *previous;
1443        code = previous;        code = previous;
1444    
1445        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
1446        repeat_type += op_type;      /* Combine both values for many cases */  
1447          /* If the maximum is zero then the minimum must also be zero; Perl allows
1448          this case, so we do too - by simply omitting the item altogether. */
1449    
1450          if (repeat_max == 0) goto END_REPEAT;
1451    
1452          /* Combine the op_type with the repeat_type */
1453    
1454          repeat_type += op_type;
1455    
1456        /* A minimum of zero is handled either as the special case * or ?, or as        /* A minimum of zero is handled either as the special case * or ?, or as
1457        an UPTO, with the maximum given. */        an UPTO, with the maximum given. */
# Line 999  for (;; ptr++) Line 1488  for (;; ptr++)
1488          /* If the minimum is 1 and the previous item was a character string,          /* If the minimum is 1 and the previous item was a character string,
1489          we either have to put back the item that got cancelled if the string          we either have to put back the item that got cancelled if the string
1490          length was 1, or add the character back onto the end of a longer          length was 1, or add the character back onto the end of a longer
1491          string. For a character type nothing need be done; it will just get put          string. For a character type nothing need be done; it will just get
1492          back naturally. */          put back naturally. Note that the final character is always going to
1493            get added below. */
1494    
1495          else if (*previous == OP_CHARS)          else if (*previous == OP_CHARS)
1496            {            {
1497            if (code == previous) code += 2; else previous[1]++;            if (code == previous) code += 2; else previous[1]++;
1498            }            }
1499    
1500            /*  For a single negated character we also have to put back the
1501            item that got cancelled. */
1502    
1503            else if (*previous == OP_NOT) code++;
1504    
1505          /* If the maximum is unlimited, insert an OP_STAR. */          /* If the maximum is unlimited, insert an OP_STAR. */
1506    
1507          if (repeat_max < 0)          if (repeat_max < 0)
# Line 1033  for (;; ptr++) Line 1528  for (;; ptr++)
1528        }        }
1529    
1530      /* If previous was a character class or a back reference, we put the repeat      /* If previous was a character class or a back reference, we put the repeat
1531      stuff after it. */      stuff after it, but just skip the item if the repeat was {0,0}. */
1532    
1533      else if (*previous == OP_CLASS || *previous == OP_NEGCLASS ||      else if (*previous == OP_CLASS || *previous == OP_REF)
              *previous == OP_REF)  
1534        {        {
1535          if (repeat_max == 0)
1536            {
1537            code = previous;
1538            goto END_REPEAT;
1539            }
1540        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
1541          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
1542        else if (repeat_min == 1 && repeat_max == -1)        else if (repeat_min == 1 && repeat_max == -1)
# Line 1056  for (;; ptr++) Line 1555  for (;; ptr++)
1555        }        }
1556    
1557      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
1558      cases. If the maximum repeat count is unlimited, check that the bracket      cases. */
     group cannot match the empty string, and diagnose an error if it can. */  
1559    
1560      else if ((int)*previous >= OP_BRA)      else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1561                 (int)*previous == OP_COND)
1562        {        {
1563        int i;        register int i;
1564          int ketoffset = 0;
1565        int len = code - previous;        int len = code - previous;
1566          uschar *bralink = NULL;
1567    
1568        if (repeat_max == -1 && could_be_empty(previous))        /* If the maximum repeat count is unlimited, find the end of the bracket
1569          by scanning through from the start, and compute the offset back to it
1570          from the current code pointer. There may be an OP_OPT setting following
1571          the final KET, so we can't find the end just by going back from the code
1572          pointer. */
1573    
1574          if (repeat_max == -1)
1575            {
1576            register uschar *ket = previous;
1577            do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1578            ketoffset = code - ket;
1579            }
1580    
1581          /* The case of a zero minimum is special because of the need to stick
1582          OP_BRAZERO in front of it, and because the group appears once in the
1583          data, whereas in other cases it appears the minimum number of times. For
1584          this reason, it is simplest to treat this case separately, as otherwise
1585          the code gets far too messy. There are several special subcases when the
1586          minimum is zero. */
1587    
1588          if (repeat_min == 0)
1589          {          {
1590          *errorptr = ERR10;          /* If we set up a required char from the bracket, we must back off
1591          goto FAILED;          to the previous value and reset the countlits value too. */
         }  
1592    
1593        /* If the minimum is greater than zero, and the maximum is unlimited or          if (subcountlits > 0)
1594        equal to the minimum, the first copy remains where it is, and is            {
1595        replicated up to the minimum number of times. This case includes the +            *reqchar = prevreqchar;
1596        repeat, but of course no replication is needed in that case. */            *countlits -= subcountlits;
1597              }
1598    
1599        if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))          /* If the maximum is also zero, we just omit the group from the output
1600          {          altogether. */
1601          for (i = 1; i < repeat_min; i++)  
1602            if (repeat_max == 0)
1603            {            {
1604            memcpy(code, previous, len);            code = previous;
1605            code += len;            goto END_REPEAT;
1606            }            }
         }  
1607    
1608        /* If the minimum is zero, stick BRAZERO in front of the first copy.          /* If the maximum is 1 or unlimited, we just have to stick in the
1609        Then, if there is a fixed upper limit, replicated up to that many times,          BRAZERO and do no more at this point. */
       sticking BRAZERO in front of all the optional ones. */  
1610    
1611        else          if (repeat_max <= 1)
         {  
         if (repeat_min == 0)  
1612            {            {
1613            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
1614            code++;            code++;
1615            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
1616            }            }
1617    
1618            /* If the maximum is greater than 1 and limited, we have to replicate
1619            in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1620            The first one has to be handled carefully because it's the original
1621            copy, which has to be moved up. The remainder can be handled by code
1622            that is common with the non-zero minimum case below. We just have to
1623            adjust the value of repeat_max, since one less copy is required. */
1624    
1625            else
1626              {
1627              int offset;
1628              memmove(previous+4, previous, len);
1629              code += 4;
1630              *previous++ = OP_BRAZERO + repeat_type;
1631              *previous++ = OP_BRA;
1632    
1633              /* We chain together the bracket offset fields that have to be
1634              filled in later when the ends of the brackets are reached. */
1635    
1636              offset = (bralink == NULL)? 0 : previous - bralink;
1637              bralink = previous;
1638              *previous++ = offset >> 8;
1639              *previous++ = offset & 255;
1640              }
1641    
1642            repeat_max--;
1643            }
1644    
1645          /* If the minimum is greater than zero, replicate the group as many
1646          times as necessary, and adjust the maximum to the number of subsequent
1647          copies that we need. */
1648    
1649          else
1650            {
1651          for (i = 1; i < repeat_min; i++)          for (i = 1; i < repeat_min; i++)
1652            {            {
1653            memcpy(code, previous, len);            memcpy(code, previous, len);
1654            code += len;            code += len;
1655            }            }
1656            if (repeat_max > 0) repeat_max -= repeat_min;
1657            }
1658    
1659          /* This code is common to both the zero and non-zero minimum cases. If
1660          the maximum is limited, it replicates the group in a nested fashion,
1661          remembering the bracket starts on a stack. In the case of a zero minimum,
1662          the first one was set up above. In all cases the repeat_max now specifies
1663          the number of additional copies needed. */
1664    
1665          for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)        if (repeat_max >= 0)
1666            {
1667            for (i = repeat_max - 1; i >= 0; i--)
1668            {            {
1669            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
1670    
1671              /* All but the final copy start a new nesting, maintaining the
1672              chain of brackets outstanding. */
1673    
1674              if (i != 0)
1675                {
1676                int offset;
1677                *code++ = OP_BRA;
1678                offset = (bralink == NULL)? 0 : code - bralink;
1679                bralink = code;
1680                *code++ = offset >> 8;
1681                *code++ = offset & 255;
1682                }
1683    
1684            memcpy(code, previous, len);            memcpy(code, previous, len);
1685            code += len;            code += len;
1686            }            }
1687    
1688            /* Now chain through the pending brackets, and fill in their length
1689            fields (which are holding the chain links pro tem). */
1690    
1691            while (bralink != NULL)
1692              {
1693              int oldlinkoffset;
1694              int offset = code - bralink + 1;
1695              uschar *bra = code - offset;
1696              oldlinkoffset = (bra[1] << 8) + bra[2];
1697              bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1698              *code++ = OP_KET;
1699              *code++ = bra[1] = offset >> 8;
1700              *code++ = bra[2] = (offset & 255);
1701              }
1702          }          }
1703    
1704        /* If the maximum is unlimited, set a repeater in the final copy. */        /* If the maximum is unlimited, set a repeater in the final copy. We
1705          can't just offset backwards from the current code point, because we
1706          don't know if there's been an options resetting after the ket. The
1707          correct offset was computed above. */
1708    
1709        if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;        else code[-ketoffset] = OP_KETRMAX + repeat_type;
1710        }        }
1711    
1712      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 1126  for (;; ptr++) Line 1719  for (;; ptr++)
1719    
1720      /* In all cases we no longer have a previous item. */      /* In all cases we no longer have a previous item. */
1721    
1722        END_REPEAT:
1723      previous = NULL;      previous = NULL;
1724      break;      break;
1725    
1726    
1727      /* Start of nested bracket sub-expression, or comment or lookahead.      /* Start of nested bracket sub-expression, or comment or lookahead or
1728      First deal with special things that can come after a bracket; all are      lookbehind or option setting or condition. First deal with special things
1729      introduced by ?, and the appearance of any of them means that this is not a      that can come after a bracket; all are introduced by ?, and the appearance
1730      referencing group. They were checked for validity in the first pass over      of any of them means that this is not a referencing group. They were
1731      the string, so we don't have to check for syntax errors here.  */      checked for validity in the first pass over the string, so we don't have to
1732        check for syntax errors here.  */
1733    
1734      case '(':      case '(':
1735      previous = code;              /* Only real brackets can be repeated */      newoptions = options;
1736        skipbytes = 0;
1737    
1738      if (*(++ptr) == '?')      if (*(++ptr) == '?')
1739        {        {
1740        bravalue = OP_BRA;        int set, unset;
1741          int *optset;
1742    
1743        switch (*(++ptr))        switch (*(++ptr))
1744          {          {
1745          case '#':          case '#':                 /* Comment; skip to ket */
         case 'i':  
         case 'm':  
         case 's':  
         case 'x':  
1746          ptr++;          ptr++;
1747          while (*ptr != ')') ptr++;          while (*ptr != ')') ptr++;
         previous = NULL;  
1748          continue;          continue;
1749    
1750          case ':':                 /* Non-extracting bracket */          case ':':                 /* Non-extracting bracket */
1751            bravalue = OP_BRA;
1752          ptr++;          ptr++;
1753          break;          break;
1754    
1755          case '=':                 /* Assertions can't be repeated */          case '(':
1756            bravalue = OP_COND;       /* Conditional group */
1757            if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1758              {
1759              int condref = *ptr - '0';
1760              while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1761              if (condref == 0)
1762                {
1763                *errorptr = ERR35;
1764                goto FAILED;
1765                }
1766              ptr++;
1767              code[3] = OP_CREF;
1768              code[4] = condref >> 8;
1769              code[5] = condref & 255;
1770              skipbytes = 3;
1771              }
1772            else ptr--;
1773            break;
1774    
1775            case '=':                 /* Positive lookahead */
1776          bravalue = OP_ASSERT;          bravalue = OP_ASSERT;
1777          ptr++;          ptr++;
         previous = NULL;  
1778          break;          break;
1779    
1780          case '!':          case '!':                 /* Negative lookahead */
1781          bravalue = OP_ASSERT_NOT;          bravalue = OP_ASSERT_NOT;
1782          ptr++;          ptr++;
         previous = NULL;  
1783          break;          break;
1784    
1785          case '>':                         /* "Match once" brackets */          case '<':                 /* Lookbehinds */
1786          if ((options & PCRE_EXTRA) != 0)  /* Not yet standard */          switch (*(++ptr))
1787            {            {
1788            bravalue = OP_ONCE;            case '=':               /* Positive lookbehind */
1789              bravalue = OP_ASSERTBACK;
1790              ptr++;
1791              break;
1792    
1793              case '!':               /* Negative lookbehind */
1794              bravalue = OP_ASSERTBACK_NOT;
1795            ptr++;            ptr++;
           previous = NULL;  
1796            break;            break;
1797    
1798              default:                /* Syntax error */
1799              *errorptr = ERR24;
1800              goto FAILED;
1801            }            }
1802          /* Else fall through */          break;
1803    
1804          default:          case '>':                 /* One-time brackets */
1805          *errorptr = ERR12;          bravalue = OP_ONCE;
1806          goto FAILED;          ptr++;
1807            break;
1808    
1809            case 'R':                 /* Pattern recursion */
1810            *code++ = OP_RECURSE;
1811            ptr++;
1812            continue;
1813    
1814            default:                  /* Option setting */
1815            set = unset = 0;
1816            optset = &set;
1817    
1818            while (*ptr != ')' && *ptr != ':')
1819              {
1820              switch (*ptr++)
1821                {
1822                case '-': optset = &unset; break;
1823    
1824                case 'i': *optset |= PCRE_CASELESS; break;
1825                case 'm': *optset |= PCRE_MULTILINE; break;
1826                case 's': *optset |= PCRE_DOTALL; break;
1827                case 'x': *optset |= PCRE_EXTENDED; break;
1828                case 'U': *optset |= PCRE_UNGREEDY; break;
1829                case 'X': *optset |= PCRE_EXTRA; break;
1830    
1831                default:
1832                *errorptr = ERR12;
1833                goto FAILED;
1834                }
1835              }
1836    
1837            /* Set up the changed option bits, but don't change anything yet. */
1838    
1839            newoptions = (options | set) & (~unset);
1840    
1841            /* If the options ended with ')' this is not the start of a nested
1842            group with option changes, so the options change at this level. At top
1843            level there is nothing else to be done (the options will in fact have
1844            been set from the start of compiling as a result of the first pass) but
1845            at an inner level we must compile code to change the ims options if
1846            necessary, and pass the new setting back so that it can be put at the
1847            start of any following branches, and when this group ends, a resetting
1848            item can be compiled. */
1849    
1850            if (*ptr == ')')
1851              {
1852              if ((options & PCRE_INGROUP) != 0 &&
1853                  (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1854                {
1855                *code++ = OP_OPT;
1856                *code++ = *optchanged = newoptions & PCRE_IMS;
1857                }
1858              options = newoptions;  /* Change options at this level */
1859              previous = NULL;       /* This item can't be repeated */
1860              continue;              /* It is complete */
1861              }
1862    
1863            /* If the options ended with ':' we are heading into a nested group
1864            with possible change of options. Such groups are non-capturing and are
1865            not assertions of any kind. All we need to do is skip over the ':';
1866            the newoptions value is handled below. */
1867    
1868            bravalue = OP_BRA;
1869            ptr++;
1870          }          }
1871        }        }
1872    
1873      /* Else we have a referencing group */      /* Else we have a referencing group; adjust the opcode. If the bracket
1874        number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
1875        arrange for the true number to follow later, in an OP_BRANUMBER item. */
1876    
1877      else      else
1878        {        {
1879        if (++(*brackets) > EXTRACT_MAX)        if (++(*brackets) > EXTRACT_BASIC_MAX)
1880          {          {
1881          *errorptr = ERR13;          bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
1882          goto FAILED;          code[3] = OP_BRANUMBER;
1883            code[4] = *brackets >> 8;
1884            code[5] = *brackets & 255;
1885            skipbytes = 3;
1886          }          }
1887        bravalue = OP_BRA + *brackets;        else bravalue = OP_BRA + *brackets;
1888        }        }
1889    
1890      /* Process nested bracketed re; at end pointer is on the bracket. We copy      /* Process nested bracketed re. Assertions may not be repeated, but other
1891      code into a non-register variable in order to be able to pass its address      kinds can be. We copy code into a non-register variable in order to be able
1892      because some compilers complain otherwise. */      to pass its address because some compilers complain otherwise. Pass in a
1893        new setting for the ims options if they have changed. */
1894    
1895        previous = (bravalue >= OP_ONCE)? code : NULL;
1896      *code = bravalue;      *code = bravalue;
1897        tempcode = code;
1898    
1899        if (!compile_regex(
1900             options | PCRE_INGROUP,       /* Set for all nested groups */
1901             ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1902               newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1903             brackets,                     /* Extracting bracket count */
1904             &tempcode,                    /* Where to put code (updated) */
1905             &ptr,                         /* Input pointer (updated) */
1906             errorptr,                     /* Where to put an error message */
1907             (bravalue == OP_ASSERTBACK ||
1908              bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1909             skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */
1910             &subreqchar,                  /* For possible last char */
1911             &subcountlits,                /* For literal count */
1912             cd))                          /* Tables block */
1913          goto FAILED;
1914    
1915        /* At the end of compiling, code is still pointing to the start of the
1916        group, while tempcode has been updated to point past the end of the group
1917        and any option resetting that may follow it. The pattern pointer (ptr)
1918        is on the bracket. */
1919    
1920        /* If this is a conditional bracket, check that there are no more than
1921        two branches in the group. */
1922    
1923        else if (bravalue == OP_COND)
1924        {        {
1925        uschar *mcode = code;        uschar *tc = code;
1926        if (!compile_regex(options, brackets, &mcode, &ptr, errorptr))        condcount = 0;
1927    
1928          do {
1929             condcount++;
1930             tc += (tc[1] << 8) | tc[2];
1931             }
1932          while (*tc != OP_KET);
1933    
1934          if (condcount > 2)
1935            {
1936            *errorptr = ERR27;
1937          goto FAILED;          goto FAILED;
1938        code = mcode;          }
1939          }
1940    
1941        /* Handle updating of the required character. If the subpattern didn't
1942        set one, leave it as it was. Otherwise, update it for normal brackets of
1943        all kinds, forward assertions, and conditions with two branches. Don't
1944        update the literal count for forward assertions, however. If the bracket
1945        is followed by a quantifier with zero repeat, we have to back off. Hence
1946        the definition of prevreqchar and subcountlits outside the main loop so
1947        that they can be accessed for the back off. */
1948    
1949        if (subreqchar > 0 &&
1950             (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1951             (bravalue == OP_COND && condcount == 2)))
1952          {
1953          prevreqchar = *reqchar;
1954          *reqchar = subreqchar;
1955          if (bravalue != OP_ASSERT) *countlits += subcountlits;
1956        }        }
1957    
1958        /* Now update the main code pointer to the end of the group. */
1959    
1960        code = tempcode;
1961    
1962        /* Error if hit end of pattern */
1963    
1964      if (*ptr != ')')      if (*ptr != ')')
1965        {        {
1966        *errorptr = ERR14;        *errorptr = ERR14;
# Line 1222  for (;; ptr++) Line 1973  for (;; ptr++)
1973      for validity in the pre-compiling pass. */      for validity in the pre-compiling pass. */
1974    
1975      case '\\':      case '\\':
1976      oldptr = ptr;      tempptr = ptr;
1977      c = check_escape(&ptr, errorptr, *brackets, options, FALSE);      c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1978    
1979      /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values      /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1980      are arranged to be the negation of the corresponding OP_values. For the      are arranged to be the negation of the corresponding OP_values. For the
# Line 1236  for (;; ptr++) Line 1987  for (;; ptr++)
1987        {        {
1988        if (-c >= ESC_REF)        if (-c >= ESC_REF)
1989          {          {
1990          int refnum = -c - ESC_REF;          int number = -c - ESC_REF;
         if (*brackets < refnum)  
           {  
           *errorptr = ERR15;  
           goto FAILED;  
           }  
1991          previous = code;          previous = code;
1992          *code++ = OP_REF;          *code++ = OP_REF;
1993          *code++ = refnum;          *code++ = number >> 8;
1994            *code++ = number & 255;
1995          }          }
1996        else        else
1997          {          {
1998          previous = (-c > ESC_b && -c < ESC_X)? code : NULL;          previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1999          *code++ = -c;          *code++ = -c;
2000          }          }
2001        continue;        continue;
# Line 1256  for (;; ptr++) Line 2003  for (;; ptr++)
2003    
2004      /* Data character: reset and fall through */      /* Data character: reset and fall through */
2005    
2006      ptr = oldptr;      ptr = tempptr;
2007      c = '\\';      c = '\\';
2008    
2009      /* Handle a run of data characters until a metacharacter is encountered.      /* Handle a run of data characters until a metacharacter is encountered.
# Line 1274  for (;; ptr++) Line 2021  for (;; ptr++)
2021        {        {
2022        if ((options & PCRE_EXTENDED) != 0)        if ((options & PCRE_EXTENDED) != 0)
2023          {          {
2024          if ((pcre_ctypes[c] & ctype_space) != 0) continue;          if ((cd->ctypes[c] & ctype_space) != 0) continue;
2025          if (c == '#')          if (c == '#')
2026            {            {
2027            while ((c = *(++ptr)) != 0 && c != '\n');            /* The space before the ; is to avoid a warning on a silly compiler
2028              on the Macintosh. */
2029              while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2030            if (c == 0) break;            if (c == 0) break;
2031            continue;            continue;
2032            }            }
# Line 1289  for (;; ptr++) Line 2038  for (;; ptr++)
2038    
2039        if (c == '\\')        if (c == '\\')
2040          {          {
2041          oldptr = ptr;          tempptr = ptr;
2042          c = check_escape(&ptr, errorptr, *brackets, options, FALSE);          c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
2043          if (c < 0) { ptr = oldptr; break; }          if (c < 0) { ptr = tempptr; break; }
2044    
2045            /* If a character is > 127 in UTF-8 mode, we have to turn it into
2046            two or more characters in the UTF-8 encoding. */
2047    
2048    #ifdef SUPPORT_UTF8
2049            if (c > 127 && (options & PCRE_UTF8) != 0)
2050              {
2051              uschar buffer[8];
2052              int len = ord2utf8(c, buffer);
2053              for (c = 0; c < len; c++) *code++ = buffer[c];
2054              length += len;
2055              continue;
2056              }
2057    #endif
2058          }          }
2059    
2060        /* Ordinary character or single-char escape */        /* Ordinary character or single-char escape */
# Line 1302  for (;; ptr++) Line 2065  for (;; ptr++)
2065    
2066      /* This "while" is the end of the "do" above. */      /* This "while" is the end of the "do" above. */
2067    
2068      while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);      while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
2069    
2070        /* Update the last character and the count of literals */
2071    
2072        prevreqchar = (length > 1)? code[-2] : *reqchar;
2073        *reqchar = code[-1];
2074        *countlits += length;
2075    
2076      /* Compute the length and set it in the data vector, and advance to      /* Compute the length and set it in the data vector, and advance to
2077      the next state. */      the next state. */
2078    
2079      previous[1] = length;      previous[1] = length;
2080      if (length < 255) ptr--;      if (length < MAXLIT) ptr--;
2081      break;      break;
2082      }      }
2083    }                   /* end of big loop */    }                   /* end of big loop */
# Line 1332  return FALSE; Line 2101  return FALSE;
2101  /* On entry, ptr is pointing past the bracket character, but on return  /* On entry, ptr is pointing past the bracket character, but on return
2102  it points to the closing bracket, or vertical bar, or end of string.  it points to the closing bracket, or vertical bar, or end of string.
2103  The code variable is pointing at the byte into which the BRA operator has been  The code variable is pointing at the byte into which the BRA operator has been
2104  stored.  stored. If the ims options are changed at the start (for a (?ims: group) or
2105    during any branch, we need to insert an OP_OPT item at the start of every
2106    following branch to ensure they get set correctly at run time, and also pass
2107    the new options into every subsequent branch compile.
2108    
2109  Argument:  Argument:
2110    options   the option bits    options     the option bits
2111    brackets  -> int containing the number of extracting brackets used    optchanged  new ims options to set as if (?ims) were at the start, or -1
2112    codeptr   -> the address of the current code pointer                 for no change
2113    ptrptr    -> the address of the current pattern pointer    brackets    -> int containing the number of extracting brackets used
2114    errorptr  -> pointer to error message    codeptr     -> the address of the current code pointer
2115      ptrptr      -> the address of the current pattern pointer
2116      errorptr    -> pointer to error message
2117      lookbehind  TRUE if this is a lookbehind assertion
2118      skipbytes   skip this many bytes at start (for OP_COND, OP_BRANUMBER)
2119      reqchar     -> place to put the last required character, or a negative number
2120      countlits   -> place to put the shortest literal count of any branch
2121      cd          points to the data block with tables pointers
2122    
2123  Returns:    TRUE on success  Returns:      TRUE on success
2124  */  */
2125    
2126  static BOOL  static BOOL
2127  compile_regex(int options, int *brackets, uschar **codeptr,  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
2128    const uschar **ptrptr, const char **errorptr)    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
2129      int *reqchar, int *countlits, compile_data *cd)
2130  {  {
2131  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
2132  uschar *code = *codeptr;  uschar *code = *codeptr;
2133    uschar *last_branch = code;
2134  uschar *start_bracket = code;  uschar *start_bracket = code;
2135    uschar *reverse_count = NULL;
2136    int oldoptions = options & PCRE_IMS;
2137    int branchreqchar, branchcountlits;
2138    
2139    *reqchar = -1;
2140    *countlits = INT_MAX;
2141    code += 3 + skipbytes;
2142    
2143    /* Loop for each alternative branch */
2144    
2145  for (;;)  for (;;)
2146    {    {
2147    int length;    int length;
   uschar *last_branch = code;  
2148    
2149    code += 3;    /* Handle change of options */
2150    if (!compile_branch(options, brackets, &code, &ptr, errorptr))  
2151      if (optchanged >= 0)
2152        {
2153        *code++ = OP_OPT;
2154        *code++ = optchanged;
2155        options = (options & ~PCRE_IMS) | optchanged;
2156        }
2157    
2158      /* Set up dummy OP_REVERSE if lookbehind assertion */
2159    
2160      if (lookbehind)
2161        {
2162        *code++ = OP_REVERSE;
2163        reverse_count = code;
2164        *code++ = 0;
2165        *code++ = 0;
2166        }
2167    
2168      /* Now compile the branch */
2169    
2170      if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
2171          &branchreqchar, &branchcountlits, cd))
2172      {      {
2173      *ptrptr = ptr;      *ptrptr = ptr;
2174      return FALSE;      return FALSE;
2175      }      }
2176    
2177    /* Fill in the length of the last branch */    /* Fill in the length of the last branch */
2178    
2179    length = code - last_branch;    length = code - last_branch;
2180    last_branch[1] = length >> 8;    last_branch[1] = length >> 8;
2181    last_branch[2] = length & 255;    last_branch[2] = length & 255;
2182    
2183      /* Save the last required character if all branches have the same; a current
2184      value of -1 means unset, while -2 means "a previous branch had no last
2185      required char, or two branches had different last required chars".  */
2186    
2187      if (*reqchar != -2)
2188        {
2189        if (branchreqchar >= 0)
2190          {
2191          if (*reqchar == -1) *reqchar = branchreqchar;
2192          else if (*reqchar != branchreqchar) *reqchar = -2;
2193          }
2194        else *reqchar = -2;
2195        }
2196    
2197      /* Keep the shortest literal count */
2198    
2199      if (branchcountlits < *countlits) *countlits = branchcountlits;
2200      DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
2201    
2202      /* If lookbehind, check that this branch matches a fixed-length string,
2203      and put the length into the OP_REVERSE item. Temporarily mark the end of
2204      the branch with OP_END. */
2205    
2206      if (lookbehind)
2207        {
2208        *code = OP_END;
2209        length = find_fixedlength(last_branch, options);
2210        DPRINTF(("fixed length = %d\n", length));
2211        if (length < 0)
2212          {
2213          *errorptr = ERR25;
2214          *ptrptr = ptr;
2215          return FALSE;
2216          }
2217        reverse_count[0] = (length >> 8);
2218        reverse_count[1] = length & 255;
2219        }
2220    
2221    /* Reached end of expression, either ')' or end of pattern. Insert a    /* Reached end of expression, either ')' or end of pattern. Insert a
2222    terminating ket and the length of the whole bracketed item, and return,    terminating ket and the length of the whole bracketed item, and return,
2223    leaving the pointer at the terminating char. */    leaving the pointer at the terminating char. If any of the ims options
2224      were changed inside the group, compile a resetting op-code following. */
2225    
2226    if (*ptr != '|')    if (*ptr != '|')
2227      {      {
# Line 1380  for (;;) Line 2229  for (;;)
2229      *code++ = OP_KET;      *code++ = OP_KET;
2230      *code++ = length >> 8;      *code++ = length >> 8;
2231      *code++ = length & 255;      *code++ = length & 255;
2232        if (optchanged >= 0)
2233          {
2234          *code++ = OP_OPT;
2235          *code++ = oldoptions;
2236          }
2237      *codeptr = code;      *codeptr = code;
2238      *ptrptr = ptr;      *ptrptr = ptr;
2239      return TRUE;      return TRUE;
# Line 1388  for (;;) Line 2242  for (;;)
2242    /* Another branch follows; insert an "or" node and advance the pointer. */    /* Another branch follows; insert an "or" node and advance the pointer. */
2243    
2244    *code = OP_ALT;    *code = OP_ALT;
2245      last_branch = code;
2246      code += 3;
2247    ptr++;    ptr++;
2248    }    }
2249  /* Control never reaches here */  /* Control never reaches here */
# Line 1395  for (;;) Line 2251  for (;;)
2251    
2252    
2253    
2254    
2255    /*************************************************
2256    *      Find first significant op code            *
2257    *************************************************/
2258    
2259    /* This is called by several functions that scan a compiled expression looking
2260    for a fixed first character, or an anchoring op code etc. It skips over things
2261    that do not influence this. For one application, a change of caseless option is
2262    important.
2263    
2264    Arguments:
2265      code       pointer to the start of the group
2266      options    pointer to external options
2267      optbit     the option bit whose changing is significant, or
2268                 zero if none are
2269      optstop    TRUE to return on option change, otherwise change the options
2270                   value and continue
2271    
2272    Returns:     pointer to the first significant opcode
2273    */
2274    
2275    static const uschar*
2276    first_significant_code(const uschar *code, int *options, int optbit,
2277      BOOL optstop)
2278    {
2279    for (;;)
2280      {
2281      switch ((int)*code)
2282        {
2283        case OP_OPT:
2284        if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
2285          {
2286          if (optstop) return code;
2287          *options = (int)code[1];
2288          }
2289        code += 2;
2290        break;
2291    
2292        case OP_CREF:
2293        case OP_BRANUMBER:
2294        code += 3;
2295        break;
2296    
2297        case OP_WORD_BOUNDARY:
2298        case OP_NOT_WORD_BOUNDARY:
2299        code++;
2300        break;
2301    
2302        case OP_ASSERT_NOT:
2303        case OP_ASSERTBACK:
2304        case OP_ASSERTBACK_NOT:
2305        do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
2306        code += 3;
2307        break;
2308    
2309        default:
2310        return code;
2311        }
2312      }
2313    /* Control never reaches here */
2314    }
2315    
2316    
2317    
2318    
2319  /*************************************************  /*************************************************
2320  *          Check for anchored expression         *  *          Check for anchored expression         *
2321  *************************************************/  *************************************************/
# Line 1405  all of whose alternatives start with OP_ Line 2326  all of whose alternatives start with OP_
2326  it's anchored. However, if this is a multiline pattern, then only OP_SOD  it's anchored. However, if this is a multiline pattern, then only OP_SOD
2327  counts, since OP_CIRC can match in the middle.  counts, since OP_CIRC can match in the middle.
2328    
2329  A branch is also implicitly anchored if it starts with .* because that will try  A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2330  the rest of the pattern at all possible matching points, so there is no point  because that will try the rest of the pattern at all possible matching points,
2331  trying them again.  so there is no point trying them again.
2332    
2333  Argument:  points to start of expression (the bracket)  Arguments:
2334  Returns:   TRUE or FALSE    code       points to start of expression (the bracket)
2335      options    points to the options setting
2336    
2337    Returns:     TRUE or FALSE
2338  */  */
2339    
2340  static BOOL  static BOOL
2341  is_anchored(register const uschar *code, BOOL multiline)  is_anchored(register const uschar *code, int *options)
2342  {  {
2343  do {  do {
2344     int op = (int)code[3];     const uschar *scode = first_significant_code(code + 3, options,
2345     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)       PCRE_MULTILINE, FALSE);
2346       { if (!is_anchored(code+3, multiline)) return FALSE; }     register int op = *scode;
2347     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2348       { if (code[4] != OP_ANY) return FALSE; }       { if (!is_anchored(scode, options)) return FALSE; }
2349     else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
2350                (*options & PCRE_DOTALL) != 0)
2351         { if (scode[1] != OP_ANY) return FALSE; }
2352       else if (op != OP_SOD &&
2353               ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
2354         return FALSE;
2355     code += (code[1] << 8) + code[2];     code += (code[1] << 8) + code[2];
2356     }     }
2357  while (*code == OP_ALT);  while (*code == OP_ALT);
# Line 1432  return TRUE; Line 2361  return TRUE;
2361    
2362    
2363  /*************************************************  /*************************************************
2364  *     Check for start with \n line expression    *  *         Check for starting with ^ or .*        *
2365  *************************************************/  *************************************************/
2366    
2367  /* This is called for multiline expressions to try to find out if every branch  /* This is called to find out if every branch starts with ^ or .* so that
2368  starts with ^ so that "first char" processing can be done to speed things up.  "first char" processing can be done to speed things up in multiline
2369    matching and for non-DOTALL patterns that start with .* (which must start at
2370    the beginning or after \n).
2371    
2372  Argument:  points to start of expression (the bracket)  Argument:  points to start of expression (the bracket)
2373  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
# Line 1446  static BOOL Line 2377  static BOOL
2377  is_startline(const uschar *code)  is_startline(const uschar *code)
2378  {  {
2379  do {  do {
2380     if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)     const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
2381       { if (!is_startline(code+3)) return FALSE; }     register int op = *scode;
2382     else if (code[3] != OP_CIRC) return FALSE;     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2383         { if (!is_startline(scode)) return FALSE; }
2384       else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2385         { if (scode[1] != OP_ANY) return FALSE; }
2386       else if (op != OP_CIRC) return FALSE;
2387     code += (code[1] << 8) + code[2];     code += (code[1] << 8) + code[2];
2388     }     }
2389  while (*code == OP_ALT);  while (*code == OP_ALT);
# Line 1467  Consider each alternative branch. If the Line 2402  Consider each alternative branch. If the
2402  a bracket all of whose alternatives start with the same char (recurse ad lib),  a bracket all of whose alternatives start with the same char (recurse ad lib),
2403  then we return that char, otherwise -1.  then we return that char, otherwise -1.
2404    
2405  Argument:  points to start of expression (the bracket)  Arguments:
2406  Returns:   -1 or the fixed first char    code       points to start of expression (the bracket)
2407      options    pointer to the options (used to check casing changes)
2408    
2409    Returns:     -1 or the fixed first char
2410  */  */
2411    
2412  static int  static int
2413  find_firstchar(uschar *code)  find_firstchar(const uschar *code, int *options)
2414  {  {
2415  register int c = -1;  register int c = -1;
2416  do  do {
2417    {     int d;
2418    register int charoffset = 4;     const uschar *scode = first_significant_code(code + 3, options,
2419         PCRE_CASELESS, TRUE);
2420    if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)     register int op = *scode;
2421      {  
2422      register int d;     if (op >= OP_BRA) op = OP_BRA;
2423      if ((d = find_firstchar(code+3)) < 0) return -1;  
2424      if (c < 0) c = d; else if (c != d) return -1;     switch(op)
2425      }       {
2426         default:
2427    else switch(code[3])       return -1;
2428      {  
2429      default:       case OP_BRA:
2430      return -1;       case OP_ASSERT:
2431         case OP_ONCE:
2432      case OP_EXACT:       /* Fall through */       case OP_COND:
2433      charoffset++;       if ((d = find_firstchar(scode, options)) < 0) return -1;
2434         if (c < 0) c = d; else if (c != d) return -1;
2435      case OP_CHARS:       /* Fall through */       break;
2436      charoffset++;  
2437         case OP_EXACT:       /* Fall through */
2438         scode++;
2439    
2440         case OP_CHARS:       /* Fall through */
2441         scode++;
2442    
2443         case OP_PLUS:
2444         case OP_MINPLUS:
2445         if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
2446         break;
2447         }
2448    
2449      case OP_PLUS:     code += (code[1] << 8) + code[2];
2450      case OP_MINPLUS:     }
     if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;  
     break;  
     }  
   code += (code[1] << 8) + code[2];  
   }  
2451  while (*code == OP_ALT);  while (*code == OP_ALT);
2452  return c;  return c;
2453  }  }
2454    
2455    
2456    
2457    
2458    
2459  /*************************************************  /*************************************************
2460  *        Compile a Regular Expression            *  *        Compile a Regular Expression            *
2461  *************************************************/  *************************************************/
# Line 1522  Arguments: Line 2468  Arguments:
2468    options      various option bits    options      various option bits
2469    errorptr     pointer to pointer to error text    errorptr     pointer to pointer to error text
2470    erroroffset  ptr offset in pattern where error was detected    erroroffset  ptr offset in pattern where error was detected
2471      tables       pointer to character tables or NULL
2472    
2473  Returns:       pointer to compiled data block, or NULL on error,  Returns:       pointer to compiled data block, or NULL on error,
2474                 with errorptr and erroroffset set                 with errorptr and erroroffset set
# Line 1529  Returns:       pointer to compiled data Line 2476  Returns:       pointer to compiled data
2476    
2477  pcre *  pcre *
2478  pcre_compile(const char *pattern, int options, const char **errorptr,  pcre_compile(const char *pattern, int options, const char **errorptr,
2479    int *erroroffset)    int *erroroffset, const unsigned char *tables)
2480  {  {
2481  real_pcre *re;  real_pcre *re;
 int spaces = 0;  
2482  int length = 3;      /* For initial BRA plus length */  int length = 3;      /* For initial BRA plus length */
2483  int runlength;  int runlength;
2484  int c, size;  int c, reqchar, countlits;
2485  int bracount = 0;  int bracount = 0;
 int brastack[200];  
2486  int top_backref = 0;  int top_backref = 0;
2487    int branch_extra = 0;
2488    int branch_newextra;
2489  unsigned int brastackptr = 0;  unsigned int brastackptr = 0;
2490    size_t size;
2491  uschar *code;  uschar *code;
2492  const uschar *ptr;  const uschar *ptr;
2493    compile_data compile_block;
2494    int brastack[BRASTACK_SIZE];
2495    uschar bralenstack[BRASTACK_SIZE];
2496    
2497  #ifdef DEBUG  #ifdef DEBUG
2498  uschar *code_base, *code_end;  uschar *code_base, *code_end;
2499  #endif  #endif
2500    
2501    /* Can't support UTF8 unless PCRE has been compiled to include the code. */
2502    
2503    #ifndef SUPPORT_UTF8
2504    if ((options & PCRE_UTF8) != 0)
2505      {
2506      *errorptr = ERR32;
2507      return NULL;
2508      }
2509    #endif
2510    
2511  /* We can't pass back an error message if errorptr is NULL; I guess the best we  /* We can't pass back an error message if errorptr is NULL; I guess the best we
2512  can do is just return NULL. */  can do is just return NULL. */
2513    
# Line 1568  if ((options & ~PUBLIC_OPTIONS) != 0) Line 2529  if ((options & ~PUBLIC_OPTIONS) != 0)
2529    return NULL;    return NULL;
2530    }    }
2531    
2532    /* Set up pointers to the individual character tables */
2533    
2534    if (tables == NULL) tables = pcre_default_tables;
2535    compile_block.lcc = tables + lcc_offset;
2536    compile_block.fcc = tables + fcc_offset;
2537    compile_block.cbits = tables + cbits_offset;
2538    compile_block.ctypes = tables + ctypes_offset;
2539    
2540    /* Reflect pattern for debugging output */
2541    
2542  DPRINTF(("------------------------------------------------------------------\n"));  DPRINTF(("------------------------------------------------------------------\n"));
2543  DPRINTF(("%s\n", pattern));  DPRINTF(("%s\n", pattern));
2544    
# Line 1583  while ((c = *(++ptr)) != 0) Line 2554  while ((c = *(++ptr)) != 0)
2554    {    {
2555    int min, max;    int min, max;
2556    int class_charcount;    int class_charcount;
2557      int bracket_length;
2558    
2559    if ((pcre_ctypes[c] & ctype_space) != 0)    if ((options & PCRE_EXTENDED) != 0)
     {  
     if ((options & PCRE_EXTENDED) != 0) continue;  
     spaces++;  
     }  
   
   if (c == '#' && (options & PCRE_EXTENDED) != 0)  
2560      {      {
2561      while ((c = *(++ptr)) != 0 && c != '\n');      if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2562      continue;      if (c == '#')
2563          {
2564          /* The space before the ; is to avoid a warning on a silly compiler
2565          on the Macintosh. */
2566          while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2567          continue;
2568          }
2569      }      }
2570    
2571    switch(c)    switch(c)
# Line 1606  while ((c = *(++ptr)) != 0) Line 2578  while ((c = *(++ptr)) != 0)
2578      case '\\':      case '\\':
2579        {        {
2580        const uschar *save_ptr = ptr;        const uschar *save_ptr = ptr;
2581        c = check_escape(&ptr, errorptr, bracount, options, FALSE);        c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2582        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2583        if (c >= 0)        if (c >= 0)
2584          {          {
# Line 1617  while ((c = *(++ptr)) != 0) Line 2589  while ((c = *(++ptr)) != 0)
2589        }        }
2590      length++;      length++;
2591    
2592      /* A back reference needs an additional char, plus either one or 5      /* A back reference needs an additional 2 bytes, plus either one or 5
2593      bytes for a repeat. We also need to keep the value of the highest      bytes for a repeat. We also need to keep the value of the highest
2594      back reference. */      back reference. */
2595    
# Line 1625  while ((c = *(++ptr)) != 0) Line 2597  while ((c = *(++ptr)) != 0)
2597        {        {
2598        int refnum = -c - ESC_REF;        int refnum = -c - ESC_REF;
2599        if (refnum > top_backref) top_backref = refnum;        if (refnum > top_backref) top_backref = refnum;
2600        length++;   /* For single back reference */        length += 2;   /* For single back reference */
2601        if (ptr[1] == '{' && is_counted_repeat(ptr+2))        if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2602          {          {
2603          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2604          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2605          if ((min == 0 && (max == 1 || max == -1)) ||          if ((min == 0 && (max == 1 || max == -1)) ||
2606            (min == 1 && max == -1))            (min == 1 && max == -1))
# Line 1652  while ((c = *(++ptr)) != 0) Line 2624  while ((c = *(++ptr)) != 0)
2624      or back reference. */      or back reference. */
2625    
2626      case '{':      case '{':
2627      if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;      if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2628      ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);      ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2629      if (*errorptr != NULL) goto PCRE_ERROR_RETURN;      if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2630      if ((min == 0 && (max == 1 || max == -1)) ||      if ((min == 0 && (max == 1 || max == -1)) ||
2631        (min == 1 && max == -1))        (min == 1 && max == -1))
# Line 1667  while ((c = *(++ptr)) != 0) Line 2639  while ((c = *(++ptr)) != 0)
2639      if (ptr[1] == '?') ptr++;      if (ptr[1] == '?') ptr++;
2640      continue;      continue;
2641    
2642      /* An alternation contains an offset to the next branch or ket. */      /* An alternation contains an offset to the next branch or ket. If any ims
2643        options changed in the previous branch(es), and/or if we are in a
2644        lookbehind assertion, extra space will be needed at the start of the
2645        branch. This is handled by branch_extra. */
2646    
2647      case '|':      case '|':
2648      length += 3;      length += 3 + branch_extra;
2649      continue;      continue;
2650    
2651      /* A character class uses 33 characters. Don't worry about character types      /* A character class uses 33 characters. Don't worry about character types
# Line 1684  while ((c = *(++ptr)) != 0) Line 2660  while ((c = *(++ptr)) != 0)
2660        {        {
2661        if (*ptr == '\\')        if (*ptr == '\\')
2662          {          {
2663          int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);          int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2664              &compile_block);
2665          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2666          if (-ch == ESC_b) class_charcount++; else class_charcount = 10;          if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2667          }          }
# Line 1701  while ((c = *(++ptr)) != 0) Line 2678  while ((c = *(++ptr)) != 0)
2678    
2679        /* A repeat needs either 1 or 5 bytes. */        /* A repeat needs either 1 or 5 bytes. */
2680    
2681        if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))        if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2682          {          {
2683          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2684          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2685          if ((min == 0 && (max == 1 || max == -1)) ||          if ((min == 0 && (max == 1 || max == -1)) ||
2686            (min == 1 && max == -1))            (min == 1 && max == -1))
# Line 1717  while ((c = *(++ptr)) != 0) Line 2694  while ((c = *(++ptr)) != 0)
2694      /* Brackets may be genuine groups or special things */      /* Brackets may be genuine groups or special things */
2695    
2696      case '(':      case '(':
2697        branch_newextra = 0;
2698        bracket_length = 3;
2699    
2700      /* Handle special forms of bracket, which all start (? */      /* Handle special forms of bracket, which all start (? */
2701    
2702      if (ptr[1] == '?') switch (c = ptr[2])      if (ptr[1] == '?')
2703        {        {
2704        /* Skip over comments entirely */        int set, unset;
2705        case '#':        int *optset;
       ptr += 3;  
       while (*ptr != 0 && *ptr != ')') ptr++;  
       if (*ptr == 0)  
         {  
         *errorptr = ERR18;  
         goto PCRE_ERROR_RETURN;  
         }  
       continue;  
2706    
2707        /* Non-referencing groups and lookaheads just move the pointer on, and        switch (c = ptr[2])
2708        then behave like a non-special bracket, except that they don't increment          {
2709        the count of extracting brackets. */          /* Skip over comments entirely */
2710            case '#':
2711        case ':':          ptr += 3;
2712        case '=':          while (*ptr != 0 && *ptr != ')') ptr++;
2713        case '!':          if (*ptr == 0)
2714        ptr += 2;            {
2715        break;            *errorptr = ERR18;
2716              goto PCRE_ERROR_RETURN;
2717              }
2718            continue;
2719    
2720        /* Ditto for the "once only" bracket, allowed only if the extra bit          /* Non-referencing groups and lookaheads just move the pointer on, and
2721        is set. */          then behave like a non-special bracket, except that they don't increment
2722            the count of extracting brackets. Ditto for the "once only" bracket,
2723            which is in Perl from version 5.005. */
2724    
2725        case '>':          case ':':
2726        if ((options & PCRE_EXTRA) != 0)          case '=':
2727          {          case '!':
2728            case '>':
2729          ptr += 2;          ptr += 2;
2730          break;          break;
         }  
       /* Else fall through */  
2731    
2732        /* Else loop setting valid options until ) is met. Anything else is an          /* A recursive call to the regex is an extension, to provide the
2733        error. */          facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2734    
2735        default:          case 'R':
2736        ptr += 2;          if (ptr[3] != ')')
       for (;; ptr++)  
         {  
         if ((c = *ptr) == 'i')  
2737            {            {
2738            options |= PCRE_CASELESS;            *errorptr = ERR29;
2739            continue;            goto PCRE_ERROR_RETURN;
2740            }            }
2741          else if ((c = *ptr) == 'm')          ptr += 3;
2742            length += 1;
2743            break;
2744    
2745            /* Lookbehinds are in Perl from version 5.005 */
2746    
2747            case '<':
2748            if (ptr[3] == '=' || ptr[3] == '!')
2749            {            {
2750            options |= PCRE_MULTILINE;            ptr += 3;
2751            continue;            branch_newextra = 3;
2752              length += 3;         /* For the first branch */
2753              break;
2754              }
2755            *errorptr = ERR24;
2756            goto PCRE_ERROR_RETURN;
2757    
2758            /* Conditionals are in Perl from version 5.005. The bracket must either
2759            be followed by a number (for bracket reference) or by an assertion
2760            group. */
2761    
2762            case '(':
2763            if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2764              {
2765              ptr += 4;
2766              length += 3;
2767              while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2768              if (*ptr != ')')
2769                {
2770                *errorptr = ERR26;
2771                goto PCRE_ERROR_RETURN;
2772                }
2773            }            }
2774          else if (c == 's')          else   /* An assertion must follow */
2775            {            {
2776            options |= PCRE_DOTALL;            ptr++;   /* Can treat like ':' as far as spacing is concerned */
2777            continue;            if (ptr[2] != '?' ||
2778                 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
2779                {
2780                ptr += 2;    /* To get right offset in message */
2781                *errorptr = ERR28;
2782                goto PCRE_ERROR_RETURN;
2783                }
2784              }
2785            break;
2786    
2787            /* Else loop checking valid options until ) is met. Anything else is an
2788            error. If we are without any brackets, i.e. at top level, the settings
2789            act as if specified in the options, so massage the options immediately.
2790            This is for backward compatibility with Perl 5.004. */
2791    
2792            default:
2793            set = unset = 0;
2794            optset = &set;
2795            ptr += 2;
2796    
2797            for (;; ptr++)
2798              {
2799              c = *ptr;
2800              switch (c)
2801                {
2802                case 'i':
2803                *optset |= PCRE_CASELESS;
2804                continue;
2805    
2806                case 'm':
2807                *optset |= PCRE_MULTILINE;
2808                continue;
2809    
2810                case 's':
2811                *optset |= PCRE_DOTALL;
2812                continue;
2813    
2814                case 'x':
2815                *optset |= PCRE_EXTENDED;
2816                continue;
2817    
2818                case 'X':
2819                *optset |= PCRE_EXTRA;
2820                continue;
2821    
2822                case 'U':
2823                *optset |= PCRE_UNGREEDY;
2824                continue;
2825    
2826                case '-':
2827                optset = &unset;
2828                continue;
2829    
2830                /* A termination by ')' indicates an options-setting-only item;
2831                this is global at top level; otherwise nothing is done here and
2832                it is handled during the compiling process on a per-bracket-group
2833                basis. */
2834    
2835                case ')':
2836                if (brastackptr == 0)
2837                  {
2838                  options = (options | set) & (~unset);
2839                  set = unset = 0;     /* To save length */
2840                  }
2841                /* Fall through */
2842    
2843                /* A termination by ':' indicates the start of a nested group with
2844                the given options set. This is again handled at compile time, but
2845                we must allow for compiled space if any of the ims options are
2846                set. We also have to allow for resetting space at the end of
2847                the group, which is why 4 is added to the length and not just 2.
2848                If there are several changes of options within the same group, this
2849                will lead to an over-estimate on the length, but this shouldn't
2850                matter very much. We also have to allow for resetting options at
2851                the start of any alternations, which we do by setting
2852                branch_newextra to 2. Finally, we record whether the case-dependent
2853                flag ever changes within the regex. This is used by the "required
2854                character" code. */
2855    
2856                case ':':
2857                if (((set|unset) & PCRE_IMS) != 0)
2858                  {
2859                  length += 4;
2860                  branch_newextra = 2;
2861                  if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2862                  }
2863                goto END_OPTIONS;
2864    
2865                /* Unrecognized option character */
2866    
2867                default:
2868                *errorptr = ERR12;
2869                goto PCRE_ERROR_RETURN;
2870                }
2871            }            }
2872          else if (c == 'x')  
2873            /* If we hit a closing bracket, that's it - this is a freestanding
2874            option-setting. We need to ensure that branch_extra is updated if
2875            necessary. The only values branch_newextra can have here are 0 or 2.
2876            If the value is 2, then branch_extra must either be 2 or 5, depending
2877            on whether this is a lookbehind group or not. */
2878    
2879            END_OPTIONS:
2880            if (c == ')')
2881            {            {
2882            options |= PCRE_EXTENDED;            if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2883            length -= spaces;          /* Already counted spaces */              branch_extra += branch_newextra;
2884            continue;            continue;
2885            }            }
         else if (c == ')') break;  
2886    
2887          *errorptr = ERR12;          /* If options were terminated by ':' control comes here. Fall through
2888          goto PCRE_ERROR_RETURN;          to handle the group below. */
2889          }          }
       continue;                      /* End of this bracket handling */  
2890        }        }
2891    
2892      /* Extracting brackets must be counted so we can process escapes in a      /* Extracting brackets must be counted so we can process escapes in a
2893      Perlish way. */      Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
2894        need an additional 3 bytes of store per extracting bracket. */
2895    
2896      else bracount++;      else
2897          {
2898          bracount++;
2899          if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
2900          }
2901    
2902      /* Non-special forms of bracket. Save length for computing whole length      /* Save length for computing whole length at end if there's a repeat that
2903      at end if there's a repeat that requires duplication of the group. */      requires duplication of the group. Also save the current value of
2904        branch_extra, and start the new group with the new value. If non-zero, this
2905        will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
2906    
2907      if (brastackptr >= sizeof(brastack)/sizeof(int))      if (brastackptr >= sizeof(brastack)/sizeof(int))
2908        {        {
# Line 1804  while ((c = *(++ptr)) != 0) Line 2910  while ((c = *(++ptr)) != 0)
2910        goto PCRE_ERROR_RETURN;        goto PCRE_ERROR_RETURN;
2911        }        }
2912    
2913        bralenstack[brastackptr] = branch_extra;
2914        branch_extra = branch_newextra;
2915    
2916      brastack[brastackptr++] = length;      brastack[brastackptr++] = length;
2917      length += 3;      length += bracket_length;
2918      continue;      continue;
2919    
2920      /* Handle ket. Look for subsequent max/min; for certain sets of values we      /* Handle ket. Look for subsequent max/min; for certain sets of values we
2921      have to replicate this bracket up to that many times. If brastackptr is      have to replicate this bracket up to that many times. If brastackptr is
2922      0 this is an unmatched bracket which will generate an error, but take care      0 this is an unmatched bracket which will generate an error, but take care
2923      not to try to access brastack[-1]. */      not to try to access brastack[-1] when computing the length and restoring
2924        the branch_extra value. */
2925    
2926      case ')':      case ')':
2927      length += 3;      length += 3;
2928        {        {
2929        int minval = 1;        int minval = 1;
2930        int maxval = 1;        int maxval = 1;
2931        int duplength = (brastackptr > 0)? length - brastack[--brastackptr] : 0;        int duplength;
2932    
2933          if (brastackptr > 0)
2934            {
2935            duplength = length - brastack[--brastackptr];
2936            branch_extra = bralenstack[brastackptr];
2937            }
2938          else duplength = 0;
2939    
2940        /* Leave ptr at the final char; for read_repeat_counts this happens        /* Leave ptr at the final char; for read_repeat_counts this happens
2941        automatically; for the others we need an increment. */        automatically; for the others we need an increment. */
2942    
2943        if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))        if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2944          {          {
2945          ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);          ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2946              &compile_block);
2947          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2948          }          }
2949        else if (c == '*') { minval = 0; maxval = -1; ptr++; }        else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2950        else if (c == '+') { maxval = -1; ptr++; }        else if (c == '+') { maxval = -1; ptr++; }
2951        else if (c == '?') { minval = 0; ptr++; }        else if (c == '?') { minval = 0; ptr++; }
2952    
2953        /* If there is a minimum > 1 we have to replicate up to minval-1 times;        /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2954        if there is a limited maximum we have to replicate up to maxval-1 times        group, and if the maximum is greater than zero, we have to replicate
2955        and allow for a BRAZERO item before each optional copy, as we also have        maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2956        to do before the first copy if the minimum is zero. */        bracket set - hence the 7. */
2957    
2958        if (minval == 0) length++;        if (minval == 0)
2959          else if (minval > 1) length += (minval - 1) * duplength;          {
2960        if (maxval > minval) length += (maxval - minval) * (duplength + 1);          length++;
2961            if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2962            }
2963    
2964          /* When the minimum is greater than zero, we have to replicate up to
2965          minval-1 times, with no additions required in the copies. Then, if
2966          there is a limited maximum we have to replicate up to maxval-1 times
2967          allowing for a BRAZERO item before each optional copy and nesting
2968          brackets for all but one of the optional copies. */
2969    
2970          else
2971            {
2972            length += (minval - 1) * duplength;
2973            if (maxval > minval)   /* Need this test as maxval=-1 means no limit */
2974              length += (maxval - minval) * (duplength + 7) - 6;
2975            }
2976        }        }
2977      continue;      continue;
2978    
# Line 1854  while ((c = *(++ptr)) != 0) Line 2987  while ((c = *(++ptr)) != 0)
2987      runlength = 0;      runlength = 0;
2988      do      do
2989        {        {
2990        if ((pcre_ctypes[c] & ctype_space) != 0)        if ((options & PCRE_EXTENDED) != 0)
         {  
         if ((options & PCRE_EXTENDED) != 0) continue;  
         spaces++;  
         }  
   
       if (c == '#' && (options & PCRE_EXTENDED) != 0)  
2991          {          {
2992          while ((c = *(++ptr)) != 0 && c != '\n');          if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2993          continue;          if (c == '#')
2994              {
2995              /* The space before the ; is to avoid a warning on a silly compiler
2996              on the Macintosh. */
2997              while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2998              continue;
2999              }
3000          }          }
3001    
3002        /* Backslash may introduce a data char or a metacharacter; stop the        /* Backslash may introduce a data char or a metacharacter; stop the
# Line 1872  while ((c = *(++ptr)) != 0) Line 3005  while ((c = *(++ptr)) != 0)
3005        if (c == '\\')        if (c == '\\')
3006          {          {
3007          const uschar *saveptr = ptr;          const uschar *saveptr = ptr;
3008          c = check_escape(&ptr, errorptr, bracount, options, FALSE);          c = check_escape(&ptr, errorptr, bracount, options, FALSE,
3009              &compile_block);
3010          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3011          if (c < 0) { ptr = saveptr; break; }          if (c < 0) { ptr = saveptr; break; }
3012    
3013    #ifdef SUPPORT_UTF8
3014            if (c > 127 && (options & PCRE_UTF8) != 0)
3015              {
3016              int i;
3017              for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
3018                if (c <= utf8_table1[i]) break;
3019              runlength += i;
3020              }
3021    #endif
3022          }          }
3023    
3024        /* Ordinary character or single-char escape */        /* Ordinary character or single-char escape */
# Line 1884  while ((c = *(++ptr)) != 0) Line 3028  while ((c = *(++ptr)) != 0)
3028    
3029      /* This "while" is the end of the "do" above. */      /* This "while" is the end of the "do" above. */
3030    
3031      while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);      while (runlength < MAXLIT &&
3032          (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
3033    
3034      ptr--;      ptr--;
3035      length += runlength;      length += runlength;
# Line 1915  if (re == NULL) Line 3060  if (re == NULL)
3060    return NULL;    return NULL;
3061    }    }
3062    
3063  /* Put in the magic number and the options. */  /* Put in the magic number, and save the size, options, and table pointer */
3064    
3065  re->magic_number = MAGIC_NUMBER;  re->magic_number = MAGIC_NUMBER;
3066    re->size = size;
3067  re->options = options;  re->options = options;
3068    re->tables = tables;
3069    
3070  /* Set up a starting, non-extracting bracket, then compile the expression. On  /* Set up a starting, non-extracting bracket, then compile the expression. On
3071  error, *errorptr will be set non-NULL, so we don't need to look at the result  error, *errorptr will be set non-NULL, so we don't need to look at the result
# Line 1928  ptr = (const uschar *)pattern; Line 3075  ptr = (const uschar *)pattern;
3075  code = re->code;  code = re->code;
3076  *code = OP_BRA;  *code = OP_BRA;
3077  bracount = 0;  bracount = 0;
3078  (void)compile_regex(options, &bracount, &code, &ptr, errorptr);  (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, 0,
3079      &reqchar, &countlits, &compile_block);
3080  re->top_bracket = bracount;  re->top_bracket = bracount;
3081  re->top_backref = top_backref;  re->top_backref = top_backref;
3082    
# Line 1945  if debugging, leave the test till after Line 3093  if debugging, leave the test till after
3093  if (code - re->code > length) *errorptr = ERR23;  if (code - re->code > length) *errorptr = ERR23;
3094  #endif  #endif
3095    
3096    /* Give an error if there's a back reference to a non-existent capturing
3097    subpattern. */
3098    
3099    if (top_backref > re->top_bracket) *errorptr = ERR15;
3100    
3101  /* Failed to compile */  /* Failed to compile */
3102    
3103  if (*errorptr != NULL)  if (*errorptr != NULL)
# Line 1955  if (*errorptr != NULL) Line 3108  if (*errorptr != NULL)
3108    return NULL;    return NULL;
3109    }    }
3110    
3111  /* If the anchored option was not passed, set flag if we can determine that it  /* If the anchored option was not passed, set flag if we can determine that the
3112  is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if  pattern is anchored by virtue of ^ characters or \A or anything else (such as
3113  we can determine what the first character has to be, because that speeds up  starting with .* when DOTALL is set).
3114  unanchored matches no end. In the case of multiline matches, an alternative is  
3115  to set the PCRE_STARTLINE flag if all branches start with ^. */  Otherwise, see if we can determine what the first character has to be, because
3116    that speeds up unanchored matches no end. If not, see if we can set the
3117    PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
3118    start with ^, and also when all branches start with .* for non-DOTALL matches.
3119    */
3120    
3121  if ((options & PCRE_ANCHORED) == 0)  if ((options & PCRE_ANCHORED) == 0)
3122    {    {
3123    if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))    int temp_options = options;
3124      if (is_anchored(re->code, &temp_options))
3125      re->options |= PCRE_ANCHORED;      re->options |= PCRE_ANCHORED;
3126    else    else
3127      {      {
3128      int ch = find_firstchar(re->code);      int ch = find_firstchar(re->code, &temp_options);
3129      if (ch >= 0)      if (ch >= 0)
3130        {        {
3131        re->first_char = ch;        re->first_char = ch;
# Line 1978  if ((options & PCRE_ANCHORED) == 0) Line 3136  if ((options & PCRE_ANCHORED) == 0)
3136      }      }
3137    }    }
3138    
3139    /* Save the last required character if there are at least two literal
3140    characters on all paths, or if there is no first character setting. */
3141    
3142    if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
3143      {
3144      re->req_char = reqchar;
3145      re->options |= PCRE_REQCHSET;
3146      }
3147    
3148  /* Print out the compiled data for debugging */  /* Print out the compiled data for debugging */
3149    
3150  #ifdef DEBUG  #ifdef DEBUG
3151    
3152  printf("Length = %d top_bracket = %d top_backref=%d\n",  printf("Length = %d top_bracket = %d top_backref = %d\n",
3153    length, re->top_bracket, re->top_backref);    length, re->top_bracket, re->top_backref);
3154    
3155  if (re->options != 0)  if (re->options != 0)
3156    {    {
3157    printf("%s%s%s%s%s%s%s\n",    printf("%s%s%s%s%s%s%s%s%s\n",
3158      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
3159      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
3160        ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
3161      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
3162      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
3163      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
3164      ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",      ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
3165      ((re->options & PCRE_EXTRA) != 0)? "extra " : "");      ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
3166        ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
3167    }    }
3168    
3169  if ((re->options & PCRE_FIRSTSET) != 0)  if ((re->options & PCRE_FIRSTSET) != 0)
# Line 2003  if ((re->options & PCRE_FIRSTSET) != 0) Line 3172  if ((re->options & PCRE_FIRSTSET) != 0)
3172      else printf("First char = \\x%02x\n", re->first_char);      else printf("First char = \\x%02x\n", re->first_char);
3173    }    }
3174    
3175    if ((re->options & PCRE_REQCHSET) != 0)
3176      {
3177      if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
3178        else printf("Req char = \\x%02x\n", re->req_char);
3179      }
3180    
3181  code_end = code;  code_end = code;
3182  code_base = code = re->code;  code_base = code = re->code;
3183    
# Line 2014  while (code < code_end) Line 3189  while (code < code_end)
3189    
3190    if (*code >= OP_BRA)    if (*code >= OP_BRA)
3191      {      {
3192      printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);      if (*code - OP_BRA > EXTRACT_BASIC_MAX)
3193          printf("%3d Bra extra", (code[1] << 8) + code[2]);
3194        else
3195          printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
3196      code += 2;      code += 2;
3197      }      }
3198    
3199    else switch(*code)    else switch(*code)
3200      {      {
3201        case OP_OPT:
3202        printf(" %.2x %s", code[1], OP_names[*code]);
3203        code++;
3204        break;
3205    
3206      case OP_CHARS:      case OP_CHARS:
3207      charlength = *(++code);      charlength = *(++code);
3208      printf("%3d ", charlength);      printf("%3d ", charlength);
# Line 2033  while (code < code_end) Line 3216  while (code < code_end)
3216      case OP_KET:      case OP_KET:
3217      case OP_ASSERT:      case OP_ASSERT:
3218      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
3219        case OP_ASSERTBACK:
3220        case OP_ASSERTBACK_NOT:
3221      case OP_ONCE:      case OP_ONCE:
3222        case OP_REVERSE:
3223        case OP_BRANUMBER:
3224        case OP_COND:
3225        case OP_CREF:
3226      printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);      printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
3227      code += 2;      code += 2;
3228      break;      break;
# Line 2106  while (code < code_end) Line 3295  while (code < code_end)
3295      break;      break;
3296    
3297      case OP_REF:      case OP_REF:
3298      printf("    \\%d", *(++code));      printf("    \\%d", (code[1] << 8) | code[2]);
3299      code ++;      code += 3;
3300      goto CLASS_REF_REPEAT;      goto CLASS_REF_REPEAT;
3301    
3302      case OP_CLASS:      case OP_CLASS:
     case OP_NEGCLASS:  
3303        {        {
3304        int i, min, max;        int i, min, max;
3305          code++;
3306        if (*code++ == OP_CLASS) printf("    [");        printf("    [");
         else printf("   ^[");  
3307    
3308        for (i = 0; i < 256; i++)        for (i = 0; i < 256; i++)
3309          {          {
# Line 2198  return (pcre *)re; Line 3385  return (pcre *)re;
3385    
3386    
3387  /*************************************************  /*************************************************
 *        Match a character type                  *  
 *************************************************/  
   
 /* Not used in all the places it might be as it's sometimes faster  
 to put the code inline.  
   
 Arguments:  
   type        the character type  
   c           the character  
   dotall      the dotall flag  
   
 Returns:      TRUE if character is of the type  
 */  
   
 static BOOL  
 match_type(int type, int c, BOOL dotall)  
 {  
   
 #ifdef DEBUG  
 if (isprint(c)) printf("matching subject %c against ", c);  
   else printf("matching subject \\x%02x against ", c);  
 printf("%s\n", OP_names[type]);  
 #endif  
   
 switch(type)  
   {  
   case OP_ANY:            return dotall || c != '\n';  
   case OP_NOT_DIGIT:      return (pcre_ctypes[c] & ctype_digit) == 0;  
   case OP_DIGIT:          return (pcre_ctypes[c] & ctype_digit) != 0;  
   case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;  
   case OP_WHITESPACE:     return (pcre_ctypes[c] & ctype_space) != 0;  
   case OP_NOT_WORDCHAR:   return (pcre_ctypes[c] & ctype_word) == 0;  
   case OP_WORDCHAR:       return (pcre_ctypes[c] & ctype_word) != 0;  
   }  
 return FALSE;  
 }  
   
   
   
 /*************************************************  
3388  *          Match a back-reference                *  *          Match a back-reference                *
3389  *************************************************/  *************************************************/
3390    
3391  /* If a back reference hasn't been set, the match fails.  /* If a back reference hasn't been set, the length that is passed is greater
3392    than the number of characters left in the string, so the match fails.
3393    
3394  Arguments:  Arguments:
3395    number      reference number    offset      index into the offset vector
3396    eptr        points into the subject    eptr        points into the subject
3397    length      length to be matched    length      length to be matched
3398    md          points to match data block    md          points to match data block
3399      ims         the ims flags
3400    
3401  Returns:      TRUE if matched  Returns:      TRUE if matched
3402  */  */
3403    
3404  static BOOL  static BOOL
3405  match_ref(int number, register const uschar *eptr, int length, match_data *md)  match_ref(int offset, register const uschar *eptr, int length, match_data *md,
3406      unsigned long int ims)
3407  {  {
3408  const uschar *p = md->start_subject + md->offset_vector[number];  const uschar *p = md->start_subject + md->offset_vector[offset];
3409    
3410  #ifdef DEBUG  #ifdef DEBUG
3411  if (eptr >= md->end_subject)  if (eptr >= md->end_subject)
# Line 2272  printf("\n"); Line 3422  printf("\n");
3422    
3423  /* Always fail if not enough characters left */  /* Always fail if not enough characters left */
3424    
3425  if (length > md->end_subject - p) return FALSE;  if (length > md->end_subject - eptr) return FALSE;
3426    
3427  /* Separate the caseless case for speed */  /* Separate the caseless case for speed */
3428    
3429  if (md->caseless)  if ((ims & PCRE_CASELESS) != 0)
3430    { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }    {
3431      while (length-- > 0)
3432        if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
3433      }
3434  else  else
3435    { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }    { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3436    
# Line 2290  return TRUE; Line 3443  return TRUE;
3443  *         Match from current position            *  *         Match from current position            *
3444  *************************************************/  *************************************************/
3445    
3446  /* On entry ecode points to the first opcode, and eptr to the first character.  /* On entry ecode points to the first opcode, and eptr to the first character
3447    in the subject string, while eptrb holds the value of eptr at the start of the
3448    last bracketed group - used for breaking infinite loops matching zero-length
3449    strings.
3450    
3451  Arguments:  Arguments:
3452     eptr        pointer in subject     eptr        pointer in subject
3453     ecode       position in code     ecode       position in code
3454     offset_top  current top pointer     offset_top  current top pointer
3455     md          pointer to "static" info for the match     md          pointer to "static" info for the match
3456       ims         current /i, /m, and /s options
3457       eptrb       pointer to chain of blocks containing eptr at start of
3458                     brackets - for testing for empty matches
3459       flags       can contain
3460                     match_condassert - this is an assertion condition
3461                     match_isgroup - this is the start of a bracketed group
3462    
3463  Returns:       TRUE if matched  Returns:       TRUE if matched
3464  */  */
3465    
3466  static BOOL  static BOOL
3467  match(register const uschar *eptr, register const uschar *ecode, int offset_top,  match(register const uschar *eptr, register const uschar *ecode,
3468    match_data *md)    int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
3469      int flags)
3470  {  {
3471    unsigned long int original_ims = ims;   /* Save for resetting on ')' */
3472    eptrblock newptrb;
3473    
3474    /* At the start of a bracketed group, add the current subject pointer to the
3475    stack of such pointers, to be re-instated at the end of the group when we hit
3476    the closing ket. When match() is called in other circumstances, we don't add to
3477    the stack. */
3478    
3479    if ((flags & match_isgroup) != 0)
3480      {
3481      newptrb.prev = eptrb;
3482      newptrb.saved_eptr = eptr;
3483      eptrb = &newptrb;
3484      }
3485    
3486    /* Now start processing the operations. */
3487    
3488  for (;;)  for (;;)
3489    {    {
3490      int op = (int)*ecode;
3491    int min, max, ctype;    int min, max, ctype;
3492    register int i;    register int i;
3493    register int c;    register int c;
3494    BOOL minimize = FALSE;    BOOL minimize = FALSE;
3495    
3496    /* Opening bracket. Check the alternative branches in turn, failing if none    /* Opening capturing bracket. If there is space in the offset vector, save
3497    match. We have to set the start offset if required and there is space    the current subject position in the working slot at the top of the vector. We
3498    in the offset vector so that it is available for subsequent back references    mustn't change the current values of the data slot, because they may be set
3499    if the bracket matches. However, if the bracket fails, we must put back the    from a previous iteration of this group, and be referred to by a reference
3500    previous value of both offsets in case they were set by a previous copy of    inside the group.
3501    the same bracket. Don't worry about setting the flag for the error case here;  
3502    that is handled in the code for KET. */    If the bracket fails to match, we need to restore this value and also the
3503      values of the final offsets, in case they were set by a previous iteration of
3504      the same bracket.
3505    
3506      If there isn't enough space in the offset vector, treat this as if it were a
3507      non-capturing bracket. Don't worry about setting the flag for the error case
3508      here; that is handled in the code for KET. */
3509    
3510    if ((int)*ecode >= OP_BRA)    if (op > OP_BRA)
3511      {      {
3512      int number = (*ecode - OP_BRA) << 1;      int offset;
3513      int save_offset1 = 0, save_offset2 = 0;      int number = op - OP_BRA;
3514    
3515      DPRINTF(("start bracket %d\n", number/2));      /* For extended extraction brackets (large number), we have to fish out the
3516        number from a dummy opcode at the start. */
3517    
3518        if (number > EXTRACT_BASIC_MAX) number = (ecode[4] << 8) | ecode[5];
3519        offset = number << 1;
3520    
3521    #ifdef DEBUG
3522        printf("start bracket %d subject=", number);
3523        pchars(eptr, 16, TRUE, md);
3524        printf("\n");
3525    #endif
3526    
3527      if (number > 0 && number < md->offset_end)      if (offset < md->offset_max)
3528        {        {
3529        save_offset1 = md->offset_vector[number];        int save_offset1 = md->offset_vector[offset];
3530        save_offset2 = md->offset_vector[number+1];        int save_offset2 = md->offset_vector[offset+1];
3531        md->offset_vector[number] = eptr - md->start_subject;        int save_offset3 = md->offset_vector[md->offset_end - number];
3532    
3533        DPRINTF(("saving %d %d\n", save_offset1, save_offset2));        DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
3534          md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
3535    
3536          do
3537            {
3538            if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3539              return TRUE;
3540            ecode += (ecode[1] << 8) + ecode[2];
3541            }
3542          while (*ecode == OP_ALT);
3543    
3544          DPRINTF(("bracket %d failed\n", number));
3545    
3546          md->offset_vector[offset] = save_offset1;
3547          md->offset_vector[offset+1] = save_offset2;
3548          md->offset_vector[md->offset_end - number] = save_offset3;
3549    
3550          return FALSE;
3551        }        }
3552    
3553      /* Recurse for all the alternatives. */      /* Insufficient room for saving captured contents */
3554    
3555        else op = OP_BRA;
3556        }
3557    
3558      /* Other types of node can be handled by a switch */
3559    
3560      switch(op)
3561        {
3562        case OP_BRA:     /* Non-capturing bracket: optimized */
3563        DPRINTF(("start bracket 0\n"));
3564      do      do
3565        {        {
3566        if (match(eptr, ecode+3, offset_top, md)) return TRUE;        if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3567            return TRUE;
3568        ecode += (ecode[1] << 8) + ecode[2];        ecode += (ecode[1] << 8) + ecode[2];
3569        }        }
3570      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
3571        DPRINTF(("bracket 0 failed\n"));
3572        return FALSE;
3573    
3574        /* Conditional group: compilation checked that there are no more than
3575        two branches. If the condition is false, skipping the first branch takes us
3576        past the end if there is only one branch, but that's OK because that is
3577        exactly what going to the ket would do. */
3578    
3579        case OP_COND:
3580        if (ecode[3] == OP_CREF)         /* Condition is extraction test */
3581          {
3582          int offset = (ecode[4] << 9) | (ecode[5] << 1); /* Doubled ref number */
3583          return match(eptr,
3584            ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3585              6 : 3 + (ecode[1] << 8) + ecode[2]),
3586            offset_top, md, ims, eptrb, match_isgroup);
3587          }
3588    
3589      DPRINTF(("bracket %d failed\n", number/2));      /* The condition is an assertion. Call match() to evaluate it - setting
3590        the match_condassert flag causes it to stop at the end of an assertion. */
3591    
3592      if (number > 0 && number < md->offset_end)      else
3593        {        {
3594        md->offset_vector[number] = save_offset1;        if (match(eptr, ecode+3, offset_top, md, ims, NULL,
3595        md->offset_vector[number+1] = save_offset2;            match_condassert | match_isgroup))
3596            {
3597            ecode += 3 + (ecode[4] << 8) + ecode[5];
3598            while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3599            }
3600          else ecode += (ecode[1] << 8) + ecode[2];
3601          return match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup);
3602        }        }
3603        /* Control never reaches here */
3604    
3605      return FALSE;      /* Skip over conditional reference or large extraction number data if
3606      }      encountered. */
3607    
3608    /* Other types of node can be handled by a switch */      case OP_CREF:
3609        case OP_BRANUMBER:
3610        ecode += 3;
3611        break;
3612    
3613        /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3614        an empty string - recursion will then try other alternatives, if any. */
3615    
   switch(*ecode)  
     {  
3616      case OP_END:      case OP_END:
3617        if (md->notempty && eptr == md->start_match) return FALSE;
3618      md->end_match_ptr = eptr;          /* Record where we ended */      md->end_match_ptr = eptr;          /* Record where we ended */
3619      md->end_offset_top = offset_top;   /* and how many extracts were taken */      md->end_offset_top = offset_top;   /* and how many extracts were taken */
3620      return TRUE;      return TRUE;
3621    
3622      /* The equivalent of Prolog's "cut" - if the rest doesn't match, the      /* Change option settings */
     whole thing doesn't match, so we have to get out via a longjmp(). */  
3623    
3624      case OP_CUT:      case OP_OPT:
3625      if (match(eptr, ecode+1, offset_top, md)) return TRUE;      ims = ecode[1];
3626      longjmp(md->fail_env, 1);      ecode += 2;
3627        DPRINTF(("ims set to %02lx\n", ims));
3628        break;
3629    
3630      /* Assertion brackets. Check the alternative branches in turn - the      /* Assertion brackets. Check the alternative branches in turn - the
3631      matching won't pass the KET for an assertion. If any one branch matches,      matching won't pass the KET for an assertion. If any one branch matches,
3632      the assertion is true. */      the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
3633        start of each branch to move the current point backwards, so the code at
3634        this level is identical to the lookahead case. */
3635    
3636      case OP_ASSERT:      case OP_ASSERT:
3637        case OP_ASSERTBACK:
3638      do      do
3639        {        {
3640        if (match(eptr, ecode+3, offset_top, md)) break;        if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup)) break;
3641        ecode += (ecode[1] << 8) + ecode[2];        ecode += (ecode[1] << 8) + ecode[2];
3642        }        }
3643      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
3644      if (*ecode == OP_KET) return FALSE;      if (*ecode == OP_KET) return FALSE;
3645    
3646        /* If checking an assertion for a condition, return TRUE. */
3647    
3648        if ((flags & match_condassert) != 0) return TRUE;
3649    
3650      /* Continue from after the assertion, updating the offsets high water      /* Continue from after the assertion, updating the offsets high water
3651      mark, since extracts may have been taken during the assertion. */      mark, since extracts may have been taken during the assertion. */
3652    
# Line 2396  for (;;) Line 3658  for (;;)
3658      /* Negative assertion: all branches must fail to match */      /* Negative assertion: all branches must fail to match */
3659    
3660      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
3661        case OP_ASSERTBACK_NOT:
3662      do      do
3663        {        {
3664        if (match(eptr, ecode+3, offset_top, md)) return FALSE;        if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup))
3665            return FALSE;
3666        ecode += (ecode[1] << 8) + ecode[2];        ecode += (ecode[1] << 8) + ecode[2];
3667        }        }
3668      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
3669    
3670        if ((flags & match_condassert) != 0) return TRUE;
3671    
3672      ecode += 3;      ecode += 3;
3673      continue;      continue;
3674    
3675        /* Move the subject pointer back. This occurs only at the start of
3676        each branch of a lookbehind assertion. If we are too close to the start to
3677        move back, this match function fails. When working with UTF-8 we move
3678        back a number of characters, not bytes. */
3679    
3680        case OP_REVERSE:
3681    #ifdef SUPPORT_UTF8
3682        c = (ecode[1] << 8) + ecode[2];
3683        for (i = 0; i < c; i++)
3684          {
3685          eptr--;
3686          BACKCHAR(eptr)
3687          }
3688    #else
3689        eptr -= (ecode[1] << 8) + ecode[2];
3690    #endif
3691    
3692        if (eptr < md->start_subject) return FALSE;
3693        ecode += 3;
3694        break;
3695    
3696        /* Recursion matches the current regex, nested. If there are any capturing
3697        brackets started but not finished, we have to save their starting points
3698        and reinstate them after the recursion. However, we don't know how many
3699        such there are (offset_top records the completed total) so we just have
3700        to save all the potential data. There may be up to 99 such values, which
3701        is a bit large to put on the stack, but using malloc for small numbers
3702        seems expensive. As a compromise, the stack is used when there are fewer
3703        than 16 values to store; otherwise malloc is used. A problem is what to do
3704        if the malloc fails ... there is no way of returning to the top level with
3705        an error. Save the top 15 values on the stack, and accept that the rest
3706        may be wrong. */
3707    
3708        case OP_RECURSE:
3709          {
3710          BOOL rc;
3711          int *save;
3712          int stacksave[15];
3713    
3714          c = md->offset_max;
3715    
3716          if (c < 16) save = stacksave; else
3717            {
3718            save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3719            if (save == NULL)
3720              {
3721              save = stacksave;
3722              c = 15;
3723              }
3724            }
3725    
3726          for (i = 1; i <= c; i++)
3727            save[i] = md->offset_vector[md->offset_end - i];
3728          rc = match(eptr, md->start_pattern, offset_top, md, ims, eptrb,
3729            match_isgroup);
3730          for (i = 1; i <= c; i++)
3731            md->offset_vector[md->offset_end - i] = save[i];
3732          if (save != stacksave) (pcre_free)(save);
3733          if (!rc) return FALSE;
3734    
3735          /* In case the recursion has set more capturing values, save the final
3736          number, then move along the subject till after the recursive match,
3737          and advance one byte in the pattern code. */
3738    
3739          offset_top = md->end_offset_top;
3740          eptr = md->end_match_ptr;
3741          ecode++;
3742          }
3743        break;
3744    
3745      /* "Once" brackets are like assertion brackets except that after a match,      /* "Once" brackets are like assertion brackets except that after a match,
3746      the point in the subject string is not moved back. Thus there can never be      the point in the subject string is not moved back. Thus there can never be
3747      a move back into the brackets. Check the alternative branches in turn - the      a move back into the brackets. Check the alternative branches in turn - the
3748      matching won't pass the KET for this kind of subpattern. If any one branch      matching won't pass the KET for this kind of subpattern. If any one branch
3749      matches, we carry on, leaving the subject pointer. */      matches, we carry on as at the end of a normal bracket, leaving the subject
3750        pointer. */
3751    
3752      case OP_ONCE:      case OP_ONCE:
     do  
3753        {        {
3754        if (match(eptr, ecode+3, offset_top, md)) break;        const uschar *prev = ecode;
3755        ecode += (ecode[1] << 8) + ecode[2];        const uschar *saved_eptr = eptr;
3756        }  
3757      while (*ecode == OP_ALT);        do
3758      if (*ecode == OP_KET) return FALSE;          {
3759            if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3760              break;
3761            ecode += (ecode[1] << 8) + ecode[2];
3762            }
3763          while (*ecode == OP_ALT);
3764    
3765      /* Continue as from after the assertion, updating the offsets high water        /* If hit the end of the group (which could be repeated), fail */
     mark, since extracts may have been taken. */  
3766    
3767      do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);        if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
3768      ecode += 3;  
3769      offset_top = md->end_offset_top;        /* Continue as from after the assertion, updating the offsets high water
3770      eptr = md->end_match_ptr;        mark, since extracts may have been taken. */
3771      continue;  
3772          do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3773    
3774          offset_top = md->end_offset_top;
3775          eptr = md->end_match_ptr;
3776    
3777          /* For a non-repeating ket, just continue at this level. This also
3778          happens for a repeating ket if no characters were matched in the group.
3779          This is the forcible breaking of infinite loops as implemented in Perl
3780          5.005. If there is an options reset, it will get obeyed in the normal
3781          course of events. */
3782    
3783          if (*ecode == OP_KET || eptr == saved_eptr)
3784            {
3785            ecode += 3;
3786            break;
3787            }
3788    
3789          /* The repeating kets try the rest of the pattern or restart from the
3790          preceding bracket, in the appropriate order. We need to reset any options
3791          that changed within the bracket before re-running it, so check the next
3792          opcode. */
3793    
3794          if (ecode[3] == OP_OPT)
3795            {
3796            ims = (ims & ~PCRE_IMS) | ecode[4];
3797            DPRINTF(("ims set to %02lx at group repeat\n", ims));
3798            }
3799    
3800          if (*ecode == OP_KETRMIN)
3801            {
3802            if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3803                match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3804                  return TRUE;
3805            }
3806          else  /* OP_KETRMAX */
3807            {
3808            if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3809                match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3810            }
3811          }
3812        return FALSE;
3813    
3814      /* An alternation is the end of a branch; scan along to find the end of the      /* An alternation is the end of a branch; scan along to find the end of the
3815      bracketed group and go to there. */      bracketed group and go to there. */
# Line 2445  for (;;) Line 3827  for (;;)
3827      case OP_BRAZERO:      case OP_BRAZERO:
3828        {        {
3829        const uschar *next = ecode+1;        const uschar *next = ecode+1;
3830        if (match(eptr, next, offset_top, md)) return TRUE;        if (match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
3831            return TRUE;
3832        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3833        ecode = next + 3;        ecode = next + 3;
3834        }        }
# Line 2455  for (;;) Line 3838  for (;;)
3838        {        {
3839        const uschar *next = ecode+1;        const uschar *next = ecode+1;
3840        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3841        if (match(eptr, next+3, offset_top, md)) return TRUE;        if (match(eptr, next+3, offset_top, md, ims, eptrb, match_isgroup))
3842            return TRUE;
3843        ecode++;        ecode++;
3844        }        }
3845      break;;      break;
3846    
3847      /* End of a group, repeated or non-repeating. If we are at the end of      /* End of a group, repeated or non-repeating. If we are at the end of
3848      an assertion "group", stop matching and return TRUE, but record the      an assertion "group", stop matching and return TRUE, but record the
3849      current high water mark for use by positive assertions. */      current high water mark for use by positive assertions. Do this also
3850        for the "once" (not backing up) groups. */
3851    
3852      case OP_KET:      case OP_KET:
3853      case OP_KETRMIN:      case OP_KETRMIN:
3854      case OP_KETRMAX:      case OP_KETRMAX:
3855        {        {
       int number;  
3856        const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];        const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3857          const uschar *saved_eptr = eptrb->saved_eptr;
3858    
3859          eptrb = eptrb->prev;    /* Back up the stack of bracket start pointers */
3860    
3861        if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)        if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3862              *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3863              *prev == OP_ONCE)
3864          {          {
3865          md->end_match_ptr = eptr;      /* For ONCE */          md->end_match_ptr = eptr;      /* For ONCE */
3866          md->end_offset_top = offset_top;          md->end_offset_top = offset_top;
3867          return TRUE;          return TRUE;
3868          }          }
3869    
3870        /* In all other cases we have to check the group number back at the        /* In all other cases except a conditional group we have to check the
3871        start and if necessary complete handling an extraction by setting the        group number back at the start and if necessary complete handling an
3872        final offset and bumping the high water mark. */        extraction by setting the offsets and bumping the high water mark. */
3873    
3874        number = (*prev - OP_BRA) << 1;        if (*prev != OP_COND)
3875            {
3876            int offset;
3877            int number = *prev - OP_BRA;
3878    
3879        DPRINTF(("end bracket %d\n", number/2));          /* For extended extraction brackets (large number), we have to fish out
3880            the number from a dummy opcode at the start. */
3881    
3882        if (number > 0)          if (number > EXTRACT_BASIC_MAX) number = (prev[4] << 8) | prev[5];
3883          {          offset = number << 1;
3884          if (number >= md->offset_end) md->offset_overflow = TRUE; else  
3885    #ifdef DEBUG
3886            printf("end bracket %d", number);
3887            printf("\n");
3888    #endif
3889    
3890            if (number > 0)
3891            {            {
3892            md->offset_vector[number+1] = eptr - md->start_subject;            if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3893            if (offset_top <= number) offset_top = number + 2;              {
3894                md->offset_vector[offset] =
3895                  md->offset_vector[md->offset_end - number];
3896                md->offset_vector[offset+1] = eptr - md->start_subject;
3897                if (offset_top <= offset) offset_top = offset + 2;
3898                }
3899            }            }
3900          }          }
3901    
3902        /* For a non-repeating