/[pcre]/code/trunk/pcre.c
ViewVC logotype

Diff of /code/trunk/pcre.c

Parent Directory | Revision Log | View Patch

revision 47 by nigel, Sat Feb 24 21:39:29 2007 UTC revision 67 by nigel, Sat Feb 24 21:40:13 2007 UTC
# Line 9  the file Tech.Notes for some information Line 9  the file Tech.Notes for some information
9    
10  Written by: Philip Hazel <ph10@cam.ac.uk>  Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12             Copyright (c) 1997-2000 University of Cambridge             Copyright (c) 1997-2003 University of Cambridge
13    
14  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
15  Permission is granted to anyone to use this software for any purpose on any  Permission is granted to anyone to use this software for any purpose on any
# Line 32  restrictions: Line 32  restrictions:
32  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
33  */  */
34    
   
35  /* Define DEBUG to get debugging output on stdout. */  /* Define DEBUG to get debugging output on stdout. */
36    
37  /* #define DEBUG */  /* #define DEBUG */
# Line 60  the external pcre header. */ Line 59  the external pcre header. */
59  #endif  #endif
60    
61    
62  /* Number of items on the nested bracket stacks at compile time. This should  /* Maximum number of items on the nested bracket stacks at compile time. This
63  not be set greater than 200. */  applies to the nesting of all kinds of parentheses. It does not limit
64    un-nested, non-capturing parentheses. This number can be made bigger if
65    necessary - it is used to dimension one int and one unsigned char vector at
66    compile time. */
67    
68  #define BRASTACK_SIZE 200  #define BRASTACK_SIZE 200
69    
70    
71    /* Maximum number of ints of offset to save on the stack for recursive calls.
72    If the offset vector is bigger, malloc is used. This should be a multiple of 3,
73    because the offset vector is always a multiple of 3 long. */
74    
75    #define REC_STACK_SAVE_MAX 30
76    
77    
78    /* The number of bytes in a literal character string above which we can't add
79    any more is set at 250 in order to allow for UTF-8 characters. (In theory it
80    could be 255 when UTF-8 support is excluded, but that means that some of the
81    test output would be different, which just complicates things.) */
82    
83    #define MAXLIT 250
84    
85    
86    /* The maximum remaining length of subject we are prepared to search for a
87    req_byte match. */
88    
89    #define REQ_BYTE_MAX 1000
90    
91    
92    /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
93    the definition is next to the definition of the opcodes in internal.h. */
94    
95    static uschar OP_lengths[] = { OP_LENGTHS };
96    
97  /* Min and max values for the common repeats; for the maxima, 0 => infinity */  /* Min and max values for the common repeats; for the maxima, 0 => infinity */
98    
99  static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };  static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
100  static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };  static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
101    
 /* Text forms of OP_ values and things, for debugging (not all used) */  
   
 #ifdef DEBUG  
 static const char *OP_names[] = {  
   "End", "\\A", "\\B", "\\b", "\\D", "\\d",  
   "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",  
   "Opt", "^", "$", "Any", "chars", "not",  
   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",  
   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",  
   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",  
   "*", "*?", "+", "+?", "?", "??", "{", "{",  
   "class", "Ref", "Recurse",  
   "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",  
   "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",  
   "Brazero", "Braminzero", "Bra"  
 };  
 #endif  
   
102  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
103  are simple data values; negative values are for special things like \d and so  are simple data values; negative values are for special things like \d and so
104  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
# Line 97  is invalid. */ Line 107  is invalid. */
107  static const short int escapes[] = {  static const short int escapes[] = {
108      0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */      0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
109      0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */      0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
110    '@', -ESC_A, -ESC_B,      0, -ESC_D,      0,      0,      0,   /* @ - G */    '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
111      0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */      0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
112      0,      0,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */      0, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
113      0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */      0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
114    '`',      7, -ESC_b,      0, -ESC_d,     27,   '\f',      0,   /* ` - g */    '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
115      0,      0,      0,      0,      0,      0,   '\n',      0,   /* h - o */      0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */
116      0,      0,   '\r', -ESC_s,   '\t',      0,      0, -ESC_w,   /* p - w */      0,      0,  ESC_r, -ESC_s,  ESC_t,      0,      0, -ESC_w,   /* p - w */
117      0,      0, -ESC_z                                            /* x - z */      0,      0, -ESC_z                                            /* x - z */
118  };  };
119    
# Line 113  as this is assumed for handling case ind Line 123  as this is assumed for handling case ind
123    
124  static const char *posix_names[] = {  static const char *posix_names[] = {
125    "alpha", "lower", "upper",    "alpha", "lower", "upper",
126    "alnum", "ascii", "cntrl", "digit", "graph",    "alnum", "ascii", "blank", "cntrl", "digit", "graph",
127    "print", "punct", "space", "word",  "xdigit" };    "print", "punct", "space", "word",  "xdigit" };
128    
129  static const uschar posix_name_lengths[] = {  static const uschar posix_name_lengths[] = {
130    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
131    
132  /* Table of class bit maps for each POSIX class; up to three may be combined  /* Table of class bit maps for each POSIX class; up to three may be combined
133  to form the class. */  to form the class. The table for [:blank:] is dynamically modified to remove
134    the vertical space characters. */
135    
136  static const int posix_class_maps[] = {  static const int posix_class_maps[] = {
137    cbit_lower, cbit_upper, -1,             /* alpha */    cbit_lower, cbit_upper, -1,             /* alpha */
# Line 128  static const int posix_class_maps[] = { Line 139  static const int posix_class_maps[] = {
139    cbit_upper, -1,         -1,             /* upper */    cbit_upper, -1,         -1,             /* upper */
140    cbit_digit, cbit_lower, cbit_upper,     /* alnum */    cbit_digit, cbit_lower, cbit_upper,     /* alnum */
141    cbit_print, cbit_cntrl, -1,             /* ascii */    cbit_print, cbit_cntrl, -1,             /* ascii */
142      cbit_space, -1,         -1,             /* blank - a GNU extension */
143    cbit_cntrl, -1,         -1,             /* cntrl */    cbit_cntrl, -1,         -1,             /* cntrl */
144    cbit_digit, -1,         -1,             /* digit */    cbit_digit, -1,         -1,             /* digit */
145    cbit_graph, -1,         -1,             /* graph */    cbit_graph, -1,         -1,             /* graph */
146    cbit_print, -1,         -1,             /* print */    cbit_print, -1,         -1,             /* print */
147    cbit_punct, -1,         -1,             /* punct */    cbit_punct, -1,         -1,             /* punct */
148    cbit_space, -1,         -1,             /* space */    cbit_space, -1,         -1,             /* space */
149    cbit_word,  -1,         -1,             /* word */    cbit_word,  -1,         -1,             /* word - a Perl extension */
150    cbit_xdigit,-1,         -1              /* xdigit */    cbit_xdigit,-1,         -1              /* xdigit */
151  };  };
152    
# Line 143  static const int posix_class_maps[] = { Line 155  static const int posix_class_maps[] = {
155    
156  static BOOL  static BOOL
157    compile_regex(int, int, int *, uschar **, const uschar **, const char **,    compile_regex(int, int, int *, uschar **, const uschar **, const char **,
158      BOOL, int, int *, int *, compile_data *);      BOOL, int, int *, int *, branch_chain *, compile_data *);
159    
160  /* Structure for building a chain of data that actually lives on the  /* Structure for building a chain of data that actually lives on the
161  stack, for holding the values of the subject pointer at the start of each  stack, for holding the values of the subject pointer at the start of each
# Line 160  typedef struct eptrblock { Line 172  typedef struct eptrblock {
172  #define match_condassert   0x01    /* Called to check a condition assertion */  #define match_condassert   0x01    /* Called to check a condition assertion */
173  #define match_isgroup      0x02    /* Set if start of bracketed group */  #define match_isgroup      0x02    /* Set if start of bracketed group */
174    
175    /* Non-error returns from the match() function. Error returns are externally
176    defined PCRE_ERROR_xxx codes, which are all negative. */
177    
178    #define MATCH_MATCH        1
179    #define MATCH_NOMATCH      0
180    
181    
182    
183  /*************************************************  /*************************************************
# Line 168  typedef struct eptrblock { Line 186  typedef struct eptrblock {
186    
187  /* PCRE is thread-clean and doesn't use any global variables in the normal  /* PCRE is thread-clean and doesn't use any global variables in the normal
188  sense. However, it calls memory allocation and free functions via the two  sense. However, it calls memory allocation and free functions via the two
189  indirections below, which are can be changed by the caller, but are shared  indirections below, and it can optionally do callouts. These values can be
190  between all threads. */  changed by the caller, but are shared between all threads. However, when
191    compiling for Virtual Pascal, things are done differently (see pcre.in). */
192    
193    #ifndef VPCOMPAT
194  void *(*pcre_malloc)(size_t) = malloc;  void *(*pcre_malloc)(size_t) = malloc;
195  void  (*pcre_free)(void *) = free;  void  (*pcre_free)(void *) = free;
196    int   (*pcre_callout)(pcre_callout_block *) = NULL;
197    #endif
198    
199    
200    /*************************************************
201    *    Macros and tables for character handling    *
202    *************************************************/
203    
204    /* When UTF-8 encoding is being used, a character is no longer just a single
205    byte. The macros for character handling generate simple sequences when used in
206    byte-mode, and more complicated ones for UTF-8 characters. */
207    
208    #ifndef SUPPORT_UTF8
209    #define GETCHAR(c, eptr) c = *eptr;
210    #define GETCHARINC(c, eptr) c = *eptr++;
211    #define GETCHARINCTEST(c, eptr) c = *eptr++;
212    #define GETCHARLEN(c, eptr, len) c = *eptr;
213    #define BACKCHAR(eptr)
214    
215    #else   /* SUPPORT_UTF8 */
216    
217    /* Get the next UTF-8 character, not advancing the pointer. This is called when
218    we know we are in UTF-8 mode. */
219    
220    #define GETCHAR(c, eptr) \
221      c = *eptr; \
222      if ((c & 0xc0) == 0xc0) \
223        { \
224        int gcii; \
225        int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
226        int gcss = 6*gcaa; \
227        c = (c & utf8_table3[gcaa]) << gcss; \
228        for (gcii = 1; gcii <= gcaa; gcii++) \
229          { \
230          gcss -= 6; \
231          c |= (eptr[gcii] & 0x3f) << gcss; \
232          } \
233        }
234    
235    /* Get the next UTF-8 character, advancing the pointer. This is called when we
236    know we are in UTF-8 mode. */
237    
238    #define GETCHARINC(c, eptr) \
239      c = *eptr++; \
240      if ((c & 0xc0) == 0xc0) \
241        { \
242        int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
243        int gcss = 6*gcaa; \
244        c = (c & utf8_table3[gcaa]) << gcss; \
245        while (gcaa-- > 0) \
246          { \
247          gcss -= 6; \
248          c |= (*eptr++ & 0x3f) << gcss; \
249          } \
250        }
251    
252    /* Get the next character, testing for UTF-8 mode, and advancing the pointer */
253    
254    #define GETCHARINCTEST(c, eptr) \
255      c = *eptr++; \
256      if (md->utf8 && (c & 0xc0) == 0xc0) \
257        { \
258        int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
259        int gcss = 6*gcaa; \
260        c = (c & utf8_table3[gcaa]) << gcss; \
261        while (gcaa-- > 0) \
262          { \
263          gcss -= 6; \
264          c |= (*eptr++ & 0x3f) << gcss; \
265          } \
266        }
267    
268    /* Get the next UTF-8 character, not advancing the pointer, incrementing length
269    if there are extra bytes. This is called when we know we are in UTF-8 mode. */
270    
271    #define GETCHARLEN(c, eptr, len) \
272      c = *eptr; \
273      if ((c & 0xc0) == 0xc0) \
274        { \
275        int gcii; \
276        int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
277        int gcss = 6*gcaa; \
278        c = (c & utf8_table3[gcaa]) << gcss; \
279        for (gcii = 1; gcii <= gcaa; gcii++) \
280          { \
281          gcss -= 6; \
282          c |= (eptr[gcii] & 0x3f) << gcss; \
283          } \
284        len += gcaa; \
285        }
286    
287    /* If the pointer is not at the start of a character, move it back until
288    it is. Called only in UTF-8 mode. */
289    
290    #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
291    
292    #endif
293    
294    
295    
# Line 191  tables. */ Line 307  tables. */
307    
308    
309    
310    #ifdef SUPPORT_UTF8
311    /*************************************************
312    *           Tables for UTF-8 support             *
313    *************************************************/
314    
315    /* These are the breakpoints for different numbers of bytes in a UTF-8
316    character. */
317    
318    static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
319    
320    /* These are the indicator bits and the mask for the data bits to set in the
321    first byte of a character, indexed by the number of additional bytes. */
322    
323    static int utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
324    static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
325    
326    /* Table of the number of extra bytes, indexed by the first byte
327    masked with 0x3f. The highest number for a valid UTF-8 character is in fact
328    0x3d. */
329    
330    static uschar utf8_table4[] = {
331      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
332      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
333      2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
334      3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
335    
336    
337    /*************************************************
338    *       Convert character value to UTF-8         *
339    *************************************************/
340    
341    /* This function takes an integer value in the range 0 - 0x7fffffff
342    and encodes it as a UTF-8 character in 1 to 6 bytes.
343    
344    Arguments:
345      cvalue     the character value
346      buffer     pointer to buffer for result - at least 6 bytes long
347    
348    Returns:     number of bytes placed in the buffer
349    */
350    
351    static int
352    ord2utf8(int cvalue, uschar *buffer)
353    {
354    register int i, j;
355    for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
356      if (cvalue <= utf8_table1[i]) break;
357    buffer += i;
358    for (j = i; j > 0; j--)
359     {
360     *buffer-- = 0x80 | (cvalue & 0x3f);
361     cvalue >>= 6;
362     }
363    *buffer = utf8_table2[i] | cvalue;
364    return i + 1;
365    }
366    #endif
367    
368    
369    
370    /*************************************************
371    *         Print compiled regex                   *
372    *************************************************/
373    
374    /* The code for doing this is held in a separate file that is also included in
375    pcretest.c. It defines a function called print_internals(). */
376    
377    #ifdef DEBUG
378    #include "printint.c"
379    #endif
380    
381    
382    
383  /*************************************************  /*************************************************
384  *          Return version string                 *  *          Return version string                 *
385  *************************************************/  *************************************************/
# Line 221  Therefore, I haven't changed the API for Line 410  Therefore, I haven't changed the API for
410  Arguments:  Arguments:
411    external_re   points to compiled code    external_re   points to compiled code
412    optptr        where to pass back the options    optptr        where to pass back the options
413    first_char    where to pass back the first character,    first_byte    where to pass back the first character,
414                  or -1 if multiline and all branches start ^,                  or -1 if multiline and all branches start ^,
415                  or -2 otherwise                  or -2 otherwise
416    
# Line 230  Returns:        number of capturing subp Line 419  Returns:        number of capturing subp
419  */  */
420    
421  int  int
422  pcre_info(const pcre *external_re, int *optptr, int *first_char)  pcre_info(const pcre *external_re, int *optptr, int *first_byte)
423  {  {
424  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
425  if (re == NULL) return PCRE_ERROR_NULL;  if (re == NULL) return PCRE_ERROR_NULL;
426  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
427  if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);  if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
428  if (first_char != NULL)  if (first_byte != NULL)
429    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :    *first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
430       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
431  return re->top_bracket;  return re->top_bracket;
432  }  }
# Line 253  that additional items can be added compa Line 442  that additional items can be added compa
442    
443  Arguments:  Arguments:
444    external_re      points to compiled code    external_re      points to compiled code
445    external_study   points to study data, or NULL    extra_data       points to extra data, or NULL
446    what             what information is required    what             what information is required
447    where            where to put the information    where            where to put the information
448    
# Line 261  Returns:           0 if data returned, n Line 450  Returns:           0 if data returned, n
450  */  */
451    
452  int  int
453  pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,  pcre_fullinfo(const pcre *external_re, const pcre_extra *extra_data, int what,
454    void *where)    void *where)
455  {  {
456  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
457  const real_pcre_extra *study = (const real_pcre_extra *)study_data;  const pcre_study_data *study = NULL;
458    
459  if (re == NULL || where == NULL) return PCRE_ERROR_NULL;  if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
460  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
461    
462    if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
463      study = extra_data->study_data;
464    
465  switch (what)  switch (what)
466    {    {
467    case PCRE_INFO_OPTIONS:    case PCRE_INFO_OPTIONS:
# Line 280  switch (what) Line 472  switch (what)
472    *((size_t *)where) = re->size;    *((size_t *)where) = re->size;
473    break;    break;
474    
475      case PCRE_INFO_STUDYSIZE:
476      *((size_t *)where) = (study == NULL)? 0 : study->size;
477      break;
478    
479    case PCRE_INFO_CAPTURECOUNT:    case PCRE_INFO_CAPTURECOUNT:
480    *((int *)where) = re->top_bracket;    *((int *)where) = re->top_bracket;
481    break;    break;
# Line 288  switch (what) Line 484  switch (what)
484    *((int *)where) = re->top_backref;    *((int *)where) = re->top_backref;
485    break;    break;
486    
487    case PCRE_INFO_FIRSTCHAR:    case PCRE_INFO_FIRSTBYTE:
488    *((int *)where) =    *((int *)where) =
489      ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :      ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
490      ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;      ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
491    break;    break;
492    
# Line 302  switch (what) Line 498  switch (what)
498    
499    case PCRE_INFO_LASTLITERAL:    case PCRE_INFO_LASTLITERAL:
500    *((int *)where) =    *((int *)where) =
501      ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;      ((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1;
502      break;
503    
504      case PCRE_INFO_NAMEENTRYSIZE:
505      *((int *)where) = re->name_entry_size;
506      break;
507    
508      case PCRE_INFO_NAMECOUNT:
509      *((int *)where) = re->name_count;
510      break;
511    
512      case PCRE_INFO_NAMETABLE:
513      *((const uschar **)where) = (const uschar *)re + sizeof(real_pcre);
514      break;
515    
516      default: return PCRE_ERROR_BADOPTION;
517      }
518    
519    return 0;
520    }
521    
522    
523    
524    /*************************************************
525    * Return info about what features are configured *
526    *************************************************/
527    
528    /* This is a function which has an extensible interface so that additional items
529    can be added compatibly.
530    
531    Arguments:
532      what             what information is required
533      where            where to put the information
534    
535    Returns:           0 if data returned, negative on error
536    */
537    
538    int
539    pcre_config(int what, void *where)
540    {
541    switch (what)
542      {
543      case PCRE_CONFIG_UTF8:
544      #ifdef SUPPORT_UTF8
545      *((int *)where) = 1;
546      #else
547      *((int *)where) = 0;
548      #endif
549      break;
550    
551      case PCRE_CONFIG_NEWLINE:
552      *((int *)where) = NEWLINE;
553      break;
554    
555      case PCRE_CONFIG_LINK_SIZE:
556      *((int *)where) = LINK_SIZE;
557      break;
558    
559      case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
560      *((int *)where) = POSIX_MALLOC_THRESHOLD;
561      break;
562    
563      case PCRE_CONFIG_MATCH_LIMIT:
564      *((unsigned int *)where) = MATCH_LIMIT;
565    break;    break;
566    
567    default: return PCRE_ERROR_BADOPTION;    default: return PCRE_ERROR_BADOPTION;
# Line 349  while (length-- > 0) Line 608  while (length-- > 0)
608    
609  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
610  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
611  encodes one of the more complicated things such as \d. On entry, ptr is  encodes one of the more complicated things such as \d. When UTF-8 is enabled,
612  pointing at the \. On exit, it is on the final character of the escape  a positive value greater than 255 may be returned. On entry, ptr is pointing at
613  sequence.  the \. On exit, it is on the final character of the escape sequence.
614    
615  Arguments:  Arguments:
616    ptrptr     points to the pattern position pointer    ptrptr     points to the pattern position pointer
# Line 373  check_escape(const uschar **ptrptr, cons Line 632  check_escape(const uschar **ptrptr, cons
632  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
633  int c, i;  int c, i;
634    
635  c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */  /* If backslash is at the end of the pattern, it's an error. */
636    
637    c = *(++ptr);
638  if (c == 0) *errorptr = ERR1;  if (c == 0) *errorptr = ERR1;
639    
640  /* Digits or letters may have special meaning; all others are literals. */  /* Digits or letters may have special meaning; all others are literals. */
# Line 392  else Line 653  else
653    const uschar *oldptr;    const uschar *oldptr;
654    switch (c)    switch (c)
655      {      {
656        /* A number of Perl escapes are not handled by PCRE. We give an explicit
657        error. */
658    
659        case 'l':
660        case 'L':
661        case 'N':
662        case 'p':
663        case 'P':
664        case 'u':
665        case 'U':
666        case 'X':
667        *errorptr = ERR37;
668        break;
669    
670      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
671      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
672      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 433  else Line 708  else
708        }        }
709    
710      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
711      larger first octal digit */      larger first octal digit. */
712    
713      case '0':      case '0':
714      c -= '0';      c -= '0';
715      while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&      while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
716        ptr[1] != '8' && ptr[1] != '9')        ptr[1] != '8' && ptr[1] != '9')
717          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
718        c &= 255;     /* Take least significant 8 bits */
719      break;      break;
720    
721      /* Special escapes not starting with a digit are straightforward */      /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
722        which can be greater than 0xff, but only if the ddd are hex digits. */
723    
724      case 'x':      case 'x':
725    #ifdef SUPPORT_UTF8
726        if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
727          {
728          const uschar *pt = ptr + 2;
729          register int count = 0;
730          c = 0;
731          while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
732            {
733            count++;
734            c = c * 16 + cd->lcc[*pt] -
735              (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
736            pt++;
737            }
738          if (*pt == '}')
739            {
740            if (c < 0 || count > 8) *errorptr = ERR34;
741            ptr = pt;
742            break;
743            }
744          /* If the sequence of hex digits does not end with '}', then we don't
745          recognize this construct; fall through to the normal \x handling. */
746          }
747    #endif
748    
749        /* Read just a single-byte hex-defined char (at most two hex digits) */
750    
751      c = 0;      c = 0;
752      while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
753        {        {
# Line 454  else Line 757  else
757        }        }
758      break;      break;
759    
760        /* Other special escapes not starting with a digit are straightforward */
761    
762      case 'c':      case 'c':
763      c = *(++ptr);      c = *(++ptr);
764      if (c == 0)      if (c == 0)
# Line 583  return p; Line 888  return p;
888    
889    
890  /*************************************************  /*************************************************
891    *      Find first significant op code            *
892    *************************************************/
893    
894    /* This is called by several functions that scan a compiled expression looking
895    for a fixed first character, or an anchoring op code etc. It skips over things
896    that do not influence this. For some calls, a change of option is important.
897    
898    Arguments:
899      code       pointer to the start of the group
900      options    pointer to external options
901      optbit     the option bit whose changing is significant, or
902                   zero if none are
903    
904    Returns:     pointer to the first significant opcode
905    */
906    
907    static const uschar*
908    first_significant_code(const uschar *code, int *options, int optbit)
909    {
910    for (;;)
911      {
912      switch ((int)*code)
913        {
914        case OP_OPT:
915        if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
916          *options = (int)code[1];
917        code += 2;
918        break;
919    
920        case OP_ASSERT_NOT:
921        case OP_ASSERTBACK:
922        case OP_ASSERTBACK_NOT:
923        do code += GET(code, 1); while (*code == OP_ALT);
924        /* Fall through */
925    
926        case OP_CALLOUT:
927        case OP_CREF:
928        case OP_BRANUMBER:
929        case OP_WORD_BOUNDARY:
930        case OP_NOT_WORD_BOUNDARY:
931        code += OP_lengths[*code];
932        break;
933    
934        default:
935        return code;
936        }
937      }
938    /* Control never reaches here */
939    }
940    
941    
942    
943    
944    /*************************************************
945  *        Find the fixed length of a pattern      *  *        Find the fixed length of a pattern      *
946  *************************************************/  *************************************************/
947    
948  /* Scan a pattern and compute the fixed length of subject that will match it,  /* Scan a pattern and compute the fixed length of subject that will match it,
949  if the length is fixed. This is needed for dealing with backward assertions.  if the length is fixed. This is needed for dealing with backward assertions.
950    In UTF-8 mode, the result is in characters rather than bytes.
951    
952  Arguments:  Arguments:
953    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
954      options  the compiling options
955    
956  Returns:   the fixed length, or -1 if there is no fixed length  Returns:   the fixed length, or -1 if there is no fixed length,
957                 or -2 if \C was encountered
958  */  */
959    
960  static int  static int
961  find_fixedlength(uschar *code)  find_fixedlength(uschar *code, int options)
962  {  {
963  int length = -1;  int length = -1;
964    
965  register int branchlength = 0;  register int branchlength = 0;
966  register uschar *cc = code + 3;  register uschar *cc = code + 1 + LINK_SIZE;
967    
968  /* Scan along the opcodes for this branch. If we get to the end of the  /* Scan along the opcodes for this branch. If we get to the end of the
969  branch, check the length against that of the other branches. */  branch, check the length against that of the other branches. */
# Line 617  for (;;) Line 979  for (;;)
979      case OP_BRA:      case OP_BRA:
980      case OP_ONCE:      case OP_ONCE:
981      case OP_COND:      case OP_COND:
982      d = find_fixedlength(cc);      d = find_fixedlength(cc, options);
983      if (d < 0) return -1;      if (d < 0) return d;
984      branchlength += d;      branchlength += d;
985      do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
986      cc += 3;      cc += 1 + LINK_SIZE;
987      break;      break;
988    
989      /* Reached end of a branch; if it's a ket it is the end of a nested      /* Reached end of a branch; if it's a ket it is the end of a nested
# Line 636  for (;;) Line 998  for (;;)
998      if (length < 0) length = branchlength;      if (length < 0) length = branchlength;
999        else if (length != branchlength) return -1;        else if (length != branchlength) return -1;
1000      if (*cc != OP_ALT) return length;      if (*cc != OP_ALT) return length;
1001      cc += 3;      cc += 1 + LINK_SIZE;
1002      branchlength = 0;      branchlength = 0;
1003      break;      break;
1004    
# Line 646  for (;;) Line 1008  for (;;)
1008      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
1009      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1010      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
1011      do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
1012      cc += 3;      /* Fall through */
     break;  
1013    
1014      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1015    
1016      case OP_REVERSE:      case OP_REVERSE:
1017      cc++;      case OP_BRANUMBER:
     /* Fall through */  
   
1018      case OP_CREF:      case OP_CREF:
1019      case OP_OPT:      case OP_OPT:
1020      cc++;      case OP_CALLOUT:
     /* Fall through */  
   
1021      case OP_SOD:      case OP_SOD:
1022        case OP_SOM:
1023      case OP_EOD:      case OP_EOD:
1024      case OP_EODN:      case OP_EODN:
1025      case OP_CIRC:      case OP_CIRC:
1026      case OP_DOLL:      case OP_DOLL:
1027      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
1028      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
1029      cc++;      cc += OP_lengths[*cc];
1030      break;      break;
1031    
1032      /* Handle char strings */      /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
1033        This requires a scan of the string, unfortunately. We assume valid UTF-8
1034        strings, so all we do is reduce the length by one for every byte whose bits
1035        are 10xxxxxx. */
1036    
1037      case OP_CHARS:      case OP_CHARS:
1038      branchlength += *(++cc);      branchlength += *(++cc);
1039    #ifdef SUPPORT_UTF8
1040        if ((options & PCRE_UTF8) != 0)
1041          for (d = 1; d <= *cc; d++)
1042            if ((cc[d] & 0xc0) == 0x80) branchlength--;
1043    #endif
1044      cc += *cc + 1;      cc += *cc + 1;
1045      break;      break;
1046    
1047      /* Handle exact repetitions */      /* Handle exact repetitions. The count is already in characters, but we
1048        need to skip over a multibyte character in UTF8 mode.  */
1049    
1050      case OP_EXACT:      case OP_EXACT:
1051        branchlength += GET2(cc,1);
1052        cc += 4;
1053    #ifdef SUPPORT_UTF8
1054        if ((options & PCRE_UTF8) != 0)
1055          {
1056          while((*cc & 0x80) == 0x80) cc++;
1057          }
1058    #endif
1059        break;
1060    
1061      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1062      branchlength += (cc[1] << 8) + cc[2];      branchlength += GET2(cc,1);
1063      cc += 4;      cc += 4;
1064      break;      break;
1065    
# Line 699  for (;;) Line 1076  for (;;)
1076      cc++;      cc++;
1077      break;      break;
1078    
1079        /* The single-byte matcher isn't allowed */
1080    
1081        case OP_ANYBYTE:
1082        return -2;
1083    
1084      /* Check a class for variable quantification */      /* Check a class for variable quantification */
1085    
1086    #ifdef SUPPORT_UTF8
1087        case OP_XCLASS:
1088        cc += GET(cc, 1) - 33;
1089        /* Fall through */
1090    #endif
1091    
1092      case OP_CLASS:      case OP_CLASS:
1093      cc += (*cc == OP_REF)? 2 : 33;      case OP_NCLASS:
1094        cc += 33;
1095    
1096      switch (*cc)      switch (*cc)
1097        {        {
# Line 715  for (;;) Line 1103  for (;;)
1103    
1104        case OP_CRRANGE:        case OP_CRRANGE:
1105        case OP_CRMINRANGE:        case OP_CRMINRANGE:
1106        if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;        if (GET2(cc,1) != GET2(cc,3)) return -1;
1107        branchlength += (cc[1] << 8) + cc[2];        branchlength += GET2(cc,1);
1108        cc += 5;        cc += 5;
1109        break;        break;
1110    
# Line 738  for (;;) Line 1126  for (;;)
1126    
1127    
1128  /*************************************************  /*************************************************
1129  *           Check for POSIX class syntax         *  *    Scan compiled regex for numbered bracket    *
1130  *************************************************/  *************************************************/
1131    
1132  /* This function is called when the sequence "[:" or "[." or "[=" is  /* This little function scans through a compiled pattern until it finds a
1133  encountered in a character class. It checks whether this is followed by an  capturing bracket with the given number.
 optional ^ and then a sequence of letters, terminated by a matching ":]" or  
 ".]" or "=]".  
1134    
1135  Argument:  Arguments:
1136    ptr      pointer to the initial [    code        points to start of expression
1137    endptr   where to return the end pointer    utf8        TRUE in UTF-8 mode
1138    cd       pointer to compile data    number      the required bracket number
1139    
1140  Returns:   TRUE or FALSE  Returns:      pointer to the opcode for the bracket, or NULL if not found
1141  */  */
1142    
1143  static BOOL  static const uschar *
1144  check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)  find_bracket(const uschar *code, BOOL utf8, int number)
1145  {  {
1146  int terminator;          /* Don't combine these lines; the Solaris cc */  #ifndef SUPPORT_UTF8
1147  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  utf8 = utf8;               /* Stop pedantic compilers complaining */
1148  if (*(++ptr) == '^') ptr++;  #endif
1149  while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;  
1150  if (*ptr == terminator && ptr[1] == ']')  for (;;)
1151    {    {
1152    *endptr = ptr;    register int c = *code;
1153    return TRUE;    if (c == OP_END) return NULL;
1154      else if (c == OP_CHARS) code += code[1] + OP_lengths[c];
1155      else if (c > OP_BRA)
1156        {
1157        int n = c - OP_BRA;
1158        if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1159        if (n == number) return (uschar *)code;
1160        code += OP_lengths[OP_BRA];
1161        }
1162      else
1163        {
1164        code += OP_lengths[c];
1165    
1166        /* In UTF-8 mode, opcodes that are followed by a character may be followed
1167        by a multi-byte character. The length in the table is a minimum, so we have
1168        to scan along to skip the extra characters. All opcodes are less than 128,
1169        so we can use relatively efficient code. */
1170    
1171    #ifdef SUPPORT_UTF8
1172        if (utf8) switch(c)
1173          {
1174          case OP_EXACT:
1175          case OP_UPTO:
1176          case OP_MINUPTO:
1177          case OP_STAR:
1178          case OP_MINSTAR:
1179          case OP_PLUS:
1180          case OP_MINPLUS:
1181          case OP_QUERY:
1182          case OP_MINQUERY:
1183          while ((*code & 0xc0) == 0x80) code++;
1184          break;
1185          }
1186    #endif
1187        }
1188    }    }
 return FALSE;  
1189  }  }
1190    
1191    
1192    
   
1193  /*************************************************  /*************************************************
1194  *          Check POSIX class name                *  *    Scan compiled branch for non-emptiness      *
1195  *************************************************/  *************************************************/
1196    
1197  /* This function is called to check the name given in a POSIX-style class entry  /* This function scans through a branch of a compiled pattern to see whether it
1198  such as [:alnum:].  can match the empty string or not. It is called only from could_be_empty()
1199    below. Note that first_significant_code() skips over assertions. If we hit an
1200    unclosed bracket, we return "empty" - this means we've struck an inner bracket
1201    whose current branch will already have been scanned.
1202    
1203  Arguments:  Arguments:
1204    ptr        points to the first letter    code        points to start of search
1205    len        the length of the name    endcode     points to where to stop
1206      utf8        TRUE if in UTF8 mode
1207    
1208  Returns:     a value representing the name, or -1 if unknown  Returns:      TRUE if what is matched could be empty
1209  */  */
1210    
1211  static int  static BOOL
1212  check_posix_name(const uschar *ptr, int len)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1213  {  {
1214  register int yield = 0;  register int c;
1215  while (posix_name_lengths[yield] != 0)  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0);
1216         code < endcode;
1217         code = first_significant_code(code + OP_lengths[c], NULL, 0))
1218    {    {
1219    if (len == posix_name_lengths[yield] &&    const uschar *ccode;
     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;  
   yield++;  
   }  
 return -1;  
 }  
1220    
1221      c = *code;
1222    
1223      if (c >= OP_BRA)
1224        {
1225        BOOL empty_branch;
1226        if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1227    
1228        /* Scan a closed bracket */
1229    
1230  /*************************************************      empty_branch = FALSE;
1231  *           Compile one branch                   *      do
1232  *************************************************/        {
1233          if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1234            empty_branch = TRUE;
1235          code += GET(code, 1);
1236          }
1237        while (*code == OP_ALT);
1238        if (!empty_branch) return FALSE;   /* All branches are non-empty */
1239        code += 1 + LINK_SIZE;
1240        c = *code;
1241        }
1242    
1243  /* Scan the pattern, compiling it into the code vector.    else switch (c)
1244        {
1245        /* Check for quantifiers after a class */
1246    
1247    #ifdef SUPPORT_UTF8
1248        case OP_XCLASS:
1249        ccode = code + GET(code, 1);
1250        goto CHECK_CLASS_REPEAT;
1251    #endif
1252    
1253        case OP_CLASS:
1254        case OP_NCLASS:
1255        ccode = code + 33;
1256    
1257    #ifdef SUPPORT_UTF8
1258        CHECK_CLASS_REPEAT:
1259    #endif
1260    
1261        switch (*ccode)
1262          {
1263          case OP_CRSTAR:            /* These could be empty; continue */
1264          case OP_CRMINSTAR:
1265          case OP_CRQUERY:
1266          case OP_CRMINQUERY:
1267          break;
1268    
1269          default:                   /* Non-repeat => class must match */
1270          case OP_CRPLUS:            /* These repeats aren't empty */
1271          case OP_CRMINPLUS:
1272          return FALSE;
1273    
1274          case OP_CRRANGE:
1275          case OP_CRMINRANGE:
1276          if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
1277          break;
1278          }
1279        break;
1280    
1281        /* Opcodes that must match a character */
1282    
1283        case OP_NOT_DIGIT:
1284        case OP_DIGIT:
1285        case OP_NOT_WHITESPACE:
1286        case OP_WHITESPACE:
1287        case OP_NOT_WORDCHAR:
1288        case OP_WORDCHAR:
1289        case OP_ANY:
1290        case OP_ANYBYTE:
1291        case OP_CHARS:
1292        case OP_NOT:
1293        case OP_PLUS:
1294        case OP_MINPLUS:
1295        case OP_EXACT:
1296        case OP_NOTPLUS:
1297        case OP_NOTMINPLUS:
1298        case OP_NOTEXACT:
1299        case OP_TYPEPLUS:
1300        case OP_TYPEMINPLUS:
1301        case OP_TYPEEXACT:
1302        return FALSE;
1303    
1304        /* End of branch */
1305    
1306        case OP_KET:
1307        case OP_KETRMAX:
1308        case OP_KETRMIN:
1309        case OP_ALT:
1310        return TRUE;
1311    
1312        /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be
1313        followed by a multibyte character */
1314    
1315    #ifdef SUPPORT_UTF8
1316        case OP_STAR:
1317        case OP_MINSTAR:
1318        case OP_QUERY:
1319        case OP_MINQUERY:
1320        case OP_UPTO:
1321        case OP_MINUPTO:
1322        if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1323        break;
1324    #endif
1325        }
1326      }
1327    
1328    return TRUE;
1329    }
1330    
1331    
1332    
1333    /*************************************************
1334    *    Scan compiled regex for non-emptiness       *
1335    *************************************************/
1336    
1337    /* This function is called to check for left recursive calls. We want to check
1338    the current branch of the current pattern to see if it could match the empty
1339    string. If it could, we must look outwards for branches at other levels,
1340    stopping when we pass beyond the bracket which is the subject of the recursion.
1341    
1342  Arguments:  Arguments:
1343    options      the option bits    code        points to start of the recursion
1344    brackets     points to number of brackets used    endcode     points to where to stop (current RECURSE item)
1345    code         points to the pointer to the current code point    bcptr       points to the chain of current (unclosed) branch starts
1346    ptrptr       points to the current pattern pointer    utf8        TRUE if in UTF-8 mode
1347    errorptr     points to pointer to error message  
1348    optchanged   set to the value of the last OP_OPT item compiled  Returns:      TRUE if what is matched could be empty
1349    reqchar      set to the last literal character required, else -1  */
1350    countlits    set to count of mandatory literal characters  
1351    cd           contains pointers to tables  static BOOL
1352    could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1353      BOOL utf8)
1354    {
1355    while (bcptr != NULL && bcptr->current >= code)
1356      {
1357      if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1358      bcptr = bcptr->outer;
1359      }
1360    return TRUE;
1361    }
1362    
1363    
1364    
1365    /*************************************************
1366    *           Check for POSIX class syntax         *
1367    *************************************************/
1368    
1369    /* This function is called when the sequence "[:" or "[." or "[=" is
1370    encountered in a character class. It checks whether this is followed by an
1371    optional ^ and then a sequence of letters, terminated by a matching ":]" or
1372    ".]" or "=]".
1373    
1374    Argument:
1375      ptr      pointer to the initial [
1376      endptr   where to return the end pointer
1377      cd       pointer to compile data
1378    
1379  Returns:       TRUE on success  Returns:   TRUE or FALSE
                FALSE, with *errorptr set on error  
1380  */  */
1381    
1382  static BOOL  static BOOL
1383  compile_branch(int options, int *brackets, uschar **codeptr,  check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1384    const uschar **ptrptr, const char **errorptr, int *optchanged,  {
1385    int *reqchar, int *countlits, compile_data *cd)  int terminator;          /* Don't combine these lines; the Solaris cc */
1386    terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1387    if (*(++ptr) == '^') ptr++;
1388    while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1389    if (*ptr == terminator && ptr[1] == ']')
1390      {
1391      *endptr = ptr;
1392      return TRUE;
1393      }
1394    return FALSE;
1395    }
1396    
1397    
1398    
1399    
1400    /*************************************************
1401    *          Check POSIX class name                *
1402    *************************************************/
1403    
1404    /* This function is called to check the name given in a POSIX-style class entry
1405    such as [:alnum:].
1406    
1407    Arguments:
1408      ptr        points to the first letter
1409      len        the length of the name
1410    
1411    Returns:     a value representing the name, or -1 if unknown
1412    */
1413    
1414    static int
1415    check_posix_name(const uschar *ptr, int len)
1416    {
1417    register int yield = 0;
1418    while (posix_name_lengths[yield] != 0)
1419      {
1420      if (len == posix_name_lengths[yield] &&
1421        strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1422      yield++;
1423      }
1424    return -1;
1425    }
1426    
1427    
1428    
1429    
1430    /*************************************************
1431    *           Compile one branch                   *
1432    *************************************************/
1433    
1434    /* Scan the pattern, compiling it into the code vector. If the options are
1435    changed during the branch, the pointer is used to change the external options
1436    bits.
1437    
1438    Arguments:
1439      optionsptr     pointer to the option bits
1440      brackets       points to number of extracting brackets used
1441      code           points to the pointer to the current code point
1442      ptrptr         points to the current pattern pointer
1443      errorptr       points to pointer to error message
1444      firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1445      reqbyteptr     set to the last literal character required, else < 0
1446      bcptr          points to current branch chain
1447      cd             contains pointers to tables etc.
1448    
1449    Returns:         TRUE on success
1450                     FALSE, with *errorptr set on error
1451    */
1452    
1453    static BOOL
1454    compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1455      const uschar **ptrptr, const char **errorptr, int *firstbyteptr,
1456      int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1457  {  {
1458  int repeat_type, op_type;  int repeat_type, op_type;
1459  int repeat_min, repeat_max;  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
1460  int bravalue, length;  int bravalue = 0;
1461    int length;
1462  int greedy_default, greedy_non_default;  int greedy_default, greedy_non_default;
1463  int prevreqchar;  int firstbyte, reqbyte;
1464    int zeroreqbyte, zerofirstbyte;
1465    int req_caseopt, reqvary, tempreqvary;
1466  int condcount = 0;  int condcount = 0;
1467  int subcountlits = 0;  int options = *optionsptr;
1468  register int c;  register int c;
1469  register uschar *code = *codeptr;  register uschar *code = *codeptr;
1470  uschar *tempcode;  uschar *tempcode;
1471    BOOL inescq = FALSE;
1472    BOOL groupsetfirstbyte = FALSE;
1473  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
1474  const uschar *tempptr;  const uschar *tempptr;
1475  uschar *previous = NULL;  uschar *previous = NULL;
1476  uschar class[32];  uschar class[32];
1477    
1478    #ifdef SUPPORT_UTF8
1479    BOOL class_utf8;
1480    BOOL utf8 = (options & PCRE_UTF8) != 0;
1481    uschar *class_utf8data;
1482    uschar utf8_char[6];
1483    #else
1484    BOOL utf8 = FALSE;
1485    #endif
1486    
1487  /* Set up the default and non-default settings for greediness */  /* Set up the default and non-default settings for greediness */
1488    
1489  greedy_default = ((options & PCRE_UNGREEDY) != 0);  greedy_default = ((options & PCRE_UNGREEDY) != 0);
1490  greedy_non_default = greedy_default ^ 1;  greedy_non_default = greedy_default ^ 1;
1491    
1492  /* Initialize no required char, and count of literals */  /* Initialize no first char, no required char. REQ_UNSET means "no char
1493    matching encountered yet". It gets changed to REQ_NONE if we hit something that
1494    matches a non-fixed char first char; reqbyte just remains unset if we never
1495    find one.
1496    
1497    When we hit a repeat whose minimum is zero, we may have to adjust these values
1498    to take the zero repeat into account. This is implemented by setting them to
1499    zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1500    item types that can be repeated set these backoff variables appropriately. */
1501    
1502    firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1503    
1504    /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1505    according to the current setting of the caseless flag. REQ_CASELESS is a bit
1506    value > 255. It is added into the firstbyte or reqbyte variables to record the
1507    case status of the value. */
1508    
1509  *reqchar = prevreqchar = -1;  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
 *countlits = 0;  
1510    
1511  /* Switch on next character until the end of the branch */  /* Switch on next character until the end of the branch */
1512    
1513  for (;; ptr++)  for (;; ptr++)
1514    {    {
1515    BOOL negate_class;    BOOL negate_class;
1516      BOOL possessive_quantifier;
1517    int class_charcount;    int class_charcount;
1518    int class_lastchar;    int class_lastchar;
1519    int newoptions;    int newoptions;
1520    int condref;    int recno;
1521    int subreqchar;    int skipbytes;
1522      int subreqbyte;
1523      int subfirstbyte;
1524    
1525    c = *ptr;    c = *ptr;
1526      if (inescq && c != 0) goto NORMAL_CHAR;
1527    
1528    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
1529      {      {
1530      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
# Line 872  for (;; ptr++) Line 1532  for (;; ptr++)
1532        {        {
1533        /* The space before the ; is to avoid a warning on a silly compiler        /* The space before the ; is to avoid a warning on a silly compiler
1534        on the Macintosh. */        on the Macintosh. */
1535        while ((c = *(++ptr)) != 0 && c != '\n') ;        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1536        continue;        if (c != 0) continue;   /* Else fall through to handle end of string */
1537        }        }
1538      }      }
1539    
# Line 884  for (;; ptr++) Line 1544  for (;; ptr++)
1544      case 0:      case 0:
1545      case '|':      case '|':
1546      case ')':      case ')':
1547        *firstbyteptr = firstbyte;
1548        *reqbyteptr = reqbyte;
1549      *codeptr = code;      *codeptr = code;
1550      *ptrptr = ptr;      *ptrptr = ptr;
1551      return TRUE;      return TRUE;
1552    
1553      /* Handle single-character metacharacters */      /* Handle single-character metacharacters. In multiline mode, ^ disables
1554        the setting of any following char as a first character. */
1555    
1556      case '^':      case '^':
1557        if ((options & PCRE_MULTILINE) != 0)
1558          {
1559          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1560          }
1561      previous = NULL;      previous = NULL;
1562      *code++ = OP_CIRC;      *code++ = OP_CIRC;
1563      break;      break;
# Line 900  for (;; ptr++) Line 1567  for (;; ptr++)
1567      *code++ = OP_DOLL;      *code++ = OP_DOLL;
1568      break;      break;
1569    
1570        /* There can never be a first char if '.' is first, whatever happens about
1571        repeats. The value of reqbyte doesn't change either. */
1572    
1573      case '.':      case '.':
1574        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1575        zerofirstbyte = firstbyte;
1576        zeroreqbyte = reqbyte;
1577      previous = code;      previous = code;
1578      *code++ = OP_ANY;      *code++ = OP_ANY;
1579      break;      break;
1580    
1581      /* Character classes. These always build a 32-byte bitmap of the permitted      /* Character classes. If the included characters are all < 255 in value, we
1582      characters, except in the special case where there is only one character.      build a 32-byte bitmap of the permitted characters, except in the special
1583      For negated classes, we build the map as usual, then invert it at the end.      case where there is only one such character. For negated classes, we build
1584        the map as usual, then invert it at the end. However, we use a different
1585        opcode so that data characters > 255 can be handled correctly.
1586    
1587        If the class contains characters outside the 0-255 range, a different
1588        opcode is compiled. It may optionally have a bit map for characters < 256,
1589        but those above are explicitly listed afterwards. A flag byte tells
1590        whether the bitmap is present, and whether this is a negated class or not.
1591      */      */
1592    
1593      case '[':      case '[':
1594      previous = code;      previous = code;
1595      *code++ = OP_CLASS;  
1596        /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1597        they are encountered at the top level, so we'll do that too. */
1598    
1599        if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1600            check_posix_syntax(ptr, &tempptr, cd))
1601          {
1602          *errorptr = (ptr[1] == ':')? ERR13 : ERR31;
1603          goto FAILED;
1604          }
1605    
1606      /* If the first character is '^', set the negation flag and skip it. */      /* If the first character is '^', set the negation flag and skip it. */
1607    
# Line 921  for (;; ptr++) Line 1610  for (;; ptr++)
1610        negate_class = TRUE;        negate_class = TRUE;
1611        c = *(++ptr);        c = *(++ptr);
1612        }        }
1613      else negate_class = FALSE;      else
1614          {
1615          negate_class = FALSE;
1616          }
1617    
1618      /* Keep a count of chars so that we can optimize the case of just a single      /* Keep a count of chars with values < 256 so that we can optimize the case
1619      character. */      of just a single character (as long as it's < 256). For higher valued UTF-8
1620        characters, we don't yet do any optimization. */
1621    
1622      class_charcount = 0;      class_charcount = 0;
1623      class_lastchar = -1;      class_lastchar = -1;
1624    
1625    #ifdef SUPPORT_UTF8
1626        class_utf8 = FALSE;                       /* No chars >= 256 */
1627        class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */
1628    #endif
1629    
1630      /* Initialize the 32-char bit map to all zeros. We have to build the      /* Initialize the 32-char bit map to all zeros. We have to build the
1631      map in a temporary bit of store, in case the class contains only 1      map in a temporary bit of store, in case the class contains only 1
1632      character, because in that case the compiled code doesn't use the      character (< 256), because in that case the compiled code doesn't use the
1633      bit map. */      bit map. */
1634    
1635      memset(class, 0, 32 * sizeof(uschar));      memset(class, 0, 32 * sizeof(uschar));
1636    
1637      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
1638      means that an initial ] is taken as a data character. */      means that an initial ] is taken as a data character. The first pass
1639        through the regex checked the overall syntax, so we don't need to be very
1640        strict here. At the start of the loop, c contains the first byte of the
1641        character. */
1642    
1643      do      do
1644        {        {
1645        if (c == 0)  #ifdef SUPPORT_UTF8
1646          if (utf8 && c > 127)
1647            {                           /* Braces are required because the */
1648            GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
1649            }
1650    #endif
1651    
1652          /* Inside \Q...\E everything is literal except \E */
1653    
1654          if (inescq)
1655          {          {
1656          *errorptr = ERR6;          if (c == '\\' && ptr[1] == 'E')
1657          goto FAILED;            {
1658              inescq = FALSE;
1659              ptr++;
1660              continue;
1661              }
1662            else goto LONE_SINGLE_CHARACTER;
1663          }          }
1664    
1665        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
1666        form [:^name]. A square bracket that doesn't match the syntax is        form [:^name:]. A square bracket that doesn't match the syntax is
1667        treated as a literal. We also recognize the POSIX constructions        treated as a literal. We also recognize the POSIX constructions
1668        [.ch.] and [=ch=] ("collating elements") and fault them, as Perl        [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1669        5.6 does. */        5.6 and 5.8 do. */
1670    
1671        if (c == '[' &&        if (c == '[' &&
1672            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
# Line 989  for (;; ptr++) Line 1704  for (;; ptr++)
1704            posix_class = 0;            posix_class = 0;
1705    
1706          /* Or into the map we are building up to 3 of the static class          /* Or into the map we are building up to 3 of the static class
1707          tables, or their negations. */          tables, or their negations. The [:blank:] class sets up the same
1708            chars as the [:space:] class (all white space). We remove the vertical
1709            white space chars afterwards. */
1710    
1711          posix_class *= 3;          posix_class *= 3;
1712          for (i = 0; i < 3; i++)          for (i = 0; i < 3; i++)
1713            {            {
1714              BOOL isblank = strncmp((char *)ptr, "blank", 5) == 0;
1715            int taboffset = posix_class_maps[posix_class + i];            int taboffset = posix_class_maps[posix_class + i];
1716            if (taboffset < 0) break;            if (taboffset < 0) break;
1717            if (local_negate)            if (local_negate)
1718                {
1719              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1720                if (isblank) class[1] |= 0x3c;
1721                }
1722            else            else
1723                {
1724              for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];              for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1725                if (isblank) class[1] &= ~0x3c;
1726                }
1727            }            }
1728    
1729          ptr = tempptr + 1;          ptr = tempptr + 1;
1730          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
1731          continue;          continue;    /* End of POSIX syntax handling */
1732          }          }
1733    
1734        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
# Line 1013  for (;; ptr++) Line 1737  for (;; ptr++)
1737        Inside a class (and only there) it is treated as backspace. Elsewhere        Inside a class (and only there) it is treated as backspace. Elsewhere
1738        it marks a word boundary. Other escapes have preset maps ready to        it marks a word boundary. Other escapes have preset maps ready to
1739        or into the one we are building. We assume they have more than one        or into the one we are building. We assume they have more than one
1740        character in them, so set class_count bigger than one. */        character in them, so set class_charcount bigger than one. */
1741    
1742        if (c == '\\')        if (c == '\\')
1743          {          {
1744          c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);          c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1745          if (-c == ESC_b) c = '\b';          if (-c == ESC_b) c = '\b';  /* \b is backspace in a class */
1746    
1747            if (-c == ESC_Q)            /* Handle start of quoted string */
1748              {
1749              if (ptr[1] == '\\' && ptr[2] == 'E')
1750                {
1751                ptr += 2; /* avoid empty string */
1752                }
1753              else inescq = TRUE;
1754              continue;
1755              }
1756    
1757          else if (c < 0)          else if (c < 0)
1758            {            {
1759            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
1760            class_charcount = 10;            class_charcount = 10;     /* Greater than 1 is what matters */
1761            switch (-c)            switch (-c)
1762              {              {
1763              case ESC_d:              case ESC_d:
# Line 1043  for (;; ptr++) Line 1778  for (;; ptr++)
1778    
1779              case ESC_s:              case ESC_s:
1780              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1781                class[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
1782              continue;              continue;
1783    
1784              case ESC_S:              case ESC_S:
1785              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1786                class[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
1787              continue;              continue;
1788    
1789                /* Unrecognized escapes are faulted if PCRE is running in its
1790                strict mode. By default, for compatibility with Perl, they are
1791                treated as literals. */
1792    
1793              default:              default:
1794              *errorptr = ERR7;              if ((options & PCRE_EXTRA) != 0)
1795              goto FAILED;                {
1796                  *errorptr = ERR7;
1797                  goto FAILED;
1798                  }
1799                c = *ptr;    /* The final character */
1800              }              }
1801            }            }
1802          /* Fall through if single character */  
1803          }          /* Fall through if we have a single character (c >= 0). This may be
1804            > 256 in UTF-8 mode. */
1805    
1806            }   /* End of backslash handling */
1807    
1808        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
1809        Perl does not permit ']' to be the end of the range. A '-' character        Perl does not permit ']' to be the end of the range. A '-' character
# Line 1065  for (;; ptr++) Line 1813  for (;; ptr++)
1813          {          {
1814          int d;          int d;
1815          ptr += 2;          ptr += 2;
         d = *ptr;  
1816    
1817          if (d == 0)  #ifdef SUPPORT_UTF8
1818            {          if (utf8)
1819            *errorptr = ERR6;            {                           /* Braces are required because the */
1820            goto FAILED;            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
1821            }            }
1822            else
1823    #endif
1824            d = *ptr;
1825    
1826          /* The second part of a range can be a single-character escape, but          /* The second part of a range can be a single-character escape, but
1827          not any of the other escapes. */          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1828            in such circumstances. */
1829    
1830          if (d == '\\')          if (d == '\\')
1831            {            {
1832              const uschar *oldptr = ptr;
1833            d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);            d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1834    
1835              /* \b is backspace; any other special means the '-' was literal */
1836    
1837            if (d < 0)            if (d < 0)
1838              {              {
1839              if (d == -ESC_b) d = '\b'; else              if (d == -ESC_b) d = '\b'; else
1840                {                {
1841                *errorptr = ERR7;                ptr = oldptr - 2;
1842                goto FAILED;                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
1843                }                }
1844              }              }
1845            }            }
1846    
1847            /* Check that the two values are in the correct order */
1848    
1849          if (d < c)          if (d < c)
1850            {            {
1851            *errorptr = ERR8;            *errorptr = ERR8;
1852            goto FAILED;            goto FAILED;
1853            }            }
1854    
1855            /* If d is greater than 255, we can't just use the bit map, so set up
1856            for the UTF-8 supporting class type. If we are not caseless, we can
1857            just set up a single range. If we are caseless, the characters < 256
1858            are handled with a bitmap, in order to get the case-insensitive
1859            handling. */
1860    
1861    #ifdef SUPPORT_UTF8
1862            if (d > 255)
1863              {
1864              class_utf8 = TRUE;
1865              *class_utf8data++ = XCL_RANGE;
1866              if ((options & PCRE_CASELESS) == 0)
1867                {
1868                class_utf8data += ord2utf8(c, class_utf8data);
1869                class_utf8data += ord2utf8(d, class_utf8data);
1870                continue;  /* Go get the next char in the class */
1871                }
1872              class_utf8data += ord2utf8(256, class_utf8data);
1873              class_utf8data += ord2utf8(d, class_utf8data);
1874              d = 255;
1875              /* Fall through */
1876              }
1877    #endif
1878            /* We use the bit map if the range is entirely < 255, or if part of it
1879            is < 255 and matching is caseless. */
1880    
1881          for (; c <= d; c++)          for (; c <= d; c++)
1882            {            {
1883            class[c/8] |= (1 << (c&7));            class[c/8] |= (1 << (c&7));
# Line 1106  for (;; ptr++) Line 1889  for (;; ptr++)
1889            class_charcount++;                /* in case a one-char range */            class_charcount++;                /* in case a one-char range */
1890            class_lastchar = c;            class_lastchar = c;
1891            }            }
1892    
1893          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
1894          }          }
1895    
1896        /* Handle a lone single character - we can get here for a normal        /* Handle a lone single character - we can get here for a normal
1897        non-escape char, or after \ that introduces a single character. */        non-escape char, or after \ that introduces a single character. */
1898    
1899        class [c/8] |= (1 << (c&7));        LONE_SINGLE_CHARACTER:
1900        if ((options & PCRE_CASELESS) != 0)  
1901          /* Handle a multibyte character */
1902    
1903    #ifdef SUPPORT_UTF8
1904          if (utf8 && c > 255)
1905          {          {
1906          c = cd->fcc[c];   /* flip case */          class_utf8 = TRUE;
1907          class[c/8] |= (1 << (c&7));          *class_utf8data++ = XCL_SINGLE;
1908            class_utf8data += ord2utf8(c, class_utf8data);
1909            }
1910          else
1911    #endif
1912          /* Handle a single-byte character */
1913            {
1914            class [c/8] |= (1 << (c&7));
1915            if ((options & PCRE_CASELESS) != 0)
1916              {
1917              c = cd->fcc[c];   /* flip case */
1918              class[c/8] |= (1 << (c&7));
1919              }
1920            class_charcount++;
1921            class_lastchar = c;
1922          }          }
       class_charcount++;  
       class_lastchar = c;  
1923        }        }
1924    
1925      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached; the check for end of string happens inside the
1926      loop. This "while" is the end of the "do" above. */      loop. This "while" is the end of the "do" above. */
1927    
1928      while ((c = *(++ptr)) != ']');      while ((c = *(++ptr)) != ']' || inescq);
   
     /* If class_charcount is 1 and class_lastchar is not negative, we saw  
     precisely one character. This doesn't need the whole 32-byte bit map.  
     We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if  
     it's negative. */  
1929    
1930      if (class_charcount == 1 && class_lastchar >= 0)      /* If class_charcount is 1, we saw precisely one character with a value <
1931        256. In UTF-8 mode, we can optimize if there were no characters >= 256 and
1932        the one character is < 128. In non-UTF-8 mode we can always optimize.
1933    
1934        The optimization throws away the bit map. We turn the item into a
1935        1-character OP_CHARS if it's positive, or OP_NOT if it's negative. Note
1936        that OP_NOT does not support multibyte characters. In the positive case, it
1937        can cause firstbyte to be set. Otherwise, there can be no first char if
1938        this item is first, whatever repeat count may follow. In the case of
1939        reqbyte, save the previous value for reinstating. */
1940    
1941    #ifdef SUPPORT_UTF8
1942        if (class_charcount == 1 &&
1943              (!utf8 ||
1944              (!class_utf8 && class_lastchar < 128)))
1945    #else
1946        if (class_charcount == 1)
1947    #endif
1948        {        {
1949          zeroreqbyte = reqbyte;
1950        if (negate_class)        if (negate_class)
1951          {          {
1952          code[-1] = OP_NOT;          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1953            zerofirstbyte = firstbyte;
1954            *code++ = OP_NOT;
1955          }          }
1956        else        else
1957          {          {
1958          code[-1] = OP_CHARS;          if (firstbyte == REQ_UNSET)
1959              {
1960              zerofirstbyte = REQ_NONE;
1961              firstbyte = class_lastchar | req_caseopt;
1962              }
1963            else
1964              {
1965              zerofirstbyte = firstbyte;
1966              reqbyte = class_lastchar | req_caseopt | cd->req_varyopt;
1967              }
1968            *code++ = OP_CHARS;
1969          *code++ = 1;          *code++ = 1;
1970          }          }
1971        *code++ = class_lastchar;        *code++ = class_lastchar;
1972          break;  /* End of class handling */
1973          }       /* End of 1-byte optimization */
1974    
1975        /* Otherwise, if this is the first thing in the branch, there can be no
1976        first char setting, whatever the repeat count. Any reqbyte setting must
1977        remain unchanged after any kind of repeat. */
1978    
1979        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1980        zerofirstbyte = firstbyte;
1981        zeroreqbyte = reqbyte;
1982    
1983        /* If there are characters with values > 255, we have to compile an
1984        extended class, with its own opcode. If there are no characters < 256,
1985        we can omit the bitmap. */
1986    
1987    #ifdef SUPPORT_UTF8
1988        if (class_utf8)
1989          {
1990          *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
1991          *code++ = OP_XCLASS;
1992          code += LINK_SIZE;
1993          *code = negate_class? XCL_NOT : 0;
1994    
1995          /* If the map is required, install it, and move on to the end of
1996          the extra data */
1997    
1998          if (class_charcount > 0)
1999            {
2000            *code++ |= XCL_MAP;
2001            memcpy(code, class, 32);
2002            code = class_utf8data;
2003            }
2004    
2005          /* If the map is not required, slide down the extra data. */
2006    
2007          else
2008            {
2009            int len = class_utf8data - (code + 33);
2010            memmove(code + 1, code + 33, len);
2011            code += len + 1;
2012            }
2013    
2014          /* Now fill in the complete length of the item */
2015    
2016          PUT(previous, 1, code - previous);
2017          break;   /* End of class handling */
2018        }        }
2019    #endif
2020    
2021      /* Otherwise, negate the 32-byte map if necessary, and copy it into      /* If there are no characters > 255, negate the 32-byte map if necessary,
2022      the code vector. */      and copy it into the code vector. If this is the first thing in the branch,
2023        there can be no first char setting, whatever the repeat count. Any reqbyte
2024        setting must remain unchanged after any kind of repeat. */
2025    
2026        if (negate_class)
2027          {
2028          *code++ = OP_NCLASS;
2029          for (c = 0; c < 32; c++) code[c] = ~class[c];
2030          }
2031      else      else
2032        {        {
2033        if (negate_class)        *code++ = OP_CLASS;
2034          for (c = 0; c < 32; c++) code[c] = ~class[c];        memcpy(code, class, 32);
       else  
         memcpy(code, class, 32);  
       code += 32;  
2035        }        }
2036        code += 32;
2037      break;      break;
2038    
2039      /* Various kinds of repeat */      /* Various kinds of repeat */
# Line 1188  for (;; ptr++) Line 2065  for (;; ptr++)
2065        goto FAILED;        goto FAILED;
2066        }        }
2067    
2068      /* If the next character is '?' this is a minimizing repeat, by default,      if (repeat_min == 0)
2069      but if PCRE_UNGREEDY is set, it works the other way round. Advance to the        {
2070      next character. */        firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
2071          reqbyte = zeroreqbyte;        /* Ditto */
2072          }
2073    
2074      if (ptr[1] == '?')      /* Remember whether this is a variable length repeat */
2075        { repeat_type = greedy_non_default; ptr++; }  
2076        reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2077    
2078        op_type = 0;                    /* Default single-char op codes */
2079        possessive_quantifier = FALSE;  /* Default not possessive quantifier */
2080    
2081        /* Save start of previous item, in case we have to move it up to make space
2082        for an inserted OP_ONCE for the additional '+' extension. */
2083    
2084        tempcode = previous;
2085    
2086        /* If the next character is '+', we have a possessive quantifier. This
2087        implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2088        If the next character is '?' this is a minimizing repeat, by default,
2089        but if PCRE_UNGREEDY is set, it works the other way round. We change the
2090        repeat type to the non-default. */
2091    
2092        if (ptr[1] == '+')
2093          {
2094          repeat_type = 0;                  /* Force greedy */
2095          possessive_quantifier = TRUE;
2096          ptr++;
2097          }
2098        else if (ptr[1] == '?')
2099          {
2100          repeat_type = greedy_non_default;
2101          ptr++;
2102          }
2103      else repeat_type = greedy_default;      else repeat_type = greedy_default;
2104    
2105        /* If previous was a recursion, we need to wrap it inside brackets so that
2106        it can be replicated if necessary. */
2107    
2108        if (*previous == OP_RECURSE)
2109          {
2110          memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2111          code += 1 + LINK_SIZE;
2112          *previous = OP_BRA;
2113          PUT(previous, 1, code - previous);
2114          *code = OP_KET;
2115          PUT(code, 1, code - previous);
2116          code += 1 + LINK_SIZE;
2117          }
2118    
2119      /* If previous was a string of characters, chop off the last one and use it      /* If previous was a string of characters, chop off the last one and use it
2120      as the subject of the repeat. If there was only one character, we can      as the subject of the repeat. If there was only one character, we can
2121      abolish the previous item altogether. A repeat with a zero minimum wipes      abolish the previous item altogether. If a one-char item has a minimum of
2122      out any reqchar setting, backing up to the previous value. We must also      more than one, ensure that it is set in reqbyte - it might not be if a
2123      adjust the countlits value. */      sequence such as x{3} is the first thing in a branch because the x will
2124        have gone into firstbyte instead.  */
2125    
2126      if (*previous == OP_CHARS)      if (*previous == OP_CHARS)
2127        {        {
2128        int len = previous[1];        /* Deal with UTF-8 characters that take up more than one byte. It's
2129          easier to write this out separately than try to macrify it. Use c to
2130        if (repeat_min == 0) *reqchar = prevreqchar;        hold the length of the character in bytes, plus 0x80 to flag that it's a
2131        *countlits += repeat_min - 1;        length rather than a small character. */
2132    
2133        if (len == 1)  #ifdef SUPPORT_UTF8
2134          {        if (utf8 && (code[-1] & 0x80) != 0)
2135          c = previous[2];          {
2136          code = previous;          uschar *lastchar = code - 1;
2137            while((*lastchar & 0xc0) == 0x80) lastchar--;
2138            c = code - lastchar;            /* Length of UTF-8 character */
2139            memcpy(utf8_char, lastchar, c); /* Save the char */
2140            if (lastchar == previous + 2)   /* There was only one character */
2141              {
2142              code = previous;              /* Abolish the previous item */
2143              }
2144            else
2145              {
2146              previous[1] -= c;             /* Adjust length of previous */
2147              code = lastchar;              /* Lost char off the end */
2148              tempcode = code;              /* Adjust position to be moved for '+' */
2149              }
2150            c |= 0x80;                      /* Flag c as a length */
2151          }          }
2152        else        else
2153    #endif
2154    
2155          /* Handle the case of a single byte - either with no UTF8 support, or
2156          with UTF-8 disabled, or for a UTF-8 character < 128. */
2157    
2158          {          {
2159          c = previous[len+1];          c = *(--code);
2160          previous[1]--;          if (code == previous + 2)   /* There was only one character */
2161          code--;            {
2162              code = previous;              /* Abolish the previous item */
2163              if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2164              }
2165            else
2166              {
2167              previous[1]--;             /* adjust length */
2168              tempcode = code;           /* Adjust position to be moved for '+' */
2169              }
2170          }          }
2171        op_type = 0;                 /* Use single-char op codes */  
2172        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
2173        }        }
2174    
2175      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
2176      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
2177      character repeats by adding a suitable offset into repeat_type. */      character repeats by setting op_type to add a suitable offset into
2178        repeat_type. OP_NOT is currently used only for single-byte chars. */
2179    
2180      else if ((int)*previous == OP_NOT)      else if (*previous == OP_NOT)
2181        {        {
2182        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
2183        c = previous[1];        c = previous[1];
# Line 1238  for (;; ptr++) Line 2187  for (;; ptr++)
2187    
2188      /* If previous was a character type match (\d or similar), abolish it and      /* If previous was a character type match (\d or similar), abolish it and
2189      create a suitable repeat item. The code is shared with single-character      create a suitable repeat item. The code is shared with single-character
2190      repeats by adding a suitable offset into repeat_type. */      repeats by setting op_type to add a suitable offset into repeat_type. */
2191    
2192      else if ((int)*previous < OP_EODN || *previous == OP_ANY)      else if (*previous < OP_EODN)
2193        {        {
2194        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
2195        c = *previous;        c = *previous;
# Line 1267  for (;; ptr++) Line 2216  for (;; ptr++)
2216          else          else
2217            {            {
2218            *code++ = OP_UPTO + repeat_type;            *code++ = OP_UPTO + repeat_type;
2219            *code++ = repeat_max >> 8;            PUT2INC(code, 0, repeat_max);
           *code++ = (repeat_max & 255);  
2220            }            }
2221          }          }
2222    
# Line 1285  for (;; ptr++) Line 2233  for (;; ptr++)
2233          if (repeat_min != 1)          if (repeat_min != 1)
2234            {            {
2235            *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */            *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
2236            *code++ = repeat_min >> 8;            PUT2INC(code, 0, repeat_min);
           *code++ = (repeat_min & 255);  
2237            }            }
2238    
2239          /* If the minimum is 1 and the previous item was a character string,          /* If the minimum is 1 and the previous item was a character string,
# Line 1294  for (;; ptr++) Line 2241  for (;; ptr++)
2241          length was 1, or add the character back onto the end of a longer          length was 1, or add the character back onto the end of a longer
2242          string. For a character type nothing need be done; it will just get          string. For a character type nothing need be done; it will just get
2243          put back naturally. Note that the final character is always going to          put back naturally. Note that the final character is always going to
2244          get added below. */          get added below, so we leave code ready for its insertion. */
2245    
2246          else if (*previous == OP_CHARS)          else if (*previous == OP_CHARS)
2247            {            {
2248            if (code == previous) code += 2; else previous[1]++;            if (code == previous) code += 2; else
2249    
2250              /* In UTF-8 mode, a multibyte char has its length in c, with the 0x80
2251              bit set as a flag. The length will always be between 2 and 6. */
2252    
2253    #ifdef SUPPORT_UTF8
2254              if (utf8 && c >= 128) previous[1] += c & 7; else
2255    #endif
2256              previous[1]++;
2257            }            }
2258    
2259          /*  For a single negated character we also have to put back the          /*  For a single negated character we also have to put back the
2260          item that got cancelled. */          item that got cancelled. At present this applies only to single byte
2261            characters in any mode. */
2262    
2263          else if (*previous == OP_NOT) code++;          else if (*previous == OP_NOT) code++;
2264    
2265          /* If the maximum is unlimited, insert an OP_STAR. */          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2266            we have to insert the character for the previous code. In UTF-8 mode,
2267            long characters have their length in c, with the 0x80 bit as a flag. */
2268    
2269          if (repeat_max < 0)          if (repeat_max < 0)
2270            {            {
2271    #ifdef SUPPORT_UTF8
2272              if (utf8 && c >= 128)
2273                {
2274                memcpy(code, utf8_char, c & 7);
2275                code += c & 7;
2276                }
2277              else
2278    #endif
2279            *code++ = c;            *code++ = c;
2280            *code++ = OP_STAR + repeat_type;            *code++ = OP_STAR + repeat_type;
2281            }            }
2282    
2283          /* Else insert an UPTO if the max is greater than the min. */          /* Else insert an UPTO if the max is greater than the min, again
2284            preceded by the character, for the previously inserted code. */
2285    
2286          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
2287            {            {
2288    #ifdef SUPPORT_UTF8
2289              if (utf8 && c >= 128)
2290                {
2291                memcpy(code, utf8_char, c & 7);
2292                code += c & 7;
2293                }
2294              else
2295    #endif
2296            *code++ = c;            *code++ = c;
2297            repeat_max -= repeat_min;            repeat_max -= repeat_min;
2298            *code++ = OP_UPTO + repeat_type;            *code++ = OP_UPTO + repeat_type;
2299            *code++ = repeat_max >> 8;            PUT2INC(code, 0, repeat_max);
           *code++ = (repeat_max & 255);  
2300            }            }
2301          }          }
2302    
2303        /* The character or character type itself comes last in all cases. */        /* The character or character type itself comes last in all cases. */
2304    
2305    #ifdef SUPPORT_UTF8
2306          if (utf8 && c >= 128)
2307            {
2308            memcpy(code, utf8_char, c & 7);
2309            code += c & 7;
2310            }
2311          else
2312    #endif
2313    
2314        *code++ = c;        *code++ = c;
2315        }        }
2316    
2317      /* If previous was a character class or a back reference, we put the repeat      /* If previous was a character class or a back reference, we put the repeat
2318      stuff after it, but just skip the item if the repeat was {0,0}. */      stuff after it, but just skip the item if the repeat was {0,0}. */
2319    
2320      else if (*previous == OP_CLASS || *previous == OP_REF)      else if (*previous == OP_CLASS ||
2321                 *previous == OP_NCLASS ||
2322    #ifdef SUPPORT_UTF8
2323                 *previous == OP_XCLASS ||
2324    #endif
2325                 *previous == OP_REF)
2326        {        {
2327        if (repeat_max == 0)        if (repeat_max == 0)
2328          {          {
# Line 1350  for (;; ptr++) Line 2338  for (;; ptr++)
2338        else        else
2339          {          {
2340          *code++ = OP_CRRANGE + repeat_type;          *code++ = OP_CRRANGE + repeat_type;
2341          *code++ = repeat_min >> 8;          PUT2INC(code, 0, repeat_min);
         *code++ = repeat_min & 255;  
2342          if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */          if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
2343          *code++ = repeat_max >> 8;          PUT2INC(code, 0, repeat_max);
         *code++ = repeat_max & 255;  
2344          }          }
2345        }        }
2346    
2347      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
2348      cases. */      cases. */
2349    
2350      else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||      else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2351               (int)*previous == OP_COND)               *previous == OP_COND)
2352        {        {
2353        register int i;        register int i;
2354        int ketoffset = 0;        int ketoffset = 0;
# Line 1378  for (;; ptr++) Line 2364  for (;; ptr++)
2364        if (repeat_max == -1)        if (repeat_max == -1)
2365          {          {
2366          register uschar *ket = previous;          register uschar *ket = previous;
2367          do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);          do ket += GET(ket, 1); while (*ket != OP_KET);
2368          ketoffset = code - ket;          ketoffset = code - ket;
2369          }          }
2370    
# Line 1386  for (;; ptr++) Line 2372  for (;; ptr++)
2372        OP_BRAZERO in front of it, and because the group appears once in the        OP_BRAZERO in front of it, and because the group appears once in the
2373        data, whereas in other cases it appears the minimum number of times. For        data, whereas in other cases it appears the minimum number of times. For
2374        this reason, it is simplest to treat this case separately, as otherwise        this reason, it is simplest to treat this case separately, as otherwise
2375        the code gets far too mess. There are several special subcases when the        the code gets far too messy. There are several special subcases when the
2376        minimum is zero. */        minimum is zero. */
2377    
2378        if (repeat_min == 0)        if (repeat_min == 0)
2379          {          {
         /* If we set up a required char from the bracket, we must back off  
         to the previous value and reset the countlits value too. */  
   
         if (subcountlits > 0)  
           {  
           *reqchar = prevreqchar;  
           *countlits -= subcountlits;  
           }  
   
2380          /* If the maximum is also zero, we just omit the group from the output          /* If the maximum is also zero, we just omit the group from the output
2381          altogether. */          altogether. */
2382    
# Line 1429  for (;; ptr++) Line 2406  for (;; ptr++)
2406          else          else
2407            {            {
2408            int offset;            int offset;
2409            memmove(previous+4, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
2410            code += 4;            code += 2 + LINK_SIZE;
2411            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
2412            *previous++ = OP_BRA;            *previous++ = OP_BRA;
2413    
# Line 1439  for (;; ptr++) Line 2416  for (;; ptr++)
2416    
2417            offset = (bralink == NULL)? 0 : previous - bralink;            offset = (bralink == NULL)? 0 : previous - bralink;
2418            bralink = previous;            bralink = previous;
2419            *previous++ = offset >> 8;            PUTINC(previous, 0, offset);
           *previous++ = offset & 255;  
2420            }            }
2421    
2422          repeat_max--;          repeat_max--;
# Line 1448  for (;; ptr++) Line 2424  for (;; ptr++)
2424    
2425        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
2426        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
2427        copies that we need. */        copies that we need. If we set a first char from the group, and didn't
2428          set a required char, copy the latter from the former. */
2429    
2430        else        else
2431          {          {
2432          for (i = 1; i < repeat_min; i++)          if (repeat_min > 1)
2433            {            {
2434            memcpy(code, previous, len);            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2435            code += len;            for (i = 1; i < repeat_min; i++)
2436                {
2437                memcpy(code, previous, len);
2438                code += len;
2439                }
2440            }            }
2441          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
2442          }          }
# Line 1481  for (;; ptr++) Line 2462  for (;; ptr++)
2462              *code++ = OP_BRA;              *code++ = OP_BRA;
2463              offset = (bralink == NULL)? 0 : code - bralink;              offset = (bralink == NULL)? 0 : code - bralink;
2464              bralink = code;              bralink = code;
2465              *code++ = offset >> 8;              PUTINC(code, 0, offset);
             *code++ = offset & 255;  
2466              }              }
2467    
2468            memcpy(code, previous, len);            memcpy(code, previous, len);
# Line 1497  for (;; ptr++) Line 2477  for (;; ptr++)
2477            int oldlinkoffset;            int oldlinkoffset;
2478            int offset = code - bralink + 1;            int offset = code - bralink + 1;
2479            uschar *bra = code - offset;            uschar *bra = code - offset;
2480            oldlinkoffset = (bra[1] << 8) + bra[2];            oldlinkoffset = GET(bra, 1);
2481            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2482            *code++ = OP_KET;            *code++ = OP_KET;
2483            *code++ = bra[1] = offset >> 8;            PUTINC(code, 0, offset);
2484            *code++ = bra[2] = (offset & 255);            PUT(bra, 1, offset);
2485            }            }
2486          }          }
2487    
# Line 1521  for (;; ptr++) Line 2501  for (;; ptr++)
2501        goto FAILED;        goto FAILED;
2502        }        }
2503    
2504      /* In all case we no longer have a previous item. */      /* If the character following a repeat is '+', we wrap the entire repeated
2505        item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2506        Sun's Java package. The repeated item starts at tempcode, not at previous,
2507        which might be the first part of a string whose (former) last char we
2508        repeated. However, we don't support '+' after a greediness '?'. */
2509    
2510        if (possessive_quantifier)
2511          {
2512          int len = code - tempcode;
2513          memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2514          code += 1 + LINK_SIZE;
2515          len += 1 + LINK_SIZE;
2516          tempcode[0] = OP_ONCE;
2517          *code++ = OP_KET;
2518          PUTINC(code, 0, len);
2519          PUT(tempcode, 1, len);
2520          }
2521    
2522        /* In all cases we no longer have a previous item. We also set the
2523        "follows varying string" flag for subsequently encountered reqbytes if
2524        it isn't already set and we have just passed a varying length item. */
2525    
2526      END_REPEAT:      END_REPEAT:
2527      previous = NULL;      previous = NULL;
2528        cd->req_varyopt |= reqvary;
2529      break;      break;
2530    
2531    
# Line 1537  for (;; ptr++) Line 2538  for (;; ptr++)
2538    
2539      case '(':      case '(':
2540      newoptions = options;      newoptions = options;
2541      condref = -1;      skipbytes = 0;
2542    
2543      if (*(++ptr) == '?')      if (*(++ptr) == '?')
2544        {        {
# Line 1558  for (;; ptr++) Line 2559  for (;; ptr++)
2559    
2560          case '(':          case '(':
2561          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
2562          if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)  
2563            /* Condition to test for recursion */
2564    
2565            if (ptr[1] == 'R')
2566            {            {
2567            condref = *ptr - '0';            code[1+LINK_SIZE] = OP_CREF;
2568              PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2569              skipbytes = 3;
2570              ptr += 3;
2571              }
2572    
2573            /* Condition to test for a numbered subpattern match */
2574    
2575            else if ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
2576              {
2577              int condref;                 /* Don't amalgamate; some compilers */
2578              condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */
2579            while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';            while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2580              if (condref == 0)
2581                {
2582                *errorptr = ERR35;
2583                goto FAILED;
2584                }
2585            ptr++;            ptr++;
2586              code[1+LINK_SIZE] = OP_CREF;
2587              PUT2(code, 2+LINK_SIZE, condref);
2588              skipbytes = 3;
2589            }            }
2590          else ptr--;          /* For conditions that are assertions, we just fall through, having
2591            set bravalue above. */
2592          break;          break;
2593    
2594          case '=':                 /* Positive lookahead */          case '=':                 /* Positive lookahead */
# Line 1589  for (;; ptr++) Line 2613  for (;; ptr++)
2613            bravalue = OP_ASSERTBACK_NOT;            bravalue = OP_ASSERTBACK_NOT;
2614            ptr++;            ptr++;
2615            break;            break;
   
           default:                /* Syntax error */  
           *errorptr = ERR24;  
           goto FAILED;  
2616            }            }
2617          break;          break;
2618    
# Line 1601  for (;; ptr++) Line 2621  for (;; ptr++)
2621          ptr++;          ptr++;
2622          break;          break;
2623    
2624            case 'C':                 /* Callout - may be followed by digits */
2625            *code++ = OP_CALLOUT;
2626              {
2627              int n = 0;
2628              while ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
2629                n = n * 10 + *ptr - '0';
2630              if (n > 255)
2631                {
2632                *errorptr = ERR38;
2633                goto FAILED;
2634                }
2635              *code++ = n;
2636              }
2637            previous = NULL;
2638            continue;
2639    
2640            case 'P':                 /* Named subpattern handling */
2641            if (*(++ptr) == '<')      /* Definition */
2642              {
2643              int i, namelen;
2644              uschar *slot = cd->name_table;
2645              const uschar *name;     /* Don't amalgamate; some compilers */
2646              name = ++ptr;           /* grumble at autoincrement in declaration */
2647    
2648              while (*ptr++ != '>');
2649              namelen = ptr - name - 1;
2650    
2651              for (i = 0; i < cd->names_found; i++)
2652                {
2653                int crc = memcmp(name, slot+2, namelen);
2654                if (crc == 0)
2655                  {
2656                  if (slot[2+namelen] == 0)
2657                    {
2658                    *errorptr = ERR43;
2659                    goto FAILED;
2660                    }
2661                  crc = -1;             /* Current name is substring */
2662                  }
2663                if (crc < 0)
2664                  {
2665                  memmove(slot + cd->name_entry_size, slot,
2666                    (cd->names_found - i) * cd->name_entry_size);
2667                  break;
2668                  }
2669                slot += cd->name_entry_size;
2670                }
2671    
2672              PUT2(slot, 0, *brackets + 1);
2673              memcpy(slot + 2, name, namelen);
2674              slot[2+namelen] = 0;
2675              cd->names_found++;
2676              goto NUMBERED_GROUP;
2677              }
2678    
2679            if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */
2680              {
2681              int i, namelen;
2682              int type = *ptr++;
2683              const uschar *name = ptr;
2684              uschar *slot = cd->name_table;
2685    
2686              while (*ptr != ')') ptr++;
2687              namelen = ptr - name;
2688    
2689              for (i = 0; i < cd->names_found; i++)
2690                {
2691                if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
2692                slot += cd->name_entry_size;
2693                }
2694              if (i >= cd->names_found)
2695                {
2696                *errorptr = ERR15;
2697                goto FAILED;
2698                }
2699    
2700              recno = GET2(slot, 0);
2701    
2702              if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */
2703    
2704              /* Back reference */
2705    
2706              previous = code;
2707              *code++ = OP_REF;
2708              PUT2INC(code, 0, recno);
2709              cd->backref_map |= (recno < 32)? (1 << recno) : 1;
2710              if (recno > cd->top_backref) cd->top_backref = recno;
2711              continue;
2712              }
2713    
2714            /* Should never happen */
2715            break;
2716    
2717          case 'R':                 /* Pattern recursion */          case 'R':                 /* Pattern recursion */
2718          *code++ = OP_RECURSE;          ptr++;                    /* Same as (?0)      */
2719          ptr++;          /* Fall through */
2720    
2721            /* Recursion or "subroutine" call */
2722    
2723            case '0': case '1': case '2': case '3': case '4':
2724            case '5': case '6': case '7': case '8': case '9':
2725              {
2726              const uschar *called;
2727              recno = 0;
2728    
2729              while ((cd->ctypes[*ptr] & ctype_digit) != 0)
2730                recno = recno * 10 + *ptr++ - '0';
2731    
2732              /* Come here from code above that handles a named recursion */
2733    
2734              HANDLE_RECURSION:
2735    
2736              previous = code;
2737    
2738              /* Find the bracket that is being referenced. Temporarily end the
2739              regex in case it doesn't exist. */
2740    
2741              *code = OP_END;
2742              called = (recno == 0)?
2743                cd->start_code : find_bracket(cd->start_code, utf8, recno);
2744    
2745              if (called == NULL)
2746                {
2747                *errorptr = ERR15;
2748                goto FAILED;
2749                }
2750    
2751              /* If the subpattern is still open, this is a recursive call. We
2752              check to see if this is a left recursion that could loop for ever,
2753              and diagnose that case. */
2754    
2755              if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
2756                {
2757                *errorptr = ERR40;
2758                goto FAILED;
2759                }
2760    
2761              /* Insert the recursion/subroutine item */
2762    
2763              *code = OP_RECURSE;
2764              PUT(code, 1, called - cd->start_code);
2765              code += 1 + LINK_SIZE;
2766              }
2767          continue;          continue;
2768    
2769            /* Character after (? not specially recognized */
2770    
2771          default:                  /* Option setting */          default:                  /* Option setting */
2772          set = unset = 0;          set = unset = 0;
2773          optset = &set;          optset = &set;
# Line 1622  for (;; ptr++) Line 2784  for (;; ptr++)
2784              case 'x': *optset |= PCRE_EXTENDED; break;              case 'x': *optset |= PCRE_EXTENDED; break;
2785              case 'U': *optset |= PCRE_UNGREEDY; break;              case 'U': *optset |= PCRE_UNGREEDY; break;
2786              case 'X': *optset |= PCRE_EXTRA; break;              case 'X': *optset |= PCRE_EXTRA; break;
   
             default:  
             *errorptr = ERR12;  
             goto FAILED;  
2787              }              }
2788            }            }
2789    
# Line 1634  for (;; ptr++) Line 2792  for (;; ptr++)
2792          newoptions = (options | set) & (~unset);          newoptions = (options | set) & (~unset);
2793    
2794          /* If the options ended with ')' this is not the start of a nested          /* If the options ended with ')' this is not the start of a nested
2795          group with option changes, so the options change at this level. At top          group with option changes, so the options change at this level. Compile
2796          level there is nothing else to be done (the options will in fact have          code to change the ims options if this setting actually changes any of
2797          been set from the start of compiling as a result of the first pass) but          them. We also pass the new setting back so that it can be put at the
2798          at an inner level we must compile code to change the ims options if          start of any following branches, and when this group ends (if we are in
2799          necessary, and pass the new setting back so that it can be put at the          a group), a resetting item can be compiled.
2800          start of any following branches, and when this group ends, a resetting  
2801          item can be compiled. */          Note that if this item is right at the start of the pattern, the
2802            options will have been abstracted and made global, so there will be no
2803            change to compile. */
2804    
2805          if (*ptr == ')')          if (*ptr == ')')
2806            {            {
2807            if ((options & PCRE_INGROUP) != 0 &&            if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
               (options & PCRE_IMS) != (newoptions & PCRE_IMS))  
2808              {              {
2809              *code++ = OP_OPT;              *code++ = OP_OPT;
2810              *code++ = *optchanged = newoptions & PCRE_IMS;              *code++ = newoptions & PCRE_IMS;
2811              }              }
2812            options = newoptions;  /* Change options at this level */  
2813              /* Change options at this level, and pass them back for use
2814              in subsequent branches. Reset the greedy defaults and the case
2815              value for firstbyte and reqbyte. */
2816    
2817              *optionsptr = options = newoptions;
2818              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
2819              greedy_non_default = greedy_default ^ 1;
2820              req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2821    
2822            previous = NULL;       /* This item can't be repeated */            previous = NULL;       /* This item can't be repeated */
2823            continue;              /* It is complete */            continue;              /* It is complete */
2824            }            }
# Line 1665  for (;; ptr++) Line 2833  for (;; ptr++)
2833          }          }
2834        }        }
2835    
2836      /* Else we have a referencing group; adjust the opcode. */      /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
2837        non-capturing and behave like (?:...) brackets */
2838    
2839        else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
2840          {
2841          bravalue = OP_BRA;
2842          }
2843    
2844        /* Else we have a referencing group; adjust the opcode. If the bracket
2845        number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
2846        arrange for the true number to follow later, in an OP_BRANUMBER item. */
2847    
2848      else      else
2849        {        {
2850        if (++(*brackets) > EXTRACT_MAX)        NUMBERED_GROUP:
2851          if (++(*brackets) > EXTRACT_BASIC_MAX)
2852          {          {
2853          *errorptr = ERR13;          bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
2854          goto FAILED;          code[1+LINK_SIZE] = OP_BRANUMBER;
2855            PUT2(code, 2+LINK_SIZE, *brackets);
2856            skipbytes = 3;
2857          }          }
2858        bravalue = OP_BRA + *brackets;        else bravalue = OP_BRA + *brackets;
2859        }        }
2860    
2861      /* Process nested bracketed re. Assertions may not be repeated, but other      /* Process nested bracketed re. Assertions may not be repeated, but other
# Line 1685  for (;; ptr++) Line 2866  for (;; ptr++)
2866      previous = (bravalue >= OP_ONCE)? code : NULL;      previous = (bravalue >= OP_ONCE)? code : NULL;
2867      *code = bravalue;      *code = bravalue;
2868      tempcode = code;      tempcode = code;
2869        tempreqvary = cd->req_varyopt;     /* Save value before bracket */
2870    
2871      if (!compile_regex(      if (!compile_regex(
2872           options | PCRE_INGROUP,       /* Set for all nested groups */           newoptions,                   /* The complete new option state */
2873           ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?           options & PCRE_IMS,           /* The previous ims option state */
2874             newoptions & PCRE_IMS : -1, /* Pass ims options if changed */           brackets,                     /* Extracting bracket count */
          brackets,                     /* Bracket level */  
2875           &tempcode,                    /* Where to put code (updated) */           &tempcode,                    /* Where to put code (updated) */
2876           &ptr,                         /* Input pointer (updated) */           &ptr,                         /* Input pointer (updated) */
2877           errorptr,                     /* Where to put an error message */           errorptr,                     /* Where to put an error message */
2878           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
2879            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
2880           condref,                      /* Condition reference number */           skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */
2881           &subreqchar,                  /* For possible last char */           &subfirstbyte,                /* For possible first char */
2882           &subcountlits,                /* For literal count */           &subreqbyte,                  /* For possible last char */
2883             bcptr,                        /* Current branch chain */
2884           cd))                          /* Tables block */           cd))                          /* Tables block */
2885        goto FAILED;        goto FAILED;
2886    
# Line 1710  for (;; ptr++) Line 2892  for (;; ptr++)
2892      /* If this is a conditional bracket, check that there are no more than      /* If this is a conditional bracket, check that there are no more than
2893      two branches in the group. */      two branches in the group. */
2894    
2895      if (bravalue == OP_COND)      else if (bravalue == OP_COND)
2896        {        {
2897        uschar *tc = code;        uschar *tc = code;
2898        condcount = 0;        condcount = 0;
2899    
2900        do {        do {
2901           condcount++;           condcount++;
2902           tc += (tc[1] << 8) | tc[2];           tc += GET(tc,1);
2903           }           }
2904        while (*tc != OP_KET);        while (*tc != OP_KET);
2905    
# Line 1726  for (;; ptr++) Line 2908  for (;; ptr++)
2908          *errorptr = ERR27;          *errorptr = ERR27;
2909          goto FAILED;          goto FAILED;
2910          }          }
2911    
2912          /* If there is just one branch, we must not make use of its firstbyte or
2913          reqbyte, because this is equivalent to an empty second branch. */
2914    
2915          if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
2916        }        }
2917    
2918      /* Handle updating of the required character. If the subpattern didn't      /* Handle updating of the required and first characters. Update for normal
2919      set one, leave it as it was. Otherwise, update it for normal brackets of      brackets of all kinds, and conditions with two branches (see code above).
2920      all kinds, forward assertions, and conditions with two branches. Don't      If the bracket is followed by a quantifier with zero repeat, we have to
2921      update the literal count for forward assertions, however. If the bracket      back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
2922      is followed by a quantifier with zero repeat, we have to back off. Hence      main loop so that they can be accessed for the back off. */
2923      the definition of prevreqchar and subcountlits outside the main loop so  
2924      that they can be accessed for the back off. */      zeroreqbyte = reqbyte;
2925        zerofirstbyte = firstbyte;
2926      if (subreqchar > 0 &&      groupsetfirstbyte = FALSE;
2927           (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||  
2928           (bravalue == OP_COND && condcount == 2)))      if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
2929        {        {
2930        prevreqchar = *reqchar;        /* If we have not yet set a firstbyte in this branch, take it from the
2931        *reqchar = subreqchar;        subpattern, remembering that it was set here so that a repeat of more
2932        if (bravalue != OP_ASSERT) *countlits += subcountlits;        than one can replicate it as reqbyte if necessary. If the subpattern has
2933          no firstbyte, set "none" for the whole branch. In both cases, a zero
2934          repeat forces firstbyte to "none". */
2935    
2936          if (firstbyte == REQ_UNSET)
2937            {
2938            if (subfirstbyte >= 0)
2939              {
2940              firstbyte = subfirstbyte;
2941              groupsetfirstbyte = TRUE;
2942              }
2943            else firstbyte = REQ_NONE;
2944            zerofirstbyte = REQ_NONE;
2945            }
2946    
2947          /* If firstbyte was previously set, convert the subpattern's firstbyte
2948          into reqbyte if there wasn't one, using the vary flag that was in
2949          existence beforehand. */
2950    
2951          else if (subfirstbyte >= 0 && subreqbyte < 0)
2952            subreqbyte = subfirstbyte | tempreqvary;
2953    
2954          /* If the subpattern set a required byte (or set a first byte that isn't
2955          really the first byte - see above), set it. */
2956    
2957          if (subreqbyte >= 0) reqbyte = subreqbyte;
2958        }        }
2959    
2960        /* For a forward assertion, we take the reqbyte, if set. This can be
2961        helpful if the pattern that follows the assertion doesn't set a different
2962        char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
2963        for an assertion, however, because it leads to an incorrect effect for patterns
2964        such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
2965        of a firstbyte. This is overcome by a scan at the end if there's no
2966        firstbyte, looking for an asserted first char. */
2967    
2968        else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
2969    
2970      /* Now update the main code pointer to the end of the group. */      /* Now update the main code pointer to the end of the group. */
2971    
2972      code = tempcode;      code = tempcode;
# Line 1775  for (;; ptr++) Line 2997  for (;; ptr++)
2997    
2998      if (c < 0)      if (c < 0)
2999        {        {
3000          if (-c == ESC_Q)            /* Handle start of quoted string */
3001            {
3002            if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3003              else inescq = TRUE;
3004            continue;
3005            }
3006    
3007          /* For metasequences that actually match a character, we disable the
3008          setting of a first character if it hasn't already been set. */
3009    
3010          if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3011            firstbyte = REQ_NONE;
3012    
3013          /* Set values to reset to if this is followed by a zero repeat. */
3014    
3015          zerofirstbyte = firstbyte;
3016          zeroreqbyte = reqbyte;
3017    
3018          /* Back references are handled specially */
3019    
3020        if (-c >= ESC_REF)        if (-c >= ESC_REF)
3021          {          {
3022            int number = -c - ESC_REF;
3023          previous = code;          previous = code;
3024          *code++ = OP_REF;          *code++ = OP_REF;
3025          *code++ = -c - ESC_REF;          PUT2INC(code, 0, number);
3026          }          }
3027        else        else
3028          {          {
# Line 1807  for (;; ptr++) Line 3050  for (;; ptr++)
3050    
3051      do      do
3052        {        {
3053          /* If in \Q...\E, check for the end; if not, we always have a literal */
3054    
3055          if (inescq)
3056            {
3057            if (c == '\\' && ptr[1] == 'E')
3058              {
3059              inescq = FALSE;
3060              ptr++;
3061              }
3062            else
3063              {
3064              *code++ = c;
3065              length++;
3066              }
3067            continue;
3068            }
3069    
3070          /* Skip white space and comments for /x patterns */
3071    
3072        if ((options & PCRE_EXTENDED) != 0)        if ((options & PCRE_EXTENDED) != 0)
3073          {          {
3074          if ((cd->ctypes[c] & ctype_space) != 0) continue;          if ((cd->ctypes[c] & ctype_space) != 0) continue;
# Line 1814  for (;; ptr++) Line 3076  for (;; ptr++)
3076            {            {
3077            /* The space before the ; is to avoid a warning on a silly compiler            /* The space before the ; is to avoid a warning on a silly compiler
3078            on the Macintosh. */            on the Macintosh. */
3079            while ((c = *(++ptr)) != 0 && c != '\n') ;            while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3080            if (c == 0) break;            if (c == 0) break;
3081            continue;            continue;
3082            }            }
# Line 1829  for (;; ptr++) Line 3091  for (;; ptr++)
3091          tempptr = ptr;          tempptr = ptr;
3092          c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);          c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
3093          if (c < 0) { ptr = tempptr; break; }          if (c < 0) { ptr = tempptr; break; }
3094    
3095            /* If a character is > 127 in UTF-8 mode, we have to turn it into
3096          two or more bytes in the UTF-8 encoding. */
3097    
3098    #ifdef SUPPORT_UTF8
3099            if (utf8 && c > 127)
3100              {
3101              uschar buffer[8];
3102              int len = ord2utf8(c, buffer);
3103              for (c = 0; c < len; c++) *code++ = buffer[c];
3104              length += len;
3105              continue;
3106              }
3107    #endif
3108            }
3109    
3110          /* Ordinary character or single-char escape */
3111    
3112          *code++ = c;
3113          length++;
3114          }
3115    
3116        /* This "while" is the end of the "do" above. */
3117    
3118        while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
3119    
3120        /* Update the first and last requirements. These are always bytes, even in
3121        UTF-8 mode. However, there is a special case to be considered when there
3122        are only one or two characters. Because this gets messy in UTF-8 mode, the
3123        code is kept separate. When we get here "length" contains the number of
3124        bytes. */
3125    
3126    #ifdef SUPPORT_UTF8
3127        if (utf8 && length > 1)
3128          {
3129          uschar *t = previous + 3;                      /* After this code, t */
3130          while (t < code && (*t & 0xc0) == 0x80) t++;   /* follows the 1st char */
3131    
3132          /* Handle the case when there is only one multibyte character. It must
3133          have at least two bytes because of the "length > 1" test above. */
3134    
3135          if (t == code)
3136            {
3137            /* If no previous first byte, set it from this character, but revert to
3138            none on a zero repeat. */
3139    
3140            if (firstbyte == REQ_UNSET)
3141              {
3142              zerofirstbyte = REQ_NONE;
3143              firstbyte = previous[2];
3144              }
3145    
3146            /* Otherwise, leave the first byte value alone, and don't change it on
3147            a zero repeat */
3148    
3149            else zerofirstbyte = firstbyte;
3150    
3151            /* In both cases, a zero repeat resets the previous required byte */
3152    
3153            zeroreqbyte = reqbyte;
3154            }
3155    
3156          /* Handle the case when there is more than one character. These may be
3157          single-byte or multibyte characters */
3158    
3159          else
3160            {
3161            t = code - 1;                       /* After this code, t is at the */
3162            while ((*t & 0xc0) == 0x80) t--;    /* start of the last character */
3163    
3164            /* If no previous first byte, set it from the first character, and
3165            retain it on a zero repeat (of the last character). The required byte
3166            is reset on a zero repeat to the byte before the last
3167            character, unless this is the first byte of the string. In that case,
3168            it reverts to its previous value. */
3169    
3170            if (firstbyte == REQ_UNSET)
3171              {
3172              zerofirstbyte = firstbyte = previous[2] | req_caseopt;
3173              zeroreqbyte = (t - 1 == previous + 2)?
3174                reqbyte : t[-1] | req_caseopt | cd->req_varyopt;
3175              }
3176    
3177            /* If there was a previous first byte, leave it alone, and don't change
3178            it on a zero repeat. The required byte is reset on a zero repeat to the
3179            byte before the last character. */
3180    
3181            else
3182              {
3183              zerofirstbyte = firstbyte;
3184              zeroreqbyte = t[-1] | req_caseopt | cd->req_varyopt;
3185              }
3186          }          }
3187    
3188        /* Ordinary character or single-char escape */        /* In all cases (we know length > 1), the new required byte is the last
3189          byte of the string. */
3190    
3191        *code++ = c;        reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
       length++;  
3192        }        }
3193    
3194      /* This "while" is the end of the "do" above. */      else   /* End of UTF-8 coding */
3195    #endif
3196    
3197      while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);      /* This is the code for non-UTF-8 operation, either without UTF-8 support,
3198        or when UTF-8 is not enabled. */
3199    
3200          {
3201          /* firstbyte was not previously set; take it from this string */
3202    
3203          if (firstbyte == REQ_UNSET)
3204            {
3205            if (length == 1)
3206              {
3207              zerofirstbyte = REQ_NONE;
3208              firstbyte = previous[2] | req_caseopt;
3209              zeroreqbyte = reqbyte;
3210              }
3211            else
3212              {
3213              zerofirstbyte = firstbyte = previous[2] | req_caseopt;
3214              zeroreqbyte = (length > 2)?
3215                (code[-2] | req_caseopt | cd->req_varyopt) : reqbyte;
3216              reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3217              }
3218            }
3219    
3220      /* Update the last character and the count of literals */        /* firstbyte was previously set */
3221    
3222      prevreqchar = (length > 1)? code[-2] : *reqchar;        else
3223      *reqchar = code[-1];          {
3224      *countlits += length;          zerofirstbyte = firstbyte;
3225            zeroreqbyte = (length == 1)? reqbyte :
3226              code[-2] | req_caseopt | cd->req_varyopt;
3227            reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3228            }
3229          }
3230    
3231      /* Compute the length and set it in the data vector, and advance to      /* Set the length in the data vector, and advance to the next state. */
     the next state. */  
3232    
3233      previous[1] = length;      previous[1] = length;
3234      if (length < 255) ptr--;      if (length < MAXLIT) ptr--;
3235      break;      break;
3236      }      }
3237    }                   /* end of big loop */    }                   /* end of big loop */
# Line 1881  following branch to ensure they get set Line 3261  following branch to ensure they get set
3261  the new options into every subsequent branch compile.  the new options into every subsequent branch compile.
3262    
3263  Argument:  Argument:
3264    options     the option bits    options        option bits, including any changes for this subpattern
3265    optchanged  new ims options to set as if (?ims) were at the start, or -1    oldims         previous settings of ims option bits
3266                 for no change    brackets       -> int containing the number of extracting brackets used
3267    brackets    -> int containing the number of extracting brackets used    codeptr        -> the address of the current code pointer
3268    codeptr     -> the address of the current code pointer    ptrptr         -> the address of the current pattern pointer
3269    ptrptr      -> the address of the current pattern pointer    errorptr       -> pointer to error message
3270    errorptr    -> pointer to error message    lookbehind     TRUE if this is a lookbehind assertion
3271    lookbehind  TRUE if this is a lookbehind assertion    skipbytes      skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3272    condref     > 0 for OPT_CREF setting at start of conditional group    firstbyteptr   place to put the first required character, or a negative number
3273    reqchar     -> place to put the last required character, or a negative number    reqbyteptr     place to put the last required character, or a negative number
3274    countlits   -> place to put the shortest literal count of any branch    bcptr          pointer to the chain of currently open branches
3275    cd          points to the data block with tables pointers    cd             points to the data block with tables pointers etc.
3276    
3277  Returns:      TRUE on success  Returns:      TRUE on success
3278  */  */
3279    
3280  static BOOL  static BOOL
3281  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,  compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3282    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
3283    int *reqchar, int *countlits, compile_data *cd)    int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3284  {  {
3285  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
3286  uschar *code = *codeptr;  uschar *code = *codeptr;
3287  uschar *last_branch = code;  uschar *last_branch = code;
3288  uschar *start_bracket = code;  uschar *start_bracket = code;
3289  uschar *reverse_count = NULL;  uschar *reverse_count = NULL;
3290  int oldoptions = options & PCRE_IMS;  int firstbyte, reqbyte;
3291  int branchreqchar, branchcountlits;  int branchfirstbyte, branchreqbyte;
3292    branch_chain bc;
3293    
3294  *reqchar = -1;  bc.outer = bcptr;
3295  *countlits = INT_MAX;  bc.current = code;
 code += 3;  
3296    
3297  /* At the start of a reference-based conditional group, insert the reference  firstbyte = reqbyte = REQ_UNSET;
 number as an OP_CREF item. */  
3298    
3299  if (condref > 0)  /* Offset is set zero to mark that this bracket is still open */
3300    {  
3301    *code++ = OP_CREF;  PUT(code, 1, 0);
3302    *code++ = condref;  code += 1 + LINK_SIZE + skipbytes;
   }  
3303    
3304  /* Loop for each alternative branch */  /* Loop for each alternative branch */
3305    
3306  for (;;)  for (;;)
3307    {    {
3308    int length;    /* Handle a change of ims options at the start of the branch */
   
   /* Handle change of options */  
3309    
3310    if (optchanged >= 0)    if ((options & PCRE_IMS) != oldims)
3311      {      {
3312      *code++ = OP_OPT;      *code++ = OP_OPT;
3313      *code++ = optchanged;      *code++ = options & PCRE_IMS;
     options = (options & ~PCRE_IMS) | optchanged;  
3314      }      }
3315    
3316    /* Set up dummy OP_REVERSE if lookbehind assertion */    /* Set up dummy OP_REVERSE if lookbehind assertion */
# Line 1944  for (;;) Line 3319  for (;;)
3319      {      {
3320      *code++ = OP_REVERSE;      *code++ = OP_REVERSE;
3321      reverse_count = code;      reverse_count = code;
3322      *code++ = 0;      PUTINC(code, 0, 0);
     *code++ = 0;  
3323      }      }
3324    
3325    /* Now compile the branch */    /* Now compile the branch */
3326    
3327    if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,    if (!compile_branch(&options, brackets, &code, &ptr, errorptr,
3328        &branchreqchar, &branchcountlits, cd))          &branchfirstbyte, &branchreqbyte, &bc, cd))
3329      {      {
3330      *ptrptr = ptr;      *ptrptr = ptr;
3331      return FALSE;      return FALSE;
3332      }      }
3333    
3334    /* Fill in the length of the last branch */    /* If this is the first branch, the firstbyte and reqbyte values for the
3335      branch become the values for the regex. */
3336    
3337    length = code - last_branch;    if (*last_branch != OP_ALT)
3338    last_branch[1] = length >> 8;      {
3339    last_branch[2] = length & 255;      firstbyte = branchfirstbyte;
3340        reqbyte = branchreqbyte;
3341        }
3342    
3343    /* Save the last required character if all branches have the same; a current    /* If this is not the first branch, the first char and reqbyte have to
3344    value of -1 means unset, while -2 means "previous branch had no last required    match the values from all the previous branches, except that if the previous
3345    char".  */    value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3346      REQ_VARY for the regex. */
3347    
3348    if (*reqchar != -2)    else
3349      {      {
3350      if (branchreqchar >= 0)      /* If we previously had a firstbyte, but it doesn't match the new branch,
3351        we have to abandon the firstbyte for the regex, but if there was previously
3352        no reqbyte, it takes on the value of the old firstbyte. */
3353    
3354        if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3355        {        {
3356        if (*reqchar == -1) *reqchar = branchreqchar;        if (reqbyte < 0) reqbyte = firstbyte;
3357        else if (*reqchar != branchreqchar) *reqchar = -2;        firstbyte = REQ_NONE;
3358        }        }
     else *reqchar = -2;  
     }  
3359    
3360    /* Keep the shortest literal count */      /* If we (now or from before) have no firstbyte, a firstbyte from the
3361        branch becomes a reqbyte if there isn't a branch reqbyte. */
3362    
3363        if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3364            branchreqbyte = branchfirstbyte;
3365    
3366    if (branchcountlits < *countlits) *countlits = branchcountlits;      /* Now ensure that the reqbytes match */
3367    DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));  
3368        if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3369          reqbyte = REQ_NONE;
3370        else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
3371        }
3372    
3373    /* If lookbehind, check that this branch matches a fixed-length string,    /* If lookbehind, check that this branch matches a fixed-length string,
3374    and put the length into the OP_REVERSE item. Temporarily mark the end of    and put the length into the OP_REVERSE item. Temporarily mark the end of
# Line 1988  for (;;) Line 3376  for (;;)
3376    
3377    if (lookbehind)    if (lookbehind)
3378      {      {
3379        int length;
3380      *code = OP_END;      *code = OP_END;
3381      length = find_fixedlength(last_branch);      length = find_fixedlength(last_branch, options);
3382      DPRINTF(("fixed length = %d\n", length));      DPRINTF(("fixed length = %d\n", length));
3383      if (length < 0)      if (length < 0)
3384        {        {
3385        *errorptr = ERR25;        *errorptr = (length == -2)? ERR36 : ERR25;
3386        *ptrptr = ptr;        *ptrptr = ptr;
3387        return FALSE;        return FALSE;
3388        }        }
3389      reverse_count[0] = (length >> 8);      PUT(reverse_count, 0, length);
     reverse_count[1] = length & 255;  
3390      }      }
3391    
3392    /* Reached end of expression, either ')' or end of pattern. Insert a    /* Reached end of expression, either ')' or end of pattern. Go back through
3393    terminating ket and the length of the whole bracketed item, and return,    the alternative branches and reverse the chain of offsets, with the field in
3394    leaving the pointer at the terminating char. If any of the ims options    the BRA item now becoming an offset to the first alternative. If there are
3395    were changed inside the group, compile a resetting op-code following. */    no alternatives, it points to the end of the group. The length in the
3396      terminating ket is always the length of the whole bracketed item. If any of
3397      the ims options were changed inside the group, compile a resetting op-code
3398      following, except at the very end of the pattern. Return leaving the pointer
3399      at the terminating char. */
3400    
3401    if (*ptr != '|')    if (*ptr != '|')
3402      {      {
3403      length = code - start_bracket;      int length = code - last_branch;
3404      *code++ = OP_KET;      do
     *code++ = length >> 8;  
     *code++ = length & 255;  
     if (optchanged >= 0)  
3405        {        {
3406        *code++ = OP_OPT;        int prev_length = GET(last_branch, 1);
3407        *code++ = oldoptions;        PUT(last_branch, 1, length);
3408          length = prev_length;
3409          last_branch -= length;
3410        }        }
3411      *codeptr = code;      while (length > 0);
     *ptrptr = ptr;  
     return TRUE;  
     }  
   
   /* Another branch follows; insert an "or" node and advance the pointer. */  
   
   *code = OP_ALT;  
   last_branch = code;  
   code += 3;  
   ptr++;  
   }  
 /* Control never reaches here */  
 }  
   
   
   
   
 /*************************************************  
 *      Find first significant op code            *  
 *************************************************/  
3412    
3413  /* This is called by several functions that scan a compiled expression looking      /* Fill in the ket */
 for a fixed first character, or an anchoring op code etc. It skips over things  
 that do not influence this. For one application, a change of caseless option is  
 important.  
3414    
3415  Arguments:      *code = OP_KET;
3416    code       pointer to the start of the group      PUT(code, 1, code - start_bracket);
3417    options    pointer to external options      code += 1 + LINK_SIZE;
   optbit     the option bit whose changing is significant, or  
              zero if none are  
   optstop    TRUE to return on option change, otherwise change the options  
                value and continue  
3418    
3419  Returns:     pointer to the first significant opcode      /* Resetting option if needed */
 */  
3420    
3421  static const uschar*      if ((options & PCRE_IMS) != oldims && *ptr == ')')
 first_significant_code(const uschar *code, int *options, int optbit,  
   BOOL optstop)  
 {  
 for (;;)  
   {  
   switch ((int)*code)  
     {  
     case OP_OPT:  
     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))  
3422        {        {
3423        if (optstop) return code;        *code++ = OP_OPT;
3424        *options = (int)code[1];        *code++ = oldims;
3425        }        }
     code += 2;  
     break;  
3426    
3427      case OP_CREF:      /* Set values to pass back */
     code += 2;  
     break;  
3428    
3429      case OP_WORD_BOUNDARY:      *codeptr = code;
3430      case OP_NOT_WORD_BOUNDARY:      *ptrptr = ptr;
3431      code++;      *firstbyteptr = firstbyte;
3432      break;      *reqbyteptr = reqbyte;
3433        return TRUE;
3434        }
3435    
3436      case OP_ASSERT_NOT:    /* Another branch follows; insert an "or" node. Its length field points back
3437      case OP_ASSERTBACK:    to the previous branch while the bracket remains open. At the end the chain
3438      case OP_ASSERTBACK_NOT:    is reversed. It's done like this so that the start of the bracket has a
3439      do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);    zero offset until it is closed, making it possible to detect recursion. */
     code += 3;  
     break;  
3440    
3441      default:    *code = OP_ALT;
3442      return code;    PUT(code, 1, code - last_branch);
3443      }    bc.current = last_branch = code;
3444      code += 1 + LINK_SIZE;
3445      ptr++;
3446    }    }
3447  /* Control never reaches here */  /* Control never reaches here */
3448  }  }
# Line 2108  all of whose alternatives start with OP_ Line 3460  all of whose alternatives start with OP_
3460  it's anchored. However, if this is a multiline pattern, then only OP_SOD  it's anchored. However, if this is a multiline pattern, then only OP_SOD
3461  counts, since OP_CIRC can match in the middle.  counts, since OP_CIRC can match in the middle.
3462    
3463    We can also consider a regex to be anchored if OP_SOM starts all its branches.
3464    This is the code for \G, which means "match at start of match position, taking
3465    into account the match offset".
3466    
3467  A branch is also implicitly anchored if it starts with .* and DOTALL is set,  A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3468  because that will try the rest of the pattern at all possible matching points,  because that will try the rest of the pattern at all possible matching points,
3469  so there is no point trying them again.  so there is no point trying again.... er ....
3470    
3471    .... except when the .* appears inside capturing parentheses, and there is a
3472    subsequent back reference to those parentheses. We haven't enough information
3473    to catch that case precisely.
3474    
3475    At first, the best we could do was to detect when .* was in capturing brackets
3476    and the highest back reference was greater than or equal to that level.
3477    However, by keeping a bitmap of the first 31 back references, we can catch some
3478    of the more common cases more precisely.
3479    
3480  Arguments:  Arguments:
3481    code       points to start of expression (the bracket)    code           points to start of expression (the bracket)
3482    options    points to the options setting    options        points to the options setting
3483      bracket_map    a bitmap of which brackets we are inside while testing; this
3484                      handles up to substring 31; after that we just have to take
3485                      the less precise approach
3486      backref_map    the back reference bitmap
3487    
3488  Returns:     TRUE or FALSE  Returns:     TRUE or FALSE
3489  */  */
3490    
3491  static BOOL  static BOOL
3492  is_anchored(register const uschar *code, int *options)  is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3493      unsigned int backref_map)
3494  {  {
3495  do {  do {
3496     const uschar *scode = first_significant_code(code + 3, options,     const uschar *scode =
3497       PCRE_MULTILINE, FALSE);       first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE);
3498     register int op = *scode;     register int op = *scode;
3499     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)  
3500       { if (!is_anchored(scode, options)) return FALSE; }     /* Capturing brackets */
3501    
3502       if (op > OP_BRA)
3503         {
3504         int new_map;
3505         op -= OP_BRA;
3506         if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3507         new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3508         if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3509         }
3510    
3511       /* Other brackets */
3512    
3513       else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3514         {
3515         if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3516         }
3517    
3518       /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3519       are or may be referenced. */
3520    
3521     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3522              (*options & PCRE_DOTALL) != 0)              (*options & PCRE_DOTALL) != 0)
3523       { if (scode[1] != OP_ANY) return FALSE; }       {
3524     else if (op != OP_SOD &&       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3525         }
3526    
3527       /* Check for explicit anchoring */
3528    
3529       else if (op != OP_SOD && op != OP_SOM &&
3530             ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))             ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3531       return FALSE;       return FALSE;
3532     code += (code[1] << 8) + code[2];     code += GET(code, 1);
3533     }     }
3534  while (*code == OP_ALT);  while (*code == OP_ALT);   /* Loop for each alternative */
3535  return TRUE;  return TRUE;
3536  }  }
3537    
# Line 2149  return TRUE; Line 3544  return TRUE;
3544  /* This is called to find out if every branch starts with ^ or .* so that  /* This is called to find out if every branch starts with ^ or .* so that
3545  "first char" processing can be done to speed things up in multiline  "first char" processing can be done to speed things up in multiline
3546  matching and for non-DOTALL patterns that start with .* (which must start at  matching and for non-DOTALL patterns that start with .* (which must start at
3547  the beginning or after \n).  the beginning or after \n). As in the case of is_anchored() (see above), we
3548    have to take account of back references to capturing brackets that contain .*
3549    because in that case we can't make the assumption.
3550    
3551  Argument:  points to start of expression (the bracket)  Arguments:
3552  Returns:   TRUE or FALSE    code           points to start of expression (the bracket)
3553      bracket_map    a bitmap of which brackets we are inside while testing; this
3554                      handles up to substring 31; after that we just have to take
3555                      the less precise approach
3556      backref_map    the back reference bitmap
3557    
3558    Returns:         TRUE or FALSE
3559  */  */
3560    
3561  static BOOL  static BOOL
3562  is_startline(const uschar *code)  is_startline(const uschar *code, unsigned int bracket_map,
3563      unsigned int backref_map)
3564  {  {
3565  do {  do {
3566     const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);     const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0);
3567     register int op = *scode;     register int op = *scode;
3568     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)  
3569       { if (!is_startline(scode)) return FALSE; }     /* Capturing brackets */
3570    
3571       if (op > OP_BRA)
3572         {
3573         int new_map;
3574         op -= OP_BRA;
3575         if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3576         new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3577         if (!is_startline(scode, new_map, backref_map)) return FALSE;
3578         }
3579    
3580       /* Other brackets */
3581    
3582       else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3583         { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3584    
3585       /* .* means "start at start or after \n" if it isn't in brackets that
3586       may be referenced. */
3587    
3588     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3589       { if (scode[1] != OP_ANY) return FALSE; }       {
3590         if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3591         }
3592    
3593       /* Check for explicit circumflex */
3594    
3595     else if (op != OP_CIRC) return FALSE;     else if (op != OP_CIRC) return FALSE;
3596     code += (code[1] << 8) + code[2];     code += GET(code, 1);
3597     }     }
3598  while (*code == OP_ALT);  while (*code == OP_ALT);  /* Loop for each alternative */
3599  return TRUE;  return TRUE;
3600  }  }
3601    
3602    
3603    
3604  /*************************************************  /*************************************************
3605  *          Check for fixed first char            *  *       Check for asserted fixed first char      *
3606  *************************************************/  *************************************************/
3607    
3608  /* Try to find out if there is a fixed first character. This is called for  /* During compilation, the "first char" settings from forward assertions are
3609  unanchored expressions, as it speeds up their processing quite considerably.  discarded, because they can cause conflicts with actual literals that follow.
3610  Consider each alternative branch. If they all start with the same char, or with  However, if we end up without a first char setting for an unanchored pattern,
3611  a bracket all of whose alternatives start with the same char (recurse ad lib),  it is worth scanning the regex to see if there is an initial asserted first
3612  then we return that char, otherwise -1.  char. If all branches start with the same asserted char, or with a bracket all
3613    of whose alternatives start with the same asserted char (recurse ad lib), then
3614    we return that char, otherwise -1.
3615    
3616  Arguments:  Arguments:
3617    code       points to start of expression (the bracket)    code       points to start of expression (the bracket)
3618    options    pointer to the options (used to check casing changes)    options    pointer to the options (used to check casing changes)
3619      inassert   TRUE if in an assertion
3620    
3621  Returns:     -1 or the fixed first char  Returns:     -1 or the fixed first char
3622  */  */
3623    
3624  static int  static int
3625  find_firstchar(const uschar *code, int *options)  find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3626  {  {
3627  register int c = -1;  register int c = -1;
3628  do {  do {
3629     int d;     int d;
3630     const uschar *scode = first_significant_code(code + 3, options,     const uschar *scode =
3631       PCRE_CASELESS, TRUE);       first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS);
3632     register int op = *scode;     register int op = *scode;
3633    
3634     if (op >= OP_BRA) op = OP_BRA;     if (op >= OP_BRA) op = OP_BRA;
# Line 2212  do { Line 3642  do {
3642       case OP_ASSERT:       case OP_ASSERT:
3643       case OP_ONCE:       case OP_ONCE:
3644       case OP_COND:       case OP_COND:
3645       if ((d = find_firstchar(scode, options)) < 0) return -1;       if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3646           return -1;
3647       if (c < 0) c = d; else if (c != d) return -1;       if (c < 0) c = d; else if (c != d) return -1;
3648       break;       break;
3649    
# Line 2224  do { Line 3655  do {
3655    
3656       case OP_PLUS:       case OP_PLUS:
3657       case OP_MINPLUS:       case OP_MINPLUS:
3658       if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;       if (!inassert) return -1;
3659         if (c < 0)
3660           {
3661           c = scode[1];
3662           if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
3663           }
3664         else if (c != scode[1]) return -1;
3665       break;       break;
3666       }       }
3667    
3668     code += (code[1] << 8) + code[2];     code += GET(code, 1);
3669     }     }
3670  while (*code == OP_ALT);  while (*code == OP_ALT);
3671  return c;  return c;
# Line 2237  return c; Line 3674  return c;
3674    
3675    
3676    
   
3677  /*************************************************  /*************************************************
3678  *        Compile a Regular Expression            *  *        Compile a Regular Expression            *
3679  *************************************************/  *************************************************/
# Line 2261  pcre_compile(const char *pattern, int op Line 3697  pcre_compile(const char *pattern, int op
3697    int *erroroffset, const unsigned char *tables)    int *erroroffset, const unsigned char *tables)
3698  {  {
3699  real_pcre *re;  real_pcre *re;
3700  int length = 3;      /* For initial BRA plus length */  int length = 1 + LINK_SIZE;      /* For initial BRA plus length */
3701  int runlength;  int runlength;
3702  int c, reqchar, countlits;  int c, firstbyte, reqbyte;
3703  int bracount = 0;  int bracount = 0;
 int top_backref = 0;  
3704  int branch_extra = 0;  int branch_extra = 0;
3705  int branch_newextra;  int branch_newextra;
3706    int item_count = -1;
3707    int name_count = 0;
3708    int max_name_size = 0;
3709    #ifdef SUPPORT_UTF8
3710    int lastcharlength = 0;
3711    BOOL utf8;
3712    BOOL class_utf8;
3713    #endif
3714    BOOL inescq = FALSE;
3715  unsigned int brastackptr = 0;  unsigned int brastackptr = 0;
3716  size_t size;  size_t size;
3717  uschar *code;  uschar *code;
3718    const uschar *codestart;
3719  const uschar *ptr;  const uschar *ptr;
3720  compile_data compile_block;  compile_data compile_block;
3721  int brastack[BRASTACK_SIZE];  int brastack[BRASTACK_SIZE];
3722  uschar bralenstack[BRASTACK_SIZE];  uschar bralenstack[BRASTACK_SIZE];
3723    
 #ifdef DEBUG  
 uschar *code_base, *code_end;  
 #endif  
   
3724  /* We can't pass back an error message if errorptr is NULL; I guess the best we  /* We can't pass back an error message if errorptr is NULL; I guess the best we
3725  can do is just return NULL. */  can do is just return NULL. */
3726    
# Line 2295  if (erroroffset == NULL) Line 3736  if (erroroffset == NULL)
3736    }    }
3737  *erroroffset = 0;  *erroroffset = 0;
3738    
3739    /* Can't support UTF8 unless PCRE has been compiled to include the code. */
3740    
3741    #ifdef SUPPORT_UTF8
3742    utf8 = (options & PCRE_UTF8) != 0;
3743    #else
3744    if ((options & PCRE_UTF8) != 0)
3745      {
3746      *errorptr = ERR32;
3747      return NULL;
3748      }
3749    #endif
3750    
3751  if ((options & ~PUBLIC_OPTIONS) != 0)  if ((options & ~PUBLIC_OPTIONS) != 0)
3752    {    {
3753    *errorptr = ERR17;    *errorptr = ERR17;
# Line 2309  compile_block.fcc = tables + fcc_offset; Line 3762  compile_block.fcc = tables + fcc_offset;
3762  compile_block.cbits = tables + cbits_offset;  compile_block.cbits = tables + cbits_offset;
3763  compile_block.ctypes = tables + ctypes_offset;  compile_block.ctypes = tables + ctypes_offset;
3764    
3765    /* Maximum back reference and backref bitmap. This is updated for numeric
3766    references during the first pass, but for named references during the actual
3767    compile pass. The bitmap records up to 31 back references to help in deciding
3768    whether (.*) can be treated as anchored or not. */
3769    
3770    compile_block.top_backref = 0;
3771    compile_block.backref_map = 0;
3772    
3773  /* Reflect pattern for debugging output */  /* Reflect pattern for debugging output */
3774    
3775  DPRINTF(("------------------------------------------------------------------\n"));  DPRINTF(("------------------------------------------------------------------\n"));
# Line 2317  DPRINTF(("%s\n", pattern)); Line 3778  DPRINTF(("%s\n", pattern));
3778  /* The first thing to do is to make a pass over the pattern to compute the  /* The first thing to do is to make a pass over the pattern to compute the
3779  amount of store required to hold the compiled code. This does not have to be  amount of store required to hold the compiled code. This does not have to be
3780  perfect as long as errors are overestimates. At the same time we can detect any  perfect as long as errors are overestimates. At the same time we can detect any
3781  internal flag settings. Make an attempt to correct for any counted white space  flag settings right at the start, and extract them. Make an attempt to correct
3782  if an "extended" flag setting appears late in the pattern. We can't be so  for any counted white space if an "extended" flag setting appears late in the
3783  clever for #-comments. */  pattern. We can't be so clever for #-comments. */
3784    
3785  ptr = (const uschar *)(pattern - 1);  ptr = (const uschar *)(pattern - 1);
3786  while ((c = *(++ptr)) != 0)  while ((c = *(++ptr)) != 0)
3787    {    {
3788    int min, max;    int min, max;
3789    int class_charcount;    int class_optcount;
3790      int bracket_length;
3791      int duplength;
3792    
3793      /* If we are inside a \Q...\E sequence, all chars are literal */
3794    
3795      if (inescq) goto NORMAL_CHAR;
3796    
3797      /* Otherwise, first check for ignored whitespace and comments */
3798    
3799    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
3800      {      {
# Line 2334  while ((c = *(++ptr)) != 0) Line 3803  while ((c = *(++ptr)) != 0)
3803        {        {
3804        /* The space before the ; is to avoid a warning on a silly compiler        /* The space before the ; is to avoid a warning on a silly compiler
3805        on the Macintosh. */        on the Macintosh. */
3806        while ((c = *(++ptr)) != 0 && c != '\n') ;        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3807          if (c == 0) break;
3808        continue;        continue;
3809        }        }
3810      }      }
3811    
3812      item_count++;    /* Is zero for the first non-comment item */
3813    
3814    switch(c)    switch(c)
3815      {      {
3816      /* A backslashed item may be an escaped "normal" character or a      /* A backslashed item may be an escaped "normal" character or a
# Line 2358  while ((c = *(++ptr)) != 0) Line 3830  while ((c = *(++ptr)) != 0)
3830          goto NORMAL_CHAR;          goto NORMAL_CHAR;
3831          }          }
3832        }        }
3833    
3834        /* If \Q, enter "literal" mode */
3835    
3836        if (-c == ESC_Q)
3837          {
3838          inescq = TRUE;
3839          continue;
3840          }
3841    
3842        /* Other escapes need one byte, and are of length one for repeats */
3843    
3844      length++;      length++;
3845    #ifdef SUPPORT_UTF8
3846        lastcharlength = 1;
3847    #endif
3848    
3849      /* A back reference needs an additional char, plus either one or 5      /* A back reference needs an additional 2 bytes, plus either one or 5
3850      bytes for a repeat. We also need to keep the value of the highest      bytes for a repeat. We also need to keep the value of the highest
3851      back reference. */      back reference. */
3852    
3853      if (c <= -ESC_REF)      if (c <= -ESC_REF)
3854        {        {
3855        int refnum = -c - ESC_REF;        int refnum = -c - ESC_REF;
3856        if (refnum > top_backref) top_backref = refnum;        compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
3857        length++;   /* For single back reference */        if (refnum > compile_block.top_backref)
3858            compile_block.top_backref = refnum;
3859          length += 2;   /* For single back reference */
3860        if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))        if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
3861          {          {
3862          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
# Line 2382  while ((c = *(++ptr)) != 0) Line 3870  while ((c = *(++ptr)) != 0)
3870        }        }
3871      continue;      continue;
3872    
3873      case '^':      case '^':     /* Single-byte metacharacters */
3874      case '.':      case '.':
3875      case '$':      case '$':
3876      case '*':     /* These repeats won't be after brackets; */      length++;
3877      case '+':     /* those are handled separately */  #ifdef SUPPORT_UTF8
3878        lastcharlength = 1;
3879    #endif
3880        continue;
3881    
3882        case '*':            /* These repeats won't be after brackets; */
3883        case '+':            /* those are handled separately */
3884      case '?':      case '?':
3885      length++;      length++;
3886        goto POSESSIVE;      /* A few lines below */
3887    
3888        /* This covers the cases of braced repeats after a single char, metachar,
3889        class, or back reference. */
3890    
3891        case '{':
3892        if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
3893        ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
3894        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3895    
3896        /* These special cases just insert one extra opcode */
3897    
3898        if ((min == 0 && (max == 1 || max == -1)) ||
3899          (min == 1 && max == -1))
3900            length++;
3901    
3902        /* These cases might insert additional copies of a preceding character. */
3903    
3904        else
3905          {
3906    #ifdef SUPPORT_UTF8
3907          /* In UTF-8 mode, we should find the length in lastcharlength */
3908          if (utf8)
3909            {
3910            if (min != 1)
3911              {
3912              length -= lastcharlength;   /* Uncount the original char or metachar */
3913              if (min > 0) length += 3 + lastcharlength;
3914              }
3915            length += lastcharlength + ((max > 0)? 3 : 1);
3916            }
3917          else
3918    #endif
3919    
3920          /* Not UTF-8 mode: all characters are one byte */
3921            {
3922            if (min != 1)
3923              {
3924              length--;   /* Uncount the original char or metachar */
3925              if (min > 0) length += 4;
3926              }
3927    
3928            length += (max > 0)? 4 : 2;
3929            }
3930          }
3931    
3932        if (ptr[1] == '?') ptr++;      /* Needs no extra length */
3933    
3934        POSESSIVE:                     /* Test for possessive quantifier */
3935        if (ptr[1] == '+')
3936          {
3937          ptr++;
3938          length += 2 + 2*LINK_SIZE;   /* Allow for atomic brackets */
3939          }
3940        continue;
3941    
3942        /* An alternation contains an offset to the next branch or ket. If any ims
3943        options changed in the previous branch(es), and/or if we are in a
3944        lookbehind assertion, extra space will be needed at the start of the
3945        branch. This is handled by branch_extra. */
3946    
3947        case '|':
3948        length += 1 + LINK_SIZE + branch_extra;
3949      continue;      continue;
3950    
3951      /* This covers the cases of repeats after a single char, metachar, class,      /* A character class uses 33 characters provided that all the character
3952      or back reference. */      values are less than 256. Otherwise, it uses a bit map for low valued
3953        characters, and individual items for others. Don't worry about character
3954        types that aren't allowed in classes - they'll get picked up during the
3955        compile. A character class that contains only one single-byte character
3956        uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
3957        where we can. (In UTF-8 mode we can do this only for chars < 128.) */
3958    
3959        case '[':
3960        class_optcount = 0;
3961    
3962    #ifdef SUPPORT_UTF8
3963        class_utf8 = FALSE;
3964    #endif
3965    
3966        if (*(++ptr) == '^') ptr++;
3967    
3968        /* Written as a "do" so that an initial ']' is taken as data */
3969    
3970        if (*ptr != 0) do
3971          {
3972          /* Inside \Q...\E everything is literal except \E */
3973    
3974          if (inescq)
3975            {
3976            if (*ptr != '\\' || ptr[1] != 'E') goto NON_SPECIAL_CHARACTER;
3977            inescq = FALSE;
3978            ptr += 1;
3979            continue;
3980            }
3981    
3982          /* Outside \Q...\E, check for escapes */
3983    
3984          if (*ptr == '\\')
3985            {
3986    #ifdef SUPPORT_UTF8
3987            int prevchar = ptr[-1];
3988    #endif
3989            int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
3990              &compile_block);
3991            if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3992    
3993            /* \b is backspace inside a class */
3994    
3995            if (-ch == ESC_b) ch = '\b';
3996    
3997            /* \Q enters quoting mode */
3998    
3999            if (-ch == ESC_Q)
4000              {
4001              inescq = TRUE;
4002              continue;
4003              }
4004    
4005            /* Handle escapes that turn into characters */
4006    
4007            if (ch >= 0)
4008              {
4009    #ifdef SUPPORT_UTF8
4010              if (utf8)
4011                {
4012                if (ch > 127) class_optcount = 10;  /* Ensure > 1 */
4013                if (ch > 255)
4014                  {
4015                  uschar buffer[6];
4016                  if (!class_utf8)
4017                    {
4018                    class_utf8 = TRUE;
4019                    length += LINK_SIZE + 1 + 1;
4020                    }
4021                  length += 1 + ord2utf8(ch, buffer);
4022    
4023                  /* If this wide character is preceded by '-', add an extra 2 to
4024                  the length in case the previous character was < 128, because in
4025                  this case the whole range will be put into the list. */
4026    
4027                  if (prevchar == '-') length += 2;
4028                  }
4029                }
4030    #endif
4031              class_optcount++;            /* for possible optimization */
4032              }
4033            else class_optcount = 10;      /* \d, \s etc; make sure > 1 */
4034            }
4035    
4036          /* Check the syntax for POSIX stuff. The bits we actually handle are
4037          checked during the real compile phase. */
4038    
4039          else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4040            {
4041            ptr++;
4042            class_optcount = 10;    /* Make sure > 1 */
4043            }
4044    
4045          /* Anything else just increments the possible optimization count. If
4046          there are wide characters, we are going to have to use an XCLASS. */
4047    
4048          else
4049            {
4050            NON_SPECIAL_CHARACTER:
4051            class_optcount++;
4052    
4053    #ifdef SUPPORT_UTF8
4054            if (utf8)
4055              {
4056              int ch;
4057              int extra = 0;
4058              GETCHARLEN(ch, ptr, extra);
4059              if (ch > 127) class_optcount = 10;   /* No optimization possible */
4060              if (ch > 255)
4061                {
4062                if (!class_utf8)
4063                  {
4064                  class_utf8 = TRUE;
4065                  length += LINK_SIZE + 1 + 1;
4066                  }
4067                length += 2 + extra;
4068    
4069      case '{':              /* If this wide character is preceded by '-', add an extra 2 to
4070      if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;              the length in case the previous character was < 128, because in
4071      ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);              this case the whole range will be put into the list. */
     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;  
     if ((min == 0 && (max == 1 || max == -1)) ||  
       (min == 1 && max == -1))  
         length++;  
     else  
       {  
       length--;   /* Uncount the original char or metachar */  
       if (min == 1) length++; else if (min > 0) length += 4;  
       if (max > 0) length += 4; else length += 2;  
       }  
     if (ptr[1] == '?') ptr++;  
     continue;  
4072    
4073      /* An alternation contains an offset to the next branch or ket. If any ims              if (ptr[-1] == '-') length += 2;
     options changed in the previous branch(es), and/or if we are in a  
     lookbehind assertion, extra space will be needed at the start of the  
     branch. This is handled by branch_extra. */  
4074    
4075      case '|':              /* Advance to the end of this character */
     length += 3 + branch_extra;  
     continue;  
4076    
4077      /* A character class uses 33 characters. Don't worry about character types              ptr += extra;
4078      that aren't allowed in classes - they'll get picked up during the compile.              }
4079      A character class that contains only one character uses 2 or 3 bytes,            }
4080      depending on whether it is negated or not. Notice this where we can. */  #endif
4081            }
4082          }
4083        while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4084    
4085      case '[':      if (*ptr == 0)                          /* Missing terminating ']' */
     class_charcount = 0;  
     if (*(++ptr) == '^') ptr++;  
     do  
4086        {        {
4087        if (*ptr == '\\')        *errorptr = ERR6;
4088          {        goto PCRE_ERROR_RETURN;
         int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,  
           &compile_block);  
         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;  
         if (-ch == ESC_b) class_charcount++; else class_charcount = 10;  
         }  
       else class_charcount++;  
       ptr++;  
4089        }        }
     while (*ptr != 0 && *ptr != ']');  
4090    
4091      /* Repeats for negated single chars are handled by the general code */      /* We can optimize when there was only one optimizable character. Repeats
4092        for positive and negated single one-byte chars are handled by the general
4093        code. Here, we handle repeats for the class opcodes. */
4094    
4095      if (class_charcount == 1) length += 3; else      if (class_optcount == 1) length += 3; else
4096        {        {
4097        length += 33;        length += 33;
4098    
# Line 2466  while ((c = *(++ptr)) != 0) Line 4115  while ((c = *(++ptr)) != 0)
4115    
4116      case '(':      case '(':
4117      branch_newextra = 0;      branch_newextra = 0;
4118        bracket_length = 1 + LINK_SIZE;
4119    
4120      /* Handle special forms of bracket, which all start (? */      /* Handle special forms of bracket, which all start (? */
4121    
# Line 2499  while ((c = *(++ptr)) != 0) Line 4149  while ((c = *(++ptr)) != 0)
4149          ptr += 2;          ptr += 2;
4150          break;          break;
4151    
4152          /* A recursive call to the regex is an extension, to provide the          /* (?R) specifies a recursive call to the regex, which is an extension
4153          facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */          to provide the facility which can be obtained by (?p{perl-code}) in
4154            Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
4155