/[pcre]/code/trunk/pcre.c
ViewVC logotype

Diff of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 62 by nigel, Sat Feb 24 21:39:54 2007 UTC revision 63 by nigel, Sat Feb 24 21:40:03 2007 UTC
# Line 9  the file Tech.Notes for some information Line 9  the file Tech.Notes for some information
9    
10  Written by: Philip Hazel <ph10@cam.ac.uk>  Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12             Copyright (c) 1997-2001 University of Cambridge             Copyright (c) 1997-2003 University of Cambridge
13    
14  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
15  Permission is granted to anyone to use this software for any purpose on any  Permission is granted to anyone to use this software for any purpose on any
# Line 32  restrictions: Line 32  restrictions:
32  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
33  */  */
34    
   
35  /* Define DEBUG to get debugging output on stdout. */  /* Define DEBUG to get debugging output on stdout. */
36    
37  /* #define DEBUG */  /* #define DEBUG */
# Line 69  compile time. */ Line 68  compile time. */
68  #define BRASTACK_SIZE 200  #define BRASTACK_SIZE 200
69    
70    
71    
72    /* Maximum number of ints of offset to save on the stack for recursive calls.
73    If the offset vector is bigger, malloc is used. This should be a multiple of 3,
74    because the offset vector is always a multiple of 3 long. */
75    
76    #define REC_STACK_SAVE_MAX 30
77    
78    
79  /* The number of bytes in a literal character string above which we can't add  /* The number of bytes in a literal character string above which we can't add
80  any more is different when UTF-8 characters may be encountered. */  any more is set at 250 in order to allow for UTF-8 characters. (In theory it
81    could be 255 when UTF-8 support is excluded, but that means that some of the
82    test output would be different, which just complicates things.) */
83    
 #ifdef SUPPORT_UTF8  
84  #define MAXLIT 250  #define MAXLIT 250
 #else  
 #define MAXLIT 255  
 #endif  
85    
86    
87    /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
88    the definition is next to the definition of the opcodes in internal.h. */
89    
90    static uschar OP_lengths[] = { OP_LENGTHS };
91    
92  /* Min and max values for the common repeats; for the maxima, 0 => infinity */  /* Min and max values for the common repeats; for the maxima, 0 => infinity */
93    
94  static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };  static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
95  static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };  static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
96    
 /* Text forms of OP_ values and things, for debugging (not all used) */  
   
 #ifdef DEBUG  
 static const char *OP_names[] = {  
   "End", "\\A", "\\B", "\\b", "\\D", "\\d",  
   "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",  
   "Opt", "^", "$", "Any", "chars", "not",  
   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",  
   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",  
   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",  
   "*", "*?", "+", "+?", "?", "??", "{", "{",  
   "class", "Ref", "Recurse",  
   "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",  
   "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",  
   "Brazero", "Braminzero", "Branumber", "Bra"  
 };  
 #endif  
   
97  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
98  are simple data values; negative values are for special things like \d and so  are simple data values; negative values are for special things like \d and so
99  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
# Line 110  is invalid. */ Line 102  is invalid. */
102  static const short int escapes[] = {  static const short int escapes[] = {
103      0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */      0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
104      0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */      0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
105    '@', -ESC_A, -ESC_B,      0, -ESC_D,      0,      0,      0,   /* @ - G */    '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
106      0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */      0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
107      0,      0,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */      0, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
108      0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */      0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
109    '`',      7, -ESC_b,      0, -ESC_d,  ESC_E,  ESC_F,      0,   /* ` - g */    '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
110      0,      0,      0,      0,      0,      0,  ESC_N,      0,   /* h - o */      0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */
111      0,      0,  ESC_R, -ESC_s,  ESC_T,      0,      0, -ESC_w,   /* p - w */      0,      0,  ESC_r, -ESC_s,  ESC_t,      0,      0, -ESC_w,   /* p - w */
112      0,      0, -ESC_z                                            /* x - z */      0,      0, -ESC_z                                            /* x - z */
113  };  };
114    
# Line 126  as this is assumed for handling case ind Line 118  as this is assumed for handling case ind
118    
119  static const char *posix_names[] = {  static const char *posix_names[] = {
120    "alpha", "lower", "upper",    "alpha", "lower", "upper",
121    "alnum", "ascii", "cntrl", "digit", "graph",    "alnum", "ascii", "blank", "cntrl", "digit", "graph",
122    "print", "punct", "space", "word",  "xdigit" };    "print", "punct", "space", "word",  "xdigit" };
123    
124  static const uschar posix_name_lengths[] = {  static const uschar posix_name_lengths[] = {
125    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
126    
127  /* Table of class bit maps for each POSIX class; up to three may be combined  /* Table of class bit maps for each POSIX class; up to three may be combined
128  to form the class. */  to form the class. The table for [:blank:] is dynamically modified to remove
129    the vertical space characters. */
130    
131  static const int posix_class_maps[] = {  static const int posix_class_maps[] = {
132    cbit_lower, cbit_upper, -1,             /* alpha */    cbit_lower, cbit_upper, -1,             /* alpha */
# Line 141  static const int posix_class_maps[] = { Line 134  static const int posix_class_maps[] = {
134    cbit_upper, -1,         -1,             /* upper */    cbit_upper, -1,         -1,             /* upper */
135    cbit_digit, cbit_lower, cbit_upper,     /* alnum */    cbit_digit, cbit_lower, cbit_upper,     /* alnum */
136    cbit_print, cbit_cntrl, -1,             /* ascii */    cbit_print, cbit_cntrl, -1,             /* ascii */
137      cbit_space, -1,         -1,             /* blank - a GNU extension */
138    cbit_cntrl, -1,         -1,             /* cntrl */    cbit_cntrl, -1,         -1,             /* cntrl */
139    cbit_digit, -1,         -1,             /* digit */    cbit_digit, -1,         -1,             /* digit */
140    cbit_graph, -1,         -1,             /* graph */    cbit_graph, -1,         -1,             /* graph */
141    cbit_print, -1,         -1,             /* print */    cbit_print, -1,         -1,             /* print */
142    cbit_punct, -1,         -1,             /* punct */    cbit_punct, -1,         -1,             /* punct */
143    cbit_space, -1,         -1,             /* space */    cbit_space, -1,         -1,             /* space */
144    cbit_word,  -1,         -1,             /* word */    cbit_word,  -1,         -1,             /* word - a Perl extension */
145    cbit_xdigit,-1,         -1              /* xdigit */    cbit_xdigit,-1,         -1              /* xdigit */
146  };  };
147    
# Line 156  static const int posix_class_maps[] = { Line 150  static const int posix_class_maps[] = {
150    
151  static BOOL  static BOOL
152    compile_regex(int, int, int *, uschar **, const uschar **, const char **,    compile_regex(int, int, int *, uschar **, const uschar **, const char **,
153      BOOL, int, int *, int *, compile_data *);      BOOL, int, int *, int *, branch_chain *, compile_data *);
154    
155  /* Structure for building a chain of data that actually lives on the  /* Structure for building a chain of data that actually lives on the
156  stack, for holding the values of the subject pointer at the start of each  stack, for holding the values of the subject pointer at the start of each
# Line 173  typedef struct eptrblock { Line 167  typedef struct eptrblock {
167  #define match_condassert   0x01    /* Called to check a condition assertion */  #define match_condassert   0x01    /* Called to check a condition assertion */
168  #define match_isgroup      0x02    /* Set if start of bracketed group */  #define match_isgroup      0x02    /* Set if start of bracketed group */
169    
170    /* Non-error returns from the match() function. Error returns are externally
171    defined PCRE_ERROR_xxx codes, which are all negative. */
172    
173    #define MATCH_MATCH        1
174    #define MATCH_NOMATCH      0
175    
176    
177    
178  /*************************************************  /*************************************************
# Line 181  typedef struct eptrblock { Line 181  typedef struct eptrblock {
181    
182  /* PCRE is thread-clean and doesn't use any global variables in the normal  /* PCRE is thread-clean and doesn't use any global variables in the normal
183  sense. However, it calls memory allocation and free functions via the two  sense. However, it calls memory allocation and free functions via the two
184  indirections below, which are can be changed by the caller, but are shared  indirections below, and it can optionally do callouts. These values can be
185  between all threads. */  changed by the caller, but are shared between all threads. However, when
186    compiling for Virtual Pascal, things are done differently (see pcre.in). */
187    
188    #ifndef VPCOMPAT
189  void *(*pcre_malloc)(size_t) = malloc;  void *(*pcre_malloc)(size_t) = malloc;
190  void  (*pcre_free)(void *) = free;  void  (*pcre_free)(void *) = free;
191    int   (*pcre_callout)(pcre_callout_block *) = NULL;
192    #endif
193    
194    
195  /*************************************************  /*************************************************
# Line 198  byte. The macros for character handling Line 201  byte. The macros for character handling
201  byte-mode, and more complicated ones for UTF-8 characters. */  byte-mode, and more complicated ones for UTF-8 characters. */
202    
203  #ifndef SUPPORT_UTF8  #ifndef SUPPORT_UTF8
204    #define GETCHAR(c, eptr) c = *eptr;
205  #define GETCHARINC(c, eptr) c = *eptr++;  #define GETCHARINC(c, eptr) c = *eptr++;
206    #define GETCHARINCTEST(c, eptr) c = *eptr++;
207  #define GETCHARLEN(c, eptr, len) c = *eptr;  #define GETCHARLEN(c, eptr, len) c = *eptr;
208  #define BACKCHAR(eptr)  #define BACKCHAR(eptr)
209    
210  #else   /* SUPPORT_UTF8 */  #else   /* SUPPORT_UTF8 */
211    
212  /* Get the next UTF-8 character, advancing the pointer */  /* Get the next UTF-8 character, not advancing the pointer. This is called when
213    we know we are in UTF-8 mode. */
214    
215    #define GETCHAR(c, eptr) \
216      c = *eptr; \
217      if ((c & 0xc0) == 0xc0) \
218        { \
219        int i; \
220        int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
221        int s = 6*a; \
222        c = (c & utf8_table3[a]) << s; \
223        for (i = 1; i <= a; i++) \
224          { \
225          s -= 6; \
226          c |= (eptr[i] & 0x3f) << s; \
227          } \
228        }
229    
230    /* Get the next UTF-8 character, advancing the pointer. This is called when we
231    know we are in UTF-8 mode. */
232    
233  #define GETCHARINC(c, eptr) \  #define GETCHARINC(c, eptr) \
234    c = *eptr++; \    c = *eptr++; \
235      if ((c & 0xc0) == 0xc0) \
236        { \
237        int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
238        int s = 6*a; \
239        c = (c & utf8_table3[a]) << s; \
240        while (a-- > 0) \
241          { \
242          s -= 6; \
243          c |= (*eptr++ & 0x3f) << s; \
244          } \
245        }
246    
247    /* Get the next character, testing for UTF-8 mode, and advancing the pointer */
248    
249    #define GETCHARINCTEST(c, eptr) \
250      c = *eptr++; \
251    if (md->utf8 && (c & 0xc0) == 0xc0) \    if (md->utf8 && (c & 0xc0) == 0xc0) \
252      { \      { \
253      int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \      int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
# Line 220  byte-mode, and more complicated ones for Line 260  byte-mode, and more complicated ones for
260        } \        } \
261      }      }
262    
263  /* Get the next UTF-8 character, not advancing the pointer, setting length */  /* Get the next UTF-8 character, not advancing the pointer, incrementing length
264    if there are extra bytes. This is called when we know we are in UTF-8 mode. */
265    
266  #define GETCHARLEN(c, eptr, len) \  #define GETCHARLEN(c, eptr, len) \
267    c = *eptr; \    c = *eptr; \
268    len = 1; \    if ((c & 0xc0) == 0xc0) \
   if (md->utf8 && (c & 0xc0) == 0xc0) \  
269      { \      { \
270      int i; \      int i; \
271      int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \      int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
# Line 240  byte-mode, and more complicated ones for Line 280  byte-mode, and more complicated ones for
280      }      }
281    
282  /* If the pointer is not at the start of a character, move it back until  /* If the pointer is not at the start of a character, move it back until
283  it is. */  it is. Called only in UTF-8 mode. */
284    
285  #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;  #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
286    
# Line 323  return i + 1; Line 363  return i + 1;
363    
364    
365  /*************************************************  /*************************************************
366    *         Print compiled regex                   *
367    *************************************************/
368    
369    /* The code for doing this is held in a separate file that is also included in
370    pcretest.c. It defines a function called print_internals(). */
371    
372    #ifdef DEBUG
373    #include "printint.c"
374    #endif
375    
376    
377    
378    /*************************************************
379  *          Return version string                 *  *          Return version string                 *
380  *************************************************/  *************************************************/
381    
# Line 352  Therefore, I haven't changed the API for Line 405  Therefore, I haven't changed the API for
405  Arguments:  Arguments:
406    external_re   points to compiled code    external_re   points to compiled code
407    optptr        where to pass back the options    optptr        where to pass back the options
408    first_char    where to pass back the first character,    first_byte    where to pass back the first character,
409                  or -1 if multiline and all branches start ^,                  or -1 if multiline and all branches start ^,
410                  or -2 otherwise                  or -2 otherwise
411    
# Line 361  Returns:        number of capturing subp Line 414  Returns:        number of capturing subp
414  */  */
415    
416  int  int
417  pcre_info(const pcre *external_re, int *optptr, int *first_char)  pcre_info(const pcre *external_re, int *optptr, int *first_byte)
418  {  {
419  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
420  if (re == NULL) return PCRE_ERROR_NULL;  if (re == NULL) return PCRE_ERROR_NULL;
421  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
422  if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);  if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
423  if (first_char != NULL)  if (first_byte != NULL)
424    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :    *first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
425       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
426  return re->top_bracket;  return re->top_bracket;
427  }  }
# Line 384  that additional items can be added compa Line 437  that additional items can be added compa
437    
438  Arguments:  Arguments:
439    external_re      points to compiled code    external_re      points to compiled code
440    external_study   points to study data, or NULL    extra_data       points to extra data, or NULL
441    what             what information is required    what             what information is required
442    where            where to put the information    where            where to put the information
443    
# Line 392  Returns:           0 if data returned, n Line 445  Returns:           0 if data returned, n
445  */  */
446    
447  int  int
448  pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,  pcre_fullinfo(const pcre *external_re, const pcre_extra *extra_data, int what,
449    void *where)    void *where)
450  {  {
451  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
452  const real_pcre_extra *study = (const real_pcre_extra *)study_data;  const pcre_study_data *study = NULL;
453    
454  if (re == NULL || where == NULL) return PCRE_ERROR_NULL;  if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
455  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
456    
457    if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
458      study = extra_data->study_data;
459    
460  switch (what)  switch (what)
461    {    {
462    case PCRE_INFO_OPTIONS:    case PCRE_INFO_OPTIONS:
# Line 411  switch (what) Line 467  switch (what)
467    *((size_t *)where) = re->size;    *((size_t *)where) = re->size;
468    break;    break;
469    
470      case PCRE_INFO_STUDYSIZE:
471      *((size_t *)where) = (study == NULL)? 0 : study->size;
472      break;
473    
474    case PCRE_INFO_CAPTURECOUNT:    case PCRE_INFO_CAPTURECOUNT:
475    *((int *)where) = re->top_bracket;    *((int *)where) = re->top_bracket;
476    break;    break;
# Line 419  switch (what) Line 479  switch (what)
479    *((int *)where) = re->top_backref;    *((int *)where) = re->top_backref;
480    break;    break;
481    
482    case PCRE_INFO_FIRSTCHAR:    case PCRE_INFO_FIRSTBYTE:
483    *((int *)where) =    *((int *)where) =
484      ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :      ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
485      ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;      ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
486    break;    break;
487    
# Line 433  switch (what) Line 493  switch (what)
493    
494    case PCRE_INFO_LASTLITERAL:    case PCRE_INFO_LASTLITERAL:
495    *((int *)where) =    *((int *)where) =
496      ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;      ((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1;
497      break;
498    
499      case PCRE_INFO_NAMEENTRYSIZE:
500      *((int *)where) = re->name_entry_size;
501      break;
502    
503      case PCRE_INFO_NAMECOUNT:
504      *((int *)where) = re->name_count;
505      break;
506    
507      case PCRE_INFO_NAMETABLE:
508      *((const uschar **)where) = (const uschar *)re + sizeof(real_pcre);
509      break;
510    
511      default: return PCRE_ERROR_BADOPTION;
512      }
513    
514    return 0;
515    }
516    
517    
518    
519    /*************************************************
520    * Return info about what features are configured *
521    *************************************************/
522    
523    /* This is a function which has an extensible interface so that additional items
524    can be added compatibly.
525    
526    Arguments:
527      what             what information is required
528      where            where to put the information
529    
530    Returns:           0 if data returned, negative on error
531    */
532    
533    int
534    pcre_config(int what, void *where)
535    {
536    switch (what)
537      {
538      case PCRE_CONFIG_UTF8:
539      #ifdef SUPPORT_UTF8
540      *((int *)where) = 1;
541      #else
542      *((int *)where) = 0;
543      #endif
544      break;
545    
546      case PCRE_CONFIG_NEWLINE:
547      *((int *)where) = NEWLINE;
548      break;
549    
550      case PCRE_CONFIG_LINK_SIZE:
551      *((int *)where) = LINK_SIZE;
552      break;
553    
554      case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
555      *((int *)where) = POSIX_MALLOC_THRESHOLD;
556      break;
557    
558      case PCRE_CONFIG_MATCH_LIMIT:
559      *((unsigned int *)where) = MATCH_LIMIT;
560    break;    break;
561    
562    default: return PCRE_ERROR_BADOPTION;    default: return PCRE_ERROR_BADOPTION;
# Line 525  else Line 648  else
648    const uschar *oldptr;    const uschar *oldptr;
649    switch (c)    switch (c)
650      {      {
651        /* A number of Perl escapes are not handled by PCRE. We give an explicit
652        error. */
653    
654        case 'l':
655        case 'L':
656        case 'N':
657        case 'p':
658        case 'P':
659        case 'u':
660        case 'U':
661        case 'X':
662        *errorptr = ERR37;
663        break;
664    
665      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
666      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
667      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 746  return p; Line 883  return p;
883    
884    
885  /*************************************************  /*************************************************
886    *      Find first significant op code            *
887    *************************************************/
888    
889    /* This is called by several functions that scan a compiled expression looking
890    for a fixed first character, or an anchoring op code etc. It skips over things
891    that do not influence this. For some calls, a change of option is important.
892    
893    Arguments:
894      code       pointer to the start of the group
895      options    pointer to external options
896      optbit     the option bit whose changing is significant, or
897                   zero if none are
898    
899    Returns:     pointer to the first significant opcode
900    */
901    
902    static const uschar*
903    first_significant_code(const uschar *code, int *options, int optbit)
904    {
905    for (;;)
906      {
907      switch ((int)*code)
908        {
909        case OP_OPT:
910        if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
911          *options = (int)code[1];
912        code += 2;
913        break;
914    
915        case OP_ASSERT_NOT:
916        case OP_ASSERTBACK:
917        case OP_ASSERTBACK_NOT:
918        do code += GET(code, 1); while (*code == OP_ALT);
919        /* Fall through */
920    
921        case OP_CALLOUT:
922        case OP_CREF:
923        case OP_BRANUMBER:
924        case OP_WORD_BOUNDARY:
925        case OP_NOT_WORD_BOUNDARY:
926        code += OP_lengths[*code];
927        break;
928    
929        default:
930        return code;
931        }
932      }
933    /* Control never reaches here */
934    }
935    
936    
937    
938    
939    /*************************************************
940  *        Find the fixed length of a pattern      *  *        Find the fixed length of a pattern      *
941  *************************************************/  *************************************************/
942    
943  /* Scan a pattern and compute the fixed length of subject that will match it,  /* Scan a pattern and compute the fixed length of subject that will match it,
944  if the length is fixed. This is needed for dealing with backward assertions.  if the length is fixed. This is needed for dealing with backward assertions.
945    In UTF8 mode, the result is in characters rather than bytes.
946    
947  Arguments:  Arguments:
948    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
949    options  the compiling options    options  the compiling options
950    
951  Returns:   the fixed length, or -1 if there is no fixed length  Returns:   the fixed length, or -1 if there is no fixed length,
952                 or -2 if \C was encountered
953  */  */
954    
955  static int  static int
# Line 765  find_fixedlength(uschar *code, int optio Line 958  find_fixedlength(uschar *code, int optio
958  int length = -1;  int length = -1;
959    
960  register int branchlength = 0;  register int branchlength = 0;
961  register uschar *cc = code + 3;  register uschar *cc = code + 1 + LINK_SIZE;
962    
963  /* Scan along the opcodes for this branch. If we get to the end of the  /* Scan along the opcodes for this branch. If we get to the end of the
964  branch, check the length against that of the other branches. */  branch, check the length against that of the other branches. */
# Line 782  for (;;) Line 975  for (;;)
975      case OP_ONCE:      case OP_ONCE:
976      case OP_COND:      case OP_COND:
977      d = find_fixedlength(cc, options);      d = find_fixedlength(cc, options);
978      if (d < 0) return -1;      if (d < 0) return d;
979      branchlength += d;      branchlength += d;
980      do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
981      cc += 3;      cc += 1 + LINK_SIZE;
982      break;      break;
983    
984      /* Reached end of a branch; if it's a ket it is the end of a nested      /* Reached end of a branch; if it's a ket it is the end of a nested
# Line 800  for (;;) Line 993  for (;;)
993      if (length < 0) length = branchlength;      if (length < 0) length = branchlength;
994        else if (length != branchlength) return -1;        else if (length != branchlength) return -1;
995      if (*cc != OP_ALT) return length;      if (*cc != OP_ALT) return length;
996      cc += 3;      cc += 1 + LINK_SIZE;
997      branchlength = 0;      branchlength = 0;
998      break;      break;
999    
# Line 810  for (;;) Line 1003  for (;;)
1003      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
1004      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1005      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
1006      do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
1007      cc += 3;      /* Fall through */
     break;  
1008    
1009      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1010    
1011      case OP_REVERSE:      case OP_REVERSE:
1012      case OP_BRANUMBER:      case OP_BRANUMBER:
1013      case OP_CREF:      case OP_CREF:
     cc++;  
     /* Fall through */  
   
1014      case OP_OPT:      case OP_OPT:
1015      cc++;      case OP_CALLOUT:
     /* Fall through */  
   
1016      case OP_SOD:      case OP_SOD:
1017        case OP_SOM:
1018      case OP_EOD:      case OP_EOD:
1019      case OP_EODN:      case OP_EODN:
1020      case OP_CIRC:      case OP_CIRC:
1021      case OP_DOLL:      case OP_DOLL:
1022      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
1023      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
1024      cc++;      cc += OP_lengths[*cc];
1025      break;      break;
1026    
1027      /* Handle char strings. In UTF-8 mode we must count characters, not bytes.      /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
1028      This requires a scan of the string, unfortunately. We assume valid UTF-8      This requires a scan of the string, unfortunately. We assume valid UTF-8
1029      strings, so all we do is reduce the length by one for byte whose bits are      strings, so all we do is reduce the length by one for every byte whose bits
1030      10xxxxxx. */      are 10xxxxxx. */
1031    
1032      case OP_CHARS:      case OP_CHARS:
1033      branchlength += *(++cc);      branchlength += *(++cc);
1034  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1035      for (d = 1; d <= *cc; d++)      if ((options & PCRE_UTF8) != 0)
1036        if ((cc[d] & 0xc0) == 0x80) branchlength--;        for (d = 1; d <= *cc; d++)
1037            if ((cc[d] & 0xc0) == 0x80) branchlength--;
1038  #endif  #endif
1039      cc += *cc + 1;      cc += *cc + 1;
1040      break;      break;
1041    
1042      /* Handle exact repetitions */      /* Handle exact repetitions. The count is already in characters, but we
1043        need to skip over a multibyte character in UTF8 mode.  */
1044    
1045      case OP_EXACT:      case OP_EXACT:
1046        branchlength += GET2(cc,1);
1047        cc += 4;
1048    #ifdef SUPPORT_UTF8
1049        if ((options & PCRE_UTF8) != 0)
1050          {
1051          while((*cc & 0x80) == 0x80) cc++;
1052          }
1053    #endif
1054        break;
1055    
1056      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1057      branchlength += (cc[1] << 8) + cc[2];      branchlength += GET2(cc,1);
1058      cc += 4;      cc += 4;
1059      break;      break;
1060    
# Line 871  for (;;) Line 1071  for (;;)
1071      cc++;      cc++;
1072      break;      break;
1073    
1074        /* The single-byte matcher isn't allowed */
1075    
1076        case OP_ANYBYTE:
1077        return -2;
1078    
1079      /* Check a class for variable quantification */      /* Check a class for variable quantification */
1080    
1081    #ifdef SUPPORT_UTF8
1082        case OP_XCLASS:
1083        cc += GET(cc, 1) - 33;
1084        /* Fall through */
1085    #endif
1086    
1087      case OP_CLASS:      case OP_CLASS:
1088        case OP_NCLASS:
1089      cc += 33;      cc += 33;
1090    
1091      switch (*cc)      switch (*cc)
# Line 887  for (;;) Line 1098  for (;;)
1098    
1099        case OP_CRRANGE:        case OP_CRRANGE:
1100        case OP_CRMINRANGE:        case OP_CRMINRANGE:
1101        if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;        if (GET2(cc,1) != GET2(cc,3)) return -1;
1102        branchlength += (cc[1] << 8) + cc[2];        branchlength += GET2(cc,1);
1103        cc += 5;        cc += 5;
1104        break;        break;
1105    
# Line 910  for (;;) Line 1121  for (;;)
1121    
1122    
1123  /*************************************************  /*************************************************
1124  *           Check for POSIX class syntax         *  *    Scan compiled regex for numbered bracket    *
1125  *************************************************/  *************************************************/
1126    
1127  /* This function is called when the sequence "[:" or "[." or "[=" is  /* This little function scans through a compiled pattern until it finds a
1128  encountered in a character class. It checks whether this is followed by an  capturing bracket with the given number.
 optional ^ and then a sequence of letters, terminated by a matching ":]" or  
 ".]" or "=]".  
1129    
1130  Argument:  Arguments:
1131    ptr      pointer to the initial [    code        points to start of expression
1132    endptr   where to return the end pointer    utf8        TRUE in UTF-8 mode
1133    cd       pointer to compile data    number      the required bracket number
1134    
1135  Returns:   TRUE or FALSE  Returns:      pointer to the opcode for the bracket, or NULL if not found
1136  */  */
1137    
1138  static BOOL  static const uschar *
1139  check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)  find_bracket(const uschar *code, BOOL utf8, int number)
1140  {  {
1141  int terminator;          /* Don't combine these lines; the Solaris cc */  for (;;)
 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  
 if (*(++ptr) == '^') ptr++;  
 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;  
 if (*ptr == terminator && ptr[1] == ']')  
1142    {    {
1143    *endptr = ptr;    register int c = *code;
1144    return TRUE;    if (c == OP_END) return NULL;
1145    }    else if (c == OP_CHARS) code += code[1] + OP_lengths[c];
1146  return FALSE;    else if (c > OP_BRA)
1147  }      {
1148        int n = c - OP_BRA;
1149        if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1150        if (n == number) return (uschar *)code;
1151        code += OP_lengths[OP_BRA];
1152  /*************************************************      }
1153  *          Check POSIX class name                *    else
1154  *************************************************/      {
1155        code += OP_lengths[c];
 /* This function is called to check the name given in a POSIX-style class entry  
 such as [:alnum:].  
   
 Arguments:  
   ptr        points to the first letter  
   len        the length of the name  
1156    
1157  Returns:     a value representing the name, or -1 if unknown      /* In UTF-8 mode, opcodes that are followed by a character may be followed
1158  */      by a multi-byte character. The length in the table is a minimum, so we have
1159        to scan along to skip the extra characters. All opcodes are less than 128,
1160        so we can use relatively efficient code. */
1161    
1162  static int  #ifdef SUPPORT_UTF8
1163  check_posix_name(const uschar *ptr, int len)      if (utf8) switch(c)
1164  {        {
1165  register int yield = 0;        case OP_EXACT:
1166  while (posix_name_lengths[yield] != 0)        case OP_UPTO:
1167    {        case OP_MINUPTO:
1168    if (len == posix_name_lengths[yield] &&        case OP_STAR:
1169      strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;        case OP_MINSTAR:
1170    yield++;        case OP_PLUS:
1171          case OP_MINPLUS:
1172          case OP_QUERY:
1173          case OP_MINQUERY:
1174          while ((*code & 0xc0) == 0x80) code++;
1175          break;
1176          }
1177    #endif
1178        }
1179    }    }
 return -1;  
1180  }  }
1181    
1182    
1183    
   
1184  /*************************************************  /*************************************************
1185  *           Compile one branch                   *  *    Scan compiled branch for non-emptiness      *
1186  *************************************************/  *************************************************/
1187    
1188  /* Scan the pattern, compiling it into the code vector.  /* This function scans through a branch of a compiled pattern to see whether it
1189    can match the empty string or not. It is called only from could_be_empty()
1190    below. Note that first_significant_code() skips over assertions. If we hit an
1191    unclosed bracket, we return "empty" - this means we've struck an inner bracket
1192    whose current branch will already have been scanned.
1193    
1194  Arguments:  Arguments:
1195    options      the option bits    code        points to start of search
1196    brackets     points to number of extracting brackets used    endcode     points to where to stop
1197    code         points to the pointer to the current code point    utf8        TRUE if in UTF8 mode
   ptrptr       points to the current pattern pointer  
   errorptr     points to pointer to error message  
   optchanged   set to the value of the last OP_OPT item compiled  
   reqchar      set to the last literal character required, else -1  
   countlits    set to count of mandatory literal characters  
   cd           contains pointers to tables  
1198    
1199  Returns:       TRUE on success  Returns:      TRUE if what is matched could be empty
                FALSE, with *errorptr set on error  
1200  */  */
1201    
1202  static BOOL  static BOOL
1203  compile_branch(int options, int *brackets, uschar **codeptr,  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
   const uschar **ptrptr, const char **errorptr, int *optchanged,  
   int *reqchar, int *countlits, compile_data *cd)  
1204  {  {
 int repeat_type, op_type;  
 int repeat_min, repeat_max;  
 int bravalue, length;  
 int greedy_default, greedy_non_default;  
 int prevreqchar;  
 int condcount = 0;  
 int subcountlits = 0;  
1205  register int c;  register int c;
1206  register uschar *code = *codeptr;  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0);
1207  uschar *tempcode;       code < endcode;
1208  const uschar *ptr = *ptrptr;       code = first_significant_code(code + OP_lengths[c], NULL, 0))
1209  const uschar *tempptr;    {
1210  uschar *previous = NULL;    const uschar *ccode;
 uschar class[32];  
1211    
1212  /* Set up the default and non-default settings for greediness */    c = *code;
1213    
1214  greedy_default = ((options & PCRE_UNGREEDY) != 0);    if (c >= OP_BRA)
1215  greedy_non_default = greedy_default ^ 1;      {
1216        BOOL empty_branch;
1217        if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1218    
1219  /* Initialize no required char, and count of literals */      /* Scan a closed bracket */
1220    
1221  *reqchar = prevreqchar = -1;      empty_branch = FALSE;
1222  *countlits = 0;      do
1223          {
1224          if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1225            empty_branch = TRUE;
1226          code += GET(code, 1);
1227          }
1228        while (*code == OP_ALT);
1229        if (!empty_branch) return FALSE;   /* All branches are non-empty */
1230        code += 1 + LINK_SIZE;
1231        c = *code;
1232        }
1233    
1234  /* Switch on next character until the end of the branch */    else switch (c)
1235        {
1236        /* Check for quantifiers after a class */
1237    
1238  for (;; ptr++)  #ifdef SUPPORT_UTF8
1239    {      case OP_XCLASS:
1240    BOOL negate_class;      ccode = code + GET(code, 1);
1241    int class_charcount;      goto CHECK_CLASS_REPEAT;
1242    #endif
1243    
1244        case OP_CLASS:
1245        case OP_NCLASS:
1246        ccode = code + 33;
1247    
1248    #ifdef SUPPORT_UTF8
1249        CHECK_CLASS_REPEAT:
1250    #endif
1251    
1252        switch (*ccode)
1253          {
1254          case OP_CRSTAR:            /* These could be empty; continue */
1255          case OP_CRMINSTAR:
1256          case OP_CRQUERY:
1257          case OP_CRMINQUERY:
1258          break;
1259    
1260          default:                   /* Non-repeat => class must match */
1261          case OP_CRPLUS:            /* These repeats aren't empty */
1262          case OP_CRMINPLUS:
1263          return FALSE;
1264    
1265          case OP_CRRANGE:
1266          case OP_CRMINRANGE:
1267          if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
1268          break;
1269          }
1270        break;
1271    
1272        /* Opcodes that must match a character */
1273    
1274        case OP_NOT_DIGIT:
1275        case OP_DIGIT:
1276        case OP_NOT_WHITESPACE:
1277        case OP_WHITESPACE:
1278        case OP_NOT_WORDCHAR:
1279        case OP_WORDCHAR:
1280        case OP_ANY:
1281        case OP_ANYBYTE:
1282        case OP_CHARS:
1283        case OP_NOT:
1284        case OP_PLUS:
1285        case OP_MINPLUS:
1286        case OP_EXACT:
1287        case OP_NOTPLUS:
1288        case OP_NOTMINPLUS:
1289        case OP_NOTEXACT:
1290        case OP_TYPEPLUS:
1291        case OP_TYPEMINPLUS:
1292        case OP_TYPEEXACT:
1293        return FALSE;
1294    
1295        /* End of branch */
1296    
1297        case OP_KET:
1298        case OP_KETRMAX:
1299        case OP_KETRMIN:
1300        case OP_ALT:
1301        return TRUE;
1302    
1303        /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1304        followed by a multibyte character */
1305    
1306    #ifdef SUPPORT_UTF8
1307        case OP_STAR:
1308        case OP_MINSTAR:
1309        case OP_QUERY:
1310        case OP_MINQUERY:
1311        case OP_UPTO:
1312        case OP_MINUPTO:
1313        if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1314        break;
1315    #endif
1316        }
1317      }
1318    
1319    return TRUE;
1320    }
1321    
1322    
1323    
1324    /*************************************************
1325    *    Scan compiled regex for non-emptiness       *
1326    *************************************************/
1327    
1328    /* This function is called to check for left recursive calls. We want to check
1329    the current branch of the current pattern to see if it could match the empty
1330    string. If it could, we must look outwards for branches at other levels,
1331    stopping when we pass beyond the bracket which is the subject of the recursion.
1332    
1333    Arguments:
1334      code        points to start of the recursion
1335      endcode     points to where to stop (current RECURSE item)
1336      bcptr       points to the chain of current (unclosed) branch starts
1337      utf8        TRUE if in UTF-8 mode
1338    
1339    Returns:      TRUE if what is matched could be empty
1340    */
1341    
1342    static BOOL
1343    could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1344      BOOL utf8)
1345    {
1346    while (bcptr != NULL && bcptr->current >= code)
1347      {
1348      if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1349      bcptr = bcptr->outer;
1350      }
1351    return TRUE;
1352    }
1353    
1354    
1355    
1356    /*************************************************
1357    *           Check for POSIX class syntax         *
1358    *************************************************/
1359    
1360    /* This function is called when the sequence "[:" or "[." or "[=" is
1361    encountered in a character class. It checks whether this is followed by an
1362    optional ^ and then a sequence of letters, terminated by a matching ":]" or
1363    ".]" or "=]".
1364    
1365    Arguments:
1366      ptr      pointer to the initial [
1367      endptr   where to return the end pointer
1368      cd       pointer to compile data
1369    
1370    Returns:   TRUE or FALSE
1371    */
1372    
1373    static BOOL
1374    check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1375    {
1376    int terminator;          /* Don't combine these lines; the Solaris cc */
1377    terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1378    if (*(++ptr) == '^') ptr++;
1379    while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1380    if (*ptr == terminator && ptr[1] == ']')
1381      {
1382      *endptr = ptr;
1383      return TRUE;
1384      }
1385    return FALSE;
1386    }
1387    
1388    
1389    
1390    
1391    /*************************************************
1392    *          Check POSIX class name                *
1393    *************************************************/
1394    
1395    /* This function is called to check the name given in a POSIX-style class entry
1396    such as [:alnum:].
1397    
1398    Arguments:
1399      ptr        points to the first letter
1400      len        the length of the name
1401    
1402    Returns:     a value representing the name, or -1 if unknown
1403    */
1404    
1405    static int
1406    check_posix_name(const uschar *ptr, int len)
1407    {
1408    register int yield = 0;
1409    while (posix_name_lengths[yield] != 0)
1410      {
1411      if (len == posix_name_lengths[yield] &&
1412        strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1413      yield++;
1414      }
1415    return -1;
1416    }
1417    
1418    
1419    
1420    
1421    /*************************************************
1422    *           Compile one branch                   *
1423    *************************************************/
1424    
1425    /* Scan the pattern, compiling it into the code vector. If the options are
1426    changed during the branch, the pointer is used to change the external options
1427    bits.
1428    
1429    Arguments:
1430      optionsptr     pointer to the option bits
1431      brackets       points to number of extracting brackets used
1432      code           points to the pointer to the current code point
1433      ptrptr         points to the current pattern pointer
1434      errorptr       points to pointer to error message
1435      firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1436      reqbyteptr     set to the last literal character required, else < 0
1437      bcptr          points to current branch chain
1438      cd             contains pointers to tables etc.
1439    
1440    Returns:         TRUE on success
1441                     FALSE, with *errorptr set on error
1442    */
1443    
1444    static BOOL
1445    compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1446      const uschar **ptrptr, const char **errorptr, int *firstbyteptr,
1447      int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1448    {
1449    int repeat_type, op_type;
1450    int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
1451    int bravalue = 0;
1452    int length;
1453    int greedy_default, greedy_non_default;
1454    int firstbyte, reqbyte;
1455    int zeroreqbyte, zerofirstbyte;
1456    int req_caseopt;
1457    int condcount = 0;
1458    int options = *optionsptr;
1459    register int c;
1460    register uschar *code = *codeptr;
1461    uschar *tempcode;
1462    BOOL inescq = FALSE;
1463    BOOL groupsetfirstbyte = FALSE;
1464    const uschar *ptr = *ptrptr;
1465    const uschar *tempptr;
1466    uschar *previous = NULL;
1467    uschar class[32];
1468    
1469    #ifdef SUPPORT_UTF8
1470    BOOL class_utf8;
1471    BOOL utf8 = (options & PCRE_UTF8) != 0;
1472    uschar *class_utf8data;
1473    uschar utf8_char[6];
1474    #else
1475    BOOL utf8 = FALSE;
1476    #endif
1477    
1478    /* Set up the default and non-default settings for greediness */
1479    
1480    greedy_default = ((options & PCRE_UNGREEDY) != 0);
1481    greedy_non_default = greedy_default ^ 1;
1482    
1483    /* Initialize no first char, no required char. REQ_UNSET means "no char
1484    matching encountered yet". It gets changed to REQ_NONE if we hit something that
1485    matches a non-fixed first char; reqbyte just remains unset if we never
1486    find one.
1487    
1488    When we hit a repeat whose minimum is zero, we may have to adjust these values
1489    to take the zero repeat into account. This is implemented by setting them to
1490    zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1491    item types that can be repeated set these backoff variables appropriately. */
1492    
1493    firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1494    
1495    /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1496    according to the current setting of the caseless flag. REQ_CASELESS is a bit
1497    value > 255. It is added into the firstbyte or reqbyte variables to record the
1498    case status of the value. */
1499    
1500    req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1501    
1502    /* Switch on next character until the end of the branch */
1503    
1504    for (;; ptr++)
1505      {
1506      BOOL negate_class;
1507      BOOL possessive_quantifier;
1508      int class_charcount;
1509    int class_lastchar;    int class_lastchar;
1510    int newoptions;    int newoptions;
1511      int recno;
1512    int skipbytes;    int skipbytes;
1513    int subreqchar;    int subreqbyte;
1514      int subfirstbyte;
1515    
1516    c = *ptr;    c = *ptr;
1517      if (inescq && c != 0) goto NORMAL_CHAR;
1518    
1519    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
1520      {      {
1521      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
# Line 1045  for (;; ptr++) Line 1524  for (;; ptr++)
1524        /* The space before the ; is to avoid a warning on a silly compiler        /* The space before the ; is to avoid a warning on a silly compiler
1525        on the Macintosh. */        on the Macintosh. */
1526        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1527        continue;        if (c != 0) continue;   /* Else fall through to handle end of string */
1528        }        }
1529      }      }
1530    
# Line 1056  for (;; ptr++) Line 1535  for (;; ptr++)
1535      case 0:      case 0:
1536      case '|':      case '|':
1537      case ')':      case ')':
1538        *firstbyteptr = firstbyte;
1539        *reqbyteptr = reqbyte;
1540      *codeptr = code;      *codeptr = code;
1541      *ptrptr = ptr;      *ptrptr = ptr;
1542      return TRUE;      return TRUE;
1543    
1544      /* Handle single-character metacharacters */      /* Handle single-character metacharacters. In multiline mode, ^ disables
1545        the setting of any following char as a first character. */
1546    
1547      case '^':      case '^':
1548        if ((options & PCRE_MULTILINE) != 0)
1549          {
1550          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1551          }
1552      previous = NULL;      previous = NULL;
1553      *code++ = OP_CIRC;      *code++ = OP_CIRC;
1554      break;      break;
# Line 1072  for (;; ptr++) Line 1558  for (;; ptr++)
1558      *code++ = OP_DOLL;      *code++ = OP_DOLL;
1559      break;      break;
1560    
1561        /* There can never be a first char if '.' is first, whatever happens about
1562        repeats. The value of reqbyte doesn't change either. */
1563    
1564      case '.':      case '.':
1565        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1566        zerofirstbyte = firstbyte;
1567        zeroreqbyte = reqbyte;
1568      previous = code;      previous = code;
1569      *code++ = OP_ANY;      *code++ = OP_ANY;
1570      break;      break;
1571    
1572      /* Character classes. These always build a 32-byte bitmap of the permitted      /* Character classes. If the included characters are all < 256 in value, we
1573      characters, except in the special case where there is only one character.      build a 32-byte bitmap of the permitted characters, except in the special
1574      For negated classes, we build the map as usual, then invert it at the end.      case where there is only one such character. For negated classes, we build
1575        the map as usual, then invert it at the end. However, we use a different
1576        opcode so that data characters > 255 can be handled correctly.
1577    
1578        If the class contains characters outside the 0-255 range, a different
1579        opcode is compiled. It may optionally have a bit map for characters < 256,
1580        but those above are explicitly listed afterwards. A flag byte tells
1581        whether the bitmap is present, and whether this is a negated class or not.
1582      */      */
1583    
1584      case '[':      case '[':
1585      previous = code;      previous = code;
1586      *code++ = OP_CLASS;  
1587        /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1588        they are encountered at the top level, so we'll do that too. */
1589    
1590        if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1591            check_posix_syntax(ptr, &tempptr, cd))
1592          {
1593          *errorptr = (ptr[1] == ':')? ERR13 : ERR31;
1594          goto FAILED;
1595          }
1596    
1597      /* If the first character is '^', set the negation flag and skip it. */      /* If the first character is '^', set the negation flag and skip it. */
1598    
# Line 1093  for (;; ptr++) Line 1601  for (;; ptr++)
1601        negate_class = TRUE;        negate_class = TRUE;
1602        c = *(++ptr);        c = *(++ptr);
1603        }        }
1604      else negate_class = FALSE;      else
1605          {
1606          negate_class = FALSE;
1607          }
1608    
1609      /* Keep a count of chars so that we can optimize the case of just a single      /* Keep a count of chars with values < 256 so that we can optimize the case
1610      character. */      of just a single character (as long as it's < 256). For higher valued UTF-8
1611        characters, we don't yet do any optimization. */
1612    
1613      class_charcount = 0;      class_charcount = 0;
1614      class_lastchar = -1;      class_lastchar = -1;
1615    
1616    #ifdef SUPPORT_UTF8
1617        class_utf8 = FALSE;                       /* No chars >= 256 */
1618        class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */
1619    #endif
1620    
1621      /* Initialize the 32-char bit map to all zeros. We have to build the      /* Initialize the 32-char bit map to all zeros. We have to build the
1622      map in a temporary bit of store, in case the class contains only 1      map in a temporary bit of store, in case the class contains only 1
1623      character, because in that case the compiled code doesn't use the      character (< 256), because in that case the compiled code doesn't use the
1624      bit map. */      bit map. */
1625    
1626      memset(class, 0, 32 * sizeof(uschar));      memset(class, 0, 32 * sizeof(uschar));
1627    
1628      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
1629      means that an initial ] is taken as a data character. */      means that an initial ] is taken as a data character. The first pass
1630        through the regex checked the overall syntax, so we don't need to be very
1631        strict here. At the start of the loop, c contains the first byte of the
1632        character. */
1633    
1634      do      do
1635        {        {
1636        if (c == 0)  #ifdef SUPPORT_UTF8
1637          if (utf8 && c > 127) GETCHARLEN(c, ptr, ptr);
1638    #endif
1639    
1640          /* Inside \Q...\E everything is literal except \E */
1641    
1642          if (inescq)
1643          {          {
1644          *errorptr = ERR6;          if (c == '\\' && ptr[1] == 'E')
1645          goto FAILED;            {
1646              inescq = FALSE;
1647              ptr++;
1648              continue;
1649              }
1650            else goto LONE_SINGLE_CHARACTER;
1651          }          }
1652    
1653        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
1654        form [:^name]. A square bracket that doesn't match the syntax is        form [:^name:]. A square bracket that doesn't match the syntax is
1655        treated as a literal. We also recognize the POSIX constructions        treated as a literal. We also recognize the POSIX constructions
1656        [.ch.] and [=ch=] ("collating elements") and fault them, as Perl        [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1657        5.6 does. */        5.6 and 5.8 do. */
1658    
1659        if (c == '[' &&        if (c == '[' &&
1660            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
# Line 1161  for (;; ptr++) Line 1692  for (;; ptr++)
1692            posix_class = 0;            posix_class = 0;
1693    
1694          /* Or into the map we are building up to 3 of the static class          /* Or into the map we are building up to 3 of the static class
1695          tables, or their negations. */          tables, or their negations. The [:blank:] class sets up the same
1696            chars as the [:space:] class (all white space). We remove the vertical
1697            white space chars afterwards. */
1698    
1699          posix_class *= 3;          posix_class *= 3;
1700          for (i = 0; i < 3; i++)          for (i = 0; i < 3; i++)
1701            {            {
1702              BOOL isblank = strncmp(ptr, "blank", 5) == 0;
1703            int taboffset = posix_class_maps[posix_class + i];            int taboffset = posix_class_maps[posix_class + i];
1704            if (taboffset < 0) break;            if (taboffset < 0) break;
1705            if (local_negate)            if (local_negate)
1706                {
1707              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1708                if (isblank) class[1] |= 0x3c;
1709                }
1710            else            else
1711                {
1712              for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];              for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1713                if (isblank) class[1] &= ~0x3c;
1714                }
1715            }            }
1716    
1717          ptr = tempptr + 1;          ptr = tempptr + 1;
1718          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
1719          continue;          continue;    /* End of POSIX syntax handling */
1720          }          }
1721    
1722        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
# Line 1185  for (;; ptr++) Line 1725  for (;; ptr++)
1725        Inside a class (and only there) it is treated as backspace. Elsewhere        Inside a class (and only there) it is treated as backspace. Elsewhere
1726        it marks a word boundary. Other escapes have preset maps ready to        it marks a word boundary. Other escapes have preset maps ready to
1727        or into the one we are building. We assume they have more than one        or into the one we are building. We assume they have more than one
1728        character in them, so set class_count bigger than one. */        character in them, so set class_charcount bigger than one. */
1729    
1730        if (c == '\\')        if (c == '\\')
1731          {          {
1732          c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);          c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1733          if (-c == ESC_b) c = '\b';          if (-c == ESC_b) c = '\b';  /* \b is backspace in a class */
1734    
1735            if (-c == ESC_Q)            /* Handle start of quoted string */
1736              {
1737              if (ptr[1] == '\\' && ptr[2] == 'E')
1738                {
1739                ptr += 2; /* avoid empty string */
1740                }
1741              else inescq = TRUE;
1742              continue;
1743              }
1744    
1745          else if (c < 0)          else if (c < 0)
1746            {            {
1747            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
1748            class_charcount = 10;            class_charcount = 10;     /* Greater than 1 is what matters */
1749            switch (-c)            switch (-c)
1750              {              {
1751              case ESC_d:              case ESC_d:
# Line 1215  for (;; ptr++) Line 1766  for (;; ptr++)
1766    
1767              case ESC_s:              case ESC_s:
1768              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1769                class[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
1770              continue;              continue;
1771    
1772              case ESC_S:              case ESC_S:
1773              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1774                class[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
1775              continue;              continue;
1776    
1777                /* Unrecognized escapes are faulted if PCRE is running in its
1778                strict mode. By default, for compatibility with Perl, they are
1779                treated as literals. */
1780    
1781              default:              default:
1782              *errorptr = ERR7;              if ((options & PCRE_EXTRA) != 0)
1783              goto FAILED;                {
1784                  *errorptr = ERR7;
1785                  goto FAILED;
1786                  }
1787                c = *ptr;    /* The final character */
1788              }              }
1789            }            }
1790    
1791          /* Fall through if single character, but don't at present allow          /* Fall through if we have a single character (c >= 0). This may be
1792          chars > 255 in UTF-8 mode. */          > 255 in UTF-8 mode. */
1793    
1794  #ifdef SUPPORT_UTF8          }   /* End of backslash handling */
         if (c > 255)  
           {  
           *errorptr = ERR33;  
           goto FAILED;  
           }  
 #endif  
         }  
1795    
1796        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
1797        Perl does not permit ']' to be the end of the range. A '-' character        Perl does not permit ']' to be the end of the range. A '-' character
# Line 1247  for (;; ptr++) Line 1801  for (;; ptr++)
1801          {          {
1802          int d;          int d;
1803          ptr += 2;          ptr += 2;
         d = *ptr;  
1804    
1805          if (d == 0)  #ifdef SUPPORT_UTF8
1806            {          if (utf8)
1807            *errorptr = ERR6;            {                           /* Braces are required because the */
1808            goto FAILED;            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
1809            }            }
1810            else
1811    #endif
1812            d = *ptr;
1813    
1814          /* The second part of a range can be a single-character escape, but          /* The second part of a range can be a single-character escape, but
1815          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
# Line 1264  for (;; ptr++) Line 1820  for (;; ptr++)
1820            const uschar *oldptr = ptr;            const uschar *oldptr = ptr;
1821            d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);            d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1822    
 #ifdef SUPPORT_UTF8  
           if (d > 255)  
             {  
             *errorptr = ERR33;  
             goto FAILED;  
             }  
 #endif  
1823            /* \b is backslash; any other special means the '-' was literal */            /* \b is backslash; any other special means the '-' was literal */
1824    
1825            if (d < 0)            if (d < 0)
# Line 1278  for (;; ptr++) Line 1827  for (;; ptr++)
1827              if (d == -ESC_b) d = '\b'; else              if (d == -ESC_b) d = '\b'; else
1828                {                {
1829                ptr = oldptr - 2;                ptr = oldptr - 2;
1830                goto SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
1831                }                }
1832              }              }
1833            }            }
1834    
1835            /* Check that the two values are in the correct order */
1836    
1837          if (d < c)          if (d < c)
1838            {            {
1839            *errorptr = ERR8;            *errorptr = ERR8;
1840            goto FAILED;            goto FAILED;
1841            }            }
1842    
1843          for (; c <= d; c++)          /* If d is greater than 255, we can't just use the bit map, so set up
1844            for the UTF-8 supporting class type. If we are not caseless, we can
1845            just set up a single range. If we are caseless, the characters < 256
1846            are handled with a bitmap, in order to get the case-insensitive
1847            handling. */
1848    
1849    #ifdef SUPPORT_UTF8
1850            if (d > 255)
1851            {            {
1852            class[c/8] |= (1 << (c&7));            class_utf8 = TRUE;
1853            if ((options & PCRE_CASELESS) != 0)            *class_utf8data++ = XCL_RANGE;
1854              if ((options & PCRE_CASELESS) == 0)
1855                {
1856                class_utf8data += ord2utf8(c, class_utf8data);
1857                class_utf8data += ord2utf8(d, class_utf8data);
1858                continue;  /* Go get the next char in the class */
1859                }
1860              class_utf8data += ord2utf8(256, class_utf8data);
1861              class_utf8data += ord2utf8(d, class_utf8data);
1862              d = 255;
1863              /* Fall through */
1864              }
1865    #endif
1866            /* We use the bit map if the range is entirely < 256, or if part of it
1867            is < 256 and matching is caseless. */
1868    
1869            for (; c <= d; c++)
1870              {
1871              class[c/8] |= (1 << (c&7));
1872              if ((options & PCRE_CASELESS) != 0)
1873              {              {
1874              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
1875              class[uc/8] |= (1 << (uc&7));              class[uc/8] |= (1 << (uc&7));
# Line 1300  for (;; ptr++) Line 1877  for (;; ptr++)
1877            class_charcount++;                /* in case a one-char range */            class_charcount++;                /* in case a one-char range */
1878            class_lastchar = c;            class_lastchar = c;
1879            }            }
1880    
1881          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
1882          }          }
1883    
1884        /* Handle a lone single character - we can get here for a normal        /* Handle a lone single character - we can get here for a normal
1885        non-escape char, or after \ that introduces a single character. */        non-escape char, or after \ that introduces a single character. */
1886    
1887        SINGLE_CHARACTER:        LONE_SINGLE_CHARACTER:
1888    
1889        class [c/8] |= (1 << (c&7));        /* Handle a multibyte character */
1890        if ((options & PCRE_CASELESS) != 0)  
1891    #ifdef SUPPORT_UTF8
1892          if (utf8 && c > 255)
1893            {
1894            class_utf8 = TRUE;
1895            *class_utf8data++ = XCL_SINGLE;
1896            class_utf8data += ord2utf8(c, class_utf8data);
1897            }
1898          else
1899    #endif
1900          /* Handle a single-byte character */
1901          {          {
1902          c = cd->fcc[c];   /* flip case */          class [c/8] |= (1 << (c&7));
1903          class[c/8] |= (1 << (c&7));          if ((options & PCRE_CASELESS) != 0)
1904              {
1905              c = cd->fcc[c];   /* flip case */
1906              class[c/8] |= (1 << (c&7));
1907              }
1908            class_charcount++;
1909            class_lastchar = c;
1910          }          }
       class_charcount++;  
       class_lastchar = c;  
1911        }        }
1912    
1913      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached; the check for end of string happens inside the
1914      loop. This "while" is the end of the "do" above. */      loop. This "while" is the end of the "do" above. */
1915    
1916      while ((c = *(++ptr)) != ']');      while ((c = *(++ptr)) != ']' || inescq);
1917    
1918      /* If class_charcount is 1 and class_lastchar is not negative, we saw      /* If class_charcount is 1, we saw precisely one character with a value <
1919      precisely one character. This doesn't need the whole 32-byte bit map.      256. In UTF-8 mode, we can optimize if there were no characters >= 256 and
1920      We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if      the one character is < 128. In non-UTF-8 mode we can always optimize.
1921      it's negative. */  
1922        The optimization throws away the bit map. We turn the item into a
1923        1-character OP_CHARS if it's positive, or OP_NOT if it's negative. Note
1924        that OP_NOT does not support multibyte characters. In the positive case, it
1925        can cause firstbyte to be set. Otherwise, there can be no first char if
1926        this item is first, whatever repeat count may follow. In the case of
1927        reqbyte, save the previous value for reinstating. */
1928    
1929      if (class_charcount == 1 && class_lastchar >= 0)  #ifdef SUPPORT_UTF8
1930        if (!class_utf8 && class_charcount == 1 && class_lastchar < 128)
1931    #else
1932        if (class_charcount == 1)
1933    #endif
1934        {        {
1935          zeroreqbyte = reqbyte;
1936        if (negate_class)        if (negate_class)
1937          {          {
1938          code[-1] = OP_NOT;          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1939            zerofirstbyte = firstbyte;
1940            *code++ = OP_NOT;
1941          }          }
1942        else        else
1943          {          {
1944          code[-1] = OP_CHARS;          if (firstbyte == REQ_UNSET)
1945              {
1946              zerofirstbyte = REQ_NONE;
1947              firstbyte = class_lastchar | req_caseopt;
1948              }
1949            else
1950              {
1951              zerofirstbyte = firstbyte;
1952              reqbyte = class_lastchar | req_caseopt;
1953              }
1954            *code++ = OP_CHARS;
1955          *code++ = 1;          *code++ = 1;
1956          }          }
1957        *code++ = class_lastchar;        *code++ = class_lastchar;
1958          break;  /* End of class handling */
1959          }       /* End of 1-byte optimization */
1960    
1961        /* Otherwise, if this is the first thing in the branch, there can be no
1962        first char setting, whatever the repeat count. Any reqbyte setting must
1963        remain unchanged after any kind of repeat. */
1964    
1965        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1966        zerofirstbyte = firstbyte;
1967        zeroreqbyte = reqbyte;
1968    
1969        /* If there are characters with values > 255, we have to compile an
1970        extended class, with its own opcode. If there are no characters < 256,
1971        we can omit the bitmap. */
1972    
1973    #ifdef SUPPORT_UTF8
1974        if (class_utf8)
1975          {
1976          *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
1977          *code++ = OP_XCLASS;
1978          code += LINK_SIZE;
1979          *code = negate_class? XCL_NOT : 0;
1980    
1981          /* If the map is required, install it, and move on to the end of
1982          the extra data */
1983    
1984          if (class_charcount > 0)
1985            {
1986            *code++ |= XCL_MAP;
1987            memcpy(code, class, 32);
1988            code = class_utf8data;
1989            }
1990    
1991          /* If the map is not required, slide down the extra data. */
1992    
1993          else
1994            {
1995            int len = class_utf8data - (code + 33);
1996            memmove(code + 1, code + 33, len);
1997            code += len + 1;
1998            }
1999    
2000          /* Now fill in the complete length of the item */
2001    
2002          PUT(previous, 1, code - previous);
2003          break;   /* End of class handling */
2004        }        }
2005    #endif
2006    
2007      /* Otherwise, negate the 32-byte map if necessary, and copy it into      /* If there are no characters > 255, negate the 32-byte map if necessary,
2008      the code vector. */      and copy it into the code vector. If this is the first thing in the branch,
2009        there can be no first char setting, whatever the repeat count. Any reqbyte
2010        setting must remain unchanged after any kind of repeat. */
2011    
2012        if (negate_class)
2013          {
2014          *code++ = OP_NCLASS;
2015          for (c = 0; c < 32; c++) code[c] = ~class[c];
2016          }
2017      else      else
2018        {        {
2019        if (negate_class)        *code++ = OP_CLASS;
2020          for (c = 0; c < 32; c++) code[c] = ~class[c];        memcpy(code, class, 32);
       else  
         memcpy(code, class, 32);  
       code += 32;  
2021        }        }
2022        code += 32;
2023      break;      break;
2024    
2025      /* Various kinds of repeat */      /* Various kinds of repeat */
# Line 1384  for (;; ptr++) Line 2051  for (;; ptr++)
2051        goto FAILED;        goto FAILED;
2052        }        }
2053    
2054      /* If the next character is '?' this is a minimizing repeat, by default,      if (repeat_min == 0)
2055      but if PCRE_UNGREEDY is set, it works the other way round. Advance to the        {
2056      next character. */        firstbyte = zerofirstbyte;   /* Adjust for zero repeat */
2057          reqbyte = zeroreqbyte;       /* Ditto */
2058          }
2059    
2060      if (ptr[1] == '?')      op_type = 0;                    /* Default single-char op codes */
2061        { repeat_type = greedy_non_default; ptr++; }      possessive_quantifier = FALSE;  /* Default not possessive quantifier */
2062    
2063        /* Save start of previous item, in case we have to move it up to make space
2064        for an inserted OP_ONCE for the additional '+' extension. */
2065    
2066        tempcode = previous;
2067    
2068        /* If the next character is '+', we have a possessive quantifier. This
2069        implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2070        If the next character is '?' this is a minimizing repeat, by default,
2071        but if PCRE_UNGREEDY is set, it works the other way round. We change the
2072        repeat type to the non-default. */
2073    
2074        if (ptr[1] == '+')
2075          {
2076          repeat_type = 0;                  /* Force greedy */
2077          possessive_quantifier = TRUE;
2078          ptr++;
2079          }
2080        else if (ptr[1] == '?')
2081          {
2082          repeat_type = greedy_non_default;
2083          ptr++;
2084          }
2085      else repeat_type = greedy_default;      else repeat_type = greedy_default;
2086    
2087        /* If previous was a recursion, we need to wrap it inside brackets so that
2088        it can be replicated if necessary. */
2089    
2090        if (*previous == OP_RECURSE)
2091          {
2092          memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2093          code += 1 + LINK_SIZE;
2094          *previous = OP_BRA;
2095          PUT(previous, 1, code - previous);
2096          *code = OP_KET;
2097          PUT(code, 1, code - previous);
2098          code += 1 + LINK_SIZE;
2099          }
2100    
2101      /* If previous was a string of characters, chop off the last one and use it      /* If previous was a string of characters, chop off the last one and use it
2102      as the subject of the repeat. If there was only one character, we can      as the subject of the repeat. If there was only one character, we can
2103      abolish the previous item altogether. A repeat with a zero minimum wipes      abolish the previous item altogether. If a one-char item has a minimum of
2104      out any reqchar setting, backing up to the previous value. We must also      more than one, ensure that it is set in reqbyte - it might not be if a
2105      adjust the countlits value. */      sequence such as x{3} is the first thing in a branch because the x will
2106        have gone into firstbyte instead.  */
2107    
2108      if (*previous == OP_CHARS)      if (*previous == OP_CHARS)
2109        {        {
2110        int len = previous[1];        /* Deal with UTF-8 characters that take up more than one byte. It's
2111          easier to write this out separately than try to macrify it. Use c to
2112          hold the length of the character in bytes, plus 0x80 to flag that it's a
2113          length rather than a small character. */
2114    
2115        if (repeat_min == 0) *reqchar = prevreqchar;  #ifdef SUPPORT_UTF8
2116        *countlits += repeat_min - 1;        if (utf8 && (code[-1] & 0x80) != 0)
   
       if (len == 1)  
2117          {          {
2118          c = previous[2];          uschar *lastchar = code - 1;
2119          code = previous;          while((*lastchar & 0xc0) == 0x80) lastchar--;
2120            c = code - lastchar;            /* Length of UTF-8 character */
2121            memcpy(utf8_char, lastchar, c); /* Save the char */
2122            if (lastchar == previous + 2)   /* There was only one character */
2123              {
2124              code = previous;              /* Abolish the previous item */
2125              }
2126            else
2127              {
2128              previous[1] -= c;             /* Adjust length of previous */
2129              code = lastchar;              /* Lost char off the end */
2130              tempcode = code;              /* Adjust position to be moved for '+' */
2131              }
2132            c |= 0x80;                      /* Flag c as a length */
2133          }          }
2134        else        else
2135    #endif
2136    
2137          /* Handle the case of a single byte - either with no UTF8 support, or
2138          with UTF-8 disabled, or for a UTF-8 character < 128. */
2139    
2140          {          {
2141          c = previous[len+1];          c = *(--code);
2142          previous[1]--;          if (code == previous + 2)   /* There was only one character */
2143          code--;            {
2144              code = previous;              /* Abolish the previous item */
2145              if (repeat_min > 1) reqbyte = c | req_caseopt;
2146              }
2147            else
2148              {
2149              previous[1]--;             /* adjust length */
2150              tempcode = code;           /* Adjust position to be moved for '+' */
2151              }
2152          }          }
2153        op_type = 0;                 /* Use single-char op codes */  
2154        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
2155        }        }
2156    
2157      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
2158      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
2159      character repeats by adding a suitable offset into repeat_type. */      character repeats by setting opt_type to add a suitable offset into
2160        repeat_type. OP_NOT is currently used only for single-byte chars. */
2161    
2162      else if ((int)*previous == OP_NOT)      else if (*previous == OP_NOT)
2163        {        {
2164        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
2165        c = previous[1];        c = previous[1];
# Line 1434  for (;; ptr++) Line 2169  for (;; ptr++)
2169    
2170      /* If previous was a character type match (\d or similar), abolish it and      /* If previous was a character type match (\d or similar), abolish it and
2171      create a suitable repeat item. The code is shared with single-character      create a suitable repeat item. The code is shared with single-character
2172      repeats by adding a suitable offset into repeat_type. */      repeats by setting op_type to add a suitable offset into repeat_type. */
2173    
2174      else if ((int)*previous < OP_EODN || *previous == OP_ANY)      else if (*previous < OP_EODN)
2175        {        {
2176        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
2177        c = *previous;        c = *previous;
# Line 1463  for (;; ptr++) Line 2198  for (;; ptr++)
2198          else          else
2199            {            {
2200            *code++ = OP_UPTO + repeat_type;            *code++ = OP_UPTO + repeat_type;
2201            *code++ = repeat_max >> 8;            PUT2INC(code, 0, repeat_max);
           *code++ = (repeat_max & 255);  
2202            }            }
2203          }          }
2204    
# Line 1481  for (;; ptr++) Line 2215  for (;; ptr++)
2215          if (repeat_min != 1)          if (repeat_min != 1)
2216            {            {
2217            *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */            *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
2218            *code++ = repeat_min >> 8;            PUT2INC(code, 0, repeat_min);
           *code++ = (repeat_min & 255);  
2219            }            }
2220    
2221          If the minimum is 1 and the previous item was a character string,          If the minimum is 1 and the previous item was a character string,
# Line 1490  for (;; ptr++) Line 2223  for (;; ptr++)
2223          length was 1, or add the character back onto the end of a longer          length was 1, or add the character back onto the end of a longer
2224          string. For a character type nothing need be done; it will just get          string. For a character type nothing need be done; it will just get
2225          put back naturally. Note that the final character is always going to          put back naturally. Note that the final character is always going to
2226          get added below. */          get added below, so we leave code ready for its insertion. */
2227    
2228          else if (*previous == OP_CHARS)          else if (*previous == OP_CHARS)
2229            {            {
2230            if (code == previous) code += 2; else previous[1]++;            if (code == previous) code += 2; else
2231    
2232              /* In UTF-8 mode, a multibyte char has its length in c, with the 0x80
2233              bit set as a flag. The length will always be between 2 and 6. */
2234    
2235    #ifdef SUPPORT_UTF8
2236              if (utf8 && c >= 128) previous[1] += c & 7; else
2237    #endif
2238              previous[1]++;
2239            }            }
2240    
2241          /*  For a single negated character we also have to put back the          /*  For a single negated character we also have to put back the
2242          item that got cancelled. */          item that got cancelled. At present this applies only to single byte
2243            characters in any mode. */
2244    
2245          else if (*previous == OP_NOT) code++;          else if (*previous == OP_NOT) code++;
2246    
2247          /* If the maximum is unlimited, insert an OP_STAR. */          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2248            we have to insert the character for the previous code. In UTF-8 mode,
2249            long characters have their length in c, with the 0x80 bit as a flag. */
2250    
2251          if (repeat_max < 0)          if (repeat_max < 0)
2252            {            {
2253    #ifdef SUPPORT_UTF8
2254              if (utf8 && c >= 128)
2255                {
2256                memcpy(code, utf8_char, c & 7);
2257                code += c & 7;
2258                }
2259              else
2260    #endif
2261            *code++ = c;            *code++ = c;
2262            *code++ = OP_STAR + repeat_type;            *code++ = OP_STAR + repeat_type;
2263            }            }
2264    
2265          /* Else insert an UPTO if the max is greater than the min. */          /* Else insert an UPTO if the max is greater than the min, again
2266            preceded by the character, for the previously inserted code. */
2267    
2268          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
2269            {            {
2270    #ifdef SUPPORT_UTF8
2271              if (utf8 && c >= 128)
2272                {
2273                memcpy(code, utf8_char, c & 7);
2274                code += c & 7;
2275                }
2276              else
2277    #endif
2278            *code++ = c;            *code++ = c;
2279            repeat_max -= repeat_min;            repeat_max -= repeat_min;
2280            *code++ = OP_UPTO + repeat_type;            *code++ = OP_UPTO + repeat_type;
2281            *code++ = repeat_max >> 8;            PUT2INC(code, 0, repeat_max);
           *code++ = (repeat_max & 255);  
2282            }            }
2283          }          }
2284    
2285        /* The character or character type itself comes last in all cases. */        /* The character or character type itself comes last in all cases. */
2286    
2287    #ifdef SUPPORT_UTF8
2288          if (utf8 && c >= 128)
2289            {
2290            memcpy(code, utf8_char, c & 7);
2291            code += c & 7;
2292            }
2293          else
2294    #endif
2295    
2296        *code++ = c;        *code++ = c;
2297        }        }
2298    
2299      /* If previous was a character class or a back reference, we put the repeat      /* If previous was a character class or a back reference, we put the repeat
2300      stuff after it, but just skip the item if the repeat was {0,0}. */      stuff after it, but just skip the item if the repeat was {0,0}. */
2301    
2302      else if (*previous == OP_CLASS || *previous == OP_REF)      else if (*previous == OP_CLASS ||
2303                 *previous == OP_NCLASS ||
2304    #ifdef SUPPORT_UTF8
2305                 *previous == OP_XCLASS ||
2306    #endif
2307                 *previous == OP_REF)
2308        {        {
2309        if (repeat_max == 0)        if (repeat_max == 0)
2310          {          {
# Line 1546  for (;; ptr++) Line 2320  for (;; ptr++)
2320        else        else
2321          {          {
2322          *code++ = OP_CRRANGE + repeat_type;          *code++ = OP_CRRANGE + repeat_type;
2323          *code++ = repeat_min >> 8;          PUT2INC(code, 0, repeat_min);
         *code++ = repeat_min & 255;  
2324          if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */          if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
2325          *code++ = repeat_max >> 8;          PUT2INC(code, 0, repeat_max);
         *code++ = repeat_max & 255;  
2326          }          }
2327        }        }
2328    
2329      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
2330      cases. */      cases. */
2331    
2332      else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||      else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2333               (int)*previous == OP_COND)               *previous == OP_COND)
2334        {        {
2335        register int i;        register int i;
2336        int ketoffset = 0;        int ketoffset = 0;
# Line 1574  for (;; ptr++) Line 2346  for (;; ptr++)
2346        if (repeat_max == -1)        if (repeat_max == -1)
2347          {          {
2348          register uschar *ket = previous;          register uschar *ket = previous;
2349          do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);          do ket += GET(ket, 1); while (*ket != OP_KET);
2350          ketoffset = code - ket;          ketoffset = code - ket;
2351          }          }
2352    
# Line 1587  for (;; ptr++) Line 2359  for (;; ptr++)
2359    
2360        if (repeat_min == 0)        if (repeat_min == 0)
2361          {          {
         /* If we set up a required char from the bracket, we must back off  
         to the previous value and reset the countlits value too. */  
   
         if (subcountlits > 0)  
           {  
           *reqchar = prevreqchar;  
           *countlits -= subcountlits;  
           }  
   
2362          /* If the maximum is also zero, we just omit the group from the output          /* If the maximum is also zero, we just omit the group from the output
2363          altogether. */          altogether. */
2364    
# Line 1625  for (;; ptr++) Line 2388  for (;; ptr++)
2388          else          else
2389            {            {
2390            int offset;            int offset;
2391            memmove(previous+4, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
2392            code += 4;            code += 2 + LINK_SIZE;
2393            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
2394            *previous++ = OP_BRA;            *previous++ = OP_BRA;
2395    
# Line 1635  for (;; ptr++) Line 2398  for (;; ptr++)
2398    
2399            offset = (bralink == NULL)? 0 : previous - bralink;            offset = (bralink == NULL)? 0 : previous - bralink;
2400            bralink = previous;            bralink = previous;
2401            *previous++ = offset >> 8;            PUTINC(previous, 0, offset);
           *previous++ = offset & 255;  
2402            }            }
2403    
2404          repeat_max--;          repeat_max--;
# Line 1644  for (;; ptr++) Line 2406  for (;; ptr++)
2406    
2407        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
2408        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
2409        copies that we need. */        copies that we need. If we set a first char from the group, and didn't
2410          set a required char, copy the latter from the former. */
2411    
2412        else        else
2413          {          {
2414          for (i = 1; i < repeat_min; i++)          if (repeat_min > 1)
2415            {            {
2416            memcpy(code, previous, len);            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2417            code += len;            for (i = 1; i < repeat_min; i++)
2418                {
2419                memcpy(code, previous, len);
2420                code += len;
2421                }
2422            }            }
2423          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
2424          }          }
# Line 1677  for (;; ptr++) Line 2444  for (;; ptr++)
2444              *code++ = OP_BRA;              *code++ = OP_BRA;
2445              offset = (bralink == NULL)? 0 : code - bralink;              offset = (bralink == NULL)? 0 : code - bralink;
2446              bralink = code;              bralink = code;
2447              *code++ = offset >> 8;              PUTINC(code, 0, offset);
             *code++ = offset & 255;  
2448              }              }
2449    
2450            memcpy(code, previous, len);            memcpy(code, previous, len);
# Line 1693  for (;; ptr++) Line 2459  for (;; ptr++)
2459            int oldlinkoffset;            int oldlinkoffset;
2460            int offset = code - bralink + 1;            int offset = code - bralink + 1;
2461            uschar *bra = code - offset;            uschar *bra = code - offset;
2462            oldlinkoffset = (bra[1] << 8) + bra[2];            oldlinkoffset = GET(bra, 1);
2463            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2464            *code++ = OP_KET;            *code++ = OP_KET;
2465            *code++ = bra[1] = offset >> 8;            PUTINC(code, 0, offset);
2466            *code++ = bra[2] = (offset & 255);            PUT(bra, 1, offset);
2467            }            }
2468          }          }
2469    
# Line 1717  for (;; ptr++) Line 2483  for (;; ptr++)
2483        goto FAILED;        goto FAILED;
2484        }        }
2485    
2486        /* If the character following a repeat is '+', we wrap the entire repeated
2487        item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2488        Sun's Java package. The repeated item starts at tempcode, not at previous,
2489        which might be the first part of a string whose (former) last char we
2490        repeated. However, we don't support '+' after a greediness '?'. */
2491    
2492        if (possessive_quantifier)
2493          {
2494          int len = code - tempcode;
2495          memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2496          code += 1 + LINK_SIZE;
2497          len += 1 + LINK_SIZE;
2498          tempcode[0] = OP_ONCE;
2499          *code++ = OP_KET;
2500          PUTINC(code, 0, len);
2501          PUT(tempcode, 1, len);
2502          }
2503    
2504      /* In all cases we no longer have a previous item. */      /* In all cases we no longer have a previous item. */
2505    
2506      END_REPEAT:      END_REPEAT:
# Line 1754  for (;; ptr++) Line 2538  for (;; ptr++)
2538    
2539          case '(':          case '(':
2540          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
2541          if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)  
2542            /* Condition to test for recursion */
2543    
2544            if (ptr[1] == 'R')
2545              {
2546              code[1+LINK_SIZE] = OP_CREF;
2547              PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2548              skipbytes = 3;
2549              ptr += 3;
2550              }
2551    
2552            /* Condition to test for a numbered subpattern match */
2553    
2554            else if ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
2555            {            {
2556            int condref = *ptr - '0';            int condref = *(++ptr) - '0';
2557            while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';            while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2558            if (condref == 0)            if (condref == 0)
2559              {              {
# Line 1764  for (;; ptr++) Line 2561  for (;; ptr++)
2561              goto FAILED;              goto FAILED;
2562              }              }
2563            ptr++;            ptr++;
2564            code[3] = OP_CREF;            code[1+LINK_SIZE] = OP_CREF;
2565            code[4] = condref >> 8;            PUT2(code, 2+LINK_SIZE, condref);
           code[5] = condref & 255;  
2566            skipbytes = 3;            skipbytes = 3;
2567            }            }
2568          else ptr--;          /* For conditions that are assertions, we just fall through, having
2569            set bravalue above. */
2570          break;          break;
2571    
2572          case '=':                 /* Positive lookahead */          case '=':                 /* Positive lookahead */
# Line 1794  for (;; ptr++) Line 2591  for (;; ptr++)
2591            bravalue = OP_ASSERTBACK_NOT;            bravalue = OP_ASSERTBACK_NOT;
2592            ptr++;            ptr++;
2593            break;            break;
   
           default:                /* Syntax error */  
           *errorptr = ERR24;  
           goto FAILED;  
2594            }            }
2595          break;          break;
2596    
# Line 1806  for (;; ptr++) Line 2599  for (;; ptr++)
2599          ptr++;          ptr++;
2600          break;          break;
2601    
2602            case 'C':                 /* Callout - may be followed by digits */
2603            *code++ = OP_CALLOUT;
2604              {
2605              int n = 0;
2606              while ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
2607                n = n * 10 + *ptr - '0';
2608              if (n > 255)
2609                {
2610                *errorptr = ERR38;
2611                goto FAILED;
2612                }
2613              *code++ = n;
2614              }
2615            previous = NULL;
2616            continue;
2617    
2618            case 'P':                 /* Named subpattern handling */
2619            if (*(++ptr) == '<')      /* Definition */
2620              {
2621              int i, namelen;
2622              const uschar *name = ++ptr;
2623              uschar *slot = cd->name_table;
2624    
2625              while (*ptr++ != '>');
2626              namelen = ptr - name - 1;
2627    
2628              for (i = 0; i < cd->names_found; i++)
2629                {
2630                int c = memcmp(name, slot+2, namelen + 1);
2631                if (c == 0)
2632                  {
2633                  *errorptr = ERR43;
2634                  goto FAILED;
2635                  }
2636                if (c < 0)
2637                  {
2638                  memmove(slot + cd->name_entry_size, slot,
2639                    (cd->names_found - i) * cd->name_entry_size);
2640                  break;
2641                  }
2642                slot += cd->name_entry_size;
2643                }
2644    
2645              PUT2(slot, 0, *brackets + 1);
2646              memcpy(slot + 2, name, namelen);
2647              slot[2+namelen] = 0;
2648              cd->names_found++;
2649              goto NUMBERED_GROUP;
2650              }
2651    
2652            if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */
2653              {
2654              int i, namelen;
2655              int type = *ptr++;
2656              const uschar *name = ptr;
2657              uschar *slot = cd->name_table;
2658    
2659              while (*ptr != ')') ptr++;
2660              namelen = ptr - name;
2661    
2662              for (i = 0; i < cd->names_found; i++)
2663                {
2664                if (strncmp(name, slot+2, namelen) == 0) break;
2665                slot += cd->name_entry_size;
2666                }
2667              if (i >= cd->names_found)
2668                {
2669                *errorptr = ERR15;
2670                goto FAILED;
2671                }
2672    
2673              recno = GET2(slot, 0);
2674    
2675              if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */
2676    
2677              /* Back reference */
2678    
2679              previous = code;
2680              *code++ = OP_REF;
2681              PUT2INC(code, 0, recno);
2682              cd->backref_map |= (recno < 32)? (1 << recno) : 1;
2683              if (recno > cd->top_backref) cd->top_backref = recno;
2684              continue;
2685              }
2686    
2687            /* Should never happen */
2688            break;
2689    
2690          case 'R':                 /* Pattern recursion */          case 'R':                 /* Pattern recursion */
2691          *code++ = OP_RECURSE;          ptr++;                    /* Same as (?0)      */
2692          ptr++;          /* Fall through */
2693    
2694            /* Recursion or "subroutine" call */
2695    
2696            case '0': case '1': case '2': case '3': case '4':
2697            case '5': case '6': case '7': case '8': case '9':
2698              {
2699              const uschar *called;
2700              recno = 0;
2701    
2702              while ((cd->ctypes[*ptr] & ctype_digit) != 0)
2703                recno = recno * 10 + *ptr++ - '0';
2704    
2705              /* Come here from code above that handles a named recursion */
2706    
2707              HANDLE_RECURSION:
2708    
2709              previous = code;
2710    
2711              /* Find the bracket that is being referenced. Temporarily end the
2712              regex in case it doesn't exist. */
2713    
2714              *code = OP_END;
2715              called = (recno == 0)?
2716                cd->start_code : find_bracket(cd->start_code, utf8, recno);
2717    
2718              if (called == NULL)
2719                {
2720                *errorptr = ERR15;
2721                goto FAILED;
2722                }
2723    
2724              /* If the subpattern is still open, this is a recursive call. We
2725              check to see if this is a left recursion that could loop for ever,
2726              and diagnose that case. */
2727    
2728              if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
2729                {
2730                *errorptr = ERR40;
2731                goto FAILED;
2732                }
2733    
2734              /* Insert the recursion/subroutine item */
2735    
2736              *code = OP_RECURSE;
2737              PUT(code, 1, called - cd->start_code);
2738              code += 1 + LINK_SIZE;
2739              }
2740          continue;          continue;
2741    
2742            /* Character after (? not specially recognized */
2743    
2744          default:                  /* Option setting */          default:                  /* Option setting */
2745          set = unset = 0;          set = unset = 0;
2746          optset = &set;          optset = &set;
# Line 1827  for (;; ptr++) Line 2757  for (;; ptr++)
2757              case 'x': *optset |= PCRE_EXTENDED; break;              case 'x': *optset |= PCRE_EXTENDED; break;
2758              case 'U': *optset |= PCRE_UNGREEDY; break;              case 'U': *optset |= PCRE_UNGREEDY; break;
2759              case 'X': *optset |= PCRE_EXTRA; break;              case 'X': *optset |= PCRE_EXTRA; break;
   
             default:  
             *errorptr = ERR12;  
             goto FAILED;  
2760              }              }
2761            }            }
2762    
# Line 1839  for (;; ptr++) Line 2765  for (;; ptr++)
2765          newoptions = (options | set) & (~unset);          newoptions = (options | set) & (~unset);
2766    
2767          /* If the options ended with ')' this is not the start of a nested          /* If the options ended with ')' this is not the start of a nested
2768          group with option changes, so the options change at this level. At top          group with option changes, so the options change at this level. Compile
2769          level there is nothing else to be done (the options will in fact have          code to change the ims options if this setting actually changes any of
2770          been set from the start of compiling as a result of the first pass) but          them. We also pass the new setting back so that it can be put at the
2771          at an inner level we must compile code to change the ims options if          start of any following branches, and when this group ends (if we are in
2772          necessary, and pass the new setting back so that it can be put at the          a group), a resetting item can be compiled.
2773          start of any following branches, and when this group ends, a resetting  
2774          item can be compiled. */          Note that if this item is right at the start of the pattern, the
2775            options will have been abstracted and made global, so there will be no
2776            change to compile. */
2777    
2778          if (*ptr == ')')          if (*ptr == ')')
2779            {            {
2780            if ((options & PCRE_INGROUP) != 0 &&            if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
               (options & PCRE_IMS) != (newoptions & PCRE_IMS))  
2781              {              {
2782              *code++ = OP_OPT;              *code++ = OP_OPT;
2783              *code++ = *optchanged = newoptions & PCRE_IMS;              *code++ = newoptions & PCRE_IMS;
2784              }              }
2785            options = newoptions;  /* Change options at this level */  
2786              /* Change options at this level, and pass them back for use
2787              in subsequent branches. Reset the greedy defaults and the case
2788              value for firstbyte and reqbyte. */
2789    
2790              *optionsptr = options = newoptions;
2791              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
2792              greedy_non_default = greedy_default ^ 1;
2793              req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2794    
2795            previous = NULL;       /* This item can't be repeated */            previous = NULL;       /* This item can't be repeated */
2796            continue;              /* It is complete */            continue;              /* It is complete */
2797            }            }
# Line 1870  for (;; ptr++) Line 2806  for (;; ptr++)
2806          }          }
2807        }        }
2808    
2809        /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
2810        non-capturing and behave like (?:...) brackets */
2811    
2812        else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
2813          {
2814          bravalue = OP_BRA;
2815          }
2816    
2817      /* Else we have a referencing group; adjust the opcode. If the bracket      /* Else we have a referencing group; adjust the opcode. If the bracket
2818      number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and      number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
2819      arrange for the true number to follow later, in an OP_BRANUMBER item. */      arrange for the true number to follow later, in an OP_BRANUMBER item. */
2820    
2821      else      else
2822        {        {
2823          NUMBERED_GROUP:
2824        if (++(*brackets) > EXTRACT_BASIC_MAX)        if (++(*brackets) > EXTRACT_BASIC_MAX)
2825          {          {
2826          bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;          bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
2827          code[3] = OP_BRANUMBER;          code[1+LINK_SIZE] = OP_BRANUMBER;
2828          code[4] = *brackets >> 8;          PUT2(code, 2+LINK_SIZE, *brackets);
         code[5] = *brackets & 255;  
2829          skipbytes = 3;          skipbytes = 3;
2830          }          }
2831        else bravalue = OP_BRA + *brackets;        else bravalue = OP_BRA + *brackets;
# Line 1897  for (;; ptr++) Line 2841  for (;; ptr++)
2841      tempcode = code;      tempcode = code;
2842    
2843      if (!compile_regex(      if (!compile_regex(
2844           options | PCRE_INGROUP,       /* Set for all nested groups */           newoptions,                   /* The complete new option state */
2845           ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?           options & PCRE_IMS,           /* The previous ims option state */
            newoptions & PCRE_IMS : -1, /* Pass ims options if changed */  
2846           brackets,                     /* Extracting bracket count */           brackets,                     /* Extracting bracket count */
2847           &tempcode,                    /* Where to put code (updated) */           &tempcode,                    /* Where to put code (updated) */
2848           &ptr,                         /* Input pointer (updated) */           &ptr,                         /* Input pointer (updated) */
# Line 1907  for (;; ptr++) Line 2850  for (;; ptr++)
2850           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
2851            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
2852           skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */           skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */
2853           &subreqchar,                  /* For possible last char */           &subfirstbyte,                /* For possible first char */
2854           &subcountlits,                /* For literal count */           &subreqbyte,                  /* For possible last char */
2855             bcptr,                        /* Current branch chain */
2856           cd))                          /* Tables block */           cd))                          /* Tables block */
2857        goto FAILED;        goto FAILED;
2858    
# Line 1927  for (;; ptr++) Line 2871  for (;; ptr++)
2871    
2872        do {        do {
2873           condcount++;           condcount++;
2874           tc += (tc[1] << 8) | tc[2];           tc += GET(tc,1);
2875           }           }
2876        while (*tc != OP_KET);        while (*tc != OP_KET);
2877    
# Line 1936  for (;; ptr++) Line 2880  for (;; ptr++)
2880          *errorptr = ERR27;          *errorptr = ERR27;
2881          goto FAILED;          goto FAILED;
2882          }          }
2883    
2884          /* If there is just one branch, we must not make use of its firstbyte or
2885          reqbyte, because this is equivalent to an empty second branch. */
2886    
2887          if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
2888        }        }
2889    
2890      /* Handle updating of the required character. If the subpattern didn't      /* Handle updating of the required and first characters. Update for normal
2891      set one, leave it as it was. Otherwise, update it for normal brackets of      brackets of all kinds, and conditions with two branches (see code above).
2892      all kinds, forward assertions, and conditions with two branches. Don't      If the bracket is followed by a quantifier with zero repeat, we have to
2893      update the literal count for forward assertions, however. If the bracket      back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
2894      is followed by a quantifier with zero repeat, we have to back off. Hence      main loop so that they can be accessed for the back off. */
2895      the definition of prevreqchar and subcountlits outside the main loop so  
2896      that they can be accessed for the back off. */      zeroreqbyte = reqbyte;
2897        zerofirstbyte = firstbyte;
2898      if (subreqchar > 0 &&      groupsetfirstbyte = FALSE;
2899           (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||  
2900           (bravalue == OP_COND && condcount == 2)))      if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
2901        {        {
2902        prevreqchar = *reqchar;        /* If we have not yet set a firstbyte in this branch, take it from the
2903        *reqchar = subreqchar;        subpattern, remembering that it was set here so that a repeat of more
2904        if (bravalue != OP_ASSERT) *countlits += subcountlits;        than one can replicate it as reqbyte if necessary. If the subpattern has
2905          no firstbyte, set "none" for the whole branch. In both cases, a zero
2906          repeat forces firstbyte to "none". */
2907    
2908          if (firstbyte == REQ_UNSET)
2909            {
2910            if (subfirstbyte >= 0)
2911              {
2912              firstbyte = subfirstbyte;
2913              groupsetfirstbyte = TRUE;
2914              }
2915            else firstbyte = REQ_NONE;
2916            zerofirstbyte = REQ_NONE;
2917            }
2918    
2919          /* If firstbyte was previously set, convert the subpattern's firstbyte
2920          into reqbyte if there wasn't one. */
2921    
2922          else if (subfirstbyte >= 0 && subreqbyte < 0) subreqbyte = subfirstbyte;
2923    
2924          /* If the subpattern set a required char (or set a first char that isn't
2925          really the first char - see above), set it. */
2926    
2927          if (subreqbyte >= 0) reqbyte = subreqbyte;
2928        }        }
2929    
2930        /* For a forward assertion, we take the reqbyte, if set. This can be
2931        helpful if the pattern that follows the assertion doesn't set a different
2932        char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
2933        for an assertion, however, because it leads to an incorrect effect for patterns
2934        such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
2935        of a firstbyte. This is overcome by a scan at the end if there's no
2936        firstbyte, looking for an asserted first char. */
2937    
2938        else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
2939    
2940      /* Now update the main code pointer to the end of the group. */      /* Now update the main code pointer to the end of the group. */
2941    
2942      code = tempcode;      code = tempcode;
# Line 1985  for (;; ptr++) Line 2967  for (;; ptr++)
2967    
2968      if (c < 0)      if (c < 0)
2969        {        {
2970          if (-c == ESC_Q)            /* Handle start of quoted string */
2971            {
2972            if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
2973              else inescq = TRUE;
2974            continue;
2975            }
2976    
2977          /* For metasequences that actually match a character, we disable the
2978          setting of a first character if it hasn't already been set. */
2979    
2980          if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
2981            firstbyte = REQ_NONE;
2982    
2983          /* Set values to reset to if this is followed by a zero repeat. */
2984    
2985          zerofirstbyte = firstbyte;
2986          zeroreqbyte = reqbyte;
2987    
2988          /* Back references are handled specially */
2989    
2990        if (-c >= ESC_REF)        if (-c >= ESC_REF)
2991          {          {
2992          int number = -c - ESC_REF;          int number = -c - ESC_REF;
2993          previous = code;          previous = code;
2994          *code++ = OP_REF;          *code++ = OP_REF;
2995          *code++ = number >> 8;          PUT2INC(code, 0, number);
         *code++ = number & 255;  
2996          }          }
2997        else        else
2998          {          {
# Line 2019  for (;; ptr++) Line 3020  for (;; ptr++)
3020    
3021      do      do
3022        {        {
3023          /* If in \Q...\E, check for the end; if not, we always have a literal */
3024    
3025          if (inescq)
3026            {
3027            if (c == '\\' && ptr[1] == 'E')
3028              {
3029              inescq = FALSE;
3030              ptr++;
3031              }
3032            else
3033              {
3034              *code++ = c;
3035              length++;
3036              }
3037            continue;
3038            }
3039    
3040          /* Skip white space and comments for /x patterns */
3041    
3042        if ((options & PCRE_EXTENDED) != 0)        if ((options & PCRE_EXTENDED) != 0)
3043          {          {
3044          if ((cd->ctypes[c] & ctype_space) != 0) continue;          if ((cd->ctypes[c] & ctype_space) != 0) continue;
# Line 2046  for (;; ptr++) Line 3066  for (;; ptr++)
3066          two or more characters in the UTF-8 encoding. */          two or more characters in the UTF-8 encoding. */
3067    
3068  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3069          if (c > 127 && (options & PCRE_UTF8) != 0)          if (utf8 && c > 127)
3070            {            {
3071            uschar buffer[8];            uschar buffer[8];
3072            int len = ord2utf8(c, buffer);            int len = ord2utf8(c, buffer);
# Line 2067  for (;; ptr++) Line 3087  for (;; ptr++)
3087    
3088      while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);      while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
3089    
3090      /* Update the last character and the count of literals */      /* Update the first and last requirements. These are always bytes, even in
3091        UTF-8 mode. However, there is a special case to be considered when there
3092        are only one or two characters. Because this gets messy in UTF-8 mode, the
3093        code is kept separate. When we get here "length" contains the number of
3094        bytes. */
3095    
3096      prevreqchar = (length > 1)? code[-2] : *reqchar;  #ifdef SUPPORT_UTF8
3097      *reqchar = code[-1];      if (utf8 && length > 1)
3098      *countlits += length;        {
3099          uschar *t = previous + 3;                      /* After this code, t */
3100          while (t < code && (*t & 0xc0) == 0x80) t++;   /* follows the 1st char */
3101    
3102      /* Compute the length and set it in the data vector, and advance to        /* Handle the case when there is only one multibyte character. It must
3103      the next state. */        have at least two bytes because of the "length > 1" test above. */
3104    
3105      previous[1] = length;        if (t == code)
3106      if (length < MAXLIT) ptr--;          {
3107      break;          /* If no previous first byte, set it from this character, but revert to
3108      }          none on a zero repeat. */
   }                   /* end of big loop */  
3109    
3110  /* Control never reaches here by falling through, only by a goto for all the          if (firstbyte == REQ_UNSET)
3111  error states. Pass back the position in the pattern so that it can be displayed            {
3112  to the user for diagnosing the error. */            zerofirstbyte = REQ_NONE;
3113              firstbyte = previous[2];
3114              }
3115    
3116  FAILED:          /* Otherwise, leave the first byte value alone, and don't change it on
3117  *ptrptr = ptr;          a zero repeat */
 return FALSE;  
 }  
3118    
3119            else zerofirstbyte = firstbyte;
3120    
3121            /* In both cases, a zero repeat resets the previous required byte */
3122    
3123            zeroreqbyte = reqbyte;
3124            }
3125    
3126  /*************************************************        /* Handle the case when there is more than one character. These may be
3127  *     Compile sequence of alternatives           *        single-byte or multibyte characters */
 *************************************************/  
3128    
3129  /* On entry, ptr is pointing past the bracket character, but on return        else
3130  it points to the closing bracket, or vertical bar, or end of string.          {
3131  The code variable is pointing at the byte into which the BRA operator has been          uschar *t = code - 1;               /* After this code, t is at the */
3132            while ((*t & 0xc0) == 0x80) t--;    /* start of the last character */
3133    
3134            /* If no previous first byte, set it from the first character, and
3135            retain it on a zero repeat (of the last character). The required byte
3136            is reset on a zero repeat to the byte before the last
3137            character, unless this is the first byte of the string. In that case,
3138            it reverts to its previous value. */
3139    
3140            if (firstbyte == REQ_UNSET)
3141              {
3142              zerofirstbyte = firstbyte = previous[2] | req_caseopt;
3143              zeroreqbyte = (t - 1 == previous + 2)? reqbyte : t[-1] | req_caseopt;
3144              }
3145    
3146            /* If there was a previous first byte, leave it alone, and don't change
3147            it on a zero repeat. The required byte is reset on a zero repeat to the
3148            byte before the last character. */
3149    
3150            else
3151              {
3152              zerofirstbyte = firstbyte;
3153              zeroreqbyte = t[-1] | req_caseopt;
3154              }
3155            }
3156    
3157          /* In all cases (we know length > 1), the new required byte is the last
3158          byte of the string. */
3159    
3160          reqbyte = code[-1] | req_caseopt;
3161          }
3162    
3163        else   /* End of UTF-8 coding */
3164    #endif
3165    
3166        /* This is the code for non-UTF-8 operation, either without UTF-8 support,
3167        or when UTF-8 is not enabled. */
3168    
3169          {
3170          /* firstbyte was not previously set; take it from this string */
3171    
3172          if (firstbyte == REQ_UNSET)
3173            {
3174            if (length == 1)
3175              {
3176              zerofirstbyte = REQ_NONE;
3177              firstbyte = previous[2] | req_caseopt;
3178              zeroreqbyte = reqbyte;
3179              }
3180            else
3181              {
3182              zerofirstbyte = firstbyte = previous[2] | req_caseopt;
3183              zeroreqbyte = (length > 2)? (code[-2] | req_caseopt) : reqbyte;
3184              reqbyte = code[-1] | req_caseopt;
3185              }
3186            }
3187    
3188          /* firstbyte was previously set */
3189    
3190          else
3191            {
3192            zerofirstbyte = firstbyte;
3193            zeroreqbyte = (length == 1)? reqbyte : code[-2] | req_caseopt;
3194            reqbyte = code[-1] | req_caseopt;
3195            }
3196          }
3197    
3198        /* Set the length in the data vector, and advance to the next state. */
3199    
3200        previous[1] = length;
3201        if (length < MAXLIT) ptr--;
3202        break;
3203        }
3204      }                   /* end of big loop */
3205    
3206    /* Control never reaches here by falling through, only by a goto for all the
3207    error states. Pass back the position in the pattern so that it can be displayed
3208    to the user for diagnosing the error. */
3209    
3210    FAILED:
3211    *ptrptr = ptr;
3212    return FALSE;
3213    }
3214    
3215    
3216    
3217    
3218    /*************************************************
3219    *     Compile sequence of alternatives           *
3220    *************************************************/
3221    
3222    /* On entry, ptr is pointing past the bracket character, but on return
3223    it points to the closing bracket, or vertical bar, or end of string.
3224    The code variable is pointing at the byte into which the BRA operator has been
3225  stored. If the ims options are changed at the start (for a (?ims: group) or  stored. If the ims options are changed at the start (for a (?ims: group) or
3226  during any branch, we need to insert an OP_OPT item at the start of every  during any branch, we need to insert an OP_OPT item at the start of every
3227  following branch to ensure they get set correctly at run time, and also pass  following branch to ensure they get set correctly at run time, and also pass
3228  the new options into every subsequent branch compile.  the new options into every subsequent branch compile.
3229    
3230  Argument:  Argument:
3231    options     the option bits    options        option bits, including any changes for this subpattern
3232    optchanged  new ims options to set as if (?ims) were at the start, or -1    oldims         previous settings of ims option bits
3233                 for no change    brackets       -> int containing the number of extracting brackets used
3234    brackets    -> int containing the number of extracting brackets used    codeptr        -> the address of the current code pointer
3235    codeptr     -> the address of the current code pointer    ptrptr         -> the address of the current pattern pointer
3236    ptrptr      -> the address of the current pattern pointer    errorptr       -> pointer to error message
3237    errorptr    -> pointer to error message    lookbehind     TRUE if this is a lookbehind assertion
3238    lookbehind  TRUE if this is a lookbehind assertion    skipbytes      skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3239    skipbytes   skip this many bytes at start (for OP_COND, OP_BRANUMBER)    firstbyteptr   place to put the first required character, or a negative number
3240    reqchar     -> place to put the last required character, or a negative number    reqbyteptr     place to put the last required character, or a negative number
3241    countlits   -> place to put the shortest literal count of any branch    bcptr          pointer to the chain of currently open branches
3242    cd          points to the data block with tables pointers    cd             points to the data block with tables pointers etc.
3243    
3244  Returns:      TRUE on success  Returns:      TRUE on success
3245  */  */
3246    
3247  static BOOL  static BOOL
3248  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,  compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3249    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
3250    int *reqchar, int *countlits, compile_data *cd)    int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3251  {  {
3252  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
3253  uschar *code = *codeptr;  uschar *code = *codeptr;
3254  uschar *last_branch = code;  uschar *last_branch = code;
3255  uschar *start_bracket = code;  uschar *start_bracket = code;
3256  uschar *reverse_count = NULL;  uschar *reverse_count = NULL;
3257  int oldoptions = options & PCRE_IMS;  int firstbyte, reqbyte;
3258  int branchreqchar, branchcountlits;  int branchfirstbyte, branchreqbyte;
3259    branch_chain bc;
3260    
3261    bc.outer = bcptr;
3262    bc.current = code;
3263    
3264    firstbyte = reqbyte = REQ_UNSET;
3265    
3266    /* Offset is set zero to mark that this bracket is still open */
3267    
3268  *reqchar = -1;  PUT(code, 1, 0);
3269  *countlits = INT_MAX;  code += 1 + LINK_SIZE + skipbytes;
 code += 3 + skipbytes;  
3270    
3271  /* Loop for each alternative branch */  /* Loop for each alternative branch */
3272    
3273  for (;;)  for (;;)
3274    {    {
3275    int length;    /* Handle a change of ims options at the start of the branch */
3276    
3277    /* Handle change of options */    if ((options & PCRE_IMS) != oldims)
   
   if (optchanged >= 0)  
3278      {      {
3279      *code++ = OP_OPT;      *code++ = OP_OPT;
3280      *code++ = optchanged;      *code++ = options & PCRE_IMS;
     options = (options & ~PCRE_IMS) | optchanged;  
3281      }      }
3282    
3283    /* Set up dummy OP_REVERSE if lookbehind assertion */    /* Set up dummy OP_REVERSE if lookbehind assertion */
# Line 2161  for (;;) Line 3286  for (;;)
3286      {      {
3287      *code++ = OP_REVERSE;      *code++ = OP_REVERSE;
3288      reverse_count = code;      reverse_count = code;
3289      *code++ = 0;      PUTINC(code, 0, 0);
     *code++ = 0;  
3290      }      }
3291    
3292    /* Now compile the branch */    /* Now compile the branch */
3293    
3294    if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,    if (!compile_branch(&options, brackets, &code, &ptr, errorptr,
3295        &branchreqchar, &branchcountlits, cd))          &branchfirstbyte, &branchreqbyte, &bc, cd))
3296      {      {
3297      *ptrptr = ptr;      *ptrptr = ptr;
3298      return FALSE;      return FALSE;
3299      }      }
3300    
3301    /* Fill in the length of the last branch */    /* If this is the first branch, the firstbyte and reqbyte values for the
3302      branch become the values for the regex. */
3303    
3304    length = code - last_branch;    if (*last_branch != OP_ALT)
3305    last_branch[1] = length >> 8;      {
3306    last_branch[2] = length & 255;      firstbyte = branchfirstbyte;
3307        reqbyte = branchreqbyte;
3308        }
3309    
3310    /* Save the last required character if all branches have the same; a current    /* If this is not the first branch, the first char and reqbyte have to
3311    value of -1 means unset, while -2 means "previous branch had no last required    match the values from all the previous branches. */
   char".  */  
3312    
3313    if (*reqchar != -2)    else
3314      {      {
3315      if (branchreqchar >= 0)      /* If we previously had a firstbyte, but it doesn't match the new branch,
3316        we have to abandon the firstbyte for the regex, but if there was previously
3317        no reqbyte, it takes on the value of the old firstbyte. */
3318    
3319        if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3320        {        {
3321        if (*reqchar == -1) *reqchar = branchreqchar;        if (reqbyte < 0) reqbyte = firstbyte;
3322        else if (*reqchar != branchreqchar) *reqchar = -2;        firstbyte = REQ_NONE;
3323        }        }
     else *reqchar = -2;  
     }  
3324    
3325    /* Keep the shortest literal count */      /* If we (now or from before) have no firstbyte, a firstbyte from the
3326        branch becomes a reqbyte if there isn't a branch reqbyte. */
3327    
3328        if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3329            branchreqbyte = branchfirstbyte;
3330    
3331        /* Now ensure that the reqbytes match */
3332    
3333    if (branchcountlits < *countlits) *countlits = branchcountlits;      if (reqbyte != branchreqbyte) reqbyte = REQ_NONE;
3334    DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));      }
3335    
3336    /* If lookbehind, check that this branch matches a fixed-length string,    /* If lookbehind, check that this branch matches a fixed-length string,
3337    and put the length into the OP_REVERSE item. Temporarily mark the end of    and put the length into the OP_REVERSE item. Temporarily mark the end of
# Line 2205  for (;;) Line 3339  for (;;)
3339    
3340    if (lookbehind)    if (lookbehind)
3341      {      {
3342        int length;
3343      *code = OP_END;      *code = OP_END;
3344      length = find_fixedlength(last_branch, options);      length = find_fixedlength(last_branch, options);
3345      DPRINTF(("fixed length = %d\n", length));      DPRINTF(("fixed length = %d\n", length));
3346      if (length < 0)      if (length < 0)
3347        {        {
3348        *errorptr = ERR25;        *errorptr = (length == -2)? ERR36 : ERR25;
3349        *ptrptr = ptr;        *ptrptr = ptr;
3350        return FALSE;        return FALSE;
3351        }        }
3352      reverse_count[0] = (length >> 8);      PUT(reverse_count, 0, length);
     reverse_count[1] = length & 255;  
3353      }      }
3354    
3355    /* Reached end of expression, either ')' or end of pattern. Insert a    /* Reached end of expression, either ')' or end of pattern. Go back through
3356    terminating ket and the length of the whole bracketed item, and return,    the alternative branches and reverse the chain of offsets, with the field in
3357    leaving the pointer at the terminating char. If any of the ims options    the BRA item now becoming an offset to the first alternative. If there are
3358    were changed inside the group, compile a resetting op-code following. */    no alternatives, it points to the end of the group. The length in the
3359      terminating ket is always the length of the whole bracketed item. If any of
3360      the ims options were changed inside the group, compile a resetting op-code
3361      following, except at the very end of the pattern. Return leaving the pointer
3362      at the terminating char. */
3363    
3364    if (*ptr != '|')    if (*ptr != '|')
3365      {      {
3366      length = code - start_bracket;      int length = code - last_branch;
3367      *code++ = OP_KET;      do
     *code++ = length >> 8;  
     *code++ = length & 255;  
     if (optchanged >= 0)  
3368        {        {
3369        *code++ = OP_OPT;        int prev_length = GET(last_branch, 1);
3370        *code++ = oldoptions;        PUT(last_branch, 1, length);
3371          length = prev_length;
3372          last_branch -= length;
3373        }        }
3374      *codeptr = code;      while (length > 0);
     *ptrptr = ptr;  
     return TRUE;  
     }  
   
   /* Another branch follows; insert an "or" node and advance the pointer. */  
   
   *code = OP_ALT;  
   last_branch = code;  
   code += 3;  
   ptr++;  
   }  
 /* Control never reaches here */  
 }  
   
   
3375    
3376        /* Fill in the ket */
3377    
3378  /*************************************************      *code = OP_KET;
3379  *      Find first significant op code            *      PUT(code, 1, code - start_bracket);
3380  *************************************************/      code += 1 + LINK_SIZE;
   
 /* This is called by several functions that scan a compiled expression looking  
 for a fixed first character, or an anchoring op code etc. It skips over things  
 that do not influence this. For one application, a change of caseless option is  
 important.  
   
 Arguments:  
   code       pointer to the start of the group  
   options    pointer to external options  
   optbit     the option bit whose changing is significant, or  
              zero if none are  
   optstop    TRUE to return on option change, otherwise change the options  
                value and continue  
3381    
3382  Returns:     pointer to the first significant opcode      /* Resetting option if needed */
 */  
3383    
3384  static const uschar*      if ((options & PCRE_IMS) != oldims && *ptr == ')')
 first_significant_code(const uschar *code, int *options, int optbit,  
   BOOL optstop)  
 {  
 for (;;)  
   {  
   switch ((int)*code)  
     {  
     case OP_OPT:  
     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))  
3385        {        {
3386        if (optstop) return code;        *code++ = OP_OPT;
3387        *options = (int)code[1];        *code++ = oldims;
3388        }        }
     code += 2;  
     break;  
3389    
3390      case OP_CREF:      /* Set values to pass back */
     case OP_BRANUMBER:  
     code += 3;  
     break;  
3391    
3392      case OP_WORD_BOUNDARY:      *codeptr = code;
3393      case OP_NOT_WORD_BOUNDARY:      *ptrptr = ptr;
3394      code++;      *firstbyteptr = firstbyte;
3395      break;      *reqbyteptr = reqbyte;
3396        return TRUE;
3397        }
3398    
3399      case OP_ASSERT_NOT:    /* Another branch follows; insert an "or" node. Its length field points back
3400      case OP_ASSERTBACK:    to the previous branch while the bracket remains open. At the end the chain
3401      case OP_ASSERTBACK_NOT:    is reversed. It's done like this so that the start of the bracket has a
3402      do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);    zero offset until it is closed, making it possible to detect recursion. */
     code += 3;  
     break;  
3403    
3404      default:    *code = OP_ALT;
3405      return code;    PUT(code, 1, code - last_branch);
3406      }    bc.current = last_branch = code;
3407      code += 1 + LINK_SIZE;
3408      ptr++;
3409    }    }
3410  /* Control never reaches here */  /* Control never reaches here */
3411  }  }
# Line 2326  all of whose alternatives start with OP_ Line 3423  all of whose alternatives start with OP_
3423  it's anchored. However, if this is a multiline pattern, then only OP_SOD  it's anchored. However, if this is a multiline pattern, then only OP_SOD
3424  counts, since OP_CIRC can match in the middle.  counts, since OP_CIRC can match in the middle.
3425    
3426    We can also consider a regex to be anchored if OP_SOM starts all its branches.
3427    This is the code for \G, which means "match at start of match position, taking
3428    into account the match offset".
3429    
3430  A branch is also implicitly anchored if it starts with .* and DOTALL is set,  A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3431  because that will try the rest of the pattern at all possible matching points,  because that will try the rest of the pattern at all possible matching points,
3432  so there is no point trying them again.  so there is no point trying again.... er ....
3433    
3434    .... except when the .* appears inside capturing parentheses, and there is a
3435    subsequent back reference to those parentheses. We haven't enough information
3436    to catch that case precisely.
3437    
3438    At first, the best we could do was to detect when .* was in capturing brackets
3439    and the highest back reference was greater than or equal to that level.
3440    However, by keeping a bitmap of the first 31 back references, we can catch some
3441    of the more common cases more precisely.
3442    
3443  Arguments:  Arguments:
3444    code       points to start of expression (the bracket)    code           points to start of expression (the bracket)
3445    options    points to the options setting    options        points to the options setting
3446      bracket_map    a bitmap of which brackets we are inside while testing; this
3447                      handles up to substring 31; after that we just have to take
3448                      the less precise approach
3449      backref_map    the back reference bitmap
3450    
3451  Returns:     TRUE or FALSE  Returns:     TRUE or FALSE
3452  */  */
3453    
3454  static BOOL  static BOOL
3455  is_anchored(register const uschar *code, int *options)  is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3456      unsigned int backref_map)
3457  {  {
3458  do {  do {
3459     const uschar *scode = first_significant_code(code + 3, options,     const uschar *scode =
3460       PCRE_MULTILINE, FALSE);       first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE);
3461     register int op = *scode;     register int op = *scode;
3462     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)  
3463       { if (!is_anchored(scode, options)) return FALSE; }     /* Capturing brackets */
3464    
3465       if (op > OP_BRA)
3466         {
3467         int new_map;
3468         op -= OP_BRA;
3469         if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3470         new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3471         if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3472         }
3473    
3474       /* Other brackets */
3475    
3476       else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3477         {
3478         if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3479         }
3480    
3481       /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3482       are or may be referenced. */
3483    
3484     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3485              (*options & PCRE_DOTALL) != 0)              (*options & PCRE_DOTALL) != 0)
3486       { if (scode[1] != OP_ANY) return FALSE; }       {
3487     else if (op != OP_SOD &&       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3488         }
3489    
3490       /* Check for explicit anchoring */
3491    
3492       else if (op != OP_SOD && op != OP_SOM &&
3493             ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))             ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3494       return FALSE;       return FALSE;
3495     code += (code[1] << 8) + code[2];     code += GET(code, 1);
3496     }     }
3497  while (*code == OP_ALT);  while (*code == OP_ALT);   /* Loop for each alternative */
3498  return TRUE;  return TRUE;
3499  }  }
3500    
# Line 2367  return TRUE; Line 3507  return TRUE;
3507  /* This is called to find out if every branch starts with ^ or .* so that  /* This is called to find out if every branch starts with ^ or .* so that
3508  "first char" processing can be done to speed things up in multiline  "first char" processing can be done to speed things up in multiline
3509  matching and for non-DOTALL patterns that start with .* (which must start at  matching and for non-DOTALL patterns that start with .* (which must start at
3510  the beginning or after \n).  the beginning or after \n). As in the case of is_anchored() (see above), we
3511    have to take account of back references to capturing brackets that contain .*
3512    because in that case we can't make the assumption.
3513    
3514  Argument:  points to start of expression (the bracket)  Arguments:
3515  Returns:   TRUE or FALSE    code           points to start of expression (the bracket)
3516      bracket_map    a bitmap of which brackets we are inside while testing; this
3517                      handles up to substring 31; after that we just have to take
3518                      the less precise approach
3519      backref_map    the back reference bitmap
3520    
3521    Returns:         TRUE or FALSE
3522  */  */
3523    
3524  static BOOL  static BOOL
3525  is_startline(const uschar *code)  is_startline(const uschar *code, unsigned int bracket_map,
3526      unsigned int backref_map)
3527  {  {
3528  do {  do {
3529     const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);     const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0);
3530     register int op = *scode;     register int op = *scode;
3531     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)  
3532       { if (!is_startline(scode)) return FALSE; }     /* Capturing brackets */
3533    
3534       if (op > OP_BRA)
3535         {
3536         int new_map;
3537         op -= OP_BRA;
3538         if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3539         new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3540         if (!is_startline(scode, new_map, backref_map)) return FALSE;
3541         }
3542    
3543       /* Other brackets */
3544    
3545       else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3546         { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3547    
3548       /* .* means "start at start or after \n" if it isn't in brackets that
3549       may be referenced. */
3550    
3551     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3552       { if (scode[1] != OP_ANY) return FALSE; }       {
3553         if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3554         }
3555    
3556       /* Check for explicit circumflex */
3557    
3558     else if (op != OP_CIRC) return FALSE;     else if (op != OP_CIRC) return FALSE;
3559     code += (code[1] << 8) + code[2];     code += GET(code, 1);
3560     }     }
3561  while (*code == OP_ALT);  while (*code == OP_ALT);  /* Loop for each alternative */
3562  return TRUE;  return TRUE;
3563  }  }
3564    
3565    
3566    
3567  /*************************************************  /*************************************************
3568  *          Check for fixed first char            *  *       Check for asserted fixed first char      *
3569  *************************************************/  *************************************************/
3570    
3571  /* Try to find out if there is a fixed first character. This is called for  /* During compilation, the "first char" settings from forward assertions are
3572  unanchored expressions, as it speeds up their processing quite considerably.  discarded, because they can cause conflicts with actual literals that follow.
3573  Consider each alternative branch. If they all start with the same char, or with  However, if we end up without a first char setting for an unanchored pattern,
3574  a bracket all of whose alternatives start with the same char (recurse ad lib),  it is worth scanning the regex to see if there is an initial asserted first
3575  then we return that char, otherwise -1.  char. If all branches start with the same asserted char, or with a bracket all
3576    of whose alternatives start with the same asserted char (recurse ad lib), then
3577    we return that char, otherwise -1.
3578    
3579  Arguments:  Arguments:
3580    code       points to start of expression (the bracket)    code       points to start of expression (the bracket)
3581    options    pointer to the options (used to check casing changes)    options    pointer to the options (used to check casing changes)
3582      inassert   TRUE if in an assertion
3583    
3584  Returns:     -1 or the fixed first char  Returns:     -1 or the fixed first char
3585  */  */
3586    
3587  static int  static int
3588  find_firstchar(const uschar *code, int *options)  find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3589  {  {
3590  register int c = -1;  register int c = -1;
3591  do {  do {
3592     int d;     int d;
3593     const uschar *scode = first_significant_code(code + 3, options,     const uschar *scode =
3594       PCRE_CASELESS, TRUE);       first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS);
3595     register int op = *scode;     register int op = *scode;
3596    
3597     if (op >= OP_BRA) op = OP_BRA;     if (op >= OP_BRA) op = OP_BRA;
# Line 2430  do { Line 3605  do {
3605       case OP_ASSERT:       case OP_ASSERT:
3606       case OP_ONCE:       case OP_ONCE:
3607       case OP_COND:       case OP_COND:
3608       if ((d = find_firstchar(scode, options)) < 0) return -1;       if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3609           return -1;
3610       if (c < 0) c = d; else if (c != d) return -1;       if (c < 0) c = d; else if (c != d) return -1;
3611       break;       break;
3612    
# Line 2442  do { Line 3618  do {
3618    
3619       case OP_PLUS:       case OP_PLUS:
3620       case OP_MINPLUS:       case OP_MINPLUS:
3621       if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;       if (!inassert) return -1;
3622         if (c < 0)
3623           {
3624           c = scode[1];
3625           if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
3626           }
3627         else if (c != scode[1]) return -1;
3628       break;       break;
3629       }       }
3630    
3631     code += (code[1] << 8) + code[2];     code += GET(code, 1);
3632     }     }
3633  while (*code == OP_ALT);  while (*code == OP_ALT);
3634  return c;  return c;
# Line 2455  return c; Line 3637  return c;
3637    
3638    
3639    
   
3640  /*************************************************  /*************************************************
3641  *        Compile a Regular Expression            *  *        Compile a Regular Expression            *
3642  *************************************************/  *************************************************/
# Line 2479  pcre_compile(const char *pattern, int op Line 3660  pcre_compile(const char *pattern, int op
3660    int *erroroffset, const unsigned char *tables)    int *erroroffset, const unsigned char *tables)
3661  {  {
3662  real_pcre *re;  real_pcre *re;
3663  int length = 3;      /* For initial BRA plus length */  int length = 1 + LINK_SIZE;      /* For initial BRA plus length */
3664  int runlength;  int runlength;
3665  int c, reqchar, countlits;  int c, firstbyte, reqbyte;
3666  int bracount = 0;  int bracount = 0;
 int top_backref = 0;  
3667  int branch_extra = 0;  int branch_extra = 0;
3668  int branch_newextra;  int branch_newextra;
3669    int item_count = -1;
3670    int name_count = 0;
3671    int max_name_size = 0;
3672    #ifdef SUPPORT_UTF8
3673    int lastcharlength = 0;
3674    BOOL utf8;
3675    BOOL class_utf8;
3676    #endif
3677    BOOL inescq = FALSE;
3678  unsigned int brastackptr = 0;  unsigned int brastackptr = 0;
3679  size_t size;  size_t size;
3680  uschar *code;  uschar *code;
3681    const uschar *codestart;
3682  const uschar *ptr;  const uschar *ptr;
3683  compile_data compile_block;  compile_data compile_block;
3684  int brastack[BRASTACK_SIZE];  int brastack[BRASTACK_SIZE];
3685  uschar bralenstack[BRASTACK_SIZE];  uschar bralenstack[BRASTACK_SIZE];
3686    
 #ifdef DEBUG  
 uschar *code_base, *code_end;  
 #endif  
   
 /* Can't support UTF8 unless PCRE has been compiled to include the code. */  
   
 #ifndef SUPPORT_UTF8  
 if ((options & PCRE_UTF8) != 0)  
   {  
   *errorptr = ERR32;  
   return NULL;  
   }  
 #endif  
   
3687  /* We can't pass back an error message if errorptr is NULL; I guess the best we  /* We can't pass back an error message if errorptr is NULL; I guess the best we
3688  can do is just return NULL. */  can do is just return NULL. */
3689    
# Line 2523  if (erroroffset == NULL) Line 3699  if (erroroffset == NULL)
3699    }    }
3700  *erroroffset = 0;  *erroroffset = 0;
3701    
3702    /* Can't support UTF8 unless PCRE has been compiled to include the code. */
3703    
3704    #ifdef SUPPORT_UTF8
3705    utf8 = (options & PCRE_UTF8) != 0;
3706    #else
3707    if ((options & PCRE_UTF8) != 0)
3708      {
3709      *errorptr = ERR32;
3710      return NULL;
3711      }
3712    #endif
3713    
3714  if ((options & ~PUBLIC_OPTIONS) != 0)  if ((options & ~PUBLIC_OPTIONS) != 0)
3715    {    {
3716    *errorptr = ERR17;    *errorptr = ERR17;
# Line 2537  compile_block.fcc = tables + fcc_offset; Line 3725  compile_block.fcc = tables + fcc_offset;
3725  compile_block.cbits = tables + cbits_offset;  compile_block.cbits = tables + cbits_offset;
3726  compile_block.ctypes = tables + ctypes_offset;  compile_block.ctypes = tables + ctypes_offset;
3727    
3728    /* Maximum back reference and backref bitmap. This is updated for numeric
3729    references during the first pass, but for named references during the actual
3730    compile pass. The bitmap records up to 31 back references to help in deciding
3731    whether (.*) can be treated as anchored or not. */
3732    
3733    compile_block.top_backref = 0;
3734    compile_block.backref_map = 0;
3735    
3736  /* Reflect pattern for debugging output */  /* Reflect pattern for debugging output */
3737    
3738  DPRINTF(("------------------------------------------------------------------\n"));  DPRINTF(("------------------------------------------------------------------\n"));
# Line 2545  DPRINTF(("%s\n", pattern)); Line 3741  DPRINTF(("%s\n", pattern));
3741  /* The first thing to do is to make a pass over the pattern to compute the  /* The first thing to do is to make a pass over the pattern to compute the
3742  amount of store required to hold the compiled code. This does not have to be  amount of store required to hold the compiled code. This does not have to be
3743  perfect as long as errors are overestimates. At the same time we can detect any  perfect as long as errors are overestimates. At the same time we can detect any
3744  internal flag settings. Make an attempt to correct for any counted white space  flag settings right at the start, and extract them. Make an attempt to correct
3745  if an "extended" flag setting appears late in the pattern. We can't be so  for any counted white space if an "extended" flag setting appears late in the
3746  clever for #-comments. */  pattern. We can't be so clever for #-comments. */
3747    
3748  ptr = (const uschar *)(pattern - 1);  ptr = (const uschar *)(pattern - 1);
3749  while ((c = *(++ptr)) != 0)  while ((c = *(++ptr)) != 0)
3750    {    {
3751    int min, max;    int min, max;
3752    int class_charcount;    int class_optcount;
3753    int bracket_length;    int bracket_length;
3754      int duplength;
3755    
3756      /* If we are inside a \Q...\E sequence, all chars are literal */
3757    
3758      if (inescq) goto NORMAL_CHAR;
3759    
3760      /* Otherwise, first check for ignored whitespace and comments */
3761    
3762    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
3763      {      {
# Line 2564  while ((c = *(++ptr)) != 0) Line 3767  while ((c = *(++ptr)) != 0)
3767        /* The space before the ; is to avoid a warning on a silly compiler        /* The space before the ; is to avoid a warning on a silly compiler
3768        on the Macintosh. */        on the Macintosh. */
3769        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3770          if (c == 0) break;
3771        continue;        continue;
3772        }        }
3773      }      }
3774    
3775      item_count++;    /* Is zero for the first non-comment item */
3776    
3777    switch(c)    switch(c)
3778      {      {
3779      /* A backslashed item may be an escaped "normal" character or a      /* A backslashed item may be an escaped "normal" character or a
# Line 2587  while ((c = *(++ptr)) != 0) Line 3793  while ((c = *(++ptr)) != 0)
3793          goto NORMAL_CHAR;          goto NORMAL_CHAR;
3794          }          }
3795        }        }
3796    
3797        /* If \Q, enter "literal" mode */
3798    
3799        if (-c == ESC_Q)
3800          {
3801          inescq = TRUE;
3802          continue;
3803          }
3804    
3805        /* Other escapes need one byte, and are of length one for repeats */
3806    
3807      length++;      length++;
3808    #ifdef SUPPORT_UTF8
3809        lastcharlength = 1;
3810    #endif
3811    
3812      /* A back reference needs an additional 2 bytes, plus either one or 5      /* A back reference needs an additional 2 bytes, plus either one or 5
3813      bytes for a repeat. We also need to keep the value of the highest      bytes for a repeat. We also need to keep the value of the highest
# Line 2596  while ((c = *(++ptr)) != 0) Line 3816  while ((c = *(++ptr)) != 0)
3816      if (c <= -ESC_REF)      if (c <= -ESC_REF)
3817        {        {
3818        int refnum = -c - ESC_REF;        int refnum = -c - ESC_REF;
3819        if (refnum > top_backref) top_backref = refnum;        compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
3820          if (refnum > compile_block.top_backref)
3821            compile_block.top_backref = refnum;
3822        length += 2;   /* For single back reference */        length += 2;   /* For single back reference */
3823        if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))        if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
3824          {          {
# Line 2611  while ((c = *(++ptr)) != 0) Line 3833  while ((c = *(++ptr)) != 0)
3833        }        }
3834      continue;      continue;
3835    
3836      case '^':      case '^':     /* Single-byte metacharacters */
3837      case '.':      case '.':
3838      case '$':      case '$':
     case '*':     /* These repeats won't be after brackets; */  
     case '+':     /* those are handled separately */  
     case '?':  
3839      length++;      length++;
3840    #ifdef SUPPORT_UTF8
3841        lastcharlength = 1;
3842    #endif
3843      continue;      continue;
3844    
3845      /* This covers the cases of repeats after a single char, metachar, class,      case '*':            /* These repeats won't be after brackets; */
3846      or back reference. */      case '+':            /* those are handled separately */
3847        case '?':
3848        length++;
3849        goto POSESSIVE;      /* A few lines below */
3850    
3851        /* This covers the cases of braced repeats after a single char, metachar,
3852        class, or back reference. */
3853    
3854      case '{':      case '{':
3855      if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;      if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
3856      ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);      ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
3857      if (*errorptr != NULL) goto PCRE_ERROR_RETURN;      if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3858    
3859        /* These special cases just insert one extra opcode */
3860    
3861      if ((min == 0 && (max == 1 || max == -1)) ||      if ((min == 0 && (max == 1 || max == -1)) ||
3862        (min == 1 && max == -1))        (min == 1 && max == -1))
3863          length++;          length++;
3864    
3865        /* These cases might insert additional copies of a preceding character. */
3866    
3867      else      else
3868        {        {
3869        length--;   /* Uncount the original char or metachar */  #ifdef SUPPORT_UTF8
3870        if (min == 1) length++; else if (min > 0) length += 4;        /* In UTF-8 mode, we should find the length in lastcharlength */
3871        if (max > 0) length += 4; else length += 2;        if (utf8)
3872            {
3873            if (min != 1)
3874              {
3875              length -= lastcharlength;   /* Uncount the original char or metachar */
3876              if (min > 0) length += 3 + lastcharlength;
3877              }
3878            length += lastcharlength + ((max > 0)? 3 : 1);
3879            }
3880          else
3881    #endif
3882    
3883          /* Not UTF-8 mode: all characters are one byte */
3884            {
3885            if (min != 1)
3886              {
3887              length--;   /* Uncount the original char or metachar */
3888              if (min > 0) length += 4;
3889              }
3890    
3891            length += (max > 0)? 4 : 2;
3892            }
3893          }
3894    
3895        if (ptr[1] == '?') ptr++;      /* Needs no extra length */
3896    
3897        POSESSIVE:                     /* Test for possessive quantifier */
3898        if (ptr[1] == '+')
3899          {
3900          ptr++;
3901          length += 2 + 2*LINK_SIZE;   /* Allow for atomic brackets */
3902        }        }
     if (ptr[1] == '?') ptr++;  
3903      continue;      continue;
3904    
3905      /* An alternation contains an offset to the next branch or ket. If any ims      /* An alternation contains an offset to the next branch or ket. If any ims
# Line 2645  while ((c = *(++ptr)) != 0) Line 3908  while ((c = *(++ptr)) != 0)
3908      branch. This is handled by branch_extra. */      branch. This is handled by branch_extra. */
3909    
3910      case '|':      case '|':
3911      length += 3 + branch_extra;      length += 1 + LINK_SIZE + branch_extra;
3912      continue;      continue;
3913    
3914      /* A character class uses 33 characters. Don't worry about character types      /* A character class uses 33 characters provided that all the character
3915      that aren't allowed in classes - they'll get picked up during the compile.      values are less than 256. Otherwise, it uses a bit map for low valued
3916      A character class that contains only one character uses 2 or 3 bytes,      characters, and individual items for others. Don't worry about character
3917      depending on whether it is negated or not. Notice this where we can. */      types that aren't allowed in classes - they'll get picked up during the
3918        compile. A character class that contains only one single-byte character
3919        uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
3920        where we can. (In UTF-8 mode we can do this only for chars < 128.) */
3921    
3922      case '[':      case '[':
3923      class_charcount = 0;      class_optcount = 0;
3924    
3925    #ifdef SUPPORT_UTF8
3926        class_utf8 = FALSE;
3927    #endif
3928    
3929      if (*(++ptr) == '^') ptr++;      if (*(++ptr) == '^') ptr++;
3930      do  
3931        /* Written as a "do" so that an initial ']' is taken as data */
3932    
3933        if (*ptr != 0) do
3934        {        {
3935          /* Inside \Q...\E everything is literal except \E */
3936    
3937          if (inescq)
3938            {
3939            if (*ptr != '\\' || ptr[1] != 'E') goto NON_SPECIAL_CHARACTER;
3940            inescq = FALSE;
3941            ptr += 1;
3942            continue;
3943            }
3944    
3945          /* Outside \Q...\E, check for escapes */
3946    
3947        if (*ptr == '\\')        if (*ptr == '\\')
3948          {          {
3949    #ifdef SUPPORT_UTF8
3950            int prevchar = ptr[-1];
3951    #endif
3952          int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,          int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
3953            &compile_block);            &compile_block);
3954          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3955          if (-ch == ESC_b) class_charcount++; else class_charcount = 10;  
3956          }          /* \b is backspace inside a class */
3957        else class_charcount++;  
3958        ptr++;          if (-ch == ESC_b) ch = '\b';
3959    
3960            /* \Q enters quoting mode */
3961    
3962            if (-ch == ESC_Q)
3963              {
3964              inescq = TRUE;
3965              continue;
3966              }
3967    
3968            /* Handle escapes that turn into characters */
3969    
3970            if (ch >= 0)
3971              {
3972    #ifdef SUPPORT_UTF8
3973              if (utf8)
3974                {
3975                if (ch > 127) class_optcount = 10;  /* Ensure > 1 */
3976                if (ch > 255)
3977                  {
3978                  uschar buffer[6];
3979                  if (!class_utf8)
3980                    {
3981                    class_utf8 = TRUE;
3982                    length += LINK_SIZE + 1 + 1;
3983                    }
3984                  length += 1 + ord2utf8(ch, buffer);
3985    
3986                  /* If this wide character is preceded by '-', add an extra 2 to
3987                  the length in case the previous character was < 128, because in
3988                  this case the whole range will be put into the list. */
3989    
3990                  if (prevchar == '-') length += 2;
3991                  }
3992                }
3993    #endif
3994              class_optcount++;            /* for possible optimization */
3995              }
3996            else class_optcount = 10;      /* \d, \s etc; make sure > 1 */
3997            }
3998    
3999          /* Check the syntax for POSIX stuff. The bits we actually handle are
4000          checked during the real compile phase. */
4001    
4002          else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4003            {
4004            ptr++;
4005            class_optcount = 10;    /* Make sure > 1 */
4006            }
4007    
4008          /* Anything else just increments the possible optimization count. If
4009          there are wide characters, we are going to have to use an XCLASS. */
4010    
4011          else
4012            {
4013            NON_SPECIAL_CHARACTER:
4014            class_optcount++;
4015    
4016    #ifdef SUPPORT_UTF8
4017            if (utf8)
4018              {
4019              int c;
4020              int extra = 0;
4021              GETCHARLEN(c, ptr, extra);
4022              if (c > 127) class_optcount = 10;   /* No optimization possible */
4023              if (c > 255)
4024                {
4025                if (!class_utf8)
4026                  {
4027                  class_utf8 = TRUE;
4028                  length += LINK_SIZE + 1 + 1;
4029                  }
4030                length += 2 + extra;
4031    
4032                /* If this wide character is preceded by '-', add an extra 2 to
4033                the length in case the previous character was < 128, because in
4034                this case the whole range will be put into the list. */
4035    
4036                if (ptr[-1] == '-') length += 2;
4037    
4038                /* Advance to the end of this character */
4039    
4040                ptr += extra;
4041                }
4042              }
4043    #endif
4044            }
4045          }
4046        while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4047    
4048        if (*ptr == 0)                          /* Missing terminating ']' */
4049          {
4050          *errorptr = ERR6;
4051          goto PCRE_ERROR_RETURN;
4052        }        }
     while (*ptr != 0 && *ptr != ']');  
4053    
4054      /* Repeats for negated single chars are handled by the general code */      /* We can optimize when there was only one optimizable character. Repeats
4055        for positive and negated single one-byte chars are handled by the general
4056        code. Here, we handle repeats for the class opcodes. */
4057    
4058      if (class_charcount == 1) length += 3; else      if (class_optcount == 1) length += 3; else
4059        {        {
4060        length += 33;        length += 33;
4061    
# Line 2695  while ((c = *(++ptr)) != 0) Line 4078  while ((c = *(++ptr)) != 0)
4078    
4079      case '(':      case '(':
4080      branch_newextra = 0;      branch_newextra = 0;
4081      bracket_length = 3;      bracket_length = 1 + LINK_SIZE;
4082    
4083      /* Handle special forms of bracket, which all start (? */      /* Handle special forms of bracket, which all start (? */
4084    
# Line 2729  while ((c = *(++ptr)) != 0) Line 4112  while ((c = *(++ptr)) != 0)
4112          ptr += 2;          ptr += 2;
4113          break;          break;
4114    
4115          /* A recursive call to the regex is an extension, to provide the          /* (?R) specifies a recursive call to the regex, which is an extension
4116          facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */          to provide the facility which can be obtained by (?p{perl-code}) in
4117            Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
4118    
4119            From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
4120            the appropriate numbered brackets. This includes both recursive and
4121            non-recursive calls. (?R) is now synonymous with (?0). */
4122    
4123          case 'R':          case 'R':
4124          if (ptr[3] != ')')          ptr++;
4125    
4126            case '0': case '1': case '2': case '3': case '4':
4127            case '5': case '6': case '7': case '8': case '9':
4128            ptr += 2;
4129            if (c != 'R')
4130              while ((compile_block.ctypes[*(++ptr)] & ctype_digit) != 0);
4131            if (*ptr != ')')
4132            {            {
4133            *errorptr = ERR29;            *errorptr = ERR29;
4134            goto PCRE_ERROR_RETURN;            goto PCRE_ERROR_RETURN;
4135            }            }
4136            length += 1 + LINK_SIZE;
4137    
4138            /* If this item is quantified, it will get wrapped inside brackets so
4139            as to use the code for quantified brackets. We jump down and use the
4140            code that handles this for real brackets. */
4141    
4142            if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
4143              {
4144              length += 2 + 2 * LINK_SIZE;       /* to make bracketed */
4145              duplength = 5 + 3 * LINK_SIZE;
4146              goto HANDLE_QUANTIFIED_BRACKETS;
4147              }
4148            continue;
4149    
4150            /* (?C) is an extension which provides "callout" - to provide a bit of
4151            the functionality of the Perl (?{...}) feature. An optional number may
4152            follow (default is zero). */
4153    
4154            case 'C':
4155            ptr += 2;
4156            while ((compile_block.ctypes[*(++ptr)] & ctype_digit) != 0);
4157            if (*ptr != ')')
4158              {
4159              *errorptr = ERR39;
4160              goto PCRE_ERROR_RETURN;
4161              }
4162            length += 2;
4163            continue;
4164    
4165            /* Named subpatterns are an extension copied from Python */
4166    
4167            case 'P':
4168          ptr += 3;          ptr += 3;
4169          length += 1;          if (*ptr == '<')
4170          break;            {
4171              const uschar *p = ++ptr;
4172              while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
4173              if (*ptr != '>')
4174                {
4175                *errorptr = ERR42;
4176                goto PCRE_ERROR_RETURN;
4177                }
4178              name_count++;
4179              if (ptr - p > max_name_size) max_name_size = (ptr - p);
4180              break;
4181              }
4182    
4183            if (*ptr == '=' || *ptr == '>')
4184              {
4185              while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
4186              if (*ptr != ')')
4187                {
4188                *errorptr = ERR42;
4189                goto PCRE_ERROR_RETURN;
4190                }
4191              break;
4192              }
4193    
4194            /* Unknown character after (?P */
4195    
4196            *errorptr = ERR41;
4197            goto PCRE_ERROR_RETURN;
4198    
4199          /* Lookbehinds are in Perl from version 5.005 */          /* Lookbehinds are in Perl from version 5.005 */
4200    
4201          case '<':          case '<':
4202          if (ptr[3] == '=' || ptr[3] == '!')          ptr += 3;
4203            if (*ptr == '=' || *ptr == '!')
4204            {            {
4205            ptr += 3;            branch_newextra = 1 + LINK_SIZE;
4206            branch_newextra = 3;            length += 1 + LINK_SIZE;         /* For the first branch */
           length += 3;         /* For the first branch */  
4207            break;            break;
4208            }            }
4209          *errorptr = ERR24;          *errorptr = ERR24;
# Line 2757  while ((c = *(++ptr)) != 0) Line 4211  while ((c = *(++ptr)) != 0)
4211    
4212          /* Conditionals are in Perl from version 5.005. The bracket must either          /* Conditionals are in Perl from version 5.005. The bracket must either
4213          be followed by a number (for bracket reference) or by an assertion          be followed by a number (for bracket reference) or by an assertion
4214          group. */          group, or (a PCRE extension) by 'R' for a recursion test. */
4215    
4216          case '(':          case '(':
4217          if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)          if (ptr[3] == 'R' && ptr[4] == ')')
4218              {
4219              ptr += 4;
4220              length += 3;
4221              }
4222            else if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
4223            {            {
4224            ptr += 4;            ptr += 4;
4225            length += 3;            length += 3;
# Line 2827  while ((c = *(++ptr)) != 0) Line 4286  while ((c = *(++ptr)) != 0)
4286              optset = &unset;              optset = &unset;
4287              continue;              continue;
4288    
4289              /* A termination by ')' indicates an options-setting-only item;              /* A termination by ')' indicates an options-setting-only item; if
4290              this is global at top level; otherwise nothing is done here and              this is at the very start of the pattern (indicated by item_count
4291              it is handled during the compiling process on a per-bracket-group              being zero), we use it to set the global options. This is helpful
4292              basis. */              when analyzing the pattern for first characters, etc. Otherwise
4293                nothing is done here and it is handled during the compiling
4294                process.
4295    
4296                [Historical note: Up to Perl 5.8, options settings at top level
4297                were always global settings, wherever they appeared in the pattern.
4298                That is, they were equivalent to an external setting. From 5.8
4299                onwards, they apply only to what follows (which is what you might
4300                expect).] */
4301    
4302              case ')':              case ')':
4303              if (brastackptr == 0)              if (item_count == 0)
4304                {                {
4305                options = (options | set) & (~unset);                options = (options | set) & (~unset);
4306                set = unset = 0;     /* To save length */                set = unset = 0;     /* To save length */
4307                  item_count--;        /* To allow for several */
4308                }                }
4309    
4310              /* Fall through */              /* Fall through */
4311    
4312              /* A termination by ':' indicates the start of a nested group with              /* A termination by ':' indicates the start of a nested group with
# Line 2879  while ((c = *(++ptr)) != 0) Line 4348  while ((c = *(++ptr)) != 0)
4348          END_OPTIONS:          END_OPTIONS:
4349          if (c == ')')          if (c == ')')
4350            {            {
4351            if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))            if (branch_newextra == 2 &&
4352                  (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
4353              branch_extra += branch_newextra;              branch_extra += branch_newextra;
4354            continue;            continue;
4355            }            }
# Line 2891  while ((c = *(++ptr)) != 0) Line 4361  while ((c = *(++ptr)) != 0)
4361    
4362      /* Extracting brackets must be counted so we can process escapes in a      /* Extracting brackets must be counted so we can process escapes in a
4363      Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to      Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
4364      need an additional 3 bytes of store per extracting bracket. */      need an additional 3 bytes of store per extracting bracket. However, if
4365        PCRE_NO_AUTO_CAPTURE is set, unadorned brackets become non-capturing, so we
4366        must leave the count alone (it will always be zero). */
4367    
4368      else      else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)
4369        {        {
4370        bracount++;        bracount++;
4371        if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;        if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
# Line 2924  while ((c = *(++ptr)) != 0) Line 4396  while ((c = *(++ptr)) != 0)
4396      the branch_extra value. */      the branch_extra value. */
4397    
4398      case ')':      case ')':
4399      length += 3;      length += 1 + LINK_SIZE;
4400        if (brastackptr > 0)
4401        {        {
4402        int minval = 1;        duplength = length - brastack[--brastackptr];
4403        int maxval = 1;        branch_extra = bralenstack[brastackptr];
4404        int duplength;        }
4405        else duplength = 0;
4406    
4407        if (brastackptr > 0)      /* The following code is also used when a recursion such as (?3) is
4408          {      followed by a quantifier, because in that case, it has to be wrapped inside
4409          duplength = length - brastack[--brastackptr];      brackets so that the quantifier works. The value of duplength must be
4410          branch_extra = bralenstack[brastackptr];      set before arrival. */
         }  
       else duplength = 0;  
4411    
4412        /* Leave ptr at the final char; for read_repeat_counts this happens      HANDLE_QUANTIFIED_BRACKETS:
       automatically; for the others we need an increment. */  
4413    
4414        if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))      /* Leave ptr at the final char; for read_repeat_counts this happens
4415          {      automatically; for the others we need an increment. */
         ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,  
           &compile_block);  
         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;  
         }  
       else if (c == '*') { minval = 0; maxval = -1; ptr++; }  
       else if (c == '+') { maxval = -1; ptr++; }  
       else if (c == '?') { minval = 0; ptr++; }  
   
       /* If the minimum is zero, we have to allow for an OP_BRAZERO before the  
       group, and if the maximum is greater than zero, we have to replicate  
       maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting  
       bracket set - hence the 7. */  
4416    
4417        if (minval == 0)      if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
4418          {        {
4419          length++;        ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
4420          if (maxval > 0) length += (maxval - 1) * (duplength + 7);        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4421          }        }
4422        else if (c == '*') { min = 0; max = -1; ptr++; }
4423        else if (c == '+') { min = 1; max = -1; ptr++; }
4424        else if (c == '?') { min = 0; max = 1;  ptr++; }
4425        else { min = 1; max = 1; }
4426    
4427        /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
4428        group, and if the maximum is greater than zero, we have to replicate
4429        maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
4430        bracket set. */
4431    
4432        if (min == 0)
4433          {
4434          length++;
4435          if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
4436          }
4437    
4438        /* When the minimum is greater than zero, 1 we have to replicate up to      /* When the minimum is greater than zero, we have to replicate up to
4439        minval-1 times, with no additions required in the copies. Then, if      minval-1 times, with no additions required in the copies. Then, if there
4440        there is a limited maximum we have to replicate up to maxval-1 times      is a limited maximum we have to replicate up to maxval-1 times allowing
4441        allowing for a BRAZERO item before each optional copy and nesting      for a BRAZERO item before each optional copy and nesting brackets for all
4442        brackets for all but one of the optional copies. */      but one of the optional copies. */
4443    
4444        else      else
4445          {        {
4446          length += (minval - 1) * duplength;        length += (min - 1) * duplength;
4447          if (maxval > minval)   /* Need this test as maxval=-1 means no limit */        if (max > min)   /* Need this test as max=-1 means no limit */
4448            length += (maxval - minval) * (duplength + 7) - 6;          length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
4449          }            - (2 + 2*LINK_SIZE);
4450          }
4451    
4452        /* Allow space for once brackets for "possessive quantifier" */
4453    
4454        if (ptr[1] == '+')
4455          {
4456          ptr++;
4457          length += 2 + 2*LINK_SIZE;
4458        }        }
4459      continue;      continue;
4460    
4461      /* Non-special character. For a run of such characters the length required      /* Non-special character. For a run of such characters the length required
4462      is the number of characters + 2, except that the maximum run length is 255.      is the number of characters + 2, except that the maximum run length is
4463      We won't get a skipped space or a non-data escape or the start of a #      MAXLIT. We won't get a skipped space or a non-data escape or the start of a
4464      comment as the first character, so the length can't be zero. */      # comment as the first character, so the length can't be zero. */
4465    
4466      NORMAL_CHAR:      NORMAL_CHAR:
4467      default:      default:
# Line 2987  while ((c = *(++ptr)) != 0) Line 4469  while ((c = *(++ptr)) != 0)
4469      runlength = 0;      runlength = 0;
4470      do      do
4471        {        {
4472    #ifdef SUPPORT_UTF8
4473          lastcharlength = 1;     /* Need length of last char for UTF-8 repeats */
4474    #endif
4475    
4476          /* If in a \Q...\E sequence, check for end; otherwise it's a literal */
4477          if (inescq)
4478            {
4479            if (c == '\\' && ptr[1] == 'E')
4480              {
4481              inescq = FALSE;
4482              ptr++;
4483              }
4484            else runlength++;
4485            continue;
4486            }
4487    
4488          /* Skip whitespace and comments for /x */
4489    
4490        if ((options & PCRE_EXTENDED) != 0)        if ((options & PCRE_EXTENDED) != 0)
4491          {          {
4492          if ((compile_block.ctypes[c] & ctype_space) != 0) continue;          if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
# Line 3010  while ((c = *(++ptr)) != 0) Line 4510  while ((c = *(++ptr)) != 0)
4510          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4511          if (c < 0) { ptr = saveptr; break; }          if (c < 0) { ptr = saveptr; break; }
4512    
4513            /* In UTF-8 mode, add on the number of additional bytes needed to
4514            encode this character, and save the total length in case this is a
4515            final char that is repeated. */
4516    
4517  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
4518          if (c > 127 && (options & PCRE_UTF8) != 0)          if (utf8 && c > 127)
4519            {            {
4520            int i;            int i;
4521            for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)            for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
4522              if (c <= utf8_table1[i]) break;              if (c <= utf8_table1[i]) break;
4523            runlength += i;            runlength += i;
4524              lastcharlength += i;
4525            }            }
4526  #endif  #endif
4527          }          }
# Line 3031  while ((c = *(++ptr)) != 0) Line 4536  while ((c = *(++ptr)) != 0)
4536      while (runlength < MAXLIT &&      while (runlength < MAXLIT &&
4537        (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);        (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
4538    
4539      ptr--;      /* If we hit a meta-character, back off to point to it */
4540    
4541        if (runlength < MAXLIT) ptr--;
4542    
4543        /* If the last char in the string is a UTF-8 multibyte character, we must
4544        set lastcharlength correctly. If it was specified as an escape, this will
4545        already have been done above. However, we also have to support in-line
4546        UTF-8 characters, so check backwards from where we are. */
4547    
4548    #ifdef SUPPORT_UTF8
4549        if (utf8)
4550          {
4551          const uschar *lastptr = ptr - 1;
4552          if ((*lastptr & 0x80) != 0)
4553            {
4554            while((*lastptr & 0xc0) == 0x80) lastptr--;
4555            lastcharlength = ptr - lastptr;
4556            }
4557          }
4558    #endif
4559    
4560      length += runlength;      length += runlength;
4561      continue;      continue;
4562      }      }
4563    }    }
4564    
4565  length += 4;    /* For final KET and END */  length += 2 + LINK_SIZE;    /* For final KET and END */
4566    
4567  if (length > 65539)  if (length > MAX_PATTERN_SIZE)
4568    {    {
4569    *errorptr = ERR20;    *errorptr = ERR20;
4570    return NULL;    return NULL;
4571    }    }
4572    
4573  /* Compute the size of data block needed and get it, either from malloc or  /* Compute the size of data block needed and get it, either from malloc or
4574  externally provided function. We specify "code[0]" in the offsetof() expression  externally provided function. */
 rather than just "code", because it has been reported that one broken compiler  
 fails on "code" because it is also an independent variable. It should make no  
 difference to the value of the offsetof(). */  
4575    
4576  size = length + offsetof(real_pcre, code[0]);  size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
4577  re = (real_pcre *)(pcre_malloc)(size);  re = (real_pcre *)(pcre_malloc)(size);
4578    
4579  if (re == NULL)  if (re == NULL)
# Line 3066  re->magic_number = MAGIC_NUMBER; Line 4588  re->magic_number = MAGIC_NUMBER;
4588  re->size = size;  re->size = size;
4589  re->options = options;  re->options = options;
4590  re->tables = tables;  re->tables = tables;
4591    re->name_entry_size = max_name_size + 3;
4592    re->name_count = name_count;
4593    
4594    /* The starting points of the name/number translation table and of the code are
4595    passed around in the compile data block. */
4596    
4597    compile_block.names_found = 0;
4598    compile_block.name_entry_size = max_name_size + 3;
4599    compile_block.name_table = (uschar *)re + sizeof(real_pcre);
4600    codestart = compile_block.name_table + re->name_entry_size * re->name_count;
4601    compile_block.start_code = codestart;
4602    
4603  /* Set up a starting, non-extracting bracket, then compile the expression. On  /* Set up a starting, non-extracting bracket, then compile the expression. On
4604  error, *errorptr will be set non-NULL, so we don't need to look at the result  error, *errorptr will be set non-NULL, so we don't need to look at the result
4605  of the function here. */  of the function here. */
4606    
4607  ptr = (const uschar *)pattern;  ptr = (const uschar *)pattern;
4608  code = re->code;  code = (uschar *)codestart;
4609  *code = OP_BRA;  *code = OP_BRA;
4610  bracount = 0;  bracount = 0;
4611  (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, 0,  (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
4612    &reqchar, &countlits, &compile_block);    errorptr, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
4613  re->top_bracket = bracount;  re->top_bracket = bracount;
4614  re->top_backref = top_backref;  re->top_backref = compile_block.top_backref;
4615    
4616  /* If not reached end of pattern on success, there's an excess bracket. */  /* If not reached end of pattern on success, there's an excess bracket. */
4617    
# Line 3090  if debugging, leave the test till after Line 4623  if debugging, leave the test till after
4623  *code++ = OP_END;  *code++ = OP_END;
4624    
4625  #ifndef DEBUG  #ifndef DEBUG
4626  if (code - re->code > length) *errorptr = ERR23;  if (code - codestart > length) *errorptr = ERR23;
4627  #endif  #endif
4628    
4629  /* Give an error if there's back reference to a non-existent capturing  /* Give an error if there's back reference to a non-existent capturing
4630  subpattern. */  subpattern. */
4631    
4632  if (top_backref > re->top_bracket) *errorptr = ERR15;  if (re->top_backref > re->top_bracket) *errorptr = ERR15;
4633    
4634  /* Failed to compile */  /* Failed to compile, or error while post-processing */
4635    
4636  if (*errorptr != NULL)  if (*errorptr != NULL)
4637    {    {
# Line 3108  if (*errorptr != NULL) Line 4641  if (*errorptr != NULL)
4641    return NULL;    return NULL;
4642    }    }
4643    
4644  /* If the anchored option was not passed, set flag if we can determine that the  /* If the anchored option was not passed, set the flag if we can determine that
4645  pattern is anchored by virtue of ^ characters or \A or anything else (such as  the pattern is anchored by virtue of ^ characters or \A or anything else (such