/[pcre]/code/trunk/pcre.c
ViewVC logotype

Diff of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 25 by nigel, Sat Feb 24 21:38:45 2007 UTC revision 53 by nigel, Sat Feb 24 21:39:42 2007 UTC
# Line 9  the file Tech.Notes for some information Line 9  the file Tech.Notes for some information
9    
10  Written by: Philip Hazel <ph10@cam.ac.uk>  Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12             Copyright (c) 1998 University of Cambridge             Copyright (c) 1997-2001 University of Cambridge
13    
14  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
15  Permission is granted to anyone to use this software for any purpose on any  Permission is granted to anyone to use this software for any purpose on any
# Line 25  restrictions: Line 25  restrictions:
25    
26  3. Altered versions must be plainly marked as such, and must not be  3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.     misrepresented as being the original software.
28    
29    4. If PCRE is embedded in any software that is released under the GNU
30       General Purpose Licence (GPL), then the terms of that licence shall
31       supersede any condition above with which it is incompatible.
32  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
33  */  */
34    
# Line 56  the external pcre header. */ Line 60  the external pcre header. */
60  #endif  #endif
61    
62    
63  /* Number of items on the nested bracket stacks at compile time. This should  /* Maximum number of items on the nested bracket stacks at compile time. This
64  not be set greater than 200. */  applies to the nesting of all kinds of parentheses. It does not limit
65    un-nested, non-capturing parentheses. This number can be made bigger if
66    necessary - it is used to dimension one int and one unsigned char vector at
67    compile time. */
68    
69  #define BRASTACK_SIZE 200  #define BRASTACK_SIZE 200
70    
71    
72    /* The number of bytes in a literal character string above which we can't add
73    any more is different when UTF-8 characters may be encountered. */
74    
75    #ifdef SUPPORT_UTF8
76    #define MAXLIT 250
77    #else
78    #define MAXLIT 255
79    #endif
80    
81    
82  /* Min and max values for the common repeats; for the maxima, 0 => infinity */  /* Min and max values for the common repeats; for the maxima, 0 => infinity */
83    
84  static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };  static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
# Line 78  static const char *OP_names[] = { Line 95  static const char *OP_names[] = {
95    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
96    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
97    "*", "*?", "+", "+?", "?", "??", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{",
98    "class", "Ref",    "class", "Ref", "Recurse",
99    "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",    "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
100    "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",    "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
101    "Brazero", "Braminzero", "Bra"    "Brazero", "Braminzero", "Branumber", "Bra"
102  };  };
103  #endif  #endif
104    
# Line 97  static const short int escapes[] = { Line 114  static const short int escapes[] = {
114      0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */      0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
115      0,      0,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */      0,      0,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
116      0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */      0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
117    '`',      7, -ESC_b,      0, -ESC_d,     27,   '\f',      0,   /* ` - g */    '`',      7, -ESC_b,      0, -ESC_d,  ESC_E,  ESC_F,      0,   /* ` - g */
118      0,      0,      0,      0,      0,      0,   '\n',      0,   /* h - o */      0,      0,      0,      0,      0,      0,  ESC_N,      0,   /* h - o */
119      0,      0,   '\r', -ESC_s,   '\t',      0,      0, -ESC_w,   /* p - w */      0,      0,  ESC_R, -ESC_s,  ESC_T,      0,      0, -ESC_w,   /* p - w */
120      0,      0, -ESC_z                                            /* x - z */      0,      0, -ESC_z                                            /* x - z */
121  };  };
122    
123    /* Tables of names of POSIX character classes and their lengths. The list is
124    terminated by a zero length entry. The first three must be alpha, upper, lower,
125    as this is assumed for handling case independence. */
126    
127    static const char *posix_names[] = {
128      "alpha", "lower", "upper",
129      "alnum", "ascii", "cntrl", "digit", "graph",
130      "print", "punct", "space", "word",  "xdigit" };
131    
132    static const uschar posix_name_lengths[] = {
133      5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
134    
135    /* Table of class bit maps for each POSIX class; up to three may be combined
136    to form the class. */
137    
138    static const int posix_class_maps[] = {
139      cbit_lower, cbit_upper, -1,             /* alpha */
140      cbit_lower, -1,         -1,             /* lower */
141      cbit_upper, -1,         -1,             /* upper */
142      cbit_digit, cbit_lower, cbit_upper,     /* alnum */
143      cbit_print, cbit_cntrl, -1,             /* ascii */
144      cbit_cntrl, -1,         -1,             /* cntrl */
145      cbit_digit, -1,         -1,             /* digit */
146      cbit_graph, -1,         -1,             /* graph */
147      cbit_print, -1,         -1,             /* print */
148      cbit_punct, -1,         -1,             /* punct */
149      cbit_space, -1,         -1,             /* space */
150      cbit_word,  -1,         -1,             /* word */
151      cbit_xdigit,-1,         -1              /* xdigit */
152    };
153    
154    
155  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
156    
157  static BOOL  static BOOL
158    compile_regex(int, int, int *, uschar **, const uschar **, const char **,    compile_regex(int, int, int *, uschar **, const uschar **, const char **,
159      BOOL, int, compile_data *);      BOOL, int, int *, int *, compile_data *);
160    
161    /* Structure for building a chain of data that actually lives on the
162    stack, for holding the values of the subject pointer at the start of each
163    subpattern, so as to detect when an empty string has been matched by a
164    subpattern - to break infinite loops. */
165    
166    typedef struct eptrblock {
167      struct eptrblock *prev;
168      const uschar *saved_eptr;
169    } eptrblock;
170    
171    /* Flag bits for the match() function */
172    
173    #define match_condassert   0x01    /* Called to check a condition assertion */
174    #define match_isgroup      0x02    /* Set if start of bracketed group */
175    
176    
177    
# Line 125  void  (*pcre_free)(void *) = free; Line 189  void  (*pcre_free)(void *) = free;
189    
190    
191    
192    /*************************************************
193    *    Macros and tables for character handling    *
194    *************************************************/
195    
196    /* When UTF-8 encoding is being used, a character is no longer just a single
197    byte. The macros for character handling generate simple sequences when used in
198    byte-mode, and more complicated ones for UTF-8 characters. */
199    
200    #ifndef SUPPORT_UTF8
201    #define GETCHARINC(c, eptr) c = *eptr++;
202    #define GETCHARLEN(c, eptr, len) c = *eptr;
203    #define BACKCHAR(eptr)
204    
205    #else   /* SUPPORT_UTF8 */
206    
/* The macros below expand to plain statement sequences (they are not wrapped
in do { } while (0)), and they expect three names to be in scope at the point
of use: "md", a pointer to a structure with a "utf8" flag, and the tables
utf8_table3 and utf8_table4. In a multi-byte UTF-8 character the first byte
carries the MOST significant data bits and each following 10xxxxxx byte
carries the next six bits down, so the first byte is shifted left by six bits
for every additional byte, and each continuation byte is placed six bits lower
than the one before it. */

/* Get the next UTF-8 character, advancing the pointer */

#define GETCHARINC(c, eptr) \
  c = *eptr++; \
  if (md->utf8 && (c & 0xc0) == 0xc0) \
    { \
    int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int s = 6*a;                    /* Amount to shift first byte */ \
    c = (c & utf8_table3[a]) << s;  /* High order bits from first byte */ \
    while (a-- > 0) \
      { \
      s -= 6; \
      c |= (*eptr++ & 0x3f) << s; \
      } \
    }

/* Get the next UTF-8 character, not advancing the pointer, setting length
(in bytes) of the character */

#define GETCHARLEN(c, eptr, len) \
  c = *eptr; \
  len = 1; \
  if (md->utf8 && (c & 0xc0) == 0xc0) \
    { \
    int i; \
    int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int s = 6*a;                    /* Amount to shift first byte */ \
    c = (c & utf8_table3[a]) << s;  /* High order bits from first byte */ \
    for (i = 1; i <= a; i++) \
      { \
      s -= 6; \
      c |= (eptr[i] & 0x3f) << s; \
      } \
    len += a; \
    }

/* If the pointer is not at the start of a character, move it back until
it is. Bytes of the form 10xxxxxx are continuation bytes. */

#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
246    
247    #endif
248    
249    
250    
251  /*************************************************  /*************************************************
252  *             Default character tables           *  *             Default character tables           *
# Line 140  tables. */ Line 262  tables. */
262    
263    
264    
265    #ifdef SUPPORT_UTF8
266    /*************************************************
267    *           Tables for UTF-8 support             *
268    *************************************************/
269    
270    /* These are the breakpoints for different numbers of bytes in a UTF-8
271    character. */
272    
273    static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
274    
275    /* These are the indicator bits and the mask for the data bits to set in the
276    first byte of a character, indexed by the number of additional bytes. */
277    
278    static int utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
279    static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
280    
281    /* Table of the number of extra characters, indexed by the first character
282    masked with 0x3f. The highest number for a valid UTF-8 character is in fact
283    0x3d. */
284    
285    static uschar utf8_table4[] = {
286      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
287      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
288      2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
289      3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
290    
291    
292    /*************************************************
293    *       Convert character value to UTF-8         *
294    *************************************************/
295    
296    /* This function takes an integer value in the range 0 - 0x7fffffff
297    and encodes it as a UTF-8 character in 0 to 6 bytes.
298    
299    Arguments:
300      cvalue     the character value
301      buffer     pointer to buffer for result - at least 6 bytes long
302    
303    Returns:     number of characters placed in the buffer
304    */
305    
306    static int
307    ord2utf8(int cvalue, uschar *buffer)
308    {
309    register int i, j;
310    for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
311      if (cvalue <= utf8_table1[i]) break;
312    *buffer++ = utf8_table2[i] | (cvalue & utf8_table3[i]);
313    cvalue >>= 6 - i;
314    for (j = 0; j < i; j++)
315      {
316      *buffer++ = 0x80 | (cvalue & 0x3f);
317      cvalue >>= 6;
318      }
319    return i + 1;
320    }
321    #endif
322    
323    
324    
325  /*************************************************  /*************************************************
326  *          Return version string                 *  *          Return version string                 *
327  *************************************************/  *************************************************/
328    
329    #define STRING(a)  # a
330    #define XSTRING(s) STRING(s)
331    
332  const char *  const char *
333  pcre_version(void)  pcre_version(void)
334  {  {
335  return PCRE_VERSION;  return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
336  }  }
337    
338    
339    
340    
341  /*************************************************  /*************************************************
342  *       Return info about a compiled pattern     *  * (Obsolete) Return info about compiled pattern  *
343  *************************************************/  *************************************************/
344    
345  /* This function picks potentially useful data out of the private  /* This is the original "info" function. It picks potentially useful data out
346  structure.  of the private structure, but its interface was too rigid. It remains for
347    backwards compatibility. The public options are passed back in an int - though
348    the re->options field has been expanded to a long int, all the public options
349    are at the low end of it, and so even on 16-bit systems this will still be OK.
350    Therefore, I haven't changed the API for pcre_info().
351    
352  Arguments:  Arguments:
353    external_re   points to compiled code    external_re   points to compiled code
# Line 167  Arguments: Line 356  Arguments:
356                  or -1 if multiline and all branches start ^,                  or -1 if multiline and all branches start ^,
357                  or -2 otherwise                  or -2 otherwise
358    
359  Returns:        number of identifying extraction brackets  Returns:        number of capturing subpatterns
360                  or negative values on error                  or negative values on error
361  */  */
362    
# Line 177  pcre_info(const pcre *external_re, int * Line 366  pcre_info(const pcre *external_re, int *
366  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
367  if (re == NULL) return PCRE_ERROR_NULL;  if (re == NULL) return PCRE_ERROR_NULL;
368  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
369  if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);  if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
370  if (first_char != NULL)  if (first_char != NULL)
371    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
372       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
# Line 186  return re->top_bracket; Line 375  return re->top_bracket;
375    
376    
377    
378    /*************************************************
379    *        Return info about compiled pattern      *
380    *************************************************/
381    
382    /* This is a newer "info" function which has an extensible interface so
383    that additional items can be added compatibly.
384    
385    Arguments:
386      external_re      points to compiled code
387      external_study   points to study data, or NULL
388      what             what information is required
389      where            where to put the information
390    
391    Returns:           0 if data returned, negative on error
392    */
393    
394    int
395    pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
396      void *where)
397    {
398    const real_pcre *re = (const real_pcre *)external_re;
399    const real_pcre_extra *study = (const real_pcre_extra *)study_data;
400    
401    if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
402    if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
403    
404    switch (what)
405      {
406      case PCRE_INFO_OPTIONS:
407      *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
408      break;
409    
410      case PCRE_INFO_SIZE:
411      *((size_t *)where) = re->size;
412      break;
413    
414      case PCRE_INFO_CAPTURECOUNT:
415      *((int *)where) = re->top_bracket;
416      break;
417    
418      case PCRE_INFO_BACKREFMAX:
419      *((int *)where) = re->top_backref;
420      break;
421    
422      case PCRE_INFO_FIRSTCHAR:
423      *((int *)where) =
424        ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
425        ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
426      break;
427    
428      case PCRE_INFO_FIRSTTABLE:
429      *((const uschar **)where) =
430        (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
431          study->start_bits : NULL;
432      break;
433    
434      case PCRE_INFO_LASTLITERAL:
435      *((int *)where) =
436        ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
437      break;
438    
439      default: return PCRE_ERROR_BADOPTION;
440      }
441    
442    return 0;
443    }
444    
445    
446    
447  #ifdef DEBUG  #ifdef DEBUG
448  /*************************************************  /*************************************************
# Line 223  while (length-- > 0) Line 480  while (length-- > 0)
480    
481  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
482  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
483  encodes one of the more complicated things such as \d. On entry, ptr is  encodes one of the more complicated things such as \d. When UTF-8 is enabled,
484  pointing at the \. On exit, it is on the final character of the escape  a positive value greater than 255 may be returned. On entry, ptr is pointing at
485  sequence.  the \. On exit, it is on the final character of the escape sequence.
486    
487  Arguments:  Arguments:
488    ptrptr     points to the pattern position pointer    ptrptr     points to the pattern position pointer
# Line 245  check_escape(const uschar **ptrptr, cons Line 502  check_escape(const uschar **ptrptr, cons
502    int options, BOOL isclass, compile_data *cd)    int options, BOOL isclass, compile_data *cd)
503  {  {
504  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
505  int c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */  int c, i;
506  int i;  
507    /* If backslash is at the end of the pattern, it's an error. */
508    
509    c = *(++ptr);
510  if (c == 0) *errorptr = ERR1;  if (c == 0) *errorptr = ERR1;
511    
512  /* Digits or letters may have special meaning; all others are literals. */  /* Digits or letters may have special meaning; all others are literals. */
# Line 307  else Line 566  else
566        }        }
567    
568      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
569      larger first octal digit */      larger first octal digit. */
570    
571      case '0':      case '0':
572      c -= '0';      c -= '0';
573      while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&      while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
574        ptr[1] != '8' && ptr[1] != '9')        ptr[1] != '8' && ptr[1] != '9')
575          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
576        c &= 255;     /* Take least significant 8 bits */
577      break;      break;
578    
579      /* Special escapes not starting with a digit are straightforward */      /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
580        which can be greater than 0xff, but only if the ddd are hex digits. */
581    
582      case 'x':      case 'x':
583    #ifdef SUPPORT_UTF8
584        if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
585          {
586          const uschar *pt = ptr + 2;
587          register int count = 0;
588          c = 0;
589          while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
590            {
591            count++;
592            c = c * 16 + cd->lcc[*pt] -
593              (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
594            pt++;
595            }
596          if (*pt == '}')
597            {
598            if (c < 0 || count > 8) *errorptr = ERR34;
599            ptr = pt;
600            break;
601            }
602          /* If the sequence of hex digits does not end with '}', then we don't
603          recognize this construct; fall through to the normal \x handling. */
604          }
605    #endif
606    
607        /* Read just a single hex char */
608    
609      c = 0;      c = 0;
610      while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
611        {        {
# Line 328  else Line 615  else
615        }        }
616      break;      break;
617    
618        /* Other special escapes not starting with a digit are straightforward */
619    
620      case 'c':      case 'c':
621      c = *(++ptr);      c = *(++ptr);
622      if (c == 0)      if (c == 0)
# Line 465  if the length is fixed. This is needed f Line 754  if the length is fixed. This is needed f
754    
755  Arguments:  Arguments:
756    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
757      options  the compiling options
758    
759  Returns:   the fixed length, or -1 if there is no fixed length  Returns:   the fixed length, or -1 if there is no fixed length
760  */  */
761    
762  static int  static int
763  find_fixedlength(uschar *code)  find_fixedlength(uschar *code, int options)
764  {  {
765  int length = -1;  int length = -1;
766    
# Line 491  for (;;) Line 781  for (;;)
781      case OP_BRA:      case OP_BRA:
782      case OP_ONCE:      case OP_ONCE:
783      case OP_COND:      case OP_COND:
784      d = find_fixedlength(cc);      d = find_fixedlength(cc, options);
785      if (d < 0) return -1;      if (d < 0) return -1;
786      branchlength += d;      branchlength += d;
787      do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);      do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
# Line 527  for (;;) Line 817  for (;;)
817      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
818    
819      case OP_REVERSE:      case OP_REVERSE:
820        case OP_BRANUMBER:
821        case OP_CREF:
822      cc++;      cc++;
823        /* Fall through */
824    
     case OP_CREF:  
825      case OP_OPT:      case OP_OPT:
826      cc++;      cc++;
827      /* Fall through */      /* Fall through */
# Line 544  for (;;) Line 836  for (;;)
836      cc++;      cc++;
837      break;      break;
838    
839      /* Handle char strings */      /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
840        This requires a scan of the string, unfortunately. We assume valid UTF-8
841      strings, so all we do is reduce the length by one for each byte whose bits are
842        10xxxxxx. */
843    
844      case OP_CHARS:      case OP_CHARS:
845      branchlength += *(++cc);      branchlength += *(++cc);
846    #ifdef SUPPORT_UTF8
847        for (d = 1; d <= *cc; d++)
848          if ((cc[d] & 0xc0) == 0x80) branchlength--;
849    #endif
850      cc += *cc + 1;      cc += *cc + 1;
851      break;      break;
852    
# Line 576  for (;;) Line 875  for (;;)
875      /* Check a class for variable quantification */      /* Check a class for variable quantification */
876    
877      case OP_CLASS:      case OP_CLASS:
878      cc += (*cc == OP_REF)? 2 : 33;      cc += 33;
879    
880      switch (*cc)      switch (*cc)
881        {        {
# Line 611  for (;;) Line 910  for (;;)
910    
911    
912  /*************************************************  /*************************************************
913    *           Check for POSIX class syntax         *
914    *************************************************/
915    
916    /* This function is called when the sequence "[:" or "[." or "[=" is
917    encountered in a character class. It checks whether this is followed by an
918    optional ^ and then a sequence of letters, terminated by a matching ":]" or
919    ".]" or "=]".
920    
921    Argument:
922      ptr      pointer to the initial [
923      endptr   where to return the end pointer
924      cd       pointer to compile data
925    
926    Returns:   TRUE or FALSE
927    */
928    
929    static BOOL
930    check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
931    {
932    int terminator;          /* Don't combine these lines; the Solaris cc */
933    terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
934    if (*(++ptr) == '^') ptr++;
935    while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
936    if (*ptr == terminator && ptr[1] == ']')
937      {
938      *endptr = ptr;
939      return TRUE;
940      }
941    return FALSE;
942    }
943    
944    
945    
946    
947    /*************************************************
948    *          Check POSIX class name                *
949    *************************************************/
950    
951    /* This function is called to check the name given in a POSIX-style class entry
952    such as [:alnum:].
953    
954    Arguments:
955      ptr        points to the first letter
956      len        the length of the name
957    
958    Returns:     a value representing the name, or -1 if unknown
959    */
960    
961    static int
962    check_posix_name(const uschar *ptr, int len)
963    {
964    register int yield = 0;
965    while (posix_name_lengths[yield] != 0)
966      {
967      if (len == posix_name_lengths[yield] &&
968        strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
969      yield++;
970      }
971    return -1;
972    }
973    
974    
975    
976    
977    /*************************************************
978  *           Compile one branch                   *  *           Compile one branch                   *
979  *************************************************/  *************************************************/
980    
# Line 618  for (;;) Line 982  for (;;)
982    
983  Arguments:  Arguments:
984    options      the option bits    options      the option bits
985    brackets     points to number of brackets used    brackets     points to number of extracting brackets used
986    code         points to the pointer to the current code point    code         points to the pointer to the current code point
987    ptrptr       points to the current pattern pointer    ptrptr       points to the current pattern pointer
988    errorptr     points to pointer to error message    errorptr     points to pointer to error message
989    optchanged   set to the value of the last OP_OPT item compiled    optchanged   set to the value of the last OP_OPT item compiled
990      reqchar      set to the last literal character required, else -1
991      countlits    set to count of mandatory literal characters
992    cd           contains pointers to tables    cd           contains pointers to tables
993    
994  Returns:       TRUE on success  Returns:       TRUE on success
# Line 632  Returns:       TRUE on success Line 998  Returns:       TRUE on success
998  static BOOL  static BOOL
999  compile_branch(int options, int *brackets, uschar **codeptr,  compile_branch(int options, int *brackets, uschar **codeptr,
1000    const uschar **ptrptr, const char **errorptr, int *optchanged,    const uschar **ptrptr, const char **errorptr, int *optchanged,
1001    compile_data *cd)    int *reqchar, int *countlits, compile_data *cd)
1002  {  {
1003  int repeat_type, op_type;  int repeat_type, op_type;
1004  int repeat_min, repeat_max;  int repeat_min, repeat_max;
1005  int bravalue, length;  int bravalue, length;
1006  int greedy_default, greedy_non_default;  int greedy_default, greedy_non_default;
1007    int prevreqchar;
1008    int condcount = 0;
1009    int subcountlits = 0;
1010  register int c;  register int c;
1011  register uschar *code = *codeptr;  register uschar *code = *codeptr;
1012  uschar *tempcode;  uschar *tempcode;
# Line 651  uschar class[32]; Line 1020  uschar class[32];
1020  greedy_default = ((options & PCRE_UNGREEDY) != 0);  greedy_default = ((options & PCRE_UNGREEDY) != 0);
1021  greedy_non_default = greedy_default ^ 1;  greedy_non_default = greedy_default ^ 1;
1022    
1023    /* Initialize no required char, and count of literals */
1024    
1025    *reqchar = prevreqchar = -1;
1026    *countlits = 0;
1027    
1028  /* Switch on next character until the end of the branch */  /* Switch on next character until the end of the branch */
1029    
1030  for (;; ptr++)  for (;; ptr++)
# Line 659  for (;; ptr++) Line 1033  for (;; ptr++)
1033    int class_charcount;    int class_charcount;
1034    int class_lastchar;    int class_lastchar;
1035    int newoptions;    int newoptions;
1036    int condref;    int skipbytes;
1037      int subreqchar;
1038    
1039    c = *ptr;    c = *ptr;
1040    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
# Line 667  for (;; ptr++) Line 1042  for (;; ptr++)
1042      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
1043      if (c == '#')      if (c == '#')
1044        {        {
1045        while ((c = *(++ptr)) != 0 && c != '\n');        /* The space before the ; is to avoid a warning on a silly compiler
1046          on the Macintosh. */
1047          while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1048        continue;        continue;
1049        }        }
1050      }      }
# Line 742  for (;; ptr++) Line 1119  for (;; ptr++)
1119          goto FAILED;          goto FAILED;
1120          }          }
1121    
1122          /* Handle POSIX class names. Perl allows a negation extension of the
1123          form [:^name]. A square bracket that doesn't match the syntax is
1124          treated as a literal. We also recognize the POSIX constructions
1125          [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1126          5.6 does. */
1127    
1128          if (c == '[' &&
1129              (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1130              check_posix_syntax(ptr, &tempptr, cd))
1131            {
1132            BOOL local_negate = FALSE;
1133            int posix_class, i;
1134            register const uschar *cbits = cd->cbits;
1135    
1136            if (ptr[1] != ':')
1137              {
1138              *errorptr = ERR31;
1139              goto FAILED;
1140              }
1141    
1142            ptr += 2;
1143            if (*ptr == '^')
1144              {
1145              local_negate = TRUE;
1146              ptr++;
1147              }
1148    
1149            posix_class = check_posix_name(ptr, tempptr - ptr);
1150            if (posix_class < 0)
1151              {
1152              *errorptr = ERR30;
1153              goto FAILED;
1154              }
1155    
1156            /* If matching is caseless, upper and lower are converted to
1157            alpha. This relies on the fact that the class table starts with
1158            alpha, lower, upper as the first 3 entries. */
1159    
1160            if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1161              posix_class = 0;
1162    
1163            /* Or into the map we are building up to 3 of the static class
1164            tables, or their negations. */
1165    
1166            posix_class *= 3;
1167            for (i = 0; i < 3; i++)
1168              {
1169              int taboffset = posix_class_maps[posix_class + i];
1170              if (taboffset < 0) break;
1171              if (local_negate)
1172                for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1173              else
1174                for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1175              }
1176    
1177            ptr = tempptr + 1;
1178            class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
1179            continue;
1180            }
1181    
1182        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
1183        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. Escaped items are checked for
1184        validity in the pre-compiling pass. The sequence \b is a special case.        validity in the pre-compiling pass. The sequence \b is a special case.
# Line 769  for (;; ptr++) Line 1206  for (;; ptr++)
1206              continue;              continue;
1207    
1208              case ESC_w:              case ESC_w:
1209              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
               class[c] |= (cbits[c+cbit_digit] | cbits[c+cbit_word]);  
1210              continue;              continue;
1211    
1212              case ESC_W:              case ESC_W:
1213              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
               class[c] |= ~(cbits[c+cbit_digit] | cbits[c+cbit_word]);  
1214              continue;              continue;
1215    
1216              case ESC_s:              case ESC_s:
# Line 791  for (;; ptr++) Line 1226  for (;; ptr++)
1226              goto FAILED;              goto FAILED;
1227              }              }
1228            }            }
1229          /* Fall through if single character */  
1230            /* Fall through if single character, but don't at present allow
1231            chars > 255 in UTF-8 mode. */
1232    
1233    #ifdef SUPPORT_UTF8
1234            if (c > 255)
1235              {
1236              *errorptr = ERR33;
1237              goto FAILED;
1238              }
1239    #endif
1240          }          }
1241    
1242        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
# Line 811  for (;; ptr++) Line 1256  for (;; ptr++)
1256            }            }
1257    
1258          /* The second part of a range can be a single-character escape, but          /* The second part of a range can be a single-character escape, but
1259          not any of the other escapes. */          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1260            in such circumstances. */
1261    
1262          if (d == '\\')          if (d == '\\')
1263            {            {
1264              const uschar *oldptr = ptr;
1265            d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);            d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1266    
1267    #ifdef SUPPORT_UTF8
1268              if (d > 255)
1269                {
1270                *errorptr = ERR33;
1271                goto FAILED;
1272                }
1273    #endif
1274              /* \b is backspace; any other special means the '-' was literal */
1275    
1276            if (d < 0)            if (d < 0)
1277              {              {
1278              if (d == -ESC_b) d = '\b'; else              if (d == -ESC_b) d = '\b'; else
1279                {                {
1280                *errorptr = ERR7;                ptr = oldptr - 2;
1281                goto FAILED;                goto SINGLE_CHARACTER;  /* A few lines below */
1282                }                }
1283              }              }
1284            }            }
# Line 849  for (;; ptr++) Line 1306  for (;; ptr++)
1306        /* Handle a lone single character - we can get here for a normal        /* Handle a lone single character - we can get here for a normal
1307        non-escape char, or after \ that introduces a single character. */        non-escape char, or after \ that introduces a single character. */
1308    
1309          SINGLE_CHARACTER:
1310    
1311        class [c/8] |= (1 << (c&7));        class [c/8] |= (1 << (c&7));
1312        if ((options & PCRE_CASELESS) != 0)        if ((options & PCRE_CASELESS) != 0)
1313          {          {
# Line 933  for (;; ptr++) Line 1392  for (;; ptr++)
1392        { repeat_type = greedy_non_default; ptr++; }        { repeat_type = greedy_non_default; ptr++; }
1393      else repeat_type = greedy_default;      else repeat_type = greedy_default;
1394    
     /* If the maximum is zero then the minimum must also be zero; Perl allows  
     this case, so we do too - by simply omitting the item altogether. */  
   
     if (repeat_max == 0) code = previous;  
   
1395      /* If previous was a string of characters, chop off the last one and use it      /* If previous was a string of characters, chop off the last one and use it
1396      as the subject of the repeat. If there was only one character, we can      as the subject of the repeat. If there was only one character, we can
1397      abolish the previous item altogether. */      abolish the previous item altogether. A repeat with a zero minimum wipes
1398        out any reqchar setting, backing up to the previous value. We must also
1399        adjust the countlits value. */
1400    
1401      else if (*previous == OP_CHARS)      if (*previous == OP_CHARS)
1402        {        {
1403        int len = previous[1];        int len = previous[1];
1404    
1405          if (repeat_min == 0) *reqchar = prevreqchar;
1406          *countlits += repeat_min - 1;
1407    
1408        if (len == 1)        if (len == 1)
1409          {          {
1410          c = previous[2];          c = previous[2];
# Line 983  for (;; ptr++) Line 1443  for (;; ptr++)
1443        code = previous;        code = previous;
1444    
1445        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
1446        repeat_type += op_type;      /* Combine both values for many cases */  
1447          /* If the maximum is zero then the minimum must also be zero; Perl allows
1448          this case, so we do too - by simply omitting the item altogether. */
1449    
1450          if (repeat_max == 0) goto END_REPEAT;
1451    
1452          /* Combine the op_type with the repeat_type */
1453    
1454          repeat_type += op_type;
1455    
1456        /* A minimum of zero is handled either as the special case * or ?, or as        /* A minimum of zero is handled either as the special case * or ?, or as
1457        an UPTO, with the maximum given. */        an UPTO, with the maximum given. */
# Line 1060  for (;; ptr++) Line 1528  for (;; ptr++)
1528        }        }
1529    
1530      /* If previous was a character class or a back reference, we put the repeat      /* If previous was a character class or a back reference, we put the repeat
1531      stuff after it. */      stuff after it, but just skip the item if the repeat was {0,0}. */
1532    
1533      else if (*previous == OP_CLASS || *previous == OP_REF)      else if (*previous == OP_CLASS || *previous == OP_REF)
1534        {        {
1535          if (repeat_max == 0)
1536            {
1537            code = previous;
1538            goto END_REPEAT;
1539            }
1540        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
1541          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
1542        else if (repeat_min == 1 && repeat_max == -1)        else if (repeat_min == 1 && repeat_max == -1)
# Line 1087  for (;; ptr++) Line 1560  for (;; ptr++)
1560      else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||      else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1561               (int)*previous == OP_COND)               (int)*previous == OP_COND)
1562        {        {
1563        int i, ketoffset = 0;        register int i;
1564          int ketoffset = 0;
1565        int len = code - previous;        int len = code - previous;
1566          uschar *bralink = NULL;
1567    
1568        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
1569        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
# Line 1103  for (;; ptr++) Line 1578  for (;; ptr++)
1578          ketoffset = code - ket;          ketoffset = code - ket;
1579          }          }
1580    
1581        /* If the minimum is greater than zero, and the maximum is unlimited or        /* The case of a zero minimum is special because of the need to stick
1582        equal to the minimum, the first copy remains where it is, and is        OP_BRAZERO in front of it, and because the group appears once in the
1583        replicated up to the minimum number of times. This case includes the +        data, whereas in other cases it appears the minimum number of times. For
1584        repeat, but of course no replication is needed in that case. */        this reason, it is simplest to treat this case separately, as otherwise
1585          the code gets far too messy. There are several special subcases when the
1586          minimum is zero. */
1587    
1588        if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))        if (repeat_min == 0)
1589          {          {
1590          for (i = 1; i < repeat_min; i++)          /* If we set up a required char from the bracket, we must back off
1591            to the previous value and reset the countlits value too. */
1592    
1593            if (subcountlits > 0)
1594            {            {
1595            memcpy(code, previous, len);            *reqchar = prevreqchar;
1596            code += len;            *countlits -= subcountlits;
1597            }            }
         }  
1598    
1599        /* If the minimum is zero, stick BRAZERO in front of the first copy.          /* If the maximum is also zero, we just omit the group from the output
1600        Then, if there is a fixed upper limit, replicated up to that many times,          altogether. */
       sticking BRAZERO in front of all the optional ones. */  
1601    
1602        else          if (repeat_max == 0)
1603          {            {
1604          if (repeat_min == 0)            code = previous;
1605              goto END_REPEAT;
1606              }
1607    
1608            /* If the maximum is 1 or unlimited, we just have to stick in the
1609            BRAZERO and do no more at this point. */
1610    
1611            if (repeat_max <= 1)
1612            {            {
1613            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
1614            code++;            code++;
1615            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
1616            }            }
1617    
1618            /* If the maximum is greater than 1 and limited, we have to replicate
1619            in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1620            The first one has to be handled carefully because it's the original
1621            copy, which has to be moved up. The remainder can be handled by code
1622            that is common with the non-zero minimum case below. We just have to
1623            adjust the value of repeat_max, since one less copy is required. */
1624    
1625            else
1626              {
1627              int offset;
1628              memmove(previous+4, previous, len);
1629              code += 4;
1630              *previous++ = OP_BRAZERO + repeat_type;
1631              *previous++ = OP_BRA;
1632    
1633              /* We chain together the bracket offset fields that have to be
1634              filled in later when the ends of the brackets are reached. */
1635    
1636              offset = (bralink == NULL)? 0 : previous - bralink;
1637              bralink = previous;
1638              *previous++ = offset >> 8;
1639              *previous++ = offset & 255;
1640              }
1641    
1642            repeat_max--;
1643            }
1644    
1645          /* If the minimum is greater than zero, replicate the group as many
1646          times as necessary, and adjust the maximum to the number of subsequent
1647          copies that we need. */
1648    
1649          else
1650            {
1651          for (i = 1; i < repeat_min; i++)          for (i = 1; i < repeat_min; i++)
1652            {            {
1653            memcpy(code, previous, len);            memcpy(code, previous, len);
1654            code += len;            code += len;
1655            }            }
1656            if (repeat_max > 0) repeat_max -= repeat_min;
1657            }
1658    
1659          for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)        /* This code is common to both the zero and non-zero minimum cases. If
1660          the maximum is limited, it replicates the group in a nested fashion,
1661          remembering the bracket starts on a stack. In the case of a zero minimum,
1662          the first one was set up above. In all cases the repeat_max now specifies
1663          the number of additional copies needed. */
1664    
1665          if (repeat_max >= 0)
1666            {
1667            for (i = repeat_max - 1; i >= 0; i--)
1668            {            {
1669            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
1670    
1671              /* All but the final copy start a new nesting, maintaining the
1672              chain of brackets outstanding. */
1673    
1674              if (i != 0)
1675                {
1676                int offset;
1677                *code++ = OP_BRA;
1678                offset = (bralink == NULL)? 0 : code - bralink;
1679                bralink = code;
1680                *code++ = offset >> 8;
1681                *code++ = offset & 255;
1682                }
1683    
1684            memcpy(code, previous, len);            memcpy(code, previous, len);
1685            code += len;            code += len;
1686            }            }
1687    
1688            /* Now chain through the pending brackets, and fill in their length
1689            fields (which are holding the chain links pro tem). */
1690    
1691            while (bralink != NULL)
1692              {
1693              int oldlinkoffset;
1694              int offset = code - bralink + 1;
1695              uschar *bra = code - offset;
1696              oldlinkoffset = (bra[1] << 8) + bra[2];
1697              bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1698              *code++ = OP_KET;
1699              *code++ = bra[1] = offset >> 8;
1700              *code++ = bra[2] = (offset & 255);
1701              }
1702          }          }
1703    
1704        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
# Line 1149  for (;; ptr++) Line 1706  for (;; ptr++)
1706        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
1707        correct offset was computed above. */        correct offset was computed above. */
1708    
1709        if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type;        else code[-ketoffset] = OP_KETRMAX + repeat_type;
1710        }        }
1711    
1712      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 1162  for (;; ptr++) Line 1719  for (;; ptr++)
1719    
1720      /* In all cases we no longer have a previous item. */      /* In all cases we no longer have a previous item. */
1721    
1722        END_REPEAT:
1723      previous = NULL;      previous = NULL;
1724      break;      break;
1725    
# Line 1175  for (;; ptr++) Line 1733  for (;; ptr++)
1733    
1734      case '(':      case '(':
1735      newoptions = options;      newoptions = options;
1736      condref = -1;      skipbytes = 0;
1737    
1738      if (*(++ptr) == '?')      if (*(++ptr) == '?')
1739        {        {
# Line 1198  for (;; ptr++) Line 1756  for (;; ptr++)
1756          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
1757          if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)          if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1758            {            {
1759            condref = *ptr - '0';            int condref = *ptr - '0';
1760            while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';            while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1761              if (condref == 0)
1762                {
1763                *errorptr = ERR35;
1764                goto FAILED;
1765                }
1766            ptr++;            ptr++;
1767              code[3] = OP_CREF;
1768              code[4] = condref >> 8;
1769              code[5] = condref & 255;
1770              skipbytes = 3;
1771            }            }
1772          else ptr--;          else ptr--;
1773          break;          break;
# Line 1239  for (;; ptr++) Line 1806  for (;; ptr++)
1806          ptr++;          ptr++;
1807          break;          break;
1808    
1809            case 'R':                 /* Pattern recursion */
1810            *code++ = OP_RECURSE;
1811            ptr++;
1812            continue;
1813    
1814          default:                  /* Option setting */          default:                  /* Option setting */
1815          set = unset = 0;          set = unset = 0;
1816          optset = &set;          optset = &set;
# Line 1298  for (;; ptr++) Line 1870  for (;; ptr++)
1870          }          }
1871        }        }
1872    
1873      /* Else we have a referencing group; adjust the opcode. */      /* Else we have a referencing group; adjust the opcode. If the bracket
1874        number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
1875        arrange for the true number to follow later, in an OP_BRANUMBER item. */
1876    
1877      else      else
1878        {        {
1879        if (++(*brackets) > EXTRACT_MAX)        if (++(*brackets) > EXTRACT_BASIC_MAX)
1880          {          {
1881          *errorptr = ERR13;          bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
1882          goto FAILED;          code[3] = OP_BRANUMBER;
1883            code[4] = *brackets >> 8;
1884            code[5] = *brackets & 255;
1885            skipbytes = 3;
1886          }          }
1887        bravalue = OP_BRA + *brackets;        else bravalue = OP_BRA + *brackets;
1888        }        }
1889    
1890      /* Process nested bracketed re. Assertions may not be repeated, but other      /* Process nested bracketed re. Assertions may not be repeated, but other
# Line 1323  for (;; ptr++) Line 1900  for (;; ptr++)
1900           options | PCRE_INGROUP,       /* Set for all nested groups */           options | PCRE_INGROUP,       /* Set for all nested groups */
1901           ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?           ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1902             newoptions & PCRE_IMS : -1, /* Pass ims options if changed */             newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1903           brackets,                     /* Bracket level */           brackets,                     /* Extracting bracket count */
1904           &tempcode,                    /* Where to put code (updated) */           &tempcode,                    /* Where to put code (updated) */
1905           &ptr,                         /* Input pointer (updated) */           &ptr,                         /* Input pointer (updated) */
1906           errorptr,                     /* Where to put an error message */           errorptr,                     /* Where to put an error message */
1907           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
1908            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1909           condref,                      /* Condition reference number */           skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */
1910             &subreqchar,                  /* For possible last char */
1911             &subcountlits,                /* For literal count */
1912           cd))                          /* Tables block */           cd))                          /* Tables block */
1913        goto FAILED;        goto FAILED;
1914    
# Line 1341  for (;; ptr++) Line 1920  for (;; ptr++)
1920      /* If this is a conditional bracket, check that there are no more than      /* If this is a conditional bracket, check that there are no more than
1921      two branches in the group. */      two branches in the group. */
1922    
1923      if (bravalue == OP_COND)      else if (bravalue == OP_COND)
1924        {        {
       int branchcount = 0;  
1925        uschar *tc = code;        uschar *tc = code;
1926          condcount = 0;
1927    
1928        do {        do {
1929           branchcount++;           condcount++;
1930           tc += (tc[1] << 8) | tc[2];           tc += (tc[1] << 8) | tc[2];
1931           }           }
1932        while (*tc != OP_KET);        while (*tc != OP_KET);
1933    
1934        if (branchcount > 2)        if (condcount > 2)
1935          {          {
1936          *errorptr = ERR27;          *errorptr = ERR27;
1937          goto FAILED;          goto FAILED;
1938          }          }
1939        }        }
1940    
1941        /* Handle updating of the required character. If the subpattern didn't
1942        set one, leave it as it was. Otherwise, update it for normal brackets of
1943        all kinds, forward assertions, and conditions with two branches. Don't
1944        update the literal count for forward assertions, however. If the bracket
1945        is followed by a quantifier with zero repeat, we have to back off. Hence
1946        the definition of prevreqchar and subcountlits outside the main loop so
1947        that they can be accessed for the back off. */
1948    
1949        if (subreqchar > 0 &&
1950             (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1951             (bravalue == OP_COND && condcount == 2)))
1952          {
1953          prevreqchar = *reqchar;
1954          *reqchar = subreqchar;
1955          if (bravalue != OP_ASSERT) *countlits += subcountlits;
1956          }
1957    
1958      /* Now update the main code pointer to the end of the group. */      /* Now update the main code pointer to the end of the group. */
1959    
1960      code = tempcode;      code = tempcode;
# Line 1391  for (;; ptr++) Line 1987  for (;; ptr++)
1987        {        {
1988        if (-c >= ESC_REF)        if (-c >= ESC_REF)
1989          {          {
1990            int number = -c - ESC_REF;
1991          previous = code;          previous = code;
1992          *code++ = OP_REF;          *code++ = OP_REF;
1993          *code++ = -c - ESC_REF;          *code++ = number >> 8;
1994            *code++ = number & 255;
1995          }          }
1996        else        else
1997          {          {
# Line 1426  for (;; ptr++) Line 2024  for (;; ptr++)
2024          if ((cd->ctypes[c] & ctype_space) != 0) continue;          if ((cd->ctypes[c] & ctype_space) != 0) continue;
2025          if (c == '#')          if (c == '#')
2026            {            {
2027            while ((c = *(++ptr)) != 0 && c != '\n');            /* The space before the ; is to avoid a warning on a silly compiler
2028              on the Macintosh. */
2029              while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2030            if (c == 0) break;            if (c == 0) break;
2031            continue;            continue;
2032            }            }
# Line 1441  for (;; ptr++) Line 2041  for (;; ptr++)
2041          tempptr = ptr;          tempptr = ptr;
2042          c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);          c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
2043          if (c < 0) { ptr = tempptr; break; }          if (c < 0) { ptr = tempptr; break; }
2044    
2045            /* If a character is > 127 in UTF-8 mode, we have to turn it into
2046            two or more characters in the UTF-8 encoding. */
2047    
2048    #ifdef SUPPORT_UTF8
2049            if (c > 127 && (options & PCRE_UTF8) != 0)
2050              {
2051              uschar buffer[8];
2052              int len = ord2utf8(c, buffer);
2053              for (c = 0; c < len; c++) *code++ = buffer[c];
2054              length += len;
2055              continue;
2056              }
2057    #endif
2058          }          }
2059    
2060        /* Ordinary character or single-char escape */        /* Ordinary character or single-char escape */
# Line 1451  for (;; ptr++) Line 2065  for (;; ptr++)
2065    
2066      /* This "while" is the end of the "do" above. */      /* This "while" is the end of the "do" above. */
2067    
2068      while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);      while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
2069    
2070        /* Update the last character and the count of literals */
2071    
2072        prevreqchar = (length > 1)? code[-2] : *reqchar;
2073        *reqchar = code[-1];
2074        *countlits += length;
2075    
2076      /* Compute the length and set it in the data vector, and advance to      /* Compute the length and set it in the data vector, and advance to
2077      the next state. */      the next state. */
2078    
2079      previous[1] = length;      previous[1] = length;
2080      if (length < 255) ptr--;      if (length < MAXLIT) ptr--;
2081      break;      break;
2082      }      }
2083    }                   /* end of big loop */    }                   /* end of big loop */
# Line 1495  Argument: Line 2115  Argument:
2115    ptrptr      -> the address of the current pattern pointer    ptrptr      -> the address of the current pattern pointer
2116    errorptr    -> pointer to error message    errorptr    -> pointer to error message
2117    lookbehind  TRUE if this is a lookbehind assertion    lookbehind  TRUE if this is a lookbehind assertion
2118    condref     > 0 for OPT_CREF setting at start of conditional group    skipbytes   skip this many bytes at start (for OP_COND, OP_BRANUMBER)
2119      reqchar     -> place to put the last required character, or a negative number
2120      countlits   -> place to put the shortest literal count of any branch
2121    cd          points to the data block with tables pointers    cd          points to the data block with tables pointers
2122    
2123  Returns:      TRUE on success  Returns:      TRUE on success
# Line 1503  Returns:      TRUE on success Line 2125  Returns:      TRUE on success
2125    
2126  static BOOL  static BOOL
2127  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
2128    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
2129    compile_data *cd)    int *reqchar, int *countlits, compile_data *cd)
2130  {  {
2131  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
2132  uschar *code = *codeptr;  uschar *code = *codeptr;
# Line 1512  uschar *last_branch = code; Line 2134  uschar *last_branch = code;
2134  uschar *start_bracket = code;  uschar *start_bracket = code;
2135  uschar *reverse_count = NULL;  uschar *reverse_count = NULL;
2136  int oldoptions = options & PCRE_IMS;  int oldoptions = options & PCRE_IMS;
2137    int branchreqchar, branchcountlits;
2138    
2139  code += 3;  *reqchar = -1;
2140    *countlits = INT_MAX;
2141  /* At the start of a reference-based conditional group, insert the reference  code += 3 + skipbytes;
 number as an OP_CREF item. */  
   
 if (condref > 0)  
   {  
   *code++ = OP_CREF;  
   *code++ = condref;  
   }  
2142    
2143  /* Loop for each alternative branch */  /* Loop for each alternative branch */
2144    
# Line 1551  for (;;) Line 2167  for (;;)
2167    
2168    /* Now compile the branch */    /* Now compile the branch */
2169    
2170    if (!compile_branch(options,brackets,&code,&ptr,errorptr,&optchanged,cd))    if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
2171          &branchreqchar, &branchcountlits, cd))
2172      {      {
2173      *ptrptr = ptr;      *ptrptr = ptr;
2174      return FALSE;      return FALSE;
# Line 1563  for (;;) Line 2180  for (;;)
2180    last_branch[1] = length >> 8;    last_branch[1] = length >> 8;
2181    last_branch[2] = length & 255;    last_branch[2] = length & 255;
2182    
2183      /* Save the last required character if all branches have the same; a current
2184      value of -1 means unset, while -2 means "previous branch had no last required
2185      char".  */
2186    
2187      if (*reqchar != -2)
2188        {
2189        if (branchreqchar >= 0)
2190          {
2191          if (*reqchar == -1) *reqchar = branchreqchar;
2192          else if (*reqchar != branchreqchar) *reqchar = -2;
2193          }
2194        else *reqchar = -2;
2195        }
2196    
2197      /* Keep the shortest literal count */
2198    
2199      if (branchcountlits < *countlits) *countlits = branchcountlits;
2200      DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
2201    
2202    /* If lookbehind, check that this branch matches a fixed-length string,    /* If lookbehind, check that this branch matches a fixed-length string,
2203    and put the length into the OP_REVERSE item. Temporarily mark the end of    and put the length into the OP_REVERSE item. Temporarily mark the end of
2204    the branch with OP_END. */    the branch with OP_END. */
# Line 1570  for (;;) Line 2206  for (;;)
2206    if (lookbehind)    if (lookbehind)
2207      {      {
2208      *code = OP_END;      *code = OP_END;
2209      length = find_fixedlength(last_branch);      length = find_fixedlength(last_branch, options);
2210      DPRINTF(("fixed length = %d\n", length));      DPRINTF(("fixed length = %d\n", length));
2211      if (length < 0)      if (length < 0)
2212        {        {
# Line 1654  for (;;) Line 2290  for (;;)
2290      break;      break;
2291    
2292      case OP_CREF:      case OP_CREF:
2293      code += 2;      case OP_BRANUMBER:
2294        code += 3;
2295        break;
2296    
2297        case OP_WORD_BOUNDARY:
2298        case OP_NOT_WORD_BOUNDARY:
2299        code++;
2300      break;      break;
2301    
2302      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
# Line 1684  all of whose alternatives start with OP_ Line 2326  all of whose alternatives start with OP_
2326  it's anchored. However, if this is a multiline pattern, then only OP_SOD  it's anchored. However, if this is a multiline pattern, then only OP_SOD
2327  counts, since OP_CIRC can match in the middle.  counts, since OP_CIRC can match in the middle.
2328    
2329  A branch is also implicitly anchored if it starts with .* because that will try  A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2330  the rest of the pattern at all possible matching points, so there is no point  because that will try the rest of the pattern at all possible matching points,
2331  trying them again.  so there is no point trying them again.
2332    
2333  Arguments:  Arguments:
2334    code       points to start of expression (the bracket)    code       points to start of expression (the bracket)
# Line 1704  do { Line 2346  do {
2346     register int op = *scode;     register int op = *scode;
2347     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2348       { if (!is_anchored(scode, options)) return FALSE; }       { if (!is_anchored(scode, options)) return FALSE; }
2349     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
2350                (*options & PCRE_DOTALL) != 0)
2351       { if (scode[1] != OP_ANY) return FALSE; }       { if (scode[1] != OP_ANY) return FALSE; }
2352     else if (op != OP_SOD &&     else if (op != OP_SOD &&
2353             ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))             ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
# Line 1718  return TRUE; Line 2361  return TRUE;
2361    
2362    
2363  /*************************************************  /*************************************************
2364  *     Check for start with \n line expression    *  *         Check for starting with ^ or .*        *
2365  *************************************************/  *************************************************/
2366    
2367  /* This is called for multiline expressions to try to find out if every branch  /* This is called to find out if every branch starts with ^ or .* so that
2368  starts with ^ so that "first char" processing can be done to speed things up.  "first char" processing can be done to speed things up in multiline
2369    matching and for non-DOTALL patterns that start with .* (which must start at
2370    the beginning or after \n).
2371    
2372  Argument:  points to start of expression (the bracket)  Argument:  points to start of expression (the bracket)
2373  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
# Line 1736  do { Line 2381  do {
2381     register int op = *scode;     register int op = *scode;
2382     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2383       { if (!is_startline(scode)) return FALSE; }       { if (!is_startline(scode)) return FALSE; }
2384       else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2385         { if (scode[1] != OP_ANY) return FALSE; }
2386     else if (op != OP_CIRC) return FALSE;     else if (op != OP_CIRC) return FALSE;
2387     code += (code[1] << 8) + code[2];     code += (code[1] << 8) + code[2];
2388     }     }
# Line 1834  pcre_compile(const char *pattern, int op Line 2481  pcre_compile(const char *pattern, int op
2481  real_pcre *re;  real_pcre *re;
2482  int length = 3;      /* For initial BRA plus length */  int length = 3;      /* For initial BRA plus length */
2483  int runlength;  int runlength;
2484  int c, size;  int c, reqchar, countlits;
2485  int bracount = 0;  int bracount = 0;
2486  int top_backref = 0;  int top_backref = 0;
2487  int branch_extra = 0;  int branch_extra = 0;
2488  int branch_newextra;  int branch_newextra;
2489  unsigned int brastackptr = 0;  unsigned int brastackptr = 0;
2490    size_t size;
2491  uschar *code;  uschar *code;
2492  const uschar *ptr;  const uschar *ptr;
2493  compile_data compile_block;  compile_data compile_block;
# Line 1850  uschar bralenstack[BRASTACK_SIZE]; Line 2498  uschar bralenstack[BRASTACK_SIZE];
2498  uschar *code_base, *code_end;  uschar *code_base, *code_end;
2499  #endif  #endif
2500    
2501    /* Can't support UTF8 unless PCRE has been compiled to include the code. */
2502    
2503    #ifndef SUPPORT_UTF8
2504    if ((options & PCRE_UTF8) != 0)
2505      {
2506      *errorptr = ERR32;
2507      return NULL;
2508      }
2509    #endif
2510    
2511  /* We can't pass back an error message if errorptr is NULL; I guess the best we  /* We can't pass back an error message if errorptr is NULL; I guess the best we
2512  can do is just return NULL. */  can do is just return NULL. */
2513    
# Line 1896  while ((c = *(++ptr)) != 0) Line 2554  while ((c = *(++ptr)) != 0)
2554    {    {
2555    int min, max;    int min, max;
2556    int class_charcount;    int class_charcount;
2557      int bracket_length;
2558    
2559    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
2560      {      {
2561      if ((compile_block.ctypes[c] & ctype_space) != 0) continue;      if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2562      if (c == '#')      if (c == '#')
2563        {        {
2564        while ((c = *(++ptr)) != 0 && c != '\n');        /* The space before the ; is to avoid a warning on a silly compiler
2565          on the Macintosh. */
2566          while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2567        continue;        continue;
2568        }        }
2569      }      }
# Line 1928  while ((c = *(++ptr)) != 0) Line 2589  while ((c = *(++ptr)) != 0)
2589        }        }
2590      length++;      length++;
2591    
2592      /* A back reference needs an additional char, plus either one or 5      /* A back reference needs an additional 2 bytes, plus either one or 5
2593      bytes for a repeat. We also need to keep the value of the highest      bytes for a repeat. We also need to keep the value of the highest
2594      back reference. */      back reference. */
2595    
# Line 1936  while ((c = *(++ptr)) != 0) Line 2597  while ((c = *(++ptr)) != 0)
2597        {        {
2598        int refnum = -c - ESC_REF;        int refnum = -c - ESC_REF;
2599        if (refnum > top_backref) top_backref = refnum;        if (refnum > top_backref) top_backref = refnum;
2600        length++;   /* For single back reference */        length += 2;   /* For single back reference */
2601        if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))        if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2602          {          {
2603          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
# Line 2034  while ((c = *(++ptr)) != 0) Line 2695  while ((c = *(++ptr)) != 0)
2695    
2696      case '(':      case '(':
2697      branch_newextra = 0;      branch_newextra = 0;
2698        bracket_length = 3;
2699    
2700      /* Handle special forms of bracket, which all start (? */      /* Handle special forms of bracket, which all start (? */
2701    
# Line 2067  while ((c = *(++ptr)) != 0) Line 2729  while ((c = *(++ptr)) != 0)
2729          ptr += 2;          ptr += 2;
2730          break;          break;
2731    
2732            /* A recursive call to the regex is an extension, to provide the
2733            facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2734    
2735            case 'R':
2736            if (ptr[3] != ')')
2737              {
2738              *errorptr = ERR29;
2739              goto PCRE_ERROR_RETURN;
2740              }
2741            ptr += 3;
2742            length += 1;
2743            break;
2744    
2745          /* Lookbehinds are in Perl from version 5.005 */          /* Lookbehinds are in Perl from version 5.005 */
2746    
2747          case '<':          case '<':
# Line 2088  while ((c = *(++ptr)) != 0) Line 2763  while ((c = *(++ptr)) != 0)
2763          if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)          if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2764            {            {
2765            ptr += 4;            ptr += 4;
2766            length += 2;            length += 3;
2767            while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;            while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2768            if (*ptr != ')')            if (*ptr != ')')
2769              {              {
# Line 2099  while ((c = *(++ptr)) != 0) Line 2774  while ((c = *(++ptr)) != 0)
2774          else   /* An assertion must follow */          else   /* An assertion must follow */
2775            {            {
2776            ptr++;   /* Can treat like ':' as far as spacing is concerned */            ptr++;   /* Can treat like ':' as far as spacing is concerned */
2777              if (ptr[2] != '?' ||
2778            if (ptr[2] != '?' || strchr("=!<", ptr[3]) == NULL)               (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
2779              {              {
2780              ptr += 2;    /* To get right offset in message */              ptr += 2;    /* To get right offset in message */
2781              *errorptr = ERR28;              *errorptr = ERR28;
# Line 2174  while ((c = *(++ptr)) != 0) Line 2849  while ((c = *(++ptr)) != 0)
2849              will lead to an over-estimate on the length, but this shouldn't              will lead to an over-estimate on the length, but this shouldn't
2850              matter very much. We also have to allow for resetting options at              matter very much. We also have to allow for resetting options at
2851              the start of any alternations, which we do by setting              the start of any alternations, which we do by setting
2852              branch_newextra to 2. */              branch_newextra to 2. Finally, we record whether the case-dependent
2853                flag ever changes within the regex. This is used by the "required
2854                character" code. */
2855    
2856              case ':':              case ':':
2857              if (((set|unset) & PCRE_IMS) != 0)              if (((set|unset) & PCRE_IMS) != 0)
2858                {                {
2859                length += 4;                length += 4;
2860                branch_newextra = 2;                branch_newextra = 2;
2861                  if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2862                }                }
2863              goto END_OPTIONS;              goto END_OPTIONS;
2864    
# Line 2212  while ((c = *(++ptr)) != 0) Line 2890  while ((c = *(++ptr)) != 0)
2890        }        }
2891    
2892      /* Extracting brackets must be counted so we can process escapes in a      /* Extracting brackets must be counted so we can process escapes in a
2893      Perlish way. */      Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
2894        need an additional 3 bytes of store per extracting bracket. */
2895    
2896      else bracount++;      else
2897          {
2898          bracount++;
2899          if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
2900          }
2901    
2902      /* Non-special forms of bracket. Save length for computing whole length      /* Save length for computing whole length at end if there's a repeat that
2903      at end if there's a repeat that requires duplication of the group. Also      requires duplication of the group. Also save the current value of
2904      save the current value of branch_extra, and start the new group with      branch_extra, and start the new group with the new value. If non-zero, this
2905      the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3      will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
     for a lookbehind assertion. */  
2906    
2907      if (brastackptr >= sizeof(brastack)/sizeof(int))      if (brastackptr >= sizeof(brastack)/sizeof(int))
2908        {        {
# Line 2232  while ((c = *(++ptr)) != 0) Line 2914  while ((c = *(++ptr)) != 0)
2914      branch_extra = branch_newextra;      branch_extra = branch_newextra;
2915    
2916      brastack[brastackptr++] = length;      brastack[brastackptr++] = length;
2917      length += 3;      length += bracket_length;
2918      continue;      continue;
2919    
2920      /* Handle ket. Look for subsequent max/min; for certain sets of values we      /* Handle ket. Look for subsequent max/min; for certain sets of values we
# Line 2268  while ((c = *(++ptr)) != 0) Line 2950  while ((c = *(++ptr)) != 0)
2950        else if (c == '+') { maxval = -1; ptr++; }        else if (c == '+') { maxval = -1; ptr++; }
2951        else if (c == '?') { minval = 0; ptr++; }        else if (c == '?') { minval = 0; ptr++; }
2952    
2953        /* If there is a minimum > 1 we have to replicate up to minval-1 times;        /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2954        if there is a limited maximum we have to replicate up to maxval-1 times        group, and if the maximum is greater than zero, we have to replicate
2955        and allow for a BRAZERO item before each optional copy, as we also have        maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2956        to do before the first copy if the minimum is zero. */        bracket set - hence the 7. */
2957    
2958        if (minval == 0) length++;        if (minval == 0)
2959          else if (minval > 1) length += (minval - 1) * duplength;          {
2960        if (maxval > minval) length += (maxval - minval) * (duplength + 1);          length++;
2961            if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2962            }
2963    
2964          /* When the minimum is greater than zero, we have to replicate up to
2965          minval-1 times, with no additions required in the copies. Then, if
2966          there is a limited maximum we have to replicate up to maxval-1 times
2967          allowing for a BRAZERO item before each optional copy and nesting
2968          brackets for all but one of the optional copies. */
2969    
2970          else
2971            {
2972            length += (minval - 1) * duplength;
2973            if (maxval > minval)   /* Need this test as maxval=-1 means no limit */
2974              length += (maxval - minval) * (duplength + 7) - 6;
2975            }
2976        }        }
2977      continue;      continue;
2978    
# Line 2295  while ((c = *(++ptr)) != 0) Line 2992  while ((c = *(++ptr)) != 0)
2992          if ((compile_block.ctypes[c] & ctype_space) != 0) continue;          if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2993          if (c == '#')          if (c == '#')
2994            {            {
2995            while ((c = *(++ptr)) != 0 && c != '\n');            /* The space before the ; is to avoid a warning on a silly compiler
2996              on the Macintosh. */
2997              while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2998            continue;            continue;
2999            }            }
3000          }          }
# Line 2310  while ((c = *(++ptr)) != 0) Line 3009  while ((c = *(++ptr)) != 0)
3009            &compile_block);            &compile_block);
3010          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3011          if (c < 0) { ptr = saveptr; break; }          if (c < 0) { ptr = saveptr; break; }
3012    
3013    #ifdef SUPPORT_UTF8
3014            if (c > 127 && (options & PCRE_UTF8) != 0)
3015              {
3016              int i;
3017              for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
3018                if (c <= utf8_table1[i]) break;
3019              runlength += i;
3020              }
3021    #endif
3022          }          }
3023    
3024        /* Ordinary character or single-char escape */        /* Ordinary character or single-char escape */
# Line 2319  while ((c = *(++ptr)) != 0) Line 3028  while ((c = *(++ptr)) != 0)
3028    
3029      /* This "while" is the end of the "do" above. */      /* This "while" is the end of the "do" above. */
3030    
3031      while (runlength < 255 &&      while (runlength < MAXLIT &&
3032        (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);        (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
3033    
3034      ptr--;      ptr--;
# Line 2351  if (re == NULL) Line 3060  if (re == NULL)
3060    return NULL;    return NULL;
3061    }    }
3062    
3063  /* Put in the magic number and the options. */  /* Put in the magic number, and save the size, options, and table pointer */
3064    
3065  re->magic_number = MAGIC_NUMBER;  re->magic_number = MAGIC_NUMBER;
3066    re->size = size;
3067  re->options = options;  re->options = options;
3068  re->tables = tables;  re->tables = tables;
3069    
# Line 2365  ptr = (const uschar *)pattern; Line 3075  ptr = (const uschar *)pattern;
3075  code = re->code;  code = re->code;
3076  *code = OP_BRA;  *code = OP_BRA;
3077  bracount = 0;  bracount = 0;
3078  (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,  (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, 0,
3079    &compile_block);    &reqchar, &countlits, &compile_block);
3080  re->top_bracket = bracount;  re->top_bracket = bracount;
3081  re->top_backref = top_backref;  re->top_backref = top_backref;
3082    
# Line 2398  if (*errorptr != NULL) Line 3108  if (*errorptr != NULL)
3108    return NULL;    return NULL;
3109    }    }
3110    
3111  /* If the anchored option was not passed, set flag if we can determine that it  /* If the anchored option was not passed, set flag if we can determine that the
3112  is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if  pattern is anchored by virtue of ^ characters or \A or anything else (such as
3113  we can determine what the first character has to be, because that speeds up  starting with .* when DOTALL is set).
3114  unanchored matches no end. In the case of multiline matches, an alternative is  
3115  to set the PCRE_STARTLINE flag if all branches start with ^. */  Otherwise, see if we can determine what the first character has to be, because
3116    that speeds up unanchored matches no end. If not, see if we can set the
3117    PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
3118    start with ^, and also when all branches start with .* for non-DOTALL matches.
3119    */
3120    
3121  if ((options & PCRE_ANCHORED) == 0)  if ((options & PCRE_ANCHORED) == 0)
3122    {    {
# Line 2422  if ((options & PCRE_ANCHORED) == 0) Line 3136  if ((options & PCRE_ANCHORED) == 0)
3136      }      }
3137    }    }
3138    
3139    /* Save the last required character if there are at least two literal
3140    characters on all paths, or if there is no first character setting. */
3141    
3142    if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
3143      {
3144      re->req_char = reqchar;
3145      re->options |= PCRE_REQCHSET;
3146      }
3147    
3148  /* Print out the compiled data for debugging */  /* Print out the compiled data for debugging */
3149    
3150  #ifdef DEBUG  #ifdef DEBUG
# Line 2431  printf("Length = %d top_bracket = %d top Line 3154  printf("Length = %d top_bracket = %d top
3154    
3155  if (re->options != 0)  if (re->options != 0)
3156    {    {
3157    printf("%s%s%s%s%s%s%s%s\n",    printf("%s%s%s%s%s%s%s%s%s\n",
3158      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
3159      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
3160        ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
3161      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
3162      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
3163      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
# Line 2448  if ((re->options & PCRE_FIRSTSET) != 0) Line 3172  if ((re->options & PCRE_FIRSTSET) != 0)
3172      else printf("First char = \\x%02x\n", re->first_char);      else printf("First char = \\x%02x\n", re->first_char);
3173    }    }
3174    
3175    if ((re->options & PCRE_REQCHSET) != 0)
3176      {
3177      if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
3178        else printf("Req char = \\x%02x\n", re->req_char);
3179      }
3180    
3181  code_end = code;  code_end = code;
3182  code_base = code = re->code;  code_base = code = re->code;
3183    
# Line 2459  while (code < code_end) Line 3189  while (code < code_end)
3189    
3190    if (*code >= OP_BRA)    if (*code >= OP_BRA)
3191      {      {
3192      printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);      if (*code - OP_BRA > EXTRACT_BASIC_MAX)
3193          printf("%3d Bra extra", (code[1] << 8) + code[2]);
3194        else
3195          printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
3196      code += 2;      code += 2;
3197      }      }
3198    
# Line 2470  while (code < code_end) Line 3203  while (code < code_end)
3203      code++;      code++;
3204      break;      break;
3205    
     case OP_COND:  
     printf("%3d Cond", (code[1] << 8) + code[2]);  
     code += 2;  
     break;  
   
     case OP_CREF:  
     printf(" %.2d %s", code[1], OP_names[*code]);  
     code++;  
     break;  
   
3206      case OP_CHARS:      case OP_CHARS:
3207      charlength = *(++code);      charlength = *(++code);
3208      printf("%3d ", charlength);      printf("%3d ", charlength);
# Line 2496  while (code < code_end) Line 3219  while (code < code_end)
3219      case OP_ASSERTBACK:      case OP_ASSERTBACK:
3220      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
3221      case OP_ONCE:      case OP_ONCE:
     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);  
     code += 2;  
     break;  
   
3222      case OP_REVERSE:      case OP_REVERSE:
3223        case OP_BRANUMBER:
3224        case OP_COND:
3225        case OP_CREF:
3226      printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);      printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
3227      code += 2;      code += 2;
3228      break;      break;
# Line 2573  while (code < code_end) Line 3295  while (code < code_end)
3295      break;      break;
3296    
3297      case OP_REF:      case OP_REF:
3298      printf("    \\%d", *(++code));      printf("    \\%d", (code[1] << 8) | code[2]);
3299      code ++;      code += 3;
3300      goto CLASS_REF_REPEAT;      goto CLASS_REF_REPEAT;
3301    
3302      case OP_CLASS:      case OP_CLASS:
# Line 2681  Returns:      TRUE if matched Line 3403  Returns:      TRUE if matched
3403    
3404  static BOOL  static BOOL
3405  match_ref(int offset, register const uschar *eptr, int length, match_data *md,  match_ref(int offset, register const uschar *eptr, int length, match_data *md,
3406    int ims)    unsigned long int ims)
3407  {  {
3408  const uschar *p = md->start_subject + md->offset_vector[offset];  const uschar *p = md->start_subject + md->offset_vector[offset];
3409    
# Line 2732  Arguments: Line 3454  Arguments:
3454     offset_top  current top pointer     offset_top  current top pointer
3455     md          pointer to "static" info for the match     md          pointer to "static" info for the match
3456     ims         current /i, /m, and /s options     ims         current /i, /m, and /s options
3457     condassert  TRUE if called to check a condition assertion     eptrb       pointer to chain of blocks containing eptr at start of
3458     eptrb       eptr at start of last bracket                   brackets - for testing for empty matches
3459       flags       can contain
3460                     match_condassert - this is an assertion condition
3461                     match_isgroup - this is the start of a bracketed group
3462    
3463  Returns:       TRUE if matched  Returns:       TRUE if matched
3464  */  */
3465    
3466  static BOOL  static BOOL
3467  match(register const uschar *eptr, register const uschar *ecode,  match(register const uschar *eptr, register const uschar *ecode,
3468    int offset_top, match_data *md, int ims, BOOL condassert, const uschar *eptrb)    int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
3469      int flags)
3470  {  {
3471  int original_ims = ims;   /* Save for resetting on ')' */  unsigned long int original_ims = ims;   /* Save for resetting on ')' */
3472    eptrblock newptrb;
3473    
3474    /* At the start of a bracketed group, add the current subject pointer to the
3475    stack of such pointers, to be re-instated at the end of the group when we hit
3476    the closing ket. When match() is called in other circumstances, we don't add to
3477    the stack. */
3478    
3479    if ((flags & match_isgroup) != 0)
3480      {
3481      newptrb.prev = eptrb;
3482      newptrb.saved_eptr = eptr;
3483      eptrb = &newptrb;
3484      }
3485    
3486    /* Now start processing the operations. */
3487    
3488  for (;;)  for (;;)
3489    {    {
# Line 2768  for (;;) Line 3509  for (;;)
3509    
3510    if (op > OP_BRA)    if (op > OP_BRA)
3511      {      {
3512        int offset;
3513      int number = op - OP_BRA;      int number = op - OP_BRA;
     int offset = number << 1;  
3514    
3515      DPRINTF(("start bracket %d\n", number));      /* For extended extraction brackets (large number), we have to fish out the
3516        number from a dummy opcode at the start. */
3517    
3518        if (number > EXTRACT_BASIC_MAX) number = (ecode[4] << 8) | ecode[5];
3519        offset = number << 1;
3520    
3521    #ifdef DEBUG
3522        printf("start bracket %d subject=", number);
3523        pchars(eptr, 16, TRUE, md);
3524        printf("\n");
3525    #endif
3526    
3527      if (offset < md->offset_max)      if (offset < md->offset_max)
3528        {        {
# Line 2784  for (;;) Line 3535  for (;;)
3535    
3536        do        do
3537          {          {
3538          if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;          if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3539              return TRUE;
3540          ecode += (ecode[1] << 8) + ecode[2];          ecode += (ecode[1] << 8) + ecode[2];
3541          }          }
3542        while (*ecode == OP_ALT);        while (*ecode == OP_ALT);
# Line 2794  for (;;) Line 3546  for (;;)
3546        md->offset_vector[offset] = save_offset1;        md->offset_vector[offset] = save_offset1;
3547        md->offset_vector[offset+1] = save_offset2;        md->offset_vector[offset+1] = save_offset2;
3548        md->offset_vector[md->offset_end - number] = save_offset3;        md->offset_vector[md->offset_end - number] = save_offset3;
3549    
3550        return FALSE;        return FALSE;
3551        }        }
3552    
# Line 2810  for (;;) Line 3563  for (;;)
3563      DPRINTF(("start bracket 0\n"));      DPRINTF(("start bracket 0\n"));
3564      do      do
3565        {        {
3566        if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;        if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3567            return TRUE;
3568        ecode += (ecode[1] << 8) + ecode[2];        ecode += (ecode[1] << 8) + ecode[2];
3569        }        }
3570      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
# Line 2825  for (;;) Line 3579  for (;;)
3579      case OP_COND:      case OP_COND:
3580      if (ecode[3] == OP_CREF)         /* Condition is extraction test */      if (ecode[3] == OP_CREF)         /* Condition is extraction test */
3581        {        {
3582        int offset = ecode[4] << 1;    /* Doubled reference number */        int offset = (ecode[4] << 9) | (ecode[5] << 1); /* Doubled ref number */
3583        return match(eptr,        return match(eptr,
3584          ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?          ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3585            5 : 3 + (ecode[1] << 8) + ecode[2]),            6 : 3 + (ecode[1] << 8) + ecode[2]),
3586          offset_top, md, ims, FALSE, eptr);          offset_top, md, ims, eptrb, match_isgroup);
3587        }        }
3588    
3589      /* The condition is an assertion. Call match() to evaluate it - setting      /* The condition is an assertion. Call match() to evaluate it - setting
# Line 2837  for (;;) Line 3591  for (;;)
3591    
3592      else      else
3593        {        {
3594        if (match(eptr, ecode+3, offset_top, md, ims, TRUE, NULL))        if (match(eptr, ecode+3, offset_top, md, ims, NULL,
3595              match_condassert | match_isgroup))
3596          {          {
3597          ecode += 3 + (ecode[4] << 8) + ecode[5];          ecode += 3 + (ecode[4] << 8) + ecode[5];
3598          while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];          while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3599          }          }
3600        else ecode += (ecode[1] << 8) + ecode[2];        else ecode += (ecode[1] << 8) + ecode[2];
3601        return match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr);        return match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup);
3602        }        }
3603      /* Control never reaches here */      /* Control never reaches here */
3604    
3605      /* Skip over conditional reference data if encountered (should not be) */      /* Skip over conditional reference or large extraction number data if
3606        encountered. */
3607    
3608      case OP_CREF:      case OP_CREF:
3609      ecode += 2;      case OP_BRANUMBER:
3610        ecode += 3;
3611      break;      break;
3612    
3613      /* End of the pattern */      /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3614        an empty string - recursion will then try other alternatives, if any. */
3615    
3616      case OP_END:      case OP_END:
3617        if (md->notempty && eptr == md->start_match) return FALSE;
3618      md->end_match_ptr = eptr;          /* Record where we ended */      md->end_match_ptr = eptr;          /* Record where we ended */
3619      md->end_offset_top = offset_top;   /* and how many extracts were taken */      md->end_offset_top = offset_top;   /* and how many extracts were taken */
3620      return TRUE;      return TRUE;
# Line 2865  for (;;) Line 3624  for (;;)
3624      case OP_OPT:      case OP_OPT:
3625      ims = ecode[1];      ims = ecode[1];
3626      ecode += 2;      ecode += 2;
3627      DPRINTF(("ims set to %02x\n", ims));      DPRINTF(("ims set to %02lx\n", ims));
3628      break;      break;
3629    
3630      /* Assertion brackets. Check the alternative branches in turn - the      /* Assertion brackets. Check the alternative branches in turn - the
# Line 2878  for (;;) Line 3637  for (;;)
3637      case OP_ASSERTBACK:      case OP_ASSERTBACK:
3638      do      do
3639        {        {
3640        if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) break;        if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup)) break;
3641        ecode += (ecode[1] << 8) + ecode[2];        ecode += (ecode[1] << 8) + ecode[2];
3642        }        }
3643      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
# Line 2886  for (;;) Line 3645  for (;;)
3645    
3646      /* If checking an assertion for a condition, return TRUE. */      /* If checking an assertion for a condition, return TRUE. */
3647    
3648      if (condassert) return TRUE;      if ((flags & match_condassert) != 0) return TRUE;
3649    
3650      /* Continue from after the assertion, updating the offsets high water      /* Continue from after the assertion, updating the offsets high water
3651      mark, since extracts may have been taken during the assertion. */      mark, since extracts may have been taken during the assertion. */
# Line 2902  for (;;) Line 3661  for (;;)
3661      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
3662      do      do
3663        {        {
3664        if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) return FALSE;        if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup))
3665            return FALSE;
3666        ecode += (ecode[1] << 8) + ecode[2];        ecode += (ecode[1] << 8) + ecode[2];
3667        }        }
3668      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
3669    
3670      if (condassert) return TRUE;      if ((flags & match_condassert) != 0) return TRUE;
3671    
3672      ecode += 3;      ecode += 3;
3673      continue;      continue;
3674    
3675      /* Move the subject pointer back. This occurs only at the start of      /* Move the subject pointer back. This occurs only at the start of
3676      each branch of a lookbehind assertion. If we are too close to the start to      each branch of a lookbehind assertion. If we are too close to the start to
3677      move back, this match function fails. */      move back, this match function fails. When working with UTF-8 we move
3678        back a number of characters, not bytes. */
3679    
3680      case OP_REVERSE:      case OP_REVERSE:
3681    #ifdef SUPPORT_UTF8
3682        c = (ecode[1] << 8) + ecode[2];
3683        for (i = 0; i < c; i++)
3684          {
3685          eptr--;
3686          BACKCHAR(eptr)
3687          }
3688    #else
3689      eptr -= (ecode[1] << 8) + ecode[2];      eptr -= (ecode[1] << 8) + ecode[2];
3690    #endif
3691    
3692      if (eptr < md->start_subject) return FALSE;      if (eptr < md->start_subject) return FALSE;
3693      ecode += 3;      ecode += 3;
3694      break;      break;
3695    
3696        /* Recursion matches the current regex, nested. If there are any capturing
3697        brackets started but not finished, we have to save their starting points
3698        and reinstate them after the recursion. However, we don't know how many
3699        such there are (offset_top records the completed total) so we just have
3700        to save all the potential data. There may be up to 99 such values, which
3701        is a bit large to put on the stack, but using malloc for small numbers
3702        seems expensive. As a compromise, the stack is used when there are fewer
3703        than 16 values to store; otherwise malloc is used. A problem is what to do
3704        if the malloc fails ... there is no way of returning to the top level with
3705        an error. Save the top 15 values on the stack, and accept that the rest
3706        may be wrong. */
3707    
3708        case OP_RECURSE:
3709          {
3710          BOOL rc;
3711          int *save;
3712          int stacksave[15];
3713    
3714          c = md->offset_max;
3715    
3716          if (c < 16) save = stacksave; else
3717            {
3718            save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3719            if (save == NULL)
3720              {
3721              save = stacksave;
3722              c = 15;
3723              }
3724            }
3725    
3726          for (i = 1; i <= c; i++)
3727            save[i] = md->offset_vector[md->offset_end - i];
3728          rc = match(eptr, md->start_pattern, offset_top, md, ims, eptrb,
3729            match_isgroup);
3730          for (i = 1; i <= c; i++)
3731            md->offset_vector[md->offset_end - i] = save[i];
3732          if (save != stacksave) (pcre_free)(save);
3733          if (!rc) return FALSE;
3734    
3735          /* In case the recursion has set more capturing values, save the final
3736          number, then move along the subject till after the recursive match,
3737          and advance one byte in the pattern code. */
3738    
3739          offset_top = md->end_offset_top;
3740          eptr = md->end_match_ptr;
3741          ecode++;
3742          }
3743        break;
3744    
3745      /* "Once" brackets are like assertion brackets except that after a match,      /* "Once" brackets are like assertion brackets except that after a match,
3746      the point in the subject string is not moved back. Thus there can never be      the point in the subject string is not moved back. Thus there can never be
# Line 2932  for (;;) Line 3752  for (;;)
3752      case OP_ONCE:      case OP_ONCE:
3753        {        {
3754        const uschar *prev = ecode;        const uschar *prev = ecode;
3755          const uschar *saved_eptr = eptr;
3756    
3757        do        do
3758          {          {
3759          if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) break;          if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3760              break;
3761          ecode += (ecode[1] << 8) + ecode[2];          ecode += (ecode[1] << 8) + ecode[2];
3762          }          }
3763        while (*ecode == OP_ALT);        while (*ecode == OP_ALT);
# Line 2958  for (;;) Line 3780  for (;;)
3780        5.005. If there is an options reset, it will get obeyed in the normal        5.005. If there is an options reset, it will get obeyed in the normal
3781        course of events. */        course of events. */
3782    
3783        if (*ecode == OP_KET || eptr == eptrb)        if (*ecode == OP_KET || eptr == saved_eptr)
3784          {          {
3785          ecode += 3;          ecode += 3;
3786          break;          break;
# Line 2972  for (;;) Line 3794  for (;;)
3794        if (ecode[3] == OP_OPT)        if (ecode[3] == OP_OPT)
3795          {          {
3796          ims = (ims & ~PCRE_IMS) | ecode[4];          ims = (ims & ~PCRE_IMS) | ecode[4];
3797          DPRINTF(("ims set to %02x at group repeat\n", ims));          DPRINTF(("ims set to %02lx at group repeat\n", ims));
3798          }          }
3799    
3800        if (*ecode == OP_KETRMIN)        if (*ecode == OP_KETRMIN)
3801          {          {
3802          if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||          if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3803              match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;              match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3804                  return TRUE;
3805          }          }
3806        else  /* OP_KETRMAX */        else  /* OP_KETRMAX */
3807          {          {
3808          if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||          if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3809              match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;              match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3810          }          }
3811        }        }
3812      return FALSE;      return FALSE;
# Line 3004  for (;;) Line 3827  for (;;)
3827      case OP_BRAZERO:      case OP_BRAZERO:
3828        {        {
3829        const uschar *next = ecode+1;        const uschar *next = ecode+1;
3830        if (match(eptr, next, offset_top, md, ims, FALSE, eptr)) return TRUE;        if (match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
3831            return TRUE;
3832        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3833        ecode = next + 3;        ecode = next + 3;
3834        }        }
# Line 3014  for (;;) Line 3838  for (;;)
3838        {        {
3839        const uschar *next = ecode+1;        const uschar *next = ecode+1;
3840        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3841        if (match(eptr, next+3, offset_top, md, ims, FALSE, eptr)) return TRUE;        if (match(eptr, next+3, offset_top, md, ims, eptrb, match_isgroup))
3842            return TRUE;
3843        ecode++;        ecode++;
3844        }        }
3845      break;      break;
# Line 3029  for (;;) Line 3854  for (;;)
3854      case OP_KETRMAX:      case OP_KETRMAX:
3855        {        {
3856        const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];        const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3857          const uschar *saved_eptr = eptrb->saved_eptr;
3858    
3859          eptrb = eptrb->prev;    /* Back up the stack of bracket start pointers */
3860    
3861        if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||        if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3862            *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||            *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
# Line 3045  for (;;) Line 3873  for (;;)
3873    
3874        if (*prev != OP_COND)        if (*prev != OP_COND)
3875          {          {
3876            int offset;
3877          int number = *prev - OP_BRA;          int number = *prev - OP_BRA;
         int offset = number << 1;  
3878    
3879          DPRINTF(("end bracket %d\n", number));          /* For extended extraction brackets (large number), we have to fish out
3880            the number from a dummy opcode at the start. */
3881    
3882            if (number > EXTRACT_BASIC_MAX) number = (prev[4] << 8) | prev[5];
3883            offset = number << 1;
3884    
3885    #ifdef DEBUG
3886            printf("end bracket %d", number);
3887            printf("\n");
3888    #endif
3889    
3890          if (number > 0)          if (number > 0)
3891            {            {
# Line 3066  for (;;) Line 3903  for (;;)
3903        the group. */        the group. */
3904    
3905        ims = original_ims;        ims = original_ims;
3906        DPRINTF(("ims reset to %02x\n", ims));        DPRINTF(("ims reset to %02lx\n", ims));
3907    
3908        /* For a non-repeating ket, just continue at this level. This also        /* For a non-repeating ket, just continue at this level. This also
3909        happens for a repeating ket if no characters were matched in the group.        happens for a repeating ket if no characters were matched in the group.
# Line 3074  for (;;) Line 3911  for (;;)
3911        5.005. If there is an options reset, it will get obeyed in the normal        5.005. If there is an options reset, it will get obeyed in the normal
3912        course of events. */        course of events. */
3913    
3914        if (*ecode == OP_KET || eptr == eptrb)        if (*ecode == OP_KET || eptr == saved_eptr)
3915          {          {
3916          ecode += 3;          ecode += 3;
3917          break;          break;
# Line 3085  for (;;) Line 3922  for (;;)
3922    
3923        if (*ecode == OP_KETRMIN)        if (*ecode == OP_KETRMIN)
3924          {          {
3925          if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||          if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3926              match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;              match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3927                  return TRUE;
3928          }          }
3929        else  /* OP_KETRMAX */        else  /* OP_KETRMAX */
3930          {          {
3931          if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||          if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3932              match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;              match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3933          }          }
3934        }        }
3935      return FALSE;      return FALSE;
# Line 3102  for (;;) Line 3940  for (;;)
3940      if (md->notbol && eptr == md->start_subject) return FALSE;      if (md->notbol && eptr == md->start_subject) return FALSE;
3941      if ((ims & PCRE_MULTILINE) != 0)      if ((ims & PCRE_MULTILINE) != 0)
3942        {        {
3943        if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;        if (eptr != md->start_subject && eptr[-1] != NEWLINE) return FALSE;
3944        ecode++;        ecode++;
3945        break;        break;
3946        }        }
# Line 3121  for (;;) Line 3959  for (;;)
3959      case OP_DOLL:      case OP_DOLL:
3960      if ((ims & PCRE_MULTILINE) != 0)      if ((ims & PCRE_MULTILINE) != 0)
3961        {        {
3962        if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }        if (eptr < md->end_subject) { if (*eptr != NEWLINE) return FALSE; }
3963          else { if (md->noteol) return FALSE; }          else { if (md->noteol) return FALSE; }
3964        ecode++;        ecode++;
3965        break;        break;
# Line 3132  for (;;) Line 3970  for (;;)
3970        if (!md->endonly)        if (!md->endonly)
3971          {          {
3972          if (eptr < md->end_subject - 1 ||          if (eptr < md->end_subject - 1 ||
3973             (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;             (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return FALSE;
3974    
3975          ecode++;          ecode++;
3976          break;          break;
# Line 3151  for (;;) Line 3989  for (;;)
3989    
3990      case OP_EODN:      case OP_EODN:
3991      if (eptr < md->end_subject - 1 ||      if (eptr < md->end_subject - 1 ||
3992         (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;         (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return FALSE;
3993      ecode++;      ecode++;
3994      break;      break;
3995    
# Line 3173  for (;;) Line 4011  for (;;)
4011      /* Match a single character type; inline for speed */      /* Match a single character type; inline for speed */
4012    
4013      case OP_ANY:      case OP_ANY:
4014      if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')      if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
4015        return FALSE;        return FALSE;
4016      if (eptr++ >= md->end_subject) return FALSE;      if (eptr++ >= md->end_subject) return FALSE;
4017    #ifdef SUPPORT_UTF8
4018        if (md->utf8)
4019          while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4020    #endif
4021      ecode++;      ecode++;
4022      break;      break;
4023    
# Line 3232  for (;;) Line 4074  for (;;)
4074      case OP_REF:      case OP_REF:
4075        {        {
4076        int length;        int length;
4077        int offset = ecode[1] << 1;                /* Doubled reference number */        int offset = (ecode[1] << 9) | (ecode[2] << 1); /* Doubled ref number */
4078        ecode += 2;                                /* Advance past the item */        ecode += 3;                                     /* Advance past item */
4079    
4080        /* If the reference is unset, set the length to be longer than the amount        /* If the reference is unset, set the length to be longer than the amount
4081        of subject left; this ensures that every attempt at a match fails. We        of subject left; this ensures that every attempt at a match fails. We
# Line 3302  for (;;) Line 4144  for (;;)
4144          {          {
4145          for (i = min;; i++)          for (i = min;; i++)
4146            {            {
4147            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4148              return TRUE;              return TRUE;
4149            if (i >= max || !match_ref(offset, eptr, length, md, ims))            if (i >= max || !match_ref(offset, eptr, length, md, ims))
4150              return FALSE;              return FALSE;
# Line 3323  for (;;) Line 4165  for (;;)
4165            }            }
4166          while (eptr >= pp)          while (eptr >= pp)
4167            {            {
4168            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4169              return TRUE;              return TRUE;
4170            eptr -= length;            eptr -= length;
4171            }            }
# Line 3377  for (;;) Line 4219  for (;;)
4219        for (i = 1; i <= min; i++)        for (i = 1; i <= min; i++)
4220          {          {
4221          if (eptr >= md->end_subject) return FALSE;          if (eptr >= md->end_subject) return FALSE;
4222          c = *eptr++;          GETCHARINC(c, eptr)         /* Get character; increment eptr */
4223    
4224    #ifdef SUPPORT_UTF8
4225            /* We do not yet support class members > 255 */
4226            if (c > 255) return FALSE;
4227    #endif
4228    
4229          if ((data[c/8] & (1 << (c&7))) != 0) continue;          if ((data[c/8] & (1 << (c&7))) != 0) continue;
4230          return FALSE;          return FALSE;
4231          }          }
# Line 3394  for (;;) Line 4242  for (;;)
4242          {          {
4243          for (i = min;; i++)          for (i = min;; i++)
4244            {            {
4245            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4246              return TRUE;              return TRUE;
4247            if (i >= max || eptr >= md->end_subject) return FALSE;            if (i >= max || eptr >= md->end_subject) return FALSE;
4248            c = *eptr++;            GETCHARINC(c, eptr)       /* Get character; increment eptr */
4249    
4250    #ifdef SUPPORT_UTF8
4251              /* We do not yet support class members > 255 */
4252              if (c > 255) return FALSE;
4253    #endif
4254            if ((data[c/8] & (1 << (c&7))) != 0) continue;            if ((data[c/8] & (1 << (c&7))) != 0) continue;
4255            return FALSE;            return FALSE;
4256            }            }
# Line 3409  for (;;) Line 4262  for (;;)
4262        else        else
4263          {          {
4264          const uschar *pp = eptr;          const uschar *pp = eptr;
4265          for (i = min; i < max; eptr++, i++)          int len = 1;
4266            for (i = min; i < max; i++)
4267            {            {
4268            if (eptr >= md->end_subject) break;            if (eptr >= md->end_subject) break;
4269            c = *eptr;            GETCHARLEN(c, eptr, len)  /* Get character, set length if UTF-8 */
4270            if ((data[c/8] & (1 << (c&7))) != 0) continue;  
4271            break;  #ifdef SUPPORT_UTF8
4272              /* We do not yet support class members > 255 */
4273              if (c > 255) break;
4274    #endif
4275              if ((data[c/8] & (1 << (c&7))) == 0) break;
4276              eptr += len;
4277            }            }
4278    
4279          while (eptr >= pp)          while (eptr >= pp)
4280            if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))            {
4281              if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4282              return TRUE;              return TRUE;
4283    
4284    #ifdef SUPPORT_UTF8
4285              BACKCHAR(eptr)
4286    #endif
4287              }
4288          return FALSE;          return FALSE;
4289          }          }
4290        }        }
# Line 3515  for (;;) Line 4380  for (;;)
4380          {          {
4381          for (i = min;; i++)          for (i = min;; i++)
4382            {            {
4383            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4384              return TRUE;              return TRUE;
4385            if (i >= max || eptr >= md->end_subject ||            if (i >= max || eptr >= md->end_subject ||
4386                c != md->lcc[*eptr++])                c != md->lcc[*eptr++])
# Line 3532  for (;;) Line 4397  for (;;)
4397            eptr++;            eptr++;
4398            }            }
4399          while (eptr >= pp)          while (eptr >= pp)
4400            if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4401              return TRUE;              return TRUE;
4402          return FALSE;          return FALSE;
4403          }          }
# Line 3549  for (;;) Line 4414  for (;;)
4414          {          {
4415          for (i = min;; i++)          for (i = min;; i++)
4416            {            {
4417            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4418              return TRUE;              return TRUE;
4419            if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;            if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
4420            }            }
# Line 3564  for (;;) Line 4429  for (;;)
4429            eptr++;            eptr++;
4430            }            }
4431          while (eptr >= pp)          while (eptr >= pp)
4432           if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))           if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4433             return TRUE;             return TRUE;
4434          return FALSE;          return FALSE;
4435          }          }
# Line 3646  for (;;) Line 4511  for (;;)
4511          {          {
4512          for (i = min;; i++)          for (i = min;; i++)
4513            {            {
4514            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4515              return TRUE;              return TRUE;
4516            if (i >= max || eptr >= md->end_subject ||            if (i >= max || eptr >= md->end_subject ||
4517                c == md->lcc[*eptr++])                c == md->lcc[*eptr++])
# Line 3663  for (;;) Line 4528  for (;;)
4528            eptr++;            eptr++;
4529            }            }
4530          while (eptr >= pp)          while (eptr >= pp)
4531            if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4532              return TRUE;              return TRUE;
4533          return FALSE;          return FALSE;
4534          }          }
# Line 3680  for (;;) Line 4545  for (;;)
4545          {          {
4546          for (i = min;; i++)          for (i = min;; i++)
4547            {            {
4548            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4549              return TRUE;              return TRUE;
4550            if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;            if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
4551            }            }
# Line 3695  for (;;) Line 4560  for (;;)
4560            eptr++;            eptr++;
4561            }            }
4562          while (eptr >= pp)          while (eptr >= pp)
4563           if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))           if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4564             return TRUE;             return TRUE;
4565          return FALSE;          return FALSE;
4566          }          }
# Line 3739  for (;;) Line 4604  for (;;)
4604    
4605      /* First, ensure the minimum number of matches are present. Use inline      /* First, ensure the minimum number of matches are present. Use inline
4606      code for maximizing the speed, and do the type test once at the start      code for maximizing the speed, and do the type test once at the start
4607      (i.e. keep it out of the loop). Also test that there are at least the      (i.e. keep it out of the loop). Also we can test that there are at least
4608      minimum number of characters before we start. */      the minimum number of bytes before we start, except when doing '.' in
4609        UTF8 mode. Leave the test in in all cases; in the special case we have
4610        to test after each character. */
4611    
4612      if (min > md->end_subject - eptr) return FALSE;      if (min > md->end_subject - eptr) return FALSE;
4613      if (min > 0) switch(ctype)      if (min > 0) switch(ctype)
4614        {        {
4615        case OP_ANY:        case OP_ANY:
4616    #ifdef SUPPORT_UTF8
4617          if (md->utf8)
4618            {
4619            for (i = 1; i <= min; i++)
4620              {
4621              if (eptr >= md->end_subject ||
4622                 (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
4623                return FALSE;
4624              while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4625              }
4626            break;
4627            }
4628    #endif
4629          /* Non-UTF8 can be faster */
4630        if ((ims & PCRE_DOTALL) == 0)        if ((ims & PCRE_DOTALL) == 0)
4631          { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }          { for (i = 1; i <= min; i++) if (*eptr++ == NEWLINE) return FALSE; }
4632        else eptr += min;        else eptr += min;
4633        break;        break;
4634    
# Line 3795  for (;;) Line 4676  for (;;)
4676        {        {
4677        for (i = min;; i++)        for (i = min;; i++)
4678          {          {
4679          if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE;          if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) return TRUE;
4680          if (i >= max || eptr >= md->end_subject) return FALSE;          if (i >= max || eptr >= md->end_subject) return FALSE;
4681    
4682          c = *eptr++;          c = *eptr++;
4683          switch(ctype)          switch(ctype)
4684            {            {
4685            case OP_ANY:            case OP_ANY:
4686            if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;            if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) return FALSE;
4687    #ifdef SUPPORT_UTF8
4688              if (md->utf8)
4689                while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4690    #endif
4691            break;            break;
4692    
4693            case OP_NOT_DIGIT:            case OP_NOT_DIGIT:
# Line 3842  for (;;) Line 4727  for (;;)
4727        switch(ctype)        switch(ctype)
4728          {          {
4729          case OP_ANY:          case OP_ANY:
4730    
4731            /* Special code is required for UTF8, but when the maximum is unlimited
4732            we don't need it. */
4733    
4734    #ifdef SUPPORT_UTF8
4735            if (md->utf8 && max < INT_MAX)
4736              {
4737              if ((ims & PCRE_DOTALL) == 0)
4738                {
4739                for (i = min; i < max; i++)
4740                  {
4741                  if (eptr >= md->end_subject || *eptr++ == NEWLINE) break;
4742                  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4743                  }
4744                }
4745              else
4746                {
4747                for (i = min; i < max; i++)
4748                  {
4749                  eptr++;
4750                  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4751                  }
4752                }
4753              break;
4754              }
4755    #endif
4756            /* Non-UTF8 can be faster */
4757          if ((ims & PCRE_DOTALL) == 0)          if ((ims & PCRE_DOTALL) == 0)
4758            {            {
4759            for (i = min; i < max; i++)            for (i = min; i < max; i++)
4760              {              {
4761              if (eptr >= md->end_subject || *eptr == '\n') break;              if (eptr >= md->end_subject || *eptr == NEWLINE) break;
4762              eptr++;              eptr++;
4763              }              }
4764            }            }
# Line 3914  for (;;) Line 4826  for (;;)
4826          }          }
4827    
4828        while (eptr >= pp)        while (eptr >= pp)
4829          if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))          {
4830            if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4831            return TRUE;            return TRUE;
4832    #ifdef SUPPORT_UTF8
4833            if (md->utf8)
4834              while (eptr > pp && (*eptr & 0xc0) == 0x80) eptr--;
4835    #endif
4836            }
4837        return FALSE;        return FALSE;
4838        }        }
4839      /* Control never gets here */      /* Control never gets here */
# Line 3952  Arguments: Line 4870  Arguments:
4870    external_extra  points to "hints" from pcre_study() or is NULL    external_extra  points to "hints" from pcre_study() or is NULL
4871    subject         points to the subject string    subject         points to the subject string
4872    length          length of subject string (may contain binary zeros)    length          length of subject string (may contain binary zeros)
4873      start_offset    where to start in the subject string
4874    options         option bits    options         option bits
4875    offsets         points to a vector of ints to be filled in with offsets    offsets         points to a vector of ints to be filled in with offsets
4876    offsetcount     the number of elements in the vector    offsetcount     the number of elements in the vector
# Line 3964  Returns:          > 0 => success; value Line 4883  Returns:          > 0 => success; value
4883    
4884  int  int
4885  pcre_exec(const pcre *external_re, const pcre_extra *external_extra,  pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4886    const char *subject, int length, int options, int *offsets, int offsetcount)    const char *subject, int length, int start_offset, int options, int *offsets,
4887      int offsetcount)
4888  {  {
4889  int resetcount, ocount;  int resetcount, ocount;
4890  int first_char = -1;  int first_char = -1;
4891  int ims = 0;  int req_char = -1;
4892    int req_char2 = -1;
4893    unsigned long int ims = 0;
4894  match_data match_block;  match_data match_block;
4895  const uschar *start_bits = NULL;  const uschar *start_bits = NULL;
4896  const uschar *start_match = (const uschar *)subject;  const uschar *start_match = (const uschar *)subject + start_offset;
4897  const uschar *end_subject;  const uschar *end_subject;
4898    const uschar *req_char_ptr = start_match - 1;
4899  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
4900  const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;  const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4901  BOOL using_temporary_offsets = FALSE;  BOOL using_temporary_offsets = FALSE;
4902  BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;  BOOL anchored;
4903  BOOL startline = (re->options & PCRE_STARTLINE) != 0;  BOOL startline;
4904    
4905  if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;  if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4906    
# Line 3985  if (re == NULL || subject == NULL || Line 4908  if (re == NULL || subject == NULL ||
4908     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4909  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4910    
4911    anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4912    startline = (re->options & PCRE_STARTLINE) != 0;
4913    
4914    match_block.start_pattern = re->code;
4915  match_block.start_subject = (const uschar *)subject;  match_block.start_subject = (const uschar *)subject;
4916  match_block.end_subject = match_block.start_subject + length;  match_block.end_subject = match_block.start_subject + length;
4917  end_subject = match_block.end_subject;  end_subject = match_block.end_subject;
4918    
4919  match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;  match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4920    match_block.utf8 = (re->options & PCRE_UTF8) != 0;
4921    
4922  match_block.notbol = (options & PCRE_NOTBOL) != 0;  match_block.notbol = (options & PCRE_NOTBOL) != 0;
4923  match_block.noteol = (options & PCRE_NOTEOL) != 0;  match_block.noteol = (options & PCRE_NOTEOL) != 0;
4924    match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
4925    
4926  match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */  match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */
4927    
# Line 4033  in the pattern. */ Line 4962  in the pattern. */
4962  resetcount = 2 + re->top_bracket * 2;  resetcount = 2 + re->top_bracket * 2;
4963  if (resetcount > offsetcount) resetcount = ocount;  if (resetcount > offsetcount) resetcount = ocount;
4964    
4965    /* Reset the working variable associated with each extraction. These should
4966    never be used unless previously set, but they get saved and restored, and so we
4967    initialize them to avoid reading uninitialized locations. */
4968    
4969    if (match_block.offset_vector != NULL)
4970      {
4971      register int *iptr = match_block.offset_vector + ocount;
4972      register int *iend = iptr - resetcount/2 + 1;
4973      while (--iptr >= iend) *iptr = -1;
4974      }
4975    
4976  /* Set up the first character to match, if available. The first_char value is  /* Set up the first character to match, if available. The first_char value is
4977  never set for an anchored regular expression, but the anchoring may be forced  never set for an anchored regular expression, but the anchoring may be forced
4978  at run time, so we have to test for anchoring. The first char may be unset for  at run time, so we have to test for anchoring. The first char may be unset for
# Line 4052  if (!anchored) Line 4992  if (!anchored)
4992          start_bits = extra->start_bits;          start_bits = extra->start_bits;
4993    }    }
4994    
4995  /* Loop for unanchored matches; for anchored regexps the loop runs just once. */  /* For anchored or unanchored matches, there may be a "last known required
4996    character" set. If PCRE_CASELESS is set, implying that the match starts
4997    caselessly, or if there are any changes of this flag within the regex, set up
4998    both cases of the character. Otherwise set the two values the same, which will
4999    avoid duplicate testing (which takes significant time). This covers the vast
5000    majority of cases. It will be suboptimal when the case flag changes in a regex
5001    and the required character in fact is caseful. */
5002    
5003    if ((re->options & PCRE_REQCHSET) != 0)
5004      {
5005      req_char = re->req_char;
5006      req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
5007        (re->tables + fcc_offset)[req_char] : req_char;
5008      }
5009    
5010    /* Loop for handling unanchored repeated matching attempts; for anchored regexps
5011    the loop runs just once. */
5012    
5013  do  do
5014    {    {
# Line 4081  do Line 5037  do
5037    
5038    else if (startline)    else if (startline)
5039      {      {
5040      if (start_match > match_block.start_subject)      if (start_match > match_block.start_subject + start_offset)
5041        {        {
5042        while (start_match < end_subject && start_match[-1] != '\n')        while (start_match < end_subject && start_match[-1] != NEWLINE)
5043          start_match++;          start_match++;
5044        }        }
5045      }      }
5046    
5047    /* Or to a non-unique first char */    /* Or to a non-unique first char after study */
5048    
5049    else if (start_bits != NULL)    else if (start_bits != NULL)
5050      {      {
# Line 4105  do Line 5061  do
5061    printf("\n");    printf("\n");
5062  #endif  #endif
5063    
5064      /* If req_char is set, we know that that character must appear in the subject
5065      for the match to succeed. If the first character is set, req_char must be
5066      later in the subject; otherwise the test starts at the match point. This
5067      optimization can save a huge amount of backtracking in patterns with nested
5068      unlimited repeats that aren't going to match. We don't know what the state of
5069      case matching may be when this character is hit, so test for it in both its
5070      cases if necessary. However, the different cased versions will not be set up
5071      unless PCRE_CASELESS was given or the casing state changes within the regex.
5072      Writing separate code makes it go faster, as does using an autoincrement and
5073      backing off on a match. */
5074    
5075      if (req_char >= 0)
5076        {
5077        register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
5078    
5079        /* We don't need to repeat the search if we haven't yet reached the
5080        place we found it at last time. */
5081    
5082        if (p > req_char_ptr)
5083          {
5084          /* Do a single test if no case difference is set up */
5085    
5086          if (req_char == req_char2)
5087            {
5088            while (p < end_subject)
5089              {
5090              if (*p++ == req_char) { p--; break; }
5091              }
5092            }
5093    
5094          /* Otherwise test for either case */
5095    
5096          else
5097            {
5098            while (p < end_subject)
5099              {
5100              register int pp = *p++;
5101              if (pp == req_char || pp == req_char2) { p--; break; }
5102              }
5103            }
5104    
5105          /* If we can't find the required character, break the matching loop */
5106    
5107          if (p >= end_subject) break;
5108    
5109          /* If we have found the required character, save the point where we
5110          found it, so that we don't search again next time round the loop if
5111          the start hasn't passed this character yet. */
5112    
5113          req_char_ptr = p;
5114          }
5115        }
5116    
5117    /* When a match occurs, substrings will be set for all internal extractions;    /* When a match occurs, substrings will be set for all internal extractions;
5118    we just need to set up the whole thing as substring 0 before returning. If    we just need to set up the whole thing as substring 0 before returning. If
5119    there were too many extractions, set the return code to zero. In the case    there were too many extractions, set the return code to zero. In the case
# Line 4112  do Line 5121  do
5121    those back references that we can. In this case there need not be overflow    those back references that we can. In this case there need not be overflow
5122    if certain parts of the pattern were not used. */    if certain parts of the pattern were not used. */
5123    
5124    if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))    match_block.start_match = start_match;
5125      if (!match(start_match, re->code, 2, &match_block, ims, NULL, match_isgroup))
5126      continue;      continue;
5127    
5128    /* Copy the offset information from temporary store if necessary */    /* Copy the offset information from temporary store if necessary */

Legend:
Removed from v.25  
changed lines
  Added in v.53

  ViewVC Help
Powered by ViewVC 1.1.5