/[pcre]/code/trunk/pcre.c
ViewVC logotype

Diff of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 31 by nigel, Sat Feb 24 21:38:57 2007 UTC revision 51 by nigel, Sat Feb 24 21:39:37 2007 UTC
# Line 9  the file Tech.Notes for some information Line 9  the file Tech.Notes for some information
9    
10  Written by: Philip Hazel <ph10@cam.ac.uk>  Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12             Copyright (c) 1997-1999 University of Cambridge             Copyright (c) 1997-2000 University of Cambridge
13    
14  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
15  Permission is granted to anyone to use this software for any purpose on any  Permission is granted to anyone to use this software for any purpose on any
# Line 66  not be set greater than 200. */ Line 66  not be set greater than 200. */
66  #define BRASTACK_SIZE 200  #define BRASTACK_SIZE 200
67    
68    
69    /* The number of bytes in a literal character string above which we can't add
70    any more is different when UTF-8 characters may be encountered. */
71    
72    #ifdef SUPPORT_UTF8
73    #define MAXLIT 250
74    #else
75    #define MAXLIT 255
76    #endif
77    
78    
79  /* Min and max values for the common repeats; for the maxima, 0 => infinity */  /* Min and max values for the common repeats; for the maxima, 0 => infinity */
80    
81  static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };  static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
# Line 82  static const char *OP_names[] = { Line 92  static const char *OP_names[] = {
92    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
93    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
94    "*", "*?", "+", "+?", "?", "??", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{",
95    "class", "Ref",    "class", "Ref", "Recurse",
96    "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",    "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
97    "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",    "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
98    "Brazero", "Braminzero", "Bra"    "Brazero", "Braminzero", "Bra"
# Line 107  static const short int escapes[] = { Line 117  static const short int escapes[] = {
117      0,      0, -ESC_z                                            /* x - z */      0,      0, -ESC_z                                            /* x - z */
118  };  };
119    
/* Table of names of POSIX character classes. It is indexed in parallel with
posix_name_lengths[], whose zero-length entry marks the end of the list; this
table has no terminator of its own. The first three names must be alpha, lower,
upper (in that order), because caseless compilation maps any class index <= 2
to index 0 (alpha). Both the strings and the pointers to them are const, so
the whole table is read-only. */

static const char * const posix_names[] = {
  "alpha", "lower", "upper",
  "alnum", "ascii", "cntrl", "digit", "graph",
  "print", "punct", "space", "word",  "xdigit" };
128    
129    static const uschar posix_name_lengths[] = {
130      5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
131    
132    /* Table of class bit maps for each POSIX class; up to three may be combined
133    to form the class. */
134    
135    static const int posix_class_maps[] = {
136      cbit_lower, cbit_upper, -1,             /* alpha */
137      cbit_lower, -1,         -1,             /* lower */
138      cbit_upper, -1,         -1,             /* upper */
139      cbit_digit, cbit_lower, cbit_upper,     /* alnum */
140      cbit_print, cbit_cntrl, -1,             /* ascii */
141      cbit_cntrl, -1,         -1,             /* cntrl */
142      cbit_digit, -1,         -1,             /* digit */
143      cbit_graph, -1,         -1,             /* graph */
144      cbit_print, -1,         -1,             /* print */
145      cbit_punct, -1,         -1,             /* punct */
146      cbit_space, -1,         -1,             /* space */
147      cbit_word,  -1,         -1,             /* word */
148      cbit_xdigit,-1,         -1              /* xdigit */
149    };
150    
151    
152  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
153    
154  static BOOL  static BOOL
155    compile_regex(int, int, int *, uschar **, const uschar **, const char **,    compile_regex(int, int, int *, uschar **, const uschar **, const char **,
156      BOOL, int, compile_data *);      BOOL, int, int *, int *, compile_data *);
157    
158    /* Structure for building a chain of data that actually lives on the
159    stack, for holding the values of the subject pointer at the start of each
160    subpattern, so as to detect when an empty string has been matched by a
161    subpattern - to break infinite loops. */
162    
163    typedef struct eptrblock {
164      struct eptrblock *prev;
165      const uschar *saved_eptr;
166    } eptrblock;
167    
168    /* Flag bits for the match() function */
169    
170    #define match_condassert   0x01    /* Called to check a condition assertion */
171    #define match_isgroup      0x02    /* Set if start of bracketed group */
172    
173    
174    
# Line 129  void  (*pcre_free)(void *) = free; Line 186  void  (*pcre_free)(void *) = free;
186    
187    
188    
/*************************************************
*    Macros and tables for character handling    *
*************************************************/

/* When UTF-8 encoding is being used, a character is no longer just a single
byte. The macros for character handling generate simple sequences when used in
byte-mode, and more complicated ones for UTF-8 characters.

Points to note when using these macros:

  . They expand to bare statements, not expressions, and they mention their
    arguments more than once, so the arguments must be simple variables.
  . The UTF-8 versions refer to a variable called md (testing md->utf8) and to
    the tables utf8_table3 and utf8_table4, all of which must be visible at
    the point of use.
  . In byte mode GETCHARLEN does not touch len, and BACKCHAR does nothing. */

#ifndef SUPPORT_UTF8
#define GETCHARINC(c, eptr) c = *eptr++;
#define GETCHARLEN(c, eptr, len) c = *eptr;
#define BACKCHAR(eptr)

#else   /* SUPPORT_UTF8 */

/* Get the next UTF-8 character, advancing the pointer. The first byte of a
multibyte character holds the most significant bits of the value; each of the
following bytes supplies the next six bits, the last byte holding the least
significant six. The shift therefore starts at 6 times the number of
additional bytes and comes down by 6 for each byte that is read. */

#define GETCHARINC(c, eptr) \
  c = *eptr++; \
  if (md->utf8 && (c & 0xc0) == 0xc0) \
    { \
    int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int s = 6*a;                    /* Shift for bits of first byte */ \
    c = (c & utf8_table3[a]) << s;  /* High order bits from first byte */ \
    while (a-- > 0) \
      { \
      s -= 6; \
      c |= (*eptr++ & 0x3f) << s; \
      } \
    }

/* Get the next UTF-8 character, not advancing the pointer, setting length.
The value is assembled in the same way as for GETCHARINC; len is set to the
total number of bytes in the character. */

#define GETCHARLEN(c, eptr, len) \
  c = *eptr; \
  len = 1; \
  if (md->utf8 && (c & 0xc0) == 0xc0) \
    { \
    int i; \
    int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int s = 6*a;                    /* Shift for bits of first byte */ \
    c = (c & utf8_table3[a]) << s;  /* High order bits from first byte */ \
    for (i = 1; i <= a; i++) \
      { \
      s -= 6; \
      c |= (eptr[i] & 0x3f) << s; \
      } \
    len += a; \
    }

/* If the pointer is not at the start of a character, move it back until
it is. Bytes of the form 10xxxxxx are continuation bytes. */

#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;

#endif
245    
246    
247    
248  /*************************************************  /*************************************************
249  *             Default character tables           *  *             Default character tables           *
# Line 144  tables. */ Line 259  tables. */
259    
260    
261    
262    #ifdef SUPPORT_UTF8
263    /*************************************************
264    *           Tables for UTF-8 support             *
265    *************************************************/
266    
/* These are the breakpoints for different numbers of bytes in a UTF-8
character: entry n is the largest value that can be encoded using n
additional bytes. The tables are only ever read, so they are const. */

static const int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};

/* These are the indicator bits and the mask for the data bits to set in the
first byte of a character, indexed by the number of additional bytes. */

static const int utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
static const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
277    
278    /* Table of the number of extra characters, indexed by the first character
279    masked with 0x3f. The highest number for a valid UTF-8 character is in fact
280    0x3d. */
281    
282    static uschar utf8_table4[] = {
283      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
284      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
285      2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
286      3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
287    
288    
289    /*************************************************
290    *       Convert character value to UTF-8         *
291    *************************************************/
292    
293    /* This function takes an integer value in the range 0 - 0x7fffffff
294    and encodes it as a UTF-8 character in 0 to 6 bytes.
295    
296    Arguments:
297      cvalue     the character value
298      buffer     pointer to buffer for result - at least 6 bytes long
299    
300    Returns:     number of characters placed in the buffer
301    */
302    
303    static int
304    ord2utf8(int cvalue, uschar *buffer)
305    {
306    register int i, j;
307    for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
308      if (cvalue <= utf8_table1[i]) break;
309    *buffer++ = utf8_table2[i] | (cvalue & utf8_table3[i]);
310    cvalue >>= 6 - i;
311    for (j = 0; j < i; j++)
312      {
313      *buffer++ = 0x80 | (cvalue & 0x3f);
314      cvalue >>= 6;
315      }
316    return i + 1;
317    }
318    #endif
319    
320    
321    
322  /*************************************************  /*************************************************
323  *          Return version string                 *  *          Return version string                 *
324  *************************************************/  *************************************************/
325    
326    #define STRING(a)  # a
327    #define XSTRING(s) STRING(s)
328    
329  const char *  const char *
330  pcre_version(void)  pcre_version(void)
331  {  {
332  return PCRE_VERSION;  return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
333  }  }
334    
335    
336    
337    
338  /*************************************************  /*************************************************
339  *       Return info about a compiled pattern     *  * (Obsolete) Return info about compiled pattern  *
340  *************************************************/  *************************************************/
341    
342  /* This function picks potentially useful data out of the private  /* This is the original "info" function. It picks potentially useful data out
343  structure.  of the private structure, but its interface was too rigid. It remains for
344    backwards compatibility. The public options are passed back in an int - though
345    the re->options field has been expanded to a long int, all the public options
346    are at the low end of it, and so even on 16-bit systems this will still be OK.
347    Therefore, I haven't changed the API for pcre_info().
348    
349  Arguments:  Arguments:
350    external_re   points to compiled code    external_re   points to compiled code
# Line 171  Arguments: Line 353  Arguments:
353                  or -1 if multiline and all branches start ^,                  or -1 if multiline and all branches start ^,
354                  or -2 otherwise                  or -2 otherwise
355    
356  Returns:        number of identifying extraction brackets  Returns:        number of capturing subpatterns
357                  or negative values on error                  or negative values on error
358  */  */
359    
# Line 181  pcre_info(const pcre *external_re, int * Line 363  pcre_info(const pcre *external_re, int *
363  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
364  if (re == NULL) return PCRE_ERROR_NULL;  if (re == NULL) return PCRE_ERROR_NULL;
365  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
366  if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);  if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
367  if (first_char != NULL)  if (first_char != NULL)
368    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
369       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
# Line 190  return re->top_bracket; Line 372  return re->top_bracket;
372    
373    
374    
375    /*************************************************
376    *        Return info about compiled pattern      *
377    *************************************************/
378    
379    /* This is a newer "info" function which has an extensible interface so
380    that additional items can be added compatibly.
381    
382    Arguments:
383      external_re      points to compiled code
384      external_study   points to study data, or NULL
385      what             what information is required
386      where            where to put the information
387    
388    Returns:           0 if data returned, negative on error
389    */
390    
391    int
392    pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
393      void *where)
394    {
395    const real_pcre *re = (const real_pcre *)external_re;
396    const real_pcre_extra *study = (const real_pcre_extra *)study_data;
397    
398    if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
399    if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
400    
401    switch (what)
402      {
403      case PCRE_INFO_OPTIONS:
404      *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
405      break;
406    
407      case PCRE_INFO_SIZE:
408      *((size_t *)where) = re->size;
409      break;
410    
411      case PCRE_INFO_CAPTURECOUNT:
412      *((int *)where) = re->top_bracket;
413      break;
414    
415      case PCRE_INFO_BACKREFMAX:
416      *((int *)where) = re->top_backref;
417      break;
418    
419      case PCRE_INFO_FIRSTCHAR:
420      *((int *)where) =
421        ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
422        ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
423      break;
424    
425      case PCRE_INFO_FIRSTTABLE:
426      *((const uschar **)where) =
427        (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
428          study->start_bits : NULL;
429      break;
430    
431      case PCRE_INFO_LASTLITERAL:
432      *((int *)where) =
433        ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
434      break;
435    
436      default: return PCRE_ERROR_BADOPTION;
437      }
438    
439    return 0;
440    }
441    
442    
443    
444  #ifdef DEBUG  #ifdef DEBUG
445  /*************************************************  /*************************************************
# Line 227  while (length-- > 0) Line 477  while (length-- > 0)
477    
478  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
479  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
480  encodes one of the more complicated things such as \d. On entry, ptr is  encodes one of the more complicated things such as \d. When UTF-8 is enabled,
481  pointing at the \. On exit, it is on the final character of the escape  a positive value greater than 255 may be returned. On entry, ptr is pointing at
482  sequence.  the \. On exit, it is on the final character of the escape sequence.
483    
484  Arguments:  Arguments:
485    ptrptr     points to the pattern position pointer    ptrptr     points to the pattern position pointer
# Line 249  check_escape(const uschar **ptrptr, cons Line 499  check_escape(const uschar **ptrptr, cons
499    int options, BOOL isclass, compile_data *cd)    int options, BOOL isclass, compile_data *cd)
500  {  {
501  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
502  int c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */  int c, i;
 int i;  
503    
504    /* If backslash is at the end of the pattern, it's an error. */
505    
506    c = *(++ptr);
507  if (c == 0) *errorptr = ERR1;  if (c == 0) *errorptr = ERR1;
508    
509  /* Digits or letters may have special meaning; all others are literals. */  /* Digits or letters may have special meaning; all others are literals. */
# Line 311  else Line 563  else
563        }        }
564    
565      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
566      larger first octal digit */      larger first octal digit. */
567    
568      case '0':      case '0':
569      c -= '0';      c -= '0';
570      while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&      while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
571        ptr[1] != '8' && ptr[1] != '9')        ptr[1] != '8' && ptr[1] != '9')
572          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
573        c &= 255;     /* Take least significant 8 bits */
574      break;      break;
575    
576      /* Special escapes not starting with a digit are straightforward */      /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
577        which can be greater than 0xff, but only if the ddd are hex digits. */
578    
579      case 'x':      case 'x':
580    #ifdef SUPPORT_UTF8
581        if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
582          {
583          const uschar *pt = ptr + 2;
584          register int count = 0;
585          c = 0;
586          while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
587            {
588            count++;
589            c = c * 16 + cd->lcc[*pt] -
590              (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
591            pt++;
592            }
593          if (*pt == '}')
594            {
595            if (c < 0 || count > 8) *errorptr = ERR34;
596            ptr = pt;
597            break;
598            }
599          /* If the sequence of hex digits does not end with '}', then we don't
600          recognize this construct; fall through to the normal \x handling. */
601          }
602    #endif
603    
604        /* Read just a single hex char */
605    
606      c = 0;      c = 0;
607      while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
608        {        {
# Line 332  else Line 612  else
612        }        }
613      break;      break;
614    
615        /* Other special escapes not starting with a digit are straightforward */
616    
617      case 'c':      case 'c':
618      c = *(++ptr);      c = *(++ptr);
619      if (c == 0)      if (c == 0)
# Line 469  if the length is fixed. This is needed f Line 751  if the length is fixed. This is needed f
751    
752  Arguments:  Arguments:
753    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
754      options  the compiling options
755    
756  Returns:   the fixed length, or -1 if there is no fixed length  Returns:   the fixed length, or -1 if there is no fixed length
757  */  */
758    
759  static int  static int
760  find_fixedlength(uschar *code)  find_fixedlength(uschar *code, int options)
761  {  {
762  int length = -1;  int length = -1;
763    
# Line 495  for (;;) Line 778  for (;;)
778      case OP_BRA:      case OP_BRA:
779      case OP_ONCE:      case OP_ONCE:
780      case OP_COND:      case OP_COND:
781      d = find_fixedlength(cc);      d = find_fixedlength(cc, options);
782      if (d < 0) return -1;      if (d < 0) return -1;
783      branchlength += d;      branchlength += d;
784      do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);      do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
# Line 532  for (;;) Line 815  for (;;)
815    
816      case OP_REVERSE:      case OP_REVERSE:
817      cc++;      cc++;
818        /* Fall through */
819    
820      case OP_CREF:      case OP_CREF:
821      case OP_OPT:      case OP_OPT:
# Line 548  for (;;) Line 832  for (;;)
832      cc++;      cc++;
833      break;      break;
834    
835      /* Handle char strings */      /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
836        This requires a scan of the string, unfortunately. We assume valid UTF-8
837        strings, so all we do is reduce the length by one for each byte whose bits are
838        10xxxxxx. */
839    
840      case OP_CHARS:      case OP_CHARS:
841      branchlength += *(++cc);      branchlength += *(++cc);
842    #ifdef SUPPORT_UTF8
843        for (d = 1; d <= *cc; d++)
844          if ((cc[d] & 0xc0) == 0x80) branchlength--;
845    #endif
846      cc += *cc + 1;      cc += *cc + 1;
847      break;      break;
848    
# Line 615  for (;;) Line 906  for (;;)
906    
907    
908  /*************************************************  /*************************************************
909    *           Check for POSIX class syntax         *
910    *************************************************/
911    
912    /* This function is called when the sequence "[:" or "[." or "[=" is
913    encountered in a character class. It checks whether this is followed by an
914    optional ^ and then a sequence of letters, terminated by a matching ":]" or
915    ".]" or "=]".
916    
917    Argument:
918      ptr      pointer to the initial [
919      endptr   where to return the end pointer
920      cd       pointer to compile data
921    
922    Returns:   TRUE or FALSE
923    */
924    
925    static BOOL
926    check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
927    {
928    int terminator;          /* Don't combine these lines; the Solaris cc */
929    terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
930    if (*(++ptr) == '^') ptr++;
931    while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
932    if (*ptr == terminator && ptr[1] == ']')
933      {
934      *endptr = ptr;
935      return TRUE;
936      }
937    return FALSE;
938    }
939    
940    
941    
942    
943    /*************************************************
944    *          Check POSIX class name                *
945    *************************************************/
946    
947    /* This function is called to check the name given in a POSIX-style class entry
948    such as [:alnum:].
949    
950    Arguments:
951      ptr        points to the first letter
952      len        the length of the name
953    
954    Returns:     a value representing the name, or -1 if unknown
955    */
956    
957    static int
958    check_posix_name(const uschar *ptr, int len)
959    {
960    register int yield = 0;
961    while (posix_name_lengths[yield] != 0)
962      {
963      if (len == posix_name_lengths[yield] &&
964        strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
965      yield++;
966      }
967    return -1;
968    }
969    
970    
971    
972    
973    /*************************************************
974  *           Compile one branch                   *  *           Compile one branch                   *
975  *************************************************/  *************************************************/
976    
# Line 627  Arguments: Line 983  Arguments:
983    ptrptr       points to the current pattern pointer    ptrptr       points to the current pattern pointer
984    errorptr     points to pointer to error message    errorptr     points to pointer to error message
985    optchanged   set to the value of the last OP_OPT item compiled    optchanged   set to the value of the last OP_OPT item compiled
986      reqchar      set to the last literal character required, else -1
987      countlits    set to count of mandatory literal characters
988    cd           contains pointers to tables    cd           contains pointers to tables
989    
990  Returns:       TRUE on success  Returns:       TRUE on success
# Line 636  Returns:       TRUE on success Line 994  Returns:       TRUE on success
994  static BOOL  static BOOL
995  compile_branch(int options, int *brackets, uschar **codeptr,  compile_branch(int options, int *brackets, uschar **codeptr,
996    const uschar **ptrptr, const char **errorptr, int *optchanged,    const uschar **ptrptr, const char **errorptr, int *optchanged,
997    compile_data *cd)    int *reqchar, int *countlits, compile_data *cd)
998  {  {
999  int repeat_type, op_type;  int repeat_type, op_type;
1000  int repeat_min, repeat_max;  int repeat_min, repeat_max;
1001  int bravalue, length;  int bravalue, length;
1002  int greedy_default, greedy_non_default;  int greedy_default, greedy_non_default;
1003    int prevreqchar;
1004    int condcount = 0;
1005    int subcountlits = 0;
1006  register int c;  register int c;
1007  register uschar *code = *codeptr;  register uschar *code = *codeptr;
1008  uschar *tempcode;  uschar *tempcode;
# Line 655  uschar class[32]; Line 1016  uschar class[32];
1016  greedy_default = ((options & PCRE_UNGREEDY) != 0);  greedy_default = ((options & PCRE_UNGREEDY) != 0);
1017  greedy_non_default = greedy_default ^ 1;  greedy_non_default = greedy_default ^ 1;
1018    
1019    /* Initialize no required char, and count of literals */
1020    
1021    *reqchar = prevreqchar = -1;
1022    *countlits = 0;
1023    
1024  /* Switch on next character until the end of the branch */  /* Switch on next character until the end of the branch */
1025    
1026  for (;; ptr++)  for (;; ptr++)
# Line 664  for (;; ptr++) Line 1030  for (;; ptr++)
1030    int class_lastchar;    int class_lastchar;
1031    int newoptions;    int newoptions;
1032    int condref;    int condref;
1033      int subreqchar;
1034    
1035    c = *ptr;    c = *ptr;
1036    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
# Line 671  for (;; ptr++) Line 1038  for (;; ptr++)
1038      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
1039      if (c == '#')      if (c == '#')
1040        {        {
1041        while ((c = *(++ptr)) != 0 && c != '\n');        /* The space before the ; is to avoid a warning on a silly compiler
1042          on the Macintosh. */
1043          while ((c = *(++ptr)) != 0 && c != '\n') ;
1044        continue;        continue;
1045        }        }
1046      }      }
# Line 746  for (;; ptr++) Line 1115  for (;; ptr++)
1115          goto FAILED;          goto FAILED;
1116          }          }
1117    
1118          /* Handle POSIX class names. Perl allows a negation extension of the
1119          form [:^name]. A square bracket that doesn't match the syntax is
1120          treated as a literal. We also recognize the POSIX constructions
1121          [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1122          5.6 does. */
1123    
1124          if (c == '[' &&
1125              (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1126              check_posix_syntax(ptr, &tempptr, cd))
1127            {
1128            BOOL local_negate = FALSE;
1129            int posix_class, i;
1130            register const uschar *cbits = cd->cbits;
1131    
1132            if (ptr[1] != ':')
1133              {
1134              *errorptr = ERR31;
1135              goto FAILED;
1136              }
1137    
1138            ptr += 2;
1139            if (*ptr == '^')
1140              {
1141              local_negate = TRUE;
1142              ptr++;
1143              }
1144    
1145            posix_class = check_posix_name(ptr, tempptr - ptr);
1146            if (posix_class < 0)
1147              {
1148              *errorptr = ERR30;
1149              goto FAILED;
1150              }
1151    
1152            /* If matching is caseless, upper and lower are converted to
1153            alpha. This relies on the fact that the class table starts with
1154            alpha, lower, upper as the first 3 entries. */
1155    
1156            if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1157              posix_class = 0;
1158    
1159            /* Or into the map we are building up to 3 of the static class
1160            tables, or their negations. */
1161    
1162            posix_class *= 3;
1163            for (i = 0; i < 3; i++)
1164              {
1165              int taboffset = posix_class_maps[posix_class + i];
1166              if (taboffset < 0) break;
1167              if (local_negate)
1168                for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1169              else
1170                for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1171              }
1172    
1173            ptr = tempptr + 1;
1174            class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
1175            continue;
1176            }
1177    
1178        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
1179        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. Escaped items are checked for
1180        validity in the pre-compiling pass. The sequence \b is a special case.        validity in the pre-compiling pass. The sequence \b is a special case.
# Line 773  for (;; ptr++) Line 1202  for (;; ptr++)
1202              continue;              continue;
1203    
1204              case ESC_w:              case ESC_w:
1205              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
               class[c] |= (cbits[c+cbit_digit] | cbits[c+cbit_word]);  
1206              continue;              continue;
1207    
1208              case ESC_W:              case ESC_W:
1209              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
               class[c] |= ~(cbits[c+cbit_digit] | cbits[c+cbit_word]);  
1210              continue;              continue;
1211    
1212              case ESC_s:              case ESC_s:
# Line 795  for (;; ptr++) Line 1222  for (;; ptr++)
1222              goto FAILED;              goto FAILED;
1223              }              }
1224            }            }
1225          /* Fall through if single character */  
1226            /* Fall through if single character, but don't at present allow
1227            chars > 255 in UTF-8 mode. */
1228    
1229    #ifdef SUPPORT_UTF8
1230            if (c > 255)
1231              {
1232              *errorptr = ERR33;
1233              goto FAILED;
1234              }
1235    #endif
1236          }          }
1237    
1238        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
# Line 815  for (;; ptr++) Line 1252  for (;; ptr++)
1252            }            }
1253    
1254          /* The second part of a range can be a single-character escape, but          /* The second part of a range can be a single-character escape, but
1255          not any of the other escapes. */          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1256            in such circumstances. */
1257    
1258          if (d == '\\')          if (d == '\\')
1259            {            {
1260              const uschar *oldptr = ptr;
1261            d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);            d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1262    
1263    #ifdef SUPPORT_UTF8
1264              if (d > 255)
1265                {
1266                *errorptr = ERR33;
1267                goto FAILED;
1268                }
1269    #endif
1270              /* \b is backspace; any other special means the '-' was literal */
1271    
1272            if (d < 0)            if (d < 0)
1273              {              {
1274              if (d == -ESC_b) d = '\b'; else              if (d == -ESC_b) d = '\b'; else
1275                {                {
1276                *errorptr = ERR7;                ptr = oldptr - 2;
1277                goto FAILED;                goto SINGLE_CHARACTER;  /* A few lines below */
1278                }                }
1279              }              }
1280            }            }
# Line 853  for (;; ptr++) Line 1302  for (;; ptr++)
1302        /* Handle a lone single character - we can get here for a normal        /* Handle a lone single character - we can get here for a normal
1303        non-escape char, or after \ that introduces a single character. */        non-escape char, or after \ that introduces a single character. */
1304    
1305          SINGLE_CHARACTER:
1306    
1307        class [c/8] |= (1 << (c&7));        class [c/8] |= (1 << (c&7));
1308        if ((options & PCRE_CASELESS) != 0)        if ((options & PCRE_CASELESS) != 0)
1309          {          {
# Line 937  for (;; ptr++) Line 1388  for (;; ptr++)
1388        { repeat_type = greedy_non_default; ptr++; }        { repeat_type = greedy_non_default; ptr++; }
1389      else repeat_type = greedy_default;      else repeat_type = greedy_default;
1390    
     /* If the maximum is zero then the minimum must also be zero; Perl allows  
     this case, so we do too - by simply omitting the item altogether. */  
   
     if (repeat_max == 0) code = previous;  
   
1391      /* If previous was a string of characters, chop off the last one and use it      /* If previous was a string of characters, chop off the last one and use it
1392      as the subject of the repeat. If there was only one character, we can      as the subject of the repeat. If there was only one character, we can
1393      abolish the previous item altogether. */      abolish the previous item altogether. A repeat with a zero minimum wipes
1394        out any reqchar setting, backing up to the previous value. We must also
1395        adjust the countlits value. */
1396    
1397      else if (*previous == OP_CHARS)      if (*previous == OP_CHARS)
1398        {        {
1399        int len = previous[1];        int len = previous[1];
1400    
1401          if (repeat_min == 0) *reqchar = prevreqchar;
1402          *countlits += repeat_min - 1;
1403    
1404        if (len == 1)        if (len == 1)
1405          {          {
1406          c = previous[2];          c = previous[2];
# Line 987  for (;; ptr++) Line 1439  for (;; ptr++)
1439        code = previous;        code = previous;
1440    
1441        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
1442        repeat_type += op_type;      /* Combine both values for many cases */  
1443          /* If the maximum is zero then the minimum must also be zero; Perl allows
1444          this case, so we do too - by simply omitting the item altogether. */
1445    
1446          if (repeat_max == 0) goto END_REPEAT;
1447    
1448          /* Combine the op_type with the repeat_type */
1449    
1450          repeat_type += op_type;
1451    
1452        /* A minimum of zero is handled either as the special case * or ?, or as        /* A minimum of zero is handled either as the special case * or ?, or as
1453        an UPTO, with the maximum given. */        an UPTO, with the maximum given. */
# Line 1064  for (;; ptr++) Line 1524  for (;; ptr++)
1524        }        }
1525    
1526      /* If previous was a character class or a back reference, we put the repeat      /* If previous was a character class or a back reference, we put the repeat
1527      stuff after it. */      stuff after it, but just skip the item if the repeat was {0,0}. */
1528    
1529      else if (*previous == OP_CLASS || *previous == OP_REF)      else if (*previous == OP_CLASS || *previous == OP_REF)
1530        {        {
1531          if (repeat_max == 0)
1532            {
1533            code = previous;
1534            goto END_REPEAT;
1535            }
1536        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
1537          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
1538        else if (repeat_min == 1 && repeat_max == -1)        else if (repeat_min == 1 && repeat_max == -1)
# Line 1118  for (;; ptr++) Line 1583  for (;; ptr++)
1583    
1584        if (repeat_min == 0)        if (repeat_min == 0)
1585          {          {
1586            /* If we set up a required char from the bracket, we must back off
1587            to the previous value and reset the countlits value too. */
1588    
1589            if (subcountlits > 0)
1590              {
1591              *reqchar = prevreqchar;
1592              *countlits -= subcountlits;
1593              }
1594    
1595          /* If the maximum is also zero, we just omit the group from the output          /* If the maximum is also zero, we just omit the group from the output
1596          altogether. */          altogether. */
1597    
1598          if (repeat_max == 0)          if (repeat_max == 0)
1599            {            {
1600            code = previous;            code = previous;
1601            previous = NULL;            goto END_REPEAT;
           break;  
1602            }            }
1603    
1604          /* If the maximum is 1 or unlimited, we just have to stick in the          /* If the maximum is 1 or unlimited, we just have to stick in the
# Line 1230  for (;; ptr++) Line 1703  for (;; ptr++)
1703        correct offset was computed above. */        correct offset was computed above. */
1704    
1705        else code[-ketoffset] = OP_KETRMAX + repeat_type;        else code[-ketoffset] = OP_KETRMAX + repeat_type;
   
   
 #ifdef NEVER  
       /* If the minimum is greater than zero, and the maximum is unlimited or  
       equal to the minimum, the first copy remains where it is, and is  
       replicated up to the minimum number of times. This case includes the +  
       repeat, but of course no replication is needed in that case. */  
   
       if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))  
         {  
         for (i = 1; i < repeat_min; i++)  
           {  
           memcpy(code, previous, len);  
           code += len;  
           }  
         }  
   
       /* If the minimum is zero, stick BRAZERO in front of the first copy.  
       Then, if there is a fixed upper limit, replicated up to that many times,  
       sticking BRAZERO in front of all the optional ones. */  
   
       else  
         {  
         if (repeat_min == 0)  
           {  
           memmove(previous+1, previous, len);  
           code++;  
           *previous++ = OP_BRAZERO + repeat_type;  
           }  
   
         for (i = 1; i < repeat_min; i++)  
           {  
           memcpy(code, previous, len);  
           code += len;  
           }  
   
         for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)  
           {  
           *code++ = OP_BRAZERO + repeat_type;  
           memcpy(code, previous, len);  
           code += len;  
           }  
         }  
   
       /* If the maximum is unlimited, set a repeater in the final copy. We  
       can't just offset backwards from the current code point, because we  
       don't know if there's been an options resetting after the ket. The  
       correct offset was computed above. */  
   
       if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type;  
 #endif  
   
   
1706        }        }
1707    
1708      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 1295  for (;; ptr++) Line 1715  for (;; ptr++)
1715    
1716      /* In all cases we no longer have a previous item. */      /* In all cases we no longer have a previous item. */
1717    
1718        END_REPEAT:
1719      previous = NULL;      previous = NULL;
1720      break;      break;
1721    
# Line 1333  for (;; ptr++) Line 1754  for (;; ptr++)
1754            {            {
1755            condref = *ptr - '0';            condref = *ptr - '0';
1756            while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';            while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1757              if (condref == 0)
1758                {
1759                *errorptr = ERR35;
1760                goto FAILED;
1761                }
1762            ptr++;            ptr++;
1763            }            }
1764          else ptr--;          else ptr--;
# Line 1372  for (;; ptr++) Line 1798  for (;; ptr++)
1798          ptr++;          ptr++;
1799          break;          break;
1800    
1801            case 'R':                 /* Pattern recursion */
1802            *code++ = OP_RECURSE;
1803            ptr++;
1804            continue;
1805    
1806          default:                  /* Option setting */          default:                  /* Option setting */
1807          set = unset = 0;          set = unset = 0;
1808          optset = &set;          optset = &set;
# Line 1463  for (;; ptr++) Line 1894  for (;; ptr++)
1894           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
1895            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1896           condref,                      /* Condition reference number */           condref,                      /* Condition reference number */
1897             &subreqchar,                  /* For possible last char */
1898             &subcountlits,                /* For literal count */
1899           cd))                          /* Tables block */           cd))                          /* Tables block */
1900        goto FAILED;        goto FAILED;
1901    
# Line 1476  for (;; ptr++) Line 1909  for (;; ptr++)
1909    
1910      if (bravalue == OP_COND)      if (bravalue == OP_COND)
1911        {        {
       int branchcount = 0;  
1912        uschar *tc = code;        uschar *tc = code;
1913          condcount = 0;
1914    
1915        do {        do {
1916           branchcount++;           condcount++;
1917           tc += (tc[1] << 8) | tc[2];           tc += (tc[1] << 8) | tc[2];
1918           }           }
1919        while (*tc != OP_KET);        while (*tc != OP_KET);
1920    
1921        if (branchcount > 2)        if (condcount > 2)
1922          {          {
1923          *errorptr = ERR27;          *errorptr = ERR27;
1924          goto FAILED;          goto FAILED;
1925          }          }
1926        }        }
1927    
1928        /* Handle updating of the required character. If the subpattern didn't
1929        set one, leave it as it was. Otherwise, update it for normal brackets of
1930        all kinds, forward assertions, and conditions with two branches. Don't
1931        update the literal count for forward assertions, however. If the bracket
1932        is followed by a quantifier with zero repeat, we have to back off. Hence
1933        the definition of prevreqchar and subcountlits outside the main loop so
1934        that they can be accessed for the back off. */
1935    
1936        if (subreqchar > 0 &&
1937             (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1938             (bravalue == OP_COND && condcount == 2)))
1939          {
1940          prevreqchar = *reqchar;
1941          *reqchar = subreqchar;
1942          if (bravalue != OP_ASSERT) *countlits += subcountlits;
1943          }
1944    
1945      /* Now update the main code pointer to the end of the group. */      /* Now update the main code pointer to the end of the group. */
1946    
1947      code = tempcode;      code = tempcode;
# Line 1559  for (;; ptr++) Line 2009  for (;; ptr++)
2009          if ((cd->ctypes[c] & ctype_space) != 0) continue;          if ((cd->ctypes[c] & ctype_space) != 0) continue;
2010          if (c == '#')          if (c == '#')
2011            {            {
2012            while ((c = *(++ptr)) != 0 && c != '\n');            /* The space before the ; is to avoid a warning on a silly compiler
2013              on the Macintosh. */
2014              while ((c = *(++ptr)) != 0 && c != '\n') ;
2015            if (c == 0) break;            if (c == 0) break;
2016            continue;            continue;
2017            }            }
# Line 1574  for (;; ptr++) Line 2026  for (;; ptr++)
2026          tempptr = ptr;          tempptr = ptr;
2027          c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);          c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
2028          if (c < 0) { ptr = tempptr; break; }          if (c < 0) { ptr = tempptr; break; }
2029    
2030            /* If a character is > 127 in UTF-8 mode, we have to turn it into
2031            two or more characters in the UTF-8 encoding. */
2032    
2033    #ifdef SUPPORT_UTF8
2034            if (c > 127 && (options & PCRE_UTF8) != 0)
2035              {
2036              uschar buffer[8];
2037              int len = ord2utf8(c, buffer);
2038              for (c = 0; c < len; c++) *code++ = buffer[c];
2039              length += len;
2040              continue;
2041              }
2042    #endif
2043          }          }
2044    
2045        /* Ordinary character or single-char escape */        /* Ordinary character or single-char escape */
# Line 1584  for (;; ptr++) Line 2050  for (;; ptr++)
2050    
2051      /* This "while" is the end of the "do" above. */      /* This "while" is the end of the "do" above. */
2052    
2053      while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);      while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
2054    
2055        /* Update the last character and the count of literals */
2056    
2057        prevreqchar = (length > 1)? code[-2] : *reqchar;
2058        *reqchar = code[-1];
2059        *countlits += length;
2060    
2061      /* Compute the length and set it in the data vector, and advance to      /* Compute the length and set it in the data vector, and advance to
2062      the next state. */      the next state. */
2063    
2064      previous[1] = length;      previous[1] = length;
2065      if (length < 255) ptr--;      if (length < MAXLIT) ptr--;
2066      break;      break;
2067      }      }
2068    }                   /* end of big loop */    }                   /* end of big loop */
# Line 1628  Argument: Line 2100  Argument:
2100    ptrptr      -> the address of the current pattern pointer    ptrptr      -> the address of the current pattern pointer
2101    errorptr    -> pointer to error message    errorptr    -> pointer to error message
2102    lookbehind  TRUE if this is a lookbehind assertion    lookbehind  TRUE if this is a lookbehind assertion
2103    condref     > 0 for OP_CREF setting at start of conditional group    condref     >= 0 for OP_CREF setting at start of conditional group
2104      reqchar     -> place to put the last required character, or a negative number
2105      countlits   -> place to put the shortest literal count of any branch
2106    cd          points to the data block with tables pointers    cd          points to the data block with tables pointers
2107    
2108  Returns:      TRUE on success  Returns:      TRUE on success
# Line 1637  Returns:      TRUE on success Line 2111  Returns:      TRUE on success
2111  static BOOL  static BOOL
2112  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
2113    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
2114    compile_data *cd)    int *reqchar, int *countlits, compile_data *cd)
2115  {  {
2116  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
2117  uschar *code = *codeptr;  uschar *code = *codeptr;
# Line 1645  uschar *last_branch = code; Line 2119  uschar *last_branch = code;
2119  uschar *start_bracket = code;  uschar *start_bracket = code;
2120  uschar *reverse_count = NULL;  uschar *reverse_count = NULL;
2121  int oldoptions = options & PCRE_IMS;  int oldoptions = options & PCRE_IMS;
2122    int branchreqchar, branchcountlits;
2123    
2124    *reqchar = -1;
2125    *countlits = INT_MAX;
2126  code += 3;  code += 3;
2127    
2128  /* At the start of a reference-based conditional group, insert the reference  /* At the start of a reference-based conditional group, insert the reference
2129  number as an OP_CREF item. */  number as an OP_CREF item. */
2130    
2131  if (condref > 0)  if (condref >= 0)
2132    {    {
2133    *code++ = OP_CREF;    *code++ = OP_CREF;
2134    *code++ = condref;    *code++ = condref;
# Line 1684  for (;;) Line 2161  for (;;)
2161    
2162    /* Now compile the branch */    /* Now compile the branch */
2163    
2164    if (!compile_branch(options,brackets,&code,&ptr,errorptr,&optchanged,cd))    if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
2165          &branchreqchar, &branchcountlits, cd))
2166      {      {
2167      *ptrptr = ptr;      *ptrptr = ptr;
2168      return FALSE;      return FALSE;
# Line 1696  for (;;) Line 2174  for (;;)
2174    last_branch[1] = length >> 8;    last_branch[1] = length >> 8;
2175    last_branch[2] = length & 255;    last_branch[2] = length & 255;
2176    
2177      /* Save the last required character if all branches have the same; a current
2178      value of -1 means unset, while -2 means "previous branch had no last required
2179      char".  */
2180    
2181      if (*reqchar != -2)
2182        {
2183        if (branchreqchar >= 0)
2184          {
2185          if (*reqchar == -1) *reqchar = branchreqchar;
2186          else if (*reqchar != branchreqchar) *reqchar = -2;
2187          }
2188        else *reqchar = -2;
2189        }
2190    
2191      /* Keep the shortest literal count */
2192    
2193      if (branchcountlits < *countlits) *countlits = branchcountlits;
2194      DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
2195    
2196    /* If lookbehind, check that this branch matches a fixed-length string,    /* If lookbehind, check that this branch matches a fixed-length string,
2197    and put the length into the OP_REVERSE item. Temporarily mark the end of    and put the length into the OP_REVERSE item. Temporarily mark the end of
2198    the branch with OP_END. */    the branch with OP_END. */
# Line 1703  for (;;) Line 2200  for (;;)
2200    if (lookbehind)    if (lookbehind)
2201      {      {
2202      *code = OP_END;      *code = OP_END;
2203      length = find_fixedlength(last_branch);      length = find_fixedlength(last_branch, options);
2204      DPRINTF(("fixed length = %d\n", length));      DPRINTF(("fixed length = %d\n", length));
2205      if (length < 0)      if (length < 0)
2206        {        {
# Line 1790  for (;;) Line 2287  for (;;)
2287      code += 2;      code += 2;
2288      break;      break;
2289    
2290        case OP_WORD_BOUNDARY:
2291        case OP_NOT_WORD_BOUNDARY:
2292        code++;
2293        break;
2294    
2295      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
2296      case OP_ASSERTBACK:      case OP_ASSERTBACK:
2297      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
# Line 1817  all of whose alternatives start with OP_ Line 2319  all of whose alternatives start with OP_
2319  it's anchored. However, if this is a multiline pattern, then only OP_SOD  it's anchored. However, if this is a multiline pattern, then only OP_SOD
2320  counts, since OP_CIRC can match in the middle.  counts, since OP_CIRC can match in the middle.
2321    
2322  A branch is also implicitly anchored if it starts with .* because that will try  A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2323  the rest of the pattern at all possible matching points, so there is no point  because that will try the rest of the pattern at all possible matching points,
2324  trying them again.  so there is no point trying them again.
2325    
2326  Arguments:  Arguments:
2327    code       points to start of expression (the bracket)    code       points to start of expression (the bracket)
# Line 1837  do { Line 2339  do {
2339     register int op = *scode;     register int op = *scode;
2340     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2341       { if (!is_anchored(scode, options)) return FALSE; }       { if (!is_anchored(scode, options)) return FALSE; }
2342     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
2343                (*options & PCRE_DOTALL) != 0)
2344       { if (scode[1] != OP_ANY) return FALSE; }       { if (scode[1] != OP_ANY) return FALSE; }
2345     else if (op != OP_SOD &&     else if (op != OP_SOD &&
2346             ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))             ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
# Line 1851  return TRUE; Line 2354  return TRUE;
2354    
2355    
2356  /*************************************************  /*************************************************
2357  *     Check for start with \n line expression    *  *         Check for starting with ^ or .*        *
2358  *************************************************/  *************************************************/
2359    
2360  /* This is called for multiline expressions to try to find out if every branch  /* This is called to find out if every branch starts with ^ or .* so that
2361  starts with ^ so that "first char" processing can be done to speed things up.  "first char" processing can be done to speed things up in multiline
2362    matching and for non-DOTALL patterns that start with .* (which must start at
2363    the beginning or after \n).
2364    
2365  Argument:  points to start of expression (the bracket)  Argument:  points to start of expression (the bracket)
2366  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
# Line 1869  do { Line 2374  do {
2374     register int op = *scode;     register int op = *scode;
2375     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2376       { if (!is_startline(scode)) return FALSE; }       { if (!is_startline(scode)) return FALSE; }
2377       else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2378         { if (scode[1] != OP_ANY) return FALSE; }
2379     else if (op != OP_CIRC) return FALSE;     else if (op != OP_CIRC) return FALSE;
2380     code += (code[1] << 8) + code[2];     code += (code[1] << 8) + code[2];
2381     }     }
# Line 1967  pcre_compile(const char *pattern, int op Line 2474  pcre_compile(const char *pattern, int op
2474  real_pcre *re;  real_pcre *re;
2475  int length = 3;      /* For initial BRA plus length */  int length = 3;      /* For initial BRA plus length */
2476  int runlength;  int runlength;
2477  int c, size;  int c, reqchar, countlits;
2478  int bracount = 0;  int bracount = 0;
2479  int top_backref = 0;  int top_backref = 0;
2480  int branch_extra = 0;  int branch_extra = 0;
2481  int branch_newextra;  int branch_newextra;
2482  unsigned int brastackptr = 0;  unsigned int brastackptr = 0;
2483    size_t size;
2484  uschar *code;  uschar *code;
2485  const uschar *ptr;  const uschar *ptr;
2486  compile_data compile_block;  compile_data compile_block;
# Line 1983  uschar bralenstack[BRASTACK_SIZE]; Line 2491  uschar bralenstack[BRASTACK_SIZE];
2491  uschar *code_base, *code_end;  uschar *code_base, *code_end;
2492  #endif  #endif
2493    
2494    /* Can't support UTF8 unless PCRE has been compiled to include the code. */
2495    
2496    #ifndef SUPPORT_UTF8
2497    if ((options & PCRE_UTF8) != 0)
2498      {
2499      *errorptr = ERR32;
2500      return NULL;
2501      }
2502    #endif
2503    
2504  /* We can't pass back an error message if errorptr is NULL; I guess the best we  /* We can't pass back an error message if errorptr is NULL; I guess the best we
2505  can do is just return NULL. */  can do is just return NULL. */
2506    
# Line 2035  while ((c = *(++ptr)) != 0) Line 2553  while ((c = *(++ptr)) != 0)
2553      if ((compile_block.ctypes[c] & ctype_space) != 0) continue;      if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2554      if (c == '#')      if (c == '#')
2555        {        {
2556        while ((c = *(++ptr)) != 0 && c != '\n');        /* The space before the ; is to avoid a warning on a silly compiler
2557          on the Macintosh. */
2558          while ((c = *(++ptr)) != 0 && c != '\n') ;
2559        continue;        continue;
2560        }        }
2561      }      }
# Line 2200  while ((c = *(++ptr)) != 0) Line 2720  while ((c = *(++ptr)) != 0)
2720          ptr += 2;          ptr += 2;
2721          break;          break;
2722    
2723            /* A recursive call to the regex is an extension, to provide the
2724            facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2725    
2726            case 'R':
2727            if (ptr[3] != ')')
2728              {
2729              *errorptr = ERR29;
2730              goto PCRE_ERROR_RETURN;
2731              }
2732            ptr += 3;
2733            length += 1;
2734            break;
2735    
2736          /* Lookbehinds are in Perl from version 5.005 */          /* Lookbehinds are in Perl from version 5.005 */
2737    
2738          case '<':          case '<':
# Line 2232  while ((c = *(++ptr)) != 0) Line 2765  while ((c = *(++ptr)) != 0)
2765          else   /* An assertion must follow */          else   /* An assertion must follow */
2766            {            {
2767            ptr++;   /* Can treat like ':' as far as spacing is concerned */            ptr++;   /* Can treat like ':' as far as spacing is concerned */
2768              if (ptr[2] != '?' ||
2769            if (ptr[2] != '?' || strchr("=!<", ptr[3]) == NULL)               (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
2770              {              {
2771              ptr += 2;    /* To get right offset in message */              ptr += 2;    /* To get right offset in message */
2772              *errorptr = ERR28;              *errorptr = ERR28;
# Line 2307  while ((c = *(++ptr)) != 0) Line 2840  while ((c = *(++ptr)) != 0)
2840              will lead to an over-estimate on the length, but this shouldn't              will lead to an over-estimate on the length, but this shouldn't
2841              matter very much. We also have to allow for resetting options at              matter very much. We also have to allow for resetting options at
2842              the start of any alternations, which we do by setting              the start of any alternations, which we do by setting
2843              branch_newextra to 2. */              branch_newextra to 2. Finally, we record whether the case-dependent
2844                flag ever changes within the regex. This is used by the "required
2845                character" code. */
2846    
2847              case ':':              case ':':
2848              if (((set|unset) & PCRE_IMS) != 0)              if (((set|unset) & PCRE_IMS) != 0)
2849                {                {
2850                length += 4;                length += 4;
2851                branch_newextra = 2;                branch_newextra = 2;
2852                  if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2853                }                }
2854              goto END_OPTIONS;              goto END_OPTIONS;
2855    
# Line 2443  while ((c = *(++ptr)) != 0) Line 2979  while ((c = *(++ptr)) != 0)
2979          if ((compile_block.ctypes[c] & ctype_space) != 0) continue;          if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2980          if (c == '#')          if (c == '#')
2981            {            {
2982            while ((c = *(++ptr)) != 0 && c != '\n');            /* The space before the ; is to avoid a warning on a silly compiler
2983              on the Macintosh. */
2984              while ((c = *(++ptr)) != 0 && c != '\n') ;
2985            continue;            continue;
2986            }            }
2987          }          }
# Line 2458  while ((c = *(++ptr)) != 0) Line 2996  while ((c = *(++ptr)) != 0)
2996            &compile_block);            &compile_block);
2997          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2998          if (c < 0) { ptr = saveptr; break; }          if (c < 0) { ptr = saveptr; break; }
2999    
3000    #ifdef SUPPORT_UTF8
3001            if (c > 127 && (options & PCRE_UTF8) != 0)
3002              {
3003              int i;
3004              for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
3005                if (c <= utf8_table1[i]) break;
3006              runlength += i;
3007              }
3008    #endif
3009          }          }
3010    
3011        /* Ordinary character or single-char escape */        /* Ordinary character or single-char escape */
# Line 2467  while ((c = *(++ptr)) != 0) Line 3015  while ((c = *(++ptr)) != 0)
3015    
3016      /* This "while" is the end of the "do" above. */      /* This "while" is the end of the "do" above. */
3017    
3018      while (runlength < 255 &&      while (runlength < MAXLIT &&
3019        (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);        (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
3020    
3021      ptr--;      ptr--;
# Line 2499  if (re == NULL) Line 3047  if (re == NULL)
3047    return NULL;    return NULL;
3048    }    }
3049    
3050  /* Put in the magic number and the options. */  /* Put in the magic number, and save the size, options, and table pointer */
3051    
3052  re->magic_number = MAGIC_NUMBER;  re->magic_number = MAGIC_NUMBER;
3053    re->size = size;
3054  re->options = options;  re->options = options;
3055  re->tables = tables;  re->tables = tables;
3056    
# Line 2514  code = re->code; Line 3063  code = re->code;
3063  *code = OP_BRA;  *code = OP_BRA;
3064  bracount = 0;  bracount = 0;
3065  (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,  (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
3066    &compile_block);    &reqchar, &countlits, &compile_block);
3067  re->top_bracket = bracount;  re->top_bracket = bracount;
3068  re->top_backref = top_backref;  re->top_backref = top_backref;
3069    
# Line 2546  if (*errorptr != NULL) Line 3095  if (*errorptr != NULL)
3095    return NULL;    return NULL;
3096    }    }
3097    
3098  /* If the anchored option was not passed, set flag if we can determine that it  /* If the anchored option was not passed, set flag if we can determine that the
3099  is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if  pattern is anchored by virtue of ^ characters or \A or anything else (such as
3100  we can determine what the first character has to be, because that speeds up  starting with .* when DOTALL is set).
3101  unanchored matches no end. In the case of multiline matches, an alternative is  
3102  to set the PCRE_STARTLINE flag if all branches start with ^. */  Otherwise, see if we can determine what the first character has to be, because
3103    that speeds up unanchored matches no end. If not, see if we can set the
3104    PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
3105    start with ^, and also when all branches start with .* for non-DOTALL matches.
3106    */
3107    
3108  if ((options & PCRE_ANCHORED) == 0)  if ((options & PCRE_ANCHORED) == 0)
3109    {    {
# Line 2570  if ((options & PCRE_ANCHORED) == 0) Line 3123  if ((options & PCRE_ANCHORED) == 0)
3123      }      }
3124    }    }
3125    
3126    /* Save the last required character if there are at least two literal
3127    characters on all paths, or if there is no first character setting. */
3128    
3129    if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
3130      {
3131      re->req_char = reqchar;
3132      re->options |= PCRE_REQCHSET;
3133      }
3134    
3135  /* Print out the compiled data for debugging */  /* Print out the compiled data for debugging */
3136    
3137  #ifdef DEBUG  #ifdef DEBUG
# Line 2579  printf("Length = %d top_bracket = %d top Line 3141  printf("Length = %d top_bracket = %d top
3141    
3142  if (re->options != 0)  if (re->options != 0)
3143    {    {
3144    printf("%s%s%s%s%s%s%s%s\n",    printf("%s%s%s%s%s%s%s%s%s\n",
3145      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
3146      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
3147        ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
3148      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
3149      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
3150      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
# Line 2596  if ((re->options & PCRE_FIRSTSET) != 0) Line 3159  if ((re->options & PCRE_FIRSTSET) != 0)
3159      else printf("First char = \\x%02x\n", re->first_char);      else printf("First char = \\x%02x\n", re->first_char);
3160    }    }
3161    
3162    if ((re->options & PCRE_REQCHSET) != 0)
3163      {
3164      if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
3165        else printf("Req char = \\x%02x\n", re->req_char);
3166      }
3167    
3168  code_end = code;  code_end = code;
3169  code_base = code = re->code;  code_base = code = re->code;
3170    
# Line 2829  Returns:      TRUE if matched Line 3398  Returns:      TRUE if matched
3398    
3399  static BOOL  static BOOL
3400  match_ref(int offset, register const uschar *eptr, int length, match_data *md,  match_ref(int offset, register const uschar *eptr, int length, match_data *md,
3401    int ims)    unsigned long int ims)
3402  {  {
3403  const uschar *p = md->start_subject + md->offset_vector[offset];  const uschar *p = md->start_subject + md->offset_vector[offset];
3404    
# Line 2880  Arguments: Line 3449  Arguments:
3449     offset_top  current top pointer     offset_top  current top pointer
3450     md          pointer to "static" info for the match     md          pointer to "static" info for the match
3451     ims         current /i, /m, and /s options     ims         current /i, /m, and /s options
3452     condassert  TRUE if called to check a condition assertion     eptrb       pointer to chain of blocks containing eptr at start of
3453     eptrb       eptr at start of last bracket                   brackets - for testing for empty matches
3454       flags       can contain
3455                     match_condassert - this is an assertion condition
3456                     match_isgroup - this is the start of a bracketed group
3457    
3458  Returns:       TRUE if matched  Returns:       TRUE if matched
3459  */  */
3460    
3461  static BOOL  static BOOL
3462  match(register const uschar *eptr, register const uschar *ecode,  match(register const uschar *eptr, register const uschar *ecode,
3463    int offset_top, match_data *md, int ims, BOOL condassert, const uschar *eptrb)    int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
3464      int flags)
3465  {  {
3466  int original_ims = ims;   /* Save for resetting on ')' */  unsigned long int original_ims = ims;   /* Save for resetting on ')' */
3467    eptrblock newptrb;
3468    
3469    /* At the start of a bracketed group, add the current subject pointer to the
3470    stack of such pointers, to be re-instated at the end of the group when we hit
3471    the closing ket. When match() is called in other circumstances, we don't add to
3472    the stack. */
3473    
3474    if ((flags & match_isgroup) != 0)
3475      {
3476      newptrb.prev = eptrb;
3477      newptrb.saved_eptr = eptr;
3478      eptrb = &newptrb;
3479      }
3480    
3481    /* Now start processing the operations. */
3482    
3483  for (;;)  for (;;)
3484    {    {
# Line 2936  for (;;) Line 3524  for (;;)
3524    
3525        do        do
3526          {          {
3527          if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;          if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3528              return TRUE;
3529          ecode += (ecode[1] << 8) + ecode[2];          ecode += (ecode[1] << 8) + ecode[2];
3530          }          }
3531        while (*ecode == OP_ALT);        while (*ecode == OP_ALT);
# Line 2962  for (;;) Line 3551  for (;;)
3551      DPRINTF(("start bracket 0\n"));      DPRINTF(("start bracket 0\n"));
3552      do      do
3553        {        {
3554        if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;        if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3555            return TRUE;
3556        ecode += (ecode[1] << 8) + ecode[2];        ecode += (ecode[1] << 8) + ecode[2];
3557        }        }
3558      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
# Line 2981  for (;;) Line 3571  for (;;)
3571        return match(eptr,        return match(eptr,
3572          ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?          ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3573            5 : 3 + (ecode[1] << 8) + ecode[2]),            5 : 3 + (ecode[1] << 8) + ecode[2]),
3574          offset_top, md, ims, FALSE, eptr);          offset_top, md, ims, eptrb, match_isgroup);
3575        }        }
3576    
3577      /* The condition is an assertion. Call match() to evaluate it - setting      /* The condition is an assertion. Call match() to evaluate it - setting
# Line 2989  for (;;) Line 3579  for (;;)
3579    
3580      else      else
3581        {        {
3582        if (match(eptr, ecode+3, offset_top, md, ims, TRUE, NULL))        if (match(eptr, ecode+3, offset_top, md, ims, NULL,
3583              match_condassert | match_isgroup))
3584          {          {
3585          ecode += 3 + (ecode[4] << 8) + ecode[5];          ecode += 3 + (ecode[4] << 8) + ecode[5];
3586          while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];          while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3587          }          }
3588        else ecode += (ecode[1] << 8) + ecode[2];        else ecode += (ecode[1] << 8) + ecode[2];
3589        return match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr);        return match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup);
3590        }        }
3591      /* Control never reaches here */      /* Control never reaches here */
3592    
# Line 3005  for (;;) Line 3596  for (;;)
3596      ecode += 2;      ecode += 2;
3597      break;      break;
3598    
3599      /* End of the pattern */      /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3600        an empty string - recursion will then try other alternatives, if any. */
3601    
3602      case OP_END:      case OP_END:
3603        if (md->notempty && eptr == md->start_match) return FALSE;
3604      md->end_match_ptr = eptr;          /* Record where we ended */      md->end_match_ptr = eptr;          /* Record where we ended */
3605      md->end_offset_top = offset_top;   /* and how many extracts were taken */      md->end_offset_top = offset_top;   /* and how many extracts were taken */
3606      return TRUE;      return TRUE;
# Line 3017  for (;;) Line 3610  for (;;)
3610      case OP_OPT:      case OP_OPT:
3611      ims = ecode[1];      ims = ecode[1];
3612      ecode += 2;      ecode += 2;
3613      DPRINTF(("ims set to %02x\n", ims));      DPRINTF(("ims set to %02lx\n", ims));
3614      break;      break;
3615    
3616      /* Assertion brackets. Check the alternative branches in turn - the      /* Assertion brackets. Check the alternative branches in turn - the
# Line 3030  for (;;) Line 3623  for (;;)
3623      case OP_ASSERTBACK:      case OP_ASSERTBACK:
3624      do      do
3625        {        {
3626        if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) break;        if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup)) break;
3627        ecode += (ecode[1] << 8) + ecode[2];        ecode += (ecode[1] << 8) + ecode[2];
3628        }        }
3629      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
# Line 3038  for (;;) Line 3631  for (;;)
3631    
3632      /* If checking an assertion for a condition, return TRUE. */      /* If checking an assertion for a condition, return TRUE. */
3633    
3634      if (condassert) return TRUE;      if ((flags & match_condassert) != 0) return TRUE;
3635    
3636      /* Continue from after the assertion, updating the offsets high water      /* Continue from after the assertion, updating the offsets high water
3637      mark, since extracts may have been taken during the assertion. */      mark, since extracts may have been taken during the assertion. */
# Line 3054  for (;;) Line 3647  for (;;)
3647      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
3648      do      do
3649        {        {
3650        if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) return FALSE;        if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup))
3651            return FALSE;
3652        ecode += (ecode[1] << 8) + ecode[2];        ecode += (ecode[1] << 8) + ecode[2];
3653        }        }
3654      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
3655    
3656      if (condassert) return TRUE;      if ((flags & match_condassert) != 0) return TRUE;
3657    
3658      ecode += 3;      ecode += 3;
3659      continue;      continue;
3660    
3661      /* Move the subject pointer back. This occurs only at the start of      /* Move the subject pointer back. This occurs only at the start of
3662      each branch of a lookbehind assertion. If we are too close to the start to      each branch of a lookbehind assertion. If we are too close to the start to
3663      move back, this match function fails. */      move back, this match function fails. When working with UTF-8 we move
3664        back a number of characters, not bytes. */
3665    
3666      case OP_REVERSE:      case OP_REVERSE:
3667    #ifdef SUPPORT_UTF8
3668        c = (ecode[1] << 8) + ecode[2];
3669        for (i = 0; i < c; i++)
3670          {
3671          eptr--;
3672          BACKCHAR(eptr)
3673          }
3674    #else
3675      eptr -= (ecode[1] << 8) + ecode[2];      eptr -= (ecode[1] << 8) + ecode[2];
3676    #endif
3677    
3678      if (eptr < md->start_subject) return FALSE;      if (eptr < md->start_subject) return FALSE;
3679      ecode += 3;      ecode += 3;
3680      break;      break;
3681    
3682        /* Recursion matches the current regex, nested. If there are any capturing
3683        brackets started but not finished, we have to save their starting points
3684        and reinstate them after the recursion. However, we don't know how many
3685        such there are (offset_top records the completed total) so we just have
3686        to save all the potential data. There may be up to 99 such values, which
3687        is a bit large to put on the stack, but using malloc for small numbers
3688        seems expensive. As a compromise, the stack is used when there are fewer
3689        than 16 values to store; otherwise malloc is used. A problem is what to do
3690        if the malloc fails ... there is no way of returning to the top level with
3691        an error. Save the top 15 values on the stack, and accept that the rest
3692        may be wrong. NOTE(review): stacksave[15] is indexed 1..c, so c == 15 overruns it. */
3693    
3694        case OP_RECURSE:
3695          {
3696          BOOL rc;
3697          int *save;
3698          int stacksave[15];
3699    
3700          c = md->offset_max;
3701    
3702          if (c < 16) save = stacksave; else
3703            {
3704            save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3705            if (save == NULL)
3706              {
3707              save = stacksave;
3708              c = 15;
3709              }
3710            }
3711    
3712          for (i = 1; i <= c; i++)
3713            save[i] = md->offset_vector[md->offset_end - i];
3714          rc = match(eptr, md->start_pattern, offset_top, md, ims, eptrb,
3715            match_isgroup);
3716          for (i = 1; i <= c; i++)
3717            md->offset_vector[md->offset_end - i] = save[i];
3718          if (save != stacksave) (pcre_free)(save);
3719          if (!rc) return FALSE;
3720    
3721          /* In case the recursion has set more capturing values, save the final
3722          number, then move along the subject till after the recursive match,
3723          and advance one byte in the pattern code. */
3724    
3725          offset_top = md->end_offset_top;
3726          eptr = md->end_match_ptr;
3727          ecode++;
3728          }
3729        break;
3730    
3731      /* "Once" brackets are like assertion brackets except that after a match,      /* "Once" brackets are like assertion brackets except that after a match,
3732      the point in the subject string is not moved back. Thus there can never be      the point in the subject string is not moved back. Thus there can never be
# Line 3084  for (;;) Line 3738  for (;;)
3738      case OP_ONCE:      case OP_ONCE:
3739        {        {
3740        const uschar *prev = ecode;        const uschar *prev = ecode;
3741          const uschar *saved_eptr = eptr;
3742    
3743        do        do
3744          {          {
3745          if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) break;          if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3746              break;
3747          ecode += (ecode[1] << 8) + ecode[2];          ecode += (ecode[1] << 8) + ecode[2];
3748          }          }
3749        while (*ecode == OP_ALT);        while (*ecode == OP_ALT);
# Line 3110  for (;;) Line 3766  for (;;)
3766        5.005. If there is an options reset, it will get obeyed in the normal        5.005. If there is an options reset, it will get obeyed in the normal
3767        course of events. */        course of events. */
3768    
3769        if (*ecode == OP_KET || eptr == eptrb)        if (*ecode == OP_KET || eptr == saved_eptr)
3770          {          {
3771          ecode += 3;          ecode += 3;
3772          break;          break;
# Line 3124  for (;;) Line 3780  for (;;)
3780        if (ecode[3] == OP_OPT)        if (ecode[3] == OP_OPT)
3781          {          {
3782          ims = (ims & ~PCRE_IMS) | ecode[4];          ims = (ims & ~PCRE_IMS) | ecode[4];
3783          DPRINTF(("ims set to %02x at group repeat\n", ims));          DPRINTF(("ims set to %02lx at group repeat\n", ims));
3784          }          }
3785    
3786        if (*ecode == OP_KETRMIN)        if (*ecode == OP_KETRMIN)
3787          {          {
3788          if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||          if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3789              match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;              match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3790                  return TRUE;
3791          }          }
3792        else  /* OP_KETRMAX */        else  /* OP_KETRMAX */
3793          {          {
3794          if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||          if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3795              match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;              match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3796          }          }
3797        }        }
3798      return FALSE;      return FALSE;
# Line 3156  for (;;) Line 3813  for (;;)
3813      case OP_BRAZERO:      case OP_BRAZERO:
3814        {        {
3815        const uschar *next = ecode+1;        const uschar *next = ecode+1;
3816        if (match(eptr, next, offset_top, md, ims, FALSE, eptr)) return TRUE;        if (match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
3817            return TRUE;
3818        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3819        ecode = next + 3;        ecode = next + 3;
3820        }        }
# Line 3166  for (;;) Line 3824  for (;;)
3824        {        {
3825        const uschar *next = ecode+1;        const uschar *next = ecode+1;
3826        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3827        if (match(eptr, next+3, offset_top, md, ims, FALSE, eptr)) return TRUE;        if (match(eptr, next+3, offset_top, md, ims, eptrb, match_isgroup))
3828            return TRUE;
3829        ecode++;        ecode++;
3830        }        }
3831      break;      break;
# Line 3181  for (;;) Line 3840  for (;;)
3840      case OP_KETRMAX:      case OP_KETRMAX:
3841        {        {
3842        const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];        const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3843          const uschar *saved_eptr = eptrb->saved_eptr;
3844    
3845          eptrb = eptrb->prev;    /* Back up the stack of bracket start pointers */
3846    
3847        if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||        if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3848            *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||            *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
# Line 3200  for (;;) Line 3862  for (;;)
3862          int number = *prev - OP_BRA;          int number = *prev - OP_BRA;
3863          int offset = number << 1;          int offset = number << 1;
3864    
3865          DPRINTF(("end bracket %d\n", number));  #ifdef DEBUG
3866            printf("end bracket %d", number);
3867            printf("\n");
3868    #endif
3869    
3870          if (number > 0)          if (number > 0)
3871            {            {
# Line 3218  for (;;) Line 3883  for (;;)
3883        the group. */        the group. */
3884    
3885        ims = original_ims;        ims = original_ims;
3886        DPRINTF(("ims reset to %02x\n", ims));        DPRINTF(("ims reset to %02lx\n", ims));
3887    
3888        /* For a non-repeating ket, just continue at this level. This also        /* For a non-repeating ket, just continue at this level. This also
3889        happens for a repeating ket if no characters were matched in the group.        happens for a repeating ket if no characters were matched in the group.
# Line 3226  for (;;) Line 3891  for (;;)
3891        5.005. If there is an options reset, it will get obeyed in the normal        5.005. If there is an options reset, it will get obeyed in the normal
3892        course of events. */        course of events. */
3893    
3894        if (*ecode == OP_KET || eptr == eptrb)        if (*ecode == OP_KET || eptr == saved_eptr)
3895          {          {
3896          ecode += 3;          ecode += 3;
3897          break;          break;
# Line 3237  for (;;) Line 3902  for (;;)
3902    
3903        if (*ecode == OP_KETRMIN)        if (*ecode == OP_KETRMIN)
3904          {          {
3905          if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||          if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3906              match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;              match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3907                  return TRUE;
3908          }          }
3909        else  /* OP_KETRMAX */        else  /* OP_KETRMAX */
3910          {          {
3911          if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||          if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3912              match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;              match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3913          }          }
3914        }        }
3915      return FALSE;      return FALSE;
# Line 3328  for (;;) Line 3994  for (;;)
3994      if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')      if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
3995        return FALSE;        return FALSE;
3996      if (eptr++ >= md->end_subject) return FALSE;      if (eptr++ >= md->end_subject) return FALSE;
3997    #ifdef SUPPORT_UTF8
3998        if (md->utf8)
3999          while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4000    #endif
4001      ecode++;      ecode++;
4002      break;      break;
4003    
# Line 3454  for (;;) Line 4124  for (;;)
4124          {          {
4125          for (i = min;; i++)          for (i = min;; i++)
4126            {            {
4127            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4128              return TRUE;              return TRUE;
4129            if (i >= max || !match_ref(offset, eptr, length, md, ims))            if (i >= max || !match_ref(offset, eptr, length, md, ims))
4130              return FALSE;              return FALSE;
# Line 3475  for (;;) Line 4145  for (;;)
4145            }            }
4146          while (eptr >= pp)          while (eptr >= pp)
4147            {            {
4148            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4149              return TRUE;              return TRUE;
4150            eptr -= length;            eptr -= length;
4151            }            }
# Line 3529  for (;;) Line 4199  for (;;)
4199        for (i = 1; i <= min; i++)        for (i = 1; i <= min; i++)
4200          {          {
4201          if (eptr >= md->end_subject) return FALSE;          if (eptr >= md->end_subject) return FALSE;
4202          c = *eptr++;          GETCHARINC(c, eptr)         /* Get character; increment eptr */
4203    
4204    #ifdef SUPPORT_UTF8
4205            /* We do not yet support class members > 255 */
4206            if (c > 255) return FALSE;
4207    #endif
4208    
4209          if ((data[c/8] & (1 << (c&7))) != 0) continue;          if ((data[c/8] & (1 << (c&7))) != 0) continue;
4210          return FALSE;          return FALSE;
4211          }          }
# Line 3546  for (;;) Line 4222  for (;;)
4222          {          {
4223          for (i = min;; i++)          for (i = min;; i++)
4224            {            {
4225            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4226              return TRUE;              return TRUE;
4227            if (i >= max || eptr >= md->end_subject) return FALSE;            if (i >= max || eptr >= md->end_subject) return FALSE;
4228            c = *eptr++;            GETCHARINC(c, eptr)       /* Get character; increment eptr */
4229    
4230    #ifdef SUPPORT_UTF8
4231              /* We do not yet support class members > 255 */
4232              if (c > 255) return FALSE;
4233    #endif
4234            if ((data[c/8] & (1 << (c&7))) != 0) continue;            if ((data[c/8] & (1 << (c&7))) != 0) continue;
4235            return FALSE;            return FALSE;
4236            }            }
# Line 3561  for (;;) Line 4242  for (;;)
4242        else        else
4243          {          {
4244          const uschar *pp = eptr;          const uschar *pp = eptr;
4245          for (i = min; i < max; eptr++, i++)          int len = 1;
4246            for (i = min; i < max; i++)
4247            {            {
4248            if (eptr >= md->end_subject) break;            if (eptr >= md->end_subject) break;
4249            c = *eptr;            GETCHARLEN(c, eptr, len)  /* Get character, set length if UTF-8 */
4250            if ((data[c/8] & (1 << (c&7))) != 0) continue;  
4251            break;  #ifdef SUPPORT_UTF8
4252              /* We do not yet support class members > 255 */
4253              if (c > 255) break;
4254    #endif
4255              if ((data[c/8] & (1 << (c&7))) == 0) break;
4256              eptr += len;
4257            }            }
4258    
4259          while (eptr >= pp)          while (eptr >= pp)
4260            if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))            {
4261              if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4262              return TRUE;              return TRUE;
4263    
4264    #ifdef SUPPORT_UTF8
4265              BACKCHAR(eptr)
4266    #endif
4267              }
4268          return FALSE;          return FALSE;
4269          }          }
4270        }        }
# Line 3667  for (;;) Line 4360  for (;;)
4360          {          {
4361          for (i = min;; i++)          for (i = min;; i++)
4362            {            {
4363            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4364              return TRUE;              return TRUE;
4365            if (i >= max || eptr >= md->end_subject ||            if (i >= max || eptr >= md->end_subject ||
4366                c != md->lcc[*eptr++])                c != md->lcc[*eptr++])
# Line 3684  for (;;) Line 4377  for (;;)
4377            eptr++;            eptr++;
4378            }            }
4379          while (eptr >= pp)          while (eptr >= pp)
4380            if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4381              return TRUE;              return TRUE;
4382          return FALSE;          return FALSE;
4383          }          }
# Line 3701  for (;;) Line 4394  for (;;)
4394          {          {
4395          for (i = min;; i++)          for (i = min;; i++)
4396            {            {
4397            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4398              return TRUE;              return TRUE;
4399            if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;            if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
4400            }            }
# Line 3716  for (;;) Line 4409  for (;;)
4409            eptr++;            eptr++;
4410            }            }
4411          while (eptr >= pp)          while (eptr >= pp)
4412           if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))           if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4413             return TRUE;             return TRUE;
4414          return FALSE;          return FALSE;
4415          }          }
# Line 3798  for (;;) Line 4491  for (;;)
4491          {          {
4492          for (i = min;; i++)          for (i = min;; i++)
4493            {            {
4494            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4495              return TRUE;              return TRUE;
4496            if (i >= max || eptr >= md->end_subject ||            if (i >= max || eptr >= md->end_subject ||
4497                c == md->lcc[*eptr++])                c == md->lcc[*eptr++])
# Line 3815  for (;;) Line 4508  for (;;)
4508            eptr++;            eptr++;
4509            }            }
4510          while (eptr >= pp)          while (eptr >= pp)
4511            if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4512              return TRUE;              return TRUE;
4513          return FALSE;          return FALSE;
4514          }          }
# Line 3832  for (;;) Line 4525  for (;;)
4525          {          {
4526          for (i = min;; i++)          for (i = min;; i++)
4527            {            {
4528            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4529              return TRUE;              return TRUE;
4530            if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;            if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
4531            }            }
# Line 3847  for (;;) Line 4540  for (;;)
4540            eptr++;            eptr++;
4541            }            }
4542          while (eptr >= pp)          while (eptr >= pp)
4543           if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))           if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4544             return TRUE;             return TRUE;
4545          return FALSE;          return FALSE;
4546          }          }
# Line 3891  for (;;) Line 4584  for (;;)
4584    
4585      /* First, ensure the minimum number of matches are present. Use inline      /* First, ensure the minimum number of matches are present. Use inline
4586      code for maximizing the speed, and do the type test once at the start      code for maximizing the speed, and do the type test once at the start
4587      (i.e. keep it out of the loop). Also test that there are at least the      (i.e. keep it out of the loop). Also we can test that there are at least
4588      minimum number of characters before we start. */      the minimum number of bytes before we start, except when doing '.' in
4589        UTF8 mode. Leave the test in place in all cases; in the special case we have
4590        to test after each character. */
4591    
4592      if (min > md->end_subject - eptr) return FALSE;      if (min > md->end_subject - eptr) return FALSE;
4593      if (min > 0) switch(ctype)      if (min > 0) switch(ctype)
4594        {        {
4595        case OP_ANY:        case OP_ANY:
4596    #ifdef SUPPORT_UTF8
4597          if (md->utf8)
4598            {
4599            for (i = 1; i <= min; i++)
4600              {
4601              if (eptr >= md->end_subject ||
4602                 (*eptr++ == '\n' && (ims & PCRE_DOTALL) == 0))
4603                return FALSE;
4604              while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4605              }
4606            break;
4607            }
4608    #endif
4609          /* Non-UTF8 can be faster */
4610        if ((ims & PCRE_DOTALL) == 0)        if ((ims & PCRE_DOTALL) == 0)
4611          { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }          { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
4612        else eptr += min;        else eptr += min;
# Line 3947  for (;;) Line 4656  for (;;)
4656        {        {
4657        for (i = min;; i++)        for (i = min;; i++)
4658          {          {
4659          if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE;          if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) return TRUE;
4660          if (i >= max || eptr >= md->end_subject) return FALSE;          if (i >= max || eptr >= md->end_subject) return FALSE;
4661    
4662          c = *eptr++;          c = *eptr++;
# Line 3955  for (;;) Line 4664  for (;;)
4664            {            {
4665            case OP_ANY:            case OP_ANY:
4666            if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;            if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;
4667    #ifdef SUPPORT_UTF8
4668              if (md->utf8)
4669                while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4670    #endif
4671            break;            break;
4672    
4673            case OP_NOT_DIGIT:            case OP_NOT_DIGIT:
# Line 3994  for (;;) Line 4707  for (;;)
4707        switch(ctype)        switch(ctype)
4708          {          {
4709          case OP_ANY:          case OP_ANY:
4710    
4711            /* Special code is required for UTF8, but when the maximum is unlimited
4712            we don't need it. */
4713    
4714    #ifdef SUPPORT_UTF8
4715            if (md->utf8 && max < INT_MAX)
4716              {
4717              if ((ims & PCRE_DOTALL) == 0)
4718                {
4719                for (i = min; i < max; i++)
4720                  {
4721                  if (eptr >= md->end_subject || *eptr++ == '\n') break;
4722                  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4723                  }
4724                }
4725              else
4726                {
4727                for (i = min; i < max; i++)
4728                  {
4729                  eptr++;
4730                  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4731                  }
4732                }
4733              break;
4734              }
4735    #endif
4736            /* Non-UTF8 can be faster */
4737          if ((ims & PCRE_DOTALL) == 0)          if ((ims & PCRE_DOTALL) == 0)
4738            {            {
4739            for (i = min; i < max; i++)            for (i = min; i < max; i++)
# Line 4066  for (;;) Line 4806  for (;;)
4806          }          }
4807    
4808        while (eptr >= pp)        while (eptr >= pp)
4809          if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))          {
4810            if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4811            return TRUE;            return TRUE;
4812    #ifdef SUPPORT_UTF8
4813            if (md->utf8)
4814              while (eptr > pp && (*eptr & 0xc0) == 0x80) eptr--;
4815    #endif
4816            }
4817        return FALSE;        return FALSE;
4818        }        }
4819      /* Control never gets here */      /* Control never gets here */
# Line 4104  Arguments: Line 4850  Arguments:
4850    external_extra  points to "hints" from pcre_study() or is NULL    external_extra  points to "hints" from pcre_study() or is NULL
4851    subject         points to the subject string    subject         points to the subject string
4852    length          length of subject string (may contain binary zeros)    length          length of subject string (may contain binary zeros)
4853      start_offset    where to start in the subject string
4854    options         option bits    options         option bits
4855    offsets         points to a vector of ints to be filled in with offsets    offsets         points to a vector of ints to be filled in with offsets
4856    offsetcount     the number of elements in the vector    offsetcount     the number of elements in the vector
# Line 4116  Returns:          > 0 => success; value Line 4863  Returns:          > 0 => success; value
4863    
4864  int  int
4865  pcre_exec(const pcre *external_re, const pcre_extra *external_extra,  pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4866    const char *subject, int length, int options, int *offsets, int offsetcount)    const char *subject, int length, int start_offset, int options, int *offsets,
4867      int offsetcount)
4868  {  {
4869  int resetcount, ocount;  int resetcount, ocount;
4870  int first_char = -1;  int first_char = -1;
4871  int ims = 0;  int req_char = -1;
4872    int req_char2 = -1;
4873    unsigned long int ims = 0;
4874  match_data match_block;  match_data match_block;
4875  const uschar *start_bits = NULL;  const uschar *start_bits = NULL;
4876  const uschar *start_match = (const uschar *)subject;  const uschar *start_match = (const uschar *)subject + start_offset;
4877  const uschar *end_subject;  const uschar *end_subject;
4878    const uschar *req_char_ptr = start_match - 1;
4879  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
4880  const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;  const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4881  BOOL using_temporary_offsets = FALSE;  BOOL using_temporary_offsets = FALSE;
# Line 4137  if (re == NULL || subject == NULL || Line 4888  if (re == NULL || subject == NULL ||
4888     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4889  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4890    
4891    match_block.start_pattern = re->code;
4892  match_block.start_subject = (const uschar *)subject;  match_block.start_subject = (const uschar *)subject;
4893  match_block.end_subject = match_block.start_subject + length;  match_block.end_subject = match_block.start_subject + length;
4894  end_subject = match_block.end_subject;  end_subject = match_block.end_subject;
4895    
4896  match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;  match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4897    match_block.utf8 = (re->options & PCRE_UTF8) != 0;
4898    
4899  match_block.notbol = (options & PCRE_NOTBOL) != 0;  match_block.notbol = (options & PCRE_NOTBOL) != 0;
4900  match_block.noteol = (options & PCRE_NOTEOL) != 0;  match_block.noteol = (options & PCRE_NOTEOL) != 0;
4901    match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
4902    
4903  match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */  match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */
4904    
# Line 4215  if (!anchored) Line 4969  if (!anchored)
4969          start_bits = extra->start_bits;          start_bits = extra->start_bits;
4970    }    }
4971    
4972  /* Loop for unanchored matches; for anchored regexps the loop runs just once. */  /* For anchored or unanchored matches, there may be a "last known required
4973    character" set. If PCRE_CASELESS is set, implying that the match starts
4974    caselessly, or if there are any changes of this flag within the regex, set up
4975    both cases of the character. Otherwise set the two values the same, which will
4976    avoid duplicate testing (which takes significant time). This covers the vast
4977    majority of cases. It will be suboptimal when the case flag changes in a regex
4978    and the required character in fact is caseful. */
4979    
4980    if ((re->options & PCRE_REQCHSET) != 0)
4981      {
4982      req_char = re->req_char;
4983      req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
4984        (re->tables + fcc_offset)[req_char] : req_char;
4985      }
4986    
4987    /* Loop for handling unanchored repeated matching attempts; for anchored regexes
4988    the loop runs just once. */
4989    
4990  do  do
4991    {    {
# Line 4244  do Line 5014  do
5014    
5015    else if (startline)    else if (startline)
5016      {      {
5017      if (start_match > match_block.start_subject)      if (start_match > match_block.start_subject + start_offset)
5018        {        {
5019        while (start_match < end_subject && start_match[-1] != '\n')        while (start_match < end_subject && start_match[-1] != '\n')
5020          start_match++;          start_match++;
5021        }        }
5022      }      }
5023    
5024    /* Or to a non-unique first char */    /* Or to a non-unique first char after study */
5025    
5026    else if (start_bits != NULL)    else if (start_bits != NULL)
5027      {      {
# Line 4268  do Line 5038  do
5038    printf("\n");    printf("\n");
5039  #endif  #endif
5040    
5041      /* If req_char is set, we know that that character must appear in the subject
5042      for the match to succeed. If the first character is set, req_char must be
5043      later in the subject; otherwise the test starts at the match point. This
5044      optimization can save a huge amount of backtracking in patterns with nested
5045      unlimited repeats that aren't going to match. We don't know what the state of
5046      case matching may be when this character is hit, so test for it in both its
5047      cases if necessary. However, the different cased versions will not be set up
5048      unless PCRE_CASELESS was given or the casing state changes within the regex.
5049      Writing separate code makes it go faster, as does using an autoincrement and
5050      backing off on a match. */
5051    
5052      if (req_char >= 0)
5053        {
5054        register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
5055    
5056        /* We don't need to repeat the search if we haven't yet reached the
5057        place we found it at last time. */
5058    
5059        if (p > req_char_ptr)
5060          {
5061          /* Do a single test if no case difference is set up */
5062    
5063          if (req_char == req_char2)
5064            {
5065            while (p < end_subject)
5066              {
5067              if (*p++ == req_char) { p--; break; }
5068              }
5069            }
5070    
5071          /* Otherwise test for either case */
5072    
5073          else
5074            {
5075            while (p < end_subject)
5076              {
5077              register int pp = *p++;
5078              if (pp == req_char || pp == req_char2) { p--; break; }
5079              }
5080            }
5081    
5082          /* If we can't find the required character, break the matching loop */
5083    
5084          if (p >= end_subject) break;
5085    
5086          /* If we have found the required character, save the point where we
5087          found it, so that we don't search again next time round the loop if
5088          the start hasn't passed this character yet. */
5089    
5090          req_char_ptr = p;
5091          }
5092        }
5093    
5094    /* When a match occurs, substrings will be set for all internal extractions;    /* When a match occurs, substrings will be set for all internal extractions;
5095    we just need to set up the whole thing as substring 0 before returning. If    we just need to set up the whole thing as substring 0 before returning. If
5096    there were too many extractions, set the return code to zero. In the case    there were too many extractions, set the return code to zero. In the case
# Line 4275  do Line 5098  do
5098    those back references that we can. In this case there need not be overflow    those back references that we can. In this case there need not be overflow
5099    if certain parts of the pattern were not used. */    if certain parts of the pattern were not used. */
5100    
5101    if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))    match_block.start_match = start_match;
5102      if (!match(start_match, re->code, 2, &match_block, ims, NULL, match_isgroup))
5103      continue;      continue;
5104    
5105    /* Copy the offset information from temporary store if necessary */    /* Copy the offset information from temporary store if necessary */

Legend:
Removed from v.31  
changed lines
  Added in v.51

  ViewVC Help
Powered by ViewVC 1.1.5