/[pcre]/code/trunk/pcre.c
ViewVC logotype

Diff of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 31 by nigel, Sat Feb 24 21:38:57 2007 UTC revision 49 by nigel, Sat Feb 24 21:39:33 2007 UTC
# Line 9  the file Tech.Notes for some information Line 9  the file Tech.Notes for some information
9    
10  Written by: Philip Hazel <ph10@cam.ac.uk>  Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12             Copyright (c) 1997-1999 University of Cambridge             Copyright (c) 1997-2000 University of Cambridge
13    
14  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
15  Permission is granted to anyone to use this software for any purpose on any  Permission is granted to anyone to use this software for any purpose on any
# Line 66  not be set greater than 200. */ Line 66  not be set greater than 200. */
66  #define BRASTACK_SIZE 200  #define BRASTACK_SIZE 200
67    
68    
69    /* The number of bytes in a literal character string above which we can't add
70    any more is different when UTF-8 characters may be encountered. */
71    
72    #ifdef SUPPORT_UTF8
73    #define MAXLIT 250
74    #else
75    #define MAXLIT 255
76    #endif
77    
78    
79  /* Min and max values for the common repeats; for the maxima, 0 => infinity */  /* Min and max values for the common repeats; for the maxima, 0 => infinity */
80    
81  static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };  static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
# Line 82  static const char *OP_names[] = { Line 92  static const char *OP_names[] = {
92    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
93    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
94    "*", "*?", "+", "+?", "?", "??", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{",
95    "class", "Ref",    "class", "Ref", "Recurse",
96    "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",    "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
97    "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",    "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
98    "Brazero", "Braminzero", "Bra"    "Brazero", "Braminzero", "Bra"
# Line 107  static const short int escapes[] = { Line 117  static const short int escapes[] = {
117      0,      0, -ESC_z                                            /* x - z */      0,      0, -ESC_z                                            /* x - z */
118  };  };
119    
120    /* Tables of names of POSIX character classes and their lengths. The list is
121    terminated by a zero length entry. The first three must be alpha, lower, upper,
122    as this is assumed for handling case independence. */
123    
124    static const char *posix_names[] = {
125      "alpha", "lower", "upper",
126      "alnum", "ascii", "cntrl", "digit", "graph",
127      "print", "punct", "space", "word",  "xdigit" };
128    
129    static const uschar posix_name_lengths[] = {
130      5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
131    
132    /* Table of class bit maps for each POSIX class; up to three may be combined
133    to form the class. */
134    
135    static const int posix_class_maps[] = {
136      cbit_lower, cbit_upper, -1,             /* alpha */
137      cbit_lower, -1,         -1,             /* lower */
138      cbit_upper, -1,         -1,             /* upper */
139      cbit_digit, cbit_lower, cbit_upper,     /* alnum */
140      cbit_print, cbit_cntrl, -1,             /* ascii */
141      cbit_cntrl, -1,         -1,             /* cntrl */
142      cbit_digit, -1,         -1,             /* digit */
143      cbit_graph, -1,         -1,             /* graph */
144      cbit_print, -1,         -1,             /* print */
145      cbit_punct, -1,         -1,             /* punct */
146      cbit_space, -1,         -1,             /* space */
147      cbit_word,  -1,         -1,             /* word */
148      cbit_xdigit,-1,         -1              /* xdigit */
149    };
150    
151    
152  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
153    
154  static BOOL  static BOOL
155    compile_regex(int, int, int *, uschar **, const uschar **, const char **,    compile_regex(int, int, int *, uschar **, const uschar **, const char **,
156      BOOL, int, compile_data *);      BOOL, int, int *, int *, compile_data *);
157    
158    /* Structure for building a chain of data that actually lives on the
159    stack, for holding the values of the subject pointer at the start of each
160    subpattern, so as to detect when an empty string has been matched by a
161    subpattern - to break infinite loops. */
162    
163    typedef struct eptrblock {
164      struct eptrblock *prev;
165      const uschar *saved_eptr;
166    } eptrblock;
167    
168    /* Flag bits for the match() function */
169    
170    #define match_condassert   0x01    /* Called to check a condition assertion */
171    #define match_isgroup      0x02    /* Set if start of bracketed group */
172    
173    
174    
# Line 129  void  (*pcre_free)(void *) = free; Line 186  void  (*pcre_free)(void *) = free;
186    
187    
188    
189    /*************************************************
190    *    Macros and tables for character handling    *
191    *************************************************/
192    
193    /* When UTF-8 encoding is being used, a character is no longer just a single
194    byte. The macros for character handling generate simple sequences when used in
195    byte-mode, and more complicated ones for UTF-8 characters. */
196    
197    #ifndef SUPPORT_UTF8
198    #define GETCHARINC(c, eptr) c = *eptr++;
199    #define GETCHARLEN(c, eptr, len) c = *eptr;
200    #define BACKCHAR(eptr)
201    
202    #else   /* SUPPORT_UTF8 */
203    
204    /* Get the next UTF-8 character, advancing the pointer */
205    
206    #define GETCHARINC(c, eptr) \
207      c = *eptr++; \
208      if (md->utf8 && (c & 0xc0) == 0xc0) \
209        { \
210        int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
211        int s = 6 - a;                  /* Amount to shift next byte */  \
212        c &= utf8_table3[a];            /* Low order bits from first byte */ \
213        while (a-- > 0) \
214          { \
215          c |= (*eptr++ & 0x3f) << s; \
216          s += 6; \
217          } \
218        }
219    
220    /* Get the next UTF-8 character, not advancing the pointer, setting length */
221    
222    #define GETCHARLEN(c, eptr, len) \
223      c = *eptr; \
224      len = 1; \
225      if (md->utf8 && (c & 0xc0) == 0xc0) \
226        { \
227        int i; \
228        int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
229        int s = 6 - a;                  /* Amount to shift next byte */  \
230        c &= utf8_table3[a];            /* Low order bits from first byte */ \
231        for (i = 1; i <= a; i++) \
232          { \
233          c |= (eptr[i] & 0x3f) << s; \
234          s += 6; \
235          } \
236        len += a; \
237        }
238    
239    /* If the pointer is not at the start of a character, move it back until
240    it is. */
241    
242    #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
243    
244    #endif
245    
246    
247    
248  /*************************************************  /*************************************************
249  *             Default character tables           *  *             Default character tables           *
# Line 144  tables. */ Line 259  tables. */
259    
260    
261    
262    #ifdef SUPPORT_UTF8
263    /*************************************************
264    *           Tables for UTF-8 support             *
265    *************************************************/
266    
267    /* These are the breakpoints for different numbers of bytes in a UTF-8
268    character. */
269    
270    static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
271    
272    /* These are the indicator bits and the mask for the data bits to set in the
273    first byte of a character, indexed by the number of additional bytes. */
274    
275    static int utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
276    static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
277    
278    /* Table of the number of extra bytes, indexed by the first byte of a character
279    masked with 0x3f. The highest number for a valid UTF-8 character is in fact
280    0x3d. */
281    
282    static uschar utf8_table4[] = {
283      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
284      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
285      2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
286      3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
287    
288    
289    /*************************************************
290    *       Convert character value to UTF-8         *
291    *************************************************/
292    
293    /* This function takes an integer value in the range 0 - 0x7fffffff
294    and encodes it as a UTF-8 character in 1 to 6 bytes.
295    
296    Arguments:
297      cvalue     the character value
298      buffer     pointer to buffer for result - at least 6 bytes long
299    
300    Returns:     number of bytes placed in the buffer
301    */
302    
303    static int
304    ord2utf8(int cvalue, uschar *buffer)
305    {
306    register int i, j;
307    for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
308      if (cvalue <= utf8_table1[i]) break;
309    *buffer++ = utf8_table2[i] | (cvalue & utf8_table3[i]);
310    cvalue >>= 6 - i;
311    for (j = 0; j < i; j++)
312      {
313      *buffer++ = 0x80 | (cvalue & 0x3f);
314      cvalue >>= 6;
315      }
316    return i + 1;
317    }
318    #endif
319    
320    
321    
322  /*************************************************  /*************************************************
323  *          Return version string                 *  *          Return version string                 *
324  *************************************************/  *************************************************/
325    
326    #define STRING(a)  # a
327    #define XSTRING(s) STRING(s)
328    
329  const char *  const char *
330  pcre_version(void)  pcre_version(void)
331  {  {
332  return PCRE_VERSION;  return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
333  }  }
334    
335    
336    
337    
338  /*************************************************  /*************************************************
339  *       Return info about a compiled pattern     *  * (Obsolete) Return info about compiled pattern  *
340  *************************************************/  *************************************************/
341    
342  /* This function picks potentially useful data out of the private  /* This is the original "info" function. It picks potentially useful data out
343  structure.  of the private structure, but its interface was too rigid. It remains for
344    backwards compatibility. The public options are passed back in an int - though
345    the re->options field has been expanded to a long int, all the public options
346    are at the low end of it, and so even on 16-bit systems this will still be OK.
347    Therefore, I haven't changed the API for pcre_info().
348    
349  Arguments:  Arguments:
350    external_re   points to compiled code    external_re   points to compiled code
# Line 171  Arguments: Line 353  Arguments:
353                  or -1 if multiline and all branches start ^,                  or -1 if multiline and all branches start ^,
354                  or -2 otherwise                  or -2 otherwise
355    
356  Returns:        number of identifying extraction brackets  Returns:        number of capturing subpatterns
357                  or negative values on error                  or negative values on error
358  */  */
359    
# Line 181  pcre_info(const pcre *external_re, int * Line 363  pcre_info(const pcre *external_re, int *
363  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
364  if (re == NULL) return PCRE_ERROR_NULL;  if (re == NULL) return PCRE_ERROR_NULL;
365  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
366  if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);  if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
367  if (first_char != NULL)  if (first_char != NULL)
368    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
369       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
# Line 190  return re->top_bracket; Line 372  return re->top_bracket;
372    
373    
374    
375    /*************************************************
376    *        Return info about compiled pattern      *
377    *************************************************/
378    
379    /* This is a newer "info" function which has an extensible interface so
380    that additional items can be added compatibly.
381    
382    Arguments:
383      external_re      points to compiled code
384      external_study   points to study data, or NULL
385      what             what information is required
386      where            where to put the information
387    
388    Returns:           0 if data returned, negative on error
389    */
390    
391    int
392    pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
393      void *where)
394    {
395    const real_pcre *re = (const real_pcre *)external_re;
396    const real_pcre_extra *study = (const real_pcre_extra *)study_data;
397    
398    if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
399    if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
400    
401    switch (what)
402      {
403      case PCRE_INFO_OPTIONS:
404      *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
405      break;
406    
407      case PCRE_INFO_SIZE:
408      *((size_t *)where) = re->size;
409      break;
410    
411      case PCRE_INFO_CAPTURECOUNT:
412      *((int *)where) = re->top_bracket;
413      break;
414    
415      case PCRE_INFO_BACKREFMAX:
416      *((int *)where) = re->top_backref;
417      break;
418    
419      case PCRE_INFO_FIRSTCHAR:
420      *((int *)where) =
421        ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
422        ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
423      break;
424    
425      case PCRE_INFO_FIRSTTABLE:
426      *((const uschar **)where) =
427        (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
428          study->start_bits : NULL;
429      break;
430    
431      case PCRE_INFO_LASTLITERAL:
432      *((int *)where) =
433        ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
434      break;
435    
436      default: return PCRE_ERROR_BADOPTION;
437      }
438    
439    return 0;
440    }
441    
442    
443    
444  #ifdef DEBUG  #ifdef DEBUG
445  /*************************************************  /*************************************************
# Line 227  while (length-- > 0) Line 477  while (length-- > 0)
477    
478  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
479  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
480  encodes one of the more complicated things such as \d. On entry, ptr is  encodes one of the more complicated things such as \d. When UTF-8 is enabled,
481  pointing at the \. On exit, it is on the final character of the escape  a positive value greater than 255 may be returned. On entry, ptr is pointing at
482  sequence.  the \. On exit, it is on the final character of the escape sequence.
483    
484  Arguments:  Arguments:
485    ptrptr     points to the pattern position pointer    ptrptr     points to the pattern position pointer
# Line 249  check_escape(const uschar **ptrptr, cons Line 499  check_escape(const uschar **ptrptr, cons
499    int options, BOOL isclass, compile_data *cd)    int options, BOOL isclass, compile_data *cd)
500  {  {
501  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
502  int c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */  int c, i;
503  int i;  
504    /* If backslash is at the end of the pattern, it's an error. */
505    
506    c = *(++ptr);
507  if (c == 0) *errorptr = ERR1;  if (c == 0) *errorptr = ERR1;
508    
509  /* Digits or letters may have special meaning; all others are literals. */  /* Digits or letters may have special meaning; all others are literals. */
# Line 311  else Line 563  else
563        }        }
564    
565      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
566      larger first octal digit */      larger first octal digit. */
567    
568      case '0':      case '0':
569      c -= '0';      c -= '0';
570      while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&      while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
571        ptr[1] != '8' && ptr[1] != '9')        ptr[1] != '8' && ptr[1] != '9')
572          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
573        c &= 255;     /* Take least significant 8 bits */
574      break;      break;
575    
576      /* Special escapes not starting with a digit are straightforward */      /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
577        which can be greater than 0xff, but only if the ddd are hex digits. */
578    
579      case 'x':      case 'x':
580    #ifdef SUPPORT_UTF8
581        if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
582          {
583          const uschar *pt = ptr + 2;
584          register int count = 0;
585          c = 0;
586          while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
587            {
588            count++;
589            c = c * 16 + cd->lcc[*pt] -
590              (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
591            pt++;
592            }
593          if (*pt == '}')
594            {
595            if (c < 0 || count > 8) *errorptr = ERR34;
596            ptr = pt;
597            break;
598            }
599          /* If the sequence of hex digits does not end with '}', then we don't
600          recognize this construct; fall through to the normal \x handling. */
601          }
602    #endif
603    
604        /* Read just a single character, specified by hex digits */
605    
606      c = 0;      c = 0;
607      while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
608        {        {
# Line 332  else Line 612  else
612        }        }
613      break;      break;
614    
615        /* Other special escapes not starting with a digit are straightforward */
616    
617      case 'c':      case 'c':
618      c = *(++ptr);      c = *(++ptr);
619      if (c == 0)      if (c == 0)
# Line 469  if the length is fixed. This is needed f Line 751  if the length is fixed. This is needed f
751    
752  Arguments:  Arguments:
753    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
754      options  the compiling options
755    
756  Returns:   the fixed length, or -1 if there is no fixed length  Returns:   the fixed length, or -1 if there is no fixed length
757  */  */
758    
759  static int  static int
760  find_fixedlength(uschar *code)  find_fixedlength(uschar *code, int options)
761  {  {
762  int length = -1;  int length = -1;
763    
# Line 495  for (;;) Line 778  for (;;)
778      case OP_BRA:      case OP_BRA:
779      case OP_ONCE:      case OP_ONCE:
780      case OP_COND:      case OP_COND:
781      d = find_fixedlength(cc);      d = find_fixedlength(cc, options);
782      if (d < 0) return -1;      if (d < 0) return -1;
783      branchlength += d;      branchlength += d;
784      do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);      do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
# Line 532  for (;;) Line 815  for (;;)
815    
816      case OP_REVERSE:      case OP_REVERSE:
817      cc++;      cc++;
818        /* Fall through */
819    
820      case OP_CREF:      case OP_CREF:
821      case OP_OPT:      case OP_OPT:
# Line 548  for (;;) Line 832  for (;;)
832      cc++;      cc++;
833      break;      break;
834    
835      /* Handle char strings */      /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
836        This requires a scan of the string, unfortunately. We assume valid UTF-8
837        strings, so all we do is reduce the length by one for each byte whose bits are
838        10xxxxxx. */
839    
840      case OP_CHARS:      case OP_CHARS:
841      branchlength += *(++cc);      branchlength += *(++cc);
842    #ifdef SUPPORT_UTF8
843        for (d = 1; d <= *cc; d++)
844          if ((cc[d] & 0xc0) == 0x80) branchlength--;
845    #endif
846      cc += *cc + 1;      cc += *cc + 1;
847      break;      break;
848    
# Line 615  for (;;) Line 906  for (;;)
906    
907    
908  /*************************************************  /*************************************************
909    *           Check for POSIX class syntax         *
910    *************************************************/
911    
912    /* This function is called when the sequence "[:" or "[." or "[=" is
913    encountered in a character class. It checks whether this is followed by an
914    optional ^ and then a sequence of letters, terminated by a matching ":]" or
915    ".]" or "=]".
916    
917    Arguments:
918      ptr      pointer to the initial [
919      endptr   where to return the end pointer
920      cd       pointer to compile data
921    
922    Returns:   TRUE or FALSE
923    */
924    
925    static BOOL
926    check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
927    {
928    int terminator;          /* Don't combine these lines; the Solaris cc */
929    terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
930    if (*(++ptr) == '^') ptr++;
931    while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
932    if (*ptr == terminator && ptr[1] == ']')
933      {
934      *endptr = ptr;
935      return TRUE;
936      }
937    return FALSE;
938    }
939    
940    
941    
942    
943    /*************************************************
944    *          Check POSIX class name                *
945    *************************************************/
946    
947    /* This function is called to check the name given in a POSIX-style class entry
948    such as [:alnum:].
949    
950    Arguments:
951      ptr        points to the first letter
952      len        the length of the name
953    
954    Returns:     a value representing the name, or -1 if unknown
955    */
956    
957    static int
958    check_posix_name(const uschar *ptr, int len)
959    {
960    register int yield = 0;
961    while (posix_name_lengths[yield] != 0)
962      {
963      if (len == posix_name_lengths[yield] &&
964        strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
965      yield++;
966      }
967    return -1;
968    }
969    
970    
971    
972    
973    /*************************************************
974  *           Compile one branch                   *  *           Compile one branch                   *
975  *************************************************/  *************************************************/
976    
# Line 627  Arguments: Line 983  Arguments:
983    ptrptr       points to the current pattern pointer    ptrptr       points to the current pattern pointer
984    errorptr     points to pointer to error message    errorptr     points to pointer to error message
985    optchanged   set to the value of the last OP_OPT item compiled    optchanged   set to the value of the last OP_OPT item compiled
986      reqchar      set to the last literal character required, else -1
987      countlits    set to count of mandatory literal characters
988    cd           contains pointers to tables    cd           contains pointers to tables
989    
990  Returns:       TRUE on success  Returns:       TRUE on success
# Line 636  Returns:       TRUE on success Line 994  Returns:       TRUE on success
994  static BOOL  static BOOL
995  compile_branch(int options, int *brackets, uschar **codeptr,  compile_branch(int options, int *brackets, uschar **codeptr,
996    const uschar **ptrptr, const char **errorptr, int *optchanged,    const uschar **ptrptr, const char **errorptr, int *optchanged,
997    compile_data *cd)    int *reqchar, int *countlits, compile_data *cd)
998  {  {
999  int repeat_type, op_type;  int repeat_type, op_type;
1000  int repeat_min, repeat_max;  int repeat_min, repeat_max;
1001  int bravalue, length;  int bravalue, length;
1002  int greedy_default, greedy_non_default;  int greedy_default, greedy_non_default;
1003    int prevreqchar;
1004    int condcount = 0;
1005    int subcountlits = 0;
1006  register int c;  register int c;
1007  register uschar *code = *codeptr;  register uschar *code = *codeptr;
1008  uschar *tempcode;  uschar *tempcode;
# Line 655  uschar class[32]; Line 1016  uschar class[32];
1016  greedy_default = ((options & PCRE_UNGREEDY) != 0);  greedy_default = ((options & PCRE_UNGREEDY) != 0);
1017  greedy_non_default = greedy_default ^ 1;  greedy_non_default = greedy_default ^ 1;
1018    
1019    /* Initialize no required char, and count of literals */
1020    
1021    *reqchar = prevreqchar = -1;
1022    *countlits = 0;
1023    
1024  /* Switch on next character until the end of the branch */  /* Switch on next character until the end of the branch */
1025    
1026  for (;; ptr++)  for (;; ptr++)
# Line 664  for (;; ptr++) Line 1030  for (;; ptr++)
1030    int class_lastchar;    int class_lastchar;
1031    int newoptions;    int newoptions;
1032    int condref;    int condref;
1033      int subreqchar;
1034    
1035    c = *ptr;    c = *ptr;
1036    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
# Line 671  for (;; ptr++) Line 1038  for (;; ptr++)
1038      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
1039      if (c == '#')      if (c == '#')
1040        {        {
1041        while ((c = *(++ptr)) != 0 && c != '\n');        /* The space before the ; is to avoid a warning on a silly compiler
1042          on the Macintosh. */
1043          while ((c = *(++ptr)) != 0 && c != '\n') ;
1044        continue;        continue;
1045        }        }
1046      }      }
# Line 746  for (;; ptr++) Line 1115  for (;; ptr++)
1115          goto FAILED;          goto FAILED;
1116          }          }
1117    
1118          /* Handle POSIX class names. Perl allows a negation extension of the
1119          form [:^name:]. A square bracket that doesn't match the syntax is
1120          treated as a literal. We also recognize the POSIX constructions
1121          [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1122          5.6 does. */
1123    
1124          if (c == '[' &&
1125              (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1126              check_posix_syntax(ptr, &tempptr, cd))
1127            {
1128            BOOL local_negate = FALSE;
1129            int posix_class, i;
1130            register const uschar *cbits = cd->cbits;
1131    
1132            if (ptr[1] != ':')
1133              {
1134              *errorptr = ERR31;
1135              goto FAILED;
1136              }
1137    
1138            ptr += 2;
1139            if (*ptr == '^')
1140              {
1141              local_negate = TRUE;
1142              ptr++;
1143              }
1144    
1145            posix_class = check_posix_name(ptr, tempptr - ptr);
1146            if (posix_class < 0)
1147              {
1148              *errorptr = ERR30;
1149              goto FAILED;
1150              }
1151    
1152            /* If matching is caseless, upper and lower are converted to
1153            alpha. This relies on the fact that the class table starts with
1154            alpha, lower, upper as the first 3 entries. */
1155    
1156            if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1157              posix_class = 0;
1158    
1159            /* Or into the map we are building up to 3 of the static class
1160            tables, or their negations. */
1161    
1162            posix_class *= 3;
1163            for (i = 0; i < 3; i++)
1164              {
1165              int taboffset = posix_class_maps[posix_class + i];
1166              if (taboffset < 0) break;
1167              if (local_negate)
1168                for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1169              else
1170                for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1171              }
1172    
1173            ptr = tempptr + 1;
1174            class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
1175            continue;
1176            }
1177    
1178        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
1179        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. Escaped items are checked for
1180        validity in the pre-compiling pass. The sequence \b is a special case.        validity in the pre-compiling pass. The sequence \b is a special case.
# Line 773  for (;; ptr++) Line 1202  for (;; ptr++)
1202              continue;              continue;
1203    
1204              case ESC_w:              case ESC_w:
1205              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
               class[c] |= (cbits[c+cbit_digit] | cbits[c+cbit_word]);  
1206              continue;              continue;
1207    
1208              case ESC_W:              case ESC_W:
1209              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
               class[c] |= ~(cbits[c+cbit_digit] | cbits[c+cbit_word]);  
1210              continue;              continue;
1211    
1212              case ESC_s:              case ESC_s:
# Line 795  for (;; ptr++) Line 1222  for (;; ptr++)
1222              goto FAILED;              goto FAILED;
1223              }              }
1224            }            }
1225          /* Fall through if single character */  
1226            /* Fall through if single character, but don't at present allow
1227            chars > 255 in UTF-8 mode. */
1228    
1229    #ifdef SUPPORT_UTF8
1230            if (c > 255)
1231              {
1232              *errorptr = ERR33;
1233              goto FAILED;
1234              }
1235    #endif
1236          }          }
1237    
1238        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
# Line 815  for (;; ptr++) Line 1252  for (;; ptr++)
1252            }            }
1253    
1254          /* The second part of a range can be a single-character escape, but          /* The second part of a range can be a single-character escape, but
1255          not any of the other escapes. */          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1256            in such circumstances. */
1257    
1258          if (d == '\\')          if (d == '\\')
1259            {            {
1260              const uschar *oldptr = ptr;
1261            d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);            d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1262    
1263    #ifdef SUPPORT_UTF8
1264              if (d > 255)
1265                {
1266                *errorptr = ERR33;
1267                goto FAILED;
1268                }
1269    #endif
1270              /* \b is backspace; any other special means the '-' was literal */
1271    
1272            if (d < 0)            if (d < 0)
1273              {              {
1274              if (d == -ESC_b) d = '\b'; else              if (d == -ESC_b) d = '\b'; else
1275                {                {
1276                *errorptr = ERR7;                ptr = oldptr - 2;
1277                goto FAILED;                goto SINGLE_CHARACTER;  /* A few lines below */
1278                }                }
1279              }              }
1280            }            }
# Line 853  for (;; ptr++) Line 1302  for (;; ptr++)
1302        /* Handle a lone single character - we can get here for a normal        /* Handle a lone single character - we can get here for a normal
1303        non-escape char, or after \ that introduces a single character. */        non-escape char, or after \ that introduces a single character. */
1304    
1305          SINGLE_CHARACTER:
1306    
1307        class [c/8] |= (1 << (c&7));        class [c/8] |= (1 << (c&7));
1308        if ((options & PCRE_CASELESS) != 0)        if ((options & PCRE_CASELESS) != 0)
1309          {          {
# Line 937  for (;; ptr++) Line 1388  for (;; ptr++)
1388        { repeat_type = greedy_non_default; ptr++; }        { repeat_type = greedy_non_default; ptr++; }
1389      else repeat_type = greedy_default;      else repeat_type = greedy_default;
1390    
     /* If the maximum is zero then the minimum must also be zero; Perl allows  
     this case, so we do too - by simply omitting the item altogether. */  
   
     if (repeat_max == 0) code = previous;  
   
1391      /* If previous was a string of characters, chop off the last one and use it      /* If previous was a string of characters, chop off the last one and use it
1392      as the subject of the repeat. If there was only one character, we can      as the subject of the repeat. If there was only one character, we can
1393      abolish the previous item altogether. */      abolish the previous item altogether. A repeat with a zero minimum wipes
1394        out any reqchar setting, backing up to the previous value. We must also
1395        adjust the countlits value. */
1396    
1397      else if (*previous == OP_CHARS)      if (*previous == OP_CHARS)
1398        {        {
1399        int len = previous[1];        int len = previous[1];
1400    
1401          if (repeat_min == 0) *reqchar = prevreqchar;
1402          *countlits += repeat_min - 1;
1403    
1404        if (len == 1)        if (len == 1)
1405          {          {
1406          c = previous[2];          c = previous[2];
# Line 987  for (;; ptr++) Line 1439  for (;; ptr++)
1439        code = previous;        code = previous;
1440    
1441        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
1442        repeat_type += op_type;      /* Combine both values for many cases */  
1443          /* If the maximum is zero then the minimum must also be zero; Perl allows
1444          this case, so we do too - by simply omitting the item altogether. */
1445    
1446          if (repeat_max == 0) goto END_REPEAT;
1447    
1448          /* Combine the op_type with the repeat_type */
1449    
1450          repeat_type += op_type;
1451    
1452        /* A minimum of zero is handled either as the special case * or ?, or as        /* A minimum of zero is handled either as the special case * or ?, or as
1453        an UPTO, with the maximum given. */        an UPTO, with the maximum given. */
# Line 1064  for (;; ptr++) Line 1524  for (;; ptr++)
1524        }        }
1525    
1526      /* If previous was a character class or a back reference, we put the repeat      /* If previous was a character class or a back reference, we put the repeat
1527      stuff after it. */      stuff after it, but just skip the item if the repeat was {0,0}. */
1528    
1529      else if (*previous == OP_CLASS || *previous == OP_REF)      else if (*previous == OP_CLASS || *previous == OP_REF)
1530        {        {
1531          if (repeat_max == 0)
1532            {
1533            code = previous;
1534            goto END_REPEAT;
1535            }
1536        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
1537          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
1538        else if (repeat_min == 1 && repeat_max == -1)        else if (repeat_min == 1 && repeat_max == -1)
# Line 1118  for (;; ptr++) Line 1583  for (;; ptr++)
1583    
1584        if (repeat_min == 0)        if (repeat_min == 0)
1585          {          {
1586            /* If we set up a required char from the bracket, we must back off
1587            to the previous value and reset the countlits value too. */
1588    
1589            if (subcountlits > 0)
1590              {
1591              *reqchar = prevreqchar;
1592              *countlits -= subcountlits;
1593              }
1594    
1595          /* If the maximum is also zero, we just omit the group from the output          /* If the maximum is also zero, we just omit the group from the output
1596          altogether. */          altogether. */
1597    
1598          if (repeat_max == 0)          if (repeat_max == 0)
1599            {            {
1600            code = previous;            code = previous;
1601            previous = NULL;            goto END_REPEAT;
           break;  
1602            }            }
1603    
1604          /* If the maximum is 1 or unlimited, we just have to stick in the          /* If the maximum is 1 or unlimited, we just have to stick in the
# Line 1230  for (;; ptr++) Line 1703  for (;; ptr++)
1703        correct offset was computed above. */        correct offset was computed above. */
1704    
1705        else code[-ketoffset] = OP_KETRMAX + repeat_type;        else code[-ketoffset] = OP_KETRMAX + repeat_type;
   
   
 #ifdef NEVER  
       /* If the minimum is greater than zero, and the maximum is unlimited or  
       equal to the minimum, the first copy remains where it is, and is  
       replicated up to the minimum number of times. This case includes the +  
       repeat, but of course no replication is needed in that case. */  
   
       if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))  
         {  
         for (i = 1; i < repeat_min; i++)  
           {  
           memcpy(code, previous, len);  
           code += len;  
           }  
         }  
   
       /* If the minimum is zero, stick BRAZERO in front of the first copy.  
       Then, if there is a fixed upper limit, replicated up to that many times,  
       sticking BRAZERO in front of all the optional ones. */  
   
       else  
         {  
         if (repeat_min == 0)  
           {  
           memmove(previous+1, previous, len);  
           code++;  
           *previous++ = OP_BRAZERO + repeat_type;  
           }  
   
         for (i = 1; i < repeat_min; i++)  
           {  
           memcpy(code, previous, len);  
           code += len;  
           }  
   
         for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)  
           {  
           *code++ = OP_BRAZERO + repeat_type;  
           memcpy(code, previous, len);  
           code += len;  
           }  
         }  
   
       /* If the maximum is unlimited, set a repeater in the final copy. We  
       can't just offset backwards from the current code point, because we  
       don't know if there's been an options resetting after the ket. The  
       correct offset was computed above. */  
   
       if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type;  
 #endif  
   
   
1706        }        }
1707    
1708      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 1295  for (;; ptr++) Line 1715  for (;; ptr++)
1715    
1716      /* In all cases we no longer have a previous item. */      /* In all cases we no longer have a previous item. */
1717    
1718        END_REPEAT:
1719      previous = NULL;      previous = NULL;
1720      break;      break;
1721    
# Line 1372  for (;; ptr++) Line 1793  for (;; ptr++)
1793          ptr++;          ptr++;
1794          break;          break;
1795    
1796            case 'R':                 /* Pattern recursion */
1797            *code++ = OP_RECURSE;
1798            ptr++;
1799            continue;
1800    
1801          default:                  /* Option setting */          default:                  /* Option setting */
1802          set = unset = 0;          set = unset = 0;
1803          optset = &set;          optset = &set;
# Line 1463  for (;; ptr++) Line 1889  for (;; ptr++)
1889           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
1890            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1891           condref,                      /* Condition reference number */           condref,                      /* Condition reference number */
1892             &subreqchar,                  /* For possible last char */
1893             &subcountlits,                /* For literal count */
1894           cd))                          /* Tables block */           cd))                          /* Tables block */
1895        goto FAILED;        goto FAILED;
1896    
# Line 1476  for (;; ptr++) Line 1904  for (;; ptr++)
1904    
1905      if (bravalue == OP_COND)      if (bravalue == OP_COND)
1906        {        {
       int branchcount = 0;  
1907        uschar *tc = code;        uschar *tc = code;
1908          condcount = 0;
1909    
1910        do {        do {
1911           branchcount++;           condcount++;
1912           tc += (tc[1] << 8) | tc[2];           tc += (tc[1] << 8) | tc[2];
1913           }           }
1914        while (*tc != OP_KET);        while (*tc != OP_KET);
1915    
1916        if (branchcount > 2)        if (condcount > 2)
1917          {          {
1918          *errorptr = ERR27;          *errorptr = ERR27;
1919          goto FAILED;          goto FAILED;
1920          }          }
1921        }        }
1922    
1923        /* Handle updating of the required character. If the subpattern didn't
1924        set one, leave it as it was. Otherwise, update it for normal brackets of
1925        all kinds, forward assertions, and conditions with two branches. Don't
1926        update the literal count for forward assertions, however. If the bracket
1927        is followed by a quantifier with zero repeat, we have to back off. Hence
1928        the definition of prevreqchar and subcountlits outside the main loop so
1929        that they can be accessed for the back off. */
1930    
1931        if (subreqchar > 0 &&
1932             (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1933             (bravalue == OP_COND && condcount == 2)))
1934          {
1935          prevreqchar = *reqchar;
1936          *reqchar = subreqchar;
1937          if (bravalue != OP_ASSERT) *countlits += subcountlits;
1938          }
1939    
1940      /* Now update the main code pointer to the end of the group. */      /* Now update the main code pointer to the end of the group. */
1941    
1942      code = tempcode;      code = tempcode;
# Line 1559  for (;; ptr++) Line 2004  for (;; ptr++)
2004          if ((cd->ctypes[c] & ctype_space) != 0) continue;          if ((cd->ctypes[c] & ctype_space) != 0) continue;
2005          if (c == '#')          if (c == '#')
2006            {            {
2007            while ((c = *(++ptr)) != 0 && c != '\n');            /* The space before the ; is to avoid a warning on a silly compiler
2008              on the Macintosh. */
2009              while ((c = *(++ptr)) != 0 && c != '\n') ;
2010            if (c == 0) break;            if (c == 0) break;
2011            continue;            continue;
2012            }            }
# Line 1574  for (;; ptr++) Line 2021  for (;; ptr++)
2021          tempptr = ptr;          tempptr = ptr;
2022          c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);          c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
2023          if (c < 0) { ptr = tempptr; break; }          if (c < 0) { ptr = tempptr; break; }
2024    
2025            /* If a character is > 127 in UTF-8 mode, we have to turn it into
2026            two or more characters in the UTF-8 encoding. */
2027    
2028    #ifdef SUPPORT_UTF8
2029            if (c > 127 && (options & PCRE_UTF8) != 0)
2030              {
2031              uschar buffer[8];
2032              int len = ord2utf8(c, buffer);
2033              for (c = 0; c < len; c++) *code++ = buffer[c];
2034              length += len;
2035              continue;
2036              }
2037    #endif
2038          }          }
2039    
2040        /* Ordinary character or single-char escape */        /* Ordinary character or single-char escape */
# Line 1584  for (;; ptr++) Line 2045  for (;; ptr++)
2045    
2046      /* This "while" is the end of the "do" above. */      /* This "while" is the end of the "do" above. */
2047    
2048      while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);      while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
2049    
2050        /* Update the last character and the count of literals */
2051    
2052        prevreqchar = (length > 1)? code[-2] : *reqchar;
2053        *reqchar = code[-1];
2054        *countlits += length;
2055    
2056      /* Compute the length and set it in the data vector, and advance to      /* Compute the length and set it in the data vector, and advance to
2057      the next state. */      the next state. */
2058    
2059      previous[1] = length;      previous[1] = length;
2060      if (length < 255) ptr--;      if (length < MAXLIT) ptr--;
2061      break;      break;
2062      }      }
2063    }                   /* end of big loop */    }                   /* end of big loop */
# Line 1629  Argument: Line 2096  Argument:
2096    errorptr    -> pointer to error message    errorptr    -> pointer to error message
2097    lookbehind  TRUE if this is a lookbehind assertion    lookbehind  TRUE if this is a lookbehind assertion
2098    condref     > 0 for OPT_CREF setting at start of conditional group    condref     > 0 for OPT_CREF setting at start of conditional group
2099      reqchar     -> place to put the last required character, or a negative number
2100      countlits   -> place to put the shortest literal count of any branch
2101    cd          points to the data block with tables pointers    cd          points to the data block with tables pointers
2102    
2103  Returns:      TRUE on success  Returns:      TRUE on success
# Line 1637  Returns:      TRUE on success Line 2106  Returns:      TRUE on success
2106  static BOOL  static BOOL
2107  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
2108    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
2109    compile_data *cd)    int *reqchar, int *countlits, compile_data *cd)
2110  {  {
2111  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
2112  uschar *code = *codeptr;  uschar *code = *codeptr;
# Line 1645  uschar *last_branch = code; Line 2114  uschar *last_branch = code;
2114  uschar *start_bracket = code;  uschar *start_bracket = code;
2115  uschar *reverse_count = NULL;  uschar *reverse_count = NULL;
2116  int oldoptions = options & PCRE_IMS;  int oldoptions = options & PCRE_IMS;
2117    int branchreqchar, branchcountlits;
2118    
2119    *reqchar = -1;
2120    *countlits = INT_MAX;
2121  code += 3;  code += 3;
2122    
2123  /* At the start of a reference-based conditional group, insert the reference  /* At the start of a reference-based conditional group, insert the reference
# Line 1684  for (;;) Line 2156  for (;;)
2156    
2157    /* Now compile the branch */    /* Now compile the branch */
2158    
2159    if (!compile_branch(options,brackets,&code,&ptr,errorptr,&optchanged,cd))    if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
2160          &branchreqchar, &branchcountlits, cd))
2161      {      {
2162      *ptrptr = ptr;      *ptrptr = ptr;
2163      return FALSE;      return FALSE;
# Line 1696  for (;;) Line 2169  for (;;)
2169    last_branch[1] = length >> 8;    last_branch[1] = length >> 8;
2170    last_branch[2] = length & 255;    last_branch[2] = length & 255;
2171    
2172      /* Save the last required character if all branches have the same; a current
2173      value of -1 means unset, while -2 means "previous branch had no last required
2174      char".  */
2175    
2176      if (*reqchar != -2)
2177        {
2178        if (branchreqchar >= 0)
2179          {
2180          if (*reqchar == -1) *reqchar = branchreqchar;
2181          else if (*reqchar != branchreqchar) *reqchar = -2;
2182          }
2183        else *reqchar = -2;
2184        }
2185    
2186      /* Keep the shortest literal count */
2187    
2188      if (branchcountlits < *countlits) *countlits = branchcountlits;
2189      DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
2190    
2191    /* If lookbehind, check that this branch matches a fixed-length string,    /* If lookbehind, check that this branch matches a fixed-length string,
2192    and put the length into the OP_REVERSE item. Temporarily mark the end of    and put the length into the OP_REVERSE item. Temporarily mark the end of
2193    the branch with OP_END. */    the branch with OP_END. */
# Line 1703  for (;;) Line 2195  for (;;)
2195    if (lookbehind)    if (lookbehind)
2196      {      {
2197      *code = OP_END;      *code = OP_END;
2198      length = find_fixedlength(last_branch);      length = find_fixedlength(last_branch, options);
2199      DPRINTF(("fixed length = %d\n", length));      DPRINTF(("fixed length = %d\n", length));
2200      if (length < 0)      if (length < 0)
2201        {        {
# Line 1790  for (;;) Line 2282  for (;;)
2282      code += 2;      code += 2;
2283      break;      break;
2284    
2285        case OP_WORD_BOUNDARY:
2286        case OP_NOT_WORD_BOUNDARY:
2287        code++;
2288        break;
2289    
2290      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
2291      case OP_ASSERTBACK:      case OP_ASSERTBACK:
2292      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
# Line 1817  all of whose alternatives start with OP_ Line 2314  all of whose alternatives start with OP_
2314  it's anchored. However, if this is a multiline pattern, then only OP_SOD  it's anchored. However, if this is a multiline pattern, then only OP_SOD
2315  counts, since OP_CIRC can match in the middle.  counts, since OP_CIRC can match in the middle.
2316    
2317  A branch is also implicitly anchored if it starts with .* because that will try  A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2318  the rest of the pattern at all possible matching points, so there is no point  because that will try the rest of the pattern at all possible matching points,
2319  trying them again.  so there is no point trying them again.
2320    
2321  Arguments:  Arguments:
2322    code       points to start of expression (the bracket)    code       points to start of expression (the bracket)
# Line 1837  do { Line 2334  do {
2334     register int op = *scode;     register int op = *scode;
2335     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2336       { if (!is_anchored(scode, options)) return FALSE; }       { if (!is_anchored(scode, options)) return FALSE; }
2337     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
2338                (*options & PCRE_DOTALL) != 0)
2339       { if (scode[1] != OP_ANY) return FALSE; }       { if (scode[1] != OP_ANY) return FALSE; }
2340     else if (op != OP_SOD &&     else if (op != OP_SOD &&
2341             ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))             ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
# Line 1851  return TRUE; Line 2349  return TRUE;
2349    
2350    
2351  /*************************************************  /*************************************************
2352  *     Check for start with \n line expression    *  *         Check for starting with ^ or .*        *
2353  *************************************************/  *************************************************/
2354    
2355  /* This is called for multiline expressions to try to find out if every branch  /* This is called to find out if every branch starts with ^ or .* so that
2356  starts with ^ so that "first char" processing can be done to speed things up.  "first char" processing can be done to speed things up in multiline
2357    matching and for non-DOTALL patterns that start with .* (which must start at
2358    the beginning or after \n).
2359    
2360  Argument:  points to start of expression (the bracket)  Argument:  points to start of expression (the bracket)
2361  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
# Line 1869  do { Line 2369  do {
2369     register int op = *scode;     register int op = *scode;
2370     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2371       { if (!is_startline(scode)) return FALSE; }       { if (!is_startline(scode)) return FALSE; }
2372       else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2373         { if (scode[1] != OP_ANY) return FALSE; }
2374     else if (op != OP_CIRC) return FALSE;     else if (op != OP_CIRC) return FALSE;
2375     code += (code[1] << 8) + code[2];     code += (code[1] << 8) + code[2];
2376     }     }
# Line 1967  pcre_compile(const char *pattern, int op Line 2469  pcre_compile(const char *pattern, int op
2469  real_pcre *re;  real_pcre *re;
2470  int length = 3;      /* For initial BRA plus length */  int length = 3;      /* For initial BRA plus length */
2471  int runlength;  int runlength;
2472  int c, size;  int c, reqchar, countlits;
2473  int bracount = 0;  int bracount = 0;
2474  int top_backref = 0;  int top_backref = 0;
2475  int branch_extra = 0;  int branch_extra = 0;
2476  int branch_newextra;  int branch_newextra;
2477  unsigned int brastackptr = 0;  unsigned int brastackptr = 0;
2478    size_t size;
2479  uschar *code;  uschar *code;
2480  const uschar *ptr;  const uschar *ptr;
2481  compile_data compile_block;  compile_data compile_block;
# Line 1983  uschar bralenstack[BRASTACK_SIZE]; Line 2486  uschar bralenstack[BRASTACK_SIZE];
2486  uschar *code_base, *code_end;  uschar *code_base, *code_end;
2487  #endif  #endif
2488    
2489    /* Can't support UTF8 unless PCRE has been compiled to include the code. */
2490    
2491    #ifndef SUPPORT_UTF8
2492    if ((options & PCRE_UTF8) != 0)
2493      {
2494      *errorptr = ERR32;
2495      return NULL;
2496      }
2497    #endif
2498    
2499  /* We can't pass back an error message if errorptr is NULL; I guess the best we  /* We can't pass back an error message if errorptr is NULL; I guess the best we
2500  can do is just return NULL. */  can do is just return NULL. */
2501    
# Line 2035  while ((c = *(++ptr)) != 0) Line 2548  while ((c = *(++ptr)) != 0)
2548      if ((compile_block.ctypes[c] & ctype_space) != 0) continue;      if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2549      if (c == '#')      if (c == '#')
2550        {        {
2551        while ((c = *(++ptr)) != 0 && c != '\n');        /* The space before the ; is to avoid a warning on a silly compiler
2552          on the Macintosh. */
2553          while ((c = *(++ptr)) != 0 && c != '\n') ;
2554        continue;        continue;
2555        }        }
2556      }      }
# Line 2200  while ((c = *(++ptr)) != 0) Line 2715  while ((c = *(++ptr)) != 0)
2715          ptr += 2;          ptr += 2;
2716          break;          break;
2717    
2718            /* A recursive call to the regex is an extension, to provide the
2719            facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2720    
2721            case 'R':
2722            if (ptr[3] != ')')
2723              {
2724              *errorptr = ERR29;
2725              goto PCRE_ERROR_RETURN;
2726              }
2727            ptr += 3;
2728            length += 1;
2729            break;
2730    
2731          /* Lookbehinds are in Perl from version 5.005 */          /* Lookbehinds are in Perl from version 5.005 */
2732    
2733          case '<':          case '<':
# Line 2232  while ((c = *(++ptr)) != 0) Line 2760  while ((c = *(++ptr)) != 0)
2760          else   /* An assertion must follow */          else   /* An assertion must follow */
2761            {            {
2762            ptr++;   /* Can treat like ':' as far as spacing is concerned */            ptr++;   /* Can treat like ':' as far as spacing is concerned */
2763              if (ptr[2] != '?' ||
2764            if (ptr[2] != '?' || strchr("=!<", ptr[3]) == NULL)               (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
2765              {              {
2766              ptr += 2;    /* To get right offset in message */              ptr += 2;    /* To get right offset in message */
2767              *errorptr = ERR28;              *errorptr = ERR28;
# Line 2307  while ((c = *(++ptr)) != 0) Line 2835  while ((c = *(++ptr)) != 0)
2835              will lead to an over-estimate on the length, but this shouldn't              will lead to an over-estimate on the length, but this shouldn't
2836              matter very much. We also have to allow for resetting options at              matter very much. We also have to allow for resetting options at
2837              the start of any alternations, which we do by setting              the start of any alternations, which we do by setting
2838              branch_newextra to 2. */              branch_newextra to 2. Finally, we record whether the case-dependent
2839                flag ever changes within the regex. This is used by the "required
2840                character" code. */
2841    
2842              case ':':              case ':':
2843              if (((set|unset) & PCRE_IMS) != 0)              if (((set|unset) & PCRE_IMS) != 0)
2844                {                {
2845                length += 4;                length += 4;
2846                branch_newextra = 2;                branch_newextra = 2;
2847                  if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2848                }                }
2849              goto END_OPTIONS;              goto END_OPTIONS;
2850    
# Line 2443  while ((c = *(++ptr)) != 0) Line 2974  while ((c = *(++ptr)) != 0)
2974          if ((compile_block.ctypes[c] & ctype_space) != 0) continue;          if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2975          if (c == '#')          if (c == '#')
2976            {            {
2977            while ((c = *(++ptr)) != 0 && c != '\n');            /* The space before the ; is to avoid a warning on a silly compiler
2978              on the Macintosh. */
2979              while ((c = *(++ptr)) != 0 && c != '\n') ;
2980            continue;            continue;
2981            }            }
2982          }          }
# Line 2458  while ((c = *(++ptr)) != 0) Line 2991  while ((c = *(++ptr)) != 0)
2991            &compile_block);            &compile_block);
2992          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2993          if (c < 0) { ptr = saveptr; break; }          if (c < 0) { ptr = saveptr; break; }
2994    
2995    #ifdef SUPPORT_UTF8
2996            if (c > 127 && (options & PCRE_UTF8) != 0)
2997              {
2998              int i;
2999              for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
3000                if (c <= utf8_table1[i]) break;
3001              runlength += i;
3002              }
3003    #endif
3004          }          }
3005    
3006        /* Ordinary character or single-char escape */        /* Ordinary character or single-char escape */
# Line 2467  while ((c = *(++ptr)) != 0) Line 3010  while ((c = *(++ptr)) != 0)
3010    
3011      /* This "while" is the end of the "do" above. */      /* This "while" is the end of the "do" above. */
3012    
3013      while (runlength < 255 &&      while (runlength < MAXLIT &&
3014        (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);        (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
3015    
3016      ptr--;      ptr--;
# Line 2499  if (re == NULL) Line 3042  if (re == NULL)
3042    return NULL;    return NULL;
3043    }    }
3044    
3045  /* Put in the magic number and the options. */  /* Put in the magic number, and save the size, options, and table pointer */
3046    
3047  re->magic_number = MAGIC_NUMBER;  re->magic_number = MAGIC_NUMBER;
3048    re->size = size;
3049  re->options = options;  re->options = options;
3050  re->tables = tables;  re->tables = tables;
3051    
# Line 2514  code = re->code; Line 3058  code = re->code;
3058  *code = OP_BRA;  *code = OP_BRA;
3059  bracount = 0;  bracount = 0;
3060  (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,  (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
3061    &compile_block);    &reqchar, &countlits, &compile_block);
3062  re->top_bracket = bracount;  re->top_bracket = bracount;
3063  re->top_backref = top_backref;  re->top_backref = top_backref;
3064    
# Line 2546  if (*errorptr != NULL) Line 3090  if (*errorptr != NULL)
3090    return NULL;    return NULL;
3091    }    }
3092    
3093  /* If the anchored option was not passed, set flag if we can determine that it  /* If the anchored option was not passed, set flag if we can determine that the
3094  is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if  pattern is anchored by virtue of ^ characters or \A or anything else (such as
3095  we can determine what the first character has to be, because that speeds up  starting with .* when DOTALL is set).
3096  unanchored matches no end. In the case of multiline matches, an alternative is  
3097  to set the PCRE_STARTLINE flag if all branches start with ^. */  Otherwise, see if we can determine what the first character has to be, because
3098    that speeds up unanchored matches no end. If not, see if we can set the
3099    PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
3100    start with ^, and also when all branches start with .* for non-DOTALL matches.
3101    */
3102    
3103  if ((options & PCRE_ANCHORED) == 0)  if ((options & PCRE_ANCHORED) == 0)
3104    {    {
# Line 2570  if ((options & PCRE_ANCHORED) == 0) Line 3118  if ((options & PCRE_ANCHORED) == 0)
3118      }      }
3119    }    }
3120    
3121    /* Save the last required character if there are at least two literal
3122    characters on all paths, or if there is no first character setting. */
3123    
3124    if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
3125      {
3126      re->req_char = reqchar;
3127      re->options |= PCRE_REQCHSET;
3128      }
3129    
3130  /* Print out the compiled data for debugging */  /* Print out the compiled data for debugging */
3131    
3132  #ifdef DEBUG  #ifdef DEBUG
# Line 2579  printf("Length = %d top_bracket = %d top Line 3136  printf("Length = %d top_bracket = %d top
3136    
3137  if (re->options != 0)  if (re->options != 0)
3138    {    {
3139    printf("%s%s%s%s%s%s%s%s\n",    printf("%s%s%s%s%s%s%s%s%s\n",
3140      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
3141      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
3142        ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
3143      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
3144      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
3145      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
# Line 2596  if ((re->options & PCRE_FIRSTSET) != 0) Line 3154  if ((re->options & PCRE_FIRSTSET) != 0)
3154      else printf("First char = \\x%02x\n", re->first_char);      else printf("First char = \\x%02x\n", re->first_char);
3155    }    }
3156    
3157    if ((re->options & PCRE_REQCHSET) != 0)
3158      {
3159      if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
3160        else printf("Req char = \\x%02x\n", re->req_char);
3161      }
3162    
3163  code_end = code;  code_end = code;
3164  code_base = code = re->code;  code_base = code = re->code;
3165    
# Line 2829  Returns:      TRUE if matched Line 3393  Returns:      TRUE if matched
3393    
3394  static BOOL  static BOOL
3395  match_ref(int offset, register const uschar *eptr, int length, match_data *md,  match_ref(int offset, register const uschar *eptr, int length, match_data *md,
3396    int ims)    unsigned long int ims)
3397  {  {
3398  const uschar *p = md->start_subject + md->offset_vector[offset];  const uschar *p = md->start_subject + md->offset_vector[offset];
3399    
# Line 2880  Arguments: Line 3444  Arguments:
3444     offset_top  current top pointer     offset_top  current top pointer
3445     md          pointer to "static" info for the match     md          pointer to "static" info for the match
3446     ims         current /i, /m, and /s options     ims         current /i, /m, and /s options
3447     condassert  TRUE if called to check a condition assertion     eptrb       pointer to chain of blocks containing eptr at start of
3448     eptrb       eptr at start of last bracket                   brackets - for testing for empty matches
3449       flags       can contain
3450                     match_condassert - this is an assertion condition
3451                     match_isgroup - this is the start of a bracketed group
3452    
3453  Returns:       TRUE if matched  Returns:       TRUE if matched
3454  */  */
3455    
3456  static BOOL  static BOOL
3457  match(register const uschar *eptr, register const uschar *ecode,  match(register const uschar *eptr, register const uschar *ecode,
3458    int offset_top, match_data *md, int ims, BOOL condassert, const uschar *eptrb)    int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
3459      int flags)
3460  {  {
3461  int original_ims = ims;   /* Save for resetting on ')' */  unsigned long int original_ims = ims;   /* Save for resetting on ')' */
3462    eptrblock newptrb;
3463    
3464    /* At the start of a bracketed group, add the current subject pointer to the
3465    stack of such pointers, to be re-instated at the end of the group when we hit
3466    the closing ket. When match() is called in other circumstances, we don't add to
3467    the stack. */
3468    
3469    if ((flags & match_isgroup) != 0)
3470      {
3471      newptrb.prev = eptrb;
3472      newptrb.saved_eptr = eptr;
3473      eptrb = &newptrb;
3474      }
3475    
3476    /* Now start processing the operations. */
3477    
3478  for (;;)  for (;;)
3479    {    {
# Line 2936  for (;;) Line 3519  for (;;)
3519    
3520        do        do
3521          {          {
3522          if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;          if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3523              return TRUE;
3524          ecode += (ecode[1] << 8) + ecode[2];          ecode += (ecode[1] << 8) + ecode[2];
3525          }          }
3526        while (*ecode == OP_ALT);        while (*ecode == OP_ALT);
# Line 2962  for (;;) Line 3546  for (;;)
3546      DPRINTF(("start bracket 0\n"));      DPRINTF(("start bracket 0\n"));
3547      do      do
3548        {        {
3549        if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;        if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3550            return TRUE;
3551        ecode += (ecode[1] << 8) + ecode[2];        ecode += (ecode[1] << 8) + ecode[2];
3552        }        }
3553      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
# Line 2981  for (;;) Line 3566  for (;;)
3566        return match(eptr,        return match(eptr,
3567          ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?          ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3568            5 : 3 + (ecode[1] << 8) + ecode[2]),            5 : 3 + (ecode[1] << 8) + ecode[2]),
3569          offset_top, md, ims, FALSE, eptr);          offset_top, md, ims, eptrb, match_isgroup);
3570        }        }
3571    
3572      /* The condition is an assertion. Call match() to evaluate it - setting      /* The condition is an assertion. Call match() to evaluate it - setting
# Line 2989  for (;;) Line 3574  for (;;)
3574    
3575      else      else
3576        {        {
3577        if (match(eptr, ecode+3, offset_top, md, ims, TRUE, NULL))        if (match(eptr, ecode+3, offset_top, md, ims, NULL,
3578              match_condassert | match_isgroup))
3579          {          {
3580          ecode += 3 + (ecode[4] << 8) + ecode[5];          ecode += 3 + (ecode[4] << 8) + ecode[5];
3581          while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];          while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3582          }          }
3583        else ecode += (ecode[1] << 8) + ecode[2];        else ecode += (ecode[1] << 8) + ecode[2];
3584        return match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr);        return match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup);
3585        }        }
3586      /* Control never reaches here */      /* Control never reaches here */
3587    
# Line 3005  for (;;) Line 3591  for (;;)
3591      ecode += 2;      ecode += 2;
3592      break;      break;
3593    
3594      /* End of the pattern */      /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3595        an empty string - recursion will then try other alternatives, if any. */
3596    
3597      case OP_END:      case OP_END:
3598        if (md->notempty && eptr == md->start_match) return FALSE;
3599      md->end_match_ptr = eptr;          /* Record where we ended */      md->end_match_ptr = eptr;          /* Record where we ended */
3600      md->end_offset_top = offset_top;   /* and how many extracts were taken */      md->end_offset_top = offset_top;   /* and how many extracts were taken */
3601      return TRUE;      return TRUE;
# Line 3017  for (;;) Line 3605  for (;;)
3605      case OP_OPT:      case OP_OPT:
3606      ims = ecode[1];      ims = ecode[1];
3607      ecode += 2;      ecode += 2;
3608      DPRINTF(("ims set to %02x\n", ims));      DPRINTF(("ims set to %02lx\n", ims));
3609      break;      break;
3610    
3611      /* Assertion brackets. Check the alternative branches in turn - the      /* Assertion brackets. Check the alternative branches in turn - the
# Line 3030  for (;;) Line 3618  for (;;)
3618      case OP_ASSERTBACK:      case OP_ASSERTBACK:
3619      do      do
3620        {        {
3621        if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) break;        if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup)) break;
3622        ecode += (ecode[1] << 8) + ecode[2];        ecode += (ecode[1] << 8) + ecode[2];
3623        }        }
3624      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
# Line 3038  for (;;) Line 3626  for (;;)
3626    
3627      /* If checking an assertion for a condition, return TRUE. */      /* If checking an assertion for a condition, return TRUE. */
3628    
3629      if (condassert) return TRUE;      if ((flags & match_condassert) != 0) return TRUE;
3630    
3631      /* Continue from after the assertion, updating the offsets high water      /* Continue from after the assertion, updating the offsets high water
3632      mark, since extracts may have been taken during the assertion. */      mark, since extracts may have been taken during the assertion. */
# Line 3054  for (;;) Line 3642  for (;;)
3642      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
3643      do      do
3644        {        {
3645        if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) return FALSE;        if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup))
3646            return FALSE;
3647        ecode += (ecode[1] << 8) + ecode[2];        ecode += (ecode[1] << 8) + ecode[2];
3648        }        }
3649      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
3650    
3651      if (condassert) return TRUE;      if ((flags & match_condassert) != 0) return TRUE;
3652    
3653      ecode += 3;      ecode += 3;
3654      continue;      continue;
3655    
3656      /* Move the subject pointer back. This occurs only at the start of      /* Move the subject pointer back. This occurs only at the start of
3657      each branch of a lookbehind assertion. If we are too close to the start to      each branch of a lookbehind assertion. If we are too close to the start to
3658      move back, this match function fails. */      move back, this match function fails. When working with UTF-8 we move
3659        back a number of characters, not bytes. */
3660    
3661      case OP_REVERSE:      case OP_REVERSE:
3662    #ifdef SUPPORT_UTF8
3663        c = (ecode[1] << 8) + ecode[2];
3664        for (i = 0; i < c; i++)
3665          {
3666          eptr--;
3667          BACKCHAR(eptr)
3668          }
3669    #else
3670      eptr -= (ecode[1] << 8) + ecode[2];      eptr -= (ecode[1] << 8) + ecode[2];
3671    #endif
3672    
3673      if (eptr < md->start_subject) return FALSE;      if (eptr < md->start_subject) return FALSE;
3674      ecode += 3;      ecode += 3;
3675      break;      break;
3676    
3677        /* Recursion matches the current regex, nested. If there are any capturing
3678        brackets started but not finished, we have to save their starting points
3679        and reinstate them after the recursion. However, we don't know how many
3680        such there are (offset_top records the completed total) so we just have
3681        to save all the potential data. There may be up to 99 such values, which
3682        is a bit large to put on the stack, but using malloc for small numbers
3683        seems expensive. As a compromise, the stack is used when there are fewer
3684        than 16 values to store; otherwise malloc is used. A problem is what to do
3685        if the malloc fails ... there is no way of returning to the top level with
3686        an error. Save the top 15 values on the stack, and accept that the rest
3687        may be wrong. */
3688    
3689        case OP_RECURSE:
3690          {
3691          BOOL rc;
3692          int *save;
3693          int stacksave[15];
3694    
3695          c = md->offset_max;
3696    
3697          if (c < 16) save = stacksave; else
3698            {
3699            save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3700            if (save == NULL)
3701              {
3702              save = stacksave;
3703              c = 15;
3704              }
3705            }
3706    
3707          for (i = 1; i <= c; i++)
3708            save[i] = md->offset_vector[md->offset_end - i];
3709          rc = match(eptr, md->start_pattern, offset_top, md, ims, eptrb,
3710            match_isgroup);
3711          for (i = 1; i <= c; i++)
3712            md->offset_vector[md->offset_end - i] = save[i];
3713          if (save != stacksave) (pcre_free)(save);
3714          if (!rc) return FALSE;
3715    
3716          /* In case the recursion has set more capturing values, save the final
3717          number, then move along the subject till after the recursive match,
3718          and advance one byte in the pattern code. */
3719    
3720          offset_top = md->end_offset_top;
3721          eptr = md->end_match_ptr;
3722          ecode++;
3723          }
3724        break;
3725    
3726      /* "Once" brackets are like assertion brackets except that after a match,      /* "Once" brackets are like assertion brackets except that after a match,
3727      the point in the subject string is not moved back. Thus there can never be      the point in the subject string is not moved back. Thus there can never be
# Line 3084  for (;;) Line 3733  for (;;)
3733      case OP_ONCE:      case OP_ONCE:
3734        {        {
3735        const uschar *prev = ecode;        const uschar *prev = ecode;
3736          const uschar *saved_eptr = eptr;
3737    
3738        do        do
3739          {          {
3740          if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) break;          if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3741              break;
3742          ecode += (ecode[1] << 8) + ecode[2];          ecode += (ecode[1] << 8) + ecode[2];
3743          }          }
3744        while (*ecode == OP_ALT);        while (*ecode == OP_ALT);
# Line 3110  for (;;) Line 3761  for (;;)
3761        5.005. If there is an options reset, it will get obeyed in the normal        5.005. If there is an options reset, it will get obeyed in the normal
3762        course of events. */        course of events. */
3763    
3764        if (*ecode == OP_KET || eptr == eptrb)        if (*ecode == OP_KET || eptr == saved_eptr)
3765          {          {
3766          ecode += 3;          ecode += 3;
3767          break;          break;
# Line 3124  for (;;) Line 3775  for (;;)
3775        if (ecode[3] == OP_OPT)        if (ecode[3] == OP_OPT)
3776          {          {
3777          ims = (ims & ~PCRE_IMS) | ecode[4];          ims = (ims & ~PCRE_IMS) | ecode[4];
3778          DPRINTF(("ims set to %02x at group repeat\n", ims));          DPRINTF(("ims set to %02lx at group repeat\n", ims));
3779          }          }
3780    
3781        if (*ecode == OP_KETRMIN)        if (*ecode == OP_KETRMIN)
3782          {          {
3783          if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||          if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3784              match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;              match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3785                  return TRUE;
3786          }          }
3787        else  /* OP_KETRMAX */        else  /* OP_KETRMAX */
3788          {          {
3789          if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||          if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3790              match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;              match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3791          }          }
3792        }        }
3793      return FALSE;      return FALSE;
# Line 3156  for (;;) Line 3808  for (;;)
3808      case OP_BRAZERO:      case OP_BRAZERO:
3809        {        {
3810        const uschar *next = ecode+1;        const uschar *next = ecode+1;
3811        if (match(eptr, next, offset_top, md, ims, FALSE, eptr)) return TRUE;        if (match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
3812            return TRUE;
3813        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3814        ecode = next + 3;        ecode = next + 3;
3815        }        }
# Line 3166  for (;;) Line 3819  for (;;)
3819        {        {
3820        const uschar *next = ecode+1;        const uschar *next = ecode+1;
3821        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3822        if (match(eptr, next+3, offset_top, md, ims, FALSE, eptr)) return TRUE;        if (match(eptr, next+3, offset_top, md, ims, eptrb, match_isgroup))
3823            return TRUE;
3824        ecode++;        ecode++;
3825        }        }
3826      break;      break;
# Line 3181  for (;;) Line 3835  for (;;)
3835      case OP_KETRMAX:      case OP_KETRMAX:
3836        {        {
3837        const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];        const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3838          const uschar *saved_eptr = eptrb->saved_eptr;
3839    
3840          eptrb = eptrb->prev;    /* Back up the stack of bracket start pointers */
3841    
3842        if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||        if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3843            *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||            *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
# Line 3200  for (;;) Line 3857  for (;;)
3857          int number = *prev - OP_BRA;          int number = *prev - OP_BRA;
3858          int offset = number << 1;          int offset = number << 1;
3859    
3860          DPRINTF(("end bracket %d\n", number));  #ifdef DEBUG
3861            printf("end bracket %d", number);
3862            printf("\n");
3863    #endif
3864    
3865          if (number > 0)          if (number > 0)
3866            {            {
# Line 3218  for (;;) Line 3878  for (;;)
3878        the group. */        the group. */
3879    
3880        ims = original_ims;        ims = original_ims;
3881        DPRINTF(("ims reset to %02x\n", ims));        DPRINTF(("ims reset to %02lx\n", ims));
3882    
3883        /* For a non-repeating ket, just continue at this level. This also        /* For a non-repeating ket, just continue at this level. This also
3884        happens for a repeating ket if no characters were matched in the group.        happens for a repeating ket if no characters were matched in the group.
# Line 3226  for (;;) Line 3886  for (;;)
3886        5.005. If there is an options reset, it will get obeyed in the normal        5.005. If there is an options reset, it will get obeyed in the normal
3887        course of events. */        course of events. */
3888    
3889        if (*ecode == OP_KET || eptr == eptrb)        if (*ecode == OP_KET || eptr == saved_eptr)
3890          {          {
3891          ecode += 3;          ecode += 3;
3892          break;          break;
# Line 3237  for (;;) Line 3897  for (;;)
3897    
3898        if (*ecode == OP_KETRMIN)        if (*ecode == OP_KETRMIN)
3899          {          {
3900          if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||          if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3901              match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;              match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3902                  return TRUE;
3903          }          }
3904        else  /* OP_KETRMAX */        else  /* OP_KETRMAX */
3905          {          {
3906          if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||          if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3907              match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;              match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3908          }          }
3909        }        }
3910      return FALSE;      return FALSE;
# Line 3328  for (;;) Line 3989  for (;;)
3989      if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')      if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
3990        return FALSE;        return FALSE;
3991      if (eptr++ >= md->end_subject) return FALSE;      if (eptr++ >= md->end_subject) return FALSE;
3992    #ifdef SUPPORT_UTF8
3993        if (md->utf8)
3994          while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3995    #endif
3996      ecode++;      ecode++;
3997      break;      break;
3998    
# Line 3454  for (;;) Line 4119  for (;;)
4119          {          {
4120          for (i = min;; i++)          for (i = min;; i++)
4121            {            {
4122            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4123              return TRUE;              return TRUE;
4124            if (i >= max || !match_ref(offset, eptr, length, md, ims))            if (i >= max || !match_ref(offset, eptr, length, md, ims))
4125              return FALSE;              return FALSE;
# Line 3475  for (;;) Line 4140  for (;;)
4140            }            }
4141          while (eptr >= pp)          while (eptr >= pp)
4142            {            {
4143            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4144              return TRUE;              return TRUE;
4145            eptr -= length;            eptr -= length;
4146            }            }
# Line 3529  for (;;) Line 4194  for (;;)
4194        for (i = 1; i <= min; i++)        for (i = 1; i <= min; i++)
4195          {          {
4196          if (eptr >= md->end_subject) return FALSE;          if (eptr >= md->end_subject) return FALSE;
4197          c = *eptr++;          GETCHARINC(c, eptr)         /* Get character; increment eptr */
4198    
4199    #ifdef SUPPORT_UTF8
4200            /* We do not yet support class members > 255 */
4201            if (c > 255) return FALSE;
4202    #endif
4203    
4204          if ((data[c/8] & (1 << (c&7))) != 0) continue;          if ((data[c/8] & (1 << (c&7))) != 0) continue;
4205          return FALSE;          return FALSE;
4206          }          }
# Line 3546  for (;;) Line 4217  for (;;)
4217          {          {
4218          for (i = min;; i++)          for (i = min;; i++)
4219            {            {
4220            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4221              return TRUE;              return TRUE;
4222            if (i >= max || eptr >= md->end_subject) return FALSE;            if (i >= max || eptr >= md->end_subject) return FALSE;
4223            c = *eptr++;            GETCHARINC(c, eptr)       /* Get character; increment eptr */
4224    
4225    #ifdef SUPPORT_UTF8
4226              /* We do not yet support class members > 255 */
4227              if (c > 255) return FALSE;
4228    #endif
4229            if ((data[c/8] & (1 << (c&7))) != 0) continue;            if ((data[c/8] & (1 << (c&7))) != 0) continue;
4230            return FALSE;            return FALSE;
4231            }            }
# Line 3561  for (;;) Line 4237  for (;;)
4237        else        else
4238          {          {
4239          const uschar *pp = eptr;          const uschar *pp = eptr;
4240          for (i = min; i < max; eptr++, i++)          int len = 1;
4241            for (i = min; i < max; i++)
4242            {            {
4243            if (eptr >= md->end_subject) break;            if (eptr >= md->end_subject) break;
4244            c = *eptr;            GETCHARLEN(c, eptr, len)  /* Get character, set length if UTF-8 */
4245            if ((data[c/8] & (1 << (c&7))) != 0) continue;  
4246            break;  #ifdef SUPPORT_UTF8
4247              /* We do not yet support class members > 255 */
4248              if (c > 255) break;
4249    #endif
4250              if ((data[c/8] & (1 << (c&7))) == 0) break;
4251              eptr += len;
4252            }            }
4253    
4254          while (eptr >= pp)          while (eptr >= pp)
4255            if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))            {
4256              if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4257              return TRUE;              return TRUE;
4258    
4259    #ifdef SUPPORT_UTF8
4260              BACKCHAR(eptr)
4261    #endif
4262              }
4263          return FALSE;          return FALSE;
4264          }          }
4265        }        }
# Line 3667  for (;;) Line 4355  for (;;)
4355          {          {
4356          for (i = min;; i++)          for (i = min;; i++)
4357            {            {
4358            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4359              return TRUE;              return TRUE;
4360            if (i >= max || eptr >= md->end_subject ||            if (i >= max || eptr >= md->end_subject ||
4361                c != md->lcc[*eptr++])                c != md->lcc[*eptr++])
# Line 3684  for (;;) Line 4372  for (;;)
4372            eptr++;            eptr++;
4373            }            }
4374          while (eptr >= pp)          while (eptr >= pp)
4375            if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4376              return TRUE;              return TRUE;
4377          return FALSE;          return FALSE;
4378          }          }
# Line 3701  for (;;) Line 4389  for (;;)
4389          {          {
4390          for (i = min;; i++)          for (i = min;; i++)
4391            {            {
4392            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4393              return TRUE;              return TRUE;
4394            if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;            if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
4395            }            }
# Line 3716  for (;;) Line 4404  for (;;)
4404            eptr++;            eptr++;
4405            }            }
4406          while (eptr >= pp)          while (eptr >= pp)
4407           if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))           if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4408             return TRUE;             return TRUE;
4409          return FALSE;          return FALSE;
4410          }          }
# Line 3798  for (;;) Line 4486  for (;;)
4486          {          {
4487          for (i = min;; i++)          for (i = min;; i++)
4488            {            {
4489            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4490              return TRUE;              return TRUE;
4491            if (i >= max || eptr >= md->end_subject ||            if (i >= max || eptr >= md->end_subject ||
4492                c == md->lcc[*eptr++])                c == md->lcc[*eptr++])
# Line 3815  for (;;) Line 4503  for (;;)
4503            eptr++;            eptr++;
4504            }            }
4505          while (eptr >= pp)          while (eptr >= pp)
4506            if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4507              return TRUE;              return TRUE;
4508          return FALSE;          return FALSE;
4509          }          }
# Line 3832  for (;;) Line 4520  for (;;)
4520          {          {
4521          for (i = min;; i++)          for (i = min;; i++)
4522            {            {
4523            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4524              return TRUE;              return TRUE;
4525            if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;            if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
4526            }            }
# Line 3847  for (;;) Line 4535  for (;;)
4535            eptr++;            eptr++;
4536            }            }
4537          while (eptr >= pp)          while (eptr >= pp)
4538           if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))           if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4539             return TRUE;             return TRUE;
4540          return FALSE;          return FALSE;
4541          }          }
# Line 3891  for (;;) Line 4579  for (;;)
4579    
4580      /* First, ensure the minimum number of matches are present. Use inline      /* First, ensure the minimum number of matches are present. Use inline
4581      code for maximizing the speed, and do the type test once at the start      code for maximizing the speed, and do the type test once at the start
4582      (i.e. keep it out of the loop). Also test that there are at least the      (i.e. keep it out of the loop). Also we can test that there are at least
4583      minimum number of characters before we start. */      the minimum number of bytes before we start, except when doing '.' in
4584        UTF8 mode. Leave the test in in all cases; in the special case we have
4585        to test after each character. */
4586    
4587      if (min > md->end_subject - eptr) return FALSE;      if (min > md->end_subject - eptr) return FALSE;
4588      if (min > 0) switch(ctype)      if (min > 0) switch(ctype)
4589        {        {
4590        case OP_ANY:        case OP_ANY:
4591    #ifdef SUPPORT_UTF8
4592          if (md->utf8)
4593            {
4594            for (i = 1; i <= min; i++)
4595              {
4596              if (eptr >= md->end_subject ||
4597                 (*eptr++ == '\n' && (ims & PCRE_DOTALL) == 0))
4598                return FALSE;
4599              while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4600              }
4601            break;
4602            }
4603    #endif
4604          /* Non-UTF8 can be faster */
4605        if ((ims & PCRE_DOTALL) == 0)        if ((ims & PCRE_DOTALL) == 0)
4606          { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }          { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
4607        else eptr += min;        else eptr += min;
# Line 3947  for (;;) Line 4651  for (;;)
4651        {        {
4652        for (i = min;; i++)        for (i = min;; i++)
4653          {          {
4654          if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE;          if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) return TRUE;
4655          if (i >= max || eptr >= md->end_subject) return FALSE;          if (i >= max || eptr >= md->end_subject) return FALSE;
4656    
4657          c = *eptr++;          c = *eptr++;
# Line 3955  for (;;) Line 4659  for (;;)
4659            {            {
4660            case OP_ANY:            case OP_ANY:
4661            if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;            if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;
4662    #ifdef SUPPORT_UTF8
4663              if (md->utf8)
4664                while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4665    #endif
4666            break;            break;
4667    
4668            case OP_NOT_DIGIT:            case OP_NOT_DIGIT:
# Line 3994  for (;;) Line 4702  for (;;)
4702        switch(ctype)        switch(ctype)
4703          {          {
4704          case OP_ANY:          case OP_ANY:
4705    
4706            /* Special code is required for UTF8, but when the maximum is unlimited
4707            we don't need it. */
4708    
4709    #ifdef SUPPORT_UTF8
4710            if (md->utf8 && max < INT_MAX)
4711              {
4712              if ((ims & PCRE_DOTALL) == 0)
4713                {
4714                for (i = min; i < max; i++)
4715                  {
4716                  if (eptr >= md->end_subject || *eptr++ == '\n') break;
4717                  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4718                  }
4719                }
4720              else
4721                {
4722                for (i = min; i < max; i++)
4723                  {
4724                  eptr++;
4725                  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4726                  }
4727                }
4728              break;
4729              }
4730    #endif
4731            /* Non-UTF8 can be faster */
4732          if ((ims & PCRE_DOTALL) == 0)          if ((ims & PCRE_DOTALL) == 0)
4733            {            {
4734            for (i = min; i < max; i++)            for (i = min; i < max; i++)
# Line 4066  for (;;) Line 4801  for (;;)
4801          }          }
4802    
4803        while (eptr >= pp)        while (eptr >= pp)
4804          if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))          {
4805            if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4806            return TRUE;            return TRUE;
4807    #ifdef SUPPORT_UTF8
4808            if (md->utf8)
4809              while (eptr > pp && (*eptr & 0xc0) == 0x80) eptr--;
4810    #endif
4811            }
4812        return FALSE;        return FALSE;
4813        }        }
4814      /* Control never gets here */      /* Control never gets here */
# Line 4104  Arguments: Line 4845  Arguments:
4845    external_extra  points to "hints" from pcre_study() or is NULL    external_extra  points to "hints" from pcre_study() or is NULL
4846    subject         points to the subject string    subject         points to the subject string
4847    length          length of subject string (may contain binary zeros)    length          length of subject string (may contain binary zeros)
4848      start_offset    where to start in the subject string
4849    options         option bits    options         option bits
4850    offsets         points to a vector of ints to be filled in with offsets    offsets         points to a vector of ints to be filled in with offsets
4851    offsetcount     the number of elements in the vector    offsetcount     the number of elements in the vector
# Line 4116  Returns:          > 0 => success; value Line 4858  Returns:          > 0 => success; value
4858    
4859  int  int
4860  pcre_exec(const pcre *external_re, const pcre_extra *external_extra,  pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4861    const char *subject, int length, int options, int *offsets, int offsetcount)    const char *subject, int length, int start_offset, int options, int *offsets,
4862      int offsetcount)
4863  {  {
4864  int resetcount, ocount;  int resetcount, ocount;
4865  int first_char = -1;  int first_char = -1;
4866  int ims = 0;  int req_char = -1;
4867    int req_char2 = -1;
4868    unsigned long int ims = 0;
4869  match_data match_block;  match_data match_block;
4870  const uschar *start_bits = NULL;  const uschar *start_bits = NULL;
4871  const uschar *start_match = (const uschar *)subject;  const uschar *start_match = (const uschar *)subject + start_offset;
4872  const uschar *end_subject;  const uschar *end_subject;
4873    const uschar *req_char_ptr = start_match - 1;
4874  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
4875  const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;  const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4876  BOOL using_temporary_offsets = FALSE;  BOOL using_temporary_offsets = FALSE;
# Line 4137  if (re == NULL || subject == NULL || Line 4883  if (re == NULL || subject == NULL ||
4883     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4884  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4885    
4886    match_block.start_pattern = re->code;
4887  match_block.start_subject = (const uschar *)subject;  match_block.start_subject = (const uschar *)subject;
4888  match_block.end_subject = match_block.start_subject + length;  match_block.end_subject = match_block.start_subject + length;
4889  end_subject = match_block.end_subject;  end_subject = match_block.end_subject;
4890    
4891  match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;  match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4892    match_block.utf8 = (re->options & PCRE_UTF8) != 0;
4893    
4894  match_block.notbol = (options & PCRE_NOTBOL) != 0;  match_block.notbol = (options & PCRE_NOTBOL) != 0;
4895  match_block.noteol = (options & PCRE_NOTEOL) != 0;  match_block.noteol = (options & PCRE_NOTEOL) != 0;
4896    match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
4897    
4898  match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */  match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */
4899    
# Line 4215  if (!anchored) Line 4964  if (!anchored)
4964          start_bits = extra->start_bits;          start_bits = extra->start_bits;
4965    }    }
4966    
4967  /* Loop for unanchored matches; for anchored regexps the loop runs just once. */  /* For anchored or unanchored matches, there may be a "last known required
4968    character" set. If PCRE_CASELESS is set, implying that the match starts
4969    caselessly, or if there are any changes of this flag within the regex, set up
4970    both cases of the character. Otherwise set the two values the same, which will
4971    avoid duplicate testing (which takes significant time). This covers the vast
4972    majority of cases. It will be suboptimal when the case flag changes in a regex
4973    and the required character in fact is caseful. */
4974    
4975    if ((re->options & PCRE_REQCHSET) != 0)
4976      {
4977      req_char = re->req_char;
4978      req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
4979        (re->tables + fcc_offset)[req_char] : req_char;
4980      }
4981    
4982    /* Loop for handling unanchored repeated matching attempts; for anchored regexps
4983    the loop runs just once. */
4984    
4985  do  do
4986    {    {
# Line 4244  do Line 5009  do
5009    
5010    else if (startline)    else if (startline)
5011      {      {
5012      if (start_match > match_block.start_subject)      if (start_match > match_block.start_subject + start_offset)
5013        {        {
5014        while (start_match < end_subject && start_match[-1] != '\n')        while (start_match < end_subject && start_match[-1] != '\n')
5015          start_match++;          start_match++;
5016        }        }
5017      }      }
5018    
5019    /* Or to a non-unique first char */    /* Or to a non-unique first char after study */
5020    
5021    else if (start_bits != NULL)    else if (start_bits != NULL)
5022      {      {
# Line 4268  do Line 5033  do
5033    printf("\n");    printf("\n");
5034  #endif  #endif
5035    
5036      /* If req_char is set, we know that that character must appear in the subject
5037      for the match to succeed. If the first character is set, req_char must be
5038      later in the subject; otherwise the test starts at the match point. This
5039      optimization can save a huge amount of backtracking in patterns with nested
5040      unlimited repeats that aren't going to match. We don't know what the state of
5041      case matching may be when this character is hit, so test for it in both its
5042      cases if necessary. However, the different cased versions will not be set up
5043      unless PCRE_CASELESS was given or the casing state changes within the regex.
5044      Writing separate code makes it go faster, as does using an autoincrement and
5045      backing off on a match. */
5046    
5047      if (req_char >= 0)
5048        {
5049        register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
5050    
5051        /* We don't need to repeat the search if we haven't yet reached the
5052        place we found it at last time. */
5053    
5054        if (p > req_char_ptr)
5055          {
5056          /* Do a single test if no case difference is set up */
5057    
5058          if (req_char == req_char2)
5059            {
5060            while (p < end_subject)
5061              {
5062              if (*p++ == req_char) { p--; break; }
5063              }
5064            }
5065    
5066          /* Otherwise test for either case */
5067    
5068          else
5069            {
5070            while (p < end_subject)
5071              {
5072              register int pp = *p++;
5073              if (pp == req_char || pp == req_char2) { p--; break; }
5074              }
5075            }
5076    
5077          /* If we can't find the required character, break the matching loop */
5078    
5079          if (p >= end_subject) break;
5080    
5081          /* If we have found the required character, save the point where we
5082          found it, so that we don't search again next time round the loop if
5083          the start hasn't passed this character yet. */
5084    
5085          req_char_ptr = p;
5086          }
5087        }
5088    
5089    /* When a match occurs, substrings will be set for all internal extractions;    /* When a match occurs, substrings will be set for all internal extractions;
5090    we just need to set up the whole thing as substring 0 before returning. If    we just need to set up the whole thing as substring 0 before returning. If
5091    there were too many extractions, set the return code to zero. In the case    there were too many extractions, set the return code to zero. In the case
# Line 4275  do Line 5093  do
5093    those back references that we can. In this case there need not be overflow    those back references that we can. In this case there need not be overflow
5094    if certain parts of the pattern were not used. */    if certain parts of the pattern were not used. */
5095    
5096    if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))    match_block.start_match = start_match;
5097      if (!match(start_match, re->code, 2, &match_block, ims, NULL, match_isgroup))
5098      continue;      continue;
5099    
5100    /* Copy the offset information from temporary store if necessary */    /* Copy the offset information from temporary store if necessary */

Legend:
Removed from v.31  
changed lines
  Added in v.49

  ViewVC Help
Powered by ViewVC 1.1.5