/[pcre]/code/tags/pcre-2.08a/pcre.c
ViewVC logotype

Diff of /code/tags/pcre-2.08a/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

code/trunk/pcre.c revision 23 by nigel, Sat Feb 24 21:38:41 2007 UTC code/tags/pcre-2.08a/pcre.c revision 42 by nigel, Sat Feb 24 21:39:19 2007 UTC
# Line 9  the file Tech.Notes for some information Line 9  the file Tech.Notes for some information
9    
10  Written by: Philip Hazel <ph10@cam.ac.uk>  Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12             Copyright (c) 1998 University of Cambridge             Copyright (c) 1997-1999 University of Cambridge
13    
14  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
15  Permission is granted to anyone to use this software for any purpose on any  Permission is granted to anyone to use this software for any purpose on any
# Line 25  restrictions: Line 25  restrictions:
25    
26  3. Altered versions must be plainly marked as such, and must not be  3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.     misrepresented as being the original software.
28    
29    4. If PCRE is embedded in any software that is released under the GNU
30       General Purpose Licence (GPL), then the terms of that licence shall
31       supersede any condition above with which it is incompatible.
32  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
33  */  */
34    
# Line 107  static const short int escapes[] = { Line 111  static const short int escapes[] = {
111    
112  static BOOL  static BOOL
113    compile_regex(int, int, int *, uschar **, const uschar **, const char **,    compile_regex(int, int, int *, uschar **, const uschar **, const char **,
114      BOOL, int);      BOOL, int, int *, int *, compile_data *);
   
 /* Structure for passing "static" information around between the functions  
 doing the matching, so that they are thread-safe. */  
   
 typedef struct match_data {  
   int    errorcode;             /* As it says */  
   int   *offset_vector;         /* Offset vector */  
   int    offset_end;            /* One past the end */  
   int    offset_max;            /* The maximum usable for return data */  
   BOOL   offset_overflow;       /* Set if too many extractions */  
   BOOL   notbol;                /* NOTBOL flag */  
   BOOL   noteol;                /* NOTEOL flag */  
   BOOL   endonly;               /* Dollar not before final \n */  
   const uschar *start_subject;  /* Start of the subject string */  
   const uschar *end_subject;    /* End of the subject string */  
   const uschar *end_match_ptr;  /* Subject position at end match */  
   int     end_offset_top;       /* Highwater mark at end of match */  
 } match_data;  
115    
116    
117    
# Line 145  void  (*pcre_free)(void *) = free; Line 131  void  (*pcre_free)(void *) = free;
131    
132    
133  /*************************************************  /*************************************************
134    *             Default character tables           *
135    *************************************************/
136    
137    /* A default set of character tables is included in the PCRE binary. Its source
138    is built by the maketables auxiliary program, which uses the default C ctypes
139    functions, and put in the file chartables.c. These tables are used by PCRE
140    whenever the caller of pcre_compile() does not provide an alternate set of
141    tables. */
142    
143    #include "chartables.c"
144    
145    
146    
147    /*************************************************
148  *          Return version string                 *  *          Return version string                 *
149  *************************************************/  *************************************************/
150    
151    #define STRING(a)  # a
152    #define XSTRING(s) STRING(s)
153    
154  const char *  const char *
155  pcre_version(void)  pcre_version(void)
156  {  {
157  return PCRE_VERSION;  return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
158  }  }
159    
160    
# Line 162  return PCRE_VERSION; Line 165  return PCRE_VERSION;
165  *************************************************/  *************************************************/
166    
167  /* This function picks potentially useful data out of the private  /* This function picks potentially useful data out of the private
168  structure.  structure. The public options are passed back in an int - though the
169    re->options field has been expanded to a long int, all the public options
170    are at the low end of it, and so even on 16-bit systems this will still be OK.
171    Therefore, I haven't changed the API for pcre_info().
172    
173  Arguments:  Arguments:
174    external_re   points to compiled code    external_re   points to compiled code
# Line 181  pcre_info(const pcre *external_re, int * Line 187  pcre_info(const pcre *external_re, int *
187  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
188  if (re == NULL) return PCRE_ERROR_NULL;  if (re == NULL) return PCRE_ERROR_NULL;
189  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
190  if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);  if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
191  if (first_char != NULL)  if (first_char != NULL)
192    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
193       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
# Line 237  Arguments: Line 243  Arguments:
243    bracount   number of previous extracting brackets    bracount   number of previous extracting brackets
244    options    the options bits    options    the options bits
245    isclass    TRUE if inside a character class    isclass    TRUE if inside a character class
246      cd         pointer to char tables block
247    
248  Returns:     zero or positive => a data character  Returns:     zero or positive => a data character
249               negative => a special escape sequence               negative => a special escape sequence
# Line 245  Returns:     zero or positive => a data Line 252  Returns:     zero or positive => a data
252    
253  static int  static int
254  check_escape(const uschar **ptrptr, const char **errorptr, int bracount,  check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
255    int options, BOOL isclass)    int options, BOOL isclass, compile_data *cd)
256  {  {
257  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
258  int c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */  int c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */
# Line 288  else Line 295  else
295        {        {
296        oldptr = ptr;        oldptr = ptr;
297        c -= '0';        c -= '0';
298        while ((pcre_ctypes[ptr[1]] & ctype_digit) != 0)        while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
299          c = c * 10 + *(++ptr) - '0';          c = c * 10 + *(++ptr) - '0';
300        if (c < 10 || c <= bracount)        if (c < 10 || c <= bracount)
301          {          {
# Line 314  else Line 321  else
321    
322      case '0':      case '0':
323      c -= '0';      c -= '0';
324      while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&      while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
325        ptr[1] != '8' && ptr[1] != '9')        ptr[1] != '8' && ptr[1] != '9')
326          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
327      break;      break;
# Line 323  else Line 330  else
330    
331      case 'x':      case 'x':
332      c = 0;      c = 0;
333      while (i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
334        {        {
335        ptr++;        ptr++;
336        c = c * 16 + pcre_lcc[*ptr] -        c = c * 16 + cd->lcc[*ptr] -
337          (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');          (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
338        }        }
339      break;      break;
340    
# Line 341  else Line 348  else
348    
349      /* A letter is upper-cased; then the 0x40 bit is flipped */      /* A letter is upper-cased; then the 0x40 bit is flipped */
350    
351      if (c >= 'a' && c <= 'z') c = pcre_fcc[c];      if (c >= 'a' && c <= 'z') c = cd->fcc[c];
352      c ^= 0x40;      c ^= 0x40;
353      break;      break;
354    
355      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
356      other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,      other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
357      for Perl compatibility, it is a literal. */      for Perl compatibility, it is a literal. This code looks a bit odd, but
358        there used to be some cases other than the default, and there may be again
359        in future, so I haven't "optimized" it. */
360    
361      default:      default:
362      if ((options & PCRE_EXTRA) != 0) switch(c)      if ((options & PCRE_EXTRA) != 0) switch(c)
# Line 377  where the ddds are digits. Line 386  where the ddds are digits.
386    
387  Arguments:  Arguments:
388    p         pointer to the first char after '{'    p         pointer to the first char after '{'
389      cd        pointer to char tables block
390    
391  Returns:    TRUE or FALSE  Returns:    TRUE or FALSE
392  */  */
393    
394  static BOOL  static BOOL
395  is_counted_repeat(const uschar *p)  is_counted_repeat(const uschar *p, compile_data *cd)
396  {  {
397  if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;  if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
398  while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;  while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
399  if (*p == '}') return TRUE;  if (*p == '}') return TRUE;
400    
401  if (*p++ != ',') return FALSE;  if (*p++ != ',') return FALSE;
402  if (*p == '}') return TRUE;  if (*p == '}') return TRUE;
403    
404  if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;  if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
405  while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;  while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
406  return (*p == '}');  return (*p == '}');
407  }  }
408    
# Line 412  Arguments: Line 422  Arguments:
422    maxp       pointer to int for max    maxp       pointer to int for max
423               returned as -1 if no max               returned as -1 if no max
424    errorptr   points to pointer to error message    errorptr   points to pointer to error message
425      cd         pointer to character tables block
426    
427  Returns:     pointer to '}' on success;  Returns:     pointer to '}' on success;
428               current ptr on error, with errorptr set               current ptr on error, with errorptr set
429  */  */
430    
431  static const uschar *  static const uschar *
432  read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)  read_repeat_counts(const uschar *p, int *minp, int *maxp,
433      const char **errorptr, compile_data *cd)
434  {  {
435  int min = 0;  int min = 0;
436  int max = -1;  int max = -1;
437    
438  while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';  while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
439    
440  if (*p == '}') max = min; else  if (*p == '}') max = min; else
441    {    {
442    if (*(++p) != '}')    if (*(++p) != '}')
443      {      {
444      max = 0;      max = 0;
445      while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';      while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
446      if (max < min)      if (max < min)
447        {        {
448        *errorptr = ERR4;        *errorptr = ERR4;
# Line 526  for (;;) Line 538  for (;;)
538    
539      case OP_REVERSE:      case OP_REVERSE:
540      cc++;      cc++;
541        /* Fall through */
542    
543      case OP_CREF:      case OP_CREF:
544      case OP_OPT:      case OP_OPT:
# Line 615  for (;;) Line 628  for (;;)
628  /* Scan the pattern, compiling it into the code vector.  /* Scan the pattern, compiling it into the code vector.
629    
630  Arguments:  Arguments:
631    options     the option bits    options      the option bits
632    brackets    points to number of brackets used    brackets     points to number of brackets used
633    code        points to the pointer to the current code point    code         points to the pointer to the current code point
634    ptrptr      points to the current pattern pointer    ptrptr       points to the current pattern pointer
635    errorptr    points to pointer to error message    errorptr     points to pointer to error message
636    optchanged  set to the value of the last OP_OPT item compiled    optchanged   set to the value of the last OP_OPT item compiled
637      reqchar      set to the last literal character required, else -1
638      countlits    set to count of mandatory literal characters
639      cd           contains pointers to tables
640    
641  Returns:      TRUE on success  Returns:       TRUE on success
642                FALSE, with *errorptr set on error                 FALSE, with *errorptr set on error
643  */  */
644    
645  static BOOL  static BOOL
646  compile_branch(int options, int *brackets, uschar **codeptr,  compile_branch(int options, int *brackets, uschar **codeptr,
647    const uschar **ptrptr, const char **errorptr, int *optchanged)    const uschar **ptrptr, const char **errorptr, int *optchanged,
648      int *reqchar, int *countlits, compile_data *cd)
649  {  {
650  int repeat_type, op_type;  int repeat_type, op_type;
651  int repeat_min, repeat_max;  int repeat_min, repeat_max;
652  int bravalue, length;  int bravalue, length;
653  int greedy_default, greedy_non_default;  int greedy_default, greedy_non_default;
654    int prevreqchar;
655    int condcount = 0;
656    int subcountlits = 0;
657  register int c;  register int c;
658  register uschar *code = *codeptr;  register uschar *code = *codeptr;
659  uschar *tempcode;  uschar *tempcode;
# Line 647  uschar class[32]; Line 667  uschar class[32];
667  greedy_default = ((options & PCRE_UNGREEDY) != 0);  greedy_default = ((options & PCRE_UNGREEDY) != 0);
668  greedy_non_default = greedy_default ^ 1;  greedy_non_default = greedy_default ^ 1;
669    
670    /* Initialize no required char, and count of literals */
671    
672    *reqchar = prevreqchar = -1;
673    *countlits = 0;
674    
675  /* Switch on next character until the end of the branch */  /* Switch on next character until the end of the branch */
676    
677  for (;; ptr++)  for (;; ptr++)
# Line 656  for (;; ptr++) Line 681  for (;; ptr++)
681    int class_lastchar;    int class_lastchar;
682    int newoptions;    int newoptions;
683    int condref;    int condref;
684      int subreqchar;
685    
686    c = *ptr;    c = *ptr;
687    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
688      {      {
689      if ((pcre_ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
690      if (c == '#')      if (c == '#')
691        {        {
692        while ((c = *(++ptr)) != 0 && c != '\n');        while ((c = *(++ptr)) != 0 && c != '\n');
# Line 748  for (;; ptr++) Line 774  for (;; ptr++)
774    
775        if (c == '\\')        if (c == '\\')
776          {          {
777          c = check_escape(&ptr, errorptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
778          if (-c == ESC_b) c = '\b';          if (-c == ESC_b) c = '\b';
779          else if (c < 0)          else if (c < 0)
780            {            {
781              register const uschar *cbits = cd->cbits;
782            class_charcount = 10;            class_charcount = 10;
783            switch (-c)            switch (-c)
784              {              {
785              case ESC_d:              case ESC_d:
786              for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
787              continue;              continue;
788    
789              case ESC_D:              case ESC_D:
790              for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
791              continue;              continue;
792    
793              case ESC_w:              case ESC_w:
794              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++)
795                class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);                class[c] |= (cbits[c+cbit_digit] | cbits[c+cbit_word]);
796              continue;              continue;
797    
798              case ESC_W:              case ESC_W:
799              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++)
800                class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);                class[c] |= ~(cbits[c+cbit_digit] | cbits[c+cbit_word]);
801              continue;              continue;
802    
803              case ESC_s:              case ESC_s:
804              for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
805              continue;              continue;
806    
807              case ESC_S:              case ESC_S:
808              for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
809              continue;              continue;
810    
811              default:              default:
# Line 810  for (;; ptr++) Line 837  for (;; ptr++)
837    
838          if (d == '\\')          if (d == '\\')
839            {            {
840            d = check_escape(&ptr, errorptr, *brackets, options, TRUE);            d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
841            if (d < 0)            if (d < 0)
842              {              {
843              if (d == -ESC_b) d = '\b'; else              if (d == -ESC_b) d = '\b'; else
# Line 832  for (;; ptr++) Line 859  for (;; ptr++)
859            class[c/8] |= (1 << (c&7));            class[c/8] |= (1 << (c&7));
860            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
861              {              {
862              int uc = pcre_fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
863              class[uc/8] |= (1 << (uc&7));              class[uc/8] |= (1 << (uc&7));
864              }              }
865            class_charcount++;                /* in case a one-char range */            class_charcount++;                /* in case a one-char range */
# Line 847  for (;; ptr++) Line 874  for (;; ptr++)
874        class [c/8] |= (1 << (c&7));        class [c/8] |= (1 << (c&7));
875        if ((options & PCRE_CASELESS) != 0)        if ((options & PCRE_CASELESS) != 0)
876          {          {
877          c = pcre_fcc[c];   /* flip case */          c = cd->fcc[c];   /* flip case */
878          class[c/8] |= (1 << (c&7));          class[c/8] |= (1 << (c&7));
879          }          }
880        class_charcount++;        class_charcount++;
# Line 894  for (;; ptr++) Line 921  for (;; ptr++)
921      /* Various kinds of repeat */      /* Various kinds of repeat */
922    
923      case '{':      case '{':
924      if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;      if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
925      ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);      ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
926      if (*errorptr != NULL) goto FAILED;      if (*errorptr != NULL) goto FAILED;
927      goto REPEAT;      goto REPEAT;
928    
# Line 928  for (;; ptr++) Line 955  for (;; ptr++)
955        { repeat_type = greedy_non_default; ptr++; }        { repeat_type = greedy_non_default; ptr++; }
956      else repeat_type = greedy_default;      else repeat_type = greedy_default;
957    
     /* If the maximum is zero then the minimum must also be zero; Perl allows  
     this case, so we do too - by simply omitting the item altogether. */  
   
     if (repeat_max == 0) code = previous;  
   
958      /* If previous was a string of characters, chop off the last one and use it      /* If previous was a string of characters, chop off the last one and use it
959      as the subject of the repeat. If there was only one character, we can      as the subject of the repeat. If there was only one character, we can
960      abolish the previous item altogether. */      abolish the previous item altogether. A repeat with a zero minimum wipes
961        out any reqchar setting, backing up to the previous value. We must also
962        adjust the countlits value. */
963    
964      else if (*previous == OP_CHARS)      if (*previous == OP_CHARS)
965        {        {
966        int len = previous[1];        int len = previous[1];
967    
968          if (repeat_min == 0) *reqchar = prevreqchar;
969          *countlits += repeat_min - 1;
970    
971        if (len == 1)        if (len == 1)
972          {          {
973          c = previous[2];          c = previous[2];
# Line 978  for (;; ptr++) Line 1006  for (;; ptr++)
1006        code = previous;        code = previous;
1007    
1008        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
1009        repeat_type += op_type;      /* Combine both values for many cases */  
1010          /* If the maximum is zero then the minimum must also be zero; Perl allows
1011          this case, so we do too - by simply omitting the item altogether. */
1012    
1013          if (repeat_max == 0) goto END_REPEAT;
1014    
1015          /* Combine the op_type with the repeat_type */
1016    
1017          repeat_type += op_type;
1018    
1019        /* A minimum of zero is handled either as the special case * or ?, or as        /* A minimum of zero is handled either as the special case * or ?, or as
1020        an UPTO, with the maximum given. */        an UPTO, with the maximum given. */
# Line 1055  for (;; ptr++) Line 1091  for (;; ptr++)
1091        }        }
1092    
1093      /* If previous was a character class or a back reference, we put the repeat      /* If previous was a character class or a back reference, we put the repeat
1094      stuff after it. */      stuff after it, but just skip the item if the repeat was {0,0}. */
1095    
1096      else if (*previous == OP_CLASS || *previous == OP_REF)      else if (*previous == OP_CLASS || *previous == OP_REF)
1097        {        {
1098          if (repeat_max == 0)
1099            {
1100            code = previous;
1101            goto END_REPEAT;
1102            }
1103        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
1104          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
1105        else if (repeat_min == 1 && repeat_max == -1)        else if (repeat_min == 1 && repeat_max == -1)
# Line 1082  for (;; ptr++) Line 1123  for (;; ptr++)
1123      else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||      else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1124               (int)*previous == OP_COND)               (int)*previous == OP_COND)
1125        {        {
1126        int i, ketoffset = 0;        register int i;
1127          int ketoffset = 0;
1128        int len = code - previous;        int len = code - previous;
1129          uschar *bralink = NULL;
1130    
1131        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
1132        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
# Line 1098  for (;; ptr++) Line 1141  for (;; ptr++)
1141          ketoffset = code - ket;          ketoffset = code - ket;
1142          }          }
1143    
1144        /* If the minimum is greater than zero, and the maximum is unlimited or        /* The case of a zero minimum is special because of the need to stick
1145        equal to the minimum, the first copy remains where it is, and is        OP_BRAZERO in front of it, and because the group appears once in the
1146        replicated up to the minimum number of times. This case includes the +        data, whereas in other cases it appears the minimum number of times. For
1147        repeat, but of course no replication is needed in that case. */        this reason, it is simplest to treat this case separately, as otherwise
1148          the code gets far too messy. There are several special subcases when the
1149          minimum is zero. */
1150    
1151        if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))        if (repeat_min == 0)
1152          {          {
1153          for (i = 1; i < repeat_min; i++)          /* If we set up a required char from the bracket, we must back off
1154            to the previous value and reset the countlits value too. */
1155    
1156            if (subcountlits > 0)
1157            {            {
1158            memcpy(code, previous, len);            *reqchar = prevreqchar;
1159            code += len;            *countlits -= subcountlits;
1160            }            }
         }  
1161    
1162        /* If the minimum is zero, stick BRAZERO in front of the first copy.          /* If the maximum is also zero, we just omit the group from the output
1163        Then, if there is a fixed upper limit, replicated up to that many times,          altogether. */
       sticking BRAZERO in front of all the optional ones. */  
1164    
1165        else          if (repeat_max == 0)
1166          {            {
1167          if (repeat_min == 0)            code = previous;
1168              goto END_REPEAT;
1169              }
1170    
1171            /* If the maximum is 1 or unlimited, we just have to stick in the
1172            BRAZERO and do no more at this point. */
1173    
1174            if (repeat_max <= 1)
1175            {            {
1176            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
1177            code++;            code++;
1178            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
1179            }            }
1180    
1181            /* If the maximum is greater than 1 and limited, we have to replicate
1182            in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1183            The first one has to be handled carefully because it's the original
1184            copy, which has to be moved up. The remainder can be handled by code
1185            that is common with the non-zero minimum case below. We just have to
1186            adjust the value of repeat_max, since one less copy is required. */
1187    
1188            else
1189              {
1190              int offset;
1191              memmove(previous+4, previous, len);
1192              code += 4;
1193              *previous++ = OP_BRAZERO + repeat_type;
1194              *previous++ = OP_BRA;
1195    
1196              /* We chain together the bracket offset fields that have to be
1197              filled in later when the ends of the brackets are reached. */
1198    
1199              offset = (bralink == NULL)? 0 : previous - bralink;
1200              bralink = previous;
1201              *previous++ = offset >> 8;
1202              *previous++ = offset & 255;
1203              }
1204    
1205            repeat_max--;
1206            }
1207    
1208          /* If the minimum is greater than zero, replicate the group as many
1209          times as necessary, and adjust the maximum to the number of subsequent
1210          copies that we need. */
1211    
1212          else
1213            {
1214          for (i = 1; i < repeat_min; i++)          for (i = 1; i < repeat_min; i++)
1215            {            {
1216            memcpy(code, previous, len);            memcpy(code, previous, len);
1217            code += len;            code += len;
1218            }            }
1219            if (repeat_max > 0) repeat_max -= repeat_min;
1220            }
1221    
1222          /* This code is common to both the zero and non-zero minimum cases. If
1223          the maximum is limited, it replicates the group in a nested fashion,
1224          remembering the bracket starts on a stack. In the case of a zero minimum,
1225          the first one was set up above. In all cases the repeat_max now specifies
1226          the number of additional copies needed. */
1227    
1228          for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)        if (repeat_max >= 0)
1229            {
1230            for (i = repeat_max - 1; i >= 0; i--)
1231            {            {
1232            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
1233    
1234              /* All but the final copy start a new nesting, maintaining the
1235              chain of brackets outstanding. */
1236    
1237              if (i != 0)
1238                {
1239                int offset;
1240                *code++ = OP_BRA;
1241                offset = (bralink == NULL)? 0 : code - bralink;
1242                bralink = code;
1243                *code++ = offset >> 8;
1244                *code++ = offset & 255;
1245                }
1246    
1247            memcpy(code, previous, len);            memcpy(code, previous, len);
1248            code += len;            code += len;
1249            }            }
1250    
1251            /* Now chain through the pending brackets, and fill in their length
1252            fields (which are holding the chain links pro tem). */
1253    
1254            while (bralink != NULL)
1255              {
1256              int oldlinkoffset;
1257              int offset = code - bralink + 1;
1258              uschar *bra = code - offset;
1259              oldlinkoffset = (bra[1] << 8) + bra[2];
1260              bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1261              *code++ = OP_KET;
1262              *code++ = bra[1] = offset >> 8;
1263              *code++ = bra[2] = (offset & 255);
1264              }
1265          }          }
1266    
1267        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
# Line 1144  for (;; ptr++) Line 1269  for (;; ptr++)
1269        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
1270        correct offset was computed above. */        correct offset was computed above. */
1271    
1272        if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type;        else code[-ketoffset] = OP_KETRMAX + repeat_type;
1273        }        }
1274    
1275      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 1157  for (;; ptr++) Line 1282  for (;; ptr++)
1282    
1283      /* In all cases we no longer have a previous item. */      /* In all cases we no longer have a previous item. */
1284    
1285        END_REPEAT:
1286      previous = NULL;      previous = NULL;
1287      break;      break;
1288    
# Line 1191  for (;; ptr++) Line 1317  for (;; ptr++)
1317    
1318          case '(':          case '(':
1319          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
1320          if ((pcre_ctypes[*(++ptr)] & ctype_digit) != 0)          if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1321            {            {
1322            condref = *ptr - '0';            condref = *ptr - '0';
1323            while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';            while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
# Line 1324  for (;; ptr++) Line 1450  for (;; ptr++)
1450           errorptr,                     /* Where to put an error message */           errorptr,                     /* Where to put an error message */
1451           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
1452            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1453           condref))                     /* Condition reference number */           condref,                      /* Condition reference number */
1454             &subreqchar,                  /* For possible last char */
1455             &subcountlits,                /* For literal count */
1456             cd))                          /* Tables block */
1457        goto FAILED;        goto FAILED;
1458    
1459      /* At the end of compiling, code is still pointing to the start of the      /* At the end of compiling, code is still pointing to the start of the
# Line 1337  for (;; ptr++) Line 1466  for (;; ptr++)
1466    
1467      if (bravalue == OP_COND)      if (bravalue == OP_COND)
1468        {        {
       int branchcount = 0;  
1469        uschar *tc = code;        uschar *tc = code;
1470          condcount = 0;
1471    
1472        do {        do {
1473           branchcount++;           condcount++;
1474           tc += (tc[1] << 8) | tc[2];           tc += (tc[1] << 8) | tc[2];
1475           }           }
1476        while (*tc != OP_KET);        while (*tc != OP_KET);
1477    
1478        if (branchcount > 2)        if (condcount > 2)
1479          {          {
1480          *errorptr = ERR27;          *errorptr = ERR27;
1481          goto FAILED;          goto FAILED;
1482          }          }
1483        }        }
1484    
1485        /* Handle updating of the required character. If the subpattern didn't
1486        set one, leave it as it was. Otherwise, update it for normal brackets of
1487        all kinds, forward assertions, and conditions with two branches. Don't
1488        update the literal count for forward assertions, however. If the bracket
1489        is followed by a quantifier with zero repeat, we have to back off. Hence
1490        the definition of prevreqchar and subcountlits outside the main loop so
1491        that they can be accessed for the back off. */
1492    
1493        if (subreqchar > 0 &&
1494             (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1495             (bravalue == OP_COND && condcount == 2)))
1496          {
1497          prevreqchar = *reqchar;
1498          *reqchar = subreqchar;
1499          if (bravalue != OP_ASSERT) *countlits += subcountlits;
1500          }
1501    
1502      /* Now update the main code pointer to the end of the group. */      /* Now update the main code pointer to the end of the group. */
1503    
1504      code = tempcode;      code = tempcode;
# Line 1372  for (;; ptr++) Line 1518  for (;; ptr++)
1518    
1519      case '\\':      case '\\':
1520      tempptr = ptr;      tempptr = ptr;
1521      c = check_escape(&ptr, errorptr, *brackets, options, FALSE);      c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1522    
1523      /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values      /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1524      are arranged to be the negation of the corresponding OP_values. For the      are arranged to be the negation of the corresponding OP_values. For the
# Line 1417  for (;; ptr++) Line 1563  for (;; ptr++)
1563        {        {
1564        if ((options & PCRE_EXTENDED) != 0)        if ((options & PCRE_EXTENDED) != 0)
1565          {          {
1566          if ((pcre_ctypes[c] & ctype_space) != 0) continue;          if ((cd->ctypes[c] & ctype_space) != 0) continue;
1567          if (c == '#')          if (c == '#')
1568            {            {
1569            while ((c = *(++ptr)) != 0 && c != '\n');            while ((c = *(++ptr)) != 0 && c != '\n');
# Line 1433  for (;; ptr++) Line 1579  for (;; ptr++)
1579        if (c == '\\')        if (c == '\\')
1580          {          {
1581          tempptr = ptr;          tempptr = ptr;
1582          c = check_escape(&ptr, errorptr, *brackets, options, FALSE);          c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1583          if (c < 0) { ptr = tempptr; break; }          if (c < 0) { ptr = tempptr; break; }
1584          }          }
1585    
# Line 1445  for (;; ptr++) Line 1591  for (;; ptr++)
1591    
1592      /* This "while" is the end of the "do" above. */      /* This "while" is the end of the "do" above. */
1593    
1594      while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);      while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
1595    
1596        /* Update the last character and the count of literals */
1597    
1598        prevreqchar = (length > 1)? code[-2] : *reqchar;
1599        *reqchar = code[-1];
1600        *countlits += length;
1601    
1602      /* Compute the length and set it in the data vector, and advance to      /* Compute the length and set it in the data vector, and advance to
1603      the next state. */      the next state. */
# Line 1490  Argument: Line 1642  Argument:
1642    errorptr    -> pointer to error message    errorptr    -> pointer to error message
1643    lookbehind  TRUE if this is a lookbehind assertion    lookbehind  TRUE if this is a lookbehind assertion
1644    condref     > 0 for OPT_CREF setting at start of conditional group    condref     > 0 for OPT_CREF setting at start of conditional group
1645      reqchar     -> place to put the last required character, or a negative number
1646      countlits   -> place to put the shortest literal count of any branch
1647      cd          points to the data block with tables pointers
1648    
1649  Returns:      TRUE on success  Returns:      TRUE on success
1650  */  */
1651    
1652  static BOOL  static BOOL
1653  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
1654    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref)    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
1655      int *reqchar, int *countlits, compile_data *cd)
1656  {  {
1657  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
1658  uschar *code = *codeptr;  uschar *code = *codeptr;
# Line 1504  uschar *last_branch = code; Line 1660  uschar *last_branch = code;
1660  uschar *start_bracket = code;  uschar *start_bracket = code;
1661  uschar *reverse_count = NULL;  uschar *reverse_count = NULL;
1662  int oldoptions = options & PCRE_IMS;  int oldoptions = options & PCRE_IMS;
1663    int branchreqchar, branchcountlits;
1664    
1665    *reqchar = -1;
1666    *countlits = INT_MAX;
1667  code += 3;  code += 3;
1668    
1669  /* At the start of a reference-based conditional group, insert the reference  /* At the start of a reference-based conditional group, insert the reference
# Line 1543  for (;;) Line 1702  for (;;)
1702    
1703    /* Now compile the branch */    /* Now compile the branch */
1704    
1705    if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged))    if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
1706          &branchreqchar, &branchcountlits, cd))
1707      {      {
1708      *ptrptr = ptr;      *ptrptr = ptr;
1709      return FALSE;      return FALSE;
# Line 1555  for (;;) Line 1715  for (;;)
1715    last_branch[1] = length >> 8;    last_branch[1] = length >> 8;
1716    last_branch[2] = length & 255;    last_branch[2] = length & 255;
1717    
1718      /* Save the last required character if all branches have the same; a current
1719      value of -1 means unset, while -2 means "previous branch had no last required
1720      char".  */
1721    
1722      if (*reqchar != -2)
1723        {
1724        if (branchreqchar >= 0)
1725          {
1726          if (*reqchar == -1) *reqchar = branchreqchar;
1727          else if (*reqchar != branchreqchar) *reqchar = -2;
1728          }
1729        else *reqchar = -2;
1730        }
1731    
1732      /* Keep the shortest literal count */
1733    
1734      if (branchcountlits < *countlits) *countlits = branchcountlits;
1735      DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
1736    
1737    /* If lookbehind, check that this branch matches a fixed-length string,    /* If lookbehind, check that this branch matches a fixed-length string,
1738    and put the length into the OP_REVERSE item. Temporarily mark the end of    and put the length into the OP_REVERSE item. Temporarily mark the end of
1739    the branch with OP_END. */    the branch with OP_END. */
# Line 1649  for (;;) Line 1828  for (;;)
1828      code += 2;      code += 2;
1829      break;      break;
1830    
1831        case OP_WORD_BOUNDARY:
1832        case OP_NOT_WORD_BOUNDARY:
1833        code++;
1834        break;
1835    
1836      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
1837      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1838      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
# Line 1676  all of whose alternatives start with OP_ Line 1860  all of whose alternatives start with OP_
1860  it's anchored. However, if this is a multiline pattern, then only OP_SOD  it's anchored. However, if this is a multiline pattern, then only OP_SOD
1861  counts, since OP_CIRC can match in the middle.  counts, since OP_CIRC can match in the middle.
1862    
1863  A branch is also implicitly anchored if it starts with .* because that will try  A branch is also implicitly anchored if it starts with .* and DOTALL is set,
1864  the rest of the pattern at all possible matching points, so there is no point  because that will try the rest of the pattern at all possible matching points,
1865  trying them again.  so there is no point trying them again.
1866    
1867  Arguments:  Arguments:
1868    code       points to start of expression (the bracket)    code       points to start of expression (the bracket)
# Line 1696  do { Line 1880  do {
1880     register int op = *scode;     register int op = *scode;
1881     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1882       { if (!is_anchored(scode, options)) return FALSE; }       { if (!is_anchored(scode, options)) return FALSE; }
1883     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
1884                (*options & PCRE_DOTALL) != 0)
1885       { if (scode[1] != OP_ANY) return FALSE; }       { if (scode[1] != OP_ANY) return FALSE; }
1886     else if (op != OP_SOD &&     else if (op != OP_SOD &&
1887             ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))             ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
# Line 1710  return TRUE; Line 1895  return TRUE;
1895    
1896    
1897  /*************************************************  /*************************************************
1898  *     Check for start with \n line expression    *  *         Check for starting with ^ or .*        *
1899  *************************************************/  *************************************************/
1900    
1901  /* This is called for multiline expressions to try to find out if every branch  /* This is called to find out if every branch starts with ^ or .* so that
1902  starts with ^ so that "first char" processing can be done to speed things up.  "first char" processing can be done to speed things up in multiline
1903    matching and for non-DOTALL patterns that start with .* (which must start at
1904    the beginning or after \n).
1905    
1906  Argument:  points to start of expression (the bracket)  Argument:  points to start of expression (the bracket)
1907  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
# Line 1728  do { Line 1915  do {
1915     register int op = *scode;     register int op = *scode;
1916     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1917       { if (!is_startline(scode)) return FALSE; }       { if (!is_startline(scode)) return FALSE; }
1918       else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1919         { if (scode[1] != OP_ANY) return FALSE; }
1920     else if (op != OP_CIRC) return FALSE;     else if (op != OP_CIRC) return FALSE;
1921     code += (code[1] << 8) + code[2];     code += (code[1] << 8) + code[2];
1922     }     }
# Line 1813  Arguments: Line 2002  Arguments:
2002    options      various option bits    options      various option bits
2003    errorptr     pointer to pointer to error text    errorptr     pointer to pointer to error text
2004    erroroffset  ptr offset in pattern where error was detected    erroroffset  ptr offset in pattern where error was detected
2005      tables       pointer to character tables or NULL
2006    
2007  Returns:       pointer to compiled data block, or NULL on error,  Returns:       pointer to compiled data block, or NULL on error,
2008                 with errorptr and erroroffset set                 with errorptr and erroroffset set
# Line 1820  Returns:       pointer to compiled data Line 2010  Returns:       pointer to compiled data
2010    
2011  pcre *  pcre *
2012  pcre_compile(const char *pattern, int options, const char **errorptr,  pcre_compile(const char *pattern, int options, const char **errorptr,
2013    int *erroroffset)    int *erroroffset, const unsigned char *tables)
2014  {  {
2015  real_pcre *re;  real_pcre *re;
2016  int length = 3;      /* For initial BRA plus length */  int length = 3;      /* For initial BRA plus length */
2017  int runlength;  int runlength;
2018  int c, size;  int c, size, reqchar, countlits;
2019  int bracount = 0;  int bracount = 0;
2020  int top_backref = 0;  int top_backref = 0;
2021  int branch_extra = 0;  int branch_extra = 0;
# Line 1833  int branch_newextra; Line 2023  int branch_newextra;
2023  unsigned int brastackptr = 0;  unsigned int brastackptr = 0;
2024  uschar *code;  uschar *code;
2025  const uschar *ptr;  const uschar *ptr;
2026    compile_data compile_block;
2027  int brastack[BRASTACK_SIZE];  int brastack[BRASTACK_SIZE];
2028  uschar bralenstack[BRASTACK_SIZE];  uschar bralenstack[BRASTACK_SIZE];
2029    
# Line 1861  if ((options & ~PUBLIC_OPTIONS) != 0) Line 2052  if ((options & ~PUBLIC_OPTIONS) != 0)
2052    return NULL;    return NULL;
2053    }    }
2054    
2055    /* Set up pointers to the individual character tables */
2056    
2057    if (tables == NULL) tables = pcre_default_tables;
2058    compile_block.lcc = tables + lcc_offset;
2059    compile_block.fcc = tables + fcc_offset;
2060    compile_block.cbits = tables + cbits_offset;
2061    compile_block.ctypes = tables + ctypes_offset;
2062    
2063    /* Reflect pattern for debugging output */
2064    
2065  DPRINTF(("------------------------------------------------------------------\n"));  DPRINTF(("------------------------------------------------------------------\n"));
2066  DPRINTF(("%s\n", pattern));  DPRINTF(("%s\n", pattern));
2067    
# Line 1879  while ((c = *(++ptr)) != 0) Line 2080  while ((c = *(++ptr)) != 0)
2080    
2081    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
2082      {      {
2083      if ((pcre_ctypes[c] & ctype_space) != 0) continue;      if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2084      if (c == '#')      if (c == '#')
2085        {        {
2086        while ((c = *(++ptr)) != 0 && c != '\n');        while ((c = *(++ptr)) != 0 && c != '\n');
# Line 1897  while ((c = *(++ptr)) != 0) Line 2098  while ((c = *(++ptr)) != 0)
2098      case '\\':      case '\\':
2099        {        {
2100        const uschar *save_ptr = ptr;        const uschar *save_ptr = ptr;
2101        c = check_escape(&ptr, errorptr, bracount, options, FALSE);        c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2102        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2103        if (c >= 0)        if (c >= 0)
2104          {          {
# Line 1917  while ((c = *(++ptr)) != 0) Line 2118  while ((c = *(++ptr)) != 0)
2118        int refnum = -c - ESC_REF;        int refnum = -c - ESC_REF;
2119        if (refnum > top_backref) top_backref = refnum;        if (refnum > top_backref) top_backref = refnum;
2120        length++;   /* For single back reference */        length++;   /* For single back reference */
2121        if (ptr[1] == '{' && is_counted_repeat(ptr+2))        if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2122          {          {
2123          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2124          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2125          if ((min == 0 && (max == 1 || max == -1)) ||          if ((min == 0 && (max == 1 || max == -1)) ||
2126            (min == 1 && max == -1))            (min == 1 && max == -1))
# Line 1943  while ((c = *(++ptr)) != 0) Line 2144  while ((c = *(++ptr)) != 0)
2144      or back reference. */      or back reference. */
2145    
2146      case '{':      case '{':
2147      if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;      if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2148      ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);      ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2149      if (*errorptr != NULL) goto PCRE_ERROR_RETURN;      if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2150      if ((min == 0 && (max == 1 || max == -1)) ||      if ((min == 0 && (max == 1 || max == -1)) ||
2151        (min == 1 && max == -1))        (min == 1 && max == -1))
# Line 1979  while ((c = *(++ptr)) != 0) Line 2180  while ((c = *(++ptr)) != 0)
2180        {        {
2181        if (*ptr == '\\')        if (*ptr == '\\')
2182          {          {
2183          int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);          int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2184              &compile_block);
2185          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2186          if (-ch == ESC_b) class_charcount++; else class_charcount = 10;          if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2187          }          }
# Line 1996  while ((c = *(++ptr)) != 0) Line 2198  while ((c = *(++ptr)) != 0)
2198    
2199        /* A repeat needs either 1 or 5 bytes. */        /* A repeat needs either 1 or 5 bytes. */
2200    
2201        if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))        if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2202          {          {
2203          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2204          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2205          if ((min == 0 && (max == 1 || max == -1)) ||          if ((min == 0 && (max == 1 || max == -1)) ||
2206            (min == 1 && max == -1))            (min == 1 && max == -1))
# Line 2064  while ((c = *(++ptr)) != 0) Line 2266  while ((c = *(++ptr)) != 0)
2266          group. */          group. */
2267    
2268          case '(':          case '(':
2269          if ((pcre_ctypes[ptr[3]] & ctype_digit) != 0)          if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2270            {            {
2271            ptr += 4;            ptr += 4;
2272            length += 2;            length += 2;
2273            while ((pcre_ctypes[*ptr] & ctype_digit) != 0) ptr++;            while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2274            if (*ptr != ')')            if (*ptr != ')')
2275              {              {
2276              *errorptr = ERR26;              *errorptr = ERR26;
# Line 2153  while ((c = *(++ptr)) != 0) Line 2355  while ((c = *(++ptr)) != 0)
2355              will lead to an over-estimate on the length, but this shouldn't              will lead to an over-estimate on the length, but this shouldn't
2356              matter very much. We also have to allow for resetting options at              matter very much. We also have to allow for resetting options at
2357              the start of any alternations, which we do by setting              the start of any alternations, which we do by setting
2358              branch_newextra to 2. */              branch_newextra to 2. Finally, we record whether the case-dependent
2359                flag ever changes within the regex. This is used by the "required
2360                character" code. */
2361    
2362              case ':':              case ':':
2363              if (((set|unset) & PCRE_IMS) != 0)              if (((set|unset) & PCRE_IMS) != 0)
2364                {                {
2365                length += 4;                length += 4;
2366                branch_newextra = 2;                branch_newextra = 2;
2367                  if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2368                }                }
2369              goto END_OPTIONS;              goto END_OPTIONS;
2370    
# Line 2237  while ((c = *(++ptr)) != 0) Line 2442  while ((c = *(++ptr)) != 0)
2442        /* Leave ptr at the final char; for read_repeat_counts this happens        /* Leave ptr at the final char; for read_repeat_counts this happens
2443        automatically; for the others we need an increment. */        automatically; for the others we need an increment. */
2444    
2445        if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))        if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2446          {          {
2447          ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);          ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2448              &compile_block);
2449          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2450          }          }
2451        else if (c == '*') { minval = 0; maxval = -1; ptr++; }        else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2452        else if (c == '+') { maxval = -1; ptr++; }        else if (c == '+') { maxval = -1; ptr++; }
2453        else if (c == '?') { minval = 0; ptr++; }        else if (c == '?') { minval = 0; ptr++; }
2454    
2455        /* If there is a minimum > 1 we have to replicate up to minval-1 times;        /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2456        if there is a limited maximum we have to replicate up to maxval-1 times        group, and if the maximum is greater than zero, we have to replicate
2457        and allow for a BRAZERO item before each optional copy, as we also have        maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2458        to do before the first copy if the minimum is zero. */        bracket set - hence the 7. */
2459    
2460        if (minval == 0) length++;        if (minval == 0)
2461          else if (minval > 1) length += (minval - 1) * duplength;          {
2462        if (maxval > minval) length += (maxval - minval) * (duplength + 1);          length++;
2463            if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2464            }
2465    
2466          /* When the minimum is greater than zero, we have to replicate up to
2467          minval-1 times, with no additions required in the copies. Then, if
2468          there is a limited maximum we have to replicate up to maxval-1 times
2469          allowing for a BRAZERO item before each optional copy and nesting
2470          brackets for all but one of the optional copies. */
2471    
2472          else
2473            {
2474            length += (minval - 1) * duplength;
2475            if (maxval > minval)   /* Need this test as maxval=-1 means no limit */
2476              length += (maxval - minval) * (duplength + 7) - 6;
2477            }
2478        }        }
2479      continue;      continue;
2480    
# Line 2270  while ((c = *(++ptr)) != 0) Line 2491  while ((c = *(++ptr)) != 0)
2491        {        {
2492        if ((options & PCRE_EXTENDED) != 0)        if ((options & PCRE_EXTENDED) != 0)
2493          {          {
2494          if ((pcre_ctypes[c] & ctype_space) != 0) continue;          if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2495          if (c == '#')          if (c == '#')
2496            {            {
2497            while ((c = *(++ptr)) != 0 && c != '\n');            while ((c = *(++ptr)) != 0 && c != '\n');
# Line 2284  while ((c = *(++ptr)) != 0) Line 2505  while ((c = *(++ptr)) != 0)
2505        if (c == '\\')        if (c == '\\')
2506          {          {
2507          const uschar *saveptr = ptr;          const uschar *saveptr = ptr;
2508          c = check_escape(&ptr, errorptr, bracount, options, FALSE);          c = check_escape(&ptr, errorptr, bracount, options, FALSE,
2509              &compile_block);
2510          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2511          if (c < 0) { ptr = saveptr; break; }          if (c < 0) { ptr = saveptr; break; }
2512          }          }
# Line 2296  while ((c = *(++ptr)) != 0) Line 2518  while ((c = *(++ptr)) != 0)
2518    
2519      /* This "while" is the end of the "do" above. */      /* This "while" is the end of the "do" above. */
2520    
2521      while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);      while (runlength < 255 &&
2522          (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
2523    
2524      ptr--;      ptr--;
2525      length += runlength;      length += runlength;
# Line 2331  if (re == NULL) Line 2554  if (re == NULL)
2554    
2555  re->magic_number = MAGIC_NUMBER;  re->magic_number = MAGIC_NUMBER;
2556  re->options = options;  re->options = options;
2557    re->tables = tables;
2558    
2559  /* Set up a starting, non-extracting bracket, then compile the expression. On  /* Set up a starting, non-extracting bracket, then compile the expression. On
2560  error, *errorptr will be set non-NULL, so we don't need to look at the result  error, *errorptr will be set non-NULL, so we don't need to look at the result
# Line 2340  ptr = (const uschar *)pattern; Line 2564  ptr = (const uschar *)pattern;
2564  code = re->code;  code = re->code;
2565  *code = OP_BRA;  *code = OP_BRA;
2566  bracount = 0;  bracount = 0;
2567  (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1);  (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
2568      &reqchar, &countlits, &compile_block);
2569  re->top_bracket = bracount;  re->top_bracket = bracount;
2570  re->top_backref = top_backref;  re->top_backref = top_backref;
2571    
# Line 2372  if (*errorptr != NULL) Line 2597  if (*errorptr != NULL)
2597    return NULL;    return NULL;
2598    }    }
2599    
2600  /* If the anchored option was not passed, set flag if we can determine that it  /* If the anchored option was not passed, set flag if we can determine that the
2601  is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if  pattern is anchored by virtue of ^ characters or \A or anything else (such as
2602  we can determine what the first character has to be, because that speeds up  starting with .* when DOTALL is set).
2603  unanchored matches no end. In the case of multiline matches, an alternative is  
2604  to set the PCRE_STARTLINE flag if all branches start with ^. */  Otherwise, see if we can determine what the first character has to be, because
2605    that speeds up unanchored matches no end. If not, see if we can set the
2606    PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
2607    start with ^, and also when all branches start with .* for non-DOTALL matches.
2608    */
2609    
2610  if ((options & PCRE_ANCHORED) == 0)  if ((options & PCRE_ANCHORED) == 0)
2611    {    {
# Line 2396  if ((options & PCRE_ANCHORED) == 0) Line 2625  if ((options & PCRE_ANCHORED) == 0)
2625      }      }
2626    }    }
2627    
2628    /* Save the last required character if there are at least two literal
2629    characters on all paths, or if there is no first character setting. */
2630    
2631    if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
2632      {
2633      re->req_char = reqchar;
2634      re->options |= PCRE_REQCHSET;
2635      }
2636    
2637  /* Print out the compiled data for debugging */  /* Print out the compiled data for debugging */
2638    
2639  #ifdef DEBUG  #ifdef DEBUG
# Line 2405  printf("Length = %d top_bracket = %d top Line 2643  printf("Length = %d top_bracket = %d top
2643    
2644  if (re->options != 0)  if (re->options != 0)
2645    {    {
2646    printf("%s%s%s%s%s%s%s%s\n",    printf("%s%s%s%s%s%s%s%s%s\n",
2647      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2648      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2649        ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
2650      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2651      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2652      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
# Line 2422  if ((re->options & PCRE_FIRSTSET) != 0) Line 2661  if ((re->options & PCRE_FIRSTSET) != 0)
2661      else printf("First char = \\x%02x\n", re->first_char);      else printf("First char = \\x%02x\n", re->first_char);
2662    }    }
2663    
2664    if ((re->options & PCRE_REQCHSET) != 0)
2665      {
2666      if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
2667        else printf("Req char = \\x%02x\n", re->req_char);
2668      }
2669    
2670  code_end = code;  code_end = code;
2671  code_base = code = re->code;  code_base = code = re->code;
2672    
# Line 2637  return (pcre *)re; Line 2882  return (pcre *)re;
2882    
2883    
2884  /*************************************************  /*************************************************
 *        Match a character type                  *  
 *************************************************/  
   
 /* Not used in all the places it might be as it's sometimes faster  
 to put the code inline.  
   
 Arguments:  
   type        the character type  
   c           the character  
   dotall      the dotall flag  
   
 Returns:      TRUE if character is of the type  
 */  
   
 static BOOL  
 match_type(int type, int c, BOOL dotall)  
 {  
   
 #ifdef DEBUG  
 if (isprint(c)) printf("matching subject %c against ", c);  
   else printf("matching subject \\x%02x against ", c);  
 printf("%s\n", OP_names[type]);  
 #endif  
   
 switch(type)  
   {  
   case OP_ANY:            return dotall || c != '\n';  
   case OP_NOT_DIGIT:      return (pcre_ctypes[c] & ctype_digit) == 0;  
   case OP_DIGIT:          return (pcre_ctypes[c] & ctype_digit) != 0;  
   case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;  
   case OP_WHITESPACE:     return (pcre_ctypes[c] & ctype_space) != 0;  
   case OP_NOT_WORDCHAR:   return (pcre_ctypes[c] & ctype_word) == 0;  
   case OP_WORDCHAR:       return (pcre_ctypes[c] & ctype_word) != 0;  
   }  
 return FALSE;  
 }  
   
   
   
 /*************************************************  
2885  *          Match a back-reference                *  *          Match a back-reference                *
2886  *************************************************/  *************************************************/
2887    
# Line 2695  Returns:      TRUE if matched Line 2900  Returns:      TRUE if matched
2900    
2901  static BOOL  static BOOL
2902  match_ref(int offset, register const uschar *eptr, int length, match_data *md,  match_ref(int offset, register const uschar *eptr, int length, match_data *md,
2903    int ims)    unsigned long int ims)
2904  {  {
2905  const uschar *p = md->start_subject + md->offset_vector[offset];  const uschar *p = md->start_subject + md->offset_vector[offset];
2906    
# Line 2719  if (length > md->end_subject - eptr) ret Line 2924  if (length > md->end_subject - eptr) ret
2924  /* Separate the caseless case for speed */  /* Separate the caseless case for speed */
2925    
2926  if ((ims & PCRE_CASELESS) != 0)  if ((ims & PCRE_CASELESS) != 0)
2927    { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }    {
2928      while (length-- > 0)
2929        if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
2930      }
2931  else  else
2932    { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }    { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2933    
# Line 2751  Returns:       TRUE if matched Line 2959  Returns:       TRUE if matched
2959    
2960  static BOOL  static BOOL
2961  match(register const uschar *eptr, register const uschar *ecode,  match(register const uschar *eptr, register const uschar *ecode,
2962    int offset_top, match_data *md, int ims, BOOL condassert, const uschar *eptrb)    int offset_top, match_data *md, unsigned long int ims, BOOL condassert,
2963      const uschar *eptrb)
2964  {  {
2965  int original_ims = ims;   /* Save for resetting on ')' */  unsigned long int original_ims = ims;   /* Save for resetting on ')' */
2966    
2967  for (;;)  for (;;)
2968    {    {
# Line 2782  for (;;) Line 2991  for (;;)
2991      int number = op - OP_BRA;      int number = op - OP_BRA;
2992      int offset = number << 1;      int offset = number << 1;
2993    
2994      DPRINTF(("start bracket %d\n", number));  #ifdef DEBUG
2995        printf("start bracket %d subject=", number);
2996        pchars(eptr, 16, TRUE, md);
2997        printf("\n");
2998    #endif
2999    
3000      if (offset < md->offset_max)      if (offset < md->offset_max)
3001        {        {
# Line 2864  for (;;) Line 3077  for (;;)
3077      ecode += 2;      ecode += 2;
3078      break;      break;
3079    
3080      /* End of the pattern */      /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3081        an empty string - recursion will then try other alternatives, if any. */
3082    
3083      case OP_END:      case OP_END:
3084        if (md->notempty && eptr == md->start_match) return FALSE;
3085      md->end_match_ptr = eptr;          /* Record where we ended */      md->end_match_ptr = eptr;          /* Record where we ended */
3086      md->end_offset_top = offset_top;   /* and how many extracts were taken */      md->end_offset_top = offset_top;   /* and how many extracts were taken */
3087      return TRUE;      return TRUE;
# Line 2876  for (;;) Line 3091  for (;;)
3091      case OP_OPT:      case OP_OPT:
3092      ims = ecode[1];      ims = ecode[1];
3093      ecode += 2;      ecode += 2;
3094      DPRINTF(("ims set to %02x\n", ims));      DPRINTF(("ims set to %02lx\n", ims));
3095      break;      break;
3096    
3097      /* Assertion brackets. Check the alternative branches in turn - the      /* Assertion brackets. Check the alternative branches in turn - the
# Line 2983  for (;;) Line 3198  for (;;)
3198        if (ecode[3] == OP_OPT)        if (ecode[3] == OP_OPT)
3199          {          {
3200          ims = (ims & ~PCRE_IMS) | ecode[4];          ims = (ims & ~PCRE_IMS) | ecode[4];
3201          DPRINTF(("ims set to %02x at group repeat\n", ims));          DPRINTF(("ims set to %02lx at group repeat\n", ims));
3202          }          }
3203    
3204        if (*ecode == OP_KETRMIN)        if (*ecode == OP_KETRMIN)
# Line 3077  for (;;) Line 3292  for (;;)
3292        the group. */        the group. */
3293    
3294        ims = original_ims;        ims = original_ims;
3295        DPRINTF(("ims reset to %02x\n", ims));        DPRINTF(("ims reset to %02lx\n", ims));
3296    
3297        /* For a non-repeating ket, just continue at this level. This also        /* For a non-repeating ket, just continue at this level. This also
3298        happens for a repeating ket if no characters were matched in the group.        happens for a repeating ket if no characters were matched in the group.
# Line 3172  for (;;) Line 3387  for (;;)
3387      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
3388        {        {
3389        BOOL prev_is_word = (eptr != md->start_subject) &&        BOOL prev_is_word = (eptr != md->start_subject) &&
3390          ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);          ((md->ctypes[eptr[-1]] & ctype_word) != 0);
3391        BOOL cur_is_word = (eptr < md->end_subject) &&        BOOL cur_is_word = (eptr < md->end_subject) &&
3392          ((pcre_ctypes[*eptr] & ctype_word) != 0);          ((md->ctypes[*eptr] & ctype_word) != 0);
3393        if ((*ecode++ == OP_WORD_BOUNDARY)?        if ((*ecode++ == OP_WORD_BOUNDARY)?
3394             cur_is_word == prev_is_word : cur_is_word != prev_is_word)             cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3395          return FALSE;          return FALSE;
# Line 3191  for (;;) Line 3406  for (;;)
3406      break;      break;
3407    
3408      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
3409      if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)      if (eptr >= md->end_subject ||
3410           (md->ctypes[*eptr++] & ctype_digit) != 0)
3411        return FALSE;        return FALSE;
3412      ecode++;      ecode++;
3413      break;      break;
3414    
3415      case OP_DIGIT:      case OP_DIGIT:
3416      if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)      if (eptr >= md->end_subject ||
3417           (md->ctypes[*eptr++] & ctype_digit) == 0)
3418        return FALSE;        return FALSE;
3419      ecode++;      ecode++;
3420      break;      break;
3421    
3422      case OP_NOT_WHITESPACE:      case OP_NOT_WHITESPACE:
3423      if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)      if (eptr >= md->end_subject ||
3424           (md->ctypes[*eptr++] & ctype_space) != 0)
3425        return FALSE;        return FALSE;
3426      ecode++;      ecode++;
3427      break;      break;
3428    
3429      case OP_WHITESPACE:      case OP_WHITESPACE:
3430      if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)      if (eptr >= md->end_subject ||
3431           (md->ctypes[*eptr++] & ctype_space) == 0)
3432        return FALSE;        return FALSE;
3433      ecode++;      ecode++;
3434      break;      break;
3435    
3436      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
3437      if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)      if (eptr >= md->end_subject ||
3438           (md->ctypes[*eptr++] & ctype_word) != 0)
3439        return FALSE;        return FALSE;
3440      ecode++;      ecode++;
3441      break;      break;
3442    
3443      case OP_WORDCHAR:      case OP_WORDCHAR:
3444      if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)      if (eptr >= md->end_subject ||
3445           (md->ctypes[*eptr++] & ctype_word) == 0)
3446        return FALSE;        return FALSE;
3447      ecode++;      ecode++;
3448      break;      break;
# Line 3453  for (;;) Line 3674  for (;;)
3674        if (length > md->end_subject - eptr) return FALSE;        if (length > md->end_subject - eptr) return FALSE;
3675        if ((ims & PCRE_CASELESS) != 0)        if ((ims & PCRE_CASELESS) != 0)
3676          {          {
3677          while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) return FALSE;          while (length-- > 0)
3678              if (md->lcc[*ecode++] != md->lcc[*eptr++])
3679                return FALSE;
3680          }          }
3681        else        else
3682          {          {
# Line 3510  for (;;) Line 3733  for (;;)
3733    
3734      if ((ims & PCRE_CASELESS) != 0)      if ((ims & PCRE_CASELESS) != 0)
3735        {        {
3736        c = pcre_lcc[c];        c = md->lcc[c];
3737        for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) return FALSE;        for (i = 1; i <= min; i++)
3738            if (c != md->lcc[*eptr++]) return FALSE;
3739        if (min == max) continue;        if (min == max) continue;
3740        if (minimize)        if (minimize)
3741          {          {
# Line 3519  for (;;) Line 3743  for (;;)
3743            {            {
3744            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3745              return TRUE;              return TRUE;
3746            if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])            if (i >= max || eptr >= md->end_subject ||
3747                  c != md->lcc[*eptr++])
3748              return FALSE;              return FALSE;
3749            }            }
3750          /* Control never gets here */          /* Control never gets here */
# Line 3529  for (;;) Line 3754  for (;;)
3754          const uschar *pp = eptr;          const uschar *pp = eptr;
3755          for (i = min; i < max; i++)          for (i = min; i < max; i++)
3756            {            {
3757            if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;            if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
3758            eptr++;            eptr++;
3759            }            }
3760          while (eptr >= pp)          while (eptr >= pp)
# Line 3579  for (;;) Line 3804  for (;;)
3804      ecode++;      ecode++;
3805      if ((ims & PCRE_CASELESS) != 0)      if ((ims & PCRE_CASELESS) != 0)
3806        {        {
3807        if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) return FALSE;        if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
3808        }        }
3809      else      else
3810        {        {
# Line 3639  for (;;) Line 3864  for (;;)
3864    
3865      if ((ims & PCRE_CASELESS) != 0)      if ((ims & PCRE_CASELESS) != 0)
3866        {        {
3867        c = pcre_lcc[c];        c = md->lcc[c];
3868        for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) return FALSE;        for (i = 1; i <= min; i++)
3869            if (c == md->lcc[*eptr++]) return FALSE;
3870        if (min == max) continue;        if (min == max) continue;
3871        if (minimize)        if (minimize)
3872          {          {
# Line 3648  for (;;) Line 3874  for (;;)
3874            {            {
3875            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3876              return TRUE;              return TRUE;
3877            if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])            if (i >= max || eptr >= md->end_subject ||
3878                  c == md->lcc[*eptr++])
3879              return FALSE;              return FALSE;
3880            }            }
3881          /* Control never gets here */          /* Control never gets here */
# Line 3658  for (;;) Line 3885  for (;;)
3885          const uschar *pp = eptr;          const uschar *pp = eptr;
3886          for (i = min; i < max; i++)          for (i = min; i < max; i++)
3887            {            {
3888            if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;            if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
3889            eptr++;            eptr++;
3890            }            }
3891          while (eptr >= pp)          while (eptr >= pp)
# Line 3752  for (;;) Line 3979  for (;;)
3979    
3980        case OP_NOT_DIGIT:        case OP_NOT_DIGIT:
3981        for (i = 1; i <= min; i++)        for (i = 1; i <= min; i++)
3982          if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) return FALSE;          if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
3983        break;        break;
3984    
3985        case OP_DIGIT:        case OP_DIGIT:
3986        for (i = 1; i <= min; i++)        for (i = 1; i <= min; i++)
3987          if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) return FALSE;          if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
3988        break;        break;
3989    
3990        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
3991        for (i = 1; i <= min; i++)        for (i = 1; i <= min; i++)
3992          if ((pcre_ctypes[*eptr++] & ctype_space) != 0) return FALSE;          if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
3993        break;        break;
3994    
3995        case OP_WHITESPACE:        case OP_WHITESPACE:
3996        for (i = 1; i <= min; i++)        for (i = 1; i <= min; i++)
3997          if ((pcre_ctypes[*eptr++] & ctype_space) == 0) return FALSE;          if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
3998        break;        break;
3999    
4000        case OP_NOT_WORDCHAR:        case OP_NOT_WORDCHAR:
4001        for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)        for (i = 1; i <= min; i++)
4002          return FALSE;          if ((md->ctypes[*eptr++] & ctype_word) != 0)
4003              return FALSE;
4004        break;        break;
4005    
4006        case OP_WORDCHAR:        case OP_WORDCHAR:
4007        for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)        for (i = 1; i <= min; i++)
4008          return FALSE;          if ((md->ctypes[*eptr++] & ctype_word) == 0)
4009              return FALSE;
4010        break;        break;
4011        }        }
4012    
# Line 3786  for (;;) Line 4015  for (;;)
4015      if (min == max) continue;      if (min == max) continue;
4016    
4017      /* If minimizing, we have to test the rest of the pattern before each      /* If minimizing, we have to test the rest of the pattern before each
4018      subsequent match, so inlining isn't much help; just use the function. */      subsequent match. */
4019    
4020      if (minimize)      if (minimize)
4021        {        {
4022        for (i = min;; i++)        for (i = min;; i++)
4023          {          {
4024          if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE;          if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE;
4025          if (i >= max || eptr >= md->end_subject ||          if (i >= max || eptr >= md->end_subject) return FALSE;
4026            !match_type(ctype, *eptr++, (ims & PCRE_DOTALL) != 0))  
4027              return FALSE;          c = *eptr++;
4028            switch(ctype)
4029              {
4030              case OP_ANY:
4031              if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;
4032              break;
4033    
4034              case OP_NOT_DIGIT:
4035              if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
4036              break;
4037    
4038              case OP_DIGIT:
4039              if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
4040              break;
4041    
4042              case OP_NOT_WHITESPACE:
4043              if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
4044              break;
4045    
4046              case OP_WHITESPACE:
4047              if  ((md->ctypes[c] & ctype_space) == 0) return FALSE;
4048              break;
4049    
4050              case OP_NOT_WORDCHAR:
4051              if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
4052              break;
4053    
4054              case OP_WORDCHAR:
4055              if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
4056              break;
4057              }
4058          }          }
4059        /* Control never gets here */        /* Control never gets here */
4060        }        }
# Line 3828  for (;;) Line 4087  for (;;)
4087          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
4088          for (i = min; i < max; i++)          for (i = min; i < max; i++)
4089            {            {
4090            if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)            if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4091              break;              break;
4092            eptr++;            eptr++;
4093            }            }
# Line 3837  for (;;) Line 4096  for (;;)
4096          case OP_DIGIT:          case OP_DIGIT:
4097          for (i = min; i < max; i++)          for (i = min; i < max; i++)
4098            {            {
4099            if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)            if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4100              break;              break;
4101            eptr++;            eptr++;
4102            }            }
# Line 3846  for (;;) Line 4105  for (;;)
4105          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
4106          for (i = min; i < max; i++)          for (i = min; i < max; i++)
4107            {            {
4108            if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)            if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4109              break;              break;
4110            eptr++;            eptr++;
4111            }            }
# Line 3855  for (;;) Line 4114  for (;;)
4114          case OP_WHITESPACE:          case OP_WHITESPACE:
4115          for (i = min; i < max; i++)          for (i = min; i < max; i++)
4116            {            {
4117            if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)            if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4118              break;              break;
4119            eptr++;            eptr++;
4120            }            }
# Line 3864  for (;;) Line 4123  for (;;)
4123          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
4124          for (i = min; i < max; i++)          for (i = min; i < max; i++)
4125            {            {
4126            if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)            if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4127              break;              break;
4128            eptr++;            eptr++;
4129            }            }
# Line 3873  for (;;) Line 4132  for (;;)
4132          case OP_WORDCHAR:          case OP_WORDCHAR:
4133          for (i = min; i < max; i++)          for (i = min; i < max; i++)
4134            {            {
4135            if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)            if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4136              break;              break;
4137            eptr++;            eptr++;
4138            }            }
# Line 3919  Arguments: Line 4178  Arguments:
4178    external_extra  points to "hints" from pcre_study() or is NULL    external_extra  points to "hints" from pcre_study() or is NULL
4179    subject         points to the subject string    subject         points to the subject string
4180    length          length of subject string (may contain binary zeros)    length          length of subject string (may contain binary zeros)
4181      start_offset    where to start in the subject string
4182    options         option bits    options         option bits
4183    offsets         points to a vector of ints to be filled in with offsets    offsets         points to a vector of ints to be filled in with offsets
4184    offsetcount     the number of elements in the vector    offsetcount     the number of elements in the vector
# Line 3931  Returns:          > 0 => success; value Line 4191  Returns:          > 0 => success; value
4191    
4192  int  int
4193  pcre_exec(const pcre *external_re, const pcre_extra *external_extra,  pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4194    const char *subject, int length, int options, int *offsets, int offsetcount)    const char *subject, int length, int start_offset, int options, int *offsets,
4195      int offsetcount)
4196  {  {
4197  int resetcount, ocount;  int resetcount, ocount;
4198  int first_char = -1;  int first_char = -1;
4199  int ims = 0;  int req_char = -1;
4200    int req_char2 = -1;
4201    unsigned long int ims = 0;
4202  match_data match_block;  match_data match_block;
4203  const uschar *start_bits = NULL;  const uschar *start_bits = NULL;
4204  const uschar *start_match = (const uschar *)subject;  const uschar *start_match = (const uschar *)subject + start_offset;
4205  const uschar *end_subject;  const uschar *end_subject;
4206    const uschar *req_char_ptr = start_match - 1;
4207  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
4208  const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;  const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4209  BOOL using_temporary_offsets = FALSE;  BOOL using_temporary_offsets = FALSE;
# Line 3960  match_block.endonly = (re->options & PCR Line 4224  match_block.endonly = (re->options & PCR
4224    
4225  match_block.notbol = (options & PCRE_NOTBOL) != 0;  match_block.notbol = (options & PCRE_NOTBOL) != 0;
4226  match_block.noteol = (options & PCRE_NOTEOL) != 0;  match_block.noteol = (options & PCRE_NOTEOL) != 0;
4227    match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
4228    
4229  match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */  match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */
4230    
4231    match_block.lcc = re->tables + lcc_offset;
4232    match_block.ctypes = re->tables + ctypes_offset;
4233    
4234  /* The ims options can vary during the matching as a result of the presence  /* The ims options can vary during the matching as a result of the presence
4235  of (?ims) items in the pattern. They are kept in a local variable so that  of (?ims) items in the pattern. They are kept in a local variable so that
4236  restoring at the exit of a group is easy. */  restoring at the exit of a group is easy. */
# Line 3997  in the pattern. */ Line 4265  in the pattern. */
4265  resetcount = 2 + re->top_bracket * 2;  resetcount = 2 + re->top_bracket * 2;
4266  if (resetcount > offsetcount) resetcount = ocount;  if (resetcount > offsetcount) resetcount = ocount;
4267    
4268    /* Reset the working variable associated with each extraction. These should
4269    never be used unless previously set, but they get saved and restored, and so we
4270    initialize them to avoid reading uninitialized locations. */
4271    
4272    if (match_block.offset_vector != NULL)
4273      {
4274      register int *iptr = match_block.offset_vector + ocount;
4275      register int *iend = iptr - resetcount/2 + 1;
4276      while (--iptr >= iend) *iptr = -1;
4277      }
4278    
4279  /* Set up the first character to match, if available. The first_char value is  /* Set up the first character to match, if available. The first_char value is
4280  never set for an anchored regular expression, but the anchoring may be forced  never set for an anchored regular expression, but the anchoring may be forced
4281  at run time, so we have to test for anchoring. The first char may be unset for  at run time, so we have to test for anchoring. The first char may be unset for
# Line 4008  if (!anchored) Line 4287  if (!anchored)
4287    if ((re->options & PCRE_FIRSTSET) != 0)    if ((re->options & PCRE_FIRSTSET) != 0)
4288      {      {
4289      first_char = re->first_char;      first_char = re->first_char;
4290      if ((ims & PCRE_CASELESS) != 0) first_char = pcre_lcc[first_char];      if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
4291      }      }
4292    else    else
4293      if (!startline && extra != NULL &&      if (!startline && extra != NULL &&
# Line 4016  if (!anchored) Line 4295  if (!anchored)
4295          start_bits = extra->start_bits;          start_bits = extra->start_bits;
4296    }    }
4297    
4298  /* Loop for unanchored matches; for anchored regexps the loop runs just once. */  /* For anchored or unanchored matches, there may be a "last known required
4299    character" set. If the PCRE_CASELESS is set, implying that the match starts
4300    caselessly, or if there are any changes of this flag within the regex, set up
4301    both cases of the character. Otherwise set the two values the same, which will
4302    avoid duplicate testing (which takes significant time). This covers the vast
4303    majority of cases. It will be suboptimal when the case flag changes in a regex
4304    and the required character in fact is caseful. */
4305    
4306    if ((re->options & PCRE_REQCHSET) != 0)
4307      {
4308      req_char = re->req_char;
4309      req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
4310        (re->tables + fcc_offset)[req_char] : req_char;
4311      }
4312    
4313    /* Loop for handling unanchored repeated matching attempts; for anchored regexes
4314    the loop runs just once. */
4315    
4316  do  do
4317    {    {
# Line 4033  do Line 4328  do
4328    if (first_char >= 0)    if (first_char >= 0)
4329      {      {
4330      if ((ims & PCRE_CASELESS) != 0)      if ((ims & PCRE_CASELESS) != 0)
4331        while (start_match < end_subject && pcre_lcc[*start_match] != first_char)        while (start_match < end_subject &&
4332                 match_block.lcc[*start_match] != first_char)
4333          start_match++;          start_match++;
4334      else      else
4335        while (start_match < end_subject && *start_match != first_char)        while (start_match < end_subject && *start_match != first_char)
# Line 4044  do Line 4340  do
4340    
4341    else if (startline)    else if (startline)
4342      {      {
4343      if (start_match > match_block.start_subject)      if (start_match > match_block.start_subject + start_offset)
4344        {        {
4345        while (start_match < end_subject && start_match[-1] != '\n')        while (start_match < end_subject && start_match[-1] != '\n')
4346          start_match++;          start_match++;
4347        }        }
4348      }      }
4349    
4350    /* Or to a non-unique first char */    /* Or to a non-unique first char after study */
4351    
4352    else if (start_bits != NULL)    else if (start_bits != NULL)
4353      {      {
# Line 4068  do Line 4364  do
4364    printf("\n");    printf("\n");
4365  #endif  #endif
4366    
4367      /* If req_char is set, we know that that character must appear in the subject
4368      for the match to succeed. If the first character is set, req_char must be
4369      later in the subject; otherwise the test starts at the match point. This
4370      optimization can save a huge amount of backtracking in patterns with nested
4371      unlimited repeats that aren't going to match. We don't know what the state of
4372      case matching may be when this character is hit, so test for it in both its
4373      cases if necessary. However, the different cased versions will not be set up
4374      unless PCRE_CASELESS was given or the casing state changes within the regex.
4375      Writing separate code makes it go faster, as does using an autoincrement and
4376      backing off on a match. */
4377    
4378      if (req_char >= 0)
4379        {
4380        register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
4381    
4382        /* We don't need to repeat the search if we haven't yet reached the
4383        place we found it at last time. */
4384    
4385        if (p > req_char_ptr)
4386          {
4387          /* Do a single test if no case difference is set up */
4388    
4389          if (req_char == req_char2)
4390            {
4391            while (p < end_subject)
4392              {
4393              if (*p++ == req_char) { p--; break; }
4394              }
4395            }
4396    
4397          /* Otherwise test for either case */
4398    
4399          else
4400            {
4401            while (p < end_subject)
4402              {
4403              register int pp = *p++;
4404              if (pp == req_char || pp == req_char2) { p--; break; }
4405              }
4406            }
4407    
4408          /* If we can't find the required character, break the matching loop */
4409    
4410          if (p >= end_subject) break;
4411    
4412          /* If we have found the required character, save the point where we
4413          found it, so that we don't search again next time round the loop if
4414          the start hasn't passed this character yet. */
4415    
4416          req_char_ptr = p;
4417          }
4418        }
4419    
4420    /* When a match occurs, substrings will be set for all internal extractions;    /* When a match occurs, substrings will be set for all internal extractions;
4421    we just need to set up the whole thing as substring 0 before returning. If    we just need to set up the whole thing as substring 0 before returning. If
4422    there were too many extractions, set the return code to zero. In the case    there were too many extractions, set the return code to zero. In the case
# Line 4075  do Line 4424  do
4424    those back references that we can. In this case there need not be overflow    those back references that we can. In this case there need not be overflow
4425    if certain parts of the pattern were not used. */    if certain parts of the pattern were not used. */
4426    
4427      match_block.start_match = start_match;
4428    if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))    if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))
4429      continue;      continue;
4430    
# Line 4106  do Line 4456  do
4456    DPRINTF((">>>> returning %d\n", rc));    DPRINTF((">>>> returning %d\n", rc));
4457    return rc;    return rc;
4458    }    }
4459    
4460    /* This "while" is the end of the "do" above */
4461    
4462  while (!anchored &&  while (!anchored &&
4463         match_block.errorcode == PCRE_ERROR_NOMATCH &&         match_block.errorcode == PCRE_ERROR_NOMATCH &&
4464         start_match++ < end_subject);         start_match++ < end_subject);

Legend:
Removed from v.23  
changed lines
  Added in v.42

  ViewVC Help
Powered by ViewVC 1.1.5