/[pcre]/code/tags/pcre-2.08a/pcre.c
ViewVC logotype

Diff of /code/tags/pcre-2.08a/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 23 by nigel, Sat Feb 24 21:38:41 2007 UTC revision 37 by nigel, Sat Feb 24 21:39:09 2007 UTC
# Line 9  the file Tech.Notes for some information Line 9  the file Tech.Notes for some information
9    
10  Written by: Philip Hazel <ph10@cam.ac.uk>  Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12             Copyright (c) 1998 University of Cambridge             Copyright (c) 1997-1999 University of Cambridge
13    
14  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
15  Permission is granted to anyone to use this software for any purpose on any  Permission is granted to anyone to use this software for any purpose on any
# Line 25  restrictions: Line 25  restrictions:
25    
26  3. Altered versions must be plainly marked as such, and must not be  3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.     misrepresented as being the original software.
28    
29    4. If PCRE is embedded in any software that is released under the GNU
30       General Purpose Licence (GPL), then the terms of that licence shall
31       supersede any condition above with which it is incompatible.
32  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
33  */  */
34    
# Line 107  static const short int escapes[] = { Line 111  static const short int escapes[] = {
111    
112  static BOOL  static BOOL
113    compile_regex(int, int, int *, uschar **, const uschar **, const char **,    compile_regex(int, int, int *, uschar **, const uschar **, const char **,
114      BOOL, int);      BOOL, int, int *, int *, compile_data *);
   
 /* Structure for passing "static" information around between the functions  
 doing the matching, so that they are thread-safe. */  
   
 typedef struct match_data {  
   int    errorcode;             /* As it says */  
   int   *offset_vector;         /* Offset vector */  
   int    offset_end;            /* One past the end */  
   int    offset_max;            /* The maximum usable for return data */  
   BOOL   offset_overflow;       /* Set if too many extractions */  
   BOOL   notbol;                /* NOTBOL flag */  
   BOOL   noteol;                /* NOTEOL flag */  
   BOOL   endonly;               /* Dollar not before final \n */  
   const uschar *start_subject;  /* Start of the subject string */  
   const uschar *end_subject;    /* End of the subject string */  
   const uschar *end_match_ptr;  /* Subject position at end match */  
   int     end_offset_top;       /* Highwater mark at end of match */  
 } match_data;  
115    
116    
117    
# Line 145  void  (*pcre_free)(void *) = free; Line 131  void  (*pcre_free)(void *) = free;
131    
132    
133  /*************************************************  /*************************************************
134    *             Default character tables           *
135    *************************************************/
136    
137    /* A default set of character tables is included in the PCRE binary. Its source
138    is built by the maketables auxiliary program, which uses the default C ctypes
139    functions, and put in the file chartables.c. These tables are used by PCRE
140    whenever the caller of pcre_compile() does not provide an alternate set of
141    tables. */
142    
143    #include "chartables.c"
144    
145    
146    
147    /*************************************************
148  *          Return version string                 *  *          Return version string                 *
149  *************************************************/  *************************************************/
150    
# Line 162  return PCRE_VERSION; Line 162  return PCRE_VERSION;
162  *************************************************/  *************************************************/
163    
164  /* This function picks potentially useful data out of the private  /* This function picks potentially useful data out of the private
165  structure.  structure. The public options are passed back in an int - though the
166    re->options field has been expanded to a long int, all the public options
167    are at the low end of it, and so even on 16-bit systems this will still be OK.
168    Therefore, I haven't changed the API for pcre_info().
169    
170  Arguments:  Arguments:
171    external_re   points to compiled code    external_re   points to compiled code
# Line 181  pcre_info(const pcre *external_re, int * Line 184  pcre_info(const pcre *external_re, int *
184  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
185  if (re == NULL) return PCRE_ERROR_NULL;  if (re == NULL) return PCRE_ERROR_NULL;
186  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
187  if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);  if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
188  if (first_char != NULL)  if (first_char != NULL)
189    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
190       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
# Line 237  Arguments: Line 240  Arguments:
240    bracount   number of previous extracting brackets    bracount   number of previous extracting brackets
241    options    the options bits    options    the options bits
242    isclass    TRUE if inside a character class    isclass    TRUE if inside a character class
243      cd         pointer to char tables block
244    
245  Returns:     zero or positive => a data character  Returns:     zero or positive => a data character
246               negative => a special escape sequence               negative => a special escape sequence
# Line 245  Returns:     zero or positive => a data Line 249  Returns:     zero or positive => a data
249    
250  static int  static int
251  check_escape(const uschar **ptrptr, const char **errorptr, int bracount,  check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
252    int options, BOOL isclass)    int options, BOOL isclass, compile_data *cd)
253  {  {
254  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
255  int c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */  int c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */
# Line 288  else Line 292  else
292        {        {
293        oldptr = ptr;        oldptr = ptr;
294        c -= '0';        c -= '0';
295        while ((pcre_ctypes[ptr[1]] & ctype_digit) != 0)        while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
296          c = c * 10 + *(++ptr) - '0';          c = c * 10 + *(++ptr) - '0';
297        if (c < 10 || c <= bracount)        if (c < 10 || c <= bracount)
298          {          {
# Line 314  else Line 318  else
318    
319      case '0':      case '0':
320      c -= '0';      c -= '0';
321      while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&      while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
322        ptr[1] != '8' && ptr[1] != '9')        ptr[1] != '8' && ptr[1] != '9')
323          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
324      break;      break;
# Line 323  else Line 327  else
327    
328      case 'x':      case 'x':
329      c = 0;      c = 0;
330      while (i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
331        {        {
332        ptr++;        ptr++;
333        c = c * 16 + pcre_lcc[*ptr] -        c = c * 16 + cd->lcc[*ptr] -
334          (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');          (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
335        }        }
336      break;      break;
337    
# Line 341  else Line 345  else
345    
346      /* A letter is upper-cased; then the 0x40 bit is flipped */      /* A letter is upper-cased; then the 0x40 bit is flipped */
347    
348      if (c >= 'a' && c <= 'z') c = pcre_fcc[c];      if (c >= 'a' && c <= 'z') c = cd->fcc[c];
349      c ^= 0x40;      c ^= 0x40;
350      break;      break;
351    
352      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
353      other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,      other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
354      for Perl compatibility, it is a literal. */      for Perl compatibility, it is a literal. This code looks a bit odd, but
355        there used to be some cases other than the default, and there may be again
356        in future, so I haven't "optimized" it. */
357    
358      default:      default:
359      if ((options & PCRE_EXTRA) != 0) switch(c)      if ((options & PCRE_EXTRA) != 0) switch(c)
# Line 377  where the ddds are digits. Line 383  where the ddds are digits.
383    
384  Arguments:  Arguments:
385    p         pointer to the first char after '{'    p         pointer to the first char after '{'
386      cd        pointer to char tables block
387    
388  Returns:    TRUE or FALSE  Returns:    TRUE or FALSE
389  */  */
390    
391  static BOOL  static BOOL
392  is_counted_repeat(const uschar *p)  is_counted_repeat(const uschar *p, compile_data *cd)
393  {  {
394  if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;  if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
395  while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;  while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
396  if (*p == '}') return TRUE;  if (*p == '}') return TRUE;
397    
398  if (*p++ != ',') return FALSE;  if (*p++ != ',') return FALSE;
399  if (*p == '}') return TRUE;  if (*p == '}') return TRUE;
400    
401  if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;  if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
402  while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;  while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
403  return (*p == '}');  return (*p == '}');
404  }  }
405    
# Line 412  Arguments: Line 419  Arguments:
419    maxp       pointer to int for max    maxp       pointer to int for max
420               returned as -1 if no max               returned as -1 if no max
421    errorptr   points to pointer to error message    errorptr   points to pointer to error message
422      cd         pointer to character tables block
423    
424  Returns:     pointer to '}' on success;  Returns:     pointer to '}' on success;
425               current ptr on error, with errorptr set               current ptr on error, with errorptr set
426  */  */
427    
428  static const uschar *  static const uschar *
429  read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)  read_repeat_counts(const uschar *p, int *minp, int *maxp,
430      const char **errorptr, compile_data *cd)
431  {  {
432  int min = 0;  int min = 0;
433  int max = -1;  int max = -1;
434    
435  while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';  while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
436    
437  if (*p == '}') max = min; else  if (*p == '}') max = min; else
438    {    {
439    if (*(++p) != '}')    if (*(++p) != '}')
440      {      {
441      max = 0;      max = 0;
442      while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';      while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
443      if (max < min)      if (max < min)
444        {        {
445        *errorptr = ERR4;        *errorptr = ERR4;
# Line 526  for (;;) Line 535  for (;;)
535    
536      case OP_REVERSE:      case OP_REVERSE:
537      cc++;      cc++;
538        /* Fall through */
539    
540      case OP_CREF:      case OP_CREF:
541      case OP_OPT:      case OP_OPT:
# Line 615  for (;;) Line 625  for (;;)
625  /* Scan the pattern, compiling it into the code vector.  /* Scan the pattern, compiling it into the code vector.
626    
627  Arguments:  Arguments:
628    options     the option bits    options      the option bits
629    brackets    points to number of brackets used    brackets     points to number of brackets used
630    code        points to the pointer to the current code point    code         points to the pointer to the current code point
631    ptrptr      points to the current pattern pointer    ptrptr       points to the current pattern pointer
632    errorptr    points to pointer to error message    errorptr     points to pointer to error message
633    optchanged  set to the value of the last OP_OPT item compiled    optchanged   set to the value of the last OP_OPT item compiled
634      reqchar      set to the last literal character required, else -1
635      countlits    set to count of mandatory literal characters
636      cd           contains pointers to tables
637    
638  Returns:      TRUE on success  Returns:       TRUE on success
639                FALSE, with *errorptr set on error                 FALSE, with *errorptr set on error
640  */  */
641    
642  static BOOL  static BOOL
643  compile_branch(int options, int *brackets, uschar **codeptr,  compile_branch(int options, int *brackets, uschar **codeptr,
644    const uschar **ptrptr, const char **errorptr, int *optchanged)    const uschar **ptrptr, const char **errorptr, int *optchanged,
645      int *reqchar, int *countlits, compile_data *cd)
646  {  {
647  int repeat_type, op_type;  int repeat_type, op_type;
648  int repeat_min, repeat_max;  int repeat_min, repeat_max;
649  int bravalue, length;  int bravalue, length;
650  int greedy_default, greedy_non_default;  int greedy_default, greedy_non_default;
651    int prevreqchar;
652    int condcount = 0;
653    int subcountlits = 0;
654  register int c;  register int c;
655  register uschar *code = *codeptr;  register uschar *code = *codeptr;
656  uschar *tempcode;  uschar *tempcode;
# Line 647  uschar class[32]; Line 664  uschar class[32];
664  greedy_default = ((options & PCRE_UNGREEDY) != 0);  greedy_default = ((options & PCRE_UNGREEDY) != 0);
665  greedy_non_default = greedy_default ^ 1;  greedy_non_default = greedy_default ^ 1;
666    
667    /* Initialize no required char, and count of literals */
668    
669    *reqchar = prevreqchar = -1;
670    *countlits = 0;
671    
672  /* Switch on next character until the end of the branch */  /* Switch on next character until the end of the branch */
673    
674  for (;; ptr++)  for (;; ptr++)
# Line 656  for (;; ptr++) Line 678  for (;; ptr++)
678    int class_lastchar;    int class_lastchar;
679    int newoptions;    int newoptions;
680    int condref;    int condref;
681      int subreqchar;
682    
683    c = *ptr;    c = *ptr;
684    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
685      {      {
686      if ((pcre_ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
687      if (c == '#')      if (c == '#')
688        {        {
689        while ((c = *(++ptr)) != 0 && c != '\n');        while ((c = *(++ptr)) != 0 && c != '\n');
# Line 748  for (;; ptr++) Line 771  for (;; ptr++)
771    
772        if (c == '\\')        if (c == '\\')
773          {          {
774          c = check_escape(&ptr, errorptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
775          if (-c == ESC_b) c = '\b';          if (-c == ESC_b) c = '\b';
776          else if (c < 0)          else if (c < 0)
777            {            {
778              register const uschar *cbits = cd->cbits;
779            class_charcount = 10;            class_charcount = 10;
780            switch (-c)            switch (-c)
781              {              {
782              case ESC_d:              case ESC_d:
783              for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
784              continue;              continue;
785    
786              case ESC_D:              case ESC_D:
787              for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
788              continue;              continue;
789    
790              case ESC_w:              case ESC_w:
791              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++)
792                class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);                class[c] |= (cbits[c+cbit_digit] | cbits[c+cbit_word]);
793              continue;              continue;
794    
795              case ESC_W:              case ESC_W:
796              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++)
797                class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);                class[c] |= ~(cbits[c+cbit_digit] | cbits[c+cbit_word]);
798              continue;              continue;
799    
800              case ESC_s:              case ESC_s:
801              for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
802              continue;              continue;
803    
804              case ESC_S:              case ESC_S:
805              for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
806              continue;              continue;
807    
808              default:              default:
# Line 810  for (;; ptr++) Line 834  for (;; ptr++)
834    
835          if (d == '\\')          if (d == '\\')
836            {            {
837            d = check_escape(&ptr, errorptr, *brackets, options, TRUE);            d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
838            if (d < 0)            if (d < 0)
839              {              {
840              if (d == -ESC_b) d = '\b'; else              if (d == -ESC_b) d = '\b'; else
# Line 832  for (;; ptr++) Line 856  for (;; ptr++)
856            class[c/8] |= (1 << (c&7));            class[c/8] |= (1 << (c&7));
857            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
858              {              {
859              int uc = pcre_fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
860              class[uc/8] |= (1 << (uc&7));              class[uc/8] |= (1 << (uc&7));
861              }              }
862            class_charcount++;                /* in case a one-char range */            class_charcount++;                /* in case a one-char range */
# Line 847  for (;; ptr++) Line 871  for (;; ptr++)
871        class [c/8] |= (1 << (c&7));        class [c/8] |= (1 << (c&7));
872        if ((options & PCRE_CASELESS) != 0)        if ((options & PCRE_CASELESS) != 0)
873          {          {
874          c = pcre_fcc[c];   /* flip case */          c = cd->fcc[c];   /* flip case */
875          class[c/8] |= (1 << (c&7));          class[c/8] |= (1 << (c&7));
876          }          }
877        class_charcount++;        class_charcount++;
# Line 894  for (;; ptr++) Line 918  for (;; ptr++)
918      /* Various kinds of repeat */      /* Various kinds of repeat */
919    
920      case '{':      case '{':
921      if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;      if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
922      ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);      ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
923      if (*errorptr != NULL) goto FAILED;      if (*errorptr != NULL) goto FAILED;
924      goto REPEAT;      goto REPEAT;
925    
# Line 928  for (;; ptr++) Line 952  for (;; ptr++)
952        { repeat_type = greedy_non_default; ptr++; }        { repeat_type = greedy_non_default; ptr++; }
953      else repeat_type = greedy_default;      else repeat_type = greedy_default;
954    
     /* If the maximum is zero then the minimum must also be zero; Perl allows  
     this case, so we do too - by simply omitting the item altogether. */  
   
     if (repeat_max == 0) code = previous;  
   
955      /* If previous was a string of characters, chop off the last one and use it      /* If previous was a string of characters, chop off the last one and use it
956      as the subject of the repeat. If there was only one character, we can      as the subject of the repeat. If there was only one character, we can
957      abolish the previous item altogether. */      abolish the previous item altogether. A repeat with a zero minimum wipes
958        out any reqchar setting, backing up to the previous value. We must also
959        adjust the countlits value. */
960    
961      else if (*previous == OP_CHARS)      if (*previous == OP_CHARS)
962        {        {
963        int len = previous[1];        int len = previous[1];
964    
965          if (repeat_min == 0) *reqchar = prevreqchar;
966          *countlits += repeat_min - 1;
967    
968        if (len == 1)        if (len == 1)
969          {          {
970          c = previous[2];          c = previous[2];
# Line 978  for (;; ptr++) Line 1003  for (;; ptr++)
1003        code = previous;        code = previous;
1004    
1005        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
1006        repeat_type += op_type;      /* Combine both values for many cases */  
1007          /* If the maximum is zero then the minimum must also be zero; Perl allows
1008          this case, so we do too - by simply omitting the item altogether. */
1009    
1010          if (repeat_max == 0) goto END_REPEAT;
1011    
1012          /* Combine the op_type with the repeat_type */
1013    
1014          repeat_type += op_type;
1015    
1016        /* A minimum of zero is handled either as the special case * or ?, or as        /* A minimum of zero is handled either as the special case * or ?, or as
1017        an UPTO, with the maximum given. */        an UPTO, with the maximum given. */
# Line 1055  for (;; ptr++) Line 1088  for (;; ptr++)
1088        }        }
1089    
1090      /* If previous was a character class or a back reference, we put the repeat      /* If previous was a character class or a back reference, we put the repeat
1091      stuff after it. */      stuff after it, but just skip the item if the repeat was {0,0}. */
1092    
1093      else if (*previous == OP_CLASS || *previous == OP_REF)      else if (*previous == OP_CLASS || *previous == OP_REF)
1094        {        {
1095          if (repeat_max == 0)
1096            {
1097            code = previous;
1098            goto END_REPEAT;
1099            }
1100        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
1101          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
1102        else if (repeat_min == 1 && repeat_max == -1)        else if (repeat_min == 1 && repeat_max == -1)
# Line 1082  for (;; ptr++) Line 1120  for (;; ptr++)
1120      else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||      else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1121               (int)*previous == OP_COND)               (int)*previous == OP_COND)
1122        {        {
1123        int i, ketoffset = 0;        register int i;
1124          int ketoffset = 0;
1125        int len = code - previous;        int len = code - previous;
1126          uschar *bralink = NULL;
1127    
1128        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
1129        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
# Line 1098  for (;; ptr++) Line 1138  for (;; ptr++)
1138          ketoffset = code - ket;          ketoffset = code - ket;
1139          }          }
1140    
1141        /* If the minimum is greater than zero, and the maximum is unlimited or        /* The case of a zero minimum is special because of the need to stick
1142        equal to the minimum, the first copy remains where it is, and is        OP_BRAZERO in front of it, and because the group appears once in the
1143        replicated up to the minimum number of times. This case includes the +        data, whereas in other cases it appears the minimum number of times. For
1144        repeat, but of course no replication is needed in that case. */        this reason, it is simplest to treat this case separately, as otherwise
1145          the code gets far too messy. There are several special subcases when the
1146          minimum is zero. */
1147    
1148        if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))        if (repeat_min == 0)
1149          {          {
1150          for (i = 1; i < repeat_min; i++)          /* If we set up a required char from the bracket, we must back off
1151            to the previous value and reset the countlits value too. */
1152    
1153            if (subcountlits > 0)
1154            {            {
1155            memcpy(code, previous, len);            *reqchar = prevreqchar;
1156            code += len;            *countlits -= subcountlits;
1157            }            }
         }  
1158    
1159        /* If the minimum is zero, stick BRAZERO in front of the first copy.          /* If the maximum is also zero, we just omit the group from the output
1160        Then, if there is a fixed upper limit, replicated up to that many times,          altogether. */
       sticking BRAZERO in front of all the optional ones. */  
1161    
1162        else          if (repeat_max == 0)
1163          {            {
1164          if (repeat_min == 0)            code = previous;
1165              goto END_REPEAT;
1166              }
1167    
1168            /* If the maximum is 1 or unlimited, we just have to stick in the
1169            BRAZERO and do no more at this point. */
1170    
1171            if (repeat_max <= 1)
1172            {            {
1173            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
1174            code++;            code++;
1175            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
1176            }            }
1177    
1178            /* If the maximum is greater than 1 and limited, we have to replicate
1179            in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1180            The first one has to be handled carefully because it's the original
1181            copy, which has to be moved up. The remainder can be handled by code
1182            that is common with the non-zero minimum case below. We just have to
1183            adjust the value of repeat_max, since one less copy is required. */
1184    
1185            else
1186              {
1187              int offset;
1188              memmove(previous+4, previous, len);
1189              code += 4;
1190              *previous++ = OP_BRAZERO + repeat_type;
1191              *previous++ = OP_BRA;
1192    
1193              /* We chain together the bracket offset fields that have to be
1194              filled in later when the ends of the brackets are reached. */
1195    
1196              offset = (bralink == NULL)? 0 : previous - bralink;
1197              bralink = previous;
1198              *previous++ = offset >> 8;
1199              *previous++ = offset & 255;
1200              }
1201    
1202            repeat_max--;
1203            }
1204    
1205          /* If the minimum is greater than zero, replicate the group as many
1206          times as necessary, and adjust the maximum to the number of subsequent
1207          copies that we need. */
1208    
1209          else
1210            {
1211          for (i = 1; i < repeat_min; i++)          for (i = 1; i < repeat_min; i++)
1212            {            {
1213            memcpy(code, previous, len);            memcpy(code, previous, len);
1214            code += len;            code += len;
1215            }            }
1216            if (repeat_max > 0) repeat_max -= repeat_min;
1217            }
1218    
1219          /* This code is common to both the zero and non-zero minimum cases. If
1220          the maximum is limited, it replicates the group in a nested fashion,
1221          remembering the bracket starts on a stack. In the case of a zero minimum,
1222          the first one was set up above. In all cases the repeat_max now specifies
1223          the number of additional copies needed. */
1224    
1225          for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)        if (repeat_max >= 0)
1226            {
1227            for (i = repeat_max - 1; i >= 0; i--)
1228            {            {
1229            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
1230    
1231              /* All but the final copy start a new nesting, maintaining the
1232              chain of brackets outstanding. */
1233    
1234              if (i != 0)
1235                {
1236                int offset;
1237                *code++ = OP_BRA;
1238                offset = (bralink == NULL)? 0 : code - bralink;
1239                bralink = code;
1240                *code++ = offset >> 8;
1241                *code++ = offset & 255;
1242                }
1243    
1244            memcpy(code, previous, len);            memcpy(code, previous, len);
1245            code += len;            code += len;
1246            }            }
1247    
1248            /* Now chain through the pending brackets, and fill in their length
1249            fields (which are holding the chain links pro tem). */
1250    
1251            while (bralink != NULL)
1252              {
1253              int oldlinkoffset;
1254              int offset = code - bralink + 1;
1255              uschar *bra = code - offset;
1256              oldlinkoffset = (bra[1] << 8) + bra[2];
1257              bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1258              *code++ = OP_KET;
1259              *code++ = bra[1] = offset >> 8;
1260              *code++ = bra[2] = (offset & 255);
1261              }
1262          }          }
1263    
1264        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
# Line 1144  for (;; ptr++) Line 1266  for (;; ptr++)
1266        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
1267        correct offset was computed above. */        correct offset was computed above. */
1268    
1269        if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type;        else code[-ketoffset] = OP_KETRMAX + repeat_type;
1270        }        }
1271    
1272      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 1157  for (;; ptr++) Line 1279  for (;; ptr++)
1279    
1280      /* In all cases we no longer have a previous item. */      /* In all cases we no longer have a previous item. */
1281    
1282        END_REPEAT:
1283      previous = NULL;      previous = NULL;
1284      break;      break;
1285    
# Line 1191  for (;; ptr++) Line 1314  for (;; ptr++)
1314    
1315          case '(':          case '(':
1316          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
1317          if ((pcre_ctypes[*(++ptr)] & ctype_digit) != 0)          if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1318            {            {
1319            condref = *ptr - '0';            condref = *ptr - '0';
1320            while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';            while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
# Line 1324  for (;; ptr++) Line 1447  for (;; ptr++)
1447           errorptr,                     /* Where to put an error message */           errorptr,                     /* Where to put an error message */
1448           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
1449            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1450           condref))                     /* Condition reference number */           condref,                      /* Condition reference number */
1451             &subreqchar,                  /* For possible last char */
1452             &subcountlits,                /* For literal count */
1453             cd))                          /* Tables block */
1454        goto FAILED;        goto FAILED;
1455    
1456      /* At the end of compiling, code is still pointing to the start of the      /* At the end of compiling, code is still pointing to the start of the
# Line 1337  for (;; ptr++) Line 1463  for (;; ptr++)
1463    
1464      if (bravalue == OP_COND)      if (bravalue == OP_COND)
1465        {        {
       int branchcount = 0;  
1466        uschar *tc = code;        uschar *tc = code;
1467          condcount = 0;
1468    
1469        do {        do {
1470           branchcount++;           condcount++;
1471           tc += (tc[1] << 8) | tc[2];           tc += (tc[1] << 8) | tc[2];
1472           }           }
1473        while (*tc != OP_KET);        while (*tc != OP_KET);
1474    
1475        if (branchcount > 2)        if (condcount > 2)
1476          {          {
1477          *errorptr = ERR27;          *errorptr = ERR27;
1478          goto FAILED;          goto FAILED;
1479          }          }
1480        }        }
1481    
1482        /* Handle updating of the required character. If the subpattern didn't
1483        set one, leave it as it was. Otherwise, update it for normal brackets of
1484        all kinds, forward assertions, and conditions with two branches. Don't
1485        update the literal count for forward assertions, however. If the bracket
1486        is followed by a quantifier with zero repeat, we have to back off. Hence
1487        the definition of prevreqchar and subcountlits outside the main loop so
1488        that they can be accessed for the back off. */
1489    
1490        if (subreqchar > 0 &&
1491             (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1492             (bravalue == OP_COND && condcount == 2)))
1493          {
1494          prevreqchar = *reqchar;
1495          *reqchar = subreqchar;
1496          if (bravalue != OP_ASSERT) *countlits += subcountlits;
1497          }
1498    
1499      /* Now update the main code pointer to the end of the group. */      /* Now update the main code pointer to the end of the group. */
1500    
1501      code = tempcode;      code = tempcode;
# Line 1372  for (;; ptr++) Line 1515  for (;; ptr++)
1515    
1516      case '\\':      case '\\':
1517      tempptr = ptr;      tempptr = ptr;
1518      c = check_escape(&ptr, errorptr, *brackets, options, FALSE);      c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1519    
1520      /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values      /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1521      are arranged to be the negation of the corresponding OP_values. For the      are arranged to be the negation of the corresponding OP_values. For the
# Line 1417  for (;; ptr++) Line 1560  for (;; ptr++)
1560        {        {
1561        if ((options & PCRE_EXTENDED) != 0)        if ((options & PCRE_EXTENDED) != 0)
1562          {          {
1563          if ((pcre_ctypes[c] & ctype_space) != 0) continue;          if ((cd->ctypes[c] & ctype_space) != 0) continue;
1564          if (c == '#')          if (c == '#')
1565            {            {
1566            while ((c = *(++ptr)) != 0 && c != '\n');            while ((c = *(++ptr)) != 0 && c != '\n');
# Line 1433  for (;; ptr++) Line 1576  for (;; ptr++)
1576        if (c == '\\')        if (c == '\\')
1577          {          {
1578          tempptr = ptr;          tempptr = ptr;
1579          c = check_escape(&ptr, errorptr, *brackets, options, FALSE);          c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1580          if (c < 0) { ptr = tempptr; break; }          if (c < 0) { ptr = tempptr; break; }
1581          }          }
1582    
# Line 1445  for (;; ptr++) Line 1588  for (;; ptr++)
1588    
1589      /* This "while" is the end of the "do" above. */      /* This "while" is the end of the "do" above. */
1590    
1591      while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);      while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
1592    
1593        /* Update the last character and the count of literals */
1594    
1595        prevreqchar = (length > 1)? code[-2] : *reqchar;
1596        *reqchar = code[-1];
1597        *countlits += length;
1598    
1599      /* Compute the length and set it in the data vector, and advance to      /* Compute the length and set it in the data vector, and advance to
1600      the next state. */      the next state. */
# Line 1490  Argument: Line 1639  Argument:
1639    errorptr    -> pointer to error message    errorptr    -> pointer to error message
1640    lookbehind  TRUE if this is a lookbehind assertion    lookbehind  TRUE if this is a lookbehind assertion
1641    condref     > 0 for OPT_CREF setting at start of conditional group    condref     > 0 for OPT_CREF setting at start of conditional group
1642      reqchar     -> place to put the last required character, or a negative number
1643      countlits   -> place to put the shortest literal count of any branch
1644      cd          points to the data block with tables pointers
1645    
1646  Returns:      TRUE on success  Returns:      TRUE on success
1647  */  */
1648    
1649  static BOOL  static BOOL
1650  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
1651    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref)    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
1652      int *reqchar, int *countlits, compile_data *cd)
1653  {  {
1654  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
1655  uschar *code = *codeptr;  uschar *code = *codeptr;
# Line 1504  uschar *last_branch = code; Line 1657  uschar *last_branch = code;
1657  uschar *start_bracket = code;  uschar *start_bracket = code;
1658  uschar *reverse_count = NULL;  uschar *reverse_count = NULL;
1659  int oldoptions = options & PCRE_IMS;  int oldoptions = options & PCRE_IMS;
1660    int branchreqchar, branchcountlits;
1661    
1662    *reqchar = -1;
1663    *countlits = INT_MAX;
1664  code += 3;  code += 3;
1665    
1666  /* At the start of a reference-based conditional group, insert the reference  /* At the start of a reference-based conditional group, insert the reference
# Line 1543  for (;;) Line 1699  for (;;)
1699    
1700    /* Now compile the branch */    /* Now compile the branch */
1701    
1702    if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged))    if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
1703          &branchreqchar, &branchcountlits, cd))
1704      {      {
1705      *ptrptr = ptr;      *ptrptr = ptr;
1706      return FALSE;      return FALSE;
# Line 1555  for (;;) Line 1712  for (;;)
1712    last_branch[1] = length >> 8;    last_branch[1] = length >> 8;
1713    last_branch[2] = length & 255;    last_branch[2] = length & 255;
1714    
1715      /* Save the last required character if all branches have the same; a current
1716      value of -1 means unset, while -2 means "previous branch had no last required
1717      char".  */
1718    
1719      if (*reqchar != -2)
1720        {
1721        if (branchreqchar >= 0)
1722          {
1723          if (*reqchar == -1) *reqchar = branchreqchar;
1724          else if (*reqchar != branchreqchar) *reqchar = -2;
1725          }
1726        else *reqchar = -2;
1727        }
1728    
1729      /* Keep the shortest literal count */
1730    
1731      if (branchcountlits < *countlits) *countlits = branchcountlits;
1732      DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
1733    
1734    /* If lookbehind, check that this branch matches a fixed-length string,    /* If lookbehind, check that this branch matches a fixed-length string,
1735    and put the length into the OP_REVERSE item. Temporarily mark the end of    and put the length into the OP_REVERSE item. Temporarily mark the end of
1736    the branch with OP_END. */    the branch with OP_END. */
# Line 1649  for (;;) Line 1825  for (;;)
1825      code += 2;      code += 2;
1826      break;      break;
1827    
1828        case OP_WORD_BOUNDARY:
1829        case OP_NOT_WORD_BOUNDARY:
1830        code++;
1831        break;
1832    
1833      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
1834      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1835      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
# Line 1676  all of whose alternatives start with OP_ Line 1857  all of whose alternatives start with OP_
1857  it's anchored. However, if this is a multiline pattern, then only OP_SOD  it's anchored. However, if this is a multiline pattern, then only OP_SOD
1858  counts, since OP_CIRC can match in the middle.  counts, since OP_CIRC can match in the middle.
1859    
1860  A branch is also implicitly anchored if it starts with .* because that will try  A branch is also implicitly anchored if it starts with .* and DOTALL is set,
1861  the rest of the pattern at all possible matching points, so there is no point  because that will try the rest of the pattern at all possible matching points,
1862  trying them again.  so there is no point trying them again.
1863    
1864  Arguments:  Arguments:
1865    code       points to start of expression (the bracket)    code       points to start of expression (the bracket)
# Line 1696  do { Line 1877  do {
1877     register int op = *scode;     register int op = *scode;
1878     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1879       { if (!is_anchored(scode, options)) return FALSE; }       { if (!is_anchored(scode, options)) return FALSE; }
1880     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
1881                (*options & PCRE_DOTALL) != 0)
1882       { if (scode[1] != OP_ANY) return FALSE; }       { if (scode[1] != OP_ANY) return FALSE; }
1883     else if (op != OP_SOD &&     else if (op != OP_SOD &&
1884             ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))             ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
# Line 1710  return TRUE; Line 1892  return TRUE;
1892    
1893    
1894  /*************************************************  /*************************************************
1895  *     Check for start with \n line expression    *  *         Check for starting with ^ or .*        *
1896  *************************************************/  *************************************************/
1897    
1898  /* This is called for multiline expressions to try to find out if every branch  /* This is called to find out if every branch starts with ^ or .* so that
1899  starts with ^ so that "first char" processing can be done to speed things up.  "first char" processing can be done to speed things up in multiline
1900    matching and for non-DOTALL patterns that start with .* (which must start at
1901    the beginning or after \n).
1902    
1903  Argument:  points to start of expression (the bracket)  Argument:  points to start of expression (the bracket)
1904  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
# Line 1728  do { Line 1912  do {
1912     register int op = *scode;     register int op = *scode;
1913     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1914       { if (!is_startline(scode)) return FALSE; }       { if (!is_startline(scode)) return FALSE; }
1915       else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1916         { if (scode[1] != OP_ANY) return FALSE; }
1917     else if (op != OP_CIRC) return FALSE;     else if (op != OP_CIRC) return FALSE;
1918     code += (code[1] << 8) + code[2];     code += (code[1] << 8) + code[2];
1919     }     }
# Line 1813  Arguments: Line 1999  Arguments:
1999    options      various option bits    options      various option bits
2000    errorptr     pointer to pointer to error text    errorptr     pointer to pointer to error text
2001    erroroffset  ptr offset in pattern where error was detected    erroroffset  ptr offset in pattern where error was detected
2002      tables       pointer to character tables or NULL
2003    
2004  Returns:       pointer to compiled data block, or NULL on error,  Returns:       pointer to compiled data block, or NULL on error,
2005                 with errorptr and erroroffset set                 with errorptr and erroroffset set
# Line 1820  Returns:       pointer to compiled data Line 2007  Returns:       pointer to compiled data
2007    
2008  pcre *  pcre *
2009  pcre_compile(const char *pattern, int options, const char **errorptr,  pcre_compile(const char *pattern, int options, const char **errorptr,
2010    int *erroroffset)    int *erroroffset, const unsigned char *tables)
2011  {  {
2012  real_pcre *re;  real_pcre *re;
2013  int length = 3;      /* For initial BRA plus length */  int length = 3;      /* For initial BRA plus length */
2014  int runlength;  int runlength;
2015  int c, size;  int c, size, reqchar, countlits;
2016  int bracount = 0;  int bracount = 0;
2017  int top_backref = 0;  int top_backref = 0;
2018  int branch_extra = 0;  int branch_extra = 0;
# Line 1833  int branch_newextra; Line 2020  int branch_newextra;
2020  unsigned int brastackptr = 0;  unsigned int brastackptr = 0;
2021  uschar *code;  uschar *code;
2022  const uschar *ptr;  const uschar *ptr;
2023    compile_data compile_block;
2024  int brastack[BRASTACK_SIZE];  int brastack[BRASTACK_SIZE];
2025  uschar bralenstack[BRASTACK_SIZE];  uschar bralenstack[BRASTACK_SIZE];
2026    
# Line 1861  if ((options & ~PUBLIC_OPTIONS) != 0) Line 2049  if ((options & ~PUBLIC_OPTIONS) != 0)
2049    return NULL;    return NULL;
2050    }    }
2051    
2052    /* Set up pointers to the individual character tables */
2053    
2054    if (tables == NULL) tables = pcre_default_tables;
2055    compile_block.lcc = tables + lcc_offset;
2056    compile_block.fcc = tables + fcc_offset;
2057    compile_block.cbits = tables + cbits_offset;
2058    compile_block.ctypes = tables + ctypes_offset;
2059    
2060    /* Reflect pattern for debugging output */
2061    
2062  DPRINTF(("------------------------------------------------------------------\n"));  DPRINTF(("------------------------------------------------------------------\n"));
2063  DPRINTF(("%s\n", pattern));  DPRINTF(("%s\n", pattern));
2064    
# Line 1879  while ((c = *(++ptr)) != 0) Line 2077  while ((c = *(++ptr)) != 0)
2077    
2078    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
2079      {      {
2080      if ((pcre_ctypes[c] & ctype_space) != 0) continue;      if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2081      if (c == '#')      if (c == '#')
2082        {        {
2083        while ((c = *(++ptr)) != 0 && c != '\n');        while ((c = *(++ptr)) != 0 && c != '\n');
# Line 1897  while ((c = *(++ptr)) != 0) Line 2095  while ((c = *(++ptr)) != 0)
2095      case '\\':      case '\\':
2096        {        {
2097        const uschar *save_ptr = ptr;        const uschar *save_ptr = ptr;
2098        c = check_escape(&ptr, errorptr, bracount, options, FALSE);        c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2099        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2100        if (c >= 0)        if (c >= 0)
2101          {          {
# Line 1917  while ((c = *(++ptr)) != 0) Line 2115  while ((c = *(++ptr)) != 0)
2115        int refnum = -c - ESC_REF;        int refnum = -c - ESC_REF;
2116        if (refnum > top_backref) top_backref = refnum;        if (refnum > top_backref) top_backref = refnum;
2117        length++;   /* For single back reference */        length++;   /* For single back reference */
2118        if (ptr[1] == '{' && is_counted_repeat(ptr+2))        if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2119          {          {
2120          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2121          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2122          if ((min == 0 && (max == 1 || max == -1)) ||          if ((min == 0 && (max == 1 || max == -1)) ||
2123            (min == 1 && max == -1))            (min == 1 && max == -1))
# Line 1943  while ((c = *(++ptr)) != 0) Line 2141  while ((c = *(++ptr)) != 0)
2141      or back reference. */      or back reference. */
2142    
2143      case '{':      case '{':
2144      if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;      if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2145      ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);      ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2146      if (*errorptr != NULL) goto PCRE_ERROR_RETURN;      if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2147      if ((min == 0 && (max == 1 || max == -1)) ||      if ((min == 0 && (max == 1 || max == -1)) ||
2148        (min == 1 && max == -1))        (min == 1 && max == -1))
# Line 1979  while ((c = *(++ptr)) != 0) Line 2177  while ((c = *(++ptr)) != 0)
2177        {        {
2178        if (*ptr == '\\')        if (*ptr == '\\')
2179          {          {
2180          int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);          int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2181              &compile_block);
2182          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2183          if (-ch == ESC_b) class_charcount++; else class_charcount = 10;          if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2184          }          }
# Line 1996  while ((c = *(++ptr)) != 0) Line 2195  while ((c = *(++ptr)) != 0)
2195    
2196        /* A repeat needs either 1 or 5 bytes. */        /* A repeat needs either 1 or 5 bytes. */
2197    
2198        if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))        if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2199          {          {
2200          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2201          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2202          if ((min == 0 && (max == 1 || max == -1)) ||          if ((min == 0 && (max == 1 || max == -1)) ||
2203            (min == 1 && max == -1))            (min == 1 && max == -1))
# Line 2064  while ((c = *(++ptr)) != 0) Line 2263  while ((c = *(++ptr)) != 0)
2263          group. */          group. */
2264    
2265          case '(':          case '(':
2266          if ((pcre_ctypes[ptr[3]] & ctype_digit) != 0)          if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2267            {            {
2268            ptr += 4;            ptr += 4;
2269            length += 2;            length += 2;
2270            while ((pcre_ctypes[*ptr] & ctype_digit) != 0) ptr++;            while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2271            if (*ptr != ')')            if (*ptr != ')')
2272              {              {
2273              *errorptr = ERR26;              *errorptr = ERR26;
# Line 2153  while ((c = *(++ptr)) != 0) Line 2352  while ((c = *(++ptr)) != 0)
2352              will lead to an over-estimate on the length, but this shouldn't              will lead to an over-estimate on the length, but this shouldn't
2353              matter very much. We also have to allow for resetting options at              matter very much. We also have to allow for resetting options at
2354              the start of any alternations, which we do by setting              the start of any alternations, which we do by setting
2355              branch_newextra to 2. */              branch_newextra to 2. Finally, we record whether the case-dependent
2356                flag ever changes within the regex. This is used by the "required
2357                character" code. */
2358    
2359              case ':':              case ':':
2360              if (((set|unset) & PCRE_IMS) != 0)              if (((set|unset) & PCRE_IMS) != 0)
2361                {                {
2362                length += 4;                length += 4;
2363                branch_newextra = 2;                branch_newextra = 2;
2364                  if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2365                }                }
2366              goto END_OPTIONS;              goto END_OPTIONS;
2367    
# Line 2237  while ((c = *(++ptr)) != 0) Line 2439  while ((c = *(++ptr)) != 0)
2439        /* Leave ptr at the final char; for read_repeat_counts this happens        /* Leave ptr at the final char; for read_repeat_counts this happens
2440        automatically; for the others we need an increment. */        automatically; for the others we need an increment. */
2441    
2442        if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))        if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2443          {          {
2444          ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);          ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2445              &compile_block);
2446          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2447          }          }
2448        else if (c == '*') { minval = 0; maxval = -1; ptr++; }        else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2449        else if (c == '+') { maxval = -1; ptr++; }        else if (c == '+') { maxval = -1; ptr++; }
2450        else if (c == '?') { minval = 0; ptr++; }        else if (c == '?') { minval = 0; ptr++; }
2451    
2452        /* If there is a minimum > 1 we have to replicate up to minval-1 times;        /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2453        if there is a limited maximum we have to replicate up to maxval-1 times        group, and if the maximum is greater than zero, we have to replicate
2454        and allow for a BRAZERO item before each optional copy, as we also have        maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2455        to do before the first copy if the minimum is zero. */        bracket set - hence the 7. */
2456    
2457        if (minval == 0) length++;        if (minval == 0)
2458          else if (minval > 1) length += (minval - 1) * duplength;          {
2459        if (maxval > minval) length += (maxval - minval) * (duplength + 1);          length++;
2460            if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2461            }
2462    
2463          /* When the minimum is greater than zero, we have to replicate up to
2464          minval-1 times, with no additions required in the copies. Then, if
2465          there is a limited maximum we have to replicate up to maxval-1 times
2466          allowing for a BRAZERO item before each optional copy and nesting
2467          brackets for all but one of the optional copies. */
2468    
2469          else
2470            {
2471            length += (minval - 1) * duplength;
2472            if (maxval > minval)   /* Need this test as maxval=-1 means no limit */
2473              length += (maxval - minval) * (duplength + 7) - 6;
2474            }
2475        }        }
2476      continue;      continue;
2477    
# Line 2270  while ((c = *(++ptr)) != 0) Line 2488  while ((c = *(++ptr)) != 0)
2488        {        {
2489        if ((options & PCRE_EXTENDED) != 0)        if ((options & PCRE_EXTENDED) != 0)
2490          {          {
2491          if ((pcre_ctypes[c] & ctype_space) != 0) continue;          if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2492          if (c == '#')          if (c == '#')
2493            {            {
2494            while ((c = *(++ptr)) != 0 && c != '\n');            while ((c = *(++ptr)) != 0 && c != '\n');
# Line 2284  while ((c = *(++ptr)) != 0) Line 2502  while ((c = *(++ptr)) != 0)
2502        if (c == '\\')        if (c == '\\')
2503          {          {
2504          const uschar *saveptr = ptr;          const uschar *saveptr = ptr;
2505          c = check_escape(&ptr, errorptr, bracount, options, FALSE);          c = check_escape(&ptr, errorptr, bracount, options, FALSE,
2506              &compile_block);
2507          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2508          if (c < 0) { ptr = saveptr; break; }          if (c < 0) { ptr = saveptr; break; }
2509          }          }
# Line 2296  while ((c = *(++ptr)) != 0) Line 2515  while ((c = *(++ptr)) != 0)
2515    
2516      /* This "while" is the end of the "do" above. */      /* This "while" is the end of the "do" above. */
2517    
2518      while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);      while (runlength < 255 &&
2519          (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
2520    
2521      ptr--;      ptr--;
2522      length += runlength;      length += runlength;
# Line 2331  if (re == NULL) Line 2551  if (re == NULL)
2551    
2552  re->magic_number = MAGIC_NUMBER;  re->magic_number = MAGIC_NUMBER;
2553  re->options = options;  re->options = options;
2554    re->tables = tables;
2555    
2556  /* Set up a starting, non-extracting bracket, then compile the expression. On  /* Set up a starting, non-extracting bracket, then compile the expression. On
2557  error, *errorptr will be set non-NULL, so we don't need to look at the result  error, *errorptr will be set non-NULL, so we don't need to look at the result
# Line 2340  ptr = (const uschar *)pattern; Line 2561  ptr = (const uschar *)pattern;
2561  code = re->code;  code = re->code;
2562  *code = OP_BRA;  *code = OP_BRA;
2563  bracount = 0;  bracount = 0;
2564  (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1);  (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
2565      &reqchar, &countlits, &compile_block);
2566  re->top_bracket = bracount;  re->top_bracket = bracount;
2567  re->top_backref = top_backref;  re->top_backref = top_backref;
2568    
# Line 2372  if (*errorptr != NULL) Line 2594  if (*errorptr != NULL)
2594    return NULL;    return NULL;
2595    }    }
2596    
2597  /* If the anchored option was not passed, set flag if we can determine that it  /* If the anchored option was not passed, set flag if we can determine that the
2598  is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if  pattern is anchored by virtue of ^ characters or \A or anything else (such as
2599  we can determine what the first character has to be, because that speeds up  starting with .* when DOTALL is set).
2600  unanchored matches no end. In the case of multiline matches, an alternative is  
2601  to set the PCRE_STARTLINE flag if all branches start with ^. */  Otherwise, see if we can determine what the first character has to be, because
2602    that speeds up unanchored matches no end. If not, see if we can set the
2603    PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
2604    start with ^, and also when all branches start with .* for non-DOTALL matches.
2605    */
2606    
2607  if ((options & PCRE_ANCHORED) == 0)  if ((options & PCRE_ANCHORED) == 0)
2608    {    {
# Line 2396  if ((options & PCRE_ANCHORED) == 0) Line 2622  if ((options & PCRE_ANCHORED) == 0)
2622      }      }
2623    }    }
2624    
2625    /* Save the last required character if there are at least two literal
2626    characters on all paths, or if there is no first character setting. */
2627    
2628    if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
2629      {
2630      re->req_char = reqchar;
2631      re->options |= PCRE_REQCHSET;
2632      }
2633    
2634  /* Print out the compiled data for debugging */  /* Print out the compiled data for debugging */
2635    
2636  #ifdef DEBUG  #ifdef DEBUG
# Line 2405  printf("Length = %d top_bracket = %d top Line 2640  printf("Length = %d top_bracket = %d top
2640    
2641  if (re->options != 0)  if (re->options != 0)
2642    {    {
2643    printf("%s%s%s%s%s%s%s%s\n",    printf("%s%s%s%s%s%s%s%s%s\n",
2644      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2645      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2646        ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
2647      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2648      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2649      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
# Line 2422  if ((re->options & PCRE_FIRSTSET) != 0) Line 2658  if ((re->options & PCRE_FIRSTSET) != 0)
2658      else printf("First char = \\x%02x\n", re->first_char);      else printf("First char = \\x%02x\n", re->first_char);
2659    }    }
2660    
2661    if ((re->options & PCRE_REQCHSET) != 0)
2662      {
2663      if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
2664        else printf("Req char = \\x%02x\n", re->req_char);
2665      }
2666    
2667  code_end = code;  code_end = code;
2668  code_base = code = re->code;  code_base = code = re->code;
2669    
# Line 2637  return (pcre *)re; Line 2879  return (pcre *)re;
2879    
2880    
2881  /*************************************************  /*************************************************
 *        Match a character type                  *  
 *************************************************/  
   
 /* Not used in all the places it might be as it's sometimes faster  
 to put the code inline.  
   
 Arguments:  
   type        the character type  
   c           the character  
   dotall      the dotall flag  
   
 Returns:      TRUE if character is of the type  
 */  
   
 static BOOL  
 match_type(int type, int c, BOOL dotall)  
 {  
   
 #ifdef DEBUG  
 if (isprint(c)) printf("matching subject %c against ", c);  
   else printf("matching subject \\x%02x against ", c);  
 printf("%s\n", OP_names[type]);  
 #endif  
   
 switch(type)  
   {  
   case OP_ANY:            return dotall || c != '\n';  
   case OP_NOT_DIGIT:      return (pcre_ctypes[c] & ctype_digit) == 0;  
   case OP_DIGIT:          return (pcre_ctypes[c] & ctype_digit) != 0;  
   case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;  
   case OP_WHITESPACE:     return (pcre_ctypes[c] & ctype_space) != 0;  
   case OP_NOT_WORDCHAR:   return (pcre_ctypes[c] & ctype_word) == 0;  
   case OP_WORDCHAR:       return (pcre_ctypes[c] & ctype_word) != 0;  
   }  
 return FALSE;  
 }  
   
   
   
 /*************************************************  
2882  *          Match a back-reference                *  *          Match a back-reference                *
2883  *************************************************/  *************************************************/
2884    
# Line 2695  Returns:      TRUE if matched Line 2897  Returns:      TRUE if matched
2897    
2898  static BOOL  static BOOL
2899  match_ref(int offset, register const uschar *eptr, int length, match_data *md,  match_ref(int offset, register const uschar *eptr, int length, match_data *md,
2900    int ims)    unsigned long int ims)
2901  {  {
2902  const uschar *p = md->start_subject + md->offset_vector[offset];  const uschar *p = md->start_subject + md->offset_vector[offset];
2903    
# Line 2719  if (length > md->end_subject - eptr) ret Line 2921  if (length > md->end_subject - eptr) ret
2921  /* Separate the caseless case for speed */  /* Separate the caseless case for speed */
2922    
2923  if ((ims & PCRE_CASELESS) != 0)  if ((ims & PCRE_CASELESS) != 0)
2924    { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }    {
2925      while (length-- > 0)
2926        if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
2927      }
2928  else  else
2929    { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }    { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2930    
# Line 2751  Returns:       TRUE if matched Line 2956  Returns:       TRUE if matched
2956    
2957  static BOOL  static BOOL
2958  match(register const uschar *eptr, register const uschar *ecode,  match(register const uschar *eptr, register const uschar *ecode,
2959    int offset_top, match_data *md, int ims, BOOL condassert, const uschar *eptrb)    int offset_top, match_data *md, unsigned long int ims, BOOL condassert,
2960      const uschar *eptrb)
2961  {  {
2962  int original_ims = ims;   /* Save for resetting on ')' */  unsigned long int original_ims = ims;   /* Save for resetting on ')' */
2963    
2964  for (;;)  for (;;)
2965    {    {
# Line 2782  for (;;) Line 2988  for (;;)
2988      int number = op - OP_BRA;      int number = op - OP_BRA;
2989      int offset = number << 1;      int offset = number << 1;
2990    
2991      DPRINTF(("start bracket %d\n", number));  #ifdef DEBUG
2992        printf("start bracket %d subject=", number);
2993        pchars(eptr, 16, TRUE, md);
2994        printf("\n");
2995    #endif
2996    
2997      if (offset < md->offset_max)      if (offset < md->offset_max)
2998        {        {
# Line 2864  for (;;) Line 3074  for (;;)
3074      ecode += 2;      ecode += 2;
3075      break;      break;
3076    
3077      /* End of the pattern */      /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3078        an empty string - recursion will then try other alternatives, if any. */
3079    
3080      case OP_END:      case OP_END:
3081        if (md->notempty && eptr == md->start_match) return FALSE;
3082      md->end_match_ptr = eptr;          /* Record where we ended */      md->end_match_ptr = eptr;          /* Record where we ended */
3083      md->end_offset_top = offset_top;   /* and how many extracts were taken */      md->end_offset_top = offset_top;   /* and how many extracts were taken */
3084      return TRUE;      return TRUE;
# Line 3172  for (;;) Line 3384  for (;;)
3384      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
3385        {        {
3386        BOOL prev_is_word = (eptr != md->start_subject) &&        BOOL prev_is_word = (eptr != md->start_subject) &&
3387          ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);          ((md->ctypes[eptr[-1]] & ctype_word) != 0);
3388        BOOL cur_is_word = (eptr < md->end_subject) &&        BOOL cur_is_word = (eptr < md->end_subject) &&
3389          ((pcre_ctypes[*eptr] & ctype_word) != 0);          ((md->ctypes[*eptr] & ctype_word) != 0);
3390        if ((*ecode++ == OP_WORD_BOUNDARY)?        if ((*ecode++ == OP_WORD_BOUNDARY)?
3391             cur_is_word == prev_is_word : cur_is_word != prev_is_word)             cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3392          return FALSE;          return FALSE;
# Line 3191  for (;;) Line 3403  for (;;)
3403      break;      break;
3404    
3405      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
3406      if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)      if (eptr >= md->end_subject ||
3407           (md->ctypes[*eptr++] & ctype_digit) != 0)
3408        return FALSE;        return FALSE;
3409      ecode++;      ecode++;
3410      break;      break;
3411    
3412      case OP_DIGIT:      case OP_DIGIT:
3413      if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)      if (eptr >= md->end_subject ||
3414           (md->ctypes[*eptr++] & ctype_digit) == 0)
3415        return FALSE;        return FALSE;
3416      ecode++;      ecode++;
3417      break;      break;
3418    
3419      case OP_NOT_WHITESPACE:      case OP_NOT_WHITESPACE:
3420      if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)      if (eptr >= md->end_subject ||
3421           (md->ctypes[*eptr++] & ctype_space) != 0)
3422        return FALSE;        return FALSE;
3423      ecode++;      ecode++;
3424      break;      break;
3425    
3426      case OP_WHITESPACE:      case OP_WHITESPACE:
3427      if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)      if (eptr >= md->end_subject ||
3428           (md->ctypes[*eptr++] & ctype_space) == 0)
3429        return FALSE;        return FALSE;
3430      ecode++;      ecode++;
3431      break;      break;
3432    
3433      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
3434      if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)      if (eptr >= md->end_subject ||
3435           (md->ctypes[*eptr++] & ctype_word) != 0)
3436        return FALSE;        return FALSE;
3437      ecode++;      ecode++;
3438      break;      break;
3439    
3440      case OP_WORDCHAR:      case OP_WORDCHAR:
3441      if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)      if (eptr >= md->end_subject ||
3442           (md->ctypes[*eptr++] & ctype_word) == 0)
3443        return FALSE;        return FALSE;
3444      ecode++;      ecode++;
3445      break;      break;
# Line 3453  for (;;) Line 3671  for (;;)
3671        if (length > md->end_subject - eptr) return FALSE;        if (length > md->end_subject - eptr) return FALSE;
3672        if ((ims & PCRE_CASELESS) != 0)        if ((ims & PCRE_CASELESS) != 0)
3673          {          {
3674          while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) return FALSE;          while (length-- > 0)
3675              if (md->lcc[*ecode++] != md->lcc[*eptr++])
3676                return FALSE;
3677          }          }
3678        else        else
3679          {          {
# Line 3510  for (;;) Line 3730  for (;;)
3730    
3731      if ((ims & PCRE_CASELESS) != 0)      if ((ims & PCRE_CASELESS) != 0)
3732        {        {
3733        c = pcre_lcc[c];        c = md->lcc[c];
3734        for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) return FALSE;        for (i = 1; i <= min; i++)
3735            if (c != md->lcc[*eptr++]) return FALSE;
3736        if (min == max) continue;        if (min == max) continue;
3737        if (minimize)        if (minimize)
3738          {          {
# Line 3519  for (;;) Line 3740  for (;;)
3740            {            {
3741            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3742              return TRUE;              return TRUE;
3743            if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])            if (i >= max || eptr >= md->end_subject ||
3744                  c != md->lcc[*eptr++])
3745              return FALSE;              return FALSE;
3746            }            }
3747          /* Control never gets here */          /* Control never gets here */
# Line 3529  for (;;) Line 3751  for (;;)
3751          const uschar *pp = eptr;          const uschar *pp = eptr;
3752          for (i = min; i < max; i++)          for (i = min; i < max; i++)
3753            {            {
3754            if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;            if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
3755            eptr++;            eptr++;
3756            }            }
3757          while (eptr >= pp)          while (eptr >= pp)
# Line 3579  for (;;) Line 3801  for (;;)
3801      ecode++;      ecode++;
3802      if ((ims & PCRE_CASELESS) != 0)      if ((ims & PCRE_CASELESS) != 0)
3803        {        {
3804        if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) return FALSE;        if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
3805        }        }
3806      else      else
3807        {        {
# Line 3639  for (;;) Line 3861  for (;;)
3861    
3862      if ((ims & PCRE_CASELESS) != 0)      if ((ims & PCRE_CASELESS) != 0)
3863        {        {
3864        c = pcre_lcc[c];        c = md->lcc[c];
3865        for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) return FALSE;        for (i = 1; i <= min; i++)
3866            if (c == md->lcc[*eptr++]) return FALSE;
3867        if (min == max) continue;        if (min == max) continue;
3868        if (minimize)        if (minimize)
3869          {          {
# Line 3648  for (;;) Line 3871  for (;;)
3871            {            {
3872            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3873              return TRUE;              return TRUE;
3874            if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])            if (i >= max || eptr >= md->end_subject ||
3875                  c == md->lcc[*eptr++])
3876              return FALSE;              return FALSE;
3877            }            }
3878          /* Control never gets here */          /* Control never gets here */
# Line 3658  for (;;) Line 3882  for (;;)
3882          const uschar *pp = eptr;          const uschar *pp = eptr;
3883          for (i = min; i < max; i++)          for (i = min; i < max; i++)
3884            {            {
3885            if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;            if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
3886            eptr++;            eptr++;
3887            }            }
3888          while (eptr >= pp)          while (eptr >= pp)
# Line 3752  for (;;) Line 3976  for (;;)
3976    
3977        case OP_NOT_DIGIT:        case OP_NOT_DIGIT:
3978        for (i = 1; i <= min; i++)        for (i = 1; i <= min; i++)
3979          if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) return FALSE;          if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
3980        break;        break;
3981    
3982        case OP_DIGIT:        case OP_DIGIT:
3983        for (i = 1; i <= min; i++)        for (i = 1; i <= min; i++)
3984          if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) return FALSE;          if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
3985        break;        break;
3986    
3987        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
3988        for (i = 1; i <= min; i++)        for (i = 1; i <= min; i++)
3989          if ((pcre_ctypes[*eptr++] & ctype_space) != 0) return FALSE;          if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
3990        break;        break;
3991    
3992        case OP_WHITESPACE:        case OP_WHITESPACE:
3993        for (i = 1; i <= min; i++)        for (i = 1; i <= min; i++)
3994          if ((pcre_ctypes[*eptr++] & ctype_space) == 0) return FALSE;          if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
3995        break;        break;
3996    
3997        case OP_NOT_WORDCHAR:        case OP_NOT_WORDCHAR:
3998        for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)        for (i = 1; i <= min; i++)
3999          return FALSE;          if ((md->ctypes[*eptr++] & ctype_word) != 0)
4000              return FALSE;
4001        break;        break;
4002    
4003        case OP_WORDCHAR:        case OP_WORDCHAR:
4004        for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)        for (i = 1; i <= min; i++)
4005          return FALSE;          if ((md->ctypes[*eptr++] & ctype_word) == 0)
4006              return FALSE;
4007        break;        break;
4008        }        }
4009    
# Line 3786  for (;;) Line 4012  for (;;)
4012      if (min == max) continue;      if (min == max) continue;
4013    
4014      /* If minimizing, we have to test the rest of the pattern before each      /* If minimizing, we have to test the rest of the pattern before each
4015      subsequent match, so inlining isn't much help; just use the function. */      subsequent match. */
4016    
4017      if (minimize)      if (minimize)
4018        {        {
4019        for (i = min;; i++)        for (i = min;; i++)
4020          {          {
4021          if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE;          if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE;
4022          if (i >= max || eptr >= md->end_subject ||          if (i >= max || eptr >= md->end_subject) return FALSE;
4023            !match_type(ctype, *eptr++, (ims & PCRE_DOTALL) != 0))  
4024              return FALSE;          c = *eptr++;
4025            switch(ctype)
4026              {
4027              case OP_ANY:
4028              if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;
4029              break;
4030    
4031              case OP_NOT_DIGIT:
4032              if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
4033              break;
4034    
4035              case OP_DIGIT:
4036              if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
4037              break;
4038    
4039              case OP_NOT_WHITESPACE:
4040              if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
4041              break;
4042    
4043              case OP_WHITESPACE:
4044              if  ((md->ctypes[c] & ctype_space) == 0) return FALSE;
4045              break;
4046    
4047              case OP_NOT_WORDCHAR:
4048              if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
4049              break;
4050    
4051              case OP_WORDCHAR:
4052              if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
4053              break;
4054              }
4055          }          }
4056        /* Control never gets here */        /* Control never gets here */
4057        }        }
# Line 3828  for (;;) Line 4084  for (;;)
4084          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
4085          for (i = min; i < max; i++)          for (i = min; i < max; i++)
4086            {            {
4087            if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)            if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4088              break;              break;
4089            eptr++;            eptr++;
4090            }            }
# Line 3837  for (;;) Line 4093  for (;;)
4093          case OP_DIGIT:          case OP_DIGIT:
4094          for (i = min; i < max; i++)          for (i = min; i < max; i++)
4095            {            {
4096            if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)            if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4097              break;              break;
4098            eptr++;            eptr++;
4099            }            }
# Line 3846  for (;;) Line 4102  for (;;)
4102          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
4103          for (i = min; i < max; i++)          for (i = min; i < max; i++)
4104            {            {
4105            if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)            if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4106              break;              break;
4107            eptr++;            eptr++;
4108            }            }
# Line 3855  for (;;) Line 4111  for (;;)
4111          case OP_WHITESPACE:          case OP_WHITESPACE:
4112          for (i = min; i < max; i++)          for (i = min; i < max; i++)
4113            {            {
4114            if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)            if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4115              break;              break;
4116            eptr++;            eptr++;
4117            }            }
# Line 3864  for (;;) Line 4120  for (;;)
4120          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
4121          for (i = min; i < max; i++)          for (i = min; i < max; i++)
4122            {            {
4123            if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)            if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4124              break;              break;
4125            eptr++;            eptr++;
4126            }            }
# Line 3873  for (;;) Line 4129  for (;;)
4129          case OP_WORDCHAR:          case OP_WORDCHAR:
4130          for (i = min; i < max; i++)          for (i = min; i < max; i++)
4131            {            {
4132            if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)            if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4133              break;              break;
4134            eptr++;            eptr++;
4135            }            }
# Line 3919  Arguments: Line 4175  Arguments:
4175    external_extra  points to "hints" from pcre_study() or is NULL    external_extra  points to "hints" from pcre_study() or is NULL
4176    subject         points to the subject string    subject         points to the subject string
4177    length          length of subject string (may contain binary zeros)    length          length of subject string (may contain binary zeros)
4178      start_offset    where to start in the subject string
4179    options         option bits    options         option bits
4180    offsets         points to a vector of ints to be filled in with offsets    offsets         points to a vector of ints to be filled in with offsets
4181    offsetcount     the number of elements in the vector    offsetcount     the number of elements in the vector
# Line 3931  Returns:          > 0 => success; value Line 4188  Returns:          > 0 => success; value
4188    
4189  int  int
4190  pcre_exec(const pcre *external_re, const pcre_extra *external_extra,  pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4191    const char *subject, int length, int options, int *offsets, int offsetcount)    const char *subject, int length, int start_offset, int options, int *offsets,
4192      int offsetcount)
4193  {  {
4194  int resetcount, ocount;  int resetcount, ocount;
4195  int first_char = -1;  int first_char = -1;
4196  int ims = 0;  int req_char = -1;
4197    int req_char2 = -1;
4198    unsigned long int ims = 0;
4199  match_data match_block;  match_data match_block;
4200  const uschar *start_bits = NULL;  const uschar *start_bits = NULL;
4201  const uschar *start_match = (const uschar *)subject;  const uschar *start_match = (const uschar *)subject + start_offset;
4202  const uschar *end_subject;  const uschar *end_subject;
4203    const uschar *req_char_ptr = start_match - 1;
4204  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
4205  const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;  const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4206  BOOL using_temporary_offsets = FALSE;  BOOL using_temporary_offsets = FALSE;
# Line 3960  match_block.endonly = (re->options & PCR Line 4221  match_block.endonly = (re->options & PCR
4221    
4222  match_block.notbol = (options & PCRE_NOTBOL) != 0;  match_block.notbol = (options & PCRE_NOTBOL) != 0;
4223  match_block.noteol = (options & PCRE_NOTEOL) != 0;  match_block.noteol = (options & PCRE_NOTEOL) != 0;
4224    match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
4225    
4226  match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */  match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */
4227    
4228    match_block.lcc = re->tables + lcc_offset;
4229    match_block.ctypes = re->tables + ctypes_offset;
4230    
4231  /* The ims options can vary during the matching as a result of the presence  /* The ims options can vary during the matching as a result of the presence
4232  of (?ims) items in the pattern. They are kept in a local variable so that  of (?ims) items in the pattern. They are kept in a local variable so that
4233  restoring at the exit of a group is easy. */  restoring at the exit of a group is easy. */
# Line 3997  in the pattern. */ Line 4262  in the pattern. */
4262  resetcount = 2 + re->top_bracket * 2;  resetcount = 2 + re->top_bracket * 2;
4263  if (resetcount > offsetcount) resetcount = ocount;  if (resetcount > offsetcount) resetcount = ocount;
4264    
4265    /* Reset the working variable associated with each extraction. These should
4266    never be used unless previously set, but they get saved and restored, and so we
4267    initialize them to avoid reading uninitialized locations. */
4268    
4269    if (match_block.offset_vector != NULL)
4270      {
4271      register int *iptr = match_block.offset_vector + ocount;
4272      register int *iend = iptr - resetcount/2 + 1;
4273      while (--iptr >= iend) *iptr = -1;
4274      }
4275    
4276  /* Set up the first character to match, if available. The first_char value is  /* Set up the first character to match, if available. The first_char value is
4277  never set for an anchored regular expression, but the anchoring may be forced  never set for an anchored regular expression, but the anchoring may be forced
4278  at run time, so we have to test for anchoring. The first char may be unset for  at run time, so we have to test for anchoring. The first char may be unset for
# Line 4008  if (!anchored) Line 4284  if (!anchored)
4284    if ((re->options & PCRE_FIRSTSET) != 0)    if ((re->options & PCRE_FIRSTSET) != 0)
4285      {      {
4286      first_char = re->first_char;      first_char = re->first_char;
4287      if ((ims & PCRE_CASELESS) != 0) first_char = pcre_lcc[first_char];      if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
4288      }      }
4289    else    else
4290      if (!startline && extra != NULL &&      if (!startline && extra != NULL &&
# Line 4016  if (!anchored) Line 4292  if (!anchored)
4292          start_bits = extra->start_bits;          start_bits = extra->start_bits;
4293    }    }
4294    
4295  /* Loop for unanchored matches; for anchored regexps the loop runs just once. */  /* For anchored or unanchored matches, there may be a "last known required
4296    character" set. If the PCRE_CASELESS option is set, implying that the match starts
4297    caselessly, or if there are any changes of this flag within the regex, set up
4298    both cases of the character. Otherwise set the two values the same, which will
4299    avoid duplicate testing (which takes significant time). This covers the vast
4300    majority of cases. It will be suboptimal when the case flag changes in a regex
4301    and the required character in fact is caseful. */
4302    
4303    if ((re->options & PCRE_REQCHSET) != 0)
4304      {
4305      req_char = re->req_char;
4306      req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
4307        (re->tables + fcc_offset)[req_char] : req_char;
4308      }
4309    
4310    /* Loop for handling unanchored repeated matching attempts; for anchored regexps
4311    the loop runs just once. */
4312    
4313  do  do
4314    {    {
# Line 4033  do Line 4325  do
4325    if (first_char >= 0)    if (first_char >= 0)
4326      {      {
4327      if ((ims & PCRE_CASELESS) != 0)      if ((ims & PCRE_CASELESS) != 0)
4328        while (start_match < end_subject && pcre_lcc[*start_match] != first_char)        while (start_match < end_subject &&
4329                 match_block.lcc[*start_match] != first_char)
4330          start_match++;          start_match++;
4331      else      else
4332        while (start_match < end_subject && *start_match != first_char)        while (start_match < end_subject && *start_match != first_char)
# Line 4051  do Line 4344  do
4344        }        }
4345      }      }
4346    
4347    /* Or to a non-unique first char */    /* Or to a non-unique first char after study */
4348    
4349    else if (start_bits != NULL)    else if (start_bits != NULL)
4350      {      {
# Line 4068  do Line 4361  do
4361    printf("\n");    printf("\n");
4362  #endif  #endif
4363    
4364      /* If req_char is set, we know that that character must appear in the subject
4365      for the match to succeed. If the first character is set, req_char must be
4366      later in the subject; otherwise the test starts at the match point. This
4367      optimization can save a huge amount of backtracking in patterns with nested
4368      unlimited repeats that aren't going to match. We don't know what the state of
4369      case matching may be when this character is hit, so test for it in both its
4370      cases if necessary. However, the different cased versions will not be set up
4371      unless PCRE_CASELESS was given or the casing state changes within the regex.
4372      Writing separate code makes it go faster, as does using an autoincrement and
4373      backing off on a match. */
4374    
4375      if (req_char >= 0)
4376        {
4377        register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
4378    
4379        /* We don't need to repeat the search if we haven't yet reached the
4380        place we found it at last time. */
4381    
4382        if (p > req_char_ptr)
4383          {
4384          /* Do a single test if no case difference is set up */
4385    
4386          if (req_char == req_char2)
4387            {
4388            while (p < end_subject)
4389              {
4390              if (*p++ == req_char) { p--; break; }
4391              }
4392            }
4393    
4394          /* Otherwise test for either case */
4395    
4396          else
4397            {
4398            while (p < end_subject)
4399              {
4400              register int pp = *p++;
4401              if (pp == req_char || pp == req_char2) { p--; break; }
4402              }
4403            }
4404    
4405          /* If we can't find the required character, break the matching loop */
4406    
4407          if (p >= end_subject) break;
4408    
4409          /* If we have found the required character, save the point where we
4410          found it, so that we don't search again next time round the loop if
4411          the start hasn't passed this character yet. */
4412    
4413          req_char_ptr = p;
4414          }
4415        }
4416    
4417    /* When a match occurs, substrings will be set for all internal extractions;    /* When a match occurs, substrings will be set for all internal extractions;
4418    we just need to set up the whole thing as substring 0 before returning. If    we just need to set up the whole thing as substring 0 before returning. If
4419    there were too many extractions, set the return code to zero. In the case    there were too many extractions, set the return code to zero. In the case
# Line 4075  do Line 4421  do
4421    those back references that we can. In this case there need not be overflow    those back references that we can. In this case there need not be overflow
4422    if certain parts of the pattern were not used. */    if certain parts of the pattern were not used. */
4423    
4424      match_block.start_match = start_match;
4425    if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))    if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))
4426      continue;      continue;
4427    
# Line 4106  do Line 4453  do
4453    DPRINTF((">>>> returning %d\n", rc));    DPRINTF((">>>> returning %d\n", rc));
4454    return rc;    return rc;
4455    }    }
4456    
4457    /* This "while" is the end of the "do" above */
4458    
4459  while (!anchored &&  while (!anchored &&
4460         match_block.errorcode == PCRE_ERROR_NOMATCH &&         match_block.errorcode == PCRE_ERROR_NOMATCH &&
4461         start_match++ < end_subject);         start_match++ < end_subject);

Legend:
Removed from v.23  
changed lines
  Added in v.37

  ViewVC Help
Powered by ViewVC 1.1.5