/[pcre]/code/trunk/pcre.c
ViewVC logotype

Diff of /code/trunk/pcre.c

Parent Directory | Revision Log | View Patch

revision 35 by nigel, Sat Feb 24 21:39:05 2007 UTC revision 47 by nigel, Sat Feb 24 21:39:29 2007 UTC
# Line 9  the file Tech.Notes for some information Line 9  the file Tech.Notes for some information
9    
10  Written by: Philip Hazel <ph10@cam.ac.uk>  Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12             Copyright (c) 1997-1999 University of Cambridge             Copyright (c) 1997-2000 University of Cambridge
13    
14  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
15  Permission is granted to anyone to use this software for any purpose on any  Permission is granted to anyone to use this software for any purpose on any
# Line 82  static const char *OP_names[] = { Line 82  static const char *OP_names[] = {
82    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
83    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
84    "*", "*?", "+", "+?", "?", "??", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{",
85    "class", "Ref",    "class", "Ref", "Recurse",
86    "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",    "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
87    "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",    "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
88    "Brazero", "Braminzero", "Bra"    "Brazero", "Braminzero", "Bra"
# Line 107  static const short int escapes[] = { Line 107  static const short int escapes[] = {
107      0,      0, -ESC_z                                            /* x - z */      0,      0, -ESC_z                                            /* x - z */
108  };  };
109    
110    /* Tables of names of POSIX character classes and their lengths. The list is
111    terminated by a zero length entry. The first three must be alpha, lower, upper,
112    as this is assumed for handling case independence. */
113    
114    static const char *posix_names[] = {
115      "alpha", "lower", "upper",
116      "alnum", "ascii", "cntrl", "digit", "graph",
117      "print", "punct", "space", "word",  "xdigit" };
118    
119    static const uschar posix_name_lengths[] = {
120      5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
121    
122    /* Table of class bit maps for each POSIX class; up to three may be combined
123    to form the class. */
124    
125    static const int posix_class_maps[] = {
126      cbit_lower, cbit_upper, -1,             /* alpha */
127      cbit_lower, -1,         -1,             /* lower */
128      cbit_upper, -1,         -1,             /* upper */
129      cbit_digit, cbit_lower, cbit_upper,     /* alnum */
130      cbit_print, cbit_cntrl, -1,             /* ascii */
131      cbit_cntrl, -1,         -1,             /* cntrl */
132      cbit_digit, -1,         -1,             /* digit */
133      cbit_graph, -1,         -1,             /* graph */
134      cbit_print, -1,         -1,             /* print */
135      cbit_punct, -1,         -1,             /* punct */
136      cbit_space, -1,         -1,             /* space */
137      cbit_word,  -1,         -1,             /* word */
138      cbit_xdigit,-1,         -1              /* xdigit */
139    };
140    
141    
142  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
143    
144  static BOOL  static BOOL
145    compile_regex(int, int, int *, uschar **, const uschar **, const char **,    compile_regex(int, int, int *, uschar **, const uschar **, const char **,
146      BOOL, int, compile_data *);      BOOL, int, int *, int *, compile_data *);
147    
148    /* Structure for building a chain of data that actually lives on the
149    stack, for holding the values of the subject pointer at the start of each
150    subpattern, so as to detect when an empty string has been matched by a
151    subpattern - to break infinite loops. */
152    
153    typedef struct eptrblock {
154      struct eptrblock *prev;
155      const uschar *saved_eptr;
156    } eptrblock;
157    
158    /* Flag bits for the match() function */
159    
160    #define match_condassert   0x01    /* Called to check a condition assertion */
161    #define match_isgroup      0x02    /* Set if start of bracketed group */
162    
163    
164    
# Line 148  tables. */ Line 195  tables. */
195  *          Return version string                 *  *          Return version string                 *
196  *************************************************/  *************************************************/
197    
198    #define STRING(a)  # a
199    #define XSTRING(s) STRING(s)
200    
201  const char *  const char *
202  pcre_version(void)  pcre_version(void)
203  {  {
204  return PCRE_VERSION;  return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
205  }  }
206    
207    
208    
209    
210  /*************************************************  /*************************************************
211  *       Return info about a compiled pattern     *  * (Obsolete) Return info about compiled pattern  *
212  *************************************************/  *************************************************/
213    
214  /* This function picks potentially useful data out of the private  /* This is the original "info" function. It picks potentially useful data out
215  structure.  of the private structure, but its interface was too rigid. It remains for
216    backwards compatibility. The public options are passed back in an int - though
217    the re->options field has been expanded to a long int, all the public options
218    are at the low end of it, and so even on 16-bit systems this will still be OK.
219    Therefore, I haven't changed the API for pcre_info().
220    
221  Arguments:  Arguments:
222    external_re   points to compiled code    external_re   points to compiled code
# Line 171  Arguments: Line 225  Arguments:
225                  or -1 if multiline and all branches start ^,                  or -1 if multiline and all branches start ^,
226                  or -2 otherwise                  or -2 otherwise
227    
228  Returns:        number of identifying extraction brackets  Returns:        number of capturing subpatterns
229                  or negative values on error                  or negative values on error
230  */  */
231    
# Line 181  pcre_info(const pcre *external_re, int * Line 235  pcre_info(const pcre *external_re, int *
235  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
236  if (re == NULL) return PCRE_ERROR_NULL;  if (re == NULL) return PCRE_ERROR_NULL;
237  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
238  if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);  if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
239  if (first_char != NULL)  if (first_char != NULL)
240    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
241       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
# Line 190  return re->top_bracket; Line 244  return re->top_bracket;
244    
245    
246    
247    /*************************************************
248    *        Return info about compiled pattern      *
249    *************************************************/
250    
251    /* This is a newer "info" function which has an extensible interface so
252    that additional items can be added compatibly.
253    
254    Arguments:
255      external_re      points to compiled code
256      external_study   points to study data, or NULL
257      what             what information is required
258      where            where to put the information
259    
260    Returns:           0 if data returned, negative on error
261    */
262    
263    int
264    pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
265      void *where)
266    {
267    const real_pcre *re = (const real_pcre *)external_re;
268    const real_pcre_extra *study = (const real_pcre_extra *)study_data;
269    
270    if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
271    if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
272    
273    switch (what)
274      {
275      case PCRE_INFO_OPTIONS:
276      *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
277      break;
278    
279      case PCRE_INFO_SIZE:
280      *((size_t *)where) = re->size;
281      break;
282    
283      case PCRE_INFO_CAPTURECOUNT:
284      *((int *)where) = re->top_bracket;
285      break;
286    
287      case PCRE_INFO_BACKREFMAX:
288      *((int *)where) = re->top_backref;
289      break;
290    
291      case PCRE_INFO_FIRSTCHAR:
292      *((int *)where) =
293        ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
294        ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
295      break;
296    
297      case PCRE_INFO_FIRSTTABLE:
298      *((const uschar **)where) =
299        (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
300          study->start_bits : NULL;
301      break;
302    
303      case PCRE_INFO_LASTLITERAL:
304      *((int *)where) =
305        ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
306      break;
307    
308      default: return PCRE_ERROR_BADOPTION;
309      }
310    
311    return 0;
312    }
313    
314    
315    
316  #ifdef DEBUG  #ifdef DEBUG
317  /*************************************************  /*************************************************
# Line 249  check_escape(const uschar **ptrptr, cons Line 371  check_escape(const uschar **ptrptr, cons
371    int options, BOOL isclass, compile_data *cd)    int options, BOOL isclass, compile_data *cd)
372  {  {
373  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
374  int c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */  int c, i;
 int i;  
375    
376    c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */
377  if (c == 0) *errorptr = ERR1;  if (c == 0) *errorptr = ERR1;
378    
379  /* Digits or letters may have special meaning; all others are literals. */  /* Digits or letters may have special meaning; all others are literals. */
# Line 532  for (;;) Line 654  for (;;)
654    
655      case OP_REVERSE:      case OP_REVERSE:
656      cc++;      cc++;
657        /* Fall through */
658    
659      case OP_CREF:      case OP_CREF:
660      case OP_OPT:      case OP_OPT:
# Line 615  for (;;) Line 738  for (;;)
738    
739    
740  /*************************************************  /*************************************************
741    *           Check for POSIX class syntax         *
742    *************************************************/
743    
744    /* This function is called when the sequence "[:" or "[." or "[=" is
745    encountered in a character class. It checks whether this is followed by an
746    optional ^ and then a sequence of letters, terminated by a matching ":]" or
747    ".]" or "=]".
748    
749    Arguments:
750      ptr      pointer to the initial [
751      endptr   where to return the end pointer
752      cd       pointer to compile data
753    
754    Returns:   TRUE or FALSE
755    */
756    
757    static BOOL
758    check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
759    {
760    int terminator;          /* Don't combine these lines; the Solaris cc */
761    terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
762    if (*(++ptr) == '^') ptr++;
763    while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
764    if (*ptr == terminator && ptr[1] == ']')
765      {
766      *endptr = ptr;
767      return TRUE;
768      }
769    return FALSE;
770    }
771    
772    
773    
774    
775    /*************************************************
776    *          Check POSIX class name                *
777    *************************************************/
778    
779    /* This function is called to check the name given in a POSIX-style class entry
780    such as [:alnum:].
781    
782    Arguments:
783      ptr        points to the first letter
784      len        the length of the name
785    
786    Returns:     a value representing the name, or -1 if unknown
787    */
788    
789    static int
790    check_posix_name(const uschar *ptr, int len)
791    {
792    register int yield = 0;
793    while (posix_name_lengths[yield] != 0)
794      {
795      if (len == posix_name_lengths[yield] &&
796        strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
797      yield++;
798      }
799    return -1;
800    }
801    
802    
803    
804    
805    /*************************************************
806  *           Compile one branch                   *  *           Compile one branch                   *
807  *************************************************/  *************************************************/
808    
# Line 627  Arguments: Line 815  Arguments:
815    ptrptr       points to the current pattern pointer    ptrptr       points to the current pattern pointer
816    errorptr     points to pointer to error message    errorptr     points to pointer to error message
817    optchanged   set to the value of the last OP_OPT item compiled    optchanged   set to the value of the last OP_OPT item compiled
818      reqchar      set to the last literal character required, else -1
819      countlits    set to count of mandatory literal characters
820    cd           contains pointers to tables    cd           contains pointers to tables
821    
822  Returns:       TRUE on success  Returns:       TRUE on success
# Line 636  Returns:       TRUE on success Line 826  Returns:       TRUE on success
826  static BOOL  static BOOL
827  compile_branch(int options, int *brackets, uschar **codeptr,  compile_branch(int options, int *brackets, uschar **codeptr,
828    const uschar **ptrptr, const char **errorptr, int *optchanged,    const uschar **ptrptr, const char **errorptr, int *optchanged,
829    compile_data *cd)    int *reqchar, int *countlits, compile_data *cd)
830  {  {
831  int repeat_type, op_type;  int repeat_type, op_type;
832  int repeat_min, repeat_max;  int repeat_min, repeat_max;
833  int bravalue, length;  int bravalue, length;
834  int greedy_default, greedy_non_default;  int greedy_default, greedy_non_default;
835    int prevreqchar;
836    int condcount = 0;
837    int subcountlits = 0;
838  register int c;  register int c;
839  register uschar *code = *codeptr;  register uschar *code = *codeptr;
840  uschar *tempcode;  uschar *tempcode;
# Line 655  uschar class[32]; Line 848  uschar class[32];
848  greedy_default = ((options & PCRE_UNGREEDY) != 0);  greedy_default = ((options & PCRE_UNGREEDY) != 0);
849  greedy_non_default = greedy_default ^ 1;  greedy_non_default = greedy_default ^ 1;
850    
851    /* Initialize no required char, and count of literals */
852    
853    *reqchar = prevreqchar = -1;
854    *countlits = 0;
855    
856  /* Switch on next character until the end of the branch */  /* Switch on next character until the end of the branch */
857    
858  for (;; ptr++)  for (;; ptr++)
# Line 664  for (;; ptr++) Line 862  for (;; ptr++)
862    int class_lastchar;    int class_lastchar;
863    int newoptions;    int newoptions;
864    int condref;    int condref;
865      int subreqchar;
866    
867    c = *ptr;    c = *ptr;
868    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
# Line 671  for (;; ptr++) Line 870  for (;; ptr++)
870      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
871      if (c == '#')      if (c == '#')
872        {        {
873        while ((c = *(++ptr)) != 0 && c != '\n');        /* The space before the ; is to avoid a warning on a silly compiler
874          on the Macintosh. */
875          while ((c = *(++ptr)) != 0 && c != '\n') ;
876        continue;        continue;
877        }        }
878      }      }
# Line 746  for (;; ptr++) Line 947  for (;; ptr++)
947          goto FAILED;          goto FAILED;
948          }          }
949    
950          /* Handle POSIX class names. Perl allows a negation extension of the
951          form [:^name:]. A square bracket that doesn't match the syntax is
952          treated as a literal. We also recognize the POSIX constructions
953          [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
954          5.6 does. */
955    
956          if (c == '[' &&
957              (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
958              check_posix_syntax(ptr, &tempptr, cd))
959            {
960            BOOL local_negate = FALSE;
961            int posix_class, i;
962            register const uschar *cbits = cd->cbits;
963    
964            if (ptr[1] != ':')
965              {
966              *errorptr = ERR31;
967              goto FAILED;
968              }
969    
970            ptr += 2;
971            if (*ptr == '^')
972              {
973              local_negate = TRUE;
974              ptr++;
975              }
976    
977            posix_class = check_posix_name(ptr, tempptr - ptr);
978            if (posix_class < 0)
979              {
980              *errorptr = ERR30;
981              goto FAILED;
982              }
983    
984            /* If matching is caseless, upper and lower are converted to
985            alpha. This relies on the fact that the class table starts with
986            alpha, lower, upper as the first 3 entries. */
987    
988            if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
989              posix_class = 0;
990    
991            /* Or into the map we are building up to 3 of the static class
992            tables, or their negations. */
993    
994            posix_class *= 3;
995            for (i = 0; i < 3; i++)
996              {
997              int taboffset = posix_class_maps[posix_class + i];
998              if (taboffset < 0) break;
999              if (local_negate)
1000                for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1001              else
1002                for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1003              }
1004    
1005            ptr = tempptr + 1;
1006            class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
1007            continue;
1008            }
1009    
1010        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
1011        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. Escaped items are checked for
1012        validity in the pre-compiling pass. The sequence \b is a special case.        validity in the pre-compiling pass. The sequence \b is a special case.
# Line 773  for (;; ptr++) Line 1034  for (;; ptr++)
1034              continue;              continue;
1035    
1036              case ESC_w:              case ESC_w:
1037              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
               class[c] |= (cbits[c+cbit_digit] | cbits[c+cbit_word]);  
1038              continue;              continue;
1039    
1040              case ESC_W:              case ESC_W:
1041              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
               class[c] |= ~(cbits[c+cbit_digit] | cbits[c+cbit_word]);  
1042              continue;              continue;
1043    
1044              case ESC_s:              case ESC_s:
# Line 937  for (;; ptr++) Line 1196  for (;; ptr++)
1196        { repeat_type = greedy_non_default; ptr++; }        { repeat_type = greedy_non_default; ptr++; }
1197      else repeat_type = greedy_default;      else repeat_type = greedy_default;
1198    
     /* If the maximum is zero then the minimum must also be zero; Perl allows  
     this case, so we do too - by simply omitting the item altogether. */  
   
     if (repeat_max == 0) code = previous;  
   
1199      /* If previous was a string of characters, chop off the last one and use it      /* If previous was a string of characters, chop off the last one and use it
1200      as the subject of the repeat. If there was only one character, we can      as the subject of the repeat. If there was only one character, we can
1201      abolish the previous item altogether. */      abolish the previous item altogether. A repeat with a zero minimum wipes
1202        out any reqchar setting, backing up to the previous value. We must also
1203        adjust the countlits value. */
1204    
1205      else if (*previous == OP_CHARS)      if (*previous == OP_CHARS)
1206        {        {
1207        int len = previous[1];        int len = previous[1];
1208    
1209          if (repeat_min == 0) *reqchar = prevreqchar;
1210          *countlits += repeat_min - 1;
1211    
1212        if (len == 1)        if (len == 1)
1213          {          {
1214          c = previous[2];          c = previous[2];
# Line 987  for (;; ptr++) Line 1247  for (;; ptr++)
1247        code = previous;        code = previous;
1248    
1249        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
1250        repeat_type += op_type;      /* Combine both values for many cases */  
1251          /* If the maximum is zero then the minimum must also be zero; Perl allows
1252          this case, so we do too - by simply omitting the item altogether. */
1253    
1254          if (repeat_max == 0) goto END_REPEAT;
1255    
1256          /* Combine the op_type with the repeat_type */
1257    
1258          repeat_type += op_type;
1259    
1260        /* A minimum of zero is handled either as the special case * or ?, or as        /* A minimum of zero is handled either as the special case * or ?, or as
1261        an UPTO, with the maximum given. */        an UPTO, with the maximum given. */
# Line 1064  for (;; ptr++) Line 1332  for (;; ptr++)
1332        }        }
1333    
1334      /* If previous was a character class or a back reference, we put the repeat      /* If previous was a character class or a back reference, we put the repeat
1335      stuff after it. */      stuff after it, but just skip the item if the repeat was {0,0}. */
1336    
1337      else if (*previous == OP_CLASS || *previous == OP_REF)      else if (*previous == OP_CLASS || *previous == OP_REF)
1338        {        {
1339          if (repeat_max == 0)
1340            {
1341            code = previous;
1342            goto END_REPEAT;
1343            }
1344        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
1345          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
1346        else if (repeat_min == 1 && repeat_max == -1)        else if (repeat_min == 1 && repeat_max == -1)
# Line 1118  for (;; ptr++) Line 1391  for (;; ptr++)
1391    
1392        if (repeat_min == 0)        if (repeat_min == 0)
1393          {          {
1394            /* If we set up a required char from the bracket, we must back off
1395            to the previous value and reset the countlits value too. */
1396    
1397            if (subcountlits > 0)
1398              {
1399              *reqchar = prevreqchar;
1400              *countlits -= subcountlits;
1401              }
1402    
1403          /* If the maximum is also zero, we just omit the group from the output          /* If the maximum is also zero, we just omit the group from the output
1404          altogether. */          altogether. */
1405    
1406          if (repeat_max == 0)          if (repeat_max == 0)
1407            {            {
1408            code = previous;            code = previous;
1409            previous = NULL;            goto END_REPEAT;
           break;  
1410            }            }
1411    
1412          /* If the maximum is 1 or unlimited, we just have to stick in the          /* If the maximum is 1 or unlimited, we just have to stick in the
# Line 1230  for (;; ptr++) Line 1511  for (;; ptr++)
1511        correct offset was computed above. */        correct offset was computed above. */
1512    
1513        else code[-ketoffset] = OP_KETRMAX + repeat_type;        else code[-ketoffset] = OP_KETRMAX + repeat_type;
   
   
 #ifdef NEVER  
       /* If the minimum is greater than zero, and the maximum is unlimited or  
       equal to the minimum, the first copy remains where it is, and is  
       replicated up to the minimum number of times. This case includes the +  
       repeat, but of course no replication is needed in that case. */  
   
       if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))  
         {  
         for (i = 1; i < repeat_min; i++)  
           {  
           memcpy(code, previous, len);  
           code += len;  
           }  
         }  
   
       /* If the minimum is zero, stick BRAZERO in front of the first copy.  
       Then, if there is a fixed upper limit, replicated up to that many times,  
       sticking BRAZERO in front of all the optional ones. */  
   
       else  
         {  
         if (repeat_min == 0)  
           {  
           memmove(previous+1, previous, len);  
           code++;  
           *previous++ = OP_BRAZERO + repeat_type;  
           }  
   
         for (i = 1; i < repeat_min; i++)  
           {  
           memcpy(code, previous, len);  
           code += len;  
           }  
   
         for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)  
           {  
           *code++ = OP_BRAZERO + repeat_type;  
           memcpy(code, previous, len);  
           code += len;  
           }  
         }  
   
       /* If the maximum is unlimited, set a repeater in the final copy. We  
       can't just offset backwards from the current code point, because we  
       don't know if there's been an options resetting after the ket. The  
       correct offset was computed above. */  
   
       if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type;  
 #endif  
   
   
1514        }        }
1515    
1516      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 1295  for (;; ptr++) Line 1523  for (;; ptr++)
1523    
1524      /* In all cases we no longer have a previous item. */      /* In all cases we no longer have a previous item. */
1525    
1526        END_REPEAT:
1527      previous = NULL;      previous = NULL;
1528      break;      break;
1529    
# Line 1372  for (;; ptr++) Line 1601  for (;; ptr++)
1601          ptr++;          ptr++;
1602          break;          break;
1603    
1604            case 'R':                 /* Pattern recursion */
1605            *code++ = OP_RECURSE;
1606            ptr++;
1607            continue;
1608    
1609          default:                  /* Option setting */          default:                  /* Option setting */
1610          set = unset = 0;          set = unset = 0;
1611          optset = &set;          optset = &set;
# Line 1463  for (;; ptr++) Line 1697  for (;; ptr++)
1697           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
1698            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1699           condref,                      /* Condition reference number */           condref,                      /* Condition reference number */
1700             &subreqchar,                  /* For possible last char */
1701             &subcountlits,                /* For literal count */
1702           cd))                          /* Tables block */           cd))                          /* Tables block */
1703        goto FAILED;        goto FAILED;
1704    
# Line 1476  for (;; ptr++) Line 1712  for (;; ptr++)
1712    
1713      if (bravalue == OP_COND)      if (bravalue == OP_COND)
1714        {        {
       int branchcount = 0;  
1715        uschar *tc = code;        uschar *tc = code;
1716          condcount = 0;
1717    
1718        do {        do {
1719           branchcount++;           condcount++;
1720           tc += (tc[1] << 8) | tc[2];           tc += (tc[1] << 8) | tc[2];
1721           }           }
1722        while (*tc != OP_KET);        while (*tc != OP_KET);
1723    
1724        if (branchcount > 2)        if (condcount > 2)
1725          {          {
1726          *errorptr = ERR27;          *errorptr = ERR27;
1727          goto FAILED;          goto FAILED;
1728          }          }
1729        }        }
1730    
1731        /* Handle updating of the required character. If the subpattern didn't
1732        set one, leave it as it was. Otherwise, update it for normal brackets of
1733        all kinds, forward assertions, and conditions with two branches. Don't
1734        update the literal count for forward assertions, however. If the bracket
1735        is followed by a quantifier with zero repeat, we have to back off. Hence
1736        the definition of prevreqchar and subcountlits outside the main loop so
1737        that they can be accessed for the back off. */
1738    
1739        if (subreqchar > 0 &&
1740             (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1741             (bravalue == OP_COND && condcount == 2)))
1742          {
1743          prevreqchar = *reqchar;
1744          *reqchar = subreqchar;
1745          if (bravalue != OP_ASSERT) *countlits += subcountlits;
1746          }
1747    
1748      /* Now update the main code pointer to the end of the group. */      /* Now update the main code pointer to the end of the group. */
1749    
1750      code = tempcode;      code = tempcode;
# Line 1559  for (;; ptr++) Line 1812  for (;; ptr++)
1812          if ((cd->ctypes[c] & ctype_space) != 0) continue;          if ((cd->ctypes[c] & ctype_space) != 0) continue;
1813          if (c == '#')          if (c == '#')
1814            {            {
1815            while ((c = *(++ptr)) != 0 && c != '\n');            /* The space before the ; is to avoid a warning on a silly compiler
1816              on the Macintosh. */
1817              while ((c = *(++ptr)) != 0 && c != '\n') ;
1818            if (c == 0) break;            if (c == 0) break;
1819            continue;            continue;
1820            }            }
# Line 1586  for (;; ptr++) Line 1841  for (;; ptr++)
1841    
1842      while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);      while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
1843    
1844        /* Update the last character and the count of literals */
1845    
1846        prevreqchar = (length > 1)? code[-2] : *reqchar;
1847        *reqchar = code[-1];
1848        *countlits += length;
1849    
1850      /* Compute the length and set it in the data vector, and advance to      /* Compute the length and set it in the data vector, and advance to
1851      the next state. */      the next state. */
1852    
# Line 1629  Argument: Line 1890  Argument:
1890    errorptr    -> pointer to error message    errorptr    -> pointer to error message
1891    lookbehind  TRUE if this is a lookbehind assertion    lookbehind  TRUE if this is a lookbehind assertion
1892    condref     > 0 for OPT_CREF setting at start of conditional group    condref     > 0 for OPT_CREF setting at start of conditional group
1893      reqchar     -> place to put the last required character, or a negative number
1894      countlits   -> place to put the shortest literal count of any branch
1895    cd          points to the data block with tables pointers    cd          points to the data block with tables pointers
1896    
1897  Returns:      TRUE on success  Returns:      TRUE on success
# Line 1637  Returns:      TRUE on success Line 1900  Returns:      TRUE on success
1900  static BOOL  static BOOL
1901  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
1902    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
1903    compile_data *cd)    int *reqchar, int *countlits, compile_data *cd)
1904  {  {
1905  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
1906  uschar *code = *codeptr;  uschar *code = *codeptr;
# Line 1645  uschar *last_branch = code; Line 1908  uschar *last_branch = code;
1908  uschar *start_bracket = code;  uschar *start_bracket = code;
1909  uschar *reverse_count = NULL;  uschar *reverse_count = NULL;
1910  int oldoptions = options & PCRE_IMS;  int oldoptions = options & PCRE_IMS;
1911    int branchreqchar, branchcountlits;
1912    
1913    *reqchar = -1;
1914    *countlits = INT_MAX;
1915  code += 3;  code += 3;
1916    
1917  /* At the start of a reference-based conditional group, insert the reference  /* At the start of a reference-based conditional group, insert the reference
# Line 1684  for (;;) Line 1950  for (;;)
1950    
1951    /* Now compile the branch */    /* Now compile the branch */
1952    
1953    if (!compile_branch(options,brackets,&code,&ptr,errorptr,&optchanged,cd))    if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
1954          &branchreqchar, &branchcountlits, cd))
1955      {      {
1956      *ptrptr = ptr;      *ptrptr = ptr;
1957      return FALSE;      return FALSE;
# Line 1696  for (;;) Line 1963  for (;;)
1963    last_branch[1] = length >> 8;    last_branch[1] = length >> 8;
1964    last_branch[2] = length & 255;    last_branch[2] = length & 255;
1965    
1966      /* Save the last required character if all branches have the same; a current
1967      value of -1 means unset, while -2 means "previous branch had no last required
1968      char".  */
1969    
1970      if (*reqchar != -2)
1971        {
1972        if (branchreqchar >= 0)
1973          {
1974          if (*reqchar == -1) *reqchar = branchreqchar;
1975          else if (*reqchar != branchreqchar) *reqchar = -2;
1976          }
1977        else *reqchar = -2;
1978        }
1979    
1980      /* Keep the shortest literal count */
1981    
1982      if (branchcountlits < *countlits) *countlits = branchcountlits;
1983      DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
1984    
1985    /* If lookbehind, check that this branch matches a fixed-length string,    /* If lookbehind, check that this branch matches a fixed-length string,
1986    and put the length into the OP_REVERSE item. Temporarily mark the end of    and put the length into the OP_REVERSE item. Temporarily mark the end of
1987    the branch with OP_END. */    the branch with OP_END. */
# Line 1977  pcre_compile(const char *pattern, int op Line 2263  pcre_compile(const char *pattern, int op
2263  real_pcre *re;  real_pcre *re;
2264  int length = 3;      /* For initial BRA plus length */  int length = 3;      /* For initial BRA plus length */
2265  int runlength;  int runlength;
2266  int c, size;  int c, reqchar, countlits;
2267  int bracount = 0;  int bracount = 0;
2268  int top_backref = 0;  int top_backref = 0;
2269  int branch_extra = 0;  int branch_extra = 0;
2270  int branch_newextra;  int branch_newextra;
2271  unsigned int brastackptr = 0;  unsigned int brastackptr = 0;
2272    size_t size;
2273  uschar *code;  uschar *code;
2274  const uschar *ptr;  const uschar *ptr;
2275  compile_data compile_block;  compile_data compile_block;
# Line 2045  while ((c = *(++ptr)) != 0) Line 2332  while ((c = *(++ptr)) != 0)
2332      if ((compile_block.ctypes[c] & ctype_space) != 0) continue;      if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2333      if (c == '#')      if (c == '#')
2334        {        {
2335        while ((c = *(++ptr)) != 0 && c != '\n');        /* The space before the ; is to avoid a warning on a silly compiler
2336          on the Macintosh. */
2337          while ((c = *(++ptr)) != 0 && c != '\n') ;
2338        continue;        continue;
2339        }        }
2340      }      }
# Line 2210  while ((c = *(++ptr)) != 0) Line 2499  while ((c = *(++ptr)) != 0)
2499          ptr += 2;          ptr += 2;
2500          break;          break;
2501    
2502            /* A recursive call to the regex is an extension, to provide the
2503            facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2504    
2505            case 'R':
2506            if (ptr[3] != ')')
2507              {
2508              *errorptr = ERR29;
2509              goto PCRE_ERROR_RETURN;
2510              }
2511            ptr += 3;
2512            length += 1;
2513            break;
2514    
2515          /* Lookbehinds are in Perl from version 5.005 */          /* Lookbehinds are in Perl from version 5.005 */
2516    
2517          case '<':          case '<':
# Line 2242  while ((c = *(++ptr)) != 0) Line 2544  while ((c = *(++ptr)) != 0)
2544          else   /* An assertion must follow */          else   /* An assertion must follow */
2545            {            {
2546            ptr++;   /* Can treat like ':' as far as spacing is concerned */            ptr++;   /* Can treat like ':' as far as spacing is concerned */
2547              if (ptr[2] != '?' ||
2548            if (ptr[2] != '?' || strchr("=!<", ptr[3]) == NULL)               (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
2549              {              {
2550              ptr += 2;    /* To get right offset in message */              ptr += 2;    /* To get right offset in message */
2551              *errorptr = ERR28;              *errorptr = ERR28;
# Line 2317  while ((c = *(++ptr)) != 0) Line 2619  while ((c = *(++ptr)) != 0)
2619              will lead to an over-estimate on the length, but this shouldn't              will lead to an over-estimate on the length, but this shouldn't
2620              matter very much. We also have to allow for resetting options at              matter very much. We also have to allow for resetting options at
2621              the start of any alternations, which we do by setting              the start of any alternations, which we do by setting
2622              branch_newextra to 2. */              branch_newextra to 2. Finally, we record whether the case-dependent
2623                flag ever changes within the regex. This is used by the "required
2624                character" code. */
2625    
2626              case ':':              case ':':
2627              if (((set|unset) & PCRE_IMS) != 0)              if (((set|unset) & PCRE_IMS) != 0)
2628                {                {
2629                length += 4;                length += 4;
2630                branch_newextra = 2;                branch_newextra = 2;
2631                  if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2632                }                }
2633              goto END_OPTIONS;              goto END_OPTIONS;
2634    
# Line 2453  while ((c = *(++ptr)) != 0) Line 2758  while ((c = *(++ptr)) != 0)
2758          if ((compile_block.ctypes[c] & ctype_space) != 0) continue;          if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2759          if (c == '#')          if (c == '#')
2760            {            {
2761            while ((c = *(++ptr)) != 0 && c != '\n');            /* The space before the ; is to avoid a warning on a silly compiler
2762              on the Macintosh. */
2763              while ((c = *(++ptr)) != 0 && c != '\n') ;
2764            continue;            continue;
2765            }            }
2766          }          }
# Line 2509  if (re == NULL) Line 2816  if (re == NULL)
2816    return NULL;    return NULL;
2817    }    }
2818    
2819  /* Put in the magic number and the options. */  /* Put in the magic number, and save the size, options, and table pointer */
2820    
2821  re->magic_number = MAGIC_NUMBER;  re->magic_number = MAGIC_NUMBER;
2822    re->size = size;
2823  re->options = options;  re->options = options;
2824  re->tables = tables;  re->tables = tables;
2825    
# Line 2524  code = re->code; Line 2832  code = re->code;
2832  *code = OP_BRA;  *code = OP_BRA;
2833  bracount = 0;  bracount = 0;
2834  (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,  (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
2835    &compile_block);    &reqchar, &countlits, &compile_block);
2836  re->top_bracket = bracount;  re->top_bracket = bracount;
2837  re->top_backref = top_backref;  re->top_backref = top_backref;
2838    
# Line 2584  if ((options & PCRE_ANCHORED) == 0) Line 2892  if ((options & PCRE_ANCHORED) == 0)
2892      }      }
2893    }    }
2894    
2895    /* Save the last required character if there are at least two literal
2896    characters on all paths, or if there is no first character setting. */
2897    
2898    if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
2899      {
2900      re->req_char = reqchar;
2901      re->options |= PCRE_REQCHSET;
2902      }
2903    
2904  /* Print out the compiled data for debugging */  /* Print out the compiled data for debugging */
2905    
2906  #ifdef DEBUG  #ifdef DEBUG
# Line 2593  printf("Length = %d top_bracket = %d top Line 2910  printf("Length = %d top_bracket = %d top
2910    
2911  if (re->options != 0)  if (re->options != 0)
2912    {    {
2913    printf("%s%s%s%s%s%s%s%s\n",    printf("%s%s%s%s%s%s%s%s%s\n",
2914      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2915      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2916        ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
2917      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2918      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2919      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
# Line 2610  if ((re->options & PCRE_FIRSTSET) != 0) Line 2928  if ((re->options & PCRE_FIRSTSET) != 0)
2928      else printf("First char = \\x%02x\n", re->first_char);      else printf("First char = \\x%02x\n", re->first_char);
2929    }    }
2930    
2931    if ((re->options & PCRE_REQCHSET) != 0)
2932      {
2933      if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
2934        else printf("Req char = \\x%02x\n", re->req_char);
2935      }
2936    
2937  code_end = code;  code_end = code;
2938  code_base = code = re->code;  code_base = code = re->code;
2939    
# Line 2843  Returns:      TRUE if matched Line 3167  Returns:      TRUE if matched
3167    
3168  static BOOL  static BOOL
3169  match_ref(int offset, register const uschar *eptr, int length, match_data *md,  match_ref(int offset, register const uschar *eptr, int length, match_data *md,
3170    int ims)    unsigned long int ims)
3171  {  {
3172  const uschar *p = md->start_subject + md->offset_vector[offset];  const uschar *p = md->start_subject + md->offset_vector[offset];
3173    
# Line 2894  Arguments: Line 3218  Arguments:
3218     offset_top  current top pointer     offset_top  current top pointer
3219     md          pointer to "static" info for the match     md          pointer to "static" info for the match
3220     ims         current /i, /m, and /s options     ims         current /i, /m, and /s options
3221     condassert  TRUE if called to check a condition assertion     eptrb       pointer to chain of blocks containing eptr at start of
3222     eptrb       eptr at start of last bracket                   brackets - for testing for empty matches
3223       flags       can contain
3224                     match_condassert - this is an assertion condition
3225                     match_isgroup - this is the start of a bracketed group
3226    
3227  Returns:       TRUE if matched  Returns:       TRUE if matched
3228  */  */
3229    
3230  static BOOL  static BOOL
3231  match(register const uschar *eptr, register const uschar *ecode,  match(register const uschar *eptr, register const uschar *ecode,
3232    int offset_top, match_data *md, int ims, BOOL condassert, const uschar *eptrb)    int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
3233      int flags)
3234  {  {
3235  int original_ims = ims;   /* Save for resetting on ')' */  unsigned long int original_ims = ims;   /* Save for resetting on ')' */
3236    eptrblock newptrb;
3237    
3238    /* At the start of a bracketed group, add the current subject pointer to the
3239    stack of such pointers, to be re-instated at the end of the group when we hit
3240    the closing ket. When match() is called in other circumstances, we don't add to
3241    the stack. */
3242    
3243    if ((flags & match_isgroup) != 0)
3244      {
3245      newptrb.prev = eptrb;
3246      newptrb.saved_eptr = eptr;
3247      eptrb = &newptrb;
3248      }
3249    
3250    /* Now start processing the operations. */
3251    
3252  for (;;)  for (;;)
3253    {    {
# Line 2950  for (;;) Line 3293  for (;;)
3293    
3294        do        do
3295          {          {
3296          if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;          if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3297              return TRUE;
3298          ecode += (ecode[1] << 8) + ecode[2];          ecode += (ecode[1] << 8) + ecode[2];
3299          }          }
3300        while (*ecode == OP_ALT);        while (*ecode == OP_ALT);
# Line 2976  for (;;) Line 3320  for (;;)
3320      DPRINTF(("start bracket 0\n"));      DPRINTF(("start bracket 0\n"));
3321      do      do
3322        {        {
3323        if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;        if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3324            return TRUE;
3325        ecode += (ecode[1] << 8) + ecode[2];        ecode += (ecode[1] << 8) + ecode[2];
3326        }        }
3327      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
# Line 2995  for (;;) Line 3340  for (;;)
3340        return match(eptr,        return match(eptr,
3341          ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?          ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3342            5 : 3 + (ecode[1] << 8) + ecode[2]),            5 : 3 + (ecode[1] << 8) + ecode[2]),
3343          offset_top, md, ims, FALSE, eptr);          offset_top, md, ims, eptrb, match_isgroup);
3344        }        }
3345    
3346      /* The condition is an assertion. Call match() to evaluate it - setting      /* The condition is an assertion. Call match() to evaluate it - setting
# Line 3003  for (;;) Line 3348  for (;;)
3348    
3349      else      else
3350        {        {
3351        if (match(eptr, ecode+3, offset_top, md, ims, TRUE, NULL))        if (match(eptr, ecode+3, offset_top, md, ims, NULL,
3352              match_condassert | match_isgroup))
3353          {          {
3354          ecode += 3 + (ecode[4] << 8) + ecode[5];          ecode += 3 + (ecode[4] << 8) + ecode[5];
3355          while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];          while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3356          }          }
3357        else ecode += (ecode[1] << 8) + ecode[2];        else ecode += (ecode[1] << 8) + ecode[2];
3358        return match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr);        return match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup);
3359        }        }
3360      /* Control never reaches here */      /* Control never reaches here */
3361    
# Line 3019  for (;;) Line 3365  for (;;)
3365      ecode += 2;      ecode += 2;
3366      break;      break;
3367    
3368      /* End of the pattern */      /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3369        an empty string - recursion will then try other alternatives, if any. */
3370    
3371      case OP_END:      case OP_END:
3372        if (md->notempty && eptr == md->start_match) return FALSE;
3373      md->end_match_ptr = eptr;          /* Record where we ended */      md->end_match_ptr = eptr;          /* Record where we ended */
3374      md->end_offset_top = offset_top;   /* and how many extracts were taken */      md->end_offset_top = offset_top;   /* and how many extracts were taken */
3375      return TRUE;      return TRUE;
# Line 3031  for (;;) Line 3379  for (;;)
3379      case OP_OPT:      case OP_OPT:
3380      ims = ecode[1];      ims = ecode[1];
3381      ecode += 2;      ecode += 2;
3382      DPRINTF(("ims set to %02x\n", ims));      DPRINTF(("ims set to %02lx\n", ims));
3383      break;      break;
3384    
3385      /* Assertion brackets. Check the alternative branches in turn - the      /* Assertion brackets. Check the alternative branches in turn - the
# Line 3044  for (;;) Line 3392  for (;;)
3392      case OP_ASSERTBACK:      case OP_ASSERTBACK:
3393      do      do
3394        {        {
3395        if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) break;        if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup)) break;
3396        ecode += (ecode[1] << 8) + ecode[2];        ecode += (ecode[1] << 8) + ecode[2];
3397        }        }
3398      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
# Line 3052  for (;;) Line 3400  for (;;)
3400    
3401      /* If checking an assertion for a condition, return TRUE. */      /* If checking an assertion for a condition, return TRUE. */
3402    
3403      if (condassert) return TRUE;      if ((flags & match_condassert) != 0) return TRUE;
3404    
3405      /* Continue from after the assertion, updating the offsets high water      /* Continue from after the assertion, updating the offsets high water
3406      mark, since extracts may have been taken during the assertion. */      mark, since extracts may have been taken during the assertion. */
# Line 3068  for (;;) Line 3416  for (;;)
3416      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
3417      do      do
3418        {        {
3419        if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) return FALSE;        if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup))
3420            return FALSE;
3421        ecode += (ecode[1] << 8) + ecode[2];        ecode += (ecode[1] << 8) + ecode[2];
3422        }        }
3423      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
3424    
3425      if (condassert) return TRUE;      if ((flags & match_condassert) != 0) return TRUE;
3426    
3427      ecode += 3;      ecode += 3;
3428      continue;      continue;
3429    
# Line 3087  for (;;) Line 3437  for (;;)
3437      ecode += 3;      ecode += 3;
3438      break;      break;
3439    
3440        /* Recursion matches the current regex, nested. If there are any capturing
3441        brackets started but not finished, we have to save their starting points
3442        and reinstate them after the recursion. However, we don't know how many
3443        such there are (offset_top records the completed total) so we just have
3444        to save all the potential data. There may be up to 99 such values, which
3445        is a bit large to put on the stack, but using malloc for small numbers
3446        seems expensive. As a compromise, the stack is used when there are fewer
3447        than 16 values to store; otherwise malloc is used. A problem is what to do
3448        if the malloc fails ... there is no way of returning to the top level with
3449        an error. Save the top 15 values on the stack, and accept that the rest
3450        may be wrong. */
3451    
3452        case OP_RECURSE:
3453          {
3454          BOOL rc;
3455          int *save;
3456          int stacksave[15];
3457    
3458          c = md->offset_max;
3459    
3460          if (c < 16) save = stacksave; else
3461            {
3462            save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3463            if (save == NULL)
3464              {
3465              save = stacksave;
3466              c = 15;
3467              }
3468            }
3469    
3470          for (i = 1; i <= c; i++)
3471            save[i] = md->offset_vector[md->offset_end - i];
3472          rc = match(eptr, md->start_pattern, offset_top, md, ims, eptrb,
3473            match_isgroup);
3474          for (i = 1; i <= c; i++)
3475            md->offset_vector[md->offset_end - i] = save[i];
3476          if (save != stacksave) (pcre_free)(save);
3477          if (!rc) return FALSE;
3478    
3479          /* In case the recursion has set more capturing values, save the final
3480          number, then move along the subject till after the recursive match,
3481          and advance one byte in the pattern code. */
3482    
3483          offset_top = md->end_offset_top;
3484          eptr = md->end_match_ptr;
3485          ecode++;
3486          }
3487        break;
3488    
3489      /* "Once" brackets are like assertion brackets except that after a match,      /* "Once" brackets are like assertion brackets except that after a match,
3490      the point in the subject string is not moved back. Thus there can never be      the point in the subject string is not moved back. Thus there can never be
# Line 3098  for (;;) Line 3496  for (;;)
3496      case OP_ONCE:      case OP_ONCE:
3497        {        {
3498        const uschar *prev = ecode;        const uschar *prev = ecode;
3499          const uschar *saved_eptr = eptr;
3500    
3501        do        do
3502          {          {
3503          if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) break;          if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3504              break;
3505          ecode += (ecode[1] << 8) + ecode[2];          ecode += (ecode[1] << 8) + ecode[2];
3506          }          }
3507        while (*ecode == OP_ALT);        while (*ecode == OP_ALT);
# Line 3124  for (;;) Line 3524  for (;;)
3524        5.005. If there is an options reset, it will get obeyed in the normal        5.005. If there is an options reset, it will get obeyed in the normal
3525        course of events. */        course of events. */
3526    
3527        if (*ecode == OP_KET || eptr == eptrb)        if (*ecode == OP_KET || eptr == saved_eptr)
3528          {          {
3529          ecode += 3;          ecode += 3;
3530          break;          break;
# Line 3138  for (;;) Line 3538  for (;;)
3538        if (ecode[3] == OP_OPT)        if (ecode[3] == OP_OPT)
3539          {          {
3540          ims = (ims & ~PCRE_IMS) | ecode[4];          ims = (ims & ~PCRE_IMS) | ecode[4];
3541          DPRINTF(("ims set to %02x at group repeat\n", ims));          DPRINTF(("ims set to %02lx at group repeat\n", ims));
3542          }          }
3543    
3544        if (*ecode == OP_KETRMIN)        if (*ecode == OP_KETRMIN)
3545          {          {
3546          if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||          if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3547              match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;              match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3548                  return TRUE;
3549          }          }
3550        else  /* OP_KETRMAX */        else  /* OP_KETRMAX */
3551          {          {
3552          if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||          if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3553              match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;              match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3554          }          }
3555        }        }
3556      return FALSE;      return FALSE;
# Line 3170  for (;;) Line 3571  for (;;)
3571      case OP_BRAZERO:      case OP_BRAZERO:
3572        {        {
3573        const uschar *next = ecode+1;        const uschar *next = ecode+1;
3574        if (match(eptr, next, offset_top, md, ims, FALSE, eptr)) return TRUE;        if (match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
3575            return TRUE;
3576        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3577        ecode = next + 3;        ecode = next + 3;
3578        }        }
# Line 3180  for (;;) Line 3582  for (;;)
3582        {        {
3583        const uschar *next = ecode+1;        const uschar *next = ecode+1;
3584        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3585        if (match(eptr, next+3, offset_top, md, ims, FALSE, eptr)) return TRUE;        if (match(eptr, next+3, offset_top, md, ims, eptrb, match_isgroup))
3586            return TRUE;
3587        ecode++;        ecode++;
3588        }        }
3589      break;      break;
# Line 3195  for (;;) Line 3598  for (;;)
3598      case OP_KETRMAX:      case OP_KETRMAX:
3599        {        {
3600        const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];        const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3601          const uschar *saved_eptr = eptrb->saved_eptr;
3602    
3603          eptrb = eptrb->prev;    /* Back up the stack of bracket start pointers */
3604    
3605        if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||        if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3606            *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||            *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
# Line 3214  for (;;) Line 3620  for (;;)
3620          int number = *prev - OP_BRA;          int number = *prev - OP_BRA;
3621          int offset = number << 1;          int offset = number << 1;
3622    
3623          DPRINTF(("end bracket %d\n", number));  #ifdef DEBUG
3624            printf("end bracket %d", number);
3625            printf("\n");
3626    #endif
3627    
3628          if (number > 0)          if (number > 0)
3629            {            {
# Line 3232  for (;;) Line 3641  for (;;)
3641        the group. */        the group. */
3642    
3643        ims = original_ims;        ims = original_ims;
3644        DPRINTF(("ims reset to %02x\n", ims));        DPRINTF(("ims reset to %02lx\n", ims));
3645    
3646        /* For a non-repeating ket, just continue at this level. This also        /* For a non-repeating ket, just continue at this level. This also
3647        happens for a repeating ket if no characters were matched in the group.        happens for a repeating ket if no characters were matched in the group.
# Line 3240  for (;;) Line 3649  for (;;)
3649        5.005. If there is an options reset, it will get obeyed in the normal        5.005. If there is an options reset, it will get obeyed in the normal
3650        course of events. */        course of events. */
3651    
3652        if (*ecode == OP_KET || eptr == eptrb)        if (*ecode == OP_KET || eptr == saved_eptr)
3653          {          {
3654          ecode += 3;          ecode += 3;
3655          break;          break;
# Line 3251  for (;;) Line 3660  for (;;)
3660    
3661        if (*ecode == OP_KETRMIN)        if (*ecode == OP_KETRMIN)
3662          {          {
3663          if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||          if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3664              match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;              match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3665                  return TRUE;
3666          }          }
3667        else  /* OP_KETRMAX */        else  /* OP_KETRMAX */
3668          {          {
3669          if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||          if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3670              match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;              match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3671          }          }
3672        }        }
3673      return FALSE;      return FALSE;
# Line 3468  for (;;) Line 3878  for (;;)
3878          {          {
3879          for (i = min;; i++)          for (i = min;; i++)
3880            {            {
3881            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
3882              return TRUE;              return TRUE;
3883            if (i >= max || !match_ref(offset, eptr, length, md, ims))            if (i >= max || !match_ref(offset, eptr, length, md, ims))
3884              return FALSE;              return FALSE;
# Line 3489  for (;;) Line 3899  for (;;)
3899            }            }
3900          while (eptr >= pp)          while (eptr >= pp)
3901            {            {
3902            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
3903              return TRUE;              return TRUE;
3904            eptr -= length;            eptr -= length;
3905            }            }
# Line 3560  for (;;) Line 3970  for (;;)
3970          {          {
3971          for (i = min;; i++)          for (i = min;; i++)
3972            {            {
3973            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
3974              return TRUE;              return TRUE;
3975            if (i >= max || eptr >= md->end_subject) return FALSE;            if (i >= max || eptr >= md->end_subject) return FALSE;
3976            c = *eptr++;            c = *eptr++;
# Line 3584  for (;;) Line 3994  for (;;)
3994            }            }
3995    
3996          while (eptr >= pp)          while (eptr >= pp)
3997            if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
3998              return TRUE;              return TRUE;
3999          return FALSE;          return FALSE;
4000          }          }
# Line 3681  for (;;) Line 4091  for (;;)
4091          {          {
4092          for (i = min;; i++)          for (i = min;; i++)
4093            {            {
4094            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4095              return TRUE;              return TRUE;
4096            if (i >= max || eptr >= md->end_subject ||            if (i >= max || eptr >= md->end_subject ||
4097                c != md->lcc[*eptr++])                c != md->lcc[*eptr++])
# Line 3698  for (;;) Line 4108  for (;;)
4108            eptr++;            eptr++;
4109            }            }
4110          while (eptr >= pp)          while (eptr >= pp)
4111            if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4112              return TRUE;              return TRUE;
4113          return FALSE;          return FALSE;
4114          }          }
# Line 3715  for (;;) Line 4125  for (;;)
4125          {          {
4126          for (i = min;; i++)          for (i = min;; i++)
4127            {            {
4128            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4129              return TRUE;              return TRUE;
4130            if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;            if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
4131            }            }
# Line 3730  for (;;) Line 4140  for (;;)
4140            eptr++;            eptr++;
4141            }            }
4142          while (eptr >= pp)          while (eptr >= pp)
4143           if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))           if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4144             return TRUE;             return TRUE;
4145          return FALSE;          return FALSE;
4146          }          }
# Line 3812  for (;;) Line 4222  for (;;)
4222          {          {
4223          for (i = min;; i++)          for (i = min;; i++)
4224            {            {
4225            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4226              return TRUE;              return TRUE;
4227            if (i >= max || eptr >= md->end_subject ||            if (i >= max || eptr >= md->end_subject ||
4228                c == md->lcc[*eptr++])                c == md->lcc[*eptr++])
# Line 3829  for (;;) Line 4239  for (;;)
4239            eptr++;            eptr++;
4240            }            }
4241          while (eptr >= pp)          while (eptr >= pp)
4242            if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4243              return TRUE;              return TRUE;
4244          return FALSE;          return FALSE;
4245          }          }
# Line 3846  for (;;) Line 4256  for (;;)
4256          {          {
4257          for (i = min;; i++)          for (i = min;; i++)
4258            {            {
4259            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4260              return TRUE;              return TRUE;
4261            if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;            if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
4262            }            }
# Line 3861  for (;;) Line 4271  for (;;)
4271            eptr++;            eptr++;
4272            }            }
4273          while (eptr >= pp)          while (eptr >= pp)
4274           if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))           if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4275             return TRUE;             return TRUE;
4276          return FALSE;          return FALSE;
4277          }          }
# Line 3961  for (;;) Line 4371  for (;;)
4371        {        {
4372        for (i = min;; i++)        for (i = min;; i++)
4373          {          {
4374          if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE;          if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) return TRUE;
4375          if (i >= max || eptr >= md->end_subject) return FALSE;          if (i >= max || eptr >= md->end_subject) return FALSE;
4376    
4377          c = *eptr++;          c = *eptr++;
# Line 4080  for (;;) Line 4490  for (;;)
4490          }          }
4491    
4492        while (eptr >= pp)        while (eptr >= pp)
4493          if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))          if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4494            return TRUE;            return TRUE;
4495        return FALSE;        return FALSE;
4496        }        }
# Line 4136  pcre_exec(const pcre *external_re, const Line 4546  pcre_exec(const pcre *external_re, const
4546  {  {
4547  int resetcount, ocount;  int resetcount, ocount;
4548  int first_char = -1;  int first_char = -1;
4549  int ims = 0;  int req_char = -1;
4550    int req_char2 = -1;
4551    unsigned long int ims = 0;
4552  match_data match_block;  match_data match_block;
4553  const uschar *start_bits = NULL;  const uschar *start_bits = NULL;
4554  const uschar *start_match = (const uschar *)subject + start_offset;  const uschar *start_match = (const uschar *)subject + start_offset;
4555  const uschar *end_subject;  const uschar *end_subject;
4556    const uschar *req_char_ptr = start_match - 1;
4557  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
4558  const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;  const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4559  BOOL using_temporary_offsets = FALSE;  BOOL using_temporary_offsets = FALSE;
# Line 4153  if (re == NULL || subject == NULL || Line 4566  if (re == NULL || subject == NULL ||
4566     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4567  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4568    
4569    match_block.start_pattern = re->code;
4570  match_block.start_subject = (const uschar *)subject;  match_block.start_subject = (const uschar *)subject;
4571  match_block.end_subject = match_block.start_subject + length;  match_block.end_subject = match_block.start_subject + length;
4572  end_subject = match_block.end_subject;  end_subject = match_block.end_subject;
# Line 4161  match_block.endonly = (re->options & PCR Line 4575  match_block.endonly = (re->options & PCR
4575    
4576  match_block.notbol = (options & PCRE_NOTBOL) != 0;  match_block.notbol = (options & PCRE_NOTBOL) != 0;
4577  match_block.noteol = (options & PCRE_NOTEOL) != 0;  match_block.noteol = (options & PCRE_NOTEOL) != 0;
4578    match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
4579    
4580  match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */  match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */
4581    
# Line 4231  if (!anchored) Line 4646  if (!anchored)
4646          start_bits = extra->start_bits;          start_bits = extra->start_bits;
4647    }    }
4648    
4649  /* Loop for unanchored matches; for anchored regexs the loop runs just once. */  /* For anchored or unanchored matches, there may be a "last known required
4650    character" set. If the PCRE_CASELESS option is set, implying that the match starts
4651    caselessly, or if there are any changes of this flag within the regex, set up
4652    both cases of the character. Otherwise set the two values the same, which will
4653    avoid duplicate testing (which takes significant time). This covers the vast
4654    majority of cases. It will be suboptimal when the case flag changes in a regex
4655    and the required character in fact is caseful. */
4656    
4657    if ((re->options & PCRE_REQCHSET) != 0)
4658      {
4659      req_char = re->req_char;
4660      req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
4661        (re->tables + fcc_offset)[req_char] : req_char;
4662      }
4663    
4664    /* Loop for handling unanchored repeated matching attempts; for anchored regexes
4665    the loop runs just once. */
4666    
4667  do  do
4668    {    {
# Line 4260  do Line 4691  do
4691    
4692    else if (startline)    else if (startline)
4693      {      {
4694      if (start_match > match_block.start_subject)      if (start_match > match_block.start_subject + start_offset)
4695        {        {
4696        while (start_match < end_subject && start_match[-1] != '\n')        while (start_match < end_subject && start_match[-1] != '\n')
4697          start_match++;          start_match++;
4698        }        }
4699      }      }
4700    
4701    /* Or to a non-unique first char */    /* Or to a non-unique first char after study */
4702    
4703    else if (start_bits != NULL)    else if (start_bits != NULL)
4704      {      {
# Line 4284  do Line 4715  do
4715    printf("\n");    printf("\n");
4716  #endif  #endif
4717    
4718      /* If req_char is set, we know that that character must appear in the subject
4719      for the match to succeed. If the first character is set, req_char must be
4720      later in the subject; otherwise the test starts at the match point. This
4721      optimization can save a huge amount of backtracking in patterns with nested
4722      unlimited repeats that aren't going to match. We don't know what the state of
4723      case matching may be when this character is hit, so test for it in both its
4724      cases if necessary. However, the different cased versions will not be set up
4725      unless PCRE_CASELESS was given or the casing state changes within the regex.
4726      Writing separate code makes it go faster, as does using an autoincrement and
4727      backing off on a match. */
4728    
4729      if (req_char >= 0)
4730        {
4731        register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
4732    
4733        /* We don't need to repeat the search if we haven't yet reached the
4734        place we found it at last time. */
4735    
4736        if (p > req_char_ptr)
4737          {
4738          /* Do a single test if no case difference is set up */
4739    
4740          if (req_char == req_char2)
4741            {
4742            while (p < end_subject)
4743              {
4744              if (*p++ == req_char) { p--; break; }
4745              }
4746            }
4747    
4748          /* Otherwise test for either case */
4749    
4750          else
4751            {
4752            while (p < end_subject)
4753              {
4754              register int pp = *p++;
4755              if (pp == req_char || pp == req_char2) { p--; break; }
4756              }
4757            }
4758    
4759          /* If we can't find the required character, break the matching loop */
4760    
4761          if (p >= end_subject) break;
4762    
4763          /* If we have found the required character, save the point where we
4764          found it, so that we don't search again next time round the loop if
4765          the start hasn't passed this character yet. */
4766    
4767          req_char_ptr = p;
4768          }
4769        }
4770    
4771    /* When a match occurs, substrings will be set for all internal extractions;    /* When a match occurs, substrings will be set for all internal extractions;
4772    we just need to set up the whole thing as substring 0 before returning. If    we just need to set up the whole thing as substring 0 before returning. If
4773    there were too many extractions, set the return code to zero. In the case    there were too many extractions, set the return code to zero. In the case
# Line 4291  do Line 4775  do
4775    those back references that we can. In this case there need not be overflow    those back references that we can. In this case there need not be overflow
4776    if certain parts of the pattern were not used. */    if certain parts of the pattern were not used. */
4777    
4778    if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))    match_block.start_match = start_match;
4779      if (!match(start_match, re->code, 2, &match_block, ims, NULL, match_isgroup))
4780      continue;      continue;
4781    
4782    /* Copy the offset information from temporary store if necessary */    /* Copy the offset information from temporary store if necessary */

Legend:
Removed from v.35  
changed lines
  Added in v.47

  ViewVC Help
Powered by ViewVC 1.1.5