/[pcre]/code/trunk/pcre.c
ViewVC logotype

Diff of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 27 by nigel, Sat Feb 24 21:38:49 2007 UTC revision 47 by nigel, Sat Feb 24 21:39:29 2007 UTC
# Line 9  the file Tech.Notes for some information Line 9  the file Tech.Notes for some information
9    
10  Written by: Philip Hazel <ph10@cam.ac.uk>  Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12             Copyright (c) 1997-1999 University of Cambridge             Copyright (c) 1997-2000 University of Cambridge
13    
14  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
15  Permission is granted to anyone to use this software for any purpose on any  Permission is granted to anyone to use this software for any purpose on any
# Line 25  restrictions: Line 25  restrictions:
25    
26  3. Altered versions must be plainly marked as such, and must not be  3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.     misrepresented as being the original software.
28    
29    4. If PCRE is embedded in any software that is released under the GNU
30       General Purpose Licence (GPL), then the terms of that licence shall
31       supersede any condition above with which it is incompatible.
32  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
33  */  */
34    
# Line 78  static const char *OP_names[] = { Line 82  static const char *OP_names[] = {
82    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
83    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
84    "*", "*?", "+", "+?", "?", "??", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{",
85    "class", "Ref",    "class", "Ref", "Recurse",
86    "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",    "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
87    "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",    "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
88    "Brazero", "Braminzero", "Bra"    "Brazero", "Braminzero", "Bra"
# Line 103  static const short int escapes[] = { Line 107  static const short int escapes[] = {
107      0,      0, -ESC_z                                            /* x - z */      0,      0, -ESC_z                                            /* x - z */
108  };  };
109    
110    /* Tables of names of POSIX character classes and their lengths. The list is
111    terminated by a zero length entry. The first three must be alpha, lower, upper,
112    as this is assumed for handling case independence. */
113    
114    static const char *posix_names[] = {
115      "alpha", "lower", "upper",
116      "alnum", "ascii", "cntrl", "digit", "graph",
117      "print", "punct", "space", "word",  "xdigit" };
118    
119    static const uschar posix_name_lengths[] = {
120      5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
121    
122    /* Table of class bit maps for each POSIX class; up to three may be combined
123    to form the class. */
124    
125    static const int posix_class_maps[] = {
126      cbit_lower, cbit_upper, -1,             /* alpha */
127      cbit_lower, -1,         -1,             /* lower */
128      cbit_upper, -1,         -1,             /* upper */
129      cbit_digit, cbit_lower, cbit_upper,     /* alnum */
130      cbit_print, cbit_cntrl, -1,             /* ascii */
131      cbit_cntrl, -1,         -1,             /* cntrl */
132      cbit_digit, -1,         -1,             /* digit */
133      cbit_graph, -1,         -1,             /* graph */
134      cbit_print, -1,         -1,             /* print */
135      cbit_punct, -1,         -1,             /* punct */
136      cbit_space, -1,         -1,             /* space */
137      cbit_word,  -1,         -1,             /* word */
138      cbit_xdigit,-1,         -1              /* xdigit */
139    };
140    
141    
142  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
143    
144  static BOOL  static BOOL
145    compile_regex(int, int, int *, uschar **, const uschar **, const char **,    compile_regex(int, int, int *, uschar **, const uschar **, const char **,
146      BOOL, int, compile_data *);      BOOL, int, int *, int *, compile_data *);
147    
148    /* Structure for building a chain of data that actually lives on the
149    stack, for holding the values of the subject pointer at the start of each
150    subpattern, so as to detect when an empty string has been matched by a
151    subpattern - to break infinite loops. */
152    
153    typedef struct eptrblock {
154      struct eptrblock *prev;
155      const uschar *saved_eptr;
156    } eptrblock;
157    
158    /* Flag bits for the match() function */
159    
160    #define match_condassert   0x01    /* Called to check a condition assertion */
161    #define match_isgroup      0x02    /* Set if start of bracketed group */
162    
163    
164    
# Line 144  tables. */ Line 195  tables. */
195  *          Return version string                 *  *          Return version string                 *
196  *************************************************/  *************************************************/
197    
198    #define STRING(a)  # a
199    #define XSTRING(s) STRING(s)
200    
201  const char *  const char *
202  pcre_version(void)  pcre_version(void)
203  {  {
204  return PCRE_VERSION;  return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
205  }  }
206    
207    
208    
209    
210  /*************************************************  /*************************************************
211  *       Return info about a compiled pattern     *  * (Obsolete) Return info about compiled pattern  *
212  *************************************************/  *************************************************/
213    
214  /* This function picks potentially useful data out of the private  /* This is the original "info" function. It picks potentially useful data out
215  structure.  of the private structure, but its interface was too rigid. It remains for
216    backwards compatibility. The public options are passed back in an int - though
217    the re->options field has been expanded to a long int, all the public options
218    are at the low end of it, and so even on 16-bit systems this will still be OK.
219    Therefore, I haven't changed the API for pcre_info().
220    
221  Arguments:  Arguments:
222    external_re   points to compiled code    external_re   points to compiled code
# Line 167  Arguments: Line 225  Arguments:
225                  or -1 if multiline and all branches start ^,                  or -1 if multiline and all branches start ^,
226                  or -2 otherwise                  or -2 otherwise
227    
228  Returns:        number of identifying extraction brackets  Returns:        number of capturing subpatterns
229                  or negative values on error                  or negative values on error
230  */  */
231    
# Line 177  pcre_info(const pcre *external_re, int * Line 235  pcre_info(const pcre *external_re, int *
235  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
236  if (re == NULL) return PCRE_ERROR_NULL;  if (re == NULL) return PCRE_ERROR_NULL;
237  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
238  if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);  if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
239  if (first_char != NULL)  if (first_char != NULL)
240    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
241       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
# Line 186  return re->top_bracket; Line 244  return re->top_bracket;
244    
245    
246    
247    /*************************************************
248    *        Return info about compiled pattern      *
249    *************************************************/
250    
251    /* This is a newer "info" function which has an extensible interface so
252    that additional items can be added compatibly.
253    
254    Arguments:
255      external_re      points to compiled code
256      external_study   points to study data, or NULL
257      what             what information is required
258      where            where to put the information
259    
260    Returns:           0 if data returned, negative on error
261    */
262    
263    int
264    pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
265      void *where)
266    {
267    const real_pcre *re = (const real_pcre *)external_re;
268    const real_pcre_extra *study = (const real_pcre_extra *)study_data;
269    
270    if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
271    if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
272    
273    switch (what)
274      {
275      case PCRE_INFO_OPTIONS:
276      *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
277      break;
278    
279      case PCRE_INFO_SIZE:
280      *((size_t *)where) = re->size;
281      break;
282    
283      case PCRE_INFO_CAPTURECOUNT:
284      *((int *)where) = re->top_bracket;
285      break;
286    
287      case PCRE_INFO_BACKREFMAX:
288      *((int *)where) = re->top_backref;
289      break;
290    
291      case PCRE_INFO_FIRSTCHAR:
292      *((int *)where) =
293        ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
294        ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
295      break;
296    
297      case PCRE_INFO_FIRSTTABLE:
298      *((const uschar **)where) =
299        (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
300          study->start_bits : NULL;
301      break;
302    
303      case PCRE_INFO_LASTLITERAL:
304      *((int *)where) =
305        ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
306      break;
307    
308      default: return PCRE_ERROR_BADOPTION;
309      }
310    
311    return 0;
312    }
313    
314    
315    
316  #ifdef DEBUG  #ifdef DEBUG
317  /*************************************************  /*************************************************
# Line 245  check_escape(const uschar **ptrptr, cons Line 371  check_escape(const uschar **ptrptr, cons
371    int options, BOOL isclass, compile_data *cd)    int options, BOOL isclass, compile_data *cd)
372  {  {
373  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
374  int c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */  int c, i;
 int i;  
375    
376    c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */
377  if (c == 0) *errorptr = ERR1;  if (c == 0) *errorptr = ERR1;
378    
379  /* Digits or letters may have special meaning; all others are literals. */  /* Digits or letters may have special meaning; all others are literals. */
# Line 528  for (;;) Line 654  for (;;)
654    
655      case OP_REVERSE:      case OP_REVERSE:
656      cc++;      cc++;
657        /* Fall through */
658    
659      case OP_CREF:      case OP_CREF:
660      case OP_OPT:      case OP_OPT:
# Line 611  for (;;) Line 738  for (;;)
738    
739    
740  /*************************************************  /*************************************************
741    *           Check for POSIX class syntax         *
742    *************************************************/
743    
744    /* This function is called when the sequence "[:" or "[." or "[=" is
745    encountered in a character class. It checks whether this is followed by an
746    optional ^ and then a sequence of letters, terminated by a matching ":]" or
747    ".]" or "=]".
748    
749    Argument:
750      ptr      pointer to the initial [
751      endptr   where to return the end pointer
752      cd       pointer to compile data
753    
754    Returns:   TRUE or FALSE
755    */
756    
757    static BOOL
758    check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
759    {
760    int terminator;          /* Don't combine these lines; the Solaris cc */
761    terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
762    if (*(++ptr) == '^') ptr++;
763    while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
764    if (*ptr == terminator && ptr[1] == ']')
765      {
766      *endptr = ptr;
767      return TRUE;
768      }
769    return FALSE;
770    }
771    
772    
773    
774    
775    /*************************************************
776    *          Check POSIX class name                *
777    *************************************************/
778    
779    /* This function is called to check the name given in a POSIX-style class entry
780    such as [:alnum:].
781    
782    Arguments:
783      ptr        points to the first letter
784      len        the length of the name
785    
786    Returns:     a value representing the name, or -1 if unknown
787    */
788    
789    static int
790    check_posix_name(const uschar *ptr, int len)
791    {
792    register int yield = 0;
793    while (posix_name_lengths[yield] != 0)
794      {
795      if (len == posix_name_lengths[yield] &&
796        strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
797      yield++;
798      }
799    return -1;
800    }
801    
802    
803    
804    
805    /*************************************************
806  *           Compile one branch                   *  *           Compile one branch                   *
807  *************************************************/  *************************************************/
808    
# Line 623  Arguments: Line 815  Arguments:
815    ptrptr       points to the current pattern pointer    ptrptr       points to the current pattern pointer
816    errorptr     points to pointer to error message    errorptr     points to pointer to error message
817    optchanged   set to the value of the last OP_OPT item compiled    optchanged   set to the value of the last OP_OPT item compiled
818      reqchar      set to the last literal character required, else -1
819      countlits    set to count of mandatory literal characters
820    cd           contains pointers to tables    cd           contains pointers to tables
821    
822  Returns:       TRUE on success  Returns:       TRUE on success
# Line 632  Returns:       TRUE on success Line 826  Returns:       TRUE on success
826  static BOOL  static BOOL
827  compile_branch(int options, int *brackets, uschar **codeptr,  compile_branch(int options, int *brackets, uschar **codeptr,
828    const uschar **ptrptr, const char **errorptr, int *optchanged,    const uschar **ptrptr, const char **errorptr, int *optchanged,
829    compile_data *cd)    int *reqchar, int *countlits, compile_data *cd)
830  {  {
831  int repeat_type, op_type;  int repeat_type, op_type;
832  int repeat_min, repeat_max;  int repeat_min, repeat_max;
833  int bravalue, length;  int bravalue, length;
834  int greedy_default, greedy_non_default;  int greedy_default, greedy_non_default;
835    int prevreqchar;
836    int condcount = 0;
837    int subcountlits = 0;
838  register int c;  register int c;
839  register uschar *code = *codeptr;  register uschar *code = *codeptr;
840  uschar *tempcode;  uschar *tempcode;
# Line 651  uschar class[32]; Line 848  uschar class[32];
848  greedy_default = ((options & PCRE_UNGREEDY) != 0);  greedy_default = ((options & PCRE_UNGREEDY) != 0);
849  greedy_non_default = greedy_default ^ 1;  greedy_non_default = greedy_default ^ 1;
850    
851    /* Initialize no required char, and count of literals */
852    
853    *reqchar = prevreqchar = -1;
854    *countlits = 0;
855    
856  /* Switch on next character until the end of the branch */  /* Switch on next character until the end of the branch */
857    
858  for (;; ptr++)  for (;; ptr++)
# Line 660  for (;; ptr++) Line 862  for (;; ptr++)
862    int class_lastchar;    int class_lastchar;
863    int newoptions;    int newoptions;
864    int condref;    int condref;
865      int subreqchar;
866    
867    c = *ptr;    c = *ptr;
868    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
# Line 667  for (;; ptr++) Line 870  for (;; ptr++)
870      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
871      if (c == '#')      if (c == '#')
872        {        {
873        while ((c = *(++ptr)) != 0 && c != '\n');        /* The space before the ; is to avoid a warning on a silly compiler
874          on the Macintosh. */
875          while ((c = *(++ptr)) != 0 && c != '\n') ;
876        continue;        continue;
877        }        }
878      }      }
# Line 742  for (;; ptr++) Line 947  for (;; ptr++)
947          goto FAILED;          goto FAILED;
948          }          }
949    
950          /* Handle POSIX class names. Perl allows a negation extension of the
951          form [:^name]. A square bracket that doesn't match the syntax is
952          treated as a literal. We also recognize the POSIX constructions
953          [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
954          5.6 does. */
955    
956          if (c == '[' &&
957              (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
958              check_posix_syntax(ptr, &tempptr, cd))
959            {
960            BOOL local_negate = FALSE;
961            int posix_class, i;
962            register const uschar *cbits = cd->cbits;
963    
964            if (ptr[1] != ':')
965              {
966              *errorptr = ERR31;
967              goto FAILED;
968              }
969    
970            ptr += 2;
971            if (*ptr == '^')
972              {
973              local_negate = TRUE;
974              ptr++;
975              }
976    
977            posix_class = check_posix_name(ptr, tempptr - ptr);
978            if (posix_class < 0)
979              {
980              *errorptr = ERR30;
981              goto FAILED;
982              }
983    
984            /* If matching is caseless, upper and lower are converted to
985            alpha. This relies on the fact that the class table starts with
986            alpha, lower, upper as the first 3 entries. */
987    
988            if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
989              posix_class = 0;
990    
991            /* Or into the map we are building up to 3 of the static class
992            tables, or their negations. */
993    
994            posix_class *= 3;
995            for (i = 0; i < 3; i++)
996              {
997              int taboffset = posix_class_maps[posix_class + i];
998              if (taboffset < 0) break;
999              if (local_negate)
1000                for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1001              else
1002                for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1003              }
1004    
1005            ptr = tempptr + 1;
1006            class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
1007            continue;
1008            }
1009    
1010        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
1011        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. Escaped items are checked for
1012        validity in the pre-compiling pass. The sequence \b is a special case.        validity in the pre-compiling pass. The sequence \b is a special case.
# Line 769  for (;; ptr++) Line 1034  for (;; ptr++)
1034              continue;              continue;
1035    
1036              case ESC_w:              case ESC_w:
1037              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
               class[c] |= (cbits[c+cbit_digit] | cbits[c+cbit_word]);  
1038              continue;              continue;
1039    
1040              case ESC_W:              case ESC_W:
1041              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
               class[c] |= ~(cbits[c+cbit_digit] | cbits[c+cbit_word]);  
1042              continue;              continue;
1043    
1044              case ESC_s:              case ESC_s:
# Line 933  for (;; ptr++) Line 1196  for (;; ptr++)
1196        { repeat_type = greedy_non_default; ptr++; }        { repeat_type = greedy_non_default; ptr++; }
1197      else repeat_type = greedy_default;      else repeat_type = greedy_default;
1198    
     /* If the maximum is zero then the minimum must also be zero; Perl allows  
     this case, so we do too - by simply omitting the item altogether. */  
   
     if (repeat_max == 0) code = previous;  
   
1199      /* If previous was a string of characters, chop off the last one and use it      /* If previous was a string of characters, chop off the last one and use it
1200      as the subject of the repeat. If there was only one character, we can      as the subject of the repeat. If there was only one character, we can
1201      abolish the previous item altogether. */      abolish the previous item altogether. A repeat with a zero minimum wipes
1202        out any reqchar setting, backing up to the previous value. We must also
1203        adjust the countlits value. */
1204    
1205      else if (*previous == OP_CHARS)      if (*previous == OP_CHARS)
1206        {        {
1207        int len = previous[1];        int len = previous[1];
1208    
1209          if (repeat_min == 0) *reqchar = prevreqchar;
1210          *countlits += repeat_min - 1;
1211    
1212        if (len == 1)        if (len == 1)
1213          {          {
1214          c = previous[2];          c = previous[2];
# Line 983  for (;; ptr++) Line 1247  for (;; ptr++)
1247        code = previous;        code = previous;
1248    
1249        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
1250        repeat_type += op_type;      /* Combine both values for many cases */  
1251          /* If the maximum is zero then the minimum must also be zero; Perl allows
1252          this case, so we do too - by simply omitting the item altogether. */
1253    
1254          if (repeat_max == 0) goto END_REPEAT;
1255    
1256          /* Combine the op_type with the repeat_type */
1257    
1258          repeat_type += op_type;
1259    
1260        /* A minimum of zero is handled either as the special case * or ?, or as        /* A minimum of zero is handled either as the special case * or ?, or as
1261        an UPTO, with the maximum given. */        an UPTO, with the maximum given. */
# Line 1060  for (;; ptr++) Line 1332  for (;; ptr++)
1332        }        }
1333    
1334      /* If previous was a character class or a back reference, we put the repeat      /* If previous was a character class or a back reference, we put the repeat
1335      stuff after it. */      stuff after it, but just skip the item if the repeat was {0,0}. */
1336    
1337      else if (*previous == OP_CLASS || *previous == OP_REF)      else if (*previous == OP_CLASS || *previous == OP_REF)
1338        {        {
1339          if (repeat_max == 0)
1340            {
1341            code = previous;
1342            goto END_REPEAT;
1343            }
1344        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
1345          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
1346        else if (repeat_min == 1 && repeat_max == -1)        else if (repeat_min == 1 && repeat_max == -1)
# Line 1087  for (;; ptr++) Line 1364  for (;; ptr++)
1364      else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||      else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1365               (int)*previous == OP_COND)               (int)*previous == OP_COND)
1366        {        {
1367        int i, ketoffset = 0;        register int i;
1368          int ketoffset = 0;
1369        int len = code - previous;        int len = code - previous;
1370          uschar *bralink = NULL;
1371    
1372        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
1373        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
# Line 1103  for (;; ptr++) Line 1382  for (;; ptr++)
1382          ketoffset = code - ket;          ketoffset = code - ket;
1383          }          }
1384    
1385        /* If the minimum is greater than zero, and the maximum is unlimited or        /* The case of a zero minimum is special because of the need to stick
1386        equal to the minimum, the first copy remains where it is, and is        OP_BRAZERO in front of it, and because the group appears once in the
1387        replicated up to the minimum number of times. This case includes the +        data, whereas in other cases it appears the minimum number of times. For
1388        repeat, but of course no replication is needed in that case. */        this reason, it is simplest to treat this case separately, as otherwise
1389          the code gets far too messy. There are several special subcases when the
1390          minimum is zero. */
1391    
1392        if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))        if (repeat_min == 0)
1393          {          {
1394          for (i = 1; i < repeat_min; i++)          /* If we set up a required char from the bracket, we must back off
1395            to the previous value and reset the countlits value too. */
1396    
1397            if (subcountlits > 0)
1398            {            {
1399            memcpy(code, previous, len);            *reqchar = prevreqchar;
1400            code += len;            *countlits -= subcountlits;
1401            }            }
         }  
1402    
1403        /* If the minimum is zero, stick BRAZERO in front of the first copy.          /* If the maximum is also zero, we just omit the group from the output
1404        Then, if there is a fixed upper limit, replicated up to that many times,          altogether. */
       sticking BRAZERO in front of all the optional ones. */  
1405    
1406        else          if (repeat_max == 0)
1407          {            {
1408          if (repeat_min == 0)            code = previous;
1409              goto END_REPEAT;
1410              }
1411    
1412            /* If the maximum is 1 or unlimited, we just have to stick in the
1413            BRAZERO and do no more at this point. */
1414    
1415            if (repeat_max <= 1)
1416            {            {
1417            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
1418            code++;            code++;
1419            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
1420            }            }
1421    
1422            /* If the maximum is greater than 1 and limited, we have to replicate
1423            in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1424            The first one has to be handled carefully because it's the original
1425            copy, which has to be moved up. The remainder can be handled by code
1426            that is common with the non-zero minimum case below. We just have to
1427            adjust the value of repeat_max, since one less copy is required. */
1428    
1429            else
1430              {
1431              int offset;
1432              memmove(previous+4, previous, len);
1433              code += 4;
1434              *previous++ = OP_BRAZERO + repeat_type;
1435              *previous++ = OP_BRA;
1436    
1437              /* We chain together the bracket offset fields that have to be
1438              filled in later when the ends of the brackets are reached. */
1439    
1440              offset = (bralink == NULL)? 0 : previous - bralink;
1441              bralink = previous;
1442              *previous++ = offset >> 8;
1443              *previous++ = offset & 255;
1444              }
1445    
1446            repeat_max--;
1447            }
1448    
1449          /* If the minimum is greater than zero, replicate the group as many
1450          times as necessary, and adjust the maximum to the number of subsequent
1451          copies that we need. */
1452    
1453          else
1454            {
1455          for (i = 1; i < repeat_min; i++)          for (i = 1; i < repeat_min; i++)
1456            {            {
1457            memcpy(code, previous, len);            memcpy(code, previous, len);
1458            code += len;            code += len;
1459            }            }
1460            if (repeat_max > 0) repeat_max -= repeat_min;
1461            }
1462    
1463          /* This code is common to both the zero and non-zero minimum cases. If
1464          the maximum is limited, it replicates the group in a nested fashion,
1465          remembering the bracket starts on a stack. In the case of a zero minimum,
1466          the first one was set up above. In all cases the repeat_max now specifies
1467          the number of additional copies needed. */
1468    
1469          for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)        if (repeat_max >= 0)
1470            {
1471            for (i = repeat_max - 1; i >= 0; i--)
1472            {            {
1473            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
1474    
1475              /* All but the final copy start a new nesting, maintaining the
1476              chain of brackets outstanding. */
1477    
1478              if (i != 0)
1479                {
1480                int offset;
1481                *code++ = OP_BRA;
1482                offset = (bralink == NULL)? 0 : code - bralink;
1483                bralink = code;
1484                *code++ = offset >> 8;
1485                *code++ = offset & 255;
1486                }
1487    
1488            memcpy(code, previous, len);            memcpy(code, previous, len);
1489            code += len;            code += len;
1490            }            }
1491    
1492            /* Now chain through the pending brackets, and fill in their length
1493            fields (which are holding the chain links pro tem). */
1494    
1495            while (bralink != NULL)
1496              {
1497              int oldlinkoffset;
1498              int offset = code - bralink + 1;
1499              uschar *bra = code - offset;
1500              oldlinkoffset = (bra[1] << 8) + bra[2];
1501              bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1502              *code++ = OP_KET;
1503              *code++ = bra[1] = offset >> 8;
1504              *code++ = bra[2] = (offset & 255);
1505              }
1506          }          }
1507    
1508        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
# Line 1149  for (;; ptr++) Line 1510  for (;; ptr++)
1510        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
1511        correct offset was computed above. */        correct offset was computed above. */
1512    
1513        if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type;        else code[-ketoffset] = OP_KETRMAX + repeat_type;
1514        }        }
1515    
1516      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 1162  for (;; ptr++) Line 1523  for (;; ptr++)
1523    
1524      /* In all cases we no longer have a previous item. */      /* In all cases we no longer have a previous item. */
1525    
1526        END_REPEAT:
1527      previous = NULL;      previous = NULL;
1528      break;      break;
1529    
# Line 1239  for (;; ptr++) Line 1601  for (;; ptr++)
1601          ptr++;          ptr++;
1602          break;          break;
1603    
1604            case 'R':                 /* Pattern recursion */
1605            *code++ = OP_RECURSE;
1606            ptr++;
1607            continue;
1608    
1609          default:                  /* Option setting */          default:                  /* Option setting */
1610          set = unset = 0;          set = unset = 0;
1611          optset = &set;          optset = &set;
# Line 1330  for (;; ptr++) Line 1697  for (;; ptr++)
1697           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
1698            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1699           condref,                      /* Condition reference number */           condref,                      /* Condition reference number */
1700             &subreqchar,                  /* For possible last char */
1701             &subcountlits,                /* For literal count */
1702           cd))                          /* Tables block */           cd))                          /* Tables block */
1703        goto FAILED;        goto FAILED;
1704    
# Line 1343  for (;; ptr++) Line 1712  for (;; ptr++)
1712    
1713      if (bravalue == OP_COND)      if (bravalue == OP_COND)
1714        {        {
       int branchcount = 0;  
1715        uschar *tc = code;        uschar *tc = code;
1716          condcount = 0;
1717    
1718        do {        do {
1719           branchcount++;           condcount++;
1720           tc += (tc[1] << 8) | tc[2];           tc += (tc[1] << 8) | tc[2];
1721           }           }
1722        while (*tc != OP_KET);        while (*tc != OP_KET);
1723    
1724        if (branchcount > 2)        if (condcount > 2)
1725          {          {
1726          *errorptr = ERR27;          *errorptr = ERR27;
1727          goto FAILED;          goto FAILED;
1728          }          }
1729        }        }
1730    
1731        /* Handle updating of the required character. If the subpattern didn't
1732        set one, leave it as it was. Otherwise, update it for normal brackets of
1733        all kinds, forward assertions, and conditions with two branches. Don't
1734        update the literal count for forward assertions, however. If the bracket
1735        is followed by a quantifier with zero repeat, we have to back off. Hence
1736        the definition of prevreqchar and subcountlits outside the main loop so
1737        that they can be accessed for the back off. */
1738    
1739        if (subreqchar > 0 &&
1740             (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1741             (bravalue == OP_COND && condcount == 2)))
1742          {
1743          prevreqchar = *reqchar;
1744          *reqchar = subreqchar;
1745          if (bravalue != OP_ASSERT) *countlits += subcountlits;
1746          }
1747    
1748      /* Now update the main code pointer to the end of the group. */      /* Now update the main code pointer to the end of the group. */
1749    
1750      code = tempcode;      code = tempcode;
# Line 1426  for (;; ptr++) Line 1812  for (;; ptr++)
1812          if ((cd->ctypes[c] & ctype_space) != 0) continue;          if ((cd->ctypes[c] & ctype_space) != 0) continue;
1813          if (c == '#')          if (c == '#')
1814            {            {
1815            while ((c = *(++ptr)) != 0 && c != '\n');            /* The space before the ; is to avoid a warning on a silly compiler
1816              on the Macintosh. */
1817              while ((c = *(++ptr)) != 0 && c != '\n') ;
1818            if (c == 0) break;            if (c == 0) break;
1819            continue;            continue;
1820            }            }
# Line 1453  for (;; ptr++) Line 1841  for (;; ptr++)
1841    
1842      while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);      while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
1843    
1844        /* Update the last character and the count of literals */
1845    
1846        prevreqchar = (length > 1)? code[-2] : *reqchar;
1847        *reqchar = code[-1];
1848        *countlits += length;
1849    
1850      /* Compute the length and set it in the data vector, and advance to      /* Compute the length and set it in the data vector, and advance to
1851      the next state. */      the next state. */
1852    
# Line 1496  Argument: Line 1890  Argument:
1890    errorptr    -> pointer to error message    errorptr    -> pointer to error message
1891    lookbehind  TRUE if this is a lookbehind assertion    lookbehind  TRUE if this is a lookbehind assertion
1892    condref     > 0 for OPT_CREF setting at start of conditional group    condref     > 0 for OPT_CREF setting at start of conditional group
1893      reqchar     -> place to put the last required character, or a negative number
1894      countlits   -> place to put the shortest literal count of any branch
1895    cd          points to the data block with tables pointers    cd          points to the data block with tables pointers
1896    
1897  Returns:      TRUE on success  Returns:      TRUE on success
# Line 1504  Returns:      TRUE on success Line 1900  Returns:      TRUE on success
1900  static BOOL  static BOOL
1901  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
1902    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
1903    compile_data *cd)    int *reqchar, int *countlits, compile_data *cd)
1904  {  {
1905  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
1906  uschar *code = *codeptr;  uschar *code = *codeptr;
# Line 1512  uschar *last_branch = code; Line 1908  uschar *last_branch = code;
1908  uschar *start_bracket = code;  uschar *start_bracket = code;
1909  uschar *reverse_count = NULL;  uschar *reverse_count = NULL;
1910  int oldoptions = options & PCRE_IMS;  int oldoptions = options & PCRE_IMS;
1911    int branchreqchar, branchcountlits;
1912    
1913    *reqchar = -1;
1914    *countlits = INT_MAX;
1915  code += 3;  code += 3;
1916    
1917  /* At the start of a reference-based conditional group, insert the reference  /* At the start of a reference-based conditional group, insert the reference
# Line 1551  for (;;) Line 1950  for (;;)
1950    
1951    /* Now compile the branch */    /* Now compile the branch */
1952    
1953    if (!compile_branch(options,brackets,&code,&ptr,errorptr,&optchanged,cd))    if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
1954          &branchreqchar, &branchcountlits, cd))
1955      {      {
1956      *ptrptr = ptr;      *ptrptr = ptr;
1957      return FALSE;      return FALSE;
# Line 1563  for (;;) Line 1963  for (;;)
1963    last_branch[1] = length >> 8;    last_branch[1] = length >> 8;
1964    last_branch[2] = length & 255;    last_branch[2] = length & 255;
1965    
1966      /* Save the last required character if all branches have the same; a current
1967      value of -1 means unset, while -2 means "previous branch had no last required
1968      char".  */
1969    
1970      if (*reqchar != -2)
1971        {
1972        if (branchreqchar >= 0)
1973          {
1974          if (*reqchar == -1) *reqchar = branchreqchar;
1975          else if (*reqchar != branchreqchar) *reqchar = -2;
1976          }
1977        else *reqchar = -2;
1978        }
1979    
1980      /* Keep the shortest literal count */
1981    
1982      if (branchcountlits < *countlits) *countlits = branchcountlits;
1983      DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
1984    
1985    /* If lookbehind, check that this branch matches a fixed-length string,    /* If lookbehind, check that this branch matches a fixed-length string,
1986    and put the length into the OP_REVERSE item. Temporarily mark the end of    and put the length into the OP_REVERSE item. Temporarily mark the end of
1987    the branch with OP_END. */    the branch with OP_END. */
# Line 1657  for (;;) Line 2076  for (;;)
2076      code += 2;      code += 2;
2077      break;      break;
2078    
2079        case OP_WORD_BOUNDARY:
2080        case OP_NOT_WORD_BOUNDARY:
2081        code++;
2082        break;
2083    
2084      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
2085      case OP_ASSERTBACK:      case OP_ASSERTBACK:
2086      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
# Line 1684  all of whose alternatives start with OP_ Line 2108  all of whose alternatives start with OP_
2108  it's anchored. However, if this is a multiline pattern, then only OP_SOD  it's anchored. However, if this is a multiline pattern, then only OP_SOD
2109  counts, since OP_CIRC can match in the middle.  counts, since OP_CIRC can match in the middle.
2110    
2111  A branch is also implicitly anchored if it starts with .* because that will try  A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2112  the rest of the pattern at all possible matching points, so there is no point  because that will try the rest of the pattern at all possible matching points,
2113  trying them again.  so there is no point trying them again.
2114    
2115  Arguments:  Arguments:
2116    code       points to start of expression (the bracket)    code       points to start of expression (the bracket)
# Line 1704  do { Line 2128  do {
2128     register int op = *scode;     register int op = *scode;
2129     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2130       { if (!is_anchored(scode, options)) return FALSE; }       { if (!is_anchored(scode, options)) return FALSE; }
2131     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
2132                (*options & PCRE_DOTALL) != 0)
2133       { if (scode[1] != OP_ANY) return FALSE; }       { if (scode[1] != OP_ANY) return FALSE; }
2134     else if (op != OP_SOD &&     else if (op != OP_SOD &&
2135             ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))             ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
# Line 1718  return TRUE; Line 2143  return TRUE;
2143    
2144    
2145  /*************************************************  /*************************************************
2146  *     Check for start with \n line expression    *  *         Check for starting with ^ or .*        *
2147  *************************************************/  *************************************************/
2148    
2149  /* This is called for multiline expressions to try to find out if every branch  /* This is called to find out if every branch starts with ^ or .* so that
2150  starts with ^ so that "first char" processing can be done to speed things up.  "first char" processing can be done to speed things up in multiline
2151    matching and for non-DOTALL patterns that start with .* (which must start at
2152    the beginning or after \n).
2153    
2154  Argument:  points to start of expression (the bracket)  Argument:  points to start of expression (the bracket)
2155  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
# Line 1736  do { Line 2163  do {
2163     register int op = *scode;     register int op = *scode;
2164     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2165       { if (!is_startline(scode)) return FALSE; }       { if (!is_startline(scode)) return FALSE; }
2166       else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2167         { if (scode[1] != OP_ANY) return FALSE; }
2168     else if (op != OP_CIRC) return FALSE;     else if (op != OP_CIRC) return FALSE;
2169     code += (code[1] << 8) + code[2];     code += (code[1] << 8) + code[2];
2170     }     }
# Line 1834  pcre_compile(const char *pattern, int op Line 2263  pcre_compile(const char *pattern, int op
2263  real_pcre *re;  real_pcre *re;
2264  int length = 3;      /* For initial BRA plus length */  int length = 3;      /* For initial BRA plus length */
2265  int runlength;  int runlength;
2266  int c, size;  int c, reqchar, countlits;
2267  int bracount = 0;  int bracount = 0;
2268  int top_backref = 0;  int top_backref = 0;
2269  int branch_extra = 0;  int branch_extra = 0;
2270  int branch_newextra;  int branch_newextra;
2271  unsigned int brastackptr = 0;  unsigned int brastackptr = 0;
2272    size_t size;
2273  uschar *code;  uschar *code;
2274  const uschar *ptr;  const uschar *ptr;
2275  compile_data compile_block;  compile_data compile_block;
# Line 1902  while ((c = *(++ptr)) != 0) Line 2332  while ((c = *(++ptr)) != 0)
2332      if ((compile_block.ctypes[c] & ctype_space) != 0) continue;      if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2333      if (c == '#')      if (c == '#')
2334        {        {
2335        while ((c = *(++ptr)) != 0 && c != '\n');        /* The space before the ; is to avoid a warning on a silly compiler
2336          on the Macintosh. */
2337          while ((c = *(++ptr)) != 0 && c != '\n') ;
2338        continue;        continue;
2339        }        }
2340      }      }
# Line 2067  while ((c = *(++ptr)) != 0) Line 2499  while ((c = *(++ptr)) != 0)
2499          ptr += 2;          ptr += 2;
2500          break;          break;
2501    
2502            /* A recursive call to the regex is an extension, to provide the
2503            facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2504    
2505            case 'R':
2506            if (ptr[3] != ')')
2507              {
2508              *errorptr = ERR29;
2509              goto PCRE_ERROR_RETURN;
2510              }
2511            ptr += 3;
2512            length += 1;
2513            break;
2514    
2515          /* Lookbehinds are in Perl from version 5.005 */          /* Lookbehinds are in Perl from version 5.005 */
2516    
2517          case '<':          case '<':
# Line 2099  while ((c = *(++ptr)) != 0) Line 2544  while ((c = *(++ptr)) != 0)
2544          else   /* An assertion must follow */          else   /* An assertion must follow */
2545            {            {
2546            ptr++;   /* Can treat like ':' as far as spacing is concerned */            ptr++;   /* Can treat like ':' as far as spacing is concerned */
2547              if (ptr[2] != '?' ||
2548            if (ptr[2] != '?' || strchr("=!<", ptr[3]) == NULL)               (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
2549              {              {
2550              ptr += 2;    /* To get right offset in message */              ptr += 2;    /* To get right offset in message */
2551              *errorptr = ERR28;              *errorptr = ERR28;
# Line 2174  while ((c = *(++ptr)) != 0) Line 2619  while ((c = *(++ptr)) != 0)
2619              will lead to an over-estimate on the length, but this shouldn't              will lead to an over-estimate on the length, but this shouldn't
2620              matter very much. We also have to allow for resetting options at              matter very much. We also have to allow for resetting options at
2621              the start of any alternations, which we do by setting              the start of any alternations, which we do by setting
2622              branch_newextra to 2. */              branch_newextra to 2. Finally, we record whether the case-dependent
2623                flag ever changes within the regex. This is used by the "required
2624                character" code. */
2625    
2626              case ':':              case ':':
2627              if (((set|unset) & PCRE_IMS) != 0)              if (((set|unset) & PCRE_IMS) != 0)
2628                {                {
2629                length += 4;                length += 4;
2630                branch_newextra = 2;                branch_newextra = 2;
2631                  if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2632                }                }
2633              goto END_OPTIONS;              goto END_OPTIONS;
2634    
# Line 2268  while ((c = *(++ptr)) != 0) Line 2716  while ((c = *(++ptr)) != 0)
2716        else if (c == '+') { maxval = -1; ptr++; }        else if (c == '+') { maxval = -1; ptr++; }
2717        else if (c == '?') { minval = 0; ptr++; }        else if (c == '?') { minval = 0; ptr++; }
2718    
2719        /* If there is a minimum > 1 we have to replicate up to minval-1 times;        /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2720        if there is a limited maximum we have to replicate up to maxval-1 times        group, and if the maximum is greater than zero, we have to replicate
2721        and allow for a BRAZERO item before each optional copy, as we also have        maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2722        to do before the first copy if the minimum is zero. */        bracket set - hence the 7. */
2723    
2724        if (minval == 0) length++;        if (minval == 0)
2725          else if (minval > 1) length += (minval - 1) * duplength;          {
2726        if (maxval > minval) length += (maxval - minval) * (duplength + 1);          length++;
2727            if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2728            }
2729    
2730          /* When the minimum is greater than zero, we have to replicate up to
2731          minval-1 times, with no additions required in the copies. Then, if
2732          there is a limited maximum we have to replicate up to maxval-1 times
2733          allowing for a BRAZERO item before each optional copy and nesting
2734          brackets for all but one of the optional copies. */
2735    
2736          else
2737            {
2738            length += (minval - 1) * duplength;
2739            if (maxval > minval)   /* Need this test as maxval=-1 means no limit */
2740              length += (maxval - minval) * (duplength + 7) - 6;
2741            }
2742        }        }
2743      continue;      continue;
2744    
# Line 2295  while ((c = *(++ptr)) != 0) Line 2758  while ((c = *(++ptr)) != 0)
2758          if ((compile_block.ctypes[c] & ctype_space) != 0) continue;          if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2759          if (c == '#')          if (c == '#')
2760            {            {
2761            while ((c = *(++ptr)) != 0 && c != '\n');            /* The space before the ; is to avoid a warning on a silly compiler
2762              on the Macintosh. */
2763              while ((c = *(++ptr)) != 0 && c != '\n') ;
2764            continue;            continue;
2765            }            }
2766          }          }
# Line 2351  if (re == NULL) Line 2816  if (re == NULL)
2816    return NULL;    return NULL;
2817    }    }
2818    
2819  /* Put in the magic number and the options. */  /* Put in the magic number, and save the size, options, and table pointer */
2820    
2821  re->magic_number = MAGIC_NUMBER;  re->magic_number = MAGIC_NUMBER;
2822    re->size = size;
2823  re->options = options;  re->options = options;
2824  re->tables = tables;  re->tables = tables;
2825    
# Line 2366  code = re->code; Line 2832  code = re->code;
2832  *code = OP_BRA;  *code = OP_BRA;
2833  bracount = 0;  bracount = 0;
2834  (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,  (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
2835    &compile_block);    &reqchar, &countlits, &compile_block);
2836  re->top_bracket = bracount;  re->top_bracket = bracount;
2837  re->top_backref = top_backref;  re->top_backref = top_backref;
2838    
# Line 2398  if (*errorptr != NULL) Line 2864  if (*errorptr != NULL)
2864    return NULL;    return NULL;
2865    }    }
2866    
2867  /* If the anchored option was not passed, set flag if we can determine that it  /* If the anchored option was not passed, set flag if we can determine that the
2868  is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if  pattern is anchored by virtue of ^ characters or \A or anything else (such as
2869  we can determine what the first character has to be, because that speeds up  starting with .* when DOTALL is set).
2870  unanchored matches no end. In the case of multiline matches, an alternative is  
2871  to set the PCRE_STARTLINE flag if all branches start with ^. */  Otherwise, see if we can determine what the first character has to be, because
2872    that speeds up unanchored matches no end. If not, see if we can set the
2873    PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
2874    start with ^, and also when all branches start with .* for non-DOTALL matches.
2875    */
2876    
2877  if ((options & PCRE_ANCHORED) == 0)  if ((options & PCRE_ANCHORED) == 0)
2878    {    {
# Line 2422  if ((options & PCRE_ANCHORED) == 0) Line 2892  if ((options & PCRE_ANCHORED) == 0)
2892      }      }
2893    }    }
2894    
2895    /* Save the last required character if there are at least two literal
2896    characters on all paths, or if there is no first character setting. */
2897    
2898    if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
2899      {
2900      re->req_char = reqchar;
2901      re->options |= PCRE_REQCHSET;
2902      }
2903    
2904  /* Print out the compiled data for debugging */  /* Print out the compiled data for debugging */
2905    
2906  #ifdef DEBUG  #ifdef DEBUG
# Line 2431  printf("Length = %d top_bracket = %d top Line 2910  printf("Length = %d top_bracket = %d top
2910    
2911  if (re->options != 0)  if (re->options != 0)
2912    {    {
2913    printf("%s%s%s%s%s%s%s%s\n",    printf("%s%s%s%s%s%s%s%s%s\n",
2914      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2915      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2916        ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
2917      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2918      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2919      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
# Line 2448  if ((re->options & PCRE_FIRSTSET) != 0) Line 2928  if ((re->options & PCRE_FIRSTSET) != 0)
2928      else printf("First char = \\x%02x\n", re->first_char);      else printf("First char = \\x%02x\n", re->first_char);
2929    }    }
2930    
2931    if ((re->options & PCRE_REQCHSET) != 0)
2932      {
2933      if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
2934        else printf("Req char = \\x%02x\n", re->req_char);
2935      }
2936    
2937  code_end = code;  code_end = code;
2938  code_base = code = re->code;  code_base = code = re->code;
2939    
# Line 2681  Returns:      TRUE if matched Line 3167  Returns:      TRUE if matched
3167    
3168  static BOOL  static BOOL
3169  match_ref(int offset, register const uschar *eptr, int length, match_data *md,  match_ref(int offset, register const uschar *eptr, int length, match_data *md,
3170    int ims)    unsigned long int ims)
3171  {  {
3172  const uschar *p = md->start_subject + md->offset_vector[offset];  const uschar *p = md->start_subject + md->offset_vector[offset];
3173    
# Line 2732  Arguments: Line 3218  Arguments:
3218     offset_top  current top pointer     offset_top  current top pointer
3219     md          pointer to "static" info for the match     md          pointer to "static" info for the match
3220     ims         current /i, /m, and /s options     ims         current /i, /m, and /s options
3221     condassert  TRUE if called to check a condition assertion     eptrb       pointer to chain of blocks containing eptr at start of
3222     eptrb       eptr at start of last bracket                   brackets - for testing for empty matches
3223       flags       can contain
3224                     match_condassert - this is an assertion condition
3225                     match_isgroup - this is the start of a bracketed group
3226    
3227  Returns:       TRUE if matched  Returns:       TRUE if matched
3228  */  */
3229    
3230  static BOOL  static BOOL
3231  match(register const uschar *eptr, register const uschar *ecode,  match(register const uschar *eptr, register const uschar *ecode,
3232    int offset_top, match_data *md, int ims, BOOL condassert, const uschar *eptrb)    int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
3233      int flags)
3234  {  {
3235  int original_ims = ims;   /* Save for resetting on ')' */  unsigned long int original_ims = ims;   /* Save for resetting on ')' */
3236    eptrblock newptrb;
3237    
3238    /* At the start of a bracketed group, add the current subject pointer to the
3239    stack of such pointers, to be re-instated at the end of the group when we hit
3240    the closing ket. When match() is called in other circumstances, we don't add to
3241    the stack. */
3242    
3243    if ((flags & match_isgroup) != 0)
3244      {
3245      newptrb.prev = eptrb;
3246      newptrb.saved_eptr = eptr;
3247      eptrb = &newptrb;
3248      }
3249    
3250    /* Now start processing the operations. */
3251    
3252  for (;;)  for (;;)
3253    {    {
# Line 2771  for (;;) Line 3276  for (;;)
3276      int number = op - OP_BRA;      int number = op - OP_BRA;
3277      int offset = number << 1;      int offset = number << 1;
3278    
3279      DPRINTF(("start bracket %d\n", number));  #ifdef DEBUG
3280        printf("start bracket %d subject=", number);
3281        pchars(eptr, 16, TRUE, md);
3282        printf("\n");
3283    #endif
3284    
3285      if (offset < md->offset_max)      if (offset < md->offset_max)
3286        {        {
# Line 2784  for (;;) Line 3293  for (;;)
3293    
3294        do        do
3295          {          {
3296          if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;          if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3297              return TRUE;
3298          ecode += (ecode[1] << 8) + ecode[2];          ecode += (ecode[1] << 8) + ecode[2];
3299          }          }
3300        while (*ecode == OP_ALT);        while (*ecode == OP_ALT);
# Line 2810  for (;;) Line 3320  for (;;)
3320      DPRINTF(("start bracket 0\n"));      DPRINTF(("start bracket 0\n"));
3321      do      do
3322        {        {
3323        if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;        if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3324            return TRUE;
3325        ecode += (ecode[1] << 8) + ecode[2];        ecode += (ecode[1] << 8) + ecode[2];
3326        }        }
3327      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
# Line 2829  for (;;) Line 3340  for (;;)
3340        return match(eptr,        return match(eptr,
3341          ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?          ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3342            5 : 3 + (ecode[1] << 8) + ecode[2]),            5 : 3 + (ecode[1] << 8) + ecode[2]),
3343          offset_top, md, ims, FALSE, eptr);          offset_top, md, ims, eptrb, match_isgroup);
3344        }        }
3345    
3346      /* The condition is an assertion. Call match() to evaluate it - setting      /* The condition is an assertion. Call match() to evaluate it - setting
# Line 2837  for (;;) Line 3348  for (;;)
3348    
3349      else      else
3350        {        {
3351        if (match(eptr, ecode+3, offset_top, md, ims, TRUE, NULL))        if (match(eptr, ecode+3, offset_top, md, ims, NULL,
3352              match_condassert | match_isgroup))
3353          {          {
3354          ecode += 3 + (ecode[4] << 8) + ecode[5];          ecode += 3 + (ecode[4] << 8) + ecode[5];
3355          while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];          while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3356          }          }
3357        else ecode += (ecode[1] << 8) + ecode[2];        else ecode += (ecode[1] << 8) + ecode[2];
3358        return match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr);        return match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup);
3359        }        }
3360      /* Control never reaches here */      /* Control never reaches here */
3361    
# Line 2853  for (;;) Line 3365  for (;;)
3365      ecode += 2;      ecode += 2;
3366      break;      break;
3367    
3368      /* End of the pattern */      /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3369        an empty string - recursion will then try other alternatives, if any. */
3370    
3371      case OP_END:      case OP_END:
3372        if (md->notempty && eptr == md->start_match) return FALSE;
3373      md->end_match_ptr = eptr;          /* Record where we ended */      md->end_match_ptr = eptr;          /* Record where we ended */
3374      md->end_offset_top = offset_top;   /* and how many extracts were taken */      md->end_offset_top = offset_top;   /* and how many extracts were taken */
3375      return TRUE;      return TRUE;
# Line 2865  for (;;) Line 3379  for (;;)
3379      case OP_OPT:      case OP_OPT:
3380      ims = ecode[1];      ims = ecode[1];
3381      ecode += 2;      ecode += 2;
3382      DPRINTF(("ims set to %02x\n", ims));      DPRINTF(("ims set to %02lx\n", ims));
3383      break;      break;
3384    
3385      /* Assertion brackets. Check the alternative branches in turn - the      /* Assertion brackets. Check the alternative branches in turn - the
# Line 2878  for (;;) Line 3392  for (;;)
3392      case OP_ASSERTBACK:      case OP_ASSERTBACK:
3393      do      do
3394        {        {
3395        if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) break;        if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup)) break;
3396        ecode += (ecode[1] << 8) + ecode[2];        ecode += (ecode[1] << 8) + ecode[2];
3397        }        }
3398      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
# Line 2886  for (;;) Line 3400  for (;;)
3400    
3401      /* If checking an assertion for a condition, return TRUE. */      /* If checking an assertion for a condition, return TRUE. */
3402    
3403      if (condassert) return TRUE;      if ((flags & match_condassert) != 0) return TRUE;
3404    
3405      /* Continue from after the assertion, updating the offsets high water      /* Continue from after the assertion, updating the offsets high water
3406      mark, since extracts may have been taken during the assertion. */      mark, since extracts may have been taken during the assertion. */
# Line 2902  for (;;) Line 3416  for (;;)
3416      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
3417      do      do
3418        {        {
3419        if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) return FALSE;        if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup))
3420            return FALSE;
3421        ecode += (ecode[1] << 8) + ecode[2];        ecode += (ecode[1] << 8) + ecode[2];
3422        }        }
3423      while (*ecode == OP_ALT);      while (*ecode == OP_ALT);
3424    
3425      if (condassert) return TRUE;      if ((flags & match_condassert) != 0) return TRUE;
3426    
3427      ecode += 3;      ecode += 3;
3428      continue;      continue;
3429    
# Line 2921  for (;;) Line 3437  for (;;)
3437      ecode += 3;      ecode += 3;
3438      break;      break;
3439    
3440        /* Recursion matches the current regex, nested. If there are any capturing
3441        brackets started but not finished, we have to save their starting points
3442        and reinstate them after the recursion. However, we don't know how many
3443        such there are (offset_top records the completed total) so we just have
3444        to save all the potential data. There may be up to 99 such values, which
3445        is a bit large to put on the stack, but using malloc for small numbers
3446        seems expensive. As a compromise, the stack is used when there are fewer
3447        than 16 values to store; otherwise malloc is used. A problem is what to do
3448        if the malloc fails ... there is no way of returning to the top level with
3449        an error. Save the top 15 values on the stack, and accept that the rest
3450        may be wrong. */
3451    
3452        case OP_RECURSE:
3453          {
3454          BOOL rc;
3455          int *save;
3456          int stacksave[15];
3457    
3458          c = md->offset_max;
3459    
3460          if (c < 16) save = stacksave; else
3461            {
3462            save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3463            if (save == NULL)
3464              {
3465              save = stacksave;
3466              c = 15;
3467              }
3468            }
3469    
3470          for (i = 1; i <= c; i++)
3471            save[i] = md->offset_vector[md->offset_end - i];
3472          rc = match(eptr, md->start_pattern, offset_top, md, ims, eptrb,
3473            match_isgroup);
3474          for (i = 1; i <= c; i++)
3475            md->offset_vector[md->offset_end - i] = save[i];
3476          if (save != stacksave) (pcre_free)(save);
3477          if (!rc) return FALSE;
3478    
3479          /* In case the recursion has set more capturing values, save the final
3480          number, then move along the subject till after the recursive match,
3481          and advance one byte in the pattern code. */
3482    
3483          offset_top = md->end_offset_top;
3484          eptr = md->end_match_ptr;
3485          ecode++;
3486          }
3487        break;
3488    
3489      /* "Once" brackets are like assertion brackets except that after a match,      /* "Once" brackets are like assertion brackets except that after a match,
3490      the point in the subject string is not moved back. Thus there can never be      the point in the subject string is not moved back. Thus there can never be
# Line 2932  for (;;) Line 3496  for (;;)
3496      case OP_ONCE:      case OP_ONCE:
3497        {        {
3498        const uschar *prev = ecode;        const uschar *prev = ecode;
3499          const uschar *saved_eptr = eptr;
3500    
3501        do        do
3502          {          {
3503          if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) break;          if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3504              break;
3505          ecode += (ecode[1] << 8) + ecode[2];          ecode += (ecode[1] << 8) + ecode[2];
3506          }          }
3507        while (*ecode == OP_ALT);        while (*ecode == OP_ALT);
# Line 2958  for (;;) Line 3524  for (;;)
3524        5.005. If there is an options reset, it will get obeyed in the normal        5.005. If there is an options reset, it will get obeyed in the normal
3525        course of events. */        course of events. */
3526    
3527        if (*ecode == OP_KET || eptr == eptrb)        if (*ecode == OP_KET || eptr == saved_eptr)
3528          {          {
3529          ecode += 3;          ecode += 3;
3530          break;          break;
# Line 2972  for (;;) Line 3538  for (;;)
3538        if (ecode[3] == OP_OPT)        if (ecode[3] == OP_OPT)
3539          {          {
3540          ims = (ims & ~PCRE_IMS) | ecode[4];          ims = (ims & ~PCRE_IMS) | ecode[4];
3541          DPRINTF(("ims set to %02x at group repeat\n", ims));          DPRINTF(("ims set to %02lx at group repeat\n", ims));
3542          }          }
3543    
3544        if (*ecode == OP_KETRMIN)        if (*ecode == OP_KETRMIN)
3545          {          {
3546          if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||          if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3547              match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;              match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3548                  return TRUE;
3549          }          }
3550        else  /* OP_KETRMAX */        else  /* OP_KETRMAX */
3551          {          {
3552          if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||          if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3553              match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;              match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3554          }          }
3555        }        }
3556      return FALSE;      return FALSE;
# Line 3004  for (;;) Line 3571  for (;;)
3571      case OP_BRAZERO:      case OP_BRAZERO:
3572        {        {
3573        const uschar *next = ecode+1;        const uschar *next = ecode+1;
3574        if (match(eptr, next, offset_top, md, ims, FALSE, eptr)) return TRUE;        if (match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
3575            return TRUE;
3576        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3577        ecode = next + 3;        ecode = next + 3;
3578        }        }
# Line 3014  for (;;) Line 3582  for (;;)
3582        {        {
3583        const uschar *next = ecode+1;        const uschar *next = ecode+1;
3584        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);        do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3585        if (match(eptr, next+3, offset_top, md, ims, FALSE, eptr)) return TRUE;        if (match(eptr, next+3, offset_top, md, ims, eptrb, match_isgroup))
3586            return TRUE;
3587        ecode++;        ecode++;
3588        }        }
3589      break;      break;
# Line 3029  for (;;) Line 3598  for (;;)
3598      case OP_KETRMAX:      case OP_KETRMAX:
3599        {        {
3600        const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];        const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3601          const uschar *saved_eptr = eptrb->saved_eptr;
3602    
3603          eptrb = eptrb->prev;    /* Back up the stack of bracket start pointers */
3604    
3605        if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||        if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3606            *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||            *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
# Line 3048  for (;;) Line 3620  for (;;)
3620          int number = *prev - OP_BRA;          int number = *prev - OP_BRA;
3621          int offset = number << 1;          int offset = number << 1;
3622    
3623          DPRINTF(("end bracket %d\n", number));  #ifdef DEBUG
3624            printf("end bracket %d", number);
3625            printf("\n");
3626    #endif
3627    
3628          if (number > 0)          if (number > 0)
3629            {            {
# Line 3066  for (;;) Line 3641  for (;;)
3641        the group. */        the group. */
3642    
3643        ims = original_ims;        ims = original_ims;
3644        DPRINTF(("ims reset to %02x\n", ims));        DPRINTF(("ims reset to %02lx\n", ims));
3645    
3646        /* For a non-repeating ket, just continue at this level. This also        /* For a non-repeating ket, just continue at this level. This also
3647        happens for a repeating ket if no characters were matched in the group.        happens for a repeating ket if no characters were matched in the group.
# Line 3074  for (;;) Line 3649  for (;;)
3649        5.005. If there is an options reset, it will get obeyed in the normal        5.005. If there is an options reset, it will get obeyed in the normal
3650        course of events. */        course of events. */
3651    
3652        if (*ecode == OP_KET || eptr == eptrb)        if (*ecode == OP_KET || eptr == saved_eptr)
3653          {          {
3654          ecode += 3;          ecode += 3;
3655          break;          break;
# Line 3085  for (;;) Line 3660  for (;;)
3660    
3661        if (*ecode == OP_KETRMIN)        if (*ecode == OP_KETRMIN)
3662          {          {
3663          if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||          if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3664              match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;              match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3665                  return TRUE;
3666          }          }
3667        else  /* OP_KETRMAX */        else  /* OP_KETRMAX */
3668          {          {
3669          if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||          if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3670              match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;              match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3671          }          }
3672        }        }
3673      return FALSE;      return FALSE;
# Line 3302  for (;;) Line 3878  for (;;)
3878          {          {
3879          for (i = min;; i++)          for (i = min;; i++)
3880            {            {
3881            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
3882              return TRUE;              return TRUE;
3883            if (i >= max || !match_ref(offset, eptr, length, md, ims))            if (i >= max || !match_ref(offset, eptr, length, md, ims))
3884              return FALSE;              return FALSE;
# Line 3323  for (;;) Line 3899  for (;;)
3899            }            }
3900          while (eptr >= pp)          while (eptr >= pp)
3901            {            {
3902            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
3903              return TRUE;              return TRUE;
3904            eptr -= length;            eptr -= length;
3905            }            }
# Line 3394  for (;;) Line 3970  for (;;)
3970          {          {
3971          for (i = min;; i++)          for (i = min;; i++)
3972            {            {
3973            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
3974              return TRUE;              return TRUE;
3975            if (i >= max || eptr >= md->end_subject) return FALSE;            if (i >= max || eptr >= md->end_subject) return FALSE;
3976            c = *eptr++;            c = *eptr++;
# Line 3418  for (;;) Line 3994  for (;;)
3994            }            }
3995    
3996          while (eptr >= pp)          while (eptr >= pp)
3997            if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
3998              return TRUE;              return TRUE;
3999          return FALSE;          return FALSE;
4000          }          }
# Line 3515  for (;;) Line 4091  for (;;)
4091          {          {
4092          for (i = min;; i++)          for (i = min;; i++)
4093            {            {
4094            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4095              return TRUE;              return TRUE;
4096            if (i >= max || eptr >= md->end_subject ||            if (i >= max || eptr >= md->end_subject ||
4097                c != md->lcc[*eptr++])                c != md->lcc[*eptr++])
# Line 3532  for (;;) Line 4108  for (;;)
4108            eptr++;            eptr++;
4109            }            }
4110          while (eptr >= pp)          while (eptr >= pp)
4111            if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4112              return TRUE;              return TRUE;
4113          return FALSE;          return FALSE;
4114          }          }
# Line 3549  for (;;) Line 4125  for (;;)
4125          {          {
4126          for (i = min;; i++)          for (i = min;; i++)
4127            {            {
4128            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4129              return TRUE;              return TRUE;
4130            if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;            if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
4131            }            }
# Line 3564  for (;;) Line 4140  for (;;)
4140            eptr++;            eptr++;
4141            }            }
4142          while (eptr >= pp)          while (eptr >= pp)
4143           if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))           if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4144             return TRUE;             return TRUE;
4145          return FALSE;          return FALSE;
4146          }          }
# Line 3646  for (;;) Line 4222  for (;;)
4222          {          {
4223          for (i = min;; i++)          for (i = min;; i++)
4224            {            {
4225            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4226              return TRUE;              return TRUE;
4227            if (i >= max || eptr >= md->end_subject ||            if (i >= max || eptr >= md->end_subject ||
4228                c == md->lcc[*eptr++])                c == md->lcc[*eptr++])
# Line 3663  for (;;) Line 4239  for (;;)
4239            eptr++;            eptr++;
4240            }            }
4241          while (eptr >= pp)          while (eptr >= pp)
4242            if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4243              return TRUE;              return TRUE;
4244          return FALSE;          return FALSE;
4245          }          }
# Line 3680  for (;;) Line 4256  for (;;)
4256          {          {
4257          for (i = min;; i++)          for (i = min;; i++)
4258            {            {
4259            if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4260              return TRUE;              return TRUE;
4261            if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;            if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
4262            }            }
# Line 3695  for (;;) Line 4271  for (;;)
4271            eptr++;            eptr++;
4272            }            }
4273          while (eptr >= pp)          while (eptr >= pp)
4274           if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))           if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4275             return TRUE;             return TRUE;
4276          return FALSE;          return FALSE;
4277          }          }
# Line 3795  for (;;) Line 4371  for (;;)
4371        {        {
4372        for (i = min;; i++)        for (i = min;; i++)
4373          {          {
4374          if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE;          if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) return TRUE;
4375          if (i >= max || eptr >= md->end_subject) return FALSE;          if (i >= max || eptr >= md->end_subject) return FALSE;
4376    
4377          c = *eptr++;          c = *eptr++;
# Line 3914  for (;;) Line 4490  for (;;)
4490          }          }
4491    
4492        while (eptr >= pp)        while (eptr >= pp)
4493          if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))          if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4494            return TRUE;            return TRUE;
4495        return FALSE;        return FALSE;
4496        }        }
# Line 3952  Arguments: Line 4528  Arguments:
4528    external_extra  points to "hints" from pcre_study() or is NULL    external_extra  points to "hints" from pcre_study() or is NULL
4529    subject         points to the subject string    subject         points to the subject string
4530    length          length of subject string (may contain binary zeros)    length          length of subject string (may contain binary zeros)
4531      start_offset    where to start in the subject string
4532    options         option bits    options         option bits
4533    offsets         points to a vector of ints to be filled in with offsets    offsets         points to a vector of ints to be filled in with offsets
4534    offsetcount     the number of elements in the vector    offsetcount     the number of elements in the vector
# Line 3964  Returns:          > 0 => success; value Line 4541  Returns:          > 0 => success; value
4541    
4542  int  int
4543  pcre_exec(const pcre *external_re, const pcre_extra *external_extra,  pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4544    const char *subject, int length, int options, int *offsets, int offsetcount)    const char *subject, int length, int start_offset, int options, int *offsets,
4545      int offsetcount)
4546  {  {
4547  int resetcount, ocount;  int resetcount, ocount;
4548  int first_char = -1;  int first_char = -1;
4549  int ims = 0;  int req_char = -1;
4550    int req_char2 = -1;
4551    unsigned long int ims = 0;
4552  match_data match_block;  match_data match_block;
4553  const uschar *start_bits = NULL;  const uschar *start_bits = NULL;
4554  const uschar *start_match = (const uschar *)subject;  const uschar *start_match = (const uschar *)subject + start_offset;
4555  const uschar *end_subject;  const uschar *end_subject;
4556    const uschar *req_char_ptr = start_match - 1;
4557  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
4558  const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;  const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4559  BOOL using_temporary_offsets = FALSE;  BOOL using_temporary_offsets = FALSE;
# Line 3985  if (re == NULL || subject == NULL || Line 4566  if (re == NULL || subject == NULL ||
4566     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4567  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4568    
4569    match_block.start_pattern = re->code;
4570  match_block.start_subject = (const uschar *)subject;  match_block.start_subject = (const uschar *)subject;
4571  match_block.end_subject = match_block.start_subject + length;  match_block.end_subject = match_block.start_subject + length;
4572  end_subject = match_block.end_subject;  end_subject = match_block.end_subject;
# Line 3993  match_block.endonly = (re->options & PCR Line 4575  match_block.endonly = (re->options & PCR
4575    
4576  match_block.notbol = (options & PCRE_NOTBOL) != 0;  match_block.notbol = (options & PCRE_NOTBOL) != 0;
4577  match_block.noteol = (options & PCRE_NOTEOL) != 0;  match_block.noteol = (options & PCRE_NOTEOL) != 0;
4578    match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
4579    
4580  match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */  match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */
4581    
# Line 4063  if (!anchored) Line 4646  if (!anchored)
4646          start_bits = extra->start_bits;          start_bits = extra->start_bits;
4647    }    }
4648    
4649  /* Loop for unanchored matches; for anchored regexps the loop runs just once. */  /* For anchored or unanchored matches, there may be a "last known required
4650    character" set. If the PCRE_CASELESS is set, implying that the match starts
4651    caselessly, or if there are any changes of this flag within the regex, set up
4652    both cases of the character. Otherwise set the two values the same, which will
4653    avoid duplicate testing (which takes significant time). This covers the vast
4654    majority of cases. It will be suboptimal when the case flag changes in a regex
4655    and the required character in fact is caseful. */
4656    
4657    if ((re->options & PCRE_REQCHSET) != 0)
4658      {
4659      req_char = re->req_char;
4660      req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
4661        (re->tables + fcc_offset)[req_char] : req_char;
4662      }
4663    
4664    /* Loop for handling unanchored repeated matching attempts; for anchored regexs
4665    the loop runs just once. */
4666    
4667  do  do
4668    {    {
# Line 4092  do Line 4691  do
4691    
4692    else if (startline)    else if (startline)
4693      {      {
4694      if (start_match > match_block.start_subject)      if (start_match > match_block.start_subject + start_offset)
4695        {        {
4696        while (start_match < end_subject && start_match[-1] != '\n')        while (start_match < end_subject && start_match[-1] != '\n')
4697          start_match++;          start_match++;
4698        }        }
4699      }      }
4700    
4701    /* Or to a non-unique first char */    /* Or to a non-unique first char after study */
4702    
4703    else if (start_bits != NULL)    else if (start_bits != NULL)
4704      {      {
# Line 4116  do Line 4715  do
4715    printf("\n");    printf("\n");
4716  #endif  #endif
4717    
4718      /* If req_char is set, we know that that character must appear in the subject
4719      for the match to succeed. If the first character is set, req_char must be
4720      later in the subject; otherwise the test starts at the match point. This
4721      optimization can save a huge amount of backtracking in patterns with nested
4722      unlimited repeats that aren't going to match. We don't know what the state of
4723      case matching may be when this character is hit, so test for it in both its
4724      cases if necessary. However, the different cased versions will not be set up
4725      unless PCRE_CASELESS was given or the casing state changes within the regex.
4726      Writing separate code makes it go faster, as does using an autoincrement and
4727      backing off on a match. */
4728    
4729      if (req_char >= 0)
4730        {
4731        register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
4732    
4733        /* We don't need to repeat the search if we haven't yet reached the
4734        place we found it at last time. */
4735    
4736        if (p > req_char_ptr)
4737          {
4738          /* Do a single test if no case difference is set up */
4739    
4740          if (req_char == req_char2)
4741            {
4742            while (p < end_subject)
4743              {
4744              if (*p++ == req_char) { p--; break; }
4745              }
4746            }
4747    
4748          /* Otherwise test for either case */
4749    
4750          else
4751            {
4752            while (p < end_subject)
4753              {
4754              register int pp = *p++;
4755              if (pp == req_char || pp == req_char2) { p--; break; }
4756              }
4757            }
4758    
4759          /* If we can't find the required character, break the matching loop */
4760    
4761          if (p >= end_subject) break;
4762    
4763          /* If we have found the required character, save the point where we
4764          found it, so that we don't search again next time round the loop if
4765          the start hasn't passed this character yet. */
4766    
4767          req_char_ptr = p;
4768          }
4769        }
4770    
4771    /* When a match occurs, substrings will be set for all internal extractions;    /* When a match occurs, substrings will be set for all internal extractions;
4772    we just need to set up the whole thing as substring 0 before returning. If    we just need to set up the whole thing as substring 0 before returning. If
4773    there were too many extractions, set the return code to zero. In the case    there were too many extractions, set the return code to zero. In the case
# Line 4123  do Line 4775  do
4775    those back references that we can. In this case there need not be overflow    those back references that we can. In this case there need not be overflow
4776    if certain parts of the pattern were not used. */    if certain parts of the pattern were not used. */
4777    
4778    if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))    match_block.start_match = start_match;
4779      if (!match(start_match, re->code, 2, &match_block, ims, NULL, match_isgroup))
4780      continue;      continue;
4781    
4782    /* Copy the offset information from temporary store if necessary */    /* Copy the offset information from temporary store if necessary */

Legend:
Removed from v.27  
changed lines
  Added in v.47

  ViewVC Help
Powered by ViewVC 1.1.5