/[pcre]/code/trunk/pcre.c
ViewVC logotype

Diff of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 31 by nigel, Sat Feb 24 21:38:57 2007 UTC revision 43 by nigel, Sat Feb 24 21:39:21 2007 UTC
# Line 9  the file Tech.Notes for some information Line 9  the file Tech.Notes for some information
9    
10  Written by: Philip Hazel <ph10@cam.ac.uk>  Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12             Copyright (c) 1997-1999 University of Cambridge             Copyright (c) 1997-2000 University of Cambridge
13    
14  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
15  Permission is granted to anyone to use this software for any purpose on any  Permission is granted to anyone to use this software for any purpose on any
# Line 82  static const char *OP_names[] = { Line 82  static const char *OP_names[] = {
82    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
83    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
84    "*", "*?", "+", "+?", "?", "??", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{",
85    "class", "Ref",    "class", "Ref", "Recurse",
86    "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",    "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
87    "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",    "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
88    "Brazero", "Braminzero", "Bra"    "Brazero", "Braminzero", "Bra"
# Line 107  static const short int escapes[] = { Line 107  static const short int escapes[] = {
107      0,      0, -ESC_z                                            /* x - z */      0,      0, -ESC_z                                            /* x - z */
108  };  };
109    
110    /* Tables of names of POSIX character classes and their lengths. The list is
111    terminated by a zero length entry. The first three must be alpha, lower, upper,
112    as this is assumed for handling case independence. */
113    
114    static const char *posix_names[] = {
115      "alpha", "lower", "upper",
116      "alnum", "ascii", "cntrl", "digit", "graph",
117      "print", "punct", "space", "word",  "xdigit" };
118    
119    static const uschar posix_name_lengths[] = {
120      5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
121    
122    /* Table of class bit maps for each POSIX class; up to three may be combined
123    to form the class. */
124    
125    static const int posix_class_maps[] = {
126      cbit_lower, cbit_upper, -1,             /* alpha */
127      cbit_lower, -1,         -1,             /* lower */
128      cbit_upper, -1,         -1,             /* upper */
129      cbit_digit, cbit_lower, cbit_upper,     /* alnum */
130      cbit_print, cbit_cntrl, -1,             /* ascii */
131      cbit_cntrl, -1,         -1,             /* cntrl */
132      cbit_digit, -1,         -1,             /* digit */
133      cbit_graph, -1,         -1,             /* graph */
134      cbit_print, -1,         -1,             /* print */
135      cbit_punct, -1,         -1,             /* punct */
136      cbit_space, -1,         -1,             /* space */
137      cbit_word,  -1,         -1,             /* word */
138      cbit_xdigit,-1,         -1              /* xdigit */
139    };
140    
141    
142  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
143    
144  static BOOL  static BOOL
145    compile_regex(int, int, int *, uschar **, const uschar **, const char **,    compile_regex(int, int, int *, uschar **, const uschar **, const char **,
146      BOOL, int, compile_data *);      BOOL, int, int *, int *, compile_data *);
147    
148    
149    
# Line 148  tables. */ Line 180  tables. */
180  *          Return version string                 *  *          Return version string                 *
181  *************************************************/  *************************************************/
182    
183    #define STRING(a)  # a
184    #define XSTRING(s) STRING(s)
185    
186  const char *  const char *
187  pcre_version(void)  pcre_version(void)
188  {  {
189  return PCRE_VERSION;  return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
190  }  }
191    
192    
193    
194    
195  /*************************************************  /*************************************************
196  *       Return info about a compiled pattern     *  * (Obsolete) Return info about compiled pattern  *
197  *************************************************/  *************************************************/
198    
199  /* This function picks potentially useful data out of the private  /* This is the original "info" function. It picks potentially useful data out
200  structure.  of the private structure, but its interface was too rigid. It remains for
201    backwards compatibility. The public options are passed back in an int - though
202    the re->options field has been expanded to a long int, all the public options
203    are at the low end of it, and so even on 16-bit systems this will still be OK.
204    Therefore, I haven't changed the API for pcre_info().
205    
206  Arguments:  Arguments:
207    external_re   points to compiled code    external_re   points to compiled code
# Line 171  Arguments: Line 210  Arguments:
210                  or -1 if multiline and all branches start ^,                  or -1 if multiline and all branches start ^,
211                  or -2 otherwise                  or -2 otherwise
212    
213  Returns:        number of identifying extraction brackets  Returns:        number of capturing subpatterns
214                  or negative values on error                  or negative values on error
215  */  */
216    
# Line 181  pcre_info(const pcre *external_re, int * Line 220  pcre_info(const pcre *external_re, int *
220  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
221  if (re == NULL) return PCRE_ERROR_NULL;  if (re == NULL) return PCRE_ERROR_NULL;
222  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
223  if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);  if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
224  if (first_char != NULL)  if (first_char != NULL)
225    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
226       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
# Line 190  return re->top_bracket; Line 229  return re->top_bracket;
229    
230    
231    
232    /*************************************************
233    *        Return info about compiled pattern      *
234    *************************************************/
235    
236    /* This is a newer "info" function which has an extensible interface so
237    that additional items can be added compatibly.
238    
239    Arguments:
240      external_re      points to compiled code
241      external_study   points to study data, or NULL
242      what             what information is required
243      where            where to put the information
244    
245    Returns:           0 if data returned, negative on error
246    */
247    
248    int
249    pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
250      void *where)
251    {
252    const real_pcre *re = (const real_pcre *)external_re;
253    const real_pcre_extra *study = (const real_pcre_extra *)study_data;
254    
255    if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
256    if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
257    
258    switch (what)
259      {
260      case PCRE_INFO_OPTIONS:
261      *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
262      break;
263    
264      case PCRE_INFO_SIZE:
265      *((size_t *)where) = re->size;
266      break;
267    
268      case PCRE_INFO_CAPTURECOUNT:
269      *((int *)where) = re->top_bracket;
270      break;
271    
272      case PCRE_INFO_BACKREFMAX:
273      *((int *)where) = re->top_backref;
274      break;
275    
276      case PCRE_INFO_FIRSTCHAR:
277      *((int *)where) =
278        ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
279        ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
280      break;
281    
282      case PCRE_INFO_FIRSTTABLE:
283      *((const uschar **)where) =
284        (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
285          study->start_bits : NULL;
286      break;
287    
288      case PCRE_INFO_LASTLITERAL:
289      *((int *)where) =
290        ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
291      break;
292    
293      default: return PCRE_ERROR_BADOPTION;
294      }
295    
296    return 0;
297    }
298    
299    
300    
301  #ifdef DEBUG  #ifdef DEBUG
302  /*************************************************  /*************************************************
# Line 249  check_escape(const uschar **ptrptr, cons Line 356  check_escape(const uschar **ptrptr, cons
356    int options, BOOL isclass, compile_data *cd)    int options, BOOL isclass, compile_data *cd)
357  {  {
358  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
359  int c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */  int c, i;
 int i;  
360    
361    c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */
362  if (c == 0) *errorptr = ERR1;  if (c == 0) *errorptr = ERR1;
363    
364  /* Digits or letters may have special meaning; all others are literals. */  /* Digits or letters may have special meaning; all others are literals. */
# Line 532  for (;;) Line 639  for (;;)
639    
640      case OP_REVERSE:      case OP_REVERSE:
641      cc++;      cc++;
642        /* Fall through */
643    
644      case OP_CREF:      case OP_CREF:
645      case OP_OPT:      case OP_OPT:
# Line 615  for (;;) Line 723  for (;;)
723    
724    
725  /*************************************************  /*************************************************
726    *           Check for POSIX class syntax         *
727    *************************************************/
728    
729    /* This function is called when the sequence "[:" or "[." or "[=" is
730    encountered in a character class. It checks whether this is followed by an
731    optional ^ and then a sequence of letters, terminated by a matching ":]" or
732    ".]" or "=]".
733    
734    Arguments:
735      ptr      pointer to the initial [
736      endptr   where to return the end pointer
737      cd       pointer to compile data
738    
739    Returns:   TRUE or FALSE
740    */
741    
742    static BOOL
743    check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
744    {
745    int terminator;          /* Don't combine these lines; the Solaris cc */
746    terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
747    if (*(++ptr) == '^') ptr++;
748    while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
749    if (*ptr == terminator && ptr[1] == ']')
750      {
751      *endptr = ptr;
752      return TRUE;
753      }
754    return FALSE;
755    }
756    
757    
758    
759    
760    /*************************************************
761    *          Check POSIX class name                *
762    *************************************************/
763    
764    /* This function is called to check the name given in a POSIX-style class entry
765    such as [:alnum:].
766    
767    Arguments:
768      ptr        points to the first letter
769      len        the length of the name
770    
771    Returns:     a value representing the name, or -1 if unknown
772    */
773    
774    static int
775    check_posix_name(const uschar *ptr, int len)
776    {
777    register int yield = 0;
778    while (posix_name_lengths[yield] != 0)
779      {
780      if (len == posix_name_lengths[yield] &&
781        strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
782      yield++;
783      }
784    return -1;
785    }
786    
787    
788    
789    
790    /*************************************************
791  *           Compile one branch                   *  *           Compile one branch                   *
792  *************************************************/  *************************************************/
793    
# Line 627  Arguments: Line 800  Arguments:
800    ptrptr       points to the current pattern pointer    ptrptr       points to the current pattern pointer
801    errorptr     points to pointer to error message    errorptr     points to pointer to error message
802    optchanged   set to the value of the last OP_OPT item compiled    optchanged   set to the value of the last OP_OPT item compiled
803      reqchar      set to the last literal character required, else -1
804      countlits    set to count of mandatory literal characters
805    cd           contains pointers to tables    cd           contains pointers to tables
806    
807  Returns:       TRUE on success  Returns:       TRUE on success
# Line 636  Returns:       TRUE on success Line 811  Returns:       TRUE on success
811  static BOOL  static BOOL
812  compile_branch(int options, int *brackets, uschar **codeptr,  compile_branch(int options, int *brackets, uschar **codeptr,
813    const uschar **ptrptr, const char **errorptr, int *optchanged,    const uschar **ptrptr, const char **errorptr, int *optchanged,
814    compile_data *cd)    int *reqchar, int *countlits, compile_data *cd)
815  {  {
816  int repeat_type, op_type;  int repeat_type, op_type;
817  int repeat_min, repeat_max;  int repeat_min, repeat_max;
818  int bravalue, length;  int bravalue, length;
819  int greedy_default, greedy_non_default;  int greedy_default, greedy_non_default;
820    int prevreqchar;
821    int condcount = 0;
822    int subcountlits = 0;
823  register int c;  register int c;
824  register uschar *code = *codeptr;  register uschar *code = *codeptr;
825  uschar *tempcode;  uschar *tempcode;
# Line 655  uschar class[32]; Line 833  uschar class[32];
833  greedy_default = ((options & PCRE_UNGREEDY) != 0);  greedy_default = ((options & PCRE_UNGREEDY) != 0);
834  greedy_non_default = greedy_default ^ 1;  greedy_non_default = greedy_default ^ 1;
835    
836    /* Initialize no required char, and count of literals */
837    
838    *reqchar = prevreqchar = -1;
839    *countlits = 0;
840    
841  /* Switch on next character until the end of the branch */  /* Switch on next character until the end of the branch */
842    
843  for (;; ptr++)  for (;; ptr++)
# Line 664  for (;; ptr++) Line 847  for (;; ptr++)
847    int class_lastchar;    int class_lastchar;
848    int newoptions;    int newoptions;
849    int condref;    int condref;
850      int subreqchar;
851    
852    c = *ptr;    c = *ptr;
853    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
# Line 746  for (;; ptr++) Line 930  for (;; ptr++)
930          goto FAILED;          goto FAILED;
931          }          }
932    
933          /* Handle POSIX class names. Perl allows a negation extension of the
934          form [:^name]. A square bracket that doesn't match the syntax is
935          treated as a literal. We also recognize the POSIX constructions
936          [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
937          5.6 does. */
938    
939          if (c == '[' &&
940              (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
941              check_posix_syntax(ptr, &tempptr, cd))
942            {
943            BOOL local_negate = FALSE;
944            int posix_class, i;
945            register const uschar *cbits = cd->cbits;
946    
947            if (ptr[1] != ':')
948              {
949              *errorptr = ERR31;
950              goto FAILED;
951              }
952    
953            ptr += 2;
954            if (*ptr == '^')
955              {
956              local_negate = TRUE;
957              ptr++;
958              }
959    
960            posix_class = check_posix_name(ptr, tempptr - ptr);
961            if (posix_class < 0)
962              {
963              *errorptr = ERR30;
964              goto FAILED;
965              }
966    
967            /* If matching is caseless, upper and lower are converted to
968            alpha. This relies on the fact that the class table starts with
969            alpha, lower, upper as the first 3 entries. */
970    
971            if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
972              posix_class = 0;
973    
974            /* Or into the map we are building up to 3 of the static class
975            tables, or their negations. */
976    
977            posix_class *= 3;
978            for (i = 0; i < 3; i++)
979              {
980              int taboffset = posix_class_maps[posix_class + i];
981              if (taboffset < 0) break;
982              if (local_negate)
983                for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
984              else
985                for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
986              }
987    
988            ptr = tempptr + 1;
989            class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
990            continue;
991            }
992    
993        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
994        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. Escaped items are checked for
995        validity in the pre-compiling pass. The sequence \b is a special case.        validity in the pre-compiling pass. The sequence \b is a special case.
# Line 773  for (;; ptr++) Line 1017  for (;; ptr++)
1017              continue;              continue;
1018    
1019              case ESC_w:              case ESC_w:
1020              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
               class[c] |= (cbits[c+cbit_digit] | cbits[c+cbit_word]);  
1021              continue;              continue;
1022    
1023              case ESC_W:              case ESC_W:
1024              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
               class[c] |= ~(cbits[c+cbit_digit] | cbits[c+cbit_word]);  
1025              continue;              continue;
1026    
1027              case ESC_s:              case ESC_s:
# Line 937  for (;; ptr++) Line 1179  for (;; ptr++)
1179        { repeat_type = greedy_non_default; ptr++; }        { repeat_type = greedy_non_default; ptr++; }
1180      else repeat_type = greedy_default;      else repeat_type = greedy_default;
1181    
     /* If the maximum is zero then the minimum must also be zero; Perl allows  
     this case, so we do too - by simply omitting the item altogether. */  
   
     if (repeat_max == 0) code = previous;  
   
1182      /* If previous was a string of characters, chop off the last one and use it      /* If previous was a string of characters, chop off the last one and use it
1183      as the subject of the repeat. If there was only one character, we can      as the subject of the repeat. If there was only one character, we can
1184      abolish the previous item altogether. */      abolish the previous item altogether. A repeat with a zero minimum wipes
1185        out any reqchar setting, backing up to the previous value. We must also
1186        adjust the countlits value. */
1187    
1188      else if (*previous == OP_CHARS)      if (*previous == OP_CHARS)
1189        {        {
1190        int len = previous[1];        int len = previous[1];
1191    
1192          if (repeat_min == 0) *reqchar = prevreqchar;
1193          *countlits += repeat_min - 1;
1194    
1195        if (len == 1)        if (len == 1)
1196          {          {
1197          c = previous[2];          c = previous[2];
# Line 987  for (;; ptr++) Line 1230  for (;; ptr++)
1230        code = previous;        code = previous;
1231    
1232        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
1233        repeat_type += op_type;      /* Combine both values for many cases */  
1234          /* If the maximum is zero then the minimum must also be zero; Perl allows
1235          this case, so we do too - by simply omitting the item altogether. */
1236    
1237          if (repeat_max == 0) goto END_REPEAT;
1238    
1239          /* Combine the op_type with the repeat_type */
1240    
1241          repeat_type += op_type;
1242    
1243        /* A minimum of zero is handled either as the special case * or ?, or as        /* A minimum of zero is handled either as the special case * or ?, or as
1244        an UPTO, with the maximum given. */        an UPTO, with the maximum given. */
# Line 1064  for (;; ptr++) Line 1315  for (;; ptr++)
1315        }        }
1316    
1317      /* If previous was a character class or a back reference, we put the repeat      /* If previous was a character class or a back reference, we put the repeat
1318      stuff after it. */      stuff after it, but just skip the item if the repeat was {0,0}. */
1319    
1320      else if (*previous == OP_CLASS || *previous == OP_REF)      else if (*previous == OP_CLASS || *previous == OP_REF)
1321        {        {
1322          if (repeat_max == 0)
1323            {
1324            code = previous;
1325            goto END_REPEAT;
1326            }
1327        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
1328          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
1329        else if (repeat_min == 1 && repeat_max == -1)        else if (repeat_min == 1 && repeat_max == -1)
# Line 1118  for (;; ptr++) Line 1374  for (;; ptr++)
1374    
1375        if (repeat_min == 0)        if (repeat_min == 0)
1376          {          {
1377            /* If we set up a required char from the bracket, we must back off
1378            to the previous value and reset the countlits value too. */
1379    
1380            if (subcountlits > 0)
1381              {
1382              *reqchar = prevreqchar;
1383              *countlits -= subcountlits;
1384              }
1385    
1386          /* If the maximum is also zero, we just omit the group from the output          /* If the maximum is also zero, we just omit the group from the output
1387          altogether. */          altogether. */
1388    
1389          if (repeat_max == 0)          if (repeat_max == 0)
1390            {            {
1391            code = previous;            code = previous;
1392            previous = NULL;            goto END_REPEAT;
           break;  
1393            }            }
1394    
1395          /* If the maximum is 1 or unlimited, we just have to stick in the          /* If the maximum is 1 or unlimited, we just have to stick in the
# Line 1230  for (;; ptr++) Line 1494  for (;; ptr++)
1494        correct offset was computed above. */        correct offset was computed above. */
1495    
1496        else code[-ketoffset] = OP_KETRMAX + repeat_type;        else code[-ketoffset] = OP_KETRMAX + repeat_type;
   
   
 #ifdef NEVER  
       /* If the minimum is greater than zero, and the maximum is unlimited or  
       equal to the minimum, the first copy remains where it is, and is  
       replicated up to the minimum number of times. This case includes the +  
       repeat, but of course no replication is needed in that case. */  
   
       if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))  
         {  
         for (i = 1; i < repeat_min; i++)  
           {  
           memcpy(code, previous, len);  
           code += len;  
           }  
         }  
   
       /* If the minimum is zero, stick BRAZERO in front of the first copy.  
       Then, if there is a fixed upper limit, replicated up to that many times,  
       sticking BRAZERO in front of all the optional ones. */  
   
       else  
         {  
         if (repeat_min == 0)  
           {  
           memmove(previous+1, previous, len);  
           code++;  
           *previous++ = OP_BRAZERO + repeat_type;  
           }  
   
         for (i = 1; i < repeat_min; i++)  
           {  
           memcpy(code, previous, len);  
           code += len;  
           }  
   
         for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)  
           {  
           *code++ = OP_BRAZERO + repeat_type;  
           memcpy(code, previous, len);  
           code += len;  
           }  
         }  
   
       /* If the maximum is unlimited, set a repeater in the final copy. We  
       can't just offset backwards from the current code point, because we  
       don't know if there's been an options resetting after the ket. The  
       correct offset was computed above. */  
   
       if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type;  
 #endif  
   
   
1497        }        }
1498    
1499      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 1295  for (;; ptr++) Line 1506  for (;; ptr++)
1506    
1507      /* In all cases we no longer have a previous item. */      /* In all cases we no longer have a previous item. */
1508    
1509        END_REPEAT:
1510      previous = NULL;      previous = NULL;
1511      break;      break;
1512    
# Line 1372  for (;; ptr++) Line 1584  for (;; ptr++)
1584          ptr++;          ptr++;
1585          break;          break;
1586    
1587            case 'R':                 /* Pattern recursion */
1588            *code++ = OP_RECURSE;
1589            ptr++;
1590            continue;
1591    
1592          default:                  /* Option setting */          default:                  /* Option setting */
1593          set = unset = 0;          set = unset = 0;
1594          optset = &set;          optset = &set;
# Line 1463  for (;; ptr++) Line 1680  for (;; ptr++)
1680           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
1681            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1682           condref,                      /* Condition reference number */           condref,                      /* Condition reference number */
1683             &subreqchar,                  /* For possible last char */
1684             &subcountlits,                /* For literal count */
1685           cd))                          /* Tables block */           cd))                          /* Tables block */
1686        goto FAILED;        goto FAILED;
1687    
# Line 1476  for (;; ptr++) Line 1695  for (;; ptr++)
1695    
1696      if (bravalue == OP_COND)      if (bravalue == OP_COND)
1697        {        {
       int branchcount = 0;  
1698        uschar *tc = code;        uschar *tc = code;
1699          condcount = 0;
1700    
1701        do {        do {
1702           branchcount++;           condcount++;
1703           tc += (tc[1] << 8) | tc[2];           tc += (tc[1] << 8) | tc[2];
1704           }           }
1705        while (*tc != OP_KET);        while (*tc != OP_KET);
1706    
1707        if (branchcount > 2)        if (condcount > 2)
1708          {          {
1709          *errorptr = ERR27;          *errorptr = ERR27;
1710          goto FAILED;          goto FAILED;
1711          }          }
1712        }        }
1713    
1714        /* Handle updating of the required character. If the subpattern didn't
1715        set one, leave it as it was. Otherwise, update it for normal brackets of
1716        all kinds, forward assertions, and conditions with two branches. Don't
1717        update the literal count for forward assertions, however. If the bracket
1718        is followed by a quantifier with zero repeat, we have to back off. Hence
1719        the definition of prevreqchar and subcountlits outside the main loop so
1720        that they can be accessed for the back off. */
1721    
1722        if (subreqchar > 0 &&
1723             (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1724             (bravalue == OP_COND && condcount == 2)))
1725          {
1726          prevreqchar = *reqchar;
1727          *reqchar = subreqchar;
1728          if (bravalue != OP_ASSERT) *countlits += subcountlits;
1729          }
1730    
1731      /* Now update the main code pointer to the end of the group. */      /* Now update the main code pointer to the end of the group. */
1732    
1733      code = tempcode;      code = tempcode;
# Line 1586  for (;; ptr++) Line 1822  for (;; ptr++)
1822    
1823      while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);      while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
1824    
1825        /* Update the last character and the count of literals */
1826    
1827        prevreqchar = (length > 1)? code[-2] : *reqchar;
1828        *reqchar = code[-1];
1829        *countlits += length;
1830    
1831      /* Compute the length and set it in the data vector, and advance to      /* Compute the length and set it in the data vector, and advance to
1832      the next state. */      the next state. */
1833    
# Line 1629  Argument: Line 1871  Argument:
1871    errorptr    -> pointer to error message    errorptr    -> pointer to error message
1872    lookbehind  TRUE if this is a lookbehind assertion    lookbehind  TRUE if this is a lookbehind assertion
1873    condref     > 0 for OPT_CREF setting at start of conditional group    condref     > 0 for OPT_CREF setting at start of conditional group
1874      reqchar     -> place to put the last required character, or a negative number
1875      countlits   -> place to put the shortest literal count of any branch
1876    cd          points to the data block with tables pointers    cd          points to the data block with tables pointers
1877    
1878  Returns:      TRUE on success  Returns:      TRUE on success
# Line 1637  Returns:      TRUE on success Line 1881  Returns:      TRUE on success
1881  static BOOL  static BOOL
1882  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
1883    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
1884    compile_data *cd)    int *reqchar, int *countlits, compile_data *cd)
1885  {  {
1886  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
1887  uschar *code = *codeptr;  uschar *code = *codeptr;
# Line 1645  uschar *last_branch = code; Line 1889  uschar *last_branch = code;
1889  uschar *start_bracket = code;  uschar *start_bracket = code;
1890  uschar *reverse_count = NULL;  uschar *reverse_count = NULL;
1891  int oldoptions = options & PCRE_IMS;  int oldoptions = options & PCRE_IMS;
1892    int branchreqchar, branchcountlits;
1893    
1894    *reqchar = -1;
1895    *countlits = INT_MAX;
1896  code += 3;  code += 3;
1897    
1898  /* At the start of a reference-based conditional group, insert the reference  /* At the start of a reference-based conditional group, insert the reference
# Line 1684  for (;;) Line 1931  for (;;)
1931    
1932    /* Now compile the branch */    /* Now compile the branch */
1933    
1934    if (!compile_branch(options,brackets,&code,&ptr,errorptr,&optchanged,cd))    if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
1935          &branchreqchar, &branchcountlits, cd))
1936      {      {
1937      *ptrptr = ptr;      *ptrptr = ptr;
1938      return FALSE;      return FALSE;
# Line 1696  for (;;) Line 1944  for (;;)
1944    last_branch[1] = length >> 8;    last_branch[1] = length >> 8;
1945    last_branch[2] = length & 255;    last_branch[2] = length & 255;
1946    
1947      /* Save the last required character if all branches have the same; a current
1948      value of -1 means unset, while -2 means "previous branch had no last required
1949      char".  */
1950    
1951      if (*reqchar != -2)
1952        {
1953        if (branchreqchar >= 0)
1954          {
1955          if (*reqchar == -1) *reqchar = branchreqchar;
1956          else if (*reqchar != branchreqchar) *reqchar = -2;
1957          }
1958        else *reqchar = -2;
1959        }
1960    
1961      /* Keep the shortest literal count */
1962    
1963      if (branchcountlits < *countlits) *countlits = branchcountlits;
1964      DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
1965    
1966    /* If lookbehind, check that this branch matches a fixed-length string,    /* If lookbehind, check that this branch matches a fixed-length string,
1967    and put the length into the OP_REVERSE item. Temporarily mark the end of    and put the length into the OP_REVERSE item. Temporarily mark the end of
1968    the branch with OP_END. */    the branch with OP_END. */
# Line 1790  for (;;) Line 2057  for (;;)
2057      code += 2;      code += 2;
2058      break;      break;
2059    
2060        case OP_WORD_BOUNDARY:
2061        case OP_NOT_WORD_BOUNDARY:
2062        code++;
2063        break;
2064    
2065      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
2066      case OP_ASSERTBACK:      case OP_ASSERTBACK:
2067      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
# Line 1817  all of whose alternatives start with OP_ Line 2089  all of whose alternatives start with OP_
2089  it's anchored. However, if this is a multiline pattern, then only OP_SOD  it's anchored. However, if this is a multiline pattern, then only OP_SOD
2090  counts, since OP_CIRC can match in the middle.  counts, since OP_CIRC can match in the middle.
2091    
2092  A branch is also implicitly anchored if it starts with .* because that will try  A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2093  the rest of the pattern at all possible matching points, so there is no point  because that will try the rest of the pattern at all possible matching points,
2094  trying them again.  so there is no point trying them again.
2095    
2096  Arguments:  Arguments:
2097    code       points to start of expression (the bracket)    code       points to start of expression (the bracket)
# Line 1837  do { Line 2109  do {
2109     register int op = *scode;     register int op = *scode;
2110     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2111       { if (!is_anchored(scode, options)) return FALSE; }       { if (!is_anchored(scode, options)) return FALSE; }
2112     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
2113                (*options & PCRE_DOTALL) != 0)
2114       { if (scode[1] != OP_ANY) return FALSE; }       { if (scode[1] != OP_ANY) return FALSE; }
2115     else if (op != OP_SOD &&     else if (op != OP_SOD &&
2116             ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))             ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
# Line 1851  return TRUE; Line 2124  return TRUE;
2124    
2125    
2126  /*************************************************  /*************************************************
2127  *     Check for start with \n line expression    *  *         Check for starting with ^ or .*        *
2128  *************************************************/  *************************************************/
2129    
2130  /* This is called for multiline expressions to try to find out if every branch  /* This is called to find out if every branch starts with ^ or .* so that
2131  starts with ^ so that "first char" processing can be done to speed things up.  "first char" processing can be done to speed things up in multiline
2132    matching and for non-DOTALL patterns that start with .* (which must start at
2133    the beginning or after \n).
2134    
2135  Argument:  points to start of expression (the bracket)  Argument:  points to start of expression (the bracket)
2136  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
# Line 1869  do { Line 2144  do {
2144     register int op = *scode;     register int op = *scode;
2145     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2146       { if (!is_startline(scode)) return FALSE; }       { if (!is_startline(scode)) return FALSE; }
2147       else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2148         { if (scode[1] != OP_ANY) return FALSE; }
2149     else if (op != OP_CIRC) return FALSE;     else if (op != OP_CIRC) return FALSE;
2150     code += (code[1] << 8) + code[2];     code += (code[1] << 8) + code[2];
2151     }     }
# Line 1967  pcre_compile(const char *pattern, int op Line 2244  pcre_compile(const char *pattern, int op
2244  real_pcre *re;  real_pcre *re;
2245  int length = 3;      /* For initial BRA plus length */  int length = 3;      /* For initial BRA plus length */
2246  int runlength;  int runlength;
2247  int c, size;  int c, reqchar, countlits;
2248  int bracount = 0;  int bracount = 0;
2249  int top_backref = 0;  int top_backref = 0;
2250  int branch_extra = 0;  int branch_extra = 0;
2251  int branch_newextra;  int branch_newextra;
2252  unsigned int brastackptr = 0;  unsigned int brastackptr = 0;
2253    size_t size;
2254  uschar *code;  uschar *code;
2255  const uschar *ptr;  const uschar *ptr;
2256  compile_data compile_block;  compile_data compile_block;
# Line 2200  while ((c = *(++ptr)) != 0) Line 2478  while ((c = *(++ptr)) != 0)
2478          ptr += 2;          ptr += 2;
2479          break;          break;
2480    
2481            /* A recursive call to the regex is an extension, to provide the
2482            facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2483    
2484            case 'R':
2485            if (ptr[3] != ')')
2486              {
2487              *errorptr = ERR29;
2488              goto PCRE_ERROR_RETURN;
2489              }
2490            ptr += 3;
2491            length += 1;
2492            break;
2493    
2494          /* Lookbehinds are in Perl from version 5.005 */          /* Lookbehinds are in Perl from version 5.005 */
2495    
2496          case '<':          case '<':
# Line 2307  while ((c = *(++ptr)) != 0) Line 2598  while ((c = *(++ptr)) != 0)
2598              will lead to an over-estimate on the length, but this shouldn't              will lead to an over-estimate on the length, but this shouldn't
2599              matter very much. We also have to allow for resetting options at              matter very much. We also have to allow for resetting options at
2600              the start of any alternations, which we do by setting              the start of any alternations, which we do by setting
2601              branch_newextra to 2. */              branch_newextra to 2. Finally, we record whether the case-dependent
2602                flag ever changes within the regex. This is used by the "required
2603                character" code. */
2604    
2605              case ':':              case ':':
2606              if (((set|unset) & PCRE_IMS) != 0)              if (((set|unset) & PCRE_IMS) != 0)
2607                {                {
2608                length += 4;                length += 4;
2609                branch_newextra = 2;                branch_newextra = 2;
2610                  if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2611                }                }
2612              goto END_OPTIONS;              goto END_OPTIONS;
2613    
# Line 2499  if (re == NULL) Line 2793  if (re == NULL)
2793    return NULL;    return NULL;
2794    }    }
2795    
2796  /* Put in the magic number and the options. */  /* Put in the magic number, and save the size, options, and table pointer */
2797    
2798  re->magic_number = MAGIC_NUMBER;  re->magic_number = MAGIC_NUMBER;
2799    re->size = size;
2800  re->options = options;  re->options = options;
2801  re->tables = tables;  re->tables = tables;
2802    
# Line 2514  code = re->code; Line 2809  code = re->code;
2809  *code = OP_BRA;  *code = OP_BRA;
2810  bracount = 0;  bracount = 0;
2811  (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,  (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
2812    &compile_block);    &reqchar, &countlits, &compile_block);
2813  re->top_bracket = bracount;  re->top_bracket = bracount;
2814  re->top_backref = top_backref;  re->top_backref = top_backref;
2815    
# Line 2546  if (*errorptr != NULL) Line 2841  if (*errorptr != NULL)
2841    return NULL;    return NULL;
2842    }    }
2843    
2844  /* If the anchored option was not passed, set flag if we can determine that it  /* If the anchored option was not passed, set flag if we can determine that the
2845  is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if  pattern is anchored by virtue of ^ characters or \A or anything else (such as
2846  we can determine what the first character has to be, because that speeds up  starting with .* when DOTALL is set).
2847  unanchored matches no end. In the case of multiline matches, an alternative is  
2848  to set the PCRE_STARTLINE flag if all branches start with ^. */  Otherwise, see if we can determine what the first character has to be, because
2849    that speeds up unanchored matches no end. If not, see if we can set the
2850    PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
2851    start with ^, and also when all branches start with .* for non-DOTALL matches.
2852    */
2853    
2854  if ((options & PCRE_ANCHORED) == 0)  if ((options & PCRE_ANCHORED) == 0)
2855    {    {
# Line 2570  if ((options & PCRE_ANCHORED) == 0) Line 2869  if ((options & PCRE_ANCHORED) == 0)
2869      }      }
2870    }    }
2871    
2872    /* Save the last required character if there are at least two literal
2873    characters on all paths, or if there is no first character setting. */
2874    
2875    if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
2876      {
2877      re->req_char = reqchar;
2878      re->options |= PCRE_REQCHSET;
2879      }
2880    
2881  /* Print out the compiled data for debugging */  /* Print out the compiled data for debugging */
2882    
2883  #ifdef DEBUG  #ifdef DEBUG
# Line 2579  printf("Length = %d top_bracket = %d top Line 2887  printf("Length = %d top_bracket = %d top
2887    
2888  if (re->options != 0)  if (re->options != 0)
2889    {    {
2890    printf("%s%s%s%s%s%s%s%s\n",    printf("%s%s%s%s%s%s%s%s%s\n",
2891      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2892      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2893        ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
2894      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2895      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2896      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
# Line 2596  if ((re->options & PCRE_FIRSTSET) != 0) Line 2905  if ((re->options & PCRE_FIRSTSET) != 0)
2905      else printf("First char = \\x%02x\n", re->first_char);      else printf("First char = \\x%02x\n", re->first_char);
2906    }    }
2907    
2908    if ((re->options & PCRE_REQCHSET) != 0)
2909      {
2910      if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
2911        else printf("Req char = \\x%02x\n", re->req_char);
2912      }
2913    
2914  code_end = code;  code_end = code;
2915  code_base = code = re->code;  code_base = code = re->code;
2916    
# Line 2829  Returns:      TRUE if matched Line 3144  Returns:      TRUE if matched
3144    
3145  static BOOL  static BOOL
3146  match_ref(int offset, register const uschar *eptr, int length, match_data *md,  match_ref(int offset, register const uschar *eptr, int length, match_data *md,
3147    int ims)    unsigned long int ims)
3148  {  {
3149  const uschar *p = md->start_subject + md->offset_vector[offset];  const uschar *p = md->start_subject + md->offset_vector[offset];
3150    
# Line 2888  Returns:       TRUE if matched Line 3203  Returns:       TRUE if matched
3203    
3204  static BOOL  static BOOL
3205  match(register const uschar *eptr, register const uschar *ecode,  match(register const uschar *eptr, register const uschar *ecode,
3206    int offset_top, match_data *md, int ims, BOOL condassert, const uschar *eptrb)    int offset_top, match_data *md, unsigned long int ims, BOOL condassert,
3207      const uschar *eptrb)
3208  {  {
3209  int original_ims = ims;   /* Save for resetting on ')' */  unsigned long int original_ims = ims;   /* Save for resetting on ')' */
3210    
3211  for (;;)  for (;;)
3212    {    {
# Line 3005  for (;;) Line 3321  for (;;)
3321      ecode += 2;      ecode += 2;
3322      break;      break;
3323    
3324      /* End of the pattern */      /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3325        an empty string - recursion will then try other alternatives, if any. */
3326    
3327      case OP_END:      case OP_END:
3328        if (md->notempty && eptr == md->start_match) return FALSE;
3329      md->end_match_ptr = eptr;          /* Record where we ended */      md->end_match_ptr = eptr;          /* Record where we ended */
3330      md->end_offset_top = offset_top;   /* and how many extracts were taken */      md->end_offset_top = offset_top;   /* and how many extracts were taken */
3331      return TRUE;      return TRUE;
# Line 3017  for (;;) Line 3335  for (;;)
3335      case OP_OPT:      case OP_OPT:
3336      ims = ecode[1];      ims = ecode[1];
3337      ecode += 2;      ecode += 2;
3338      DPRINTF(("ims set to %02x\n", ims));      DPRINTF(("ims set to %02lx\n", ims));
3339      break;      break;
3340    
3341      /* Assertion brackets. Check the alternative branches in turn - the      /* Assertion brackets. Check the alternative branches in turn - the
# Line 3073  for (;;) Line 3391  for (;;)
3391      ecode += 3;      ecode += 3;
3392      break;      break;
3393    
3394        /* Recursion matches the current regex, nested. If there are any capturing
3395        brackets started but not finished, we have to save their starting points
3396        and reinstate them after the recursion. However, we don't know how many
3397        such there are (offset_top records the completed total) so we just have
3398        to save all the potential data. There may be up to 99 such values, which
3399        is a bit large to put on the stack, but using malloc for small numbers
3400        seems expensive. As a compromise, the stack is used when there are fewer
3401        than 16 values to store; otherwise malloc is used. A problem is what to do
3402        if the malloc fails ... there is no way of returning to the top level with
3403        an error. Save the top 15 values on the stack, and accept that the rest
3404        may be wrong. */
3405    
3406        case OP_RECURSE:
3407          {
3408          BOOL rc;
3409          int *save;
3410          int stacksave[15];
3411    
3412          c = md->offset_max;
3413    
3414          if (c < 16) save = stacksave; else
3415            {
3416            save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3417            if (save == NULL)
3418              {
3419              save = stacksave;
3420              c = 15;
3421              }
3422            }
3423    
3424          for (i = 1; i <= c; i++)
3425            save[i] = md->offset_vector[md->offset_end - i];
3426          rc = match(eptr, md->start_pattern, offset_top, md, ims, FALSE, eptrb);
3427          for (i = 1; i <= c; i++)
3428            md->offset_vector[md->offset_end - i] = save[i];
3429          if (save != stacksave) (pcre_free)(save);
3430          if (!rc) return FALSE;
3431    
3432          /* In case the recursion has set more capturing values, save the final
3433          number, then move along the subject till after the recursive match,
3434          and advance one byte in the pattern code. */
3435    
3436          offset_top = md->end_offset_top;
3437          eptr = md->end_match_ptr;
3438          ecode++;
3439          }
3440        break;
3441    
3442      /* "Once" brackets are like assertion brackets except that after a match,      /* "Once" brackets are like assertion brackets except that after a match,
3443      the point in the subject string is not moved back. Thus there can never be      the point in the subject string is not moved back. Thus there can never be
# Line 3124  for (;;) Line 3489  for (;;)
3489        if (ecode[3] == OP_OPT)        if (ecode[3] == OP_OPT)
3490          {          {
3491          ims = (ims & ~PCRE_IMS) | ecode[4];          ims = (ims & ~PCRE_IMS) | ecode[4];
3492          DPRINTF(("ims set to %02x at group repeat\n", ims));          DPRINTF(("ims set to %02lx at group repeat\n", ims));
3493          }          }
3494    
3495        if (*ecode == OP_KETRMIN)        if (*ecode == OP_KETRMIN)
# Line 3218  for (;;) Line 3583  for (;;)
3583        the group. */        the group. */
3584    
3585        ims = original_ims;        ims = original_ims;
3586        DPRINTF(("ims reset to %02x\n", ims));        DPRINTF(("ims reset to %02lx\n", ims));
3587    
3588        /* For a non-repeating ket, just continue at this level. This also        /* For a non-repeating ket, just continue at this level. This also
3589        happens for a repeating ket if no characters were matched in the group.        happens for a repeating ket if no characters were matched in the group.
# Line 4104  Arguments: Line 4469  Arguments:
4469    external_extra  points to "hints" from pcre_study() or is NULL    external_extra  points to "hints" from pcre_study() or is NULL
4470    subject         points to the subject string    subject         points to the subject string
4471    length          length of subject string (may contain binary zeros)    length          length of subject string (may contain binary zeros)
4472      start_offset    where to start in the subject string
4473    options         option bits    options         option bits
4474    offsets         points to a vector of ints to be filled in with offsets    offsets         points to a vector of ints to be filled in with offsets
4475    offsetcount     the number of elements in the vector    offsetcount     the number of elements in the vector
# Line 4116  Returns:          > 0 => success; value Line 4482  Returns:          > 0 => success; value
4482    
4483  int  int
4484  pcre_exec(const pcre *external_re, const pcre_extra *external_extra,  pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4485    const char *subject, int length, int options, int *offsets, int offsetcount)    const char *subject, int length, int start_offset, int options, int *offsets,
4486      int offsetcount)
4487  {  {
4488  int resetcount, ocount;  int resetcount, ocount;
4489  int first_char = -1;  int first_char = -1;
4490  int ims = 0;  int req_char = -1;
4491    int req_char2 = -1;
4492    unsigned long int ims = 0;
4493  match_data match_block;  match_data match_block;
4494  const uschar *start_bits = NULL;  const uschar *start_bits = NULL;
4495  const uschar *start_match = (const uschar *)subject;  const uschar *start_match = (const uschar *)subject + start_offset;
4496  const uschar *end_subject;  const uschar *end_subject;
4497    const uschar *req_char_ptr = start_match - 1;
4498  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
4499  const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;  const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4500  BOOL using_temporary_offsets = FALSE;  BOOL using_temporary_offsets = FALSE;
# Line 4137  if (re == NULL || subject == NULL || Line 4507  if (re == NULL || subject == NULL ||
4507     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4508  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4509    
4510    match_block.start_pattern = re->code;
4511  match_block.start_subject = (const uschar *)subject;  match_block.start_subject = (const uschar *)subject;
4512  match_block.end_subject = match_block.start_subject + length;  match_block.end_subject = match_block.start_subject + length;
4513  end_subject = match_block.end_subject;  end_subject = match_block.end_subject;
# Line 4145  match_block.endonly = (re->options & PCR Line 4516  match_block.endonly = (re->options & PCR
4516    
4517  match_block.notbol = (options & PCRE_NOTBOL) != 0;  match_block.notbol = (options & PCRE_NOTBOL) != 0;
4518  match_block.noteol = (options & PCRE_NOTEOL) != 0;  match_block.noteol = (options & PCRE_NOTEOL) != 0;
4519    match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
4520    
4521  match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */  match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */
4522    
# Line 4215  if (!anchored) Line 4587  if (!anchored)
4587          start_bits = extra->start_bits;          start_bits = extra->start_bits;
4588    }    }
4589    
4590  /* Loop for unanchored matches; for anchored regexps the loop runs just once. */  /* For anchored or unanchored matches, there may be a "last known required
4591    character" set. If PCRE_CASELESS is set, implying that the match starts
4592    caselessly, or if there are any changes of this flag within the regex, set up
4593    both cases of the character. Otherwise set the two values the same, which will
4594    avoid duplicate testing (which takes significant time). This covers the vast
4595    majority of cases. It will be suboptimal when the case flag changes in a regex
4596    and the required character in fact is caseful. */
4597    
4598    if ((re->options & PCRE_REQCHSET) != 0)
4599      {
4600      req_char = re->req_char;
4601      req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
4602        (re->tables + fcc_offset)[req_char] : req_char;
4603      }
4604    
4605    /* Loop for handling unanchored repeated matching attempts; for anchored regexes
4606    the loop runs just once. */
4607    
4608  do  do
4609    {    {
# Line 4244  do Line 4632  do
4632    
4633    else if (startline)    else if (startline)
4634      {      {
4635      if (start_match > match_block.start_subject)      if (start_match > match_block.start_subject + start_offset)
4636        {        {
4637        while (start_match < end_subject && start_match[-1] != '\n')        while (start_match < end_subject && start_match[-1] != '\n')
4638          start_match++;          start_match++;
4639        }        }
4640      }      }
4641    
4642    /* Or to a non-unique first char */    /* Or to a non-unique first char after study */
4643    
4644    else if (start_bits != NULL)    else if (start_bits != NULL)
4645      {      {
# Line 4268  do Line 4656  do
4656    printf("\n");    printf("\n");
4657  #endif  #endif
4658    
4659      /* If req_char is set, we know that that character must appear in the subject
4660      for the match to succeed. If the first character is set, req_char must be
4661      later in the subject; otherwise the test starts at the match point. This
4662      optimization can save a huge amount of backtracking in patterns with nested
4663      unlimited repeats that aren't going to match. We don't know what the state of
4664      case matching may be when this character is hit, so test for it in both its
4665      cases if necessary. However, the different cased versions will not be set up
4666      unless PCRE_CASELESS was given or the casing state changes within the regex.
4667      Writing separate code makes it go faster, as does using an autoincrement and
4668      backing off on a match. */
4669    
4670      if (req_char >= 0)
4671        {
4672        register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
4673    
4674        /* We don't need to repeat the search if we haven't yet reached the
4675        place we found it at last time. */
4676    
4677        if (p > req_char_ptr)
4678          {
4679          /* Do a single test if no case difference is set up */
4680    
4681          if (req_char == req_char2)
4682            {
4683            while (p < end_subject)
4684              {
4685              if (*p++ == req_char) { p--; break; }
4686              }
4687            }
4688    
4689          /* Otherwise test for either case */
4690    
4691          else
4692            {
4693            while (p < end_subject)
4694              {
4695              register int pp = *p++;
4696              if (pp == req_char || pp == req_char2) { p--; break; }
4697              }
4698            }
4699    
4700          /* If we can't find the required character, break the matching loop */
4701    
4702          if (p >= end_subject) break;
4703    
4704          /* If we have found the required character, save the point where we
4705          found it, so that we don't search again next time round the loop if
4706          the start hasn't passed this character yet. */
4707    
4708          req_char_ptr = p;
4709          }
4710        }
4711    
4712    /* When a match occurs, substrings will be set for all internal extractions;    /* When a match occurs, substrings will be set for all internal extractions;
4713    we just need to set up the whole thing as substring 0 before returning. If    we just need to set up the whole thing as substring 0 before returning. If
4714    there were too many extractions, set the return code to zero. In the case    there were too many extractions, set the return code to zero. In the case
# Line 4275  do Line 4716  do
4716    those back references that we can. In this case there need not be overflow    those back references that we can. In this case there need not be overflow
4717    if certain parts of the pattern were not used. */    if certain parts of the pattern were not used. */
4718    
4719      match_block.start_match = start_match;
4720    if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))    if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))
4721      continue;      continue;
4722    

Legend:
Removed from v.31  
changed lines
  Added in v.43

  ViewVC Help
Powered by ViewVC 1.1.5