/[pcre]/code/trunk/pcre.c
ViewVC logotype

Diff of /code/trunk/pcre.c

Parent Directory | Revision Log | View Patch

revision 74 by nigel, Sat Feb 24 21:40:30 2007 UTC revision 75 by nigel, Sat Feb 24 21:40:37 2007 UTC
# Line 9  the file Tech.Notes for some information Line 9  the file Tech.Notes for some information
9    
10  Written by: Philip Hazel <ph10@cam.ac.uk>  Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12             Copyright (c) 1997-2003 University of Cambridge             Copyright (c) 1997-2004 University of Cambridge
13    
14  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
15  Permission is granted to anyone to use this software for any purpose on any  Redistribution and use in source and binary forms, with or without
16  computer system, and to redistribute it freely, subject to the following  modification, are permitted provided that the following conditions are met:
17  restrictions:  
18        * Redistributions of source code must retain the above copyright notice,
19  1. This software is distributed in the hope that it will be useful,        this list of conditions and the following disclaimer.
20     but WITHOUT ANY WARRANTY; without even the implied warranty of  
21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.      * Redistributions in binary form must reproduce the above copyright
22          notice, this list of conditions and the following disclaimer in the
23  2. The origin of this software must not be misrepresented, either by        documentation and/or other materials provided with the distribution.
24     explicit claim or by omission.  
25        * Neither the name of the University of Cambridge nor the names of its
26  3. Altered versions must be plainly marked as such, and must not be        contributors may be used to endorse or promote products derived from
27     misrepresented as being the original software.        this software without specific prior written permission.
28    
29  4. If PCRE is embedded in any software that is released under the GNU  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
30     General Purpose Licence (GPL), then the terms of that licence shall  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31     supersede any condition above with which it is incompatible.  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
33    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
34    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
35    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
36    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
37    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
38    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
39    POSSIBILITY OF SUCH DAMAGE.
40  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
41  */  */
42    
# Line 51  C headers, and the external pcre header. Line 59  C headers, and the external pcre header.
59    
60  #include "internal.h"  #include "internal.h"
61    
62    /* If Unicode Property support is wanted, include a private copy of the
63    function that does it, and the table that translates names to numbers. */
64    
65  /* Allow compilation as C++ source code, should anybody want to do that. */  #ifdef SUPPORT_UCP
66    #include "ucp.c"
67  #ifdef __cplusplus  #include "ucptypetable.c"
 #define class pcre_class  
68  #endif  #endif
69    
   
70  /* Maximum number of items on the nested bracket stacks at compile time. This  /* Maximum number of items on the nested bracket stacks at compile time. This
71  applies to the nesting of all kinds of parentheses. It does not limit  applies to the nesting of all kinds of parentheses. It does not limit
72  un-nested, non-capturing parentheses. This number can be made bigger if  un-nested, non-capturing parentheses. This number can be made bigger if
# Line 75  because the offset vector is always a mu Line 83  because the offset vector is always a mu
83  #define REC_STACK_SAVE_MAX 30  #define REC_STACK_SAVE_MAX 30
84    
85    
 /* The number of bytes in a literal character string above which we can't add  
 any more is set at 250 in order to allow for UTF-8 characters. (In theory it  
 could be 255 when UTF-8 support is excluded, but that means that some of the  
 test output would be different, which just complicates things.) */  
   
 #define MAXLIT 250  
   
   
86  /* The maximum remaining length of subject we are prepared to search for a  /* The maximum remaining length of subject we are prepared to search for a
87  req_byte match. */  req_byte match. */
88    
# Line 106  is invalid. */ Line 106  is invalid. */
106    
107  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #if !EBCDIC   /* This is the "normal" table for ASCII systems */
108  static const short int escapes[] = {  static const short int escapes[] = {
109      0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
110      0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
111    '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
112      0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
113      0, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
114      0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
115    '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
116      0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */
117      0,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */
118      0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
119  };  };
120    
121  #else         /* This is the "abnormal" table for EBCDIC systems */  #else         /* This is the "abnormal" table for EBCDIC systems */
# Line 129  static const short int escapes[] = { Line 129  static const short int escapes[] = {
129  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
130  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
131  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,
132  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0,      0,  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,
133  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
134  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,
135  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
# Line 137  static const short int escapes[] = { Line 137  static const short int escapes[] = {
137  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
138  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
139  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,
140  /*  D0 */   '}',     0,      0,       0,      0,     0,      0,      0,  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,
141  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,
142  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W,      0,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,
143  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
144  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
145  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
# Line 562  return XSTRING(PCRE_MAJOR) "." XSTRING(P Line 562  return XSTRING(PCRE_MAJOR) "." XSTRING(P
562    
563    
564  /*************************************************  /*************************************************
565    *         Flip bytes in an integer               *
566    *************************************************/
567    
568    /* This function is called when the magic number in a regex doesn't match in
569    order to flip its bytes to see if we are dealing with a pattern that was
570    compiled on a host of different endianness. If so, this function is used to
571    flip other byte values.
572    
573    Arguments:
574      value        the number to flip
575      n            the number of bytes to flip (assumed to be 2 or 4)
576    
577    Returns:       the flipped value
578    */
579    
580    static long int
581    byteflip(long int value, int n)
582    {
583    if (n == 2) return ((value & 0x00ff) << 8) | ((value & 0xff00) >> 8);
584    return ((value & 0x000000ff) << 24) |
585           ((value & 0x0000ff00) <<  8) |
586           ((value & 0x00ff0000) >>  8) |
587           ((value & 0xff000000) >> 24);
588    }
589    
590    
591    
592    /*************************************************
593    *       Test for a byte-flipped compiled regex   *
594    *************************************************/
595    
596    /* This function is called from pcre_exec() and also from pcre_fullinfo(). Its
597    job is to test whether the regex is byte-flipped - that is, it was compiled on
598    a system of opposite endianness. The function is called only when the native
599    MAGIC_NUMBER test fails. If the regex is indeed flipped, we flip all the
600    relevant values into a different data block, and return it.
601    
602    Arguments:
603      re               points to the regex
604      internal_re      points to a new regex block
605      study            points to study data, or NULL
606      internal_study   points to a new study block
607    
608    Returns:           the new block if it is indeed a byte-flipped regex
609                       NULL if it is not
610    */
611    
612    static real_pcre *
613    try_flipped(const real_pcre *re, real_pcre *internal_re,
614      const pcre_study_data *study, pcre_study_data *internal_study)
615    {
616    if (byteflip(re->magic_number, sizeof(re->magic_number)) != MAGIC_NUMBER)
617      return NULL;
618    
619    *internal_re = *re;           /* To copy other fields */
620    internal_re->size = byteflip(re->size, sizeof(re->size));
621    internal_re->options = byteflip(re->options, sizeof(re->options));
622    internal_re->top_bracket = byteflip(re->top_bracket, sizeof(re->top_bracket));
623    internal_re->top_backref = byteflip(re->top_backref, sizeof(re->top_backref));
624    internal_re->first_byte = byteflip(re->first_byte, sizeof(re->first_byte));
625    internal_re->req_byte = byteflip(re->req_byte, sizeof(re->req_byte));
626    internal_re->name_table_offset = byteflip(re->name_table_offset,
627      sizeof(re->name_table_offset));
628    internal_re->name_entry_size = byteflip(re->name_entry_size,
629      sizeof(re->name_entry_size));
630    internal_re->name_count = byteflip(re->name_count, sizeof(re->name_count));
631    
632    if (study != NULL)
633      {
634      *internal_study = *study;   /* To copy other fields */
635      internal_study->size = byteflip(study->size, sizeof(study->size));
636      internal_study->options = byteflip(study->options, sizeof(study->options));
637      }
638    
639    return internal_re;
640    }
641    
642    
643    
644    /*************************************************
645  * (Obsolete) Return info about compiled pattern  *  * (Obsolete) Return info about compiled pattern  *
646  *************************************************/  *************************************************/
647    
# Line 573  at the low end of it, and so even on 16- Line 653  at the low end of it, and so even on 16-
653  Therefore, I haven't changed the API for pcre_info().  Therefore, I haven't changed the API for pcre_info().
654    
655  Arguments:  Arguments:
656    external_re   points to compiled code    argument_re   points to compiled code
657    optptr        where to pass back the options    optptr        where to pass back the options
658    first_byte    where to pass back the first character,    first_byte    where to pass back the first character,
659                  or -1 if multiline and all branches start ^,                  or -1 if multiline and all branches start ^,
# Line 584  Returns:        number of capturing subp Line 664  Returns:        number of capturing subp
664  */  */
665    
666  EXPORT int  EXPORT int
667  pcre_info(const pcre *external_re, int *optptr, int *first_byte)  pcre_info(const pcre *argument_re, int *optptr, int *first_byte)
668  {  {
669  const real_pcre *re = (const real_pcre *)external_re;  real_pcre internal_re;
670    const real_pcre *re = (const real_pcre *)argument_re;
671  if (re == NULL) return PCRE_ERROR_NULL;  if (re == NULL) return PCRE_ERROR_NULL;
672  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER)
673      {
674      re = try_flipped(re, &internal_re, NULL, NULL);
675      if (re == NULL) return PCRE_ERROR_BADMAGIC;
676      }
677  if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);  if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
678  if (first_byte != NULL)  if (first_byte != NULL)
679    *first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :    *first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
# Line 606  return re->top_bracket; Line 691  return re->top_bracket;
691  that additional items can be added compatibly.  that additional items can be added compatibly.
692    
693  Arguments:  Arguments:
694    external_re      points to compiled code    argument_re      points to compiled code
695    extra_data       points to extra data, or NULL    extra_data       points to extra data, or NULL
696    what             what information is required    what             what information is required
697    where            where to put the information    where            where to put the information
# Line 615  Returns:           0 if data returned, n Line 700  Returns:           0 if data returned, n
700  */  */
701    
702  EXPORT int  EXPORT int
703  pcre_fullinfo(const pcre *external_re, const pcre_extra *extra_data, int what,  pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what,
704    void *where)    void *where)
705  {  {
706  const real_pcre *re = (const real_pcre *)external_re;  real_pcre internal_re;
707    pcre_study_data internal_study;
708    const real_pcre *re = (const real_pcre *)argument_re;
709  const pcre_study_data *study = NULL;  const pcre_study_data *study = NULL;
710    
711  if (re == NULL || where == NULL) return PCRE_ERROR_NULL;  if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  
712    
713  if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)  if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
714    study = (const pcre_study_data *)extra_data->study_data;    study = (const pcre_study_data *)extra_data->study_data;
715    
716    if (re->magic_number != MAGIC_NUMBER)
717      {
718      re = try_flipped(re, &internal_re, study, &internal_study);
719      if (re == NULL) return PCRE_ERROR_BADMAGIC;
720      if (study != NULL) study = &internal_study;
721      }
722    
723  switch (what)  switch (what)
724    {    {
725    case PCRE_INFO_OPTIONS:    case PCRE_INFO_OPTIONS:
# Line 655  switch (what) Line 748  switch (what)
748      ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;      ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
749    break;    break;
750    
751      /* Make sure we pass back the pointer to the bit vector in the external
752      block, not the internal copy (with flipped integer fields). */
753    
754    case PCRE_INFO_FIRSTTABLE:    case PCRE_INFO_FIRSTTABLE:
755    *((const uschar **)where) =    *((const uschar **)where) =
756      (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?      (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
757        study->start_bits : NULL;        ((const pcre_study_data *)extra_data->study_data)->start_bits : NULL;
758    break;    break;
759    
760    case PCRE_INFO_LASTLITERAL:    case PCRE_INFO_LASTLITERAL:
# Line 675  switch (what) Line 771  switch (what)
771    break;    break;
772    
773    case PCRE_INFO_NAMETABLE:    case PCRE_INFO_NAMETABLE:
774    *((const uschar **)where) = (const uschar *)re + sizeof(real_pcre);    *((const uschar **)where) = (const uschar *)re + re->name_table_offset;
775      break;
776    
777      case PCRE_INFO_DEFAULT_TABLES:
778      *((const uschar **)where) = (const uschar *)pcre_default_tables;
779    break;    break;
780    
781    default: return PCRE_ERROR_BADOPTION;    default: return PCRE_ERROR_BADOPTION;
# Line 713  switch (what) Line 813  switch (what)
813  #endif  #endif
814    break;    break;
815    
816      case PCRE_CONFIG_UNICODE_PROPERTIES:
817    #ifdef SUPPORT_UCP
818      *((int *)where) = 1;
819    #else
820      *((int *)where) = 0;
821    #endif
822      break;
823    
824    case PCRE_CONFIG_NEWLINE:    case PCRE_CONFIG_NEWLINE:
825    *((int *)where) = NEWLINE;    *((int *)where) = NEWLINE;
826    break;    break;
# Line 835  else Line 943  else
943      case 'l':      case 'l':
944      case 'L':      case 'L':
945      case 'N':      case 'N':
     case 'p':  
     case 'P':  
946      case 'u':      case 'u':
947      case 'U':      case 'U':
     case 'X':  
948      *errorptr = ERR37;      *errorptr = ERR37;
949      break;      break;
950    
# Line 989  return c; Line 1094  return c;
1094    
1095    
1096    
1097    #ifdef SUPPORT_UCP
1098    /*************************************************
1099    *               Handle \P and \p                 *
1100    *************************************************/
1101    
1102    /* This function is called after \P or \p has been encountered, provided that
1103    PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1104    pointing at the P or p. On exit, it is pointing at the final character of the
1105    escape sequence.
1106    
1107    Arguments:
1108      ptrptr     points to the pattern position pointer
1109      negptr     points to a boolean that is set TRUE for negation else FALSE
1110      errorptr   points to the pointer to the error message
1111    
1112    Returns:     value from ucp_type_table, or -1 for an invalid type
1113    */
1114    
1115    static int
1116    get_ucp(const uschar **ptrptr, BOOL *negptr, const char **errorptr)
1117    {
1118    int c, i, bot, top;
1119    const uschar *ptr = *ptrptr;
1120    char name[4];
1121    
1122    c = *(++ptr);
1123    if (c == 0) goto ERROR_RETURN;
1124    
1125    *negptr = FALSE;
1126    
1127    /* \P or \p can be followed by a one- or two-character name in {}, optionally
1128    preceded by ^ for negation. */
1129    
1130    if (c == '{')
1131      {
1132      if (ptr[1] == '^')
1133        {
1134        *negptr = TRUE;
1135        ptr++;
1136        }
1137      for (i = 0; i <= 2; i++)
1138        {
1139        c = *(++ptr);
1140        if (c == 0) goto ERROR_RETURN;
1141        if (c == '}') break;
1142        name[i] = c;
1143        }
1144      if (c !='}')   /* Try to distinguish error cases */
1145        {
1146        while (*(++ptr) != 0 && *ptr != '}');
1147        if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;
1148        }
1149      name[i] = 0;
1150      }
1151    
1152    /* Otherwise there is just one following character */
1153    
1154    else
1155      {
1156      name[0] = c;
1157      name[1] = 0;
1158      }
1159    
1160    *ptrptr = ptr;
1161    
1162    /* Search for a recognized property name using binary chop */
1163    
1164    bot = 0;
1165    top = sizeof(utt)/sizeof(ucp_type_table);
1166    
1167    while (bot < top)
1168      {
1169      i = (bot + top)/2;
1170      c = strcmp(name, utt[i].name);
1171      if (c == 0) return utt[i].value;
1172      if (c > 0) bot = i + 1; else top = i;
1173      }
1174    
1175    UNKNOWN_RETURN:
1176    *errorptr = ERR47;
1177    *ptrptr = ptr;
1178    return -1;
1179    
1180    ERROR_RETURN:
1181    *errorptr = ERR46;
1182    *ptrptr = ptr;
1183    return -1;
1184    }
1185    #endif
1186    
1187    
1188    
1189    
1190  /*************************************************  /*************************************************
1191  *            Check for counted repeat            *  *            Check for counted repeat            *
1192  *************************************************/  *************************************************/
# Line 1085  return p; Line 1283  return p;
1283  /* This is called by several functions that scan a compiled expression looking  /* This is called by several functions that scan a compiled expression looking
1284  for a fixed first character, or an anchoring op code etc. It skips over things  for a fixed first character, or an anchoring op code etc. It skips over things
1285  that do not influence this. For some calls, a change of option is important.  that do not influence this. For some calls, a change of option is important.
1286    For some calls, it makes sense to skip negative forward and all backward
1287    assertions, and also the \b assertion; for others it does not.
1288    
1289  Arguments:  Arguments:
1290    code       pointer to the start of the group    code         pointer to the start of the group
1291    options    pointer to external options    options      pointer to external options
1292    optbit     the option bit whose changing is significant, or    optbit       the option bit whose changing is significant, or
1293                 zero if none are                   zero if none are
1294      skipassert   TRUE if certain assertions are to be skipped
1295    
1296  Returns:     pointer to the first significant opcode  Returns:       pointer to the first significant opcode
1297  */  */
1298    
1299  static const uschar*  static const uschar*
1300  first_significant_code(const uschar *code, int *options, int optbit)  first_significant_code(const uschar *code, int *options, int optbit,
1301      BOOL skipassert)
1302  {  {
1303  for (;;)  for (;;)
1304    {    {
# Line 1111  for (;;) Line 1313  for (;;)
1313      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
1314      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1315      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
1316        if (!skipassert) return code;
1317      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
1318        code += OP_lengths[*code];
1319        break;
1320    
1321        case OP_WORD_BOUNDARY:
1322        case OP_NOT_WORD_BOUNDARY:
1323        if (!skipassert) return code;
1324      /* Fall through */      /* Fall through */
1325    
1326      case OP_CALLOUT:      case OP_CALLOUT:
1327      case OP_CREF:      case OP_CREF:
1328      case OP_BRANUMBER:      case OP_BRANUMBER:
     case OP_WORD_BOUNDARY:  
     case OP_NOT_WORD_BOUNDARY:  
1329      code += OP_lengths[*code];      code += OP_lengths[*code];
1330      break;      break;
1331    
# Line 1220  for (;;) Line 1427  for (;;)
1427      cc += OP_lengths[*cc];      cc += OP_lengths[*cc];
1428      break;      break;
1429    
1430      /* Handle char strings. In UTF-8 mode we must count characters, not bytes.      /* Handle literal characters */
     This requires a scan of the string, unfortunately. We assume valid UTF-8  
     strings, so all we do is reduce the length by one for every byte whose bits  
     are 10xxxxxx. */  
1431    
1432      case OP_CHARS:      case OP_CHAR:
1433      branchlength += *(++cc);      case OP_CHARNC:
1434        branchlength++;
1435        cc += 2;
1436  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1437      if ((options & PCRE_UTF8) != 0)      if ((options & PCRE_UTF8) != 0)
1438        for (d = 1; d <= *cc; d++)        {
1439          if ((cc[d] & 0xc0) == 0x80) branchlength--;        while ((*cc & 0xc0) == 0x80) cc++;
1440          }
1441  #endif  #endif
     cc += *cc + 1;  
1442      break;      break;
1443    
1444      /* Handle exact repetitions. The count is already in characters, but we      /* Handle exact repetitions. The count is already in characters, but we
# Line 1256  for (;;) Line 1462  for (;;)
1462    
1463      /* Handle single-char matchers */      /* Handle single-char matchers */
1464    
1465        case OP_PROP:
1466        case OP_NOTPROP:
1467        cc++;
1468        /* Fall through */
1469    
1470      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
1471      case OP_DIGIT:      case OP_DIGIT:
1472      case OP_NOT_WHITESPACE:      case OP_NOT_WHITESPACE:
# Line 1342  for (;;) Line 1553  for (;;)
1553    {    {
1554    register int c = *code;    register int c = *code;
1555    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
   else if (c == OP_CHARS) code += code[1] + OP_lengths[c];  
1556    else if (c > OP_BRA)    else if (c > OP_BRA)
1557      {      {
1558      int n = c - OP_BRA;      int n = c - OP_BRA;
# Line 1358  for (;;) Line 1568  for (;;)
1568    
1569      /* In UTF-8 mode, opcodes that are followed by a character may be followed      /* In UTF-8 mode, opcodes that are followed by a character may be followed
1570      by a multi-byte character. The length in the table is a minimum, so we have      by a multi-byte character. The length in the table is a minimum, so we have
1571      to scan along to skip the extra characters. All opcodes are less than 128,      to scan along to skip the extra bytes. All opcodes are less than 128, so we
1572      so we can use relatively efficient code. */      can use relatively efficient code. */
1573    
1574      if (utf8) switch(c)      if (utf8) switch(c)
1575        {        {
1576          case OP_CHAR:
1577          case OP_CHARNC:
1578        case OP_EXACT:        case OP_EXACT:
1579        case OP_UPTO:        case OP_UPTO:
1580        case OP_MINUPTO:        case OP_MINUPTO:
# Line 1377  for (;;) Line 1589  for (;;)
1589    
1590        /* XCLASS is used for classes that cannot be represented just by a bit        /* XCLASS is used for classes that cannot be represented just by a bit
1591        map. This includes negated single high-valued characters. The length in        map. This includes negated single high-valued characters. The length in
1592        the table is zero; the actual length is stored in the compled code. */        the table is zero; the actual length is stored in the compiled code. */
1593    
1594        case OP_XCLASS:        case OP_XCLASS:
1595        code += GET(code, 1) + 1;        code += GET(code, 1) + 1;
# Line 1416  for (;;) Line 1628  for (;;)
1628    register int c = *code;    register int c = *code;
1629    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1630    else if (c == OP_RECURSE) return code;    else if (c == OP_RECURSE) return code;
   else if (c == OP_CHARS) code += code[1] + OP_lengths[c];  
1631    else if (c > OP_BRA)    else if (c > OP_BRA)
1632      {      {
1633      code += OP_lengths[OP_BRA];      code += OP_lengths[OP_BRA];
# Line 1429  for (;;) Line 1640  for (;;)
1640    
1641      /* In UTF-8 mode, opcodes that are followed by a character may be followed      /* In UTF-8 mode, opcodes that are followed by a character may be followed
1642      by a multi-byte character. The length in the table is a minimum, so we have      by a multi-byte character. The length in the table is a minimum, so we have
1643      to scan along to skip the extra characters. All opcodes are less than 128,      to scan along to skip the extra bytes. All opcodes are less than 128, so we
1644      so we can use relatively efficient code. */      can use relatively efficient code. */
1645    
1646      if (utf8) switch(c)      if (utf8) switch(c)
1647        {        {
1648          case OP_CHAR:
1649          case OP_CHARNC:
1650        case OP_EXACT:        case OP_EXACT:
1651        case OP_UPTO:        case OP_UPTO:
1652        case OP_MINUPTO:        case OP_MINUPTO:
# Line 1448  for (;;) Line 1661  for (;;)
1661    
1662        /* XCLASS is used for classes that cannot be represented just by a bit        /* XCLASS is used for classes that cannot be represented just by a bit
1663        map. This includes negated single high-valued characters. The length in        map. This includes negated single high-valued characters. The length in
1664        the table is zero; the actual length is stored in the compled code. */        the table is zero; the actual length is stored in the compiled code. */
1665    
1666        case OP_XCLASS:        case OP_XCLASS:
1667        code += GET(code, 1) + 1;        code += GET(code, 1) + 1;
# Line 1483  static BOOL Line 1696  static BOOL
1696  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1697  {  {
1698  register int c;  register int c;
1699  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0);  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1700       code < endcode;       code < endcode;
1701       code = first_significant_code(code + OP_lengths[c], NULL, 0))       code = first_significant_code(code + OP_lengths[c], NULL, 0, TRUE))
1702    {    {
1703    const uschar *ccode;    const uschar *ccode;
1704    
# Line 1551  for (code = first_significant_code(code Line 1764  for (code = first_significant_code(code
1764    
1765      /* Opcodes that must match a character */      /* Opcodes that must match a character */
1766    
1767        case OP_PROP:
1768        case OP_NOTPROP:
1769        case OP_EXTUNI:
1770      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
1771      case OP_DIGIT:      case OP_DIGIT:
1772      case OP_NOT_WHITESPACE:      case OP_NOT_WHITESPACE:
# Line 1559  for (code = first_significant_code(code Line 1775  for (code = first_significant_code(code
1775      case OP_WORDCHAR:      case OP_WORDCHAR:
1776      case OP_ANY:      case OP_ANY:
1777      case OP_ANYBYTE:      case OP_ANYBYTE:
1778      case OP_CHARS:      case OP_CHAR:
1779        case OP_CHARNC:
1780      case OP_NOT:      case OP_NOT:
1781      case OP_PLUS:      case OP_PLUS:
1782      case OP_MINPLUS:      case OP_MINPLUS:
# Line 1734  while ((ptr = (uschar *)find_recurse(ptr Line 1951  while ((ptr = (uschar *)find_recurse(ptr
1951    
1952    
1953  /*************************************************  /*************************************************
1954    *        Insert an automatic callout point       *
1955    *************************************************/
1956    
1957    /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1958    callout points before each pattern item.
1959    
1960    Arguments:
1961      code           current code pointer
1962      ptr            current pattern pointer
1963      cd             pointers to tables etc
1964    
1965    Returns:         new code pointer
1966    */
1967    
1968    static uschar *
1969    auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1970    {
1971    *code++ = OP_CALLOUT;
1972    *code++ = 255;
1973    PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */
1974    PUT(code, LINK_SIZE, 0);                /* Default length */
1975    return code + 2*LINK_SIZE;
1976    }
1977    
1978    
1979    
1980    /*************************************************
1981    *         Complete a callout item                *
1982    *************************************************/
1983    
1984    /* A callout item contains the length of the next item in the pattern, which
1985    we can't fill in till after we have reached the relevant point. This is used
1986    for both automatic and manual callouts.
1987    
1988    Arguments:
1989      previous_callout   points to previous callout item
1990      ptr                current pattern pointer
1991      cd                 pointers to tables etc
1992    
1993    Returns:             nothing
1994    */
1995    
1996    static void
1997    complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1998    {
1999    int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2000    PUT(previous_callout, 2 + LINK_SIZE, length);
2001    }
2002    
2003    
2004    
2005    #ifdef SUPPORT_UCP
2006    /*************************************************
2007    *           Get othercase range                  *
2008    *************************************************/
2009    
2010    /* This function is passed the start and end of a class range, in UTF-8 mode
2011    with UCP support. It searches up the characters, looking for internal ranges of
2012    characters in the "other" case. Each call returns the next one, updating the
2013    start address.
2014    
2015    Arguments:
2016      cptr        points to starting character value; updated
2017      d           end value
2018      ocptr       where to put start of othercase range
2019      odptr       where to put end of othercase range
2020    
2021    Yield:        TRUE when range returned; FALSE when no more
2022    */
2023    
2024    static BOOL
2025    get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
2026    {
2027    int c, chartype, othercase, next;
2028    
2029    for (c = *cptr; c <= d; c++)
2030      {
2031      if (ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0) break;
2032      }
2033    
2034    if (c > d) return FALSE;
2035    
2036    *ocptr = othercase;
2037    next = othercase + 1;
2038    
2039    for (++c; c <= d; c++)
2040      {
2041      if (ucp_findchar(c, &chartype, &othercase) != ucp_L || othercase != next)
2042        break;
2043      next++;
2044      }
2045    
2046    *odptr = next - 1;
2047    *cptr = c;
2048    
2049    return TRUE;
2050    }
2051    #endif  /* SUPPORT_UCP */
2052    
2053    
2054    /*************************************************
2055  *           Compile one branch                   *  *           Compile one branch                   *
2056  *************************************************/  *************************************************/
2057    
# Line 1744  bits. Line 2062  bits.
2062  Arguments:  Arguments:
2063    optionsptr     pointer to the option bits    optionsptr     pointer to the option bits
2064    brackets       points to number of extracting brackets used    brackets       points to number of extracting brackets used
2065    code           points to the pointer to the current code point    codeptr        points to the pointer to the current code point
2066    ptrptr         points to the current pattern pointer    ptrptr         points to the current pattern pointer
2067    errorptr       points to pointer to error message    errorptr       points to pointer to error message
2068    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
# Line 1764  compile_branch(int *optionsptr, int *bra Line 2082  compile_branch(int *optionsptr, int *bra
2082  int repeat_type, op_type;  int repeat_type, op_type;
2083  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2084  int bravalue = 0;  int bravalue = 0;
 int length;  
2085  int greedy_default, greedy_non_default;  int greedy_default, greedy_non_default;
2086  int firstbyte, reqbyte;  int firstbyte, reqbyte;
2087  int zeroreqbyte, zerofirstbyte;  int zeroreqbyte, zerofirstbyte;
2088  int req_caseopt, reqvary, tempreqvary;  int req_caseopt, reqvary, tempreqvary;
2089  int condcount = 0;  int condcount = 0;
2090  int options = *optionsptr;  int options = *optionsptr;
2091    int after_manual_callout = 0;
2092  register int c;  register int c;
2093  register uschar *code = *codeptr;  register uschar *code = *codeptr;
2094  uschar *tempcode;  uschar *tempcode;
# Line 1779  BOOL groupsetfirstbyte = FALSE; Line 2097  BOOL groupsetfirstbyte = FALSE;
2097  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
2098  const uschar *tempptr;  const uschar *tempptr;
2099  uschar *previous = NULL;  uschar *previous = NULL;
2100  uschar class[32];  uschar *previous_callout = NULL;
2101    uschar classbits[32];
2102    
2103  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2104  BOOL class_utf8;  BOOL class_utf8;
# Line 1795  BOOL utf8 = FALSE; Line 2114  BOOL utf8 = FALSE;
2114  greedy_default = ((options & PCRE_UNGREEDY) != 0);  greedy_default = ((options & PCRE_UNGREEDY) != 0);
2115  greedy_non_default = greedy_default ^ 1;  greedy_non_default = greedy_default ^ 1;
2116    
2117  /* Initialize no first char, no required char. REQ_UNSET means "no char  /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2118  matching encountered yet". It gets changed to REQ_NONE if we hit something that  matching encountered yet". It gets changed to REQ_NONE if we hit something that
2119  matches a non-fixed char first char; reqbyte just remains unset if we never  matches a non-fixed char first char; reqbyte just remains unset if we never
2120  find one.  find one.
# Line 1810  firstbyte = reqbyte = zerofirstbyte = ze Line 2129  firstbyte = reqbyte = zerofirstbyte = ze
2129  /* The variable req_caseopt contains either the REQ_CASELESS value or zero,  /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2130  according to the current setting of the caseless flag. REQ_CASELESS is a bit  according to the current setting of the caseless flag. REQ_CASELESS is a bit
2131  value > 255. It is added into the firstbyte or reqbyte variables to record the  value > 255. It is added into the firstbyte or reqbyte variables to record the
2132  case status of the value. */  case status of the value. This is used only for ASCII characters. */
2133    
2134  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2135    
# Line 1820  for (;; ptr++) Line 2139  for (;; ptr++)
2139    {    {
2140    BOOL negate_class;    BOOL negate_class;
2141    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2142      BOOL is_quantifier;
2143    int class_charcount;    int class_charcount;
2144    int class_lastchar;    int class_lastchar;
2145    int newoptions;    int newoptions;
# Line 1827  for (;; ptr++) Line 2147  for (;; ptr++)
2147    int skipbytes;    int skipbytes;
2148    int subreqbyte;    int subreqbyte;
2149    int subfirstbyte;    int subfirstbyte;
2150      int mclength;
2151      uschar mcbuffer[8];
2152    
2153      /* Next byte in the pattern */
2154    
2155    c = *ptr;    c = *ptr;
2156    if (inescq && c != 0) goto NORMAL_CHAR;  
2157      /* If in \Q...\E, check for the end; if not, we have a literal */
2158    
2159      if (inescq && c != 0)
2160        {
2161        if (c == '\\' && ptr[1] == 'E')
2162          {
2163          inescq = FALSE;
2164          ptr++;
2165          continue;
2166          }
2167        else
2168          {
2169          if (previous_callout != NULL)
2170            {
2171            complete_callout(previous_callout, ptr, cd);
2172            previous_callout = NULL;
2173            }
2174          if ((options & PCRE_AUTO_CALLOUT) != 0)
2175            {
2176            previous_callout = code;
2177            code = auto_callout(code, ptr, cd);
2178            }
2179          goto NORMAL_CHAR;
2180          }
2181        }
2182    
2183      /* Fill in length of a previous callout, except when the next thing is
2184      a quantifier. */
2185    
2186      is_quantifier = c == '*' || c == '+' || c == '?' ||
2187        (c == '{' && is_counted_repeat(ptr+1));
2188    
2189      if (!is_quantifier && previous_callout != NULL &&
2190           after_manual_callout-- <= 0)
2191        {
2192        complete_callout(previous_callout, ptr, cd);
2193        previous_callout = NULL;
2194        }
2195    
2196      /* In extended mode, skip white space and comments */
2197    
2198    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
2199      {      {
# Line 1843  for (;; ptr++) Line 2207  for (;; ptr++)
2207        }        }
2208      }      }
2209    
2210      /* No auto callout for quantifiers. */
2211    
2212      if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2213        {
2214        previous_callout = code;
2215        code = auto_callout(code, ptr, cd);
2216        }
2217    
2218    switch(c)    switch(c)
2219      {      {
2220      /* The branch terminates at end of string, |, or ). */      /* The branch terminates at end of string, |, or ). */
# Line 1938  for (;; ptr++) Line 2310  for (;; ptr++)
2310      character (< 256), because in that case the compiled code doesn't use the      character (< 256), because in that case the compiled code doesn't use the
2311      bit map. */      bit map. */
2312    
2313      memset(class, 0, 32 * sizeof(uschar));      memset(classbits, 0, 32 * sizeof(uschar));
2314    
2315      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
2316      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. The first pass
# Line 2022  for (;; ptr++) Line 2394  for (;; ptr++)
2394            if (taboffset < 0) break;            if (taboffset < 0) break;
2395            if (local_negate)            if (local_negate)
2396              {              {
2397              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];              if (i == 0)
2398              if (blankclass) class[1] |= 0x3c;                for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];
2399                else
2400                  for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];
2401                if (blankclass) classbits[1] |= 0x3c;
2402              }              }
2403            else            else
2404              {              {
2405              for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];
2406              if (blankclass) class[1] &= ~0x3c;              if (blankclass) classbits[1] &= ~0x3c;
2407              }              }
2408            }            }
2409    
# Line 2048  for (;; ptr++) Line 2423  for (;; ptr++)
2423        if (c == '\\')        if (c == '\\')
2424          {          {
2425          c = check_escape(&ptr, errorptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
         if (-c == ESC_b) c = '\b';  /* \b is backslash in a class */  
2426    
2427          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2428            else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2429            else if (-c == ESC_Q)            /* Handle start of quoted string */
2430            {            {
2431            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == '\\' && ptr[2] == 'E')
2432              {              {
# Line 2060  for (;; ptr++) Line 2436  for (;; ptr++)
2436            continue;            continue;
2437            }            }
2438    
2439          else if (c < 0)          if (c < 0)
2440            {            {
2441            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
2442            class_charcount = 10;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
2443            switch (-c)            switch (-c)
2444              {              {
2445              case ESC_d:              case ESC_d:
2446              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2447              continue;              continue;
2448    
2449              case ESC_D:              case ESC_D:
2450              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2451              continue;              continue;
2452    
2453              case ESC_w:              case ESC_w:
2454              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2455              continue;              continue;
2456    
2457              case ESC_W:              case ESC_W:
2458              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2459              continue;              continue;
2460    
2461              case ESC_s:              case ESC_s:
2462              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2463              class[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */              classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
2464              continue;              continue;
2465    
2466              case ESC_S:              case ESC_S:
2467              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2468              class[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2469                continue;
2470    
2471    #ifdef SUPPORT_UCP
2472                case ESC_p:
2473                case ESC_P:
2474                  {
2475                  BOOL negated;
2476                  int property = get_ucp(&ptr, &negated, errorptr);
2477                  if (property < 0) goto FAILED;
2478                  class_utf8 = TRUE;
2479                  *class_utf8data++ = ((-c == ESC_p) != negated)?
2480                    XCL_PROP : XCL_NOTPROP;
2481                  *class_utf8data++ = property;
2482                  class_charcount -= 2;   /* Not a < 256 character */
2483                  }
2484              continue;              continue;
2485    #endif
2486    
2487              /* Unrecognized escapes are faulted if PCRE is running in its              /* Unrecognized escapes are faulted if PCRE is running in its
2488              strict mode. By default, for compatibility with Perl, they are              strict mode. By default, for compatibility with Perl, they are
# Line 2102  for (;; ptr++) Line 2494  for (;; ptr++)
2494                *errorptr = ERR7;                *errorptr = ERR7;
2495                goto FAILED;                goto FAILED;
2496                }                }
2497              c = *ptr;    /* The final character */              c = *ptr;              /* The final character */
2498                class_charcount -= 2;  /* Undo the default count from above */
2499              }              }
2500            }            }
2501    
# Line 2127  for (;; ptr++) Line 2520  for (;; ptr++)
2520            }            }
2521          else          else
2522  #endif  #endif
2523          d = *ptr;          d = *ptr;  /* Not UTF-8 mode */
2524    
2525          /* The second part of a range can be a single-character escape, but          /* The second part of a range can be a single-character escape, but
2526          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
# Line 2138  for (;; ptr++) Line 2531  for (;; ptr++)
2531            const uschar *oldptr = ptr;            const uschar *oldptr = ptr;
2532            d = check_escape(&ptr, errorptr, *brackets, options, TRUE);            d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
2533    
2534            /* \b is backslash; any other special means the '-' was literal */            /* \b is backspace; \X is literal X; any other special means the '-'
2535              was literal */
2536    
2537            if (d < 0)            if (d < 0)
2538              {              {
2539              if (d == -ESC_b) d = '\b'; else              if (d == -ESC_b) d = '\b';
2540                else if (d == -ESC_X) d = 'X'; else
2541                {                {
2542                ptr = oldptr - 2;                ptr = oldptr - 2;
2543                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
# Line 2150  for (;; ptr++) Line 2545  for (;; ptr++)
2545              }              }
2546            }            }
2547    
2548          /* Check that the two values are in the correct order */          /* The check that the two values are in the correct order happens in
2549            the pre-pass. Optimize one-character ranges */
2550    
2551          if (d < c)          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
           {  
           *errorptr = ERR8;  
           goto FAILED;  
           }  
2552    
2553          /* If d is greater than 255, we can't just use the bit map, so set up          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2554          for the UTF-8 supporting class type. If we are not caseless, we can          matching, we have to use an XCLASS with extra data items. Caseless
2555          just set up a single range. If we are caseless, the characters < 256          matching for characters > 127 is available only if UCP support is
2556          are handled with a bitmap, in order to get the case-insensitive          available. */
         handling. */  
2557    
2558  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2559          if (d > 255)          if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2560            {            {
2561            class_utf8 = TRUE;            class_utf8 = TRUE;
2562            *class_utf8data++ = XCL_RANGE;  
2563            if ((options & PCRE_CASELESS) == 0)            /* With UCP support, we can find the other case equivalents of
2564              the relevant characters. There may be several ranges. Optimize how
2565              they fit with the basic range. */
2566    
2567    #ifdef SUPPORT_UCP
2568              if ((options & PCRE_CASELESS) != 0)
2569              {              {
2570              class_utf8data += ord2utf8(c, class_utf8data);              int occ, ocd;
2571              class_utf8data += ord2utf8(d, class_utf8data);              int cc = c;
2572              continue;  /* Go get the next char in the class */              int origd = d;
2573                while (get_othercase_range(&cc, origd, &occ, &ocd))
2574                  {
2575                  if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */
2576    
2577                  if (occ < c  && ocd >= c - 1)        /* Extend the basic range */
2578                    {                                  /* if there is overlap,   */
2579                    c = occ;                           /* noting that if occ < c */
2580                    continue;                          /* we can't have ocd > d  */
2581                    }                                  /* because a subrange is  */
2582                  if (ocd > d && occ <= d + 1)         /* always shorter than    */
2583                    {                                  /* the basic range.       */
2584                    d = ocd;
2585                    continue;
2586                    }
2587    
2588                  if (occ == ocd)
2589                    {
2590                    *class_utf8data++ = XCL_SINGLE;
2591                    }
2592                  else
2593                    {
2594                    *class_utf8data++ = XCL_RANGE;
2595                    class_utf8data += ord2utf8(occ, class_utf8data);
2596                    }
2597                  class_utf8data += ord2utf8(ocd, class_utf8data);
2598                  }
2599              }              }
2600            class_utf8data += ord2utf8(256, class_utf8data);  #endif  /* SUPPORT_UCP */
2601    
2602              /* Now record the original range, possibly modified for UCP caseless
2603              overlapping ranges. */
2604    
2605              *class_utf8data++ = XCL_RANGE;
2606              class_utf8data += ord2utf8(c, class_utf8data);
2607            class_utf8data += ord2utf8(d, class_utf8data);            class_utf8data += ord2utf8(d, class_utf8data);
2608            d = 255;  
2609            /* Fall through */            /* With UCP support, we are done. Without UCP support, there is no
2610              caseless matching for UTF-8 characters > 127; we can use the bit map
2611              for the smaller ones. */
2612    
2613    #ifdef SUPPORT_UCP
2614              continue;    /* With next character in the class */
2615    #else
2616              if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2617    
2618              /* Adjust upper limit and fall through to set up the map */
2619    
2620              d = 127;
2621    
2622    #endif  /* SUPPORT_UCP */
2623            }            }
2624  #endif  #endif  /* SUPPORT_UTF8 */
2625          /* We use the bit map if the range is entirely < 255, or if part of it  
2626          is < 255 and matching is caseless. */          /* We use the bit map for all cases when not in UTF-8 mode; else
2627            ranges that lie entirely within 0-127 when there is UCP support; else
2628            for partial ranges without UCP support. */
2629    
2630          for (; c <= d; c++)          for (; c <= d; c++)
2631            {            {
2632            class[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
2633            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
2634              {              {
2635              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
2636              class[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
2637              }              }
2638            class_charcount++;                /* in case a one-char range */            class_charcount++;                /* in case a one-char range */
2639            class_lastchar = c;            class_lastchar = c;
# Line 2200  for (;; ptr++) Line 2643  for (;; ptr++)
2643          }          }
2644    
2645        /* Handle a lone single character - we can get here for a normal        /* Handle a lone single character - we can get here for a normal
2646        non-escape char, or after \ that introduces a single character. */        non-escape char, or after \ that introduces a single character, or for an
2647          apparent range that isn't. */
2648    
2649        LONE_SINGLE_CHARACTER:        LONE_SINGLE_CHARACTER:
2650    
2651        /* Handle a multibyte character */        /* Handle a character that cannot go in the bit map */
2652    
2653  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2654        if (utf8 && c > 255)        if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2655          {          {
2656          class_utf8 = TRUE;          class_utf8 = TRUE;
2657          *class_utf8data++ = XCL_SINGLE;          *class_utf8data++ = XCL_SINGLE;
2658          class_utf8data += ord2utf8(c, class_utf8data);          class_utf8data += ord2utf8(c, class_utf8data);
2659    
2660    #ifdef SUPPORT_UCP
2661            if ((options & PCRE_CASELESS) != 0)
2662              {
2663              int chartype;
2664              int othercase;
2665              if (ucp_findchar(c, &chartype, &othercase) >= 0 && othercase > 0)
2666                {
2667                *class_utf8data++ = XCL_SINGLE;
2668                class_utf8data += ord2utf8(othercase, class_utf8data);
2669                }
2670              }
2671    #endif  /* SUPPORT_UCP */
2672    
2673          }          }
2674        else        else
2675  #endif  #endif  /* SUPPORT_UTF8 */
2676    
2677        /* Handle a single-byte character */        /* Handle a single-byte character */
2678          {          {
2679          class [c/8] |= (1 << (c&7));          classbits[c/8] |= (1 << (c&7));
2680          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
2681            {            {
2682            c = cd->fcc[c];   /* flip case */            c = cd->fcc[c];   /* flip case */
2683            class[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
2684            }            }
2685          class_charcount++;          class_charcount++;
2686          class_lastchar = c;          class_lastchar = c;
# Line 2233  for (;; ptr++) Line 2692  for (;; ptr++)
2692    
2693      while ((c = *(++ptr)) != ']' || inescq);      while ((c = *(++ptr)) != ']' || inescq);
2694    
2695      /* If class_charcount is 1, we saw precisely one character with a value <      /* If class_charcount is 1, we saw precisely one character whose value is
2696      256. In UTF-8 mode, we can optimize if there were no characters >= 256 and      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2697      the one character is < 128. In non-UTF-8 mode we can always optimize.      can optimize the negative case only if there were no characters >= 128
2698        because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2699        single-bytes only. This is an historical hangover. Maybe one day we can
2700        tidy these opcodes to handle multi-byte characters.
2701    
2702      The optimization throws away the bit map. We turn the item into a      The optimization throws away the bit map. We turn the item into a
2703      1-character OP_CHARS if it's positive, or OP_NOT if it's negative. Note      1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2704      that OP_NOT does not support multibyte characters. In the positive case, it      that OP_NOT does not support multibyte characters. In the positive case, it
2705      can cause firstbyte to be set. Otherwise, there can be no first char if      can cause firstbyte to be set. Otherwise, there can be no first char if
2706      this item is first, whatever repeat count may follow. In the case of      this item is first, whatever repeat count may follow. In the case of
# Line 2247  for (;; ptr++) Line 2709  for (;; ptr++)
2709  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2710      if (class_charcount == 1 &&      if (class_charcount == 1 &&
2711            (!utf8 ||            (!utf8 ||
2712            (!class_utf8 && class_lastchar < 128)))            (!class_utf8 && (!negate_class || class_lastchar < 128))))
2713    
2714  #else  #else
2715      if (class_charcount == 1)      if (class_charcount == 1)
2716  #endif  #endif
2717        {        {
2718        zeroreqbyte = reqbyte;        zeroreqbyte = reqbyte;
2719    
2720          /* The OP_NOT opcode works on one-byte characters only. */
2721    
2722        if (negate_class)        if (negate_class)
2723          {          {
2724          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2725          zerofirstbyte = firstbyte;          zerofirstbyte = firstbyte;
2726          *code++ = OP_NOT;          *code++ = OP_NOT;
2727            *code++ = class_lastchar;
2728            break;
2729          }          }
2730    
2731          /* For a single, positive character, get the value into mcbuffer, and
2732          then we can handle this with the normal one-character code. */
2733    
2734    #ifdef SUPPORT_UTF8
2735          if (utf8 && class_lastchar > 127)
2736            mclength = ord2utf8(class_lastchar, mcbuffer);
2737        else        else
2738    #endif
2739          {          {
2740          if (firstbyte == REQ_UNSET)          mcbuffer[0] = class_lastchar;
2741            {          mclength = 1;
           zerofirstbyte = REQ_NONE;  
           firstbyte = class_lastchar | req_caseopt;  
           }  
         else  
           {  
           zerofirstbyte = firstbyte;  
           reqbyte = class_lastchar | req_caseopt | cd->req_varyopt;  
           }  
         *code++ = OP_CHARS;  
         *code++ = 1;  
2742          }          }
2743        *code++ = class_lastchar;        goto ONE_CHAR;
2744        break;  /* End of class handling */        }       /* End of 1-char optimization */
2745        }       /* End of 1-byte optimization */  
2746        /* The general case - not the one-char optimization. If this is the first
2747      /* Otherwise, if this is the first thing in the branch, there can be no      thing in the branch, there can be no first char setting, whatever the
2748      first char setting, whatever the repeat count. Any reqbyte setting must      repeat count. Any reqbyte setting must remain unchanged after any kind of
2749      remain unchanged after any kind of repeat. */      repeat. */
2750    
2751      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2752      zerofirstbyte = firstbyte;      zerofirstbyte = firstbyte;
# Line 2304  for (;; ptr++) Line 2770  for (;; ptr++)
2770        if (class_charcount > 0)        if (class_charcount > 0)
2771          {          {
2772          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
2773          memcpy(code, class, 32);          memcpy(code, classbits, 32);
2774          code = class_utf8data;          code = class_utf8data;
2775          }          }
2776    
# Line 2332  for (;; ptr++) Line 2798  for (;; ptr++)
2798      if (negate_class)      if (negate_class)
2799        {        {
2800        *code++ = OP_NCLASS;        *code++ = OP_NCLASS;
2801        for (c = 0; c < 32; c++) code[c] = ~class[c];        for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2802        }        }
2803      else      else
2804        {        {
2805        *code++ = OP_CLASS;        *code++ = OP_CLASS;
2806        memcpy(code, class, 32);        memcpy(code, classbits, 32);
2807        }        }
2808      code += 32;      code += 32;
2809      break;      break;
2810    
2811      /* Various kinds of repeat */      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2812        has been tested above. */
2813    
2814      case '{':      case '{':
2815      if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;      if (!is_quantifier) goto NORMAL_CHAR;
2816      ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);      ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
2817      if (*errorptr != NULL) goto FAILED;      if (*errorptr != NULL) goto FAILED;
2818      goto REPEAT;      goto REPEAT;
# Line 2422  for (;; ptr++) Line 2889  for (;; ptr++)
2889        code += 1 + LINK_SIZE;        code += 1 + LINK_SIZE;
2890        }        }
2891    
2892      /* If previous was a string of characters, chop off the last one and use it      /* If previous was a character match, abolish the item and generate a
2893      as the subject of the repeat. If there was only one character, we can      repeat item instead. If a char item has a minimum of more than one, ensure
2894      abolish the previous item altogether. If a one-char item has a minimum of      that it is set in reqbyte - it might not be if a sequence such as x{3} is
2895      more than one, ensure that it is set in reqbyte - it might not be if a      the first thing in a branch because the x will have gone into firstbyte
2896      sequence such as x{3} is the first thing in a branch because the x will      instead.  */
     have gone into firstbyte instead.  */  
2897    
2898      if (*previous == OP_CHARS)      if (*previous == OP_CHAR || *previous == OP_CHARNC)
2899        {        {
2900        /* Deal with UTF-8 characters that take up more than one byte. It's        /* Deal with UTF-8 characters that take up more than one byte. It's
2901        easier to write this out separately than try to macrify it. Use c to        easier to write this out separately than try to macrify it. Use c to
# Line 2443  for (;; ptr++) Line 2909  for (;; ptr++)
2909          while((*lastchar & 0xc0) == 0x80) lastchar--;          while((*lastchar & 0xc0) == 0x80) lastchar--;
2910          c = code - lastchar;            /* Length of UTF-8 character */          c = code - lastchar;            /* Length of UTF-8 character */
2911          memcpy(utf8_char, lastchar, c); /* Save the char */          memcpy(utf8_char, lastchar, c); /* Save the char */
         if (lastchar == previous + 2)   /* There was only one character */  
           {  
           code = previous;              /* Abolish the previous item */  
           }  
         else  
           {  
           previous[1] -= c;             /* Adjust length of previous */  
           code = lastchar;              /* Lost char off the end */  
           tempcode = code;              /* Adjust position to be moved for '+' */  
           }  
2912          c |= 0x80;                      /* Flag c as a length */          c |= 0x80;                      /* Flag c as a length */
2913          }          }
2914        else        else
# Line 2462  for (;; ptr++) Line 2918  for (;; ptr++)
2918        with UTF-8 disabled, or for a UTF-8 character < 128. */        with UTF-8 disabled, or for a UTF-8 character < 128. */
2919    
2920          {          {
2921          c = *(--code);          c = code[-1];
2922          if (code == previous + 2)   /* There was only one character */          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
           {  
           code = previous;              /* Abolish the previous item */  
           if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;  
           }  
         else  
           {  
           previous[1]--;             /* adjust length */  
           tempcode = code;           /* Adjust position to be moved for '+' */  
           }  
2923          }          }
2924    
2925        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
# Line 2487  for (;; ptr++) Line 2934  for (;; ptr++)
2934        {        {
2935        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
2936        c = previous[1];        c = previous[1];
       code = previous;  
2937        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
2938        }        }
2939    
2940      /* If previous was a character type match (\d or similar), abolish it and      /* If previous was a character type match (\d or similar), abolish it and
2941      create a suitable repeat item. The code is shared with single-character      create a suitable repeat item. The code is shared with single-character
2942      repeats by setting op_type to add a suitable offset into repeat_type. */      repeats by setting op_type to add a suitable offset into repeat_type. Note
2943        that the Unicode property types will be present only when SUPPORT_UCP is
2944        defined, but we don't wrap the little bits of code here because it just
2945        makes it horribly messy. */
2946    
2947      else if (*previous < OP_EODN)      else if (*previous < OP_EODN)
2948        {        {
2949          uschar *oldcode;
2950          int prop_type;
2951        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
2952        c = *previous;        c = *previous;
       code = previous;  
2953    
2954        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
2955          prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?
2956            previous[1] : -1;
2957    
2958          oldcode = code;
2959          code = previous;                  /* Usually overwrite previous item */
2960    
2961        /* If the maximum is zero then the minimum must also be zero; Perl allows        /* If the maximum is zero then the minimum must also be zero; Perl allows
2962        this case, so we do too - by simply omitting the item altogether. */        this case, so we do too - by simply omitting the item altogether. */
2963    
2964        if (repeat_max == 0) goto END_REPEAT;        if (repeat_max == 0) goto END_REPEAT;
2965    
2966          /* All real repeats make it impossible to handle partial matching (maybe
2967          one day we will be able to remove this restriction). */
2968    
2969          if (repeat_max != 1) cd->nopartial = TRUE;
2970    
2971        /* Combine the op_type with the repeat_type */        /* Combine the op_type with the repeat_type */
2972    
2973        repeat_type += op_type;        repeat_type += op_type;
# Line 2526  for (;; ptr++) Line 2986  for (;; ptr++)
2986            }            }
2987          }          }
2988    
2989        /* The case {1,} is handled as the special case + */        /* A repeat minimum of 1 is optimized into some special cases. If the
2990          maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
2991          left in place and, if the maximum is greater than 1, we use OP_UPTO with
2992          one less than the maximum. */
2993    
2994        else if (repeat_min == 1 && repeat_max == -1)        else if (repeat_min == 1)
         *code++ = OP_PLUS + repeat_type;  
   
       /* The case {n,n} is just an EXACT, while the general case {n,m} is  
       handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */  
   
       else  
2995          {          {
2996          if (repeat_min != 1)          if (repeat_max == -1)
2997            {            *code++ = OP_PLUS + repeat_type;
2998            *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */          else
           PUT2INC(code, 0, repeat_min);  
           }  
   
         If the minimum is 1 and the previous item was a character string,  
         we either have to put back the item that got cancelled if the string  
         length was 1, or add the character back onto the end of a longer  
         string. For a character type nothing need be done; it will just get  
         put back naturally. Note that the final character is always going to  
         get added below, so we leave code ready for its insertion. */  
   
         else if (*previous == OP_CHARS)  
2999            {            {
3000            if (code == previous) code += 2; else            code = oldcode;                 /* leave previous item in place */
3001              if (repeat_max == 1) goto END_REPEAT;
3002            /* In UTF-8 mode, a multibyte char has its length in c, with the 0x80            *code++ = OP_UPTO + repeat_type;
3003            bit set as a flag. The length will always be between 2 and 6. */            PUT2INC(code, 0, repeat_max - 1);
   
 #ifdef SUPPORT_UTF8  
           if (utf8 && c >= 128) previous[1] += c & 7; else  
 #endif  
           previous[1]++;  
3004            }            }
3005            }
3006    
3007          /*  For a single negated character we also have to put back the        /* The case {n,n} is just an EXACT, while the general case {n,m} is
3008          item that got cancelled. At present this applies only to single byte        handled as an EXACT followed by an UPTO. */
         characters in any mode. */  
3009    
3010          else if (*previous == OP_NOT) code++;        else
3011            {
3012            *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
3013            PUT2INC(code, 0, repeat_min);
3014    
3015          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3016          we have to insert the character for the previous code. In UTF-8 mode,          we have to insert the character for the previous code. For a repeated
3017          long characters have their length in c, with the 0x80 bit as a flag. */          Unicode property match, there is an extra byte that defines the
3018            required property. In UTF-8 mode, long characters have their length in
3019            c, with the 0x80 bit as a flag. */
3020    
3021          if (repeat_max < 0)          if (repeat_max < 0)
3022            {            {
# Line 2582  for (;; ptr++) Line 3028  for (;; ptr++)
3028              }              }
3029            else            else
3030  #endif  #endif
3031            *code++ = c;              {
3032                *code++ = c;
3033                if (prop_type >= 0) *code++ = prop_type;
3034                }
3035            *code++ = OP_STAR + repeat_type;            *code++ = OP_STAR + repeat_type;
3036            }            }
3037    
# Line 2600  for (;; ptr++) Line 3049  for (;; ptr++)
3049            else            else
3050  #endif  #endif
3051            *code++ = c;            *code++ = c;
3052              if (prop_type >= 0) *code++ = prop_type;
3053            repeat_max -= repeat_min;            repeat_max -= repeat_min;
3054            *code++ = OP_UPTO + repeat_type;            *code++ = OP_UPTO + repeat_type;
3055            PUT2INC(code, 0, repeat_max);            PUT2INC(code, 0, repeat_max);
# Line 2616  for (;; ptr++) Line 3066  for (;; ptr++)
3066          }          }
3067        else        else
3068  #endif  #endif
   
3069        *code++ = c;        *code++ = c;
3070    
3071          /* For a repeated Unicode property match, there is an extra byte that
3072          defines the required property. */
3073    
3074    #ifdef SUPPORT_UCP
3075          if (prop_type >= 0) *code++ = prop_type;
3076    #endif
3077        }        }
3078    
3079      /* If previous was a character class or a back reference, we put the repeat      /* If previous was a character class or a back reference, we put the repeat
# Line 2635  for (;; ptr++) Line 3091  for (;; ptr++)
3091          code = previous;          code = previous;
3092          goto END_REPEAT;          goto END_REPEAT;
3093          }          }
3094    
3095          /* All real repeats make it impossible to handle partial matching (maybe
3096          one day we will be able to remove this restriction). */
3097    
3098          if (repeat_max != 1) cd->nopartial = TRUE;
3099    
3100        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
3101          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
3102        else if (repeat_min == 1 && repeat_max == -1)        else if (repeat_min == 1 && repeat_max == -1)
# Line 2937  for (;; ptr++) Line 3399  for (;; ptr++)
3399          ptr++;          ptr++;
3400          break;          break;
3401    
3402          case 'C':                 /* Callout - may be followed by digits */          case 'C':                 /* Callout - may be followed by digits; */
3403          *code++ = OP_CALLOUT;          previous_callout = code;  /* Save for later completion */
3404            {          after_manual_callout = 1; /* Skip one item before completing */
3405            *code++ = OP_CALLOUT;     /* Already checked that the terminating */
3406              {                       /* closing parenthesis is present. */
3407            int n = 0;            int n = 0;
3408            while ((digitab[*(++ptr)] & ctype_digit) != 0)            while ((digitab[*(++ptr)] & ctype_digit) != 0)
3409              n = n * 10 + *ptr - '0';              n = n * 10 + *ptr - '0';
# Line 2949  for (;; ptr++) Line 3413  for (;; ptr++)
3413              goto FAILED;              goto FAILED;
3414              }              }
3415            *code++ = n;            *code++ = n;
3416              PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
3417              PUT(code, LINK_SIZE, 0);                    /* Default length */
3418              code += 2 * LINK_SIZE;
3419            }            }
3420          previous = NULL;          previous = NULL;
3421          continue;          continue;
# Line 3339  for (;; ptr++) Line 3806  for (;; ptr++)
3806          *code++ = OP_REF;          *code++ = OP_REF;
3807          PUT2INC(code, 0, number);          PUT2INC(code, 0, number);
3808          }          }
       else  
         {  
         previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;  
         *code++ = -c;  
         }  
       continue;  
       }  
   
     /* Data character: reset and fall through */  
3809    
3810      ptr = tempptr;        /* So are Unicode property matches, if supported. We know that get_ucp
3811      c = '\\';        won't fail because it was tested in the pre-pass. */
   
     /* Handle a run of data characters until a metacharacter is encountered.  
     The first character is guaranteed not to be whitespace or # when the  
     extended flag is set. */  
   
     NORMAL_CHAR:  
     default:  
     previous = code;  
     *code = OP_CHARS;  
     code += 2;  
     length = 0;  
   
     do  
       {  
       /* If in \Q...\E, check for the end; if not, we always have a literal */  
3812    
3813        if (inescq)  #ifdef SUPPORT_UCP
3814          else if (-c == ESC_P || -c == ESC_p)
3815          {          {
3816          if (c == '\\' && ptr[1] == 'E')          BOOL negated;
3817            {          int value = get_ucp(&ptr, &negated, errorptr);
3818            inescq = FALSE;          previous = code;
3819            ptr++;          *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3820            }          *code++ = value;
         else  
           {  
           *code++ = c;  
           length++;  
           }  
         continue;  
3821          }          }
3822    #endif
3823    
3824        /* Skip white space and comments for /x patterns */        /* For the rest, we can obtain the OP value by negating the escape
3825          value */
3826    
3827        if ((options & PCRE_EXTENDED) != 0)        else
3828          {          {
3829          if ((cd->ctypes[c] & ctype_space) != 0) continue;          previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3830          if (c == '#')          *code++ = -c;
           {  
           /* The space before the ; is to avoid a warning on a silly compiler  
           on the Macintosh. */  
           while ((c = *(++ptr)) != 0 && c != NEWLINE) ;  
           if (c == 0) break;  
           continue;  
           }  
3831          }          }
3832          continue;
3833          }
3834    
3835        /* Backslash may introduce a data char or a metacharacter. Escaped items      /* We have a data character whose value is in c. In UTF-8 mode it may have
3836        are checked for validity in the pre-compiling pass. Stop the string      a value > 127. We set its representation in the length/buffer, and then
3837        before a metaitem. */      handle it as a data character. */
   
       if (c == '\\')  
         {  
         tempptr = ptr;  
         c = check_escape(&ptr, errorptr, *brackets, options, FALSE);  
         if (c < 0) { ptr = tempptr; break; }  
   
         /* If a character is > 127 in UTF-8 mode, we have to turn it into  
         two or more bytes in the UTF-8 encoding. */  
3838    
3839  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3840          if (utf8 && c > 127)      if (utf8 && c > 127)
3841            {        mclength = ord2utf8(c, mcbuffer);
3842            uschar buffer[8];      else
           int len = ord2utf8(c, buffer);  
           for (c = 0; c < len; c++) *code++ = buffer[c];  
           length += len;  
           continue;  
           }  
3843  #endif  #endif
         }  
   
       /* Ordinary character or single-char escape */  
3844    
3845        *code++ = c;       {
3846        length++;       mcbuffer[0] = c;
3847        }       mclength = 1;
3848         }
3849    
3850      /* This "while" is the end of the "do" above. */      goto ONE_CHAR;
3851    
3852      while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);      /* Handle a literal character. It is guaranteed not to be whitespace or #
3853        when the extended flag is set. If we are in UTF-8 mode, it may be a
3854        multi-byte literal character. */
3855    
3856      /* Update the first and last requirements. These are always bytes, even in      default:
3857      UTF-8 mode. However, there is a special case to be considered when there      NORMAL_CHAR:
3858      are only one or two characters. Because this gets messy in UTF-8 mode, the      mclength = 1;
3859      code is kept separate. When we get here "length" contains the number of      mcbuffer[0] = c;
     bytes. */  
3860    
3861  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3862      if (utf8 && length > 1)      if (utf8 && (c & 0xc0) == 0xc0)
3863        {        {
3864        uschar *t = previous + 3;                      /* After this code, t */        while ((ptr[1] & 0xc0) == 0x80)
3865        while (t < code && (*t & 0xc0) == 0x80) t++;   /* follows the 1st char */          mcbuffer[mclength++] = *(++ptr);
3866          }
3867        /* Handle the case when there is only one multibyte character. It must  #endif
       have at least two bytes because of the "length > 1" test above. */  
   
       if (t == code)  
         {  
         /* If no previous first byte, set it from this character, but revert to  
         none on a zero repeat. */  
   
         if (firstbyte == REQ_UNSET)  
           {  
           zerofirstbyte = REQ_NONE;  
           firstbyte = previous[2];  
           }  
3868    
3869          /* Otherwise, leave the first byte value alone, and don't change it on      /* At this point we have the character's bytes in mcbuffer, and the length
3870          a zero repeat */      in mclength. When not in UTF-8 mode, the length is always 1. */
3871    
3872          else zerofirstbyte = firstbyte;      ONE_CHAR:
3873        previous = code;
3874        *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3875        for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3876    
3877          /* In both cases, a zero repeat resets the previous required byte */      /* Set the first and required bytes appropriately. If no previous first
3878        byte, set it from this character, but revert to none on a zero repeat.
3879        Otherwise, leave the firstbyte value alone, and don't change it on a zero
3880        repeat. */
3881    
3882          zeroreqbyte = reqbyte;      if (firstbyte == REQ_UNSET)
3883          }        {
3884          zerofirstbyte = REQ_NONE;
3885          zeroreqbyte = reqbyte;
3886    
3887        /* Handle the case when there is more than one character. These may be        /* If the character is more than one byte long, we can set firstbyte
3888        single-byte or multibyte characters */        only if it is not to be matched caselessly. */
3889    
3890        else        if (mclength == 1 || req_caseopt == 0)
3891          {          {
3892          t = code - 1;                       /* After this code, t is at the */          firstbyte = mcbuffer[0] | req_caseopt;
3893          while ((*t & 0xc0) == 0x80) t--;    /* start of the last character */          if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
   
         /* If no previous first byte, set it from the first character, and  
         retain it on a zero repeat (of the last character). The required byte  
         is reset on a zero repeat, either to the byte before the last  
         character, unless this is the first byte of the string. In that case,  
         it reverts to its previous value. */  
   
         if (firstbyte == REQ_UNSET)  
           {  
           zerofirstbyte = firstbyte = previous[2] | req_caseopt;  
           zeroreqbyte = (t - 1 == previous + 2)?  
             reqbyte : t[-1] | req_caseopt | cd->req_varyopt;  
           }  
   
         /* If there was a previous first byte, leave it alone, and don't change  
         it on a zero repeat. The required byte is reset on a zero repeat to the  
         byte before the last character. */  
   
         else  
           {  
           zerofirstbyte = firstbyte;  
           zeroreqbyte = t[-1] | req_caseopt | cd->req_varyopt;  
           }  
3894          }          }
3895          else firstbyte = reqbyte = REQ_NONE;
       /* In all cases (we know length > 1), the new required byte is the last  
       byte of the string. */  
   
       reqbyte = code[-1] | req_caseopt | cd->req_varyopt;  
3896        }        }
3897    
3898      else   /* End of UTF-8 coding */      /* firstbyte was previously set; we can set reqbyte only if the length is
3899  #endif      1 or the matching is caseful. */
   
     /* This is the code for non-UTF-8 operation, either without UTF-8 support,  
     or when UTF-8 is not enabled. */  
3900    
3901        else
3902        {        {
3903        /* firstbyte was not previously set; take it from this string */        zerofirstbyte = firstbyte;
3904          zeroreqbyte = reqbyte;
3905        if (firstbyte == REQ_UNSET)        if (mclength == 1 || req_caseopt == 0)
         {  
         if (length == 1)  
           {  
           zerofirstbyte = REQ_NONE;  
           firstbyte = previous[2] | req_caseopt;  
           zeroreqbyte = reqbyte;  
           }  
         else  
           {  
           zerofirstbyte = firstbyte = previous[2] | req_caseopt;  
           zeroreqbyte = (length > 2)?  
             (code[-2] | req_caseopt | cd->req_varyopt) : reqbyte;  
           reqbyte = code[-1] | req_caseopt | cd->req_varyopt;  
           }  
         }  
   
       /* firstbyte was previously set */  
   
       else  
         {  
         zerofirstbyte = firstbyte;  
         zeroreqbyte = (length == 1)? reqbyte :  
           code[-2] | req_caseopt | cd->req_varyopt;  
3906          reqbyte = code[-1] | req_caseopt | cd->req_varyopt;          reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
         }  
3907        }        }
3908    
3909      /* Set the length in the data vector, and advance to the next state. */      break;            /* End of literal character handling */
   
     previous[1] = length;  
     if (length < MAXLIT) ptr--;  
     break;  
3910      }      }
3911    }                   /* end of big loop */    }                   /* end of big loop */
3912    
# Line 3809  is_anchored(register const uschar *code, Line 4168  is_anchored(register const uschar *code,
4168  {  {
4169  do {  do {
4170     const uschar *scode =     const uschar *scode =
4171       first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE);       first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
4172     register int op = *scode;     register int op = *scode;
4173    
4174     /* Capturing brackets */     /* Capturing brackets */
# Line 3878  is_startline(const uschar *code, unsigne Line 4237  is_startline(const uschar *code, unsigne
4237    unsigned int backref_map)    unsigned int backref_map)
4238  {  {
4239  do {  do {
4240     const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0);     const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
4241         FALSE);
4242     register int op = *scode;     register int op = *scode;
4243    
4244     /* Capturing brackets */     /* Capturing brackets */
# Line 3897  do { Line 4257  do {
4257     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4258       { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }       { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
4259    
4260     /* .* is not anchored unless DOTALL is set and it isn't in brackets that     /* .* means "start at start or after \n" if it isn't in brackets that
4261     may be referenced. */     may be referenced. */
4262    
4263     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
# Line 3908  do { Line 4268  do {
4268     /* Check for explicit circumflex */     /* Check for explicit circumflex */
4269    
4270     else if (op != OP_CIRC) return FALSE;     else if (op != OP_CIRC) return FALSE;
4271    
4272       /* Move on to the next alternative */
4273    
4274     code += GET(code, 1);     code += GET(code, 1);
4275     }     }
4276  while (*code == OP_ALT);  /* Loop for each alternative */  while (*code == OP_ALT);  /* Loop for each alternative */
# Line 3943  register int c = -1; Line 4306  register int c = -1;
4306  do {  do {
4307     int d;     int d;
4308     const uschar *scode =     const uschar *scode =
4309       first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS);       first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
4310     register int op = *scode;     register int op = *scode;
4311    
4312     if (op >= OP_BRA) op = OP_BRA;     if (op >= OP_BRA) op = OP_BRA;
# Line 3963  do { Line 4326  do {
4326       break;       break;
4327    
4328       case OP_EXACT:       /* Fall through */       case OP_EXACT:       /* Fall through */
4329       scode++;       scode += 2;
   
      case OP_CHARS:       /* Fall through */  
      scode++;  
4330    
4331         case OP_CHAR:
4332         case OP_CHARNC:
4333       case OP_PLUS:       case OP_PLUS:
4334       case OP_MINPLUS:       case OP_MINPLUS:
4335       if (!inassert) return -1;       if (!inassert) return -1;
# Line 4107  int branch_newextra; Line 4469  int branch_newextra;
4469  int item_count = -1;  int item_count = -1;
4470  int name_count = 0;  int name_count = 0;
4471  int max_name_size = 0;  int max_name_size = 0;
4472    int lastitemlength = 0;
4473  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
 int lastcharlength = 0;  
4474  BOOL utf8;  BOOL utf8;
4475  BOOL class_utf8;  BOOL class_utf8;
4476  #endif  #endif
# Line 4199  while ((c = *(++ptr)) != 0) Line 4561  while ((c = *(++ptr)) != 0)
4561    
4562    /* If we are inside a \Q...\E sequence, all chars are literal */    /* If we are inside a \Q...\E sequence, all chars are literal */
4563    
4564    if (inescq) goto NORMAL_CHAR;    if (inescq)
4565        {
4566        if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
4567        goto NORMAL_CHAR;
4568        }
4569    
4570    /* Otherwise, first check for ignored whitespace and comments */    /* Otherwise, first check for ignored whitespace and comments */
4571    
# Line 4218  while ((c = *(++ptr)) != 0) Line 4584  while ((c = *(++ptr)) != 0)
4584    
4585    item_count++;    /* Is zero for the first non-comment item */    item_count++;    /* Is zero for the first non-comment item */
4586    
4587      /* Allow space for auto callout before every item except quantifiers. */
4588    
4589      if ((options & PCRE_AUTO_CALLOUT) != 0 &&
4590           c != '*' && c != '+' && c != '?' &&
4591           (c != '{' || !is_counted_repeat(ptr + 1)))
4592        length += 2 + 2*LINK_SIZE;
4593    
4594    switch(c)    switch(c)
4595      {      {
4596      /* A backslashed item may be an escaped "normal" character or a      /* A backslashed item may be an escaped data character or it may be a
4597      character type. For a "normal" character, put the pointers and      character type. */
     character back so that tests for whitespace etc. in the input  
     are done correctly. */  
4598    
4599      case '\\':      case '\\':
4600        c = check_escape(&ptr, errorptr, bracount, options, FALSE);
4601        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4602    
4603        lastitemlength = 1;     /* Default length of last item for repeats */
4604    
4605        if (c >= 0)             /* Data character */
4606        {        {
4607        const uschar *save_ptr = ptr;        length += 2;          /* For a one-byte character */
4608        c = check_escape(&ptr, errorptr, bracount, options, FALSE);  
4609        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;  #ifdef SUPPORT_UTF8
4610        if (c >= 0)        if (utf8 && c > 127)
4611          {          {
4612          ptr = save_ptr;          int i;
4613          c = '\\';          for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
4614          goto NORMAL_CHAR;            if (c <= utf8_table1[i]) break;
4615            length += i;
4616            lastitemlength += i;
4617          }          }
4618    #endif
4619    
4620          continue;
4621        }        }
4622    
4623      /* If \Q, enter "literal" mode */      /* If \Q, enter "literal" mode */
# Line 4246  while ((c = *(++ptr)) != 0) Line 4628  while ((c = *(++ptr)) != 0)
4628        continue;        continue;
4629        }        }
4630    
4631      /* Other escapes need one byte, and are of length one for repeats */      /* \X is supported only if Unicode property support is compiled */
4632    
4633      length++;  #ifndef SUPPORT_UCP
4634  #ifdef SUPPORT_UTF8      if (-c == ESC_X)
4635      lastcharlength = 1;        {
4636          *errorptr = ERR45;
4637          goto PCRE_ERROR_RETURN;
4638          }
4639    #endif
4640    
4641        /* \P and \p are for Unicode properties, but only when the support has
4642        been compiled. Each item needs 2 bytes. */
4643    
4644        else if (-c == ESC_P || -c == ESC_p)
4645          {
4646    #ifdef SUPPORT_UCP
4647          BOOL negated;
4648          length += 2;
4649          lastitemlength = 2;
4650          if (get_ucp(&ptr, &negated, errorptr) < 0) goto PCRE_ERROR_RETURN;
4651          continue;
4652    #else
4653          *errorptr = ERR45;
4654          goto PCRE_ERROR_RETURN;
4655  #endif  #endif
4656          }
4657    
4658        /* Other escapes need one byte */
4659    
4660        length++;
4661    
4662      /* A back reference needs an additional 2 bytes, plus either one or 5      /* A back reference needs an additional 2 bytes, plus either one or 5
4663      bytes for a repeat. We also need to keep the value of the highest      bytes for a repeat. We also need to keep the value of the highest
# Line 4281  while ((c = *(++ptr)) != 0) Line 4687  while ((c = *(++ptr)) != 0)
4687      case '.':      case '.':
4688      case '$':      case '$':
4689      length++;      length++;
4690  #ifdef SUPPORT_UTF8      lastitemlength = 1;
     lastcharlength = 1;  
 #endif  
4691      continue;      continue;
4692    
4693      case '*':            /* These repeats won't be after brackets; */      case '*':            /* These repeats won't be after brackets; */
# Line 4310  while ((c = *(++ptr)) != 0) Line 4714  while ((c = *(++ptr)) != 0)
4714    
4715      else      else
4716        {        {
4717  #ifdef SUPPORT_UTF8        if (min != 1)
       /* In UTF-8 mode, we should find the length in lastcharlength */  
       if (utf8)  
4718          {          {
4719          if (min != 1)          length -= lastitemlength;   /* Uncount the original char or metachar */
4720            {          if (min > 0) length += 3 + lastitemlength;
           length -= lastcharlength;   /* Uncount the original char or metachar */  
           if (min > 0) length += 3 + lastcharlength;  
           }  
         length += lastcharlength + ((max > 0)? 3 : 1);  
         }  
       else  
 #endif  
   
       /* Not UTF-8 mode: all characters are one byte */  
         {  
         if (min != 1)  
           {  
           length--;   /* Uncount the original char or metachar */  
           if (min > 0) length += 4;  
           }  
   
         length += (max > 0)? 4 : 2;  
4721          }          }
4722          length += lastitemlength + ((max > 0)? 3 : 1);
4723        }        }
4724    
4725      if (ptr[1] == '?') ptr++;      /* Needs no extra length */      if (ptr[1] == '?') ptr++;      /* Needs no extra length */
# Line 4364  while ((c = *(++ptr)) != 0) Line 4750  while ((c = *(++ptr)) != 0)
4750      where we can. (In UTF-8 mode we can do this only for chars < 128.) */      where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4751    
4752      case '[':      case '[':
4753      class_optcount = 0;      if (*(++ptr) == '^')
4754          {
4755          class_optcount = 10;  /* Greater than one */
4756          ptr++;
4757          }
4758        else class_optcount = 0;
4759    
4760  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
4761      class_utf8 = FALSE;      class_utf8 = FALSE;
4762  #endif  #endif
4763    
     if (*(++ptr) == '^') ptr++;  
   
4764      /* Written as a "do" so that an initial ']' is taken as data */      /* Written as a "do" so that an initial ']' is taken as data */
4765    
4766      if (*ptr != 0) do      if (*ptr != 0) do
# Line 4380  while ((c = *(++ptr)) != 0) Line 4769  while ((c = *(++ptr)) != 0)
4769    
4770        if (inescq)        if (inescq)
4771          {          {
4772          if (*ptr != '\\' || ptr[1] != 'E') goto NON_SPECIAL_CHARACTER;          if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
4773          inescq = FALSE;          inescq = FALSE;
4774          ptr += 1;          ptr += 1;
4775          continue;          continue;
# Line 4390  while ((c = *(++ptr)) != 0) Line 4779  while ((c = *(++ptr)) != 0)
4779    
4780        if (*ptr == '\\')        if (*ptr == '\\')
4781          {          {
4782  #ifdef SUPPORT_UTF8          c = check_escape(&ptr, errorptr, bracount, options, TRUE);
         int prevchar = ptr[-1];  
 #endif  
         int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);  
4783          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4784    
4785          /* \b is backspace inside a class */          /* \b is backspace inside a class; \X is literal */
4786    
4787          if (-ch == ESC_b) ch = '\b';          if (-c == ESC_b) c = '\b';
4788            else if (-c == ESC_X) c = 'X';
4789    
4790          /* \Q enters quoting mode */          /* \Q enters quoting mode */
4791    
4792          if (-ch == ESC_Q)          else if (-c == ESC_Q)
4793            {            {
4794            inescq = TRUE;            inescq = TRUE;
4795            continue;            continue;
# Line 4410  while ((c = *(++ptr)) != 0) Line 4797  while ((c = *(++ptr)) != 0)
4797    
4798          /* Handle escapes that turn into characters */          /* Handle escapes that turn into characters */
4799    
4800          if (ch >= 0)          if (c >= 0) goto NON_SPECIAL_CHARACTER;
4801    
4802            /* Escapes that are meta-things. The normal ones just affect the
4803            bit map, but Unicode properties require an XCLASS extended item. */
4804    
4805            else
4806            {            {
4807              class_optcount = 10;         /* \d, \s etc; make sure > 1 */
4808  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
4809            if (utf8)            if (-c == ESC_p || -c == ESC_P)
4810              {              {
4811              if (ch > 127) class_optcount = 10;  /* Ensure > 1 */              if (!class_utf8)
             if (ch > 255)  
4812                {                {
4813                uschar buffer[6];                class_utf8 = TRUE;
4814                if (!class_utf8)                length += LINK_SIZE + 2;
                 {  
                 class_utf8 = TRUE;  
                 length += LINK_SIZE + 1 + 1;  
                 }  
               length += 1 + ord2utf8(ch, buffer);  
   
               /* If this wide character is preceded by '-', add an extra 2 to  
               the length in case the previous character was < 128, because in  
               this case the whole range will be put into the list. */  
   
               if (prevchar == '-') length += 2;  
4815                }                }
4816                length += 2;
4817              }              }
4818  #endif  #endif
           class_optcount++;            /* for possible optimization */  
4819            }            }
         else class_optcount = 10;      /* \d, \s etc; make sure > 1 */  
4820          }          }
4821    
4822        /* Check the syntax for POSIX stuff. The bits we actually handle are        /* Check the syntax for POSIX stuff. The bits we actually handle are
# Line 4448  while ((c = *(++ptr)) != 0) Line 4828  while ((c = *(++ptr)) != 0)
4828          class_optcount = 10;    /* Make sure > 1 */          class_optcount = 10;    /* Make sure > 1 */
4829          }          }
4830    
4831        /* Anything else just increments the possible optimization count. If        /* Anything else increments the possible optimization count. We have to
4832        there are wide characters, we are going to have to use an XCLASS. */        detect ranges here so that we can compute the number of extra ranges for
4833          caseless wide characters when UCP support is available. If there are wide
4834          characters, we are going to have to use an XCLASS, even for single
4835          characters. */
4836    
4837        else        else
4838          {          {
4839          NON_SPECIAL_CHARACTER:          int d;
4840          class_optcount++;  
4841            GET_ONE_CHARACTER:
4842    
4843  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
4844          if (utf8)          if (utf8)
4845            {            {
           int ch;  
4846            int extra = 0;            int extra = 0;
4847            GETCHARLEN(ch, ptr, extra);            GETCHARLEN(c, ptr, extra);
4848            if (ch > 127) class_optcount = 10;   /* No optimization possible */            ptr += extra;
4849            if (ch > 255)            }
4850            else c = *ptr;
4851    #else
4852            c = *ptr;
4853    #endif
4854    
4855            /* Come here from handling \ above when it escapes to a char value */
4856    
4857            NON_SPECIAL_CHARACTER:
4858            class_optcount++;
4859    
4860            d = -1;
4861            if (ptr[1] == '-')
4862              {
4863              uschar const *hyptr = ptr++;
4864              if (ptr[1] == '\\')
4865              {              {
4866              if (!class_utf8)              ptr++;
4867                d = check_escape(&ptr, errorptr, bracount, options, TRUE);
4868                if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4869                if (-d == ESC_b) d = '\b';        /* backspace */
4870                else if (-d == ESC_X) d = 'X';    /* literal X in a class */
4871                }
4872              else if (ptr[1] != 0 && ptr[1] != ']')
4873                {
4874                ptr++;
4875    #ifdef SUPPORT_UTF8
4876                if (utf8)
4877                  {
4878                  int extra = 0;
4879                  GETCHARLEN(d, ptr, extra);
4880                  ptr += extra;
4881                  }
4882                else
4883    #endif
4884                d = *ptr;
4885                }
4886              if (d < 0) ptr = hyptr;      /* go back to hyphen as data */
4887              }
4888    
4889            /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
4890            127 for caseless matching, we will need to use an XCLASS. */
4891    
4892            if (d >= 0)
4893              {
4894              class_optcount = 10;     /* Ensure > 1 */
4895              if (d < c)
4896                {
4897                *errorptr = ERR8;
4898                goto PCRE_ERROR_RETURN;
4899                }
4900    
4901    #ifdef SUPPORT_UTF8
4902              if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4903                {
4904                uschar buffer[6];
4905                if (!class_utf8)         /* Allow for XCLASS overhead */
4906                {                {
4907                class_utf8 = TRUE;                class_utf8 = TRUE;
4908                length += LINK_SIZE + 1 + 1;                length += LINK_SIZE + 2;
4909                }                }
             length += 2 + extra;  
4910    
4911              /* If this wide character is preceded by '-', add an extra 2 to  #ifdef SUPPORT_UCP
4912              the length in case the previous character was < 128, because in              /* If we have UCP support, find out how many extra ranges are
4913              this case the whole range will be put into the list. */              needed to map the other case of characters within this range. We
4914                have to mimic the range optimization here, because extending the
4915                range upwards might push d over a boundary that makes it use
4916                another byte in the UTF-8 representation. */
4917    
4918                if ((options & PCRE_CASELESS) != 0)
4919                  {
4920                  int occ, ocd;
4921                  int cc = c;
4922                  int origd = d;
4923                  while (get_othercase_range(&cc, origd, &occ, &ocd))
4924                    {
4925                    if (occ >= c && ocd <= d) continue;   /* Skip embedded */
4926    
4927                    if (occ < c  && ocd >= c - 1)  /* Extend the basic range */
4928                      {                            /* if there is overlap,   */
4929                      c = occ;                     /* noting that if occ < c */
4930                      continue;                    /* we can't have ocd > d  */
4931                      }                            /* because a subrange is  */
4932                    if (ocd > d && occ <= d + 1)   /* always shorter than    */
4933                      {                            /* the basic range.       */
4934                      d = ocd;
4935                      continue;
4936                      }
4937    
4938                    /* An extra item is needed */
4939    
4940              if (ptr[-1] == '-') length += 2;                  length += 1 + ord2utf8(occ, buffer) +
4941                      ((occ == ocd)? 0 : ord2utf8(ocd, buffer));
4942                    }
4943                  }
4944    #endif  /* SUPPORT_UCP */
4945    
4946              /* Advance to the end of this character */              /* The length of the (possibly extended) range */
4947    
4948              ptr += extra;              length += 1 + ord2utf8(c, buffer) + ord2utf8(d, buffer);
4949              }              }
4950    #endif  /* SUPPORT_UTF8 */
4951    
4952              }
4953    
4954            /* We have a single character. There is nothing to be done unless we
4955            are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
4956            allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
4957            support. */
4958    
4959            else
4960              {
4961    #ifdef SUPPORT_UTF8
4962              if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4963                {
4964                uschar buffer[6];
4965                class_optcount = 10;     /* Ensure > 1 */
4966                if (!class_utf8)         /* Allow for XCLASS overhead */
4967                  {
4968                  class_utf8 = TRUE;
4969                  length += LINK_SIZE + 2;
4970                  }
4971    #ifdef SUPPORT_UCP
4972                length += (((options & PCRE_CASELESS) != 0)? 2 : 1) *
4973                  (1 + ord2utf8(c, buffer));
4974    #else   /* SUPPORT_UCP */
4975                length += 1 + ord2utf8(c, buffer);
4976    #endif  /* SUPPORT_UCP */
4977                }
4978    #endif  /* SUPPORT_UTF8 */
4979            }            }
 #endif  
4980          }          }
4981        }        }
4982      while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */      while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
# Line 4608  while ((c = *(++ptr)) != 0) Line 5101  while ((c = *(++ptr)) != 0)
5101            *errorptr = ERR39;            *errorptr = ERR39;
5102            goto PCRE_ERROR_RETURN;            goto PCRE_ERROR_RETURN;
5103            }            }
5104          length += 2;          length += 2 + 2*LINK_SIZE;
5105          continue;          continue;
5106    
5107          /* Named subpatterns are an extension copied from Python */          /* Named subpatterns are an extension copied from Python */
# Line 4908  while ((c = *(++ptr)) != 0) Line 5401  while ((c = *(++ptr)) != 0)
5401        }        }
5402      continue;      continue;
5403    
5404      /* Non-special character. For a run of such characters the length required      /* Non-special character. It won't be space or # in extended mode, so it is
5405      is the number of characters + 2, except that the maximum run length is      always a genuine character. If we are in a \Q...\E sequence, check for the
5406      MAXLIT. We won't get a skipped space or a non-data escape or the start of a      end; if not, we have a literal. */
     # comment as the first character, so the length can't be zero. */  
5407    
     NORMAL_CHAR:  
5408      default:      default:
5409      length += 2;      NORMAL_CHAR:
     runlength = 0;  
     do  
       {  
 #ifdef SUPPORT_UTF8  
       lastcharlength = 1;     /* Need length of last char for UTF-8 repeats */  
 #endif  
   
       /* If in a \Q...\E sequence, check for end; otherwise it's a literal */  
       if (inescq)  
         {  
         if (c == '\\' && ptr[1] == 'E')  
           {  
           inescq = FALSE;  
           ptr++;  
           }  
         else runlength++;  
         continue;  
         }  
   
       /* Skip whitespace and comments for /x */  
   
       if ((options & PCRE_EXTENDED) != 0)  
         {  
         if ((compile_block.ctypes[c] & ctype_space) != 0) continue;  
         if (c == '#')  
           {  
           /* The space before the ; is to avoid a warning on a silly compiler  
           on the Macintosh. */  
           while ((c = *(++ptr)) != 0 && c != NEWLINE) ;  
           continue;  
           }  
         }  
   
       /* Backslash may introduce a data char or a metacharacter; stop the  
       string before the latter. */  
   
       if (c == '\\')  
         {  
         const uschar *saveptr = ptr;  
         c = check_escape(&ptr, errorptr, bracount, options, FALSE);  
         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;  
         if (c < 0) { ptr = saveptr; break; }  
   
         /* In UTF-8 mode, add on the number of additional bytes needed to  
         encode this character, and save the total length in case this is a  
         final char that is repeated. */  
   
 #ifdef SUPPORT_UTF8  
         if (utf8 && c > 127)  
           {  
           int i;  
           for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)  
             if (c <= utf8_table1[i]) break;  
           runlength += i;  
           lastcharlength += i;  
           }  
 #endif  
         }  
   
       /* Ordinary character or single-char escape */  
5410    
5411        runlength++;      if (inescq && c == '\\' && ptr[1] == 'E')
5412          {
5413          inescq = FALSE;
5414          ptr++;
5415          continue;
5416        }        }
5417    
5418      /* This "while" is the end of the "do" above. */      length += 2;          /* For a one-byte character */
5419        lastitemlength = 1;   /* Default length of last item for repeats */
     while (runlength < MAXLIT &&  
       (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);  
   
     /* If we hit a meta-character, back off to point to it */  
5420    
5421      if (runlength < MAXLIT) ptr--;      /* In UTF-8 mode, check for additional bytes. */
   
     /* If the last char in the string is a UTF-8 multibyte character, we must  
     set lastcharlength correctly. If it was specified as an escape, this will  
     already have been done above. However, we also have to support in-line  
     UTF-8 characters, so check backwards from where we are. */  
5422    
5423  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
5424      if (utf8)      if (utf8 && (c & 0xc0) == 0xc0)
5425        {        {
5426        const uschar *lastptr = ptr - 1;        while ((ptr[1] & 0xc0) == 0x80)         /* Can't flow over the end */
5427        if ((*lastptr & 0x80) != 0)          {                                     /* because the end is marked */
5428          {          lastitemlength++;                     /* by a zero byte. */
5429          while((*lastptr & 0xc0) == 0x80) lastptr--;          length++;
5430          lastcharlength = ptr - lastptr;          ptr++;
5431          }          }
5432        }        }
5433  #endif  #endif
5434    
     length += runlength;  
5435      continue;      continue;
5436      }      }
5437    }    }
5438    
5439  length += 2 + LINK_SIZE;    /* For final KET and END */  length += 2 + LINK_SIZE;    /* For final KET and END */
5440    
5441    if ((options & PCRE_AUTO_CALLOUT) != 0)
5442      length += 2 + 2*LINK_SIZE;  /* For final callout */
5443    
5444  if (length > MAX_PATTERN_SIZE)  if (length > MAX_PATTERN_SIZE)
5445    {    {
5446    *errorptr = ERR20;    *errorptr = ERR20;
# Line 5031  if (re == NULL) Line 5459  if (re == NULL)
5459    return NULL;    return NULL;
5460    }    }
5461    
5462  /* Put in the magic number, and save the size, options, and table pointer */  /* Put in the magic number, and save the sizes, options, and character table
5463    pointer. NULL is used for the default character tables. The nullpad field is at
5464    the end; it's there to help in the case when a regex compiled on a system with
5465    4-byte pointers is run on another with 8-byte pointers. */
5466    
5467  re->magic_number = MAGIC_NUMBER;  re->magic_number = MAGIC_NUMBER;
5468  re->size = size;  re->size = size;
5469  re->options = options;  re->options = options;
5470  re->tables = tables;  re->dummy1 = re->dummy2 = 0;
5471    re->name_table_offset = sizeof(real_pcre);
5472  re->name_entry_size = max_name_size + 3;  re->name_entry_size = max_name_size + 3;
5473  re->name_count = name_count;  re->name_count = name_count;
5474    re->tables = (tables == pcre_default_tables)? NULL : tables;
5475    re->nullpad = NULL;
5476    
5477  /* The starting points of the name/number translation table and of the code are  /* The starting points of the name/number translation table and of the code are
5478  passed around in the compile data block. */  passed around in the compile data block. */
5479    
5480  compile_block.names_found = 0;  compile_block.names_found = 0;
5481  compile_block.name_entry_size = max_name_size + 3;  compile_block.name_entry_size = max_name_size + 3;
5482  compile_block.name_table = (uschar *)re + sizeof(real_pcre);  compile_block.name_table = (uschar *)re + re->name_table_offset;
5483  codestart = compile_block.name_table + re->name_entry_size * re->name_count;  codestart = compile_block.name_table + re->name_entry_size * re->name_count;
5484  compile_block.start_code = codestart;  compile_block.start_code = codestart;
5485    compile_block.start_pattern = (const uschar *)pattern;
5486  compile_block.req_varyopt = 0;  compile_block.req_varyopt = 0;
5487    compile_block.nopartial = FALSE;
5488    
5489  /* Set up a starting, non-extracting bracket, then compile the expression. On  /* Set up a starting, non-extracting bracket, then compile the expression. On
5490  error, *errorptr will be set non-NULL, so we don't need to look at the result  error, *errorptr will be set non-NULL, so we don't need to look at the result
# Line 5063  bracount = 0; Line 5499  bracount = 0;
5499  re->top_bracket = bracount;  re->top_bracket = bracount;
5500  re->top_backref = compile_block.top_backref;  re->top_backref = compile_block.top_backref;
5501    
5502    if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL;
5503    
5504  /* If not reached end of pattern on success, there's an excess bracket. */  /* If not reached end of pattern on success, there's an excess bracket. */
5505    
5506  if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;  if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
# Line 5124  if ((options & PCRE_ANCHORED) == 0) Line 5562  if ((options & PCRE_ANCHORED) == 0)
5562    
5563  /* For an anchored pattern, we use the "required byte" only if it follows a  /* For an anchored pattern, we use the "required byte" only if it follows a
5564  variable length item in the regex. Remove the caseless flag for non-caseable  variable length item in the regex. Remove the caseless flag for non-caseable
5565  chars. */  bytes. */
5566    
5567  if (reqbyte >= 0 &&  if (reqbyte >= 0 &&
5568       ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))       ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
# Line 5144  printf("Length = %d top_bracket = %d top Line 5582  printf("Length = %d top_bracket = %d top
5582    
5583  if (re->options != 0)  if (re->options != 0)
5584    {    {
5585    printf("%s%s%s%s%s%s%s%s%s\n",    printf("%s%s%s%s%s%s%s%s%s%s\n",
5586        ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5587      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5588      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5589      ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",      ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
# Line 5276  if (c < 256) Line 5715  if (c < 256)
5715      return !negated;   /* char found */      return !negated;   /* char found */
5716    }    }
5717    
5718  /* Now match against the list of large chars or ranges that end with a large  /* First skip the bit map if present. Then match against the list of Unicode
5719  char. First skip the bit map if present. */  properties or large chars or ranges that end with a large char. We won't ever
5720    encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */
5721    
5722  if ((*data++ & XCL_MAP) != 0) data += 32;  if ((*data++ & XCL_MAP) != 0) data += 32;
5723    
5724  while ((t = *data++) != XCL_END)  while ((t = *data++) != XCL_END)
5725    {    {
5726    int x, y;    int x, y;
   GETCHARINC(x, data);  
5727    if (t == XCL_SINGLE)    if (t == XCL_SINGLE)
5728      {      {
5729        GETCHARINC(x, data);
5730      if (c == x) return !negated;      if (c == x) return !negated;
5731      }      }
5732    else    else if (t == XCL_RANGE)
5733      {      {
5734        GETCHARINC(x, data);
5735      GETCHARINC(y, data);      GETCHARINC(y, data);
5736      if (c >= x && c <= y) return !negated;      if (c >= x && c <= y) return !negated;
5737      }      }
5738    
5739    #ifdef SUPPORT_UCP
5740      else  /* XCL_PROP & XCL_NOTPROP */
5741        {
5742        int chartype, othercase;
5743        int rqdtype = *data++;
5744        int category = ucp_findchar(c, &chartype, &othercase);
5745        if (rqdtype >= 128)
5746          {
5747          if ((rqdtype - 128 == category) == (t == XCL_PROP)) return !negated;
5748          }
5749        else
5750          {
5751          if ((rqdtype == chartype) == (t == XCL_PROP)) return !negated;
5752          }
5753        }
5754    #endif  /* SUPPORT_UCP */
5755    }    }
5756    
5757  return negated;   /* char was not found */  return negated;   /* char did not match */
5758  }  }
5759  #endif  #endif
5760    
# Line 5336  always used to. Line 5794  always used to.
5794    
5795  /* These versions of the macros manage a private stack on the heap. Note  /* These versions of the macros manage a private stack on the heap. Note
5796  that the rd argument of RMATCH isn't actually used. It's the md argument of  that the rd argument of RMATCH isn't actually used. It's the md argument of
5797  match(), which never actually changes. */  match(), which never changes. */
5798    
5799  #define REGISTER  #define REGISTER
5800    
# Line 5398  typedef struct heapframe { Line 5856  typedef struct heapframe {
5856    const uschar *Xcallpat;    const uschar *Xcallpat;
5857    const uschar *Xcharptr;    const uschar *Xcharptr;
5858    const uschar *Xdata;    const uschar *Xdata;
   const uschar *Xlastptr;  
5859    const uschar *Xnext;    const uschar *Xnext;
5860    const uschar *Xpp;    const uschar *Xpp;
5861    const uschar *Xprev;    const uschar *Xprev;
# Line 5413  typedef struct heapframe { Line 5870  typedef struct heapframe {
5870    
5871    unsigned long int Xoriginal_ims;    unsigned long int Xoriginal_ims;
5872    
5873    #ifdef SUPPORT_UCP
5874      int Xprop_type;
5875      int Xprop_fail_result;
5876      int Xprop_category;
5877      int Xprop_chartype;
5878      int Xprop_othercase;
5879      int Xprop_test_against;
5880      int *Xprop_test_variable;
5881    #endif
5882    
5883    int Xctype;    int Xctype;
5884    int Xfc;    int Xfc;
5885    int Xfi;    int Xfi;
# Line 5523  HEAP_RECURSE: Line 5990  HEAP_RECURSE:
5990    
5991  /* Ditto for the local variables */  /* Ditto for the local variables */
5992    
5993  #define callpat            frame->Xcallpat  #ifdef SUPPORT_UTF8
5994  #define charptr            frame->Xcharptr  #define charptr            frame->Xcharptr
5995    #endif
5996    #define callpat            frame->Xcallpat
5997  #define data               frame->Xdata  #define data               frame->Xdata
 #define lastptr            frame->Xlastptr  
5998  #define next               frame->Xnext  #define next               frame->Xnext
5999  #define pp                 frame->Xpp  #define pp                 frame->Xpp
6000  #define prev               frame->Xprev  #define prev               frame->Xprev
# Line 5541  HEAP_RECURSE: Line 6009  HEAP_RECURSE:
6009    
6010  #define original_ims       frame->Xoriginal_ims  #define original_ims       frame->Xoriginal_ims
6011    
6012    #ifdef SUPPORT_UCP
6013    #define prop_type          frame->Xprop_type
6014    #define prop_fail_result   frame->Xprop_fail_result
6015    #define prop_category      frame->Xprop_category
6016    #define prop_chartype      frame->Xprop_chartype
6017    #define prop_othercase     frame->Xprop_othercase
6018    #define prop_test_against  frame->Xprop_test_against
6019    #define prop_test_variable frame->Xprop_test_variable
6020    #endif
6021    
6022  #define ctype              frame->Xctype  #define ctype              frame->Xctype
6023  #define fc                 frame->Xfc  #define fc                 frame->Xfc
6024  #define fi                 frame->Xfi  #define fi                 frame->Xfi
# Line 5566  i, and fc and c, can be the same variabl Line 6044  i, and fc and c, can be the same variabl
6044  #define fi i  #define fi i
6045  #define fc c  #define fc c
6046    
6047  const uschar *callpat;             /* Many of these variables are used ony */  
6048    #ifdef SUPPORT_UTF8                /* Many of these variables are used ony */
6049  const uschar *charptr;             /* small blocks of the code. My normal  */  const uschar *charptr;             /* small blocks of the code. My normal  */
6050  const uschar *data;                /* style of coding would have declared  */  #endif                             /* style of coding would have declared  */
6051  const uschar *lastptr;             /* them within each of those blocks.    */  const uschar *callpat;             /* them within each of those blocks.    */
6052  const uschar *next;                /* However, in order to accommodate the */  const uschar *data;                /* However, in order to accommodate the */
6053  const uschar *pp;                  /* version of this code that uses an    */  const uschar *next;                /* version of this code that uses an    */
6054  const uschar *prev;                /* external "stack" implemented on the  */  const uschar *pp;                  /* external "stack" implemented on the  */
6055  const uschar *saved_eptr;          /* heap, it is easier to declare them   */  const uschar *prev;                /* heap, it is easier to declare them   */
6056                                     /* all here, so the declarations can    */  const uschar *saved_eptr;          /* all here, so the declarations can    */
6057  recursion_info new_recursive;      /* be cut out in a block. The only      */                                     /* be cut out in a block. The only      */
6058                                     /* declarations within blocks below are */  recursion_info new_recursive;      /* declarations within blocks below are */
6059  BOOL cur_is_word;                  /* for variables that do not have to    */                                     /* for variables that do not have to    */
6060  BOOL condition;                    /* be preserved over a recursive call   */  BOOL cur_is_word;                  /* be preserved over a recursive call   */
6061  BOOL minimize;                     /* to RMATCH().                         */  BOOL condition;                    /* to RMATCH().                         */
6062    BOOL minimize;
6063  BOOL prev_is_word;  BOOL prev_is_word;
6064    
6065  unsigned long int original_ims;  unsigned long int original_ims;
6066    
6067    #ifdef SUPPORT_UCP
6068    int prop_type;
6069    int prop_fail_result;
6070    int prop_category;
6071    int prop_chartype;
6072    int prop_othercase;
6073    int prop_test_against;
6074    int *prop_test_variable;
6075    #endif
6076    
6077  int ctype;  int ctype;
6078  int length;  int length;
6079  int max;  int max;
# Line 5598  int stacksave[REC_STACK_SAVE_MAX]; Line 6088  int stacksave[REC_STACK_SAVE_MAX];
6088  eptrblock newptrb;  eptrblock newptrb;
6089  #endif  #endif
6090    
6091    /* These statements are here to stop the compiler complaining about uninitialized
6092    variables. */
6093    
6094    #ifdef SUPPORT_UCP
6095    prop_fail_result = 0;
6096    prop_test_against = 0;
6097    prop_test_variable = NULL;
6098    #endif
6099    
6100  /* OK, now we can get on with the real code of the function. Recursion is  /* OK, now we can get on with the real code of the function. Recursion is
6101  specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined,  specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined,
# Line 5629  for (;;) Line 6127  for (;;)
6127    op = *ecode;    op = *ecode;
6128    minimize = FALSE;    minimize = FALSE;
6129    
6130      /* For partial matching, remember if we ever hit the end of the subject after
6131      matching at least one subject character. */
6132    
6133      if (md->partial &&
6134          eptr >= md->end_subject &&
6135          eptr > md->start_match)
6136        md->hitend = TRUE;
6137    
6138    /* Opening capturing bracket. If there is space in the offset vector, save    /* Opening capturing bracket. If there is space in the offset vector, save
6139    the current subject position in the working slot at the top of the vector. We    the current subject position in the working slot at the top of the vector. We
6140    mustn't change the current values of the data slot, because they may be set    mustn't change the current values of the data slot, because they may be set
# Line 5883  for (;;) Line 6389  for (;;)
6389      if (pcre_callout != NULL)      if (pcre_callout != NULL)
6390        {        {
6391        pcre_callout_block cb;        pcre_callout_block cb;
6392        cb.version          = 0;   /* Version 0 of the callout block */        cb.version          = 1;   /* Version 1 of the callout block */
6393        cb.callout_number   = ecode[1];        cb.callout_number   = ecode[1];
6394        cb.offset_vector    = md->offset_vector;        cb.offset_vector    = md->offset_vector;
6395        cb.subject          = (const char *)md->start_subject;        cb.subject          = (const char *)md->start_subject;
6396        cb.subject_length   = md->end_subject - md->start_subject;        cb.subject_length   = md->end_subject - md->start_subject;
6397        cb.start_match      = md->start_match - md->start_subject;        cb.start_match      = md->start_match - md->start_subject;
6398        cb.current_position = eptr - md->start_subject;        cb.current_position = eptr - md->start_subject;
6399          cb.pattern_position = GET(ecode, 2);
6400          cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
6401        cb.capture_top      = offset_top/2;        cb.capture_top      = offset_top/2;
6402        cb.capture_last     = md->capture_last;        cb.capture_last     = md->capture_last;
6403        cb.callout_data     = md->callout_data;        cb.callout_data     = md->callout_data;
6404        if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);        if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
6405        if (rrc < 0) RRETURN(rrc);        if (rrc < 0) RRETURN(rrc);
6406        }        }
6407      ecode += 2;      ecode += 2 + 2*LINK_SIZE;
6408      break;      break;
6409    
6410      /* Recursion either matches the current regex, or some subexpression. The      /* Recursion either matches the current regex, or some subexpression. The
# Line 6297  for (;;) Line 6805  for (;;)
6805          {          {
6806          if (eptr == md->start_subject) prev_is_word = FALSE; else          if (eptr == md->start_subject) prev_is_word = FALSE; else
6807            {            {
6808            lastptr = eptr - 1;            const uschar *lastptr = eptr - 1;
6809            while((*lastptr & 0xc0) == 0x80) lastptr--;            while((*lastptr & 0xc0) == 0x80) lastptr--;
6810            GETCHAR(c, lastptr);            GETCHAR(c, lastptr);
6811            prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;            prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
# Line 6427  for (;;) Line 6935  for (;;)
6935      ecode++;      ecode++;
6936      break;      break;
6937    
6938    #ifdef SUPPORT_UCP
6939        /* Check the next character by Unicode property. We will get here only
6940        if the support is in the binary; otherwise a compile-time error occurs. */
6941    
6942        case OP_PROP:
6943        case OP_NOTPROP:
6944        if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6945        GETCHARINCTEST(c, eptr);
6946          {
6947          int chartype, rqdtype;
6948          int othercase;
6949          int category = ucp_findchar(c, &chartype, &othercase);
6950    
6951          rqdtype = *(++ecode);
6952          ecode++;
6953    
6954          if (rqdtype >= 128)
6955            {
6956            if ((rqdtype - 128 != category) == (op == OP_PROP))
6957              RRETURN(MATCH_NOMATCH);
6958            }
6959          else
6960            {
6961            if ((rqdtype != chartype) == (op == OP_PROP))
6962              RRETURN(MATCH_NOMATCH);
6963            }
6964          }
6965        break;
6966    
6967        /* Match an extended Unicode sequence. We will get here only if the support
6968        is in the binary; otherwise a compile-time error occurs. */
6969    
6970        case OP_EXTUNI:
6971        if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6972        GETCHARINCTEST(c, eptr);
6973          {
6974          int chartype;
6975          int othercase;
6976          int category = ucp_findchar(c, &chartype, &othercase);
6977          if (category == ucp_M) RRETURN(MATCH_NOMATCH);
6978          while (eptr < md->end_subject)
6979            {
6980            int len = 1;
6981            if (!md->utf8) c = *eptr; else
6982              {
6983              GETCHARLEN(c, eptr, len);
6984              }
6985            category = ucp_findchar(c, &chartype, &othercase);
6986            if (category != ucp_M) break;
6987            eptr += len;
6988            }
6989          }
6990        ecode++;
6991        break;
6992    #endif
6993    
6994    
6995      /* Match a back reference, possibly repeatedly. Look past the end of the      /* Match a back reference, possibly repeatedly. Look past the end of the
6996      item to see if there is repeat information following. The code is similar      item to see if there is repeat information following. The code is similar
6997      to that for character classes, but repeated for efficiency. Then obey      to that for character classes, but repeated for efficiency. Then obey
# Line 6438  for (;;) Line 7003  for (;;)
7003      case OP_REF:      case OP_REF:
7004        {        {
7005        offset = GET2(ecode, 1) << 1;               /* Doubled ref number */        offset = GET2(ecode, 1) << 1;               /* Doubled ref number */
7006        ecode += 3;                                     /* Advance past item */        ecode += 3;                                 /* Advance past item */
7007    
7008        /* If the reference is unset, set the length to be longer than the amount        /* If the reference is unset, set the length to be longer than the amount
7009        of subject left; this ensures that every attempt at a match fails. We        of subject left; this ensures that every attempt at a match fails. We
# Line 6540  for (;;) Line 7105  for (;;)
7105    
7106    
7107      /* Match a bit-mapped character class, possibly repeatedly. This op code is      /* Match a bit-mapped character class, possibly repeatedly. This op code is
7108      used when all the characters in the class have values in the range 0-255.      used when all the characters in the class have values in the range 0-255,
7109      The only difference between OP_CLASS and OP_NCLASS occurs when a data      and either the matching is caseful, or the characters are in the range
7110      character outside the range is encountered.      0-127 when UTF-8 processing is enabled. The only difference between
7111        OP_CLASS and OP_NCLASS occurs when a data character outside the range is
7112        encountered.
7113    
7114      First, look past the end of the item to see if there is repeat information      First, look past the end of the item to see if there is repeat information
7115      following. Then obey similar code to character type repeats - written out      following. Then obey similar code to character type repeats - written out
# Line 6814  for (;;) Line 7381  for (;;)
7381        }        }
7382  #endif    /* End of XCLASS */  #endif    /* End of XCLASS */
7383    
7384      /* Match a run of characters */      /* Match a single character, casefully */
7385    
7386        case OP_CHAR:
7387    #ifdef SUPPORT_UTF8
7388        if (md->utf8)
7389          {
7390          length = 1;
7391          ecode++;
7392          GETCHARLEN(fc, ecode, length);
7393          if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7394          while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
7395          }
7396        else
7397    #endif
7398    
7399      case OP_CHARS:      /* Non-UTF-8 mode */
7400        {        {
7401        register int slen = ecode[1];        if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
7402          if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
7403        ecode += 2;        ecode += 2;
7404          }
7405        break;
7406    
7407  #ifdef DEBUG    /* Sigh. Some compilers never learn. */      /* Match a single character, caselessly */
7408        if (eptr >= md->end_subject)  
7409          printf("matching subject <null> against pattern ");      case OP_CHARNC:
7410        else  #ifdef SUPPORT_UTF8
7411          {      if (md->utf8)
7412          printf("matching subject ");        {
7413          pchars(eptr, slen, TRUE, md);        length = 1;
7414          printf(" against pattern ");        ecode++;
7415          }        GETCHARLEN(fc, ecode, length);
7416        pchars(ecode, slen, FALSE, md);  
7417        printf("\n");        if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
 #endif  
7418    
7419        if (slen > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);        /* If the pattern character's value is < 128, we have only one byte, and
7420        if ((ims & PCRE_CASELESS) != 0)        can use the fast lookup table. */
7421    
7422          if (fc < 128)
7423          {          {
7424          while (slen-- > 0)          if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
           if (md->lcc[*ecode++] != md->lcc[*eptr++])  
             RRETURN(MATCH_NOMATCH);  
7425          }          }
7426    
7427          /* Otherwise we must pick up the subject character */
7428    
7429        else        else
7430          {          {
7431          while (slen-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);          int dc;
7432            GETCHARINC(dc, eptr);
7433            ecode += length;
7434    
7435            /* If we have Unicode property support, we can use it to test the other
7436            case of the character, if there is one. The result of ucp_findchar() is
7437            < 0 if the char isn't found, and othercase is returned as zero if there
7438            isn't one. */
7439    
7440            if (fc != dc)
7441              {
7442    #ifdef SUPPORT_UCP
7443              int chartype;
7444              int othercase;
7445              if (ucp_findchar(fc, &chartype, &othercase) < 0 || dc != othercase)
7446    #endif
7447                RRETURN(MATCH_NOMATCH);
7448              }
7449          }          }
7450        }        }
7451        else
7452    #endif   /* SUPPORT_UTF8 */
7453    
7454        /* Non-UTF-8 mode */
7455          {
7456          if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
7457          if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7458          ecode += 2;
7459          }
7460      break;      break;
7461    
7462      /* Match a single character repeatedly; different opcodes share code. */      /* Match a single character repeatedly; different opcodes share code. */
# Line 6889  for (;;) Line 7500  for (;;)
7500        if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);        if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7501        ecode += length;        ecode += length;
7502    
7503        /* Handle multibyte character matching specially here. There is no        /* Handle multibyte character matching specially here. There is
7504        support for any kind of casing for multibyte characters. */        support for caseless matching if UCP support is present. */
7505    
7506        if (length > 1)        if (length > 1)
7507          {          {
7508            int oclength = 0;
7509            uschar occhars[8];
7510    
7511    #ifdef SUPPORT_UCP
7512            int othercase;
7513            int chartype;
7514            if ((ims & PCRE_CASELESS) != 0 &&
7515                 ucp_findchar(fc, &chartype, &othercase) >= 0 &&
7516                 othercase > 0)
7517              oclength = ord2utf8(othercase, occhars);
7518    #endif  /* SUPPORT_UCP */
7519    
7520          for (i = 1; i <= min; i++)          for (i = 1; i <= min; i++)
7521            {            {
7522            if (memcmp(eptr, charptr, length) != 0) RRETURN(MATCH_NOMATCH);            if (memcmp(eptr, charptr, length) == 0) eptr += length;
7523            eptr += length;            /* Need braces because of following else */
7524              else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
7525              else
7526                {
7527                if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
7528                eptr += oclength;
7529                }
7530            }            }
7531    
7532          if (min == max) continue;          if (min == max) continue;
# Line 6908  for (;;) Line 7537  for (;;)
7537              {              {
7538              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7539              if (rrc != MATCH_NOMATCH) RRETURN(rrc);              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7540              if (fi >= max ||              if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7541                  eptr >= md->end_subject ||              if (memcmp(eptr, charptr, length) == 0) eptr += length;
7542                  memcmp(eptr, charptr, length) != 0)              /* Need braces because of following else */
7543                RRETURN(MATCH_NOMATCH);              else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
7544              eptr += length;              else
7545                  {
7546                  if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
7547                  eptr += oclength;
7548                  }
7549              }              }
7550            /* Control never gets here */            /* Control never gets here */
7551            }            }
# Line 6921  for (;;) Line 7554  for (;;)
7554            pp = eptr;            pp = eptr;
7555            for (i = min; i < max; i++)            for (i = min; i < max; i++)
7556              {              {
7557              if (eptr > md->end_subject - length ||              if (eptr > md->end_subject - length) break;
7558                  memcmp(eptr, charptr, length) != 0)              if (memcmp(eptr, charptr, length) == 0) eptr += length;
7559                break;              else if (oclength == 0) break;
7560              eptr += length;              else
7561                  {
7562                  if (memcmp(eptr, occhars, oclength) != 0) break;
7563                  eptr += oclength;
7564                  }
7565              }              }
7566            while (eptr >= pp)            while (eptr >= pp)
7567             {             {
# Line 6942  for (;;) Line 7579  for (;;)
7579        value of fc will always be < 128. */        value of fc will always be < 128. */
7580        }        }
7581      else      else
7582  #endif  #endif  /* SUPPORT_UTF8 */
7583    
7584      /* When not in UTF-8 mode, load a single-byte character. */      /* When not in UTF-8 mode, load a single-byte character. */
7585        {        {
# Line 7088  for (;;) Line 7725  for (;;)
7725      max = rep_max[c];                 /* zero for max => infinity */      max = rep_max[c];                 /* zero for max => infinity */
7726      if (max == 0) max = INT_MAX;      if (max == 0) max = INT_MAX;
7727    
7728      /* Common code for all repeated single-character (less than 255) matches.      /* Common code for all repeated single-byte matches. We can give up quickly
7729      We can give up quickly if there are fewer than the minimum number of      if there are fewer than the minimum number of bytes left in the
7730      characters left in the subject. */      subject. */
7731    
7732      REPEATNOTCHAR:      REPEATNOTCHAR:
7733      if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);      if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
# Line 7358  for (;;) Line 7995  for (;;)
7995      REPEATTYPE:      REPEATTYPE:
7996      ctype = *ecode++;      /* Code for the character type */      ctype = *ecode++;      /* Code for the character type */
7997    
7998    #ifdef SUPPORT_UCP
7999        if (ctype == OP_PROP || ctype == OP_NOTPROP)
8000          {
8001          prop_fail_result = ctype == OP_NOTPROP;
8002          prop_type = *ecode++;
8003          if (prop_type >= 128)
8004            {
8005            prop_test_against = prop_type - 128;
8006            prop_test_variable = &prop_category;
8007            }
8008          else
8009            {
8010            prop_test_against = prop_type;
8011            prop_test_variable = &prop_chartype;
8012            }
8013          }
8014        else prop_type = -1;
8015    #endif
8016    
8017      /* First, ensure the minimum number of matches are present. Use inline      /* First, ensure the minimum number of matches are present. Use inline
8018      code for maximizing the speed, and do the type test once at the start      code for maximizing the speed, and do the type test once at the start
8019      (i.e. keep it out of the loop). Also we can test that there are at least      (i.e. keep it out of the loop). Also we can test that there are at least
8020      the minimum number of bytes before we start. This isn't as effective in      the minimum number of bytes before we start. This isn't as effective in
8021      UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that      UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
8022      is tidier. */      is tidier. Also separate the UCP code, which can be the same for both UTF-8
8023        and single-bytes. */
8024    
8025      if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);      if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
8026      if (min > 0)      if (min > 0)
8027        {        {
8028    #ifdef SUPPORT_UCP
8029          if (prop_type > 0)
8030            {
8031            for (i = 1; i <= min; i++)
8032              {
8033              GETCHARINC(c, eptr);
8034              prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8035              if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8036                RRETURN(MATCH_NOMATCH);
8037              }
8038            }
8039    
8040          /* Match extended Unicode sequences. We will get here only if the
8041          support is in the binary; otherwise a compile-time error occurs. */
8042    
8043          else if (ctype == OP_EXTUNI)
8044            {
8045            for (i = 1; i <= min; i++)
8046              {
8047              GETCHARINCTEST(c, eptr);
8048              prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8049              if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
8050              while (eptr < md->end_subject)
8051                {
8052                int len = 1;
8053                if (!md->utf8) c = *eptr; else
8054                  {
8055                  GETCHARLEN(c, eptr, len);
8056                  }
8057                prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8058                if (prop_category != ucp_M) break;
8059                eptr += len;
8060                }
8061              }
8062            }
8063    
8064          else
8065    #endif     /* SUPPORT_UCP */
8066    
8067    /* Handle all other cases when the coding is UTF-8 */
8068    
8069  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
8070        if (md->utf8) switch(ctype)        if (md->utf8) switch(ctype)
8071          {          {
# Line 7390  for (;;) Line 8088  for (;;)
8088            {            {
8089            if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);            if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8090            GETCHARINC(c, eptr);            GETCHARINC(c, eptr);
8091            if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)            if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
8092              RRETURN(MATCH_NOMATCH);              RRETURN(MATCH_NOMATCH);
8093            }            }
8094          break;          break;
# Line 7444  for (;;) Line 8142  for (;;)
8142            /* No need to skip more bytes - we know it's a 1-byte character */            /* No need to skip more bytes - we know it's a 1-byte character */
8143            }            }
8144          break;          break;
8145          }  
8146            default:
8147            RRETURN(PCRE_ERROR_INTERNAL);
8148            }  /* End switch(ctype) */
8149    
8150        else        else
8151  #endif  #endif     /* SUPPORT_UTF8 */
8152    
8153        /* Code for the non-UTF-8 case for minimum matching */        /* Code for the non-UTF-8 case for minimum matching of operators other
8154          than OP_PROP and OP_NOTPROP. */
8155    
8156        switch(ctype)        switch(ctype)
8157          {          {
# Line 7496  for (;;) Line 8199  for (;;)
8199            if ((md->ctypes[*eptr++] & ctype_word) == 0)            if ((md->ctypes[*eptr++] & ctype_word) == 0)
8200              RRETURN(MATCH_NOMATCH);              RRETURN(MATCH_NOMATCH);
8201          break;          break;
8202    
8203            default:
8204            RRETURN(PCRE_ERROR_INTERNAL);
8205          }          }
8206        }        }
8207    
# Line 7504  for (;;) Line 8210  for (;;)
8210      if (min == max) continue;      if (min == max) continue;
8211    
8212      /* If minimizing, we have to test the rest of the pattern before each      /* If minimizing, we have to test the rest of the pattern before each
8213      subsequent match. Again, separate the UTF-8 case for speed. */      subsequent match. Again, separate the UTF-8 case for speed, and also
8214        separate the UCP cases. */
8215    
8216      if (minimize)      if (minimize)
8217        {        {
8218    #ifdef SUPPORT_UCP
8219          if (prop_type > 0)
8220            {
8221            for (fi = min;; fi++)
8222              {
8223              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8224              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8225              if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8226              GETCHARINC(c, eptr);
8227              prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8228              if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8229                RRETURN(MATCH_NOMATCH);
8230              }
8231            }
8232    
8233          /* Match extended Unicode sequences. We will get here only if the
8234          support is in the binary; otherwise a compile-time error occurs. */
8235    
8236          else if (ctype == OP_EXTUNI)
8237            {
8238            for (fi = min;; fi++)
8239              {
8240              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8241              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8242              if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8243              GETCHARINCTEST(c, eptr);
8244              prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8245              if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
8246              while (eptr < md->end_subject)
8247                {
8248                int len = 1;
8249                if (!md->utf8) c = *eptr; else
8250                  {
8251                  GETCHARLEN(c, eptr, len);
8252                  }
8253                prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8254                if (prop_category != ucp_M) break;
8255                eptr += len;
8256                }
8257              }
8258            }
8259    
8260          else
8261    #endif     /* SUPPORT_UCP */
8262    
8263  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
8264        /* UTF-8 mode */        /* UTF-8 mode */
8265        if (md->utf8)        if (md->utf8)
# Line 7557  for (;;) Line 8309  for (;;)
8309              if (c >= 256 && (md->ctypes[c] & ctype_word) == 0)              if (c >= 256 && (md->ctypes[c] & ctype_word) == 0)
8310                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
8311              break;              break;
8312    
8313                default:
8314                RRETURN(PCRE_ERROR_INTERNAL);
8315              }              }
8316            }            }
8317          }          }
# Line 7602  for (;;) Line 8357  for (;;)
8357              case OP_WORDCHAR:              case OP_WORDCHAR:
8358              if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);              if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
8359              break;              break;
8360    
8361                default:
8362                RRETURN(PCRE_ERROR_INTERNAL);
8363              }              }
8364            }            }
8365          }          }
# Line 7610  for (;;) Line 8368  for (;;)
8368    
8369      /* If maximizing it is worth using inline code for speed, doing the type      /* If maximizing it is worth using inline code for speed, doing the type
8370      test once at the start (i.e. keep it out of the loop). Again, keep the      test once at the start (i.e. keep it out of the loop). Again, keep the
8371      UTF-8 stuff separate. */      UTF-8 and UCP stuff separate. */
8372    
8373      else      else
8374        {        {
8375        pp = eptr;        pp = eptr;  /* Remember where we started */
8376    
8377    #ifdef SUPPORT_UCP
8378          if (prop_type > 0)
8379            {
8380            for (i = min; i < max; i++)
8381              {
8382              int len = 1;
8383              if (eptr >= md->end_subject) break;
8384              GETCHARLEN(c, eptr, len);
8385              prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8386              if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8387                break;
8388              eptr+= len;
8389              }
8390    
8391            /* eptr is now past the end of the maximum run */
8392    
8393            for(;;)
8394              {
8395              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8396              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8397              if (eptr-- == pp) break;        /* Stop if tried at original pos */
8398              BACKCHAR(eptr);
8399              }
8400            }
8401    
8402          /* Match extended Unicode sequences. We will get here only if the
8403          support is in the binary; otherwise a compile-time error occurs. */
8404    
8405          else if (ctype == OP_EXTUNI)
8406            {
8407            for (i = min; i < max; i++)
8408              {
8409              if (eptr >= md->end_subject) break;
8410              GETCHARINCTEST(c, eptr);
8411              prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8412              if (prop_category == ucp_M) break;
8413              while (eptr < md->end_subject)
8414                {
8415                int len = 1;
8416                if (!md->utf8) c = *eptr; else
8417                  {
8418                  GETCHARLEN(c, eptr, len);
8419                  }
8420                prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8421                if (prop_category != ucp_M) break;
8422                eptr += len;
8423                }
8424              }
8425    
8426            /* eptr is now past the end of the maximum run */
8427    
8428            for(;;)
8429              {
8430              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8431              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8432              if (eptr-- == pp) break;        /* Stop if tried at original pos */
8433              for (;;)                        /* Move back over one extended */
8434                {
8435                int len = 1;
8436                BACKCHAR(eptr);
8437                if (!md->utf8) c = *eptr; else
8438                  {
8439                  GETCHARLEN(c, eptr, len);
8440                  }
8441                prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8442                if (prop_category != ucp_M) break;
8443                eptr--;
8444                }
8445              }
8446            }
8447    
8448          else
8449    #endif   /* SUPPORT_UCP */
8450    
8451  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
8452        /* UTF-8 mode */        /* UTF-8 mode */
# Line 7745  for (;;) Line 8577  for (;;)
8577              eptr+= len;              eptr+= len;
8578              }              }
8579            break;            break;
8580    
8581              default:
8582              RRETURN(PCRE_ERROR_INTERNAL);
8583            }            }
8584    
8585          /* eptr is now past the end of the maximum run */          /* eptr is now past the end of the maximum run */
# Line 7835  for (;;) Line 8670  for (;;)
8670              eptr++;              eptr++;
8671              }              }
8672            break;            break;
8673    
8674              default:
8675              RRETURN(PCRE_ERROR_INTERNAL);
8676            }            }
8677    
8678          /* eptr is now past the end of the maximum run */          /* eptr is now past the end of the maximum run */
# Line 7889  Undefine all the macros that were define Line 8727  Undefine all the macros that were define
8727  #undef callpat  #undef callpat
8728  #undef charptr  #undef charptr
8729  #undef data  #undef data
 #undef lastptr  
8730  #undef next  #undef next
8731  #undef pp  #undef pp
8732  #undef prev  #undef prev
# Line 7940  portions of the string if it matches. Tw Line 8777  portions of the string if it matches. Tw
8777  each substring: the offsets to the start and end of the substring.  each substring: the offsets to the start and end of the substring.
8778    
8779  Arguments:  Arguments:
8780    external_re     points to the compiled expression    argument_re     points to the compiled expression
8781    extra_data      points to extra data or is NULL    extra_data      points to extra data or is NULL
8782    subject         points to the subject string    subject         points to the subject string
8783    length          length of subject string (may contain binary zeros)    length          length of subject string (may contain binary zeros)
# Line 7956  Returns:          > 0 => success; value Line 8793  Returns:          > 0 => success; value
8793  */  */
8794    
8795  EXPORT int  EXPORT int
8796  pcre_exec(const pcre *external_re, const pcre_extra *extra_data,  pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
8797    const char *subject, int length, int start_offset, int options, int *offsets,    const char *subject, int length, int start_offset, int options, int *offsets,
8798    int offsetcount)    int offsetcount)
8799  {  {
# Line 7971  BOOL startline; Line 8808  BOOL startline;
8808  BOOL first_byte_caseless = FALSE;  BOOL first_byte_caseless = FALSE;
8809  BOOL req_byte_caseless = FALSE;  BOOL req_byte_caseless = FALSE;
8810  match_data match_block;  match_data match_block;
8811    const uschar *tables;
8812  const uschar *start_bits = NULL;  const uschar *start_bits = NULL;
8813  const uschar *start_match = (const uschar *)subject + start_offset;  const uschar *start_match = (const uschar *)subject + start_offset;
8814  const uschar *end_subject;  const uschar *end_subject;
8815  const uschar *req_byte_ptr = start_match - 1;  const uschar *req_byte_ptr = start_match - 1;
8816    
8817    pcre_study_data internal_study;
8818  const pcre_study_data *study;  const pcre_study_data *study;
8819  const real_pcre *re = (const real_pcre *)external_re;  
8820    real_pcre internal_re;
8821    const real_pcre *external_re = (const real_pcre *)argument_re;
8822    const real_pcre *re = external_re;
8823    
8824  /* Plausibility checks */  /* Plausibility checks */
8825    
8826  if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;  if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
8827  if (re == NULL || subject == NULL ||  if (re == NULL || subject == NULL ||
8828     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
8829    if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
8830    
8831  /* Fish out the optional data from the extra_data structure, first setting  /* Fish out the optional data from the extra_data structure, first setting
8832  the default values. */  the default values. */
# Line 7991  study = NULL; Line 8835  study = NULL;
8835  match_block.match_limit = MATCH_LIMIT;  match_block.match_limit = MATCH_LIMIT;
8836  match_block.callout_data = NULL;  match_block.callout_data = NULL;
8837    
8838    /* The table pointer is always in native byte order. */
8839    
8840    tables = external_re->tables;
8841    
8842  if (extra_data != NULL)  if (extra_data != NULL)
8843    {    {
8844    register unsigned int flags = extra_data->flags;    register unsigned int flags = extra_data->flags;
# Line 8000  if (extra_data != NULL) Line 8848  if (extra_data != NULL)
8848      match_block.match_limit = extra_data->match_limit;      match_block.match_limit = extra_data->match_limit;
8849    if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)    if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
8850      match_block.callout_data = extra_data->callout_data;      match_block.callout_data = extra_data->callout_data;
8851      if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
8852    }    }
8853    
8854  /* Now we have re supposedly pointing to the regex */  /* If the exec call supplied NULL for tables, use the inbuilt ones. This
8855    is a feature that makes it possible to save compiled regex and re-use them
8856    in other programs later. */
8857    
8858    if (tables == NULL) tables = pcre_default_tables;
8859    
8860    /* Check that the first field in the block is the magic number. If it is not,
8861    test for a regex that was compiled on a host of opposite endianness. If this is
8862    the case, flipped values are put in internal_re and internal_study if there was
8863    study data too. */
8864    
8865    if (re->magic_number != MAGIC_NUMBER)
8866      {
8867      re = try_flipped(re, &internal_re, study, &internal_study);
8868      if (re == NULL) return PCRE_ERROR_BADMAGIC;
8869      if (study != NULL) study = &internal_study;
8870      }
8871    
8872  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  /* Set up other data */
8873    
8874  anchored = ((re->options | options) & PCRE_ANCHORED) != 0;  anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
8875  startline = (re->options & PCRE_STARTLINE) != 0;  startline = (re->options & PCRE_STARTLINE) != 0;
8876    
8877  match_block.start_code =  /* The code starts after the real_pcre block and the capture name table. */
8878    (const uschar *)re + sizeof(real_pcre) + re->name_count * re->name_entry_size;  
8879    match_block.start_code = (const uschar *)external_re + re->name_table_offset +
8880      re->name_count * re->name_entry_size;
8881    
8882  match_block.start_subject = (const uschar *)subject;  match_block.start_subject = (const uschar *)subject;
8883  match_block.start_offset = start_offset;  match_block.start_offset = start_offset;
8884  match_block.end_subject = match_block.start_subject + length;  match_block.end_subject = match_block.start_subject + length;
# Line 8022  match_block.utf8 = (re->options & PCRE_U Line 8890  match_block.utf8 = (re->options & PCRE_U
8890  match_block.notbol = (options & PCRE_NOTBOL) != 0;  match_block.notbol = (options & PCRE_NOTBOL) != 0;
8891  match_block.noteol = (options & PCRE_NOTEOL) != 0;  match_block.noteol = (options & PCRE_NOTEOL) != 0;
8892  match_block.notempty = (options & PCRE_NOTEMPTY) != 0;  match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
8893    match_block.partial = (options & PCRE_PARTIAL) != 0;
8894    match_block.hitend = FALSE;
8895    
8896  match_block.recursive = NULL;                   /* No recursion at top level */  match_block.recursive = NULL;                   /* No recursion at top level */
8897    
8898  match_block.lcc = re->tables + lcc_offset;  match_block.lcc = tables + lcc_offset;
8899  match_block.ctypes = re->tables + ctypes_offset;  match_block.ctypes = tables + ctypes_offset;
8900    
8901    /* Partial matching is supported only for a restricted set of regexes at the
8902    moment. */
8903    
8904    if (match_block.partial && (re->options & PCRE_NOPARTIAL) != 0)
8905      return PCRE_ERROR_BADPARTIAL;
8906    
8907  /* Check a UTF-8 string if required. Unfortunately there's no way of passing  /* Check a UTF-8 string if required. Unfortunately there's no way of passing
8908  back the character offset. */  back the character offset. */
# Line 8055  restoring at the exit of a group is easy Line 8931  restoring at the exit of a group is easy
8931  ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);  ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
8932    
8933  /* If the expression has got more back references than the offsets supplied can  /* If the expression has got more back references than the offsets supplied can
8934  hold, we get a temporary bit of working store to use during the matching.  hold, we get a temporary chunk of working store to use during the matching.
8935  Otherwise, we can use the vector supplied, rounding down its size to a multiple  Otherwise, we can use the vector supplied, rounding down its size to a multiple
8936  of 3. */  of 3. */
8937    
# Line 8121  if ((re->options & PCRE_REQCHSET) != 0) Line 8997  if ((re->options & PCRE_REQCHSET) != 0)
8997    {    {
8998    req_byte = re->req_byte & 255;    req_byte = re->req_byte & 255;
8999    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
9000    req_byte2 = (re->tables + fcc_offset)[req_byte];  /* case flipped */    req_byte2 = (tables + fcc_offset)[req_byte];  /* case flipped */
9001    }    }
9002    
9003  /* Loop for handling unanchored repeated matching attempts; for anchored regexs  /* Loop for handling unanchored repeated matching attempts; for anchored regexs
# Line 8129  the loop runs just once. */ Line 9005  the loop runs just once. */
9005    
9006  do  do
9007    {    {
   register int *iptr = match_block.offset_vector;  
   register int *iend = iptr + resetcount;  
   
9008    /* Reset the maximum number of extractions we might see. */    /* Reset the maximum number of extractions we might see. */
9009    
9010    while (iptr < iend) *iptr++ = -1;    if (match_block.offset_vector != NULL)
9011        {
9012        register int *iptr = match_block.offset_vector;
9013        register int *iend = iptr + resetcount;
9014        while (iptr < iend) *iptr++ = -1;
9015        }
9016    
9017    /* Advance to a unique first char if possible */    /* Advance to a unique first char if possible */
9018    
# Line 8166  do Line 9044  do
9044      {      {
9045      while (start_match < end_subject)      while (start_match < end_subject)
9046        {        {
9047        register int c = *start_match;        register unsigned int c = *start_match;
9048        if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;        if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
9049        }        }
9050      }      }
# Line 8188  do Line 9066  do
9066    HOWEVER: when the subject string is very, very long, searching to its end can    HOWEVER: when the subject string is very, very long, searching to its end can
9067    take a long time, and give bad performance on quite ordinary patterns. This    take a long time, and give bad performance on quite ordinary patterns. This
9068    showed up when somebody was matching /^C/ on a 32-megabyte string... so we    showed up when somebody was matching /^C/ on a 32-megabyte string... so we
9069    don't do this when the string is sufficiently long. */    don't do this when the string is sufficiently long.
9070    
9071    if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)    ALSO: this processing is disabled when partial matching is requested.
9072      */
9073    
9074      if (req_byte >= 0 &&
9075          end_subject - start_match < REQ_BYTE_MAX &&
9076          !match_block.partial)
9077      {      {
9078      register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0);      register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0);
9079    
# Line 8245  do Line 9128  do
9128      start_match++;      start_match++;
9129  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
9130      if (match_block.utf8)      if (match_block.utf8)
9131        while((*start_match & 0xc0) == 0x80) start_match++;        while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
9132            start_match++;
9133  #endif  #endif
9134      continue;      continue;
9135      }      }
# Line 8296  if (using_temporary_offsets) Line 9180  if (using_temporary_offsets)
9180    (pcre_free)(match_block.offset_vector);    (pcre_free)(match_block.offset_vector);
9181    }    }
9182    
9183  DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));  if (match_block.partial && match_block.hitend)
9184      {
9185  return PCRE_ERROR_NOMATCH;    DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
9186      return PCRE_ERROR_PARTIAL;
9187      }
9188    else
9189      {
9190      DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
9191      return PCRE_ERROR_NOMATCH;
9192      }
9193  }  }
9194    
9195  /* End of pcre.c */  /* End of pcre.c */

Legend:
Removed from v.74  
changed lines
  Added in v.75

  ViewVC Help
Powered by ViewVC 1.1.5