/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 391 by ph10, Tue Mar 17 21:16:01 2009 UTC revision 507 by ph10, Wed Mar 10 16:08:01 2010 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2009 University of Cambridge             Copyright (c) 1997-2010 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 53  supporting internal functions that are n Line 53  supporting internal functions that are n
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
56  /* When DEBUG is defined, we need the pcre_printint() function, which is also  /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57  used by pcretest. DEBUG is not defined when building a production library. */  also used by pcretest. PCRE_DEBUG is not defined when building a production
58    library. */
59    
60  #ifdef DEBUG  #ifdef PCRE_DEBUG
61  #include "pcre_printint.src"  #include "pcre_printint.src"
62  #endif  #endif
63    
# Line 91  is 4 there is plenty of room. */ Line 92  is 4 there is plenty of room. */
92    
93  #define COMPILE_WORK_SIZE (4096)  #define COMPILE_WORK_SIZE (4096)
94    
95    /* The overrun tests check for a slightly smaller size so that they detect the
96    overrun before it actually does run off the end of the data block. */
97    
98    #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
99    
100    
101  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
102  are simple data values; negative values are for special things like \d and so  are simple data values; negative values are for special things like \d and so
# Line 100  is invalid. */ Line 106  is invalid. */
106  #ifndef EBCDIC  #ifndef EBCDIC
107    
108  /* This is the "normal" table for ASCII systems or for EBCDIC systems running  /* This is the "normal" table for ASCII systems or for EBCDIC systems running
109  in UTF-8 mode. */  in UTF-8 mode. */
110    
111  static const short int escapes[] = {  static const short int escapes[] = {
112       0,                       0,       0,                       0,
113         0,                       0,
114         0,                       0,
115       0,                       0,       0,                       0,
      0,                       0,  
116       0,                       0,       0,                       0,
      0,                       0,  
117       CHAR_COLON,              CHAR_SEMICOLON,       CHAR_COLON,              CHAR_SEMICOLON,
118       CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,       CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
119       CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,       CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
120       CHAR_COMMERCIAL_AT,      -ESC_A,       CHAR_COMMERCIAL_AT,      -ESC_A,
121       -ESC_B,                  -ESC_C,       -ESC_B,                  -ESC_C,
122       -ESC_D,                  -ESC_E,       -ESC_D,                  -ESC_E,
123       0,                       -ESC_G,       0,                       -ESC_G,
124       -ESC_H,                  0,       -ESC_H,                  0,
125       0,                       -ESC_K,       0,                       -ESC_K,
126       0,                       0,       0,                       0,
127       0,                       0,       0,                       0,
128       -ESC_P,                  -ESC_Q,       -ESC_P,                  -ESC_Q,
129       -ESC_R,                  -ESC_S,       -ESC_R,                  -ESC_S,
130       0,                       0,       0,                       0,
131       -ESC_V,                  -ESC_W,       -ESC_V,                  -ESC_W,
132       -ESC_X,                  0,       -ESC_X,                  0,
133       -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,       -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
134       CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,       CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
135       CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,       CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
136       CHAR_GRAVE_ACCENT,       7,       CHAR_GRAVE_ACCENT,       7,
137       -ESC_b,                  0,       -ESC_b,                  0,
138       -ESC_d,                  ESC_e,       -ESC_d,                  ESC_e,
139       ESC_f,                   0,       ESC_f,                   0,
140       -ESC_h,                  0,       -ESC_h,                  0,
141       0,                       -ESC_k,       0,                       -ESC_k,
142       0,                       0,       0,                       0,
143       ESC_n,                   0,       ESC_n,                   0,
144       -ESC_p,                  0,       -ESC_p,                  0,
145       ESC_r,                   -ESC_s,       ESC_r,                   -ESC_s,
146       ESC_tee,                 0,       ESC_tee,                 0,
147       -ESC_v,                  -ESC_w,       -ESC_v,                  -ESC_w,
148       0,                       0,       0,                       0,
149       -ESC_z       -ESC_z
150  };  };
151    
152  #else  #else
153    
154  /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */  /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
155    
# Line 177  static const short int escapes[] = { Line 183  static const short int escapes[] = {
183    
184  /* Table of special "verbs" like (*PRUNE). This is a short table, so it is  /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
185  searched linearly. Put all the names into a single string, in order to reduce  searched linearly. Put all the names into a single string, in order to reduce
186  the number of relocations when a shared library is dynamically linked. The  the number of relocations when a shared library is dynamically linked. The
187  string is built from string macros so that it works in UTF-8 mode on EBCDIC  string is built from string macros so that it works in UTF-8 mode on EBCDIC
188  platforms. */  platforms. */
189    
190  typedef struct verbitem {  typedef struct verbitem {
# Line 215  length entry. The first three must be al Line 221  length entry. The first three must be al
221  for handling case independence. */  for handling case independence. */
222    
223  static const char posix_names[] =  static const char posix_names[] =
224    STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0    STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
225    STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0    STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
226    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
227    STRING_word0  STRING_xdigit;    STRING_word0  STRING_xdigit;
228    
# Line 262  the number of relocations needed when a Line 268  the number of relocations needed when a
268  it is now one long string. We cannot use a table of offsets, because the  it is now one long string. We cannot use a table of offsets, because the
269  lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we  lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
270  simply count through to the one we want - this isn't a performance issue  simply count through to the one we want - this isn't a performance issue
271  because these strings are used only when there is a compilation error. */  because these strings are used only when there is a compilation error.
272    
273    Each substring ends with \0 to insert a null character. This includes the final
274    substring, so that the whole string ends with \0\0, which can be detected when
275    counting through. */
276    
277  static const char error_texts[] =  static const char error_texts[] =
278    "no error\0"    "no error\0"
# Line 341  static const char error_texts[] = Line 351  static const char error_texts[] =
351    "number is too big\0"    "number is too big\0"
352    "subpattern name expected\0"    "subpattern name expected\0"
353    "digit expected after (?+\0"    "digit expected after (?+\0"
354    "] is an invalid data character in JavaScript compatibility mode";    "] is an invalid data character in JavaScript compatibility mode\0"
355      /* 65 */
356      "different names for subpatterns of the same number are not allowed\0";
357    
358  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
359  patterns. Note that the tables in chartables are dependent on the locale, and  patterns. Note that the tables in chartables are dependent on the locale, and
# Line 360  For convenience, we use the same bit def Line 371  For convenience, we use the same bit def
371    
372  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
373    
374  #ifndef EBCDIC  #ifndef EBCDIC
375    
376  /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in  /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
377  UTF-8 mode. */  UTF-8 mode. */
378    
379  static const unsigned char digitab[] =  static const unsigned char digitab[] =
# Line 400  static const unsigned char digitab[] = Line 411  static const unsigned char digitab[] =
411    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
412    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
413    
414  #else  #else
415    
416  /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */  /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
417    
# Line 500  static const char * Line 511  static const char *
511  find_error_text(int n)  find_error_text(int n)
512  {  {
513  const char *s = error_texts;  const char *s = error_texts;
514  for (; n > 0; n--) while (*s++ != 0) {};  for (; n > 0; n--)
515      {
516      while (*s++ != 0) {};
517      if (*s == 0) return "Error text not found (please report)";
518      }
519  return s;  return s;
520  }  }
521    
# Line 1009  return p; Line 1024  return p;
1024    
1025    
1026  /*************************************************  /*************************************************
1027  *       Find forward referenced subpattern       *  *  Subroutine for finding forward reference      *
1028  *************************************************/  *************************************************/
1029    
1030  /* This function scans along a pattern's text looking for capturing  /* This recursive function is called only from find_parens() below. The
1031    top-level call starts at the beginning of the pattern. All other calls must
1032    start at a parenthesis. It scans along a pattern's text looking for capturing
1033  subpatterns, and counting them. If it finds a named pattern that matches the  subpatterns, and counting them. If it finds a named pattern that matches the
1034  name it is given, it returns its number. Alternatively, if the name is NULL, it  name it is given, it returns its number. Alternatively, if the name is NULL, it
1035  returns when it reaches a given numbered subpattern. This is used for forward  returns when it reaches a given numbered subpattern. We know that if (?P< is
1036  references to subpatterns. We know that if (?P< is encountered, the name will  encountered, the name will be terminated by '>' because that is checked in the
1037  be terminated by '>' because that is checked in the first pass.  first pass. Recursion is used to keep track of subpatterns that reset the
1038    capturing group numbers - the (?| feature.
1039    
1040  Arguments:  Arguments:
1041    ptr          current position in the pattern    ptrptr       address of the current character pointer (updated)
1042    cd           compile background data    cd           compile background data
1043    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1044    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1045    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1046      count        pointer to the current capturing subpattern number (updated)
1047    
1048  Returns:       the number of the named subpattern, or -1 if not found  Returns:       the number of the named subpattern, or -1 if not found
1049  */  */
1050    
1051  static int  static int
1052  find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn,  find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1053    BOOL xmode)    BOOL xmode, int *count)
1054  {  {
1055  const uschar *thisname;  uschar *ptr = *ptrptr;
1056  int count = cd->bracount;  int start_count = *count;
1057    int hwm_count = start_count;
1058    BOOL dup_parens = FALSE;
1059    
1060  for (; *ptr != 0; ptr++)  /* If the first character is a parenthesis, check on the type of group we are
1061    dealing with. The very first call may not start with a parenthesis. */
1062    
1063    if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1064    {    {
1065    int term;    if (ptr[1] == CHAR_QUESTION_MARK &&
1066          ptr[2] == CHAR_VERTICAL_LINE)
1067        {
1068        ptr += 3;
1069        dup_parens = TRUE;
1070        }
1071    
1072      /* Handle a normal, unnamed capturing parenthesis */
1073    
1074      else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1075        {
1076        *count += 1;
1077        if (name == NULL && *count == lorn) return *count;
1078        ptr++;
1079        }
1080    
1081      /* Handle a condition. If it is an assertion, just carry on so that it
1082      is processed as normal. If not, skip to the closing parenthesis of the
1083      condition (there can't be any nested parens). */
1084    
1085      else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1086        {
1087        ptr += 2;
1088        if (ptr[1] != CHAR_QUESTION_MARK)
1089          {
1090          while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1091          if (*ptr != 0) ptr++;
1092          }
1093        }
1094    
1095      /* We have either (? or (* and not a condition */
1096    
1097      else
1098        {
1099        ptr += 2;
1100        if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */
1101    
1102        /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1103    
1104        if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1105            ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1106          {
1107          int term;
1108          const uschar *thisname;
1109          *count += 1;
1110          if (name == NULL && *count == lorn) return *count;
1111          term = *ptr++;
1112          if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1113          thisname = ptr;
1114          while (*ptr != term) ptr++;
1115          if (name != NULL && lorn == ptr - thisname &&
1116              strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1117            return *count;
1118          term++;
1119          }
1120        }
1121      }
1122    
1123    /* Past any initial parenthesis handling, scan for parentheses or vertical
1124    bars. */
1125    
1126    for (; *ptr != 0; ptr++)
1127      {
1128    /* Skip over backslashed characters and also entire \Q...\E */    /* Skip over backslashed characters and also entire \Q...\E */
1129    
1130    if (*ptr == CHAR_BACKSLASH)    if (*ptr == CHAR_BACKSLASH)
1131      {      {
1132      if (*(++ptr) == 0) return -1;      if (*(++ptr) == 0) goto FAIL_EXIT;
1133      if (*ptr == CHAR_Q) for (;;)      if (*ptr == CHAR_Q) for (;;)
1134        {        {
1135        while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};        while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1136        if (*ptr == 0) return -1;        if (*ptr == 0) goto FAIL_EXIT;
1137        if (*(++ptr) == CHAR_E) break;        if (*(++ptr) == CHAR_E) break;
1138        }        }
1139      continue;      continue;
# Line 1057  for (; *ptr != 0; ptr++) Line 1142  for (; *ptr != 0; ptr++)
1142    /* Skip over character classes; this logic must be similar to the way they    /* Skip over character classes; this logic must be similar to the way they
1143    are handled for real. If the first character is '^', skip it. Also, if the    are handled for real. If the first character is '^', skip it. Also, if the
1144    first few characters (either before or after ^) are \Q\E or \E we skip them    first few characters (either before or after ^) are \Q\E or \E we skip them
1145    too. This makes for compatibility with Perl. Note the use of STR macros to    too. This makes for compatibility with Perl. Note the use of STR macros to
1146    encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */    encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1147    
1148    if (*ptr == CHAR_LEFT_SQUARE_BRACKET)    if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
# Line 1065  for (; *ptr != 0; ptr++) Line 1150  for (; *ptr != 0; ptr++)
1150      BOOL negate_class = FALSE;      BOOL negate_class = FALSE;
1151      for (;;)      for (;;)
1152        {        {
1153        int c = *(++ptr);        if (ptr[1] == CHAR_BACKSLASH)
       if (c == CHAR_BACKSLASH)  
1154          {          {
1155          if (ptr[1] == CHAR_E)          if (ptr[2] == CHAR_E)
1156            ptr++;            ptr+= 2;
1157          else if (strncmp((const char *)ptr+1,          else if (strncmp((const char *)ptr+2,
1158                   STR_Q STR_BACKSLASH STR_E, 3) == 0)                   STR_Q STR_BACKSLASH STR_E, 3) == 0)
1159            ptr += 3;            ptr += 4;
1160          else          else
1161            break;            break;
1162          }          }
1163        else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)        else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1164            {
1165          negate_class = TRUE;          negate_class = TRUE;
1166            ptr++;
1167            }
1168        else break;        else break;
1169        }        }
1170    
1171      /* If the next character is ']', it is a data character that must be      /* If the next character is ']', it is a data character that must be
1172      skipped, except in JavaScript compatibility mode. */      skipped, except in JavaScript compatibility mode. */
1173    
1174      if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&      if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1175          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1176        ptr++;        ptr++;
1177    
# Line 1093  for (; *ptr != 0; ptr++) Line 1180  for (; *ptr != 0; ptr++)
1180        if (*ptr == 0) return -1;        if (*ptr == 0) return -1;
1181        if (*ptr == CHAR_BACKSLASH)        if (*ptr == CHAR_BACKSLASH)
1182          {          {
1183          if (*(++ptr) == 0) return -1;          if (*(++ptr) == 0) goto FAIL_EXIT;
1184          if (*ptr == CHAR_Q) for (;;)          if (*ptr == CHAR_Q) for (;;)
1185            {            {
1186            while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};            while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1187            if (*ptr == 0) return -1;            if (*ptr == 0) goto FAIL_EXIT;
1188            if (*(++ptr) == CHAR_E) break;            if (*(++ptr) == CHAR_E) break;
1189            }            }
1190          continue;          continue;
# Line 1111  for (; *ptr != 0; ptr++) Line 1198  for (; *ptr != 0; ptr++)
1198    if (xmode && *ptr == CHAR_NUMBER_SIGN)    if (xmode && *ptr == CHAR_NUMBER_SIGN)
1199      {      {
1200      while (*(++ptr) != 0 && *ptr != CHAR_NL) {};      while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1201      if (*ptr == 0) return -1;      if (*ptr == 0) goto FAIL_EXIT;
1202      continue;      continue;
1203      }      }
1204    
1205    /* An opening parens must now be a real metacharacter */    /* Check for the special metacharacters */
1206    
1207    if (*ptr != CHAR_LEFT_PARENTHESIS) continue;    if (*ptr == CHAR_LEFT_PARENTHESIS)
   if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)  
1208      {      {
1209      count++;      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1210      if (name == NULL && count == lorn) return count;      if (rc > 0) return rc;
1211      continue;      if (*ptr == 0) goto FAIL_EXIT;
1212      }      }
1213    
1214    ptr += 2;    else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1215    if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */      {
1216        if (dup_parens && *count < hwm_count) *count = hwm_count;
1217        *ptrptr = ptr;
1218        return -1;
1219        }
1220    
1221      else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1222        {
1223        if (*count > hwm_count) hwm_count = *count;
1224        *count = start_count;
1225        }
1226      }
1227    
1228    FAIL_EXIT:
1229    *ptrptr = ptr;
1230    return -1;
1231    }
1232    
   /* We have to disambiguate (?<! and (?<= from (?<name> */  
1233    
   if ((*ptr != CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_EXCLAMATION_MARK ||  
       ptr[1] == CHAR_EQUALS_SIGN) && *ptr != CHAR_APOSTROPHE)  
     continue;  
1234    
   count++;  
1235    
1236    if (name == NULL && count == lorn) return count;  /*************************************************
1237    term = *ptr++;  *       Find forward referenced subpattern       *
1238    if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;  *************************************************/
1239    thisname = ptr;  
1240    while (*ptr != term) ptr++;  /* This function scans along a pattern's text looking for capturing
1241    if (name != NULL && lorn == ptr - thisname &&  subpatterns, and counting them. If it finds a named pattern that matches the
1242        strncmp((const char *)name, (const char *)thisname, lorn) == 0)  name it is given, it returns its number. Alternatively, if the name is NULL, it
1243      return count;  returns when it reaches a given numbered subpattern. This is used for forward
1244    references to subpatterns. We used to be able to start this scan from the
1245    current compiling point, using the current count value from cd->bracount, and
1246    do it all in a single loop, but the addition of the possibility of duplicate
1247    subpattern numbers means that we have to scan from the very start, in order to
1248    take account of such duplicates, and to use a recursive function to keep track
1249    of the different types of group.
1250    
1251    Arguments:
1252      cd           compile background data
1253      name         name to seek, or NULL if seeking a numbered subpattern
1254      lorn         name length, or subpattern number if name is NULL
1255      xmode        TRUE if we are in /x mode
1256    
1257    Returns:       the number of the found subpattern, or -1 if not found
1258    */
1259    
1260    static int
1261    find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1262    {
1263    uschar *ptr = (uschar *)cd->start_pattern;
1264    int count = 0;
1265    int rc;
1266    
1267    /* If the pattern does not start with an opening parenthesis, the first call
1268    to find_parens_sub() will scan right to the end (if necessary). However, if it
1269    does start with a parenthesis, find_parens_sub() will return when it hits the
1270    matching closing parens. That is why we have to have a loop. */
1271    
1272    for (;;)
1273      {
1274      rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1275      if (rc > 0 || *ptr++ == 0) break;
1276    }    }
1277    
1278  return -1;  return rc;
1279  }  }
1280    
1281    
1282    
1283    
1284  /*************************************************  /*************************************************
1285  *      Find first significant op code            *  *      Find first significant op code            *
1286  *************************************************/  *************************************************/
# Line 1200  for (;;) Line 1330  for (;;)
1330    
1331      case OP_CALLOUT:      case OP_CALLOUT:
1332      case OP_CREF:      case OP_CREF:
1333        case OP_NCREF:
1334      case OP_RREF:      case OP_RREF:
1335        case OP_NRREF:
1336      case OP_DEF:      case OP_DEF:
1337      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1338      break;      break;
# Line 1216  for (;;) Line 1348  for (;;)
1348    
1349    
1350  /*************************************************  /*************************************************
1351  *        Find the fixed length of a pattern      *  *        Find the fixed length of a branch       *
1352  *************************************************/  *************************************************/
1353    
1354  /* Scan a pattern and compute the fixed length of subject that will match it,  /* Scan a branch and compute the fixed length of subject that will match it,
1355  if the length is fixed. This is needed for dealing with backward assertions.  if the length is fixed. This is needed for dealing with backward assertions.
1356  In UTF8 mode, the result is in characters rather than bytes.  In UTF8 mode, the result is in characters rather than bytes. The branch is
1357    temporarily terminated with OP_END when this function is called.
1358    
1359    This function is called when a backward assertion is encountered, so that if it
1360    fails, the error message can point to the correct place in the pattern.
1361    However, we cannot do this when the assertion contains subroutine calls,
1362    because they can be forward references. We solve this by remembering this case
1363    and doing the check at the end; a flag specifies which mode we are running in.
1364    
1365  Arguments:  Arguments:
1366    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
1367    options  the compiling options    options  the compiling options
1368      atend    TRUE if called when the pattern is complete
1369      cd       the "compile data" structure
1370    
1371  Returns:   the fixed length, or -1 if there is no fixed length,  Returns:   the fixed length,
1372                 or -1 if there is no fixed length,
1373               or -2 if \C was encountered               or -2 if \C was encountered
1374                 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1375  */  */
1376    
1377  static int  static int
1378  find_fixedlength(uschar *code, int options)  find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1379  {  {
1380  int length = -1;  int length = -1;
1381    
# Line 1245  branch, check the length against that of Line 1388  branch, check the length against that of
1388  for (;;)  for (;;)
1389    {    {
1390    int d;    int d;
1391      uschar *ce, *cs;
1392    register int op = *cc;    register int op = *cc;
1393    switch (op)    switch (op)
1394      {      {
# Line 1252  for (;;) Line 1396  for (;;)
1396      case OP_BRA:      case OP_BRA:
1397      case OP_ONCE:      case OP_ONCE:
1398      case OP_COND:      case OP_COND:
1399      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1400      if (d < 0) return d;      if (d < 0) return d;
1401      branchlength += d;      branchlength += d;
1402      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 1275  for (;;) Line 1419  for (;;)
1419      branchlength = 0;      branchlength = 0;
1420      break;      break;
1421    
1422        /* A true recursion implies not fixed length, but a subroutine call may
1423        be OK. If the subroutine is a forward reference, we can't deal with
1424        it until the end of the pattern, so return -3. */
1425    
1426        case OP_RECURSE:
1427        if (!atend) return -3;
1428        cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1429        do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */
1430        if (cc > cs && cc < ce) return -1;                /* Recursion */
1431        d = find_fixedlength(cs + 2, options, atend, cd);
1432        if (d < 0) return d;
1433        branchlength += d;
1434        cc += 1 + LINK_SIZE;
1435        break;
1436    
1437      /* Skip over assertive subpatterns */      /* Skip over assertive subpatterns */
1438    
1439      case OP_ASSERT:      case OP_ASSERT:
# Line 1288  for (;;) Line 1447  for (;;)
1447    
1448      case OP_REVERSE:      case OP_REVERSE:
1449      case OP_CREF:      case OP_CREF:
1450        case OP_NCREF:
1451      case OP_RREF:      case OP_RREF:
1452        case OP_NRREF:
1453      case OP_DEF:      case OP_DEF:
1454      case OP_OPT:      case OP_OPT:
1455      case OP_CALLOUT:      case OP_CALLOUT:
1456      case OP_SOD:      case OP_SOD:
1457      case OP_SOM:      case OP_SOM:
1458        case OP_SET_SOM:
1459      case OP_EOD:      case OP_EOD:
1460      case OP_EODN:      case OP_EODN:
1461      case OP_CIRC:      case OP_CIRC:
# Line 1311  for (;;) Line 1473  for (;;)
1473      branchlength++;      branchlength++;
1474      cc += 2;      cc += 2;
1475  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1476      if ((options & PCRE_UTF8) != 0)      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1477        {        cc += _pcre_utf8_table4[cc[-1] & 0x3f];
       while ((*cc & 0xc0) == 0x80) cc++;  
       }  
1478  #endif  #endif
1479      break;      break;
1480    
# Line 1325  for (;;) Line 1485  for (;;)
1485      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1486      cc += 4;      cc += 4;
1487  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1488      if ((options & PCRE_UTF8) != 0)      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1489        {        cc += _pcre_utf8_table4[cc[-1] & 0x3f];
       while((*cc & 0x80) == 0x80) cc++;  
       }  
1490  #endif  #endif
1491      break;      break;
1492    
# Line 1407  for (;;) Line 1565  for (;;)
1565    
1566    
1567  /*************************************************  /*************************************************
1568  *    Scan compiled regex for numbered bracket    *  *    Scan compiled regex for specific bracket    *
1569  *************************************************/  *************************************************/
1570    
1571  /* This little function scans through a compiled pattern until it finds a  /* This little function scans through a compiled pattern until it finds a
1572  capturing bracket with the given number.  capturing bracket with the given number, or, if the number is negative, an
1573    instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1574    so that it can be called from pcre_study() when finding the minimum matching
1575    length.
1576    
1577  Arguments:  Arguments:
1578    code        points to start of expression    code        points to start of expression
1579    utf8        TRUE in UTF-8 mode    utf8        TRUE in UTF-8 mode
1580    number      the required bracket number    number      the required bracket number or negative to find a lookbehind
1581    
1582  Returns:      pointer to the opcode for the bracket, or NULL if not found  Returns:      pointer to the opcode for the bracket, or NULL if not found
1583  */  */
1584    
1585  static const uschar *  const uschar *
1586  find_bracket(const uschar *code, BOOL utf8, int number)  _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1587  {  {
1588  for (;;)  for (;;)
1589    {    {
# Line 1435  for (;;) Line 1596  for (;;)
1596    
1597    if (c == OP_XCLASS) code += GET(code, 1);    if (c == OP_XCLASS) code += GET(code, 1);
1598    
1599      /* Handle lookbehind (OP_REVERSE); returned only when number is negative */
1600    
1601      else if (c == OP_REVERSE)
1602        {
1603        if (number < 0) return (uschar *)code;
1604        code += _pcre_OP_lengths[c];
1605        }
1606    
1607    /* Handle capturing bracket */    /* Handle capturing bracket */
1608    
1609    else if (c == OP_CBRA)    else if (c == OP_CBRA)
# Line 1621  Arguments: Line 1790  Arguments:
1790    code        points to start of search    code        points to start of search
1791    endcode     points to where to stop    endcode     points to where to stop
1792    utf8        TRUE if in UTF8 mode    utf8        TRUE if in UTF8 mode
1793      cd          contains pointers to tables etc.
1794    
1795  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
1796  */  */
1797    
1798  static BOOL  static BOOL
1799  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
1800      compile_data *cd)
1801  {  {
1802  register int c;  register int c;
1803  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
# Line 1657  for (code = first_significant_code(code Line 1828  for (code = first_significant_code(code
1828      continue;      continue;
1829      }      }
1830    
1831      /* For a recursion/subroutine call, if its end has been reached, which
1832      implies a subroutine call, we can scan it. */
1833    
1834      if (c == OP_RECURSE)
1835        {
1836        BOOL empty_branch = FALSE;
1837        const uschar *scode = cd->start_code + GET(code, 1);
1838        if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
1839        do
1840          {
1841          if (could_be_empty_branch(scode, endcode, utf8, cd))
1842            {
1843            empty_branch = TRUE;
1844            break;
1845            }
1846          scode += GET(scode, 1);
1847          }
1848        while (*scode == OP_ALT);
1849        if (!empty_branch) return FALSE;  /* All branches are non-empty */
1850        continue;
1851        }
1852    
1853    /* For other groups, scan the branches. */    /* For other groups, scan the branches. */
1854    
1855    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
# Line 1664  for (code = first_significant_code(code Line 1857  for (code = first_significant_code(code
1857      BOOL empty_branch;      BOOL empty_branch;
1858      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1859    
1860      /* Scan a closed bracket */      /* If a conditional group has only one branch, there is a second, implied,
1861        empty branch, so just skip over the conditional, because it could be empty.
1862        Otherwise, scan the individual branches of the group. */
1863    
1864      empty_branch = FALSE;      if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
     do  
       {  
       if (!empty_branch && could_be_empty_branch(code, endcode, utf8))  
         empty_branch = TRUE;  
1865        code += GET(code, 1);        code += GET(code, 1);
1866        else
1867          {
1868          empty_branch = FALSE;
1869          do
1870            {
1871            if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
1872              empty_branch = TRUE;
1873            code += GET(code, 1);
1874            }
1875          while (*code == OP_ALT);
1876          if (!empty_branch) return FALSE;   /* All branches are non-empty */
1877        }        }
1878      while (*code == OP_ALT);  
     if (!empty_branch) return FALSE;   /* All branches are non-empty */  
1879      c = *code;      c = *code;
1880      continue;      continue;
1881      }      }
# Line 1792  for (code = first_significant_code(code Line 1993  for (code = first_significant_code(code
1993      case OP_QUERY:      case OP_QUERY:
1994      case OP_MINQUERY:      case OP_MINQUERY:
1995      case OP_POSQUERY:      case OP_POSQUERY:
1996        if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
1997        break;
1998    
1999      case OP_UPTO:      case OP_UPTO:
2000      case OP_MINUPTO:      case OP_MINUPTO:
2001      case OP_POSUPTO:      case OP_POSUPTO:
2002      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
2003      break;      break;
2004  #endif  #endif
2005    
2006        /* None of the remaining opcodes are required to match a character. */
2007    
2008        default:
2009        break;
2010      }      }
2011    }    }
2012    
# Line 1820  Arguments: Line 2029  Arguments:
2029    endcode     points to where to stop (current RECURSE item)    endcode     points to where to stop (current RECURSE item)
2030    bcptr       points to the chain of current (unclosed) branch starts    bcptr       points to the chain of current (unclosed) branch starts
2031    utf8        TRUE if in UTF-8 mode    utf8        TRUE if in UTF-8 mode
2032      cd          pointers to tables etc
2033    
2034  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2035  */  */
2036    
2037  static BOOL  static BOOL
2038  could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,  could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
2039    BOOL utf8)    BOOL utf8, compile_data *cd)
2040  {  {
2041  while (bcptr != NULL && bcptr->current >= code)  while (bcptr != NULL && bcptr->current_branch >= code)
2042    {    {
2043    if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;    if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
2044        return FALSE;
2045    bcptr = bcptr->outer;    bcptr = bcptr->outer;
2046    }    }
2047  return TRUE;  return TRUE;
# Line 2173  if ((options & PCRE_EXTENDED) != 0) Line 2384  if ((options & PCRE_EXTENDED) != 0)
2384    
2385  /* If the next thing is itself optional, we have to give up. */  /* If the next thing is itself optional, we have to give up. */
2386    
2387  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2388    strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)    strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2389      return FALSE;      return FALSE;
2390    
# Line 2492  BOOL utf8 = FALSE; Line 2703  BOOL utf8 = FALSE;
2703  uschar *utf8_char = NULL;  uschar *utf8_char = NULL;
2704  #endif  #endif
2705    
2706  #ifdef DEBUG  #ifdef PCRE_DEBUG
2707  if (lengthptr != NULL) DPRINTF((">> start branch\n"));  if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2708  #endif  #endif
2709    
# Line 2551  for (;; ptr++) Line 2762  for (;; ptr++)
2762    
2763    if (lengthptr != NULL)    if (lengthptr != NULL)
2764      {      {
2765  #ifdef DEBUG  #ifdef PCRE_DEBUG
2766      if (code > cd->hwm) cd->hwm = code;                 /* High water info */      if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2767  #endif  #endif
2768      if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */      if (code > cd->start_workspace + WORK_SIZE_CHECK)   /* Check for overrun */
2769        {        {
2770        *errorcodeptr = ERR52;        *errorcodeptr = ERR52;
2771        goto FAILED;        goto FAILED;
# Line 2603  for (;; ptr++) Line 2814  for (;; ptr++)
2814    /* In the real compile phase, just check the workspace used by the forward    /* In the real compile phase, just check the workspace used by the forward
2815    reference list. */    reference list. */
2816    
2817    else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)    else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)
2818      {      {
2819      *errorcodeptr = ERR52;      *errorcodeptr = ERR52;
2820      goto FAILED;      goto FAILED;
# Line 2639  for (;; ptr++) Line 2850  for (;; ptr++)
2850    /* Fill in length of a previous callout, except when the next thing is    /* Fill in length of a previous callout, except when the next thing is
2851    a quantifier. */    a quantifier. */
2852    
2853    is_quantifier =    is_quantifier =
2854      c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||      c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2855      (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));      (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2856    
# Line 2759  for (;; ptr++) Line 2970  for (;; ptr++)
2970      /* PCRE supports POSIX class stuff inside a class. Perl gives an error if      /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2971      they are encountered at the top level, so we'll do that too. */      they are encountered at the top level, so we'll do that too. */
2972    
2973      if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||      if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2974           ptr[1] == CHAR_EQUALS_SIGN) &&           ptr[1] == CHAR_EQUALS_SIGN) &&
2975          check_posix_syntax(ptr, &tempptr))          check_posix_syntax(ptr, &tempptr))
2976        {        {
# Line 2777  for (;; ptr++) Line 2988  for (;; ptr++)
2988        c = *(++ptr);        c = *(++ptr);
2989        if (c == CHAR_BACKSLASH)        if (c == CHAR_BACKSLASH)
2990          {          {
2991          if (ptr[1] == CHAR_E)          if (ptr[1] == CHAR_E)
2992            ptr++;            ptr++;
2993          else if (strncmp((const char *)ptr+1,          else if (strncmp((const char *)ptr+1,
2994                            STR_Q STR_BACKSLASH STR_E, 3) == 0)                            STR_Q STR_BACKSLASH STR_E, 3) == 0)
2995            ptr += 3;            ptr += 3;
2996          else          else
2997            break;            break;
2998          }          }
2999        else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)        else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
# Line 2795  for (;; ptr++) Line 3006  for (;; ptr++)
3006      that. In JS mode, [] must always fail, so generate OP_FAIL, whereas      that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3007      [^] must match any character, so generate OP_ALLANY. */      [^] must match any character, so generate OP_ALLANY. */
3008    
3009      if (c == CHAR_RIGHT_SQUARE_BRACKET &&      if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3010          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3011        {        {
3012        *code++ = negate_class? OP_ALLANY : OP_FAIL;        *code++ = negate_class? OP_ALLANY : OP_FAIL;
# Line 2877  for (;; ptr++) Line 3088  for (;; ptr++)
3088        5.6 and 5.8 do. */        5.6 and 5.8 do. */
3089    
3090        if (c == CHAR_LEFT_SQUARE_BRACKET &&        if (c == CHAR_LEFT_SQUARE_BRACKET &&
3091            (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||            (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3092             ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))             ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3093          {          {
3094          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
# Line 3227  for (;; ptr++) Line 3438  for (;; ptr++)
3438          while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)          while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3439            {            {
3440            ptr += 2;            ptr += 2;
3441            if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)            if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3442              { ptr += 2; continue; }              { ptr += 2; continue; }
3443            inescq = TRUE;            inescq = TRUE;
3444            break;            break;
# Line 3749  we set the flag only if there is a liter Line 3960  we set the flag only if there is a liter
3960    
3961        if (repeat_max == 0) goto END_REPEAT;        if (repeat_max == 0) goto END_REPEAT;
3962    
3963          /*--------------------------------------------------------------------*/
3964          /* This code is obsolete from release 8.00; the restriction was finally
3965          removed: */
3966    
3967        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
3968        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
3969    
3970        if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;        /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
3971          /*--------------------------------------------------------------------*/
3972    
3973        /* Combine the op_type with the repeat_type */        /* Combine the op_type with the repeat_type */
3974    
# Line 3899  we set the flag only if there is a liter Line 4115  we set the flag only if there is a liter
4115          goto END_REPEAT;          goto END_REPEAT;
4116          }          }
4117    
4118          /*--------------------------------------------------------------------*/
4119          /* This code is obsolete from release 8.00; the restriction was finally
4120          removed: */
4121    
4122        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
4123        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
4124    
4125        if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;        /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4126          /*--------------------------------------------------------------------*/
4127    
4128        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
4129          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
# Line 4037  we set the flag only if there is a liter Line 4258  we set the flag only if there is a liter
4258            {            {
4259            /* In the pre-compile phase, we don't actually do the replication. We            /* In the pre-compile phase, we don't actually do the replication. We
4260            just adjust the length as if we had. Do some paranoid checks for            just adjust the length as if we had. Do some paranoid checks for
4261            potential integer overflow. */            potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
4262              integer type when available, otherwise double. */
4263    
4264            if (lengthptr != NULL)            if (lengthptr != NULL)
4265              {              {
4266              int delta = (repeat_min - 1)*length_prevgroup;              int delta = (repeat_min - 1)*length_prevgroup;
4267              if ((double)(repeat_min - 1)*(double)length_prevgroup >              if ((INT64_OR_DOUBLE)(repeat_min - 1)*
4268                                                              (double)INT_MAX ||                    (INT64_OR_DOUBLE)length_prevgroup >
4269                        (INT64_OR_DOUBLE)INT_MAX ||
4270                  OFLOW_MAX - *lengthptr < delta)                  OFLOW_MAX - *lengthptr < delta)
4271                {                {
4272                *errorcodeptr = ERR20;                *errorcodeptr = ERR20;
# Line 4089  we set the flag only if there is a liter Line 4312  we set the flag only if there is a liter
4312          just adjust the length as if we had. For each repetition we must add 1          just adjust the length as if we had. For each repetition we must add 1
4313          to the length for BRAZERO and for all but the last repetition we must          to the length for BRAZERO and for all but the last repetition we must
4314          add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some          add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
4315          paranoid checks to avoid integer overflow. */          paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
4316            a 64-bit integer type when available, otherwise double. */
4317    
4318          if (lengthptr != NULL && repeat_max > 0)          if (lengthptr != NULL && repeat_max > 0)
4319            {            {
4320            int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -            int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4321                        2 - 2*LINK_SIZE;   /* Last one doesn't nest */                        2 - 2*LINK_SIZE;   /* Last one doesn't nest */
4322            if ((double)repeat_max *            if ((INT64_OR_DOUBLE)repeat_max *
4323                  (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)                  (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4324                    > (double)INT_MAX ||                    > (INT64_OR_DOUBLE)INT_MAX ||
4325                OFLOW_MAX - *lengthptr < delta)                OFLOW_MAX - *lengthptr < delta)
4326              {              {
4327              *errorcodeptr = ERR20;              *errorcodeptr = ERR20;
# Line 4174  we set the flag only if there is a liter Line 4398  we set the flag only if there is a liter
4398            uschar *scode = bracode;            uschar *scode = bracode;
4399            do            do
4400              {              {
4401              if (could_be_empty_branch(scode, ketcode, utf8))              if (could_be_empty_branch(scode, ketcode, utf8, cd))
4402                {                {
4403                *bracode += OP_SBRA - OP_BRA;                *bracode += OP_SBRA - OP_BRA;
4404                break;                break;
# Line 4217  we set the flag only if there is a liter Line 4441  we set the flag only if there is a liter
4441      if (possessive_quantifier)      if (possessive_quantifier)
4442        {        {
4443        int len;        int len;
4444        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||  
4445            *tempcode == OP_NOTEXACT)        if (*tempcode == OP_TYPEEXACT)
4446          tempcode += _pcre_OP_lengths[*tempcode] +          tempcode += _pcre_OP_lengths[*tempcode] +
4447            ((*tempcode == OP_TYPEEXACT &&            ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);
4448               (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);  
4449          else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
4450            {
4451            tempcode += _pcre_OP_lengths[*tempcode];
4452    #ifdef SUPPORT_UTF8
4453            if (utf8 && tempcode[-1] >= 0xc0)
4454              tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];
4455    #endif
4456            }
4457    
4458        len = code - tempcode;        len = code - tempcode;
4459        if (len > 0) switch (*tempcode)        if (len > 0) switch (*tempcode)
4460          {          {
# Line 4240  we set the flag only if there is a liter Line 4473  we set the flag only if there is a liter
4473          case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;          case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4474          case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;          case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
4475    
4476            /* Because we are moving code along, we must ensure that any
4477            pending recursive references are updated. */
4478    
4479          default:          default:
4480            *code = OP_END;
4481            adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm);
4482          memmove(tempcode + 1+LINK_SIZE, tempcode, len);          memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4483          code += 1 + LINK_SIZE;          code += 1 + LINK_SIZE;
4484          len += 1 + LINK_SIZE;          len += 1 + LINK_SIZE;
# Line 4299  we set the flag only if there is a liter Line 4537  we set the flag only if there is a liter
4537          if (namelen == verbs[i].len &&          if (namelen == verbs[i].len &&
4538              strncmp((char *)name, vn, namelen) == 0)              strncmp((char *)name, vn, namelen) == 0)
4539            {            {
4540            *code = verbs[i].op;            /* Check for open captures before ACCEPT */
4541            if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;  
4542              if (verbs[i].op == OP_ACCEPT)
4543                {
4544                open_capitem *oc;
4545                cd->had_accept = TRUE;
4546                for (oc = cd->open_caps; oc != NULL; oc = oc->next)
4547                  {
4548                  *code++ = OP_CLOSE;
4549                  PUT2INC(code, 0, oc->number);
4550                  }
4551                }
4552              *code++ = verbs[i].op;
4553            break;            break;
4554            }            }
4555          vn += verbs[i].len + 1;          vn += verbs[i].len + 1;
# Line 4427  we set the flag only if there is a liter Line 4676  we set the flag only if there is a liter
4676            }            }
4677          namelen = ptr - name;          namelen = ptr - name;
4678    
4679          if ((terminator > 0 && *ptr++ != terminator) ||          if ((terminator > 0 && *ptr++ != terminator) ||
4680              *ptr++ != CHAR_RIGHT_PARENTHESIS)              *ptr++ != CHAR_RIGHT_PARENTHESIS)
4681            {            {
4682            ptr--;      /* Error offset */            ptr--;      /* Error offset */
# Line 4462  we set the flag only if there is a liter Line 4711  we set the flag only if there is a liter
4711            }            }
4712    
4713          /* Otherwise (did not start with "+" or "-"), start by looking for the          /* Otherwise (did not start with "+" or "-"), start by looking for the
4714          name. */          name. If we find a name, add one to the opcode to change OP_CREF or
4715            OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,
4716            except they record that the reference was originally to a name. The
4717            information is used to check duplicate names. */
4718    
4719          slot = cd->name_table;          slot = cd->name_table;
4720          for (i = 0; i < cd->names_found; i++)          for (i = 0; i < cd->names_found; i++)
# Line 4477  we set the flag only if there is a liter Line 4729  we set the flag only if there is a liter
4729            {            {
4730            recno = GET2(slot, 0);            recno = GET2(slot, 0);
4731            PUT2(code, 2+LINK_SIZE, recno);            PUT2(code, 2+LINK_SIZE, recno);
4732              code[1+LINK_SIZE]++;
4733            }            }
4734    
4735          /* Search the pattern for a forward reference */          /* Search the pattern for a forward reference */
4736    
4737          else if ((i = find_parens(ptr, cd, name, namelen,          else if ((i = find_parens(cd, name, namelen,
4738                          (options & PCRE_EXTENDED) != 0)) > 0)                          (options & PCRE_EXTENDED) != 0)) > 0)
4739            {            {
4740            PUT2(code, 2+LINK_SIZE, i);            PUT2(code, 2+LINK_SIZE, i);
4741              code[1+LINK_SIZE]++;
4742            }            }
4743    
4744          /* If terminator == 0 it means that the name followed directly after          /* If terminator == 0 it means that the name followed directly after
# Line 4626  we set the flag only if there is a liter Line 4880  we set the flag only if there is a liter
4880    
4881          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4882          case CHAR_P:              /* Python-style named subpattern handling */          case CHAR_P:              /* Python-style named subpattern handling */
4883          if (*(++ptr) == CHAR_EQUALS_SIGN ||          if (*(++ptr) == CHAR_EQUALS_SIGN ||
4884              *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */              *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
4885            {            {
4886            is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;            is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
# Line 4645  we set the flag only if there is a liter Line 4899  we set the flag only if there is a liter
4899          DEFINE_NAME:    /* Come here from (?< handling */          DEFINE_NAME:    /* Come here from (?< handling */
4900          case CHAR_APOSTROPHE:          case CHAR_APOSTROPHE:
4901            {            {
4902            terminator = (*ptr == CHAR_LESS_THAN_SIGN)?            terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
4903              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
4904            name = ++ptr;            name = ++ptr;
4905    
# Line 4677  we set the flag only if there is a liter Line 4931  we set the flag only if there is a liter
4931                }                }
4932              }              }
4933    
4934            /* In the real compile, create the entry in the table */            /* In the real compile, create the entry in the table, maintaining
4935              alphabetical order. Duplicate names for different numbers are
4936              permitted only if PCRE_DUPNAMES is set. Duplicate names for the same
4937              number are always OK. (An existing number can be re-used if (?|
4938              appears in the pattern.) In either event, a duplicate name results in
4939              a duplicate entry in the table, even if the number is the same. This
4940              is because the number of names, and hence the table size, is computed
4941              in the pre-compile, and it affects various numbers and pointers which
4942              would all have to be modified, and the compiled code moved down, if
4943              duplicates with the same number were omitted from the table. This
4944              doesn't seem worth the hassle. However, *different* names for the
4945              same number are not permitted. */
4946    
4947            else            else
4948              {              {
4949                BOOL dupname = FALSE;
4950              slot = cd->name_table;              slot = cd->name_table;
4951    
4952              for (i = 0; i < cd->names_found; i++)              for (i = 0; i < cd->names_found; i++)
4953                {                {
4954                int crc = memcmp(name, slot+2, namelen);                int crc = memcmp(name, slot+2, namelen);
# Line 4689  we set the flag only if there is a liter Line 4956  we set the flag only if there is a liter
4956                  {                  {
4957                  if (slot[2+namelen] == 0)                  if (slot[2+namelen] == 0)
4958                    {                    {
4959                    if ((options & PCRE_DUPNAMES) == 0)                    if (GET2(slot, 0) != cd->bracount + 1 &&
4960                          (options & PCRE_DUPNAMES) == 0)
4961                      {                      {
4962                      *errorcodeptr = ERR43;                      *errorcodeptr = ERR43;
4963                      goto FAILED;                      goto FAILED;
4964                      }                      }
4965                      else dupname = TRUE;
4966                    }                    }
4967                  else crc = -1;      /* Current name is substring */                  else crc = -1;      /* Current name is a substring */
4968                  }                  }
4969    
4970                  /* Make space in the table and break the loop for an earlier
4971                  name. For a duplicate or later name, carry on. We do this for
4972                  duplicates so that in the simple case (when (?| is not used) they
4973                  are in order of their numbers. */
4974    
4975                if (crc < 0)                if (crc < 0)
4976                  {                  {
4977                  memmove(slot + cd->name_entry_size, slot,                  memmove(slot + cd->name_entry_size, slot,
4978                    (cd->names_found - i) * cd->name_entry_size);                    (cd->names_found - i) * cd->name_entry_size);
4979                  break;                  break;
4980                  }                  }
4981    
4982                  /* Continue the loop for a later or duplicate name */
4983    
4984                slot += cd->name_entry_size;                slot += cd->name_entry_size;
4985                }                }
4986    
4987                /* For non-duplicate names, check for a duplicate number before
4988                adding the new name. */
4989    
4990                if (!dupname)
4991                  {
4992                  uschar *cslot = cd->name_table;
4993                  for (i = 0; i < cd->names_found; i++)
4994                    {
4995                    if (cslot != slot)
4996                      {
4997                      if (GET2(cslot, 0) == cd->bracount + 1)
4998                        {
4999                        *errorcodeptr = ERR65;
5000                        goto FAILED;
5001                        }
5002                      }
5003                    else i--;
5004                    cslot += cd->name_entry_size;
5005                    }
5006                  }
5007    
5008              PUT2(slot, 0, cd->bracount + 1);              PUT2(slot, 0, cd->bracount + 1);
5009              memcpy(slot + 2, name, namelen);              memcpy(slot + 2, name, namelen);
5010              slot[2+namelen] = 0;              slot[2+namelen] = 0;
5011              }              }
5012            }            }
5013    
5014          /* In both cases, count the number of names we've encountered. */          /* In both pre-compile and compile, count the number of names we've
5015            encountered. */
5016    
         ptr++;                    /* Move past > or ' */  
5017          cd->names_found++;          cd->names_found++;
5018            ptr++;                    /* Move past > or ' */
5019          goto NUMBERED_GROUP;          goto NUMBERED_GROUP;
5020    
5021    
# Line 4780  we set the flag only if there is a liter Line 5080  we set the flag only if there is a liter
5080              recno = GET2(slot, 0);              recno = GET2(slot, 0);
5081              }              }
5082            else if ((recno =                /* Forward back reference */            else if ((recno =                /* Forward back reference */
5083                      find_parens(ptr, cd, name, namelen,                      find_parens(cd, name, namelen,
5084                        (options & PCRE_EXTENDED) != 0)) <= 0)                        (options & PCRE_EXTENDED) != 0)) <= 0)
5085              {              {
5086              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
# Line 4884  we set the flag only if there is a liter Line 5184  we set the flag only if there is a liter
5184            if (lengthptr == NULL)            if (lengthptr == NULL)
5185              {              {
5186              *code = OP_END;              *code = OP_END;
5187              if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);              if (recno != 0)
5188                  called = _pcre_find_bracket(cd->start_code, utf8, recno);
5189    
5190              /* Forward reference */              /* Forward reference */
5191    
5192              if (called == NULL)              if (called == NULL)
5193                {                {
5194                if (find_parens(ptr, cd, NULL, recno,                if (find_parens(cd, NULL, recno,
5195                      (options & PCRE_EXTENDED) != 0) < 0)                      (options & PCRE_EXTENDED) != 0) < 0)
5196                  {                  {
5197                  *errorcodeptr = ERR15;                  *errorcodeptr = ERR15;
5198                  goto FAILED;                  goto FAILED;
5199                  }                  }
5200    
5201                  /* Fudge the value of "called" so that when it is inserted as an
5202                  offset below, what is actually inserted is the reference number
5203                  of the group. */
5204    
5205                called = cd->start_code + recno;                called = cd->start_code + recno;
5206                PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);                PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
5207                }                }
# Line 4905  we set the flag only if there is a liter Line 5211  we set the flag only if there is a liter
5211              recursion that could loop for ever, and diagnose that case. */              recursion that could loop for ever, and diagnose that case. */
5212    
5213              else if (GET(called, 1) == 0 &&              else if (GET(called, 1) == 0 &&
5214                       could_be_empty(called, code, bcptr, utf8))                       could_be_empty(called, code, bcptr, utf8, cd))
5215                {                {
5216                *errorcodeptr = ERR40;                *errorcodeptr = ERR40;
5217                goto FAILED;                goto FAILED;
# Line 5000  we set the flag only if there is a liter Line 5306  we set the flag only if there is a liter
5306              {              {
5307              cd->external_options = newoptions;              cd->external_options = newoptions;
5308              }              }
5309           else            else
5310              {              {
5311              if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))              if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
5312                {                {
# Line 5240  we set the flag only if there is a liter Line 5546  we set the flag only if there is a liter
5546        {        {
5547        if (-c == ESC_Q)            /* Handle start of quoted string */        if (-c == ESC_Q)            /* Handle start of quoted string */
5548          {          {
5549          if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)          if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5550            ptr += 2;               /* avoid empty string */            ptr += 2;               /* avoid empty string */
5551              else inescq = TRUE;              else inescq = TRUE;
5552          continue;          continue;
# Line 5270  we set the flag only if there is a liter Line 5576  we set the flag only if there is a liter
5576          {          {
5577          const uschar *p;          const uschar *p;
5578          save_hwm = cd->hwm;   /* Normally this is set when '(' is read */          save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
5579          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5580            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5581    
5582          /* These two statements stop the compiler from warning about possibly          /* These two statements stop the compiler from warning about possibly
# Line 5321  we set the flag only if there is a liter Line 5627  we set the flag only if there is a liter
5627        /* \k<name> or \k'name' is a back reference by name (Perl syntax).        /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5628        We also support \k{name} (.NET syntax) */        We also support \k{name} (.NET syntax) */
5629    
5630        if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||        if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||
5631            ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))            ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))
5632          {          {
5633          is_recurse = FALSE;          is_recurse = FALSE;
5634          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5635            CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?            CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
5636            CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;            CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
5637          goto NAMED_REF_OR_RECURSE;          goto NAMED_REF_OR_RECURSE;
5638          }          }
# Line 5337  we set the flag only if there is a liter Line 5643  we set the flag only if there is a liter
5643    
5644        if (-c >= ESC_REF)        if (-c >= ESC_REF)
5645          {          {
5646            open_capitem *oc;
5647          recno = -c - ESC_REF;          recno = -c - ESC_REF;
5648    
5649          HANDLE_REFERENCE:    /* Come here from named backref handling */          HANDLE_REFERENCE:    /* Come here from named backref handling */
# Line 5346  we set the flag only if there is a liter Line 5653  we set the flag only if there is a liter
5653          PUT2INC(code, 0, recno);          PUT2INC(code, 0, recno);
5654          cd->backref_map |= (recno < 32)? (1 << recno) : 1;          cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5655          if (recno > cd->top_backref) cd->top_backref = recno;          if (recno > cd->top_backref) cd->top_backref = recno;
5656    
5657            /* Check to see if this back reference is recursive, that is, it
5658            is inside the group that it references. A flag is set so that the
5659            group can be made atomic. */
5660    
5661            for (oc = cd->open_caps; oc != NULL; oc = oc->next)
5662              {
5663              if (oc->number == recno)
5664                {
5665                oc->flag = TRUE;
5666                break;
5667                }
5668              }
5669          }          }
5670    
5671        /* So are Unicode property matches, if supported. */        /* So are Unicode property matches, if supported. */
# Line 5528  uschar *code = *codeptr; Line 5848  uschar *code = *codeptr;
5848  uschar *last_branch = code;  uschar *last_branch = code;
5849  uschar *start_bracket = code;  uschar *start_bracket = code;
5850  uschar *reverse_count = NULL;  uschar *reverse_count = NULL;
5851    open_capitem capitem;
5852    int capnumber = 0;
5853  int firstbyte, reqbyte;  int firstbyte, reqbyte;
5854  int branchfirstbyte, branchreqbyte;  int branchfirstbyte, branchreqbyte;
5855  int length;  int length;
5856  int orig_bracount;  int orig_bracount;
5857  int max_bracount;  int max_bracount;
5858    int old_external_options = cd->external_options;
5859  branch_chain bc;  branch_chain bc;
5860    
5861  bc.outer = bcptr;  bc.outer = bcptr;
5862  bc.current = code;  bc.current_branch = code;
5863    
5864  firstbyte = reqbyte = REQ_UNSET;  firstbyte = reqbyte = REQ_UNSET;
5865    
# Line 5554  the code that abstracts option settings Line 5877  the code that abstracts option settings
5877  them global. It tests the value of length for (2 + 2*LINK_SIZE) in the  them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5878  pre-compile phase to find out whether anything has yet been compiled or not. */  pre-compile phase to find out whether anything has yet been compiled or not. */
5879    
5880    /* If this is a capturing subpattern, add to the chain of open capturing items
5881    so that we can detect them if (*ACCEPT) is encountered. This is also used to
5882    detect groups that contain recursive back references to themselves. */
5883    
5884    if (*code == OP_CBRA)
5885      {
5886      capnumber = GET2(code, 1 + LINK_SIZE);
5887      capitem.number = capnumber;
5888      capitem.next = cd->open_caps;
5889      capitem.flag = FALSE;
5890      cd->open_caps = &capitem;
5891      }
5892    
5893  /* Offset is set zero to mark that this bracket is still open */  /* Offset is set zero to mark that this bracket is still open */
5894    
5895  PUT(code, 1, 0);  PUT(code, 1, 0);
# Line 5598  for (;;) Line 5934  for (;;)
5934      return FALSE;      return FALSE;
5935      }      }
5936    
5937      /* If the external options have changed during this branch, it means that we
5938      are at the top level, and a leading option setting has been encountered. We
5939      need to re-set the original option values to take account of this so that,
5940      during the pre-compile phase, we know to allow for a re-set at the start of
5941      subsequent branches. */
5942    
5943      if (old_external_options != cd->external_options)
5944        oldims = cd->external_options & PCRE_IMS;
5945    
5946    /* Keep the highest bracket count in case (?| was used and some branch    /* Keep the highest bracket count in case (?| was used and some branch
5947    has fewer than the rest. */    has fewer than the rest. */
5948    
# Line 5648  for (;;) Line 5993  for (;;)
5993    
5994      /* If lookbehind, check that this branch matches a fixed-length string, and      /* If lookbehind, check that this branch matches a fixed-length string, and
5995      put the length into the OP_REVERSE item. Temporarily mark the end of the      put the length into the OP_REVERSE item. Temporarily mark the end of the
5996      branch with OP_END. */      branch with OP_END. If the branch contains OP_RECURSE, the result is -3
5997        because there may be forward references that we can't check here. Set a
5998        flag to cause another lookbehind check at the end. Why not do it all at the
5999        end? Because common, erroneous checks are picked up here and the offset of
6000        the problem can be shown. */
6001    
6002      if (lookbehind)      if (lookbehind)
6003        {        {
6004        int fixed_length;        int fixed_length;
6005        *code = OP_END;        *code = OP_END;
6006        fixed_length = find_fixedlength(last_branch, options);        fixed_length = find_fixedlength(last_branch, options, FALSE, cd);
6007        DPRINTF(("fixed length = %d\n", fixed_length));        DPRINTF(("fixed length = %d\n", fixed_length));
6008        if (fixed_length < 0)        if (fixed_length == -3)
6009            {
6010            cd->check_lookbehind = TRUE;
6011            }
6012          else if (fixed_length < 0)
6013          {          {
6014          *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;          *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
6015          *ptrptr = ptr;          *ptrptr = ptr;
6016          return FALSE;          return FALSE;
6017          }          }
6018        PUT(reverse_count, 0, fixed_length);        else { PUT(reverse_count, 0, fixed_length); }
6019        }        }
6020      }      }
6021    
# Line 5696  for (;;) Line 6049  for (;;)
6049      PUT(code, 1, code - start_bracket);      PUT(code, 1, code - start_bracket);
6050      code += 1 + LINK_SIZE;      code += 1 + LINK_SIZE;
6051    
6052      /* Resetting option if needed */      /* If it was a capturing subpattern, check to see if it contained any
6053        recursive back references. If so, we must wrap it in atomic brackets.
6054        In any event, remove the block from the chain. */
6055    
6056        if (capnumber > 0)
6057          {
6058          if (cd->open_caps->flag)
6059            {
6060            memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
6061              code - start_bracket);
6062            *start_bracket = OP_ONCE;
6063            code += 1 + LINK_SIZE;
6064            PUT(start_bracket, 1, code - start_bracket);
6065            *code = OP_KET;
6066            PUT(code, 1, code - start_bracket);
6067            code += 1 + LINK_SIZE;
6068            length += 2 + 2*LINK_SIZE;
6069            }
6070          cd->open_caps = cd->open_caps->next;
6071          }
6072    
6073        /* Reset options if needed. */
6074    
6075      if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS)      if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS)
6076        {        {
# Line 5745  for (;;) Line 6119  for (;;)
6119      {      {
6120      *code = OP_ALT;      *code = OP_ALT;
6121      PUT(code, 1, code - last_branch);      PUT(code, 1, code - last_branch);
6122      bc.current = last_branch = code;      bc.current_branch = last_branch = code;
6123      code += 1 + LINK_SIZE;      code += 1 + LINK_SIZE;
6124      }      }
6125    
# Line 5879  do { Line 6253  do {
6253     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6254       NULL, 0, FALSE);       NULL, 0, FALSE);
6255     register int op = *scode;     register int op = *scode;
6256    
6257     /* If we are at the start of a conditional assertion group, *both* the     /* If we are at the start of a conditional assertion group, *both* the
6258     conditional assertion *and* what follows the condition must satisfy the test     conditional assertion *and* what follows the condition must satisfy the test
6259     for start of line. Other kinds of condition fail. Note that there may be an     for start of line. Other kinds of condition fail. Note that there may be an
# Line 5887  do { Line 6261  do {
6261    
6262     if (op == OP_COND)     if (op == OP_COND)
6263       {       {
6264       scode += 1 + LINK_SIZE;       scode += 1 + LINK_SIZE;
6265       if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT];       if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT];
6266       switch (*scode)       switch (*scode)
6267         {         {
6268         case OP_CREF:         case OP_CREF:
6269           case OP_NCREF:
6270         case OP_RREF:         case OP_RREF:
6271           case OP_NRREF:
6272         case OP_DEF:         case OP_DEF:
6273         return FALSE;         return FALSE;
6274    
6275         default:     /* Assertion */         default:     /* Assertion */
6276         if (!is_startline(scode, bracket_map, backref_map)) return FALSE;         if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6277         do scode += GET(scode, 1); while (*scode == OP_ALT);         do scode += GET(scode, 1); while (*scode == OP_ALT);
6278         scode += 1 + LINK_SIZE;         scode += 1 + LINK_SIZE;
6279         break;         break;
6280         }         }
6281       scode = first_significant_code(scode, NULL, 0, FALSE);       scode = first_significant_code(scode, NULL, 0, FALSE);
6282       op = *scode;       op = *scode;
6283       }       }
6284    
6285     /* Non-capturing brackets */     /* Non-capturing brackets */
6286    
# Line 5925  do { Line 6301  do {
6301     /* Other brackets */     /* Other brackets */
6302    
6303     else if (op == OP_ASSERT || op == OP_ONCE)     else if (op == OP_ASSERT || op == OP_ONCE)
6304       {       {
6305       if (!is_startline(scode, bracket_map, backref_map)) return FALSE;       if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6306       }       }
6307    
6308     /* .* means "start at start or after \n" if it isn't in brackets that     /* .* means "start at start or after \n" if it isn't in brackets that
# Line 6061  int length = 1;  /* For final END opcode Line 6437  int length = 1;  /* For final END opcode
6437  int firstbyte, reqbyte, newline;  int firstbyte, reqbyte, newline;
6438  int errorcode = 0;  int errorcode = 0;
6439  int skipatstart = 0;  int skipatstart = 0;
6440  #ifdef SUPPORT_UTF8  BOOL utf8 = (options & PCRE_UTF8) != 0;
 BOOL utf8;  
 #endif  
6441  size_t size;  size_t size;
6442  uschar *code;  uschar *code;
6443  const uschar *codestart;  const uschar *codestart;
# Line 6106  if (erroroffset == NULL) Line 6480  if (erroroffset == NULL)
6480    
6481  *erroroffset = 0;  *erroroffset = 0;
6482    
 /* Can't support UTF8 unless PCRE has been compiled to include the code. */  
   
 #ifdef SUPPORT_UTF8  
 utf8 = (options & PCRE_UTF8) != 0;  
 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&  
      (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)  
   {  
   errorcode = ERR44;  
   goto PCRE_EARLY_ERROR_RETURN2;  
   }  
 #else  
 if ((options & PCRE_UTF8) != 0)  
   {  
   errorcode = ERR32;  
   goto PCRE_EARLY_ERROR_RETURN;  
   }  
 #endif  
   
 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)  
   {  
   errorcode = ERR17;  
   goto PCRE_EARLY_ERROR_RETURN;  
   }  
   
6483  /* Set up pointers to the individual character tables */  /* Set up pointers to the individual character tables */
6484    
6485  if (tables == NULL) tables = _pcre_default_tables;  if (tables == NULL) tables = _pcre_default_tables;
# Line 6138  cd->fcc = tables + fcc_offset; Line 6488  cd->fcc = tables + fcc_offset;
6488  cd->cbits = tables + cbits_offset;  cd->cbits = tables + cbits_offset;
6489  cd->ctypes = tables + ctypes_offset;  cd->ctypes = tables + ctypes_offset;
6490    
6491    /* Check that all undefined public option bits are zero */
6492    
6493    if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
6494      {
6495      errorcode = ERR17;
6496      goto PCRE_EARLY_ERROR_RETURN;
6497      }
6498    
6499  /* Check for global one-time settings at the start of the pattern, and remember  /* Check for global one-time settings at the start of the pattern, and remember
6500  the offset for later. */  the offset for later. */
6501    
6502  while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&  while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
6503         ptr[skipatstart+1] == CHAR_ASTERISK)         ptr[skipatstart+1] == CHAR_ASTERISK)
6504    {    {
6505    int newnl = 0;    int newnl = 0;
6506    int newbsr = 0;    int newbsr = 0;
6507    
6508      if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
6509        { skipatstart += 7; options |= PCRE_UTF8; continue; }
6510    
6511    if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)    if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
6512      { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }      { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
6513    else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3)  == 0)    else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3)  == 0)
# Line 6170  while (ptr[skipatstart] == CHAR_LEFT_PAR Line 6531  while (ptr[skipatstart] == CHAR_LEFT_PAR
6531    else break;    else break;
6532    }    }
6533    
6534    /* Can't support UTF8 unless PCRE has been compiled to include the code. */
6535    
6536    #ifdef SUPPORT_UTF8
6537    if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
6538         (*erroroffset = _pcre_valid_utf8((USPTR)pattern, -1)) >= 0)
6539      {
6540      errorcode = ERR44;
6541      goto PCRE_EARLY_ERROR_RETURN2;
6542      }
6543    #else
6544    if (utf8)
6545      {
6546      errorcode = ERR32;
6547      goto PCRE_EARLY_ERROR_RETURN;
6548      }
6549    #endif
6550    
6551  /* Check validity of \R options. */  /* Check validity of \R options. */
6552    
6553  switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))  switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
# Line 6252  cd->end_pattern = (const uschar *)(patte Line 6630  cd->end_pattern = (const uschar *)(patte
6630  cd->req_varyopt = 0;  cd->req_varyopt = 0;
6631  cd->external_options = options;  cd->external_options = options;
6632  cd->external_flags = 0;  cd->external_flags = 0;
6633    cd->open_caps = NULL;
6634    
6635  /* Now do the pre-compile. On error, errorcode will be set non-zero, so we  /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
6636  don't need to look at the result of the function here. The initial options have  don't need to look at the result of the function here. The initial options have
# Line 6326  cd->start_code = codestart; Line 6705  cd->start_code = codestart;
6705  cd->hwm = cworkspace;  cd->hwm = cworkspace;
6706  cd->req_varyopt = 0;  cd->req_varyopt = 0;
6707  cd->had_accept = FALSE;  cd->had_accept = FALSE;
6708    cd->check_lookbehind = FALSE;
6709    cd->open_caps = NULL;
6710    
6711  /* Set up a starting, non-extracting bracket, then compile the expression. On  /* Set up a starting, non-extracting bracket, then compile the expression. On
6712  error, errorcode will be set non-zero, so we don't need to look at the result  error, errorcode will be set non-zero, so we don't need to look at the result
# Line 6351  if debugging, leave the test till after Line 6732  if debugging, leave the test till after
6732    
6733  *code++ = OP_END;  *code++ = OP_END;
6734    
6735  #ifndef DEBUG  #ifndef PCRE_DEBUG
6736  if (code - codestart > length) errorcode = ERR23;  if (code - codestart > length) errorcode = ERR23;
6737  #endif  #endif
6738    
# Line 6364  while (errorcode == 0 && cd->hwm > cwork Line 6745  while (errorcode == 0 && cd->hwm > cwork
6745    cd->hwm -= LINK_SIZE;    cd->hwm -= LINK_SIZE;
6746    offset = GET(cd->hwm, 0);    offset = GET(cd->hwm, 0);
6747    recno = GET(codestart, offset);    recno = GET(codestart, offset);
6748    groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);    groupptr = _pcre_find_bracket(codestart, utf8, recno);
6749    if (groupptr == NULL) errorcode = ERR53;    if (groupptr == NULL) errorcode = ERR53;
6750      else PUT(((uschar *)codestart), offset, groupptr - codestart);      else PUT(((uschar *)codestart), offset, groupptr - codestart);
6751    }    }
# Line 6374  subpattern. */ Line 6755  subpattern. */
6755    
6756  if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;  if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6757    
6758    /* If there were any lookbehind assertions that contained OP_RECURSE
6759    (recursions or subroutine calls), a flag is set for them to be checked here,
6760    because they may contain forward references. Actual recursions can't be fixed
6761    length, but subroutine calls can. It is done like this so that those without
6762    OP_RECURSE that are not fixed length get a diagnosic with a useful offset. The
6763    exceptional ones forgo this. We scan the pattern to check that they are fixed
6764    length, and set their lengths. */
6765    
6766    if (cd->check_lookbehind)
6767      {
6768      uschar *cc = (uschar *)codestart;
6769    
6770      /* Loop, searching for OP_REVERSE items, and process those that do not have
6771      their length set. (Actually, it will also re-process any that have a length
6772      of zero, but that is a pathological case, and it does no harm.) When we find
6773      one, we temporarily terminate the branch it is in while we scan it. */
6774    
6775      for (cc = (uschar *)_pcre_find_bracket(codestart, utf8, -1);
6776           cc != NULL;
6777           cc = (uschar *)_pcre_find_bracket(cc, utf8, -1))
6778        {
6779        if (GET(cc, 1) == 0)
6780          {
6781          int fixed_length;
6782          uschar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
6783          int end_op = *be;
6784          *be = OP_END;
6785          fixed_length = find_fixedlength(cc, re->options, TRUE, cd);
6786          *be = end_op;
6787          DPRINTF(("fixed length = %d\n", fixed_length));
6788          if (fixed_length < 0)
6789            {
6790            errorcode = (fixed_length == -2)? ERR36 : ERR25;
6791            break;
6792            }
6793          PUT(cc, 1, fixed_length);
6794          }
6795        cc += 1 + LINK_SIZE;
6796        }
6797      }
6798    
6799  /* Failed to compile, or error while post-processing */  /* Failed to compile, or error while post-processing */
6800    
6801  if (errorcode != 0)  if (errorcode != 0)
# Line 6434  if (reqbyte >= 0 && Line 6856  if (reqbyte >= 0 &&
6856  /* Print out the compiled data if debugging is enabled. This is never the  /* Print out the compiled data if debugging is enabled. This is never the
6857  case when building a production library. */  case when building a production library. */
6858    
6859  #ifdef DEBUG  #ifdef PCRE_DEBUG
   
6860  printf("Length = %d top_bracket = %d top_backref = %d\n",  printf("Length = %d top_bracket = %d top_backref = %d\n",
6861    length, re->top_bracket, re->top_backref);    length, re->top_bracket, re->top_backref);
6862    
# Line 6472  if (code - codestart > length) Line 6893  if (code - codestart > length)
6893    if (errorcodeptr != NULL) *errorcodeptr = ERR23;    if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6894    return NULL;    return NULL;
6895    }    }
6896  #endif   /* DEBUG */  #endif   /* PCRE_DEBUG */
6897    
6898  return (pcre *)re;  return (pcre *)re;
6899  }  }

Legend:
Removed from v.391  
changed lines
  Added in v.507

  ViewVC Help
Powered by ViewVC 1.1.5