/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 391 by ph10, Tue Mar 17 21:16:01 2009 UTC revision 454 by ph10, Tue Sep 22 09:42:11 2009 UTC
# Line 100  is invalid. */ Line 100  is invalid. */
100  #ifndef EBCDIC  #ifndef EBCDIC
101    
102  /* This is the "normal" table for ASCII systems or for EBCDIC systems running  /* This is the "normal" table for ASCII systems or for EBCDIC systems running
103  in UTF-8 mode. */  in UTF-8 mode. */
104    
105  static const short int escapes[] = {  static const short int escapes[] = {
106       0,                       0,       0,                       0,
107         0,                       0,
108         0,                       0,
109       0,                       0,       0,                       0,
      0,                       0,  
110       0,                       0,       0,                       0,
      0,                       0,  
111       CHAR_COLON,              CHAR_SEMICOLON,       CHAR_COLON,              CHAR_SEMICOLON,
112       CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,       CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
113       CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,       CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
114       CHAR_COMMERCIAL_AT,      -ESC_A,       CHAR_COMMERCIAL_AT,      -ESC_A,
115       -ESC_B,                  -ESC_C,       -ESC_B,                  -ESC_C,
116       -ESC_D,                  -ESC_E,       -ESC_D,                  -ESC_E,
117       0,                       -ESC_G,       0,                       -ESC_G,
118       -ESC_H,                  0,       -ESC_H,                  0,
119       0,                       -ESC_K,       0,                       -ESC_K,
120       0,                       0,       0,                       0,
121       0,                       0,       0,                       0,
122       -ESC_P,                  -ESC_Q,       -ESC_P,                  -ESC_Q,
123       -ESC_R,                  -ESC_S,       -ESC_R,                  -ESC_S,
124       0,                       0,       0,                       0,
125       -ESC_V,                  -ESC_W,       -ESC_V,                  -ESC_W,
126       -ESC_X,                  0,       -ESC_X,                  0,
127       -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,       -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
128       CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,       CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
129       CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,       CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
130       CHAR_GRAVE_ACCENT,       7,       CHAR_GRAVE_ACCENT,       7,
131       -ESC_b,                  0,       -ESC_b,                  0,
132       -ESC_d,                  ESC_e,       -ESC_d,                  ESC_e,
133       ESC_f,                   0,       ESC_f,                   0,
134       -ESC_h,                  0,       -ESC_h,                  0,
135       0,                       -ESC_k,       0,                       -ESC_k,
136       0,                       0,       0,                       0,
137       ESC_n,                   0,       ESC_n,                   0,
138       -ESC_p,                  0,       -ESC_p,                  0,
139       ESC_r,                   -ESC_s,       ESC_r,                   -ESC_s,
140       ESC_tee,                 0,       ESC_tee,                 0,
141       -ESC_v,                  -ESC_w,       -ESC_v,                  -ESC_w,
142       0,                       0,       0,                       0,
143       -ESC_z       -ESC_z
144  };  };
145    
146  #else  #else
147    
148  /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */  /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
149    
# Line 177  static const short int escapes[] = { Line 177  static const short int escapes[] = {
177    
178  /* Table of special "verbs" like (*PRUNE). This is a short table, so it is  /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
179  searched linearly. Put all the names into a single string, in order to reduce  searched linearly. Put all the names into a single string, in order to reduce
180  the number of relocations when a shared library is dynamically linked. The  the number of relocations when a shared library is dynamically linked. The
181  string is built from string macros so that it works in UTF-8 mode on EBCDIC  string is built from string macros so that it works in UTF-8 mode on EBCDIC
182  platforms. */  platforms. */
183    
184  typedef struct verbitem {  typedef struct verbitem {
# Line 215  length entry. The first three must be al Line 215  length entry. The first three must be al
215  for handling case independence. */  for handling case independence. */
216    
217  static const char posix_names[] =  static const char posix_names[] =
218    STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0    STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
219    STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0    STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
220    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
221    STRING_word0  STRING_xdigit;    STRING_word0  STRING_xdigit;
222    
# Line 360  For convenience, we use the same bit def Line 360  For convenience, we use the same bit def
360    
361  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
362    
363  #ifndef EBCDIC  #ifndef EBCDIC
364    
365  /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in  /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
366  UTF-8 mode. */  UTF-8 mode. */
367    
368  static const unsigned char digitab[] =  static const unsigned char digitab[] =
# Line 400  static const unsigned char digitab[] = Line 400  static const unsigned char digitab[] =
400    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
401    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
402    
403  #else  #else
404    
405  /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */  /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
406    
# Line 1009  return p; Line 1009  return p;
1009    
1010    
1011  /*************************************************  /*************************************************
1012  *       Find forward referenced subpattern       *  *  Subroutine for finding forward reference      *
1013  *************************************************/  *************************************************/
1014    
1015  /* This function scans along a pattern's text looking for capturing  /* This recursive function is called only from find_parens() below. The
1016    top-level call starts at the beginning of the pattern. All other calls must
1017    start at a parenthesis. It scans along a pattern's text looking for capturing
1018  subpatterns, and counting them. If it finds a named pattern that matches the  subpatterns, and counting them. If it finds a named pattern that matches the
1019  name it is given, it returns its number. Alternatively, if the name is NULL, it  name it is given, it returns its number. Alternatively, if the name is NULL, it
1020  returns when it reaches a given numbered subpattern. This is used for forward  returns when it reaches a given numbered subpattern. We know that if (?P< is
1021  references to subpatterns. We know that if (?P< is encountered, the name will  encountered, the name will be terminated by '>' because that is checked in the
1022  be terminated by '>' because that is checked in the first pass.  first pass. Recursion is used to keep track of subpatterns that reset the
1023    capturing group numbers - the (?| feature.
1024    
1025  Arguments:  Arguments:
1026    ptr          current position in the pattern    ptrptr       address of the current character pointer (updated)
1027    cd           compile background data    cd           compile background data
1028    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1029    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1030    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1031      count        pointer to the current capturing subpattern number (updated)
1032    
1033  Returns:       the number of the named subpattern, or -1 if not found  Returns:       the number of the named subpattern, or -1 if not found
1034  */  */
1035    
1036  static int  static int
1037  find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn,  find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1038    BOOL xmode)    BOOL xmode, int *count)
1039  {  {
1040  const uschar *thisname;  uschar *ptr = *ptrptr;
1041  int count = cd->bracount;  int start_count = *count;
1042    int hwm_count = start_count;
1043    BOOL dup_parens = FALSE;
1044    
1045  for (; *ptr != 0; ptr++)  /* If the first character is a parenthesis, check on the type of group we are
1046    dealing with. The very first call may not start with a parenthesis. */
1047    
1048    if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1049    {    {
1050    int term;    if (ptr[1] == CHAR_QUESTION_MARK &&
1051          ptr[2] == CHAR_VERTICAL_LINE)
1052        {
1053        ptr += 3;
1054        dup_parens = TRUE;
1055        }
1056    
1057      /* Handle a normal, unnamed capturing parenthesis */
1058    
1059      else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1060        {
1061        *count += 1;
1062        if (name == NULL && *count == lorn) return *count;
1063        ptr++;
1064        }
1065    
1066      /* Handle a condition. If it is an assertion, just carry on so that it
1067      is processed as normal. If not, skip to the closing parenthesis of the
1068      condition (there can't be any nested parens). */
1069    
1070      else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1071        {
1072        ptr += 2;
1073        if (ptr[1] != CHAR_QUESTION_MARK)
1074          {
1075          while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1076          if (*ptr != 0) ptr++;
1077          }
1078        }
1079    
1080      /* We have either (? or (* and not a condition */
1081    
1082      else
1083        {
1084        ptr += 2;
1085        if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */
1086    
1087        /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1088    
1089        if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1090            ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1091          {
1092          int term;
1093          const uschar *thisname;
1094          *count += 1;
1095          if (name == NULL && *count == lorn) return *count;
1096          term = *ptr++;
1097          if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1098          thisname = ptr;
1099          while (*ptr != term) ptr++;
1100          if (name != NULL && lorn == ptr - thisname &&
1101              strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1102            return *count;
1103          term++;
1104          }
1105        }
1106      }
1107    
1108    /* Past any initial parenthesis handling, scan for parentheses or vertical
1109    bars. */
1110    
1111    for (; *ptr != 0; ptr++)
1112      {
1113    /* Skip over backslashed characters and also entire \Q...\E */    /* Skip over backslashed characters and also entire \Q...\E */
1114    
1115    if (*ptr == CHAR_BACKSLASH)    if (*ptr == CHAR_BACKSLASH)
1116      {      {
1117      if (*(++ptr) == 0) return -1;      if (*(++ptr) == 0) goto FAIL_EXIT;
1118      if (*ptr == CHAR_Q) for (;;)      if (*ptr == CHAR_Q) for (;;)
1119        {        {
1120        while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};        while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1121        if (*ptr == 0) return -1;        if (*ptr == 0) goto FAIL_EXIT;
1122        if (*(++ptr) == CHAR_E) break;        if (*(++ptr) == CHAR_E) break;
1123        }        }
1124      continue;      continue;
# Line 1057  for (; *ptr != 0; ptr++) Line 1127  for (; *ptr != 0; ptr++)
1127    /* Skip over character classes; this logic must be similar to the way they    /* Skip over character classes; this logic must be similar to the way they
1128    are handled for real. If the first character is '^', skip it. Also, if the    are handled for real. If the first character is '^', skip it. Also, if the
1129    first few characters (either before or after ^) are \Q\E or \E we skip them    first few characters (either before or after ^) are \Q\E or \E we skip them
1130    too. This makes for compatibility with Perl. Note the use of STR macros to    too. This makes for compatibility with Perl. Note the use of STR macros to
1131    encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */    encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1132    
1133    if (*ptr == CHAR_LEFT_SQUARE_BRACKET)    if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
# Line 1065  for (; *ptr != 0; ptr++) Line 1135  for (; *ptr != 0; ptr++)
1135      BOOL negate_class = FALSE;      BOOL negate_class = FALSE;
1136      for (;;)      for (;;)
1137        {        {
1138        int c = *(++ptr);        if (ptr[1] == CHAR_BACKSLASH)
       if (c == CHAR_BACKSLASH)  
1139          {          {
1140          if (ptr[1] == CHAR_E)          if (ptr[2] == CHAR_E)
1141            ptr++;            ptr+= 2;
1142          else if (strncmp((const char *)ptr+1,          else if (strncmp((const char *)ptr+2,
1143                   STR_Q STR_BACKSLASH STR_E, 3) == 0)                   STR_Q STR_BACKSLASH STR_E, 3) == 0)
1144            ptr += 3;            ptr += 4;
1145          else          else
1146            break;            break;
1147          }          }
1148        else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)        else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1149            {
1150          negate_class = TRUE;          negate_class = TRUE;
1151            ptr++;
1152            }
1153        else break;        else break;
1154        }        }
1155    
1156      /* If the next character is ']', it is a data character that must be      /* If the next character is ']', it is a data character that must be
1157      skipped, except in JavaScript compatibility mode. */      skipped, except in JavaScript compatibility mode. */
1158    
1159      if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&      if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1160          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1161        ptr++;        ptr++;
1162    
# Line 1093  for (; *ptr != 0; ptr++) Line 1165  for (; *ptr != 0; ptr++)
1165        if (*ptr == 0) return -1;        if (*ptr == 0) return -1;
1166        if (*ptr == CHAR_BACKSLASH)        if (*ptr == CHAR_BACKSLASH)
1167          {          {
1168          if (*(++ptr) == 0) return -1;          if (*(++ptr) == 0) goto FAIL_EXIT;
1169          if (*ptr == CHAR_Q) for (;;)          if (*ptr == CHAR_Q) for (;;)
1170            {            {
1171            while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};            while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1172            if (*ptr == 0) return -1;            if (*ptr == 0) goto FAIL_EXIT;
1173            if (*(++ptr) == CHAR_E) break;            if (*(++ptr) == CHAR_E) break;
1174            }            }
1175          continue;          continue;
# Line 1111  for (; *ptr != 0; ptr++) Line 1183  for (; *ptr != 0; ptr++)
1183    if (xmode && *ptr == CHAR_NUMBER_SIGN)    if (xmode && *ptr == CHAR_NUMBER_SIGN)
1184      {      {
1185      while (*(++ptr) != 0 && *ptr != CHAR_NL) {};      while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1186      if (*ptr == 0) return -1;      if (*ptr == 0) goto FAIL_EXIT;
1187      continue;      continue;
1188      }      }
1189    
1190    /* An opening parens must now be a real metacharacter */    /* Check for the special metacharacters */
1191    
1192    if (*ptr != CHAR_LEFT_PARENTHESIS) continue;    if (*ptr == CHAR_LEFT_PARENTHESIS)
   if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)  
1193      {      {
1194      count++;      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1195      if (name == NULL && count == lorn) return count;      if (rc > 0) return rc;
1196      continue;      if (*ptr == 0) goto FAIL_EXIT;
1197      }      }
1198    
1199    ptr += 2;    else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1200    if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */      {
1201        if (dup_parens && *count < hwm_count) *count = hwm_count;
1202        *ptrptr = ptr;
1203        return -1;
1204        }
1205    
1206    /* We have to disambiguate (?<! and (?<= from (?<name> */    else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1207        {
1208        if (*count > hwm_count) hwm_count = *count;
1209        *count = start_count;
1210        }
1211      }
1212    
1213    if ((*ptr != CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_EXCLAMATION_MARK ||  FAIL_EXIT:
1214        ptr[1] == CHAR_EQUALS_SIGN) && *ptr != CHAR_APOSTROPHE)  *ptrptr = ptr;
1215      continue;  return -1;
1216    }
1217    
1218    
1219    
1220    
1221    /*************************************************
1222    *       Find forward referenced subpattern       *
1223    *************************************************/
1224    
1225    count++;  /* This function scans along a pattern's text looking for capturing
1226    subpatterns, and counting them. If it finds a named pattern that matches the
1227    name it is given, it returns its number. Alternatively, if the name is NULL, it
1228    returns when it reaches a given numbered subpattern. This is used for forward
1229    references to subpatterns. We used to be able to start this scan from the
1230    current compiling point, using the current count value from cd->bracount, and
1231    do it all in a single loop, but the addition of the possibility of duplicate
1232    subpattern numbers means that we have to scan from the very start, in order to
1233    take account of such duplicates, and to use a recursive function to keep track
1234    of the different types of group.
1235    
1236    if (name == NULL && count == lorn) return count;  Arguments:
1237    term = *ptr++;    cd           compile background data
1238    if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;    name         name to seek, or NULL if seeking a numbered subpattern
1239    thisname = ptr;    lorn         name length, or subpattern number if name is NULL
1240    while (*ptr != term) ptr++;    xmode        TRUE if we are in /x mode
1241    if (name != NULL && lorn == ptr - thisname &&  
1242        strncmp((const char *)name, (const char *)thisname, lorn) == 0)  Returns:       the number of the found subpattern, or -1 if not found
1243      return count;  */
1244    
1245    static int
1246    find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1247    {
1248    uschar *ptr = (uschar *)cd->start_pattern;
1249    int count = 0;
1250    int rc;
1251    
1252    /* If the pattern does not start with an opening parenthesis, the first call
1253    to find_parens_sub() will scan right to the end (if necessary). However, if it
1254    does start with a parenthesis, find_parens_sub() will return when it hits the
1255    matching closing parens. That is why we have to have a loop. */
1256    
1257    for (;;)
1258      {
1259      rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1260      if (rc > 0 || *ptr++ == 0) break;
1261    }    }
1262    
1263  return -1;  return rc;
1264  }  }
1265    
1266    
1267    
1268    
1269  /*************************************************  /*************************************************
1270  *      Find first significant op code            *  *      Find first significant op code            *
1271  *************************************************/  *************************************************/
# Line 1216  for (;;) Line 1331  for (;;)
1331    
1332    
1333  /*************************************************  /*************************************************
1334  *        Find the fixed length of a pattern      *  *        Find the fixed length of a branch       *
1335  *************************************************/  *************************************************/
1336    
1337  /* Scan a pattern and compute the fixed length of subject that will match it,  /* Scan a branch and compute the fixed length of subject that will match it,
1338  if the length is fixed. This is needed for dealing with backward assertions.  if the length is fixed. This is needed for dealing with backward assertions.
1339  In UTF8 mode, the result is in characters rather than bytes.  In UTF8 mode, the result is in characters rather than bytes. The branch is
1340    temporarily terminated with OP_END when this function is called.
1341    
1342    This function is called when a backward assertion is encountered, so that if it
1343    fails, the error message can point to the correct place in the pattern.
1344    However, we cannot do this when the assertion contains subroutine calls,
1345    because they can be forward references. We solve this by remembering this case
1346    and doing the check at the end; a flag specifies which mode we are running in.
1347    
1348  Arguments:  Arguments:
1349    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
1350    options  the compiling options    options  the compiling options
1351      atend    TRUE if called when the pattern is complete
1352      cd       the "compile data" structure
1353    
1354  Returns:   the fixed length, or -1 if there is no fixed length,  Returns:   the fixed length,
1355                 or -1 if there is no fixed length,
1356               or -2 if \C was encountered               or -2 if \C was encountered
1357                 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1358  */  */
1359    
1360  static int  static int
1361  find_fixedlength(uschar *code, int options)  find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1362  {  {
1363  int length = -1;  int length = -1;
1364    
# Line 1245  branch, check the length against that of Line 1371  branch, check the length against that of
1371  for (;;)  for (;;)
1372    {    {
1373    int d;    int d;
1374      uschar *ce, *cs;
1375    register int op = *cc;    register int op = *cc;
1376    switch (op)    switch (op)
1377      {      {
# Line 1252  for (;;) Line 1379  for (;;)
1379      case OP_BRA:      case OP_BRA:
1380      case OP_ONCE:      case OP_ONCE:
1381      case OP_COND:      case OP_COND:
1382      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1383      if (d < 0) return d;      if (d < 0) return d;
1384      branchlength += d;      branchlength += d;
1385      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 1274  for (;;) Line 1401  for (;;)
1401      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
1402      branchlength = 0;      branchlength = 0;
1403      break;      break;
1404    
1405        /* A true recursion implies not fixed length, but a subroutine call may
1406        be OK. If the subroutine is a forward reference, we can't deal with
1407        it until the end of the pattern, so return -3. */
1408    
1409        case OP_RECURSE:
1410        if (!atend) return -3;
1411        cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1412        do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */
1413        if (cc > cs && cc < ce) return -1;                /* Recursion */
1414        d = find_fixedlength(cs + 2, options, atend, cd);
1415        if (d < 0) return d;
1416        branchlength += d;
1417        cc += 1 + LINK_SIZE;
1418        break;
1419    
1420      /* Skip over assertive subpatterns */      /* Skip over assertive subpatterns */
1421    
# Line 1311  for (;;) Line 1453  for (;;)
1453      branchlength++;      branchlength++;
1454      cc += 2;      cc += 2;
1455  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1456      if ((options & PCRE_UTF8) != 0)      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1457        {        cc += _pcre_utf8_table4[cc[-1] & 0x3f];
       while ((*cc & 0xc0) == 0x80) cc++;  
       }  
1458  #endif  #endif
1459      break;      break;
1460    
# Line 1325  for (;;) Line 1465  for (;;)
1465      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1466      cc += 4;      cc += 4;
1467  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1468      if ((options & PCRE_UTF8) != 0)      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1469        {        cc += _pcre_utf8_table4[cc[-1] & 0x3f];
       while((*cc & 0x80) == 0x80) cc++;  
       }  
1470  #endif  #endif
1471      break;      break;
1472    
# Line 1407  for (;;) Line 1545  for (;;)
1545    
1546    
1547  /*************************************************  /*************************************************
1548  *    Scan compiled regex for numbered bracket    *  *    Scan compiled regex for specific bracket    *
1549  *************************************************/  *************************************************/
1550    
1551  /* This little function scans through a compiled pattern until it finds a  /* This little function scans through a compiled pattern until it finds a
1552  capturing bracket with the given number.  capturing bracket with the given number, or, if the number is negative, an
1553    instance of OP_REVERSE for a lookbehind.
1554    
1555  Arguments:  Arguments:
1556    code        points to start of expression    code        points to start of expression
1557    utf8        TRUE in UTF-8 mode    utf8        TRUE in UTF-8 mode
1558    number      the required bracket number    number      the required bracket number or negative to find a lookbehind
1559    
1560  Returns:      pointer to the opcode for the bracket, or NULL if not found  Returns:      pointer to the opcode for the bracket, or NULL if not found
1561  */  */
# Line 1434  for (;;) Line 1573  for (;;)
1573    the table is zero; the actual length is stored in the compiled code. */    the table is zero; the actual length is stored in the compiled code. */
1574    
1575    if (c == OP_XCLASS) code += GET(code, 1);    if (c == OP_XCLASS) code += GET(code, 1);
1576    
1577      /* Handle a lookbehind (OP_REVERSE), which is sought when number is negative */
1578    
1579      else if (c == OP_REVERSE)
1580        {
1581        if (number < 0) return (uschar *)code;
1582        code += _pcre_OP_lengths[c];
1583        }
1584    
1585    /* Handle capturing bracket */    /* Handle capturing bracket */
1586    
# Line 1664  for (code = first_significant_code(code Line 1811  for (code = first_significant_code(code
1811      BOOL empty_branch;      BOOL empty_branch;
1812      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1813    
1814      /* Scan a closed bracket */      /* If a conditional group has only one branch, there is a second, implied,
1815        empty branch, so just skip over the conditional, because it could be empty.
1816        Otherwise, scan the individual branches of the group. */
1817    
1818      empty_branch = FALSE;      if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
     do  
       {  
       if (!empty_branch && could_be_empty_branch(code, endcode, utf8))  
         empty_branch = TRUE;  
1819        code += GET(code, 1);        code += GET(code, 1);
1820        else
1821          {
1822          empty_branch = FALSE;
1823          do
1824            {
1825            if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1826              empty_branch = TRUE;
1827            code += GET(code, 1);
1828            }
1829          while (*code == OP_ALT);
1830          if (!empty_branch) return FALSE;   /* All branches are non-empty */
1831        }        }
1832      while (*code == OP_ALT);  
     if (!empty_branch) return FALSE;   /* All branches are non-empty */  
1833      c = *code;      c = *code;
1834      continue;      continue;
1835      }      }
# Line 1792  for (code = first_significant_code(code Line 1947  for (code = first_significant_code(code
1947      case OP_QUERY:      case OP_QUERY:
1948      case OP_MINQUERY:      case OP_MINQUERY:
1949      case OP_POSQUERY:      case OP_POSQUERY:
1950        if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
1951        break;
1952    
1953      case OP_UPTO:      case OP_UPTO:
1954      case OP_MINUPTO:      case OP_MINUPTO:
1955      case OP_POSUPTO:      case OP_POSUPTO:
1956      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
1957      break;      break;
1958  #endif  #endif
1959      }      }
# Line 2173  if ((options & PCRE_EXTENDED) != 0) Line 2331  if ((options & PCRE_EXTENDED) != 0)
2331    
2332  /* If the next thing is itself optional, we have to give up. */  /* If the next thing is itself optional, we have to give up. */
2333    
2334  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2335    strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)    strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2336      return FALSE;      return FALSE;
2337    
# Line 2639  for (;; ptr++) Line 2797  for (;; ptr++)
2797    /* Fill in length of a previous callout, except when the next thing is    /* Fill in length of a previous callout, except when the next thing is
2798    a quantifier. */    a quantifier. */
2799    
2800    is_quantifier =    is_quantifier =
2801      c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||      c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2802      (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));      (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2803    
# Line 2759  for (;; ptr++) Line 2917  for (;; ptr++)
2917      /* PCRE supports POSIX class stuff inside a class. Perl gives an error if      /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2918      they are encountered at the top level, so we'll do that too. */      they are encountered at the top level, so we'll do that too. */
2919    
2920      if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||      if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2921           ptr[1] == CHAR_EQUALS_SIGN) &&           ptr[1] == CHAR_EQUALS_SIGN) &&
2922          check_posix_syntax(ptr, &tempptr))          check_posix_syntax(ptr, &tempptr))
2923        {        {
# Line 2777  for (;; ptr++) Line 2935  for (;; ptr++)
2935        c = *(++ptr);        c = *(++ptr);
2936        if (c == CHAR_BACKSLASH)        if (c == CHAR_BACKSLASH)
2937          {          {
2938          if (ptr[1] == CHAR_E)          if (ptr[1] == CHAR_E)
2939            ptr++;            ptr++;
2940          else if (strncmp((const char *)ptr+1,          else if (strncmp((const char *)ptr+1,
2941                            STR_Q STR_BACKSLASH STR_E, 3) == 0)                            STR_Q STR_BACKSLASH STR_E, 3) == 0)
2942            ptr += 3;            ptr += 3;
2943          else          else
2944            break;            break;
2945          }          }
2946        else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)        else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
# Line 2795  for (;; ptr++) Line 2953  for (;; ptr++)
2953      that. In JS mode, [] must always fail, so generate OP_FAIL, whereas      that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2954      [^] must match any character, so generate OP_ALLANY. */      [^] must match any character, so generate OP_ALLANY. */
2955    
2956      if (c == CHAR_RIGHT_SQUARE_BRACKET &&      if (c == CHAR_RIGHT_SQUARE_BRACKET &&
2957          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2958        {        {
2959        *code++ = negate_class? OP_ALLANY : OP_FAIL;        *code++ = negate_class? OP_ALLANY : OP_FAIL;
# Line 2877  for (;; ptr++) Line 3035  for (;; ptr++)
3035        5.6 and 5.8 do. */        5.6 and 5.8 do. */
3036    
3037        if (c == CHAR_LEFT_SQUARE_BRACKET &&        if (c == CHAR_LEFT_SQUARE_BRACKET &&
3038            (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||            (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3039             ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))             ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3040          {          {
3041          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
# Line 3227  for (;; ptr++) Line 3385  for (;; ptr++)
3385          while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)          while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3386            {            {
3387            ptr += 2;            ptr += 2;
3388            if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)            if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3389              { ptr += 2; continue; }              { ptr += 2; continue; }
3390            inescq = TRUE;            inescq = TRUE;
3391            break;            break;
# Line 3749  we set the flag only if there is a liter Line 3907  we set the flag only if there is a liter
3907    
3908        if (repeat_max == 0) goto END_REPEAT;        if (repeat_max == 0) goto END_REPEAT;
3909    
3910          /*--------------------------------------------------------------------*/
3911          /* This code is obsolete from release 8.00; the restriction was finally
3912          removed: */
3913    
3914        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
3915        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
3916    
3917        if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;        /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
3918          /*--------------------------------------------------------------------*/
3919    
3920        /* Combine the op_type with the repeat_type */        /* Combine the op_type with the repeat_type */
3921    
# Line 3899  we set the flag only if there is a liter Line 4062  we set the flag only if there is a liter
4062          goto END_REPEAT;          goto END_REPEAT;
4063          }          }
4064    
4065          /*--------------------------------------------------------------------*/
4066          /* This code is obsolete from release 8.00; the restriction was finally
4067          removed: */
4068    
4069        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
4070        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
4071    
4072        if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;        /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4073          /*--------------------------------------------------------------------*/
4074    
4075        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
4076          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
# Line 4217  we set the flag only if there is a liter Line 4385  we set the flag only if there is a liter
4385      if (possessive_quantifier)      if (possessive_quantifier)
4386        {        {
4387        int len;        int len;
4388        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||  
4389            *tempcode == OP_NOTEXACT)        if (*tempcode == OP_TYPEEXACT)
4390          tempcode += _pcre_OP_lengths[*tempcode] +          tempcode += _pcre_OP_lengths[*tempcode] +
4391            ((*tempcode == OP_TYPEEXACT &&            ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);
4392               (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);  
4393          else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
4394            {
4395            tempcode += _pcre_OP_lengths[*tempcode];
4396    #ifdef SUPPORT_UTF8
4397            if (utf8 && tempcode[-1] >= 0xc0)
4398              tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];
4399    #endif
4400            }
4401    
4402        len = code - tempcode;        len = code - tempcode;
4403        if (len > 0) switch (*tempcode)        if (len > 0) switch (*tempcode)
4404          {          {
# Line 4299  we set the flag only if there is a liter Line 4476  we set the flag only if there is a liter
4476          if (namelen == verbs[i].len &&          if (namelen == verbs[i].len &&
4477              strncmp((char *)name, vn, namelen) == 0)              strncmp((char *)name, vn, namelen) == 0)
4478            {            {
4479            *code = verbs[i].op;            /* Check for open captures before ACCEPT */
4480            if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;  
4481              if (verbs[i].op == OP_ACCEPT)
4482                {
4483                open_capitem *oc;
4484                cd->had_accept = TRUE;
4485                for (oc = cd->open_caps; oc != NULL; oc = oc->next)
4486                  {
4487                  *code++ = OP_CLOSE;
4488                  PUT2INC(code, 0, oc->number);
4489                  }
4490                }
4491              *code++ = verbs[i].op;
4492            break;            break;
4493            }            }
4494          vn += verbs[i].len + 1;          vn += verbs[i].len + 1;
# Line 4427  we set the flag only if there is a liter Line 4615  we set the flag only if there is a liter
4615            }            }
4616          namelen = ptr - name;          namelen = ptr - name;
4617    
4618          if ((terminator > 0 && *ptr++ != terminator) ||          if ((terminator > 0 && *ptr++ != terminator) ||
4619              *ptr++ != CHAR_RIGHT_PARENTHESIS)              *ptr++ != CHAR_RIGHT_PARENTHESIS)
4620            {            {
4621            ptr--;      /* Error offset */            ptr--;      /* Error offset */
# Line 4481  we set the flag only if there is a liter Line 4669  we set the flag only if there is a liter
4669    
4670          /* Search the pattern for a forward reference */          /* Search the pattern for a forward reference */
4671    
4672          else if ((i = find_parens(ptr, cd, name, namelen,          else if ((i = find_parens(cd, name, namelen,
4673                          (options & PCRE_EXTENDED) != 0)) > 0)                          (options & PCRE_EXTENDED) != 0)) > 0)
4674            {            {
4675            PUT2(code, 2+LINK_SIZE, i);            PUT2(code, 2+LINK_SIZE, i);
# Line 4626  we set the flag only if there is a liter Line 4814  we set the flag only if there is a liter
4814    
4815          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4816          case CHAR_P:              /* Python-style named subpattern handling */          case CHAR_P:              /* Python-style named subpattern handling */
4817          if (*(++ptr) == CHAR_EQUALS_SIGN ||          if (*(++ptr) == CHAR_EQUALS_SIGN ||
4818              *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */              *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
4819            {            {
4820            is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;            is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
# Line 4645  we set the flag only if there is a liter Line 4833  we set the flag only if there is a liter
4833          DEFINE_NAME:    /* Come here from (?< handling */          DEFINE_NAME:    /* Come here from (?< handling */
4834          case CHAR_APOSTROPHE:          case CHAR_APOSTROPHE:
4835            {            {
4836            terminator = (*ptr == CHAR_LESS_THAN_SIGN)?            terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
4837              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
4838            name = ++ptr;            name = ++ptr;
4839    
# Line 4780  we set the flag only if there is a liter Line 4968  we set the flag only if there is a liter
4968              recno = GET2(slot, 0);              recno = GET2(slot, 0);
4969              }              }
4970            else if ((recno =                /* Forward back reference */            else if ((recno =                /* Forward back reference */
4971                      find_parens(ptr, cd, name, namelen,                      find_parens(cd, name, namelen,
4972                        (options & PCRE_EXTENDED) != 0)) <= 0)                        (options & PCRE_EXTENDED) != 0)) <= 0)
4973              {              {
4974              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
# Line 4890  we set the flag only if there is a liter Line 5078  we set the flag only if there is a liter
5078    
5079              if (called == NULL)              if (called == NULL)
5080                {                {
5081                if (find_parens(ptr, cd, NULL, recno,                if (find_parens(cd, NULL, recno,
5082                      (options & PCRE_EXTENDED) != 0) < 0)                      (options & PCRE_EXTENDED) != 0) < 0)
5083                  {                  {
5084                  *errorcodeptr = ERR15;                  *errorcodeptr = ERR15;
# Line 5240  we set the flag only if there is a liter Line 5428  we set the flag only if there is a liter
5428        {        {
5429        if (-c == ESC_Q)            /* Handle start of quoted string */        if (-c == ESC_Q)            /* Handle start of quoted string */
5430          {          {
5431          if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)          if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5432            ptr += 2;               /* avoid empty string */            ptr += 2;               /* avoid empty string */
5433              else inescq = TRUE;              else inescq = TRUE;
5434          continue;          continue;
# Line 5270  we set the flag only if there is a liter Line 5458  we set the flag only if there is a liter
5458          {          {
5459          const uschar *p;          const uschar *p;
5460          save_hwm = cd->hwm;   /* Normally this is set when '(' is read */          save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
5461          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5462            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5463    
5464          /* These two statements stop the compiler from warning about possibly          /* These two statements stop the compiler from warning about possibly
# Line 5321  we set the flag only if there is a liter Line 5509  we set the flag only if there is a liter
5509        /* \k<name> or \k'name' is a back reference by name (Perl syntax).        /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5510        We also support \k{name} (.NET syntax) */        We also support \k{name} (.NET syntax) */
5511    
5512        if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||        if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||
5513            ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))            ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))
5514          {          {
5515          is_recurse = FALSE;          is_recurse = FALSE;
5516          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5517            CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?            CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
5518            CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;            CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
5519          goto NAMED_REF_OR_RECURSE;          goto NAMED_REF_OR_RECURSE;
5520          }          }
# Line 5528  uschar *code = *codeptr; Line 5716  uschar *code = *codeptr;
5716  uschar *last_branch = code;  uschar *last_branch = code;
5717  uschar *start_bracket = code;  uschar *start_bracket = code;
5718  uschar *reverse_count = NULL;  uschar *reverse_count = NULL;
5719    open_capitem capitem;
5720    int capnumber = 0;
5721  int firstbyte, reqbyte;  int firstbyte, reqbyte;
5722  int branchfirstbyte, branchreqbyte;  int branchfirstbyte, branchreqbyte;
5723  int length;  int length;
# Line 5554  the code that abstracts option settings Line 5744  the code that abstracts option settings
5744  them global. It tests the value of length for (2 + 2*LINK_SIZE) in the  them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5745  pre-compile phase to find out whether anything has yet been compiled or not. */  pre-compile phase to find out whether anything has yet been compiled or not. */
5746    
5747    /* If this is a capturing subpattern, add to the chain of open capturing items
5748    so that we can detect them if (*ACCEPT) is encountered. */
5749    
5750    if (*code == OP_CBRA)
5751      {
5752      capnumber = GET2(code, 1 + LINK_SIZE);
5753      capitem.number = capnumber;
5754      capitem.next = cd->open_caps;
5755      cd->open_caps = &capitem;
5756      }
5757    
5758  /* Offset is set zero to mark that this bracket is still open */  /* Offset is set zero to mark that this bracket is still open */
5759    
5760  PUT(code, 1, 0);  PUT(code, 1, 0);
# Line 5648  for (;;) Line 5849  for (;;)
5849    
5850      /* If lookbehind, check that this branch matches a fixed-length string, and      /* If lookbehind, check that this branch matches a fixed-length string, and
5851      put the length into the OP_REVERSE item. Temporarily mark the end of the      put the length into the OP_REVERSE item. Temporarily mark the end of the
5852      branch with OP_END. */      branch with OP_END. If the branch contains OP_RECURSE, the result is -3
5853        because there may be forward references that we can't check here. Set a
5854        flag to cause another lookbehind check at the end. Why not do it all at the
5855        end? Because common, erroneous checks are picked up here and the offset of
5856        the problem can be shown. */
5857    
5858      if (lookbehind)      if (lookbehind)
5859        {        {
5860        int fixed_length;        int fixed_length;
5861        *code = OP_END;        *code = OP_END;
5862        fixed_length = find_fixedlength(last_branch, options);        fixed_length = find_fixedlength(last_branch, options, FALSE, cd);
5863        DPRINTF(("fixed length = %d\n", fixed_length));        DPRINTF(("fixed length = %d\n", fixed_length));
5864        if (fixed_length < 0)        if (fixed_length == -3)
5865            {
5866            cd->check_lookbehind = TRUE;
5867            }
5868          else if (fixed_length < 0)
5869          {          {
5870          *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;          *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5871          *ptrptr = ptr;          *ptrptr = ptr;
5872          return FALSE;          return FALSE;
5873          }          }
5874        PUT(reverse_count, 0, fixed_length);        else { PUT(reverse_count, 0, fixed_length); }
5875        }        }
5876      }      }
5877    
# Line 5689  for (;;) Line 5898  for (;;)
5898          }          }
5899        while (branch_length > 0);        while (branch_length > 0);
5900        }        }
5901    
5902        /* If it was a capturing subpattern, remove it from the chain. */
5903    
5904        if (capnumber > 0) cd->open_caps = cd->open_caps->next;
5905    
5906      /* Fill in the ket */      /* Fill in the ket */
5907    
# Line 5879  do { Line 6092  do {
6092     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6093       NULL, 0, FALSE);       NULL, 0, FALSE);
6094     register int op = *scode;     register int op = *scode;
6095    
6096     /* If we are at the start of a conditional assertion group, *both* the     /* If we are at the start of a conditional assertion group, *both* the
6097     conditional assertion *and* what follows the condition must satisfy the test     conditional assertion *and* what follows the condition must satisfy the test
6098     for start of line. Other kinds of condition fail. Note that there may be an     for start of line. Other kinds of condition fail. Note that there may be an
# Line 5887  do { Line 6100  do {
6100    
6101     if (op == OP_COND)     if (op == OP_COND)
6102       {       {
6103       scode += 1 + LINK_SIZE;       scode += 1 + LINK_SIZE;
6104       if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT];       if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT];
6105       switch (*scode)       switch (*scode)
6106         {         {
6107         case OP_CREF:         case OP_CREF:
6108         case OP_RREF:         case OP_RREF:
6109         case OP_DEF:         case OP_DEF:
6110         return FALSE;         return FALSE;
6111    
6112         default:     /* Assertion */         default:     /* Assertion */
6113         if (!is_startline(scode, bracket_map, backref_map)) return FALSE;         if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6114         do scode += GET(scode, 1); while (*scode == OP_ALT);         do scode += GET(scode, 1); while (*scode == OP_ALT);
6115         scode += 1 + LINK_SIZE;         scode += 1 + LINK_SIZE;
6116         break;         break;
6117         }         }
6118       scode = first_significant_code(scode, NULL, 0, FALSE);       scode = first_significant_code(scode, NULL, 0, FALSE);
6119       op = *scode;       op = *scode;
6120       }       }
6121    
6122     /* Non-capturing brackets */     /* Non-capturing brackets */
6123    
# Line 5925  do { Line 6138  do {
6138     /* Other brackets */     /* Other brackets */
6139    
6140     else if (op == OP_ASSERT || op == OP_ONCE)     else if (op == OP_ASSERT || op == OP_ONCE)
6141       {       {
6142       if (!is_startline(scode, bracket_map, backref_map)) return FALSE;       if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6143       }       }
6144    
6145     /* .* means "start at start or after \n" if it isn't in brackets that     /* .* means "start at start or after \n" if it isn't in brackets that
# Line 6061  int length = 1;  /* For final END opcode Line 6274  int length = 1;  /* For final END opcode
6274  int firstbyte, reqbyte, newline;  int firstbyte, reqbyte, newline;
6275  int errorcode = 0;  int errorcode = 0;
6276  int skipatstart = 0;  int skipatstart = 0;
6277  #ifdef SUPPORT_UTF8  BOOL utf8 = (options & PCRE_UTF8) != 0;
 BOOL utf8;  
 #endif  
6278  size_t size;  size_t size;
6279  uschar *code;  uschar *code;
6280  const uschar *codestart;  const uschar *codestart;
# Line 6106  if (erroroffset == NULL) Line 6317  if (erroroffset == NULL)
6317    
6318  *erroroffset = 0;  *erroroffset = 0;
6319    
 /* Can't support UTF8 unless PCRE has been compiled to include the code. */  
   
 #ifdef SUPPORT_UTF8  
 utf8 = (options & PCRE_UTF8) != 0;  
 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&  
      (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)  
   {  
   errorcode = ERR44;  
   goto PCRE_EARLY_ERROR_RETURN2;  
   }  
 #else  
 if ((options & PCRE_UTF8) != 0)  
   {  
   errorcode = ERR32;  
   goto PCRE_EARLY_ERROR_RETURN;  
   }  
 #endif  
   
 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)  
   {  
   errorcode = ERR17;  
   goto PCRE_EARLY_ERROR_RETURN;  
   }  
   
6320  /* Set up pointers to the individual character tables */  /* Set up pointers to the individual character tables */
6321    
6322  if (tables == NULL) tables = _pcre_default_tables;  if (tables == NULL) tables = _pcre_default_tables;
# Line 6138  cd->fcc = tables + fcc_offset; Line 6325  cd->fcc = tables + fcc_offset;
6325  cd->cbits = tables + cbits_offset;  cd->cbits = tables + cbits_offset;
6326  cd->ctypes = tables + ctypes_offset;  cd->ctypes = tables + ctypes_offset;
6327    
6328    /* Check that all undefined public option bits are zero */
6329    
6330    if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
6331      {
6332      errorcode = ERR17;
6333      goto PCRE_EARLY_ERROR_RETURN;
6334      }
6335    
6336  /* Check for global one-time settings at the start of the pattern, and remember  /* Check for global one-time settings at the start of the pattern, and remember
6337  the offset for later. */  the offset for later. */
6338    
6339  while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&  while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
6340         ptr[skipatstart+1] == CHAR_ASTERISK)         ptr[skipatstart+1] == CHAR_ASTERISK)
6341    {    {
6342    int newnl = 0;    int newnl = 0;
6343    int newbsr = 0;    int newbsr = 0;
6344    
6345      if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
6346        { skipatstart += 7; options |= PCRE_UTF8; continue; }
6347    
6348    if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)    if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
6349      { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }      { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
6350    else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3)  == 0)    else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3)  == 0)
# Line 6170  while (ptr[skipatstart] == CHAR_LEFT_PAR Line 6368  while (ptr[skipatstart] == CHAR_LEFT_PAR
6368    else break;    else break;
6369    }    }
6370    
6371    /* Can't support UTF8 unless PCRE has been compiled to include the code. */
6372    
6373    #ifdef SUPPORT_UTF8
6374    if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
6375         (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
6376      {
6377      errorcode = ERR44;
6378      goto PCRE_EARLY_ERROR_RETURN2;
6379      }
6380    #else
6381    if (utf8)
6382      {
6383      errorcode = ERR32;
6384      goto PCRE_EARLY_ERROR_RETURN;
6385      }
6386    #endif
6387    
6388  /* Check validity of \R options. */  /* Check validity of \R options. */
6389    
6390  switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))  switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
# Line 6252  cd->end_pattern = (const uschar *)(patte Line 6467  cd->end_pattern = (const uschar *)(patte
6467  cd->req_varyopt = 0;  cd->req_varyopt = 0;
6468  cd->external_options = options;  cd->external_options = options;
6469  cd->external_flags = 0;  cd->external_flags = 0;
6470    cd->open_caps = NULL;
6471    
6472  /* Now do the pre-compile. On error, errorcode will be set non-zero, so we  /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
6473  don't need to look at the result of the function here. The initial options have  don't need to look at the result of the function here. The initial options have
# Line 6326  cd->start_code = codestart; Line 6542  cd->start_code = codestart;
6542  cd->hwm = cworkspace;  cd->hwm = cworkspace;
6543  cd->req_varyopt = 0;  cd->req_varyopt = 0;
6544  cd->had_accept = FALSE;  cd->had_accept = FALSE;
6545    cd->check_lookbehind = FALSE;
6546    cd->open_caps = NULL;
6547    
6548  /* Set up a starting, non-extracting bracket, then compile the expression. On  /* Set up a starting, non-extracting bracket, then compile the expression. On
6549  error, errorcode will be set non-zero, so we don't need to look at the result  error, errorcode will be set non-zero, so we don't need to look at the result
# Line 6364  while (errorcode == 0 && cd->hwm > cwork Line 6582  while (errorcode == 0 && cd->hwm > cwork
6582    cd->hwm -= LINK_SIZE;    cd->hwm -= LINK_SIZE;
6583    offset = GET(cd->hwm, 0);    offset = GET(cd->hwm, 0);
6584    recno = GET(codestart, offset);    recno = GET(codestart, offset);
6585    groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);    groupptr = find_bracket(codestart, utf8, recno);
6586    if (groupptr == NULL) errorcode = ERR53;    if (groupptr == NULL) errorcode = ERR53;
6587      else PUT(((uschar *)codestart), offset, groupptr - codestart);      else PUT(((uschar *)codestart), offset, groupptr - codestart);
6588    }    }
# Line 6374  subpattern. */ Line 6592  subpattern. */
6592    
6593  if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;  if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6594    
6595    /* If there were any lookbehind assertions that contained OP_RECURSE
6596    (recursions or subroutine calls), a flag is set for them to be checked here,
6597    because they may contain forward references. Actual recursions can't be fixed
6598    length, but subroutine calls can. It is done like this so that those without
6599    OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
6600    exceptional ones forgo this. We scan the pattern to check that they are fixed
6601    length, and set their lengths. */
6602    
6603    if (cd->check_lookbehind)
6604      {
6605      uschar *cc = (uschar *)codestart;
6606    
6607      /* Loop, searching for OP_REVERSE items, and process those that do not have
6608      their length set. (Actually, it will also re-process any that have a length
6609      of zero, but that is a pathological case, and it does no harm.) When we find
6610      one, we temporarily terminate the branch it is in while we scan it. */
6611    
6612      for (cc = (uschar *)find_bracket(codestart, utf8, -1);
6613           cc != NULL;
6614           cc = (uschar *)find_bracket(cc, utf8, -1))
6615        {
6616        if (GET(cc, 1) == 0)
6617          {
6618          int fixed_length;
6619          uschar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
6620          int end_op = *be;
6621          *be = OP_END;
6622          fixed_length = find_fixedlength(cc, re->options, TRUE, cd);
6623          *be = end_op;
6624          DPRINTF(("fixed length = %d\n", fixed_length));
6625          if (fixed_length < 0)
6626            {
6627            errorcode = (fixed_length == -2)? ERR36 : ERR25;
6628            break;
6629            }
6630          PUT(cc, 1, fixed_length);
6631          }
6632        cc += 1 + LINK_SIZE;
6633        }
6634      }
6635    
6636  /* Failed to compile, or error while post-processing */  /* Failed to compile, or error while post-processing */
6637    
6638  if (errorcode != 0)  if (errorcode != 0)

Legend:
Removed from v.391  
changed lines
  Added in v.454

  ViewVC Help
Powered by ViewVC 1.1.5