/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 395 by ph10, Fri Mar 20 11:22:42 2009 UTC revision 457 by ph10, Sat Oct 3 16:24:08 2009 UTC
# Line 341  static const char error_texts[] = Line 341  static const char error_texts[] =
341    "number is too big\0"    "number is too big\0"
342    "subpattern name expected\0"    "subpattern name expected\0"
343    "digit expected after (?+\0"    "digit expected after (?+\0"
344    "] is an invalid data character in JavaScript compatibility mode";    "] is an invalid data character in JavaScript compatibility mode\0"
345      /* 65 */
346      "different names for subpatterns of the same number are not allowed";
347    
348    
349  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 1009  return p; Line 1011  return p;
1011    
1012    
1013  /*************************************************  /*************************************************
1014  *       Find forward referenced subpattern       *  *  Subroutine for finding forward reference      *
1015  *************************************************/  *************************************************/
1016    
1017  /* This function scans along a pattern's text looking for capturing  /* This recursive function is called only from find_parens() below. The
1018    top-level call starts at the beginning of the pattern. All other calls must
1019    start at a parenthesis. It scans along a pattern's text looking for capturing
1020  subpatterns, and counting them. If it finds a named pattern that matches the  subpatterns, and counting them. If it finds a named pattern that matches the
1021  name it is given, it returns its number. Alternatively, if the name is NULL, it  name it is given, it returns its number. Alternatively, if the name is NULL, it
1022  returns when it reaches a given numbered subpattern. This is used for forward  returns when it reaches a given numbered subpattern. We know that if (?P< is
1023  references to subpatterns. We know that if (?P< is encountered, the name will  encountered, the name will be terminated by '>' because that is checked in the
1024  be terminated by '>' because that is checked in the first pass.  first pass. Recursion is used to keep track of subpatterns that reset the
1025    capturing group numbers - the (?| feature.
1026    
1027  Arguments:  Arguments:
1028    ptr          current position in the pattern    ptrptr       address of the current character pointer (updated)
1029    cd           compile background data    cd           compile background data
1030    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1031    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1032    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1033      count        pointer to the current capturing subpattern number (updated)
1034    
1035  Returns:       the number of the named subpattern, or -1 if not found  Returns:       the number of the named subpattern, or -1 if not found
1036  */  */
1037    
1038  static int  static int
1039  find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn,  find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1040    BOOL xmode)    BOOL xmode, int *count)
1041  {  {
1042  const uschar *thisname;  uschar *ptr = *ptrptr;
1043  int count = cd->bracount;  int start_count = *count;
1044    int hwm_count = start_count;
1045    BOOL dup_parens = FALSE;
1046    
1047  for (; *ptr != 0; ptr++)  /* If the first character is a parenthesis, check on the type of group we are
1048    dealing with. The very first call may not start with a parenthesis. */
1049    
1050    if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1051    {    {
1052    int term;    if (ptr[1] == CHAR_QUESTION_MARK &&
1053          ptr[2] == CHAR_VERTICAL_LINE)
1054        {
1055        ptr += 3;
1056        dup_parens = TRUE;
1057        }
1058    
1059      /* Handle a normal, unnamed capturing parenthesis */
1060    
1061      else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1062        {
1063        *count += 1;
1064        if (name == NULL && *count == lorn) return *count;
1065        ptr++;
1066        }
1067    
1068      /* Handle a condition. If it is an assertion, just carry on so that it
1069      is processed as normal. If not, skip to the closing parenthesis of the
1070      condition (there can't be any nested parens). */
1071    
1072      else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1073        {
1074        ptr += 2;
1075        if (ptr[1] != CHAR_QUESTION_MARK)
1076          {
1077          while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1078          if (*ptr != 0) ptr++;
1079          }
1080        }
1081    
1082      /* We have either (? or (* and not a condition */
1083    
1084      else
1085        {
1086        ptr += 2;
1087        if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */
1088    
1089        /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1090    
1091        if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1092            ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1093          {
1094          int term;
1095          const uschar *thisname;
1096          *count += 1;
1097          if (name == NULL && *count == lorn) return *count;
1098          term = *ptr++;
1099          if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1100          thisname = ptr;
1101          while (*ptr != term) ptr++;
1102          if (name != NULL && lorn == ptr - thisname &&
1103              strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1104            return *count;
1105          term++;
1106          }
1107        }
1108      }
1109    
1110    /* Past any initial parenthesis handling, scan for parentheses or vertical
1111    bars. */
1112    
1113    for (; *ptr != 0; ptr++)
1114      {
1115    /* Skip over backslashed characters and also entire \Q...\E */    /* Skip over backslashed characters and also entire \Q...\E */
1116    
1117    if (*ptr == CHAR_BACKSLASH)    if (*ptr == CHAR_BACKSLASH)
1118      {      {
1119      if (*(++ptr) == 0) return -1;      if (*(++ptr) == 0) goto FAIL_EXIT;
1120      if (*ptr == CHAR_Q) for (;;)      if (*ptr == CHAR_Q) for (;;)
1121        {        {
1122        while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};        while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1123        if (*ptr == 0) return -1;        if (*ptr == 0) goto FAIL_EXIT;
1124        if (*(++ptr) == CHAR_E) break;        if (*(++ptr) == CHAR_E) break;
1125        }        }
1126      continue;      continue;
# Line 1065  for (; *ptr != 0; ptr++) Line 1137  for (; *ptr != 0; ptr++)
1137      BOOL negate_class = FALSE;      BOOL negate_class = FALSE;
1138      for (;;)      for (;;)
1139        {        {
1140        int c = *(++ptr);        if (ptr[1] == CHAR_BACKSLASH)
       if (c == CHAR_BACKSLASH)  
1141          {          {
1142          if (ptr[1] == CHAR_E)          if (ptr[2] == CHAR_E)
1143            ptr++;            ptr+= 2;
1144          else if (strncmp((const char *)ptr+1,          else if (strncmp((const char *)ptr+2,
1145                   STR_Q STR_BACKSLASH STR_E, 3) == 0)                   STR_Q STR_BACKSLASH STR_E, 3) == 0)
1146            ptr += 3;            ptr += 4;
1147          else          else
1148            break;            break;
1149          }          }
1150        else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)        else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1151            {
1152          negate_class = TRUE;          negate_class = TRUE;
1153            ptr++;
1154            }
1155        else break;        else break;
1156        }        }
1157    
# Line 1093  for (; *ptr != 0; ptr++) Line 1167  for (; *ptr != 0; ptr++)
1167        if (*ptr == 0) return -1;        if (*ptr == 0) return -1;
1168        if (*ptr == CHAR_BACKSLASH)        if (*ptr == CHAR_BACKSLASH)
1169          {          {
1170          if (*(++ptr) == 0) return -1;          if (*(++ptr) == 0) goto FAIL_EXIT;
1171          if (*ptr == CHAR_Q) for (;;)          if (*ptr == CHAR_Q) for (;;)
1172            {            {
1173            while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};            while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1174            if (*ptr == 0) return -1;            if (*ptr == 0) goto FAIL_EXIT;
1175            if (*(++ptr) == CHAR_E) break;            if (*(++ptr) == CHAR_E) break;
1176            }            }
1177          continue;          continue;
# Line 1111  for (; *ptr != 0; ptr++) Line 1185  for (; *ptr != 0; ptr++)
1185    if (xmode && *ptr == CHAR_NUMBER_SIGN)    if (xmode && *ptr == CHAR_NUMBER_SIGN)
1186      {      {
1187      while (*(++ptr) != 0 && *ptr != CHAR_NL) {};      while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1188      if (*ptr == 0) return -1;      if (*ptr == 0) goto FAIL_EXIT;
1189      continue;      continue;
1190      }      }
1191    
1192    /* An opening parens must now be a real metacharacter */    /* Check for the special metacharacters */
1193    
1194    if (*ptr != CHAR_LEFT_PARENTHESIS) continue;    if (*ptr == CHAR_LEFT_PARENTHESIS)
   if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)  
1195      {      {
1196      count++;      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1197      if (name == NULL && count == lorn) return count;      if (rc > 0) return rc;
1198      continue;      if (*ptr == 0) goto FAIL_EXIT;
1199      }      }
1200    
1201    ptr += 2;    else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1202    if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */      {
1203        if (dup_parens && *count < hwm_count) *count = hwm_count;
1204        *ptrptr = ptr;
1205        return -1;
1206        }
1207    
1208    /* We have to disambiguate (?<! and (?<= from (?<name> */    else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1209        {
1210        if (*count > hwm_count) hwm_count = *count;
1211        *count = start_count;
1212        }
1213      }
1214    
1215    if ((*ptr != CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_EXCLAMATION_MARK ||  FAIL_EXIT:
1216        ptr[1] == CHAR_EQUALS_SIGN) && *ptr != CHAR_APOSTROPHE)  *ptrptr = ptr;
1217      continue;  return -1;
1218    }
1219    
1220    
1221    
1222    
1223    /*************************************************
1224    *       Find forward referenced subpattern       *
1225    *************************************************/
1226    
1227    count++;  /* This function scans along a pattern's text looking for capturing
1228    subpatterns, and counting them. If it finds a named pattern that matches the
1229    name it is given, it returns its number. Alternatively, if the name is NULL, it
1230    returns when it reaches a given numbered subpattern. This is used for forward
1231    references to subpatterns. We used to be able to start this scan from the
1232    current compiling point, using the current count value from cd->bracount, and
1233    do it all in a single loop, but the addition of the possibility of duplicate
1234    subpattern numbers means that we have to scan from the very start, in order to
1235    take account of such duplicates, and to use a recursive function to keep track
1236    of the different types of group.
1237    
1238    if (name == NULL && count == lorn) return count;  Arguments:
1239    term = *ptr++;    cd           compile background data
1240    if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;    name         name to seek, or NULL if seeking a numbered subpattern
1241    thisname = ptr;    lorn         name length, or subpattern number if name is NULL
1242    while (*ptr != term) ptr++;    xmode        TRUE if we are in /x mode
1243    if (name != NULL && lorn == ptr - thisname &&  
1244        strncmp((const char *)name, (const char *)thisname, lorn) == 0)  Returns:       the number of the found subpattern, or -1 if not found
1245      return count;  */
1246    
1247    static int
1248    find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1249    {
1250    uschar *ptr = (uschar *)cd->start_pattern;
1251    int count = 0;
1252    int rc;
1253    
1254    /* If the pattern does not start with an opening parenthesis, the first call
1255    to find_parens_sub() will scan right to the end (if necessary). However, if it
1256    does start with a parenthesis, find_parens_sub() will return when it hits the
1257    matching closing parens. That is why we have to have a loop. */
1258    
1259    for (;;)
1260      {
1261      rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1262      if (rc > 0 || *ptr++ == 0) break;
1263    }    }
1264    
1265  return -1;  return rc;
1266  }  }
1267    
1268    
1269    
1270    
1271  /*************************************************  /*************************************************
1272  *      Find first significant op code            *  *      Find first significant op code            *
1273  *************************************************/  *************************************************/
# Line 1216  for (;;) Line 1333  for (;;)
1333    
1334    
1335  /*************************************************  /*************************************************
1336  *        Find the fixed length of a pattern      *  *        Find the fixed length of a branch       *
1337  *************************************************/  *************************************************/
1338    
1339  /* Scan a pattern and compute the fixed length of subject that will match it,  /* Scan a branch and compute the fixed length of subject that will match it,
1340  if the length is fixed. This is needed for dealing with backward assertions.  if the length is fixed. This is needed for dealing with backward assertions.
1341  In UTF8 mode, the result is in characters rather than bytes.  In UTF8 mode, the result is in characters rather than bytes. The branch is
1342    temporarily terminated with OP_END when this function is called.
1343    
1344    This function is called when a backward assertion is encountered, so that if it
1345    fails, the error message can point to the correct place in the pattern.
1346    However, we cannot do this when the assertion contains subroutine calls,
1347    because they can be forward references. We solve this by remembering this case
1348    and doing the check at the end; a flag specifies which mode we are running in.
1349    
1350  Arguments:  Arguments:
1351    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
1352    options  the compiling options    options  the compiling options
1353      atend    TRUE if called when the pattern is complete
1354      cd       the "compile data" structure
1355    
1356  Returns:   the fixed length, or -1 if there is no fixed length,  Returns:   the fixed length,
1357                 or -1 if there is no fixed length,
1358               or -2 if \C was encountered               or -2 if \C was encountered
1359                 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1360  */  */
1361    
1362  static int  static int
1363  find_fixedlength(uschar *code, int options)  find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1364  {  {
1365  int length = -1;  int length = -1;
1366    
# Line 1245  branch, check the length against that of Line 1373  branch, check the length against that of
1373  for (;;)  for (;;)
1374    {    {
1375    int d;    int d;
1376      uschar *ce, *cs;
1377    register int op = *cc;    register int op = *cc;
1378    switch (op)    switch (op)
1379      {      {
# Line 1252  for (;;) Line 1381  for (;;)
1381      case OP_BRA:      case OP_BRA:
1382      case OP_ONCE:      case OP_ONCE:
1383      case OP_COND:      case OP_COND:
1384      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1385      if (d < 0) return d;      if (d < 0) return d;
1386      branchlength += d;      branchlength += d;
1387      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 1274  for (;;) Line 1403  for (;;)
1403      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
1404      branchlength = 0;      branchlength = 0;
1405      break;      break;
1406    
1407        /* A true recursion implies not fixed length, but a subroutine call may
1408        be OK. If the subroutine is a forward reference, we can't deal with
1409        it until the end of the pattern, so return -3. */
1410    
1411        case OP_RECURSE:
1412        if (!atend) return -3;
1413        cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1414        do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */
1415        if (cc > cs && cc < ce) return -1;                /* Recursion */
1416        d = find_fixedlength(cs + 2, options, atend, cd);
1417        if (d < 0) return d;
1418        branchlength += d;
1419        cc += 1 + LINK_SIZE;
1420        break;
1421    
1422      /* Skip over assertive subpatterns */      /* Skip over assertive subpatterns */
1423    
# Line 1311  for (;;) Line 1455  for (;;)
1455      branchlength++;      branchlength++;
1456      cc += 2;      cc += 2;
1457  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1458      if ((options & PCRE_UTF8) != 0)      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1459        {        cc += _pcre_utf8_table4[cc[-1] & 0x3f];
       while ((*cc & 0xc0) == 0x80) cc++;  
       }  
1460  #endif  #endif
1461      break;      break;
1462    
# Line 1325  for (;;) Line 1467  for (;;)
1467      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1468      cc += 4;      cc += 4;
1469  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1470      if ((options & PCRE_UTF8) != 0)      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1471        {        cc += _pcre_utf8_table4[cc[-1] & 0x3f];
       while((*cc & 0x80) == 0x80) cc++;  
       }  
1472  #endif  #endif
1473      break;      break;
1474    
# Line 1407  for (;;) Line 1547  for (;;)
1547    
1548    
1549  /*************************************************  /*************************************************
1550  *    Scan compiled regex for numbered bracket    *  *    Scan compiled regex for specific bracket    *
1551  *************************************************/  *************************************************/
1552    
1553  /* This little function scans through a compiled pattern until it finds a  /* This little function scans through a compiled pattern until it finds a
1554  capturing bracket with the given number.  capturing bracket with the given number, or, if the number is negative, an
1555    instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1556    so that it can be called from pcre_study() when finding the minimum matching
1557    length.
1558    
1559  Arguments:  Arguments:
1560    code        points to start of expression    code        points to start of expression
1561    utf8        TRUE in UTF-8 mode    utf8        TRUE in UTF-8 mode
1562    number      the required bracket number    number      the required bracket number or negative to find a lookbehind
1563    
1564  Returns:      pointer to the opcode for the bracket, or NULL if not found  Returns:      pointer to the opcode for the bracket, or NULL if not found
1565  */  */
1566    
1567  static const uschar *  const uschar *
1568  find_bracket(const uschar *code, BOOL utf8, int number)  _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1569  {  {
1570  for (;;)  for (;;)
1571    {    {
# Line 1434  for (;;) Line 1577  for (;;)
1577    the table is zero; the actual length is stored in the compiled code. */    the table is zero; the actual length is stored in the compiled code. */
1578    
1579    if (c == OP_XCLASS) code += GET(code, 1);    if (c == OP_XCLASS) code += GET(code, 1);
1580    
1581      /* Handle lookbehind (OP_REVERSE), which is sought when number is negative */
1582    
1583      else if (c == OP_REVERSE)
1584        {
1585        if (number < 0) return (uschar *)code;
1586        code += _pcre_OP_lengths[c];
1587        }
1588    
1589    /* Handle capturing bracket */    /* Handle capturing bracket */
1590    
# Line 1663  for (code = first_significant_code(code Line 1814  for (code = first_significant_code(code
1814      {      {
1815      BOOL empty_branch;      BOOL empty_branch;
1816      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1817    
1818      /* If a conditional group has only one branch, there is a second, implied,      /* If a conditional group has only one branch, there is a second, implied,
1819      empty branch, so just skip over the conditional, because it could be empty.      empty branch, so just skip over the conditional, because it could be empty.
1820      Otherwise, scan the individual branches of the group. */      Otherwise, scan the individual branches of the group. */
1821    
1822      if (c == OP_COND && code[GET(code, 1)] != OP_ALT)      if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1823        code += GET(code, 1);        code += GET(code, 1);
1824      else      else
1825        {        {
1826        empty_branch = FALSE;        empty_branch = FALSE;
1827        do        do
1828          {          {
# Line 1682  for (code = first_significant_code(code Line 1833  for (code = first_significant_code(code
1833        while (*code == OP_ALT);        while (*code == OP_ALT);
1834        if (!empty_branch) return FALSE;   /* All branches are non-empty */        if (!empty_branch) return FALSE;   /* All branches are non-empty */
1835        }        }
1836    
1837      c = *code;      c = *code;
1838      continue;      continue;
1839      }      }
# Line 1800  for (code = first_significant_code(code Line 1951  for (code = first_significant_code(code
1951      case OP_QUERY:      case OP_QUERY:
1952      case OP_MINQUERY:      case OP_MINQUERY:
1953      case OP_POSQUERY:      case OP_POSQUERY:
1954        if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
1955        break;
1956    
1957      case OP_UPTO:      case OP_UPTO:
1958      case OP_MINUPTO:      case OP_MINUPTO:
1959      case OP_POSUPTO:      case OP_POSUPTO:
1960      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
1961      break;      break;
1962  #endif  #endif
1963      }      }
# Line 3757  we set the flag only if there is a liter Line 3911  we set the flag only if there is a liter
3911    
3912        if (repeat_max == 0) goto END_REPEAT;        if (repeat_max == 0) goto END_REPEAT;
3913    
3914          /*--------------------------------------------------------------------*/
3915          /* This code is obsolete from release 8.00; the restriction was finally
3916          removed: */
3917    
3918        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
3919        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
3920    
3921        if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;        /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
3922          /*--------------------------------------------------------------------*/
3923    
3924        /* Combine the op_type with the repeat_type */        /* Combine the op_type with the repeat_type */
3925    
# Line 3907  we set the flag only if there is a liter Line 4066  we set the flag only if there is a liter
4066          goto END_REPEAT;          goto END_REPEAT;
4067          }          }
4068    
4069          /*--------------------------------------------------------------------*/
4070          /* This code is obsolete from release 8.00; the restriction was finally
4071          removed: */
4072    
4073        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
4074        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
4075    
4076        if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;        /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4077          /*--------------------------------------------------------------------*/
4078    
4079        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
4080          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
# Line 4225  we set the flag only if there is a liter Line 4389  we set the flag only if there is a liter
4389      if (possessive_quantifier)      if (possessive_quantifier)
4390        {        {
4391        int len;        int len;
4392        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||  
4393            *tempcode == OP_NOTEXACT)        if (*tempcode == OP_TYPEEXACT)
4394          tempcode += _pcre_OP_lengths[*tempcode] +          tempcode += _pcre_OP_lengths[*tempcode] +
4395            ((*tempcode == OP_TYPEEXACT &&            ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);
4396               (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);  
4397          else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
4398            {
4399            tempcode += _pcre_OP_lengths[*tempcode];
4400    #ifdef SUPPORT_UTF8
4401            if (utf8 && tempcode[-1] >= 0xc0)
4402              tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];
4403    #endif
4404            }
4405    
4406        len = code - tempcode;        len = code - tempcode;
4407        if (len > 0) switch (*tempcode)        if (len > 0) switch (*tempcode)
4408          {          {
# Line 4307  we set the flag only if there is a liter Line 4480  we set the flag only if there is a liter
4480          if (namelen == verbs[i].len &&          if (namelen == verbs[i].len &&
4481              strncmp((char *)name, vn, namelen) == 0)              strncmp((char *)name, vn, namelen) == 0)
4482            {            {
4483            *code = verbs[i].op;            /* Check for open captures before ACCEPT */
4484            if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;  
4485              if (verbs[i].op == OP_ACCEPT)
4486                {
4487                open_capitem *oc;
4488                cd->had_accept = TRUE;
4489                for (oc = cd->open_caps; oc != NULL; oc = oc->next)
4490                  {
4491                  *code++ = OP_CLOSE;
4492                  PUT2INC(code, 0, oc->number);
4493                  }
4494                }
4495              *code++ = verbs[i].op;
4496            break;            break;
4497            }            }
4498          vn += verbs[i].len + 1;          vn += verbs[i].len + 1;
# Line 4489  we set the flag only if there is a liter Line 4673  we set the flag only if there is a liter
4673    
4674          /* Search the pattern for a forward reference */          /* Search the pattern for a forward reference */
4675    
4676          else if ((i = find_parens(ptr, cd, name, namelen,          else if ((i = find_parens(cd, name, namelen,
4677                          (options & PCRE_EXTENDED) != 0)) > 0)                          (options & PCRE_EXTENDED) != 0)) > 0)
4678            {            {
4679            PUT2(code, 2+LINK_SIZE, i);            PUT2(code, 2+LINK_SIZE, i);
# Line 4685  we set the flag only if there is a liter Line 4869  we set the flag only if there is a liter
4869                }                }
4870              }              }
4871    
4872            /* In the real compile, create the entry in the table */            /* In the real compile, create the entry in the table, maintaining
4873              alphabetical order. Duplicate names for different numbers are
4874              permitted only if PCRE_DUPNAMES is set. Duplicate names for the same
4875              number are always OK. (An existing number can be re-used if (?|
4876              appears in the pattern.) In either event, a duplicate name results in
4877              a duplicate entry in the table, even if the number is the same. This
4878              is because the number of names, and hence the table size, is computed
4879              in the pre-compile, and it affects various numbers and pointers which
4880              would all have to be modified, and the compiled code moved down, if
4881              duplicates with the same number were omitted from the table. This
4882              doesn't seem worth the hassle. However, *different* names for the
4883              same number are not permitted. */
4884    
4885            else            else
4886              {              {
4887                BOOL dupname = FALSE;
4888              slot = cd->name_table;              slot = cd->name_table;
4889    
4890              for (i = 0; i < cd->names_found; i++)              for (i = 0; i < cd->names_found; i++)
4891                {                {
4892                int crc = memcmp(name, slot+2, namelen);                int crc = memcmp(name, slot+2, namelen);
# Line 4697  we set the flag only if there is a liter Line 4894  we set the flag only if there is a liter
4894                  {                  {
4895                  if (slot[2+namelen] == 0)                  if (slot[2+namelen] == 0)
4896                    {                    {
4897                    if ((options & PCRE_DUPNAMES) == 0)                    if (GET2(slot, 0) != cd->bracount + 1 &&
4898                          (options & PCRE_DUPNAMES) == 0)
4899                      {                      {
4900                      *errorcodeptr = ERR43;                      *errorcodeptr = ERR43;
4901                      goto FAILED;                      goto FAILED;
4902                      }                      }
4903                      else dupname = TRUE;
4904                    }                    }
4905                  else crc = -1;      /* Current name is substring */                  else crc = -1;      /* Current name is a substring */
4906                  }                  }
4907    
4908                  /* Make space in the table and break the loop for an earlier
4909                  name. For a duplicate or later name, carry on. We do this for
4910                  duplicates so that in the simple case (when ?(| is not used) they
4911                  are in order of their numbers. */
4912    
4913                if (crc < 0)                if (crc < 0)
4914                  {                  {
4915                  memmove(slot + cd->name_entry_size, slot,                  memmove(slot + cd->name_entry_size, slot,
4916                    (cd->names_found - i) * cd->name_entry_size);                    (cd->names_found - i) * cd->name_entry_size);
4917                  break;                  break;
4918                  }                  }
4919    
4920                  /* Continue the loop for a later or duplicate name */
4921    
4922                slot += cd->name_entry_size;                slot += cd->name_entry_size;
4923                }                }
4924    
4925                /* For non-duplicate names, check for a duplicate number before
4926                adding the new name. */
4927    
4928                if (!dupname)
4929                  {
4930                  uschar *cslot = cd->name_table;
4931                  for (i = 0; i < cd->names_found; i++)
4932                    {
4933                    if (cslot != slot)
4934                      {
4935                      if (GET2(cslot, 0) == cd->bracount + 1)
4936                        {
4937                        *errorcodeptr = ERR65;
4938                        goto FAILED;
4939                        }
4940                      }
4941                    else i--;
4942                    cslot += cd->name_entry_size;
4943                    }
4944                  }
4945    
4946              PUT2(slot, 0, cd->bracount + 1);              PUT2(slot, 0, cd->bracount + 1);
4947              memcpy(slot + 2, name, namelen);              memcpy(slot + 2, name, namelen);
# Line 4720  we set the flag only if there is a liter Line 4949  we set the flag only if there is a liter
4949              }              }
4950            }            }
4951    
4952          /* In both cases, count the number of names we've encountered. */          /* In both pre-compile and compile, count the number of names we've
4953            encountered. */
4954    
         ptr++;                    /* Move past > or ' */  
4955          cd->names_found++;          cd->names_found++;
4956            ptr++;                    /* Move past > or ' */
4957          goto NUMBERED_GROUP;          goto NUMBERED_GROUP;
4958    
4959    
# Line 4788  we set the flag only if there is a liter Line 5018  we set the flag only if there is a liter
5018              recno = GET2(slot, 0);              recno = GET2(slot, 0);
5019              }              }
5020            else if ((recno =                /* Forward back reference */            else if ((recno =                /* Forward back reference */
5021                      find_parens(ptr, cd, name, namelen,                      find_parens(cd, name, namelen,
5022                        (options & PCRE_EXTENDED) != 0)) <= 0)                        (options & PCRE_EXTENDED) != 0)) <= 0)
5023              {              {
5024              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
# Line 4892  we set the flag only if there is a liter Line 5122  we set the flag only if there is a liter
5122            if (lengthptr == NULL)            if (lengthptr == NULL)
5123              {              {
5124              *code = OP_END;              *code = OP_END;
5125              if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);              if (recno != 0)
5126                  called = _pcre_find_bracket(cd->start_code, utf8, recno);
5127    
5128              /* Forward reference */              /* Forward reference */
5129    
5130              if (called == NULL)              if (called == NULL)
5131                {                {
5132                if (find_parens(ptr, cd, NULL, recno,                if (find_parens(cd, NULL, recno,
5133                      (options & PCRE_EXTENDED) != 0) < 0)                      (options & PCRE_EXTENDED) != 0) < 0)
5134                  {                  {
5135                  *errorcodeptr = ERR15;                  *errorcodeptr = ERR15;
# Line 5536  uschar *code = *codeptr; Line 5767  uschar *code = *codeptr;
5767  uschar *last_branch = code;  uschar *last_branch = code;
5768  uschar *start_bracket = code;  uschar *start_bracket = code;
5769  uschar *reverse_count = NULL;  uschar *reverse_count = NULL;
5770    open_capitem capitem;
5771    int capnumber = 0;
5772  int firstbyte, reqbyte;  int firstbyte, reqbyte;
5773  int branchfirstbyte, branchreqbyte;  int branchfirstbyte, branchreqbyte;
5774  int length;  int length;
# Line 5562  the code that abstracts option settings Line 5795  the code that abstracts option settings
5795  them global. It tests the value of length for (2 + 2*LINK_SIZE) in the  them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5796  pre-compile phase to find out whether anything has yet been compiled or not. */  pre-compile phase to find out whether anything has yet been compiled or not. */
5797    
5798    /* If this is a capturing subpattern, add to the chain of open capturing items
5799    so that we can detect them if (*ACCEPT) is encountered. */
5800    
5801    if (*code == OP_CBRA)
5802      {
5803      capnumber = GET2(code, 1 + LINK_SIZE);
5804      capitem.number = capnumber;
5805      capitem.next = cd->open_caps;
5806      cd->open_caps = &capitem;
5807      }
5808    
5809  /* Offset is set zero to mark that this bracket is still open */  /* Offset is set zero to mark that this bracket is still open */
5810    
5811  PUT(code, 1, 0);  PUT(code, 1, 0);
# Line 5656  for (;;) Line 5900  for (;;)
5900    
5901      /* If lookbehind, check that this branch matches a fixed-length string, and      /* If lookbehind, check that this branch matches a fixed-length string, and
5902      put the length into the OP_REVERSE item. Temporarily mark the end of the      put the length into the OP_REVERSE item. Temporarily mark the end of the
5903      branch with OP_END. */      branch with OP_END. If the branch contains OP_RECURSE, the result is -3
5904        because there may be forward references that we can't check here. Set a
5905        flag to cause another lookbehind check at the end. Why not do it all at the
5906        end? Because common, erroneous checks are picked up here and the offset of
5907        the problem can be shown. */
5908    
5909      if (lookbehind)      if (lookbehind)
5910        {        {
5911        int fixed_length;        int fixed_length;
5912        *code = OP_END;        *code = OP_END;
5913        fixed_length = find_fixedlength(last_branch, options);        fixed_length = find_fixedlength(last_branch, options, FALSE, cd);
5914        DPRINTF(("fixed length = %d\n", fixed_length));        DPRINTF(("fixed length = %d\n", fixed_length));
5915        if (fixed_length < 0)        if (fixed_length == -3)
5916            {
5917            cd->check_lookbehind = TRUE;
5918            }
5919          else if (fixed_length < 0)
5920          {          {
5921          *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;          *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5922          *ptrptr = ptr;          *ptrptr = ptr;
5923          return FALSE;          return FALSE;
5924          }          }
5925        PUT(reverse_count, 0, fixed_length);        else { PUT(reverse_count, 0, fixed_length); }
5926        }        }
5927      }      }
5928    
# Line 5697  for (;;) Line 5949  for (;;)
5949          }          }
5950        while (branch_length > 0);        while (branch_length > 0);
5951        }        }
5952    
5953        /* If it was a capturing subpattern, remove it from the chain. */
5954    
5955        if (capnumber > 0) cd->open_caps = cd->open_caps->next;
5956    
5957      /* Fill in the ket */      /* Fill in the ket */
5958    
# Line 6069  int length = 1;  /* For final END opcode Line 6325  int length = 1;  /* For final END opcode
6325  int firstbyte, reqbyte, newline;  int firstbyte, reqbyte, newline;
6326  int errorcode = 0;  int errorcode = 0;
6327  int skipatstart = 0;  int skipatstart = 0;
6328  #ifdef SUPPORT_UTF8  BOOL utf8 = (options & PCRE_UTF8) != 0;
 BOOL utf8;  
 #endif  
6329  size_t size;  size_t size;
6330  uschar *code;  uschar *code;
6331  const uschar *codestart;  const uschar *codestart;
# Line 6114  if (erroroffset == NULL) Line 6368  if (erroroffset == NULL)
6368    
6369  *erroroffset = 0;  *erroroffset = 0;
6370    
 /* Can't support UTF8 unless PCRE has been compiled to include the code. */  
   
 #ifdef SUPPORT_UTF8  
 utf8 = (options & PCRE_UTF8) != 0;  
 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&  
      (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)  
   {  
   errorcode = ERR44;  
   goto PCRE_EARLY_ERROR_RETURN2;  
   }  
 #else  
 if ((options & PCRE_UTF8) != 0)  
   {  
   errorcode = ERR32;  
   goto PCRE_EARLY_ERROR_RETURN;  
   }  
 #endif  
   
 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)  
   {  
   errorcode = ERR17;  
   goto PCRE_EARLY_ERROR_RETURN;  
   }  
   
6371  /* Set up pointers to the individual character tables */  /* Set up pointers to the individual character tables */
6372    
6373  if (tables == NULL) tables = _pcre_default_tables;  if (tables == NULL) tables = _pcre_default_tables;
# Line 6146  cd->fcc = tables + fcc_offset; Line 6376  cd->fcc = tables + fcc_offset;
6376  cd->cbits = tables + cbits_offset;  cd->cbits = tables + cbits_offset;
6377  cd->ctypes = tables + ctypes_offset;  cd->ctypes = tables + ctypes_offset;
6378    
6379    /* Check that all undefined public option bits are zero */
6380    
6381    if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
6382      {
6383      errorcode = ERR17;
6384      goto PCRE_EARLY_ERROR_RETURN;
6385      }
6386    
6387  /* Check for global one-time settings at the start of the pattern, and remember  /* Check for global one-time settings at the start of the pattern, and remember
6388  the offset for later. */  the offset for later. */
6389    
# Line 6155  while (ptr[skipatstart] == CHAR_LEFT_PAR Line 6393  while (ptr[skipatstart] == CHAR_LEFT_PAR
6393    int newnl = 0;    int newnl = 0;
6394    int newbsr = 0;    int newbsr = 0;
6395    
6396      if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
6397        { skipatstart += 7; options |= PCRE_UTF8; continue; }
6398    
6399    if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)    if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
6400      { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }      { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
6401    else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3)  == 0)    else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3)  == 0)
# Line 6178  while (ptr[skipatstart] == CHAR_LEFT_PAR Line 6419  while (ptr[skipatstart] == CHAR_LEFT_PAR
6419    else break;    else break;
6420    }    }
6421    
6422    /* Can't support UTF8 unless PCRE has been compiled to include the code. */
6423    
6424    #ifdef SUPPORT_UTF8
6425    if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
6426         (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
6427      {
6428      errorcode = ERR44;
6429      goto PCRE_EARLY_ERROR_RETURN2;
6430      }
6431    #else
6432    if (utf8)
6433      {
6434      errorcode = ERR32;
6435      goto PCRE_EARLY_ERROR_RETURN;
6436      }
6437    #endif
6438    
6439  /* Check validity of \R options. */  /* Check validity of \R options. */
6440    
6441  switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))  switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
# Line 6260  cd->end_pattern = (const uschar *)(patte Line 6518  cd->end_pattern = (const uschar *)(patte
6518  cd->req_varyopt = 0;  cd->req_varyopt = 0;
6519  cd->external_options = options;  cd->external_options = options;
6520  cd->external_flags = 0;  cd->external_flags = 0;
6521    cd->open_caps = NULL;
6522    
6523  /* Now do the pre-compile. On error, errorcode will be set non-zero, so we  /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
6524  don't need to look at the result of the function here. The initial options have  don't need to look at the result of the function here. The initial options have
# Line 6334  cd->start_code = codestart; Line 6593  cd->start_code = codestart;
6593  cd->hwm = cworkspace;  cd->hwm = cworkspace;
6594  cd->req_varyopt = 0;  cd->req_varyopt = 0;
6595  cd->had_accept = FALSE;  cd->had_accept = FALSE;
6596    cd->check_lookbehind = FALSE;
6597    cd->open_caps = NULL;
6598    
6599  /* Set up a starting, non-extracting bracket, then compile the expression. On  /* Set up a starting, non-extracting bracket, then compile the expression. On
6600  error, errorcode will be set non-zero, so we don't need to look at the result  error, errorcode will be set non-zero, so we don't need to look at the result
# Line 6372  while (errorcode == 0 && cd->hwm > cwork Line 6633  while (errorcode == 0 && cd->hwm > cwork
6633    cd->hwm -= LINK_SIZE;    cd->hwm -= LINK_SIZE;
6634    offset = GET(cd->hwm, 0);    offset = GET(cd->hwm, 0);
6635    recno = GET(codestart, offset);    recno = GET(codestart, offset);
6636    groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);    groupptr = _pcre_find_bracket(codestart, utf8, recno);
6637    if (groupptr == NULL) errorcode = ERR53;    if (groupptr == NULL) errorcode = ERR53;
6638      else PUT(((uschar *)codestart), offset, groupptr - codestart);      else PUT(((uschar *)codestart), offset, groupptr - codestart);
6639    }    }
# Line 6382  subpattern. */ Line 6643  subpattern. */
6643    
6644  if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;  if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6645    
6646    /* If there were any lookbehind assertions that contained OP_RECURSE
6647    (recursions or subroutine calls), a flag is set for them to be checked here,
6648    because they may contain forward references. Actual recursions can't be fixed
6649    length, but subroutine calls can. It is done like this so that those without
6650    OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
6651    exceptional ones forgo this. We scan the pattern to check that they are fixed
6652    length, and set their lengths. */
6653    
6654    if (cd->check_lookbehind)
6655      {
6656      uschar *cc = (uschar *)codestart;
6657    
6658      /* Loop, searching for OP_REVERSE items, and process those that do not have
6659      their length set. (Actually, it will also re-process any that have a length
6660      of zero, but that is a pathological case, and it does no harm.) When we find
6661      one, we temporarily terminate the branch it is in while we scan it. */
6662    
6663      for (cc = (uschar *)_pcre_find_bracket(codestart, utf8, -1);
6664           cc != NULL;
6665           cc = (uschar *)_pcre_find_bracket(cc, utf8, -1))
6666        {
6667        if (GET(cc, 1) == 0)
6668          {
6669          int fixed_length;
6670          uschar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
6671          int end_op = *be;
6672          *be = OP_END;
6673          fixed_length = find_fixedlength(cc, re->options, TRUE, cd);
6674          *be = end_op;
6675          DPRINTF(("fixed length = %d\n", fixed_length));
6676          if (fixed_length < 0)
6677            {
6678            errorcode = (fixed_length == -2)? ERR36 : ERR25;
6679            break;
6680            }
6681          PUT(cc, 1, fixed_length);
6682          }
6683        cc += 1 + LINK_SIZE;
6684        }
6685      }
6686    
6687  /* Failed to compile, or error while post-processing */  /* Failed to compile, or error while post-processing */
6688    
6689  if (errorcode != 0)  if (errorcode != 0)

Legend:
Removed from v.395  
changed lines
  Added in v.457

  ViewVC Help
Powered by ViewVC 1.1.5