/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

Diff between revision 406 (by ph10, Mon Mar 23 12:05:43 2009 UTC) and revision 455 (by ph10, Sat Sep 26 19:12:32 2009 UTC)
# Line 1009  return p; Line 1009  return p;
1009    
1010    
1011  /*************************************************  /*************************************************
1012  *       Find forward referenced subpattern       *  *  Subroutine for finding forward reference      *
1013  *************************************************/  *************************************************/
1014    
1015  /* This function scans along a pattern's text looking for capturing  /* This recursive function is called only from find_parens() below. The
1016    top-level call starts at the beginning of the pattern. All other calls must
1017    start at a parenthesis. It scans along a pattern's text looking for capturing
1018  subpatterns, and counting them. If it finds a named pattern that matches the  subpatterns, and counting them. If it finds a named pattern that matches the
1019  name it is given, it returns its number. Alternatively, if the name is NULL, it  name it is given, it returns its number. Alternatively, if the name is NULL, it
1020  returns when it reaches a given numbered subpattern. This is used for forward  returns when it reaches a given numbered subpattern. We know that if (?P< is
1021  references to subpatterns. We know that if (?P< is encountered, the name will  encountered, the name will be terminated by '>' because that is checked in the
1022  be terminated by '>' because that is checked in the first pass.  first pass. Recursion is used to keep track of subpatterns that reset the
1023    capturing group numbers - the (?| feature.
1024    
1025  Arguments:  Arguments:
1026    ptr          current position in the pattern    ptrptr       address of the current character pointer (updated)
1027    cd           compile background data    cd           compile background data
1028    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1029    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1030    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1031      count        pointer to the current capturing subpattern number (updated)
1032    
1033  Returns:       the number of the named subpattern, or -1 if not found  Returns:       the number of the named subpattern, or -1 if not found
1034  */  */
1035    
1036  static int  static int
1037  find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn,  find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1038    BOOL xmode)    BOOL xmode, int *count)
1039  {  {
1040  const uschar *thisname;  uschar *ptr = *ptrptr;
1041  int count = cd->bracount;  int start_count = *count;
1042    int hwm_count = start_count;
1043    BOOL dup_parens = FALSE;
1044    
1045  for (; *ptr != 0; ptr++)  /* If the first character is a parenthesis, check on the type of group we are
1046    dealing with. The very first call may not start with a parenthesis. */
1047    
1048    if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1049    {    {
1050    int term;    if (ptr[1] == CHAR_QUESTION_MARK &&
1051          ptr[2] == CHAR_VERTICAL_LINE)
1052        {
1053        ptr += 3;
1054        dup_parens = TRUE;
1055        }
1056    
1057      /* Handle a normal, unnamed capturing parenthesis */
1058    
1059      else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1060        {
1061        *count += 1;
1062        if (name == NULL && *count == lorn) return *count;
1063        ptr++;
1064        }
1065    
1066      /* Handle a condition. If it is an assertion, just carry on so that it
1067      is processed as normal. If not, skip to the closing parenthesis of the
1068      condition (there can't be any nested parens). */
1069    
1070      else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1071        {
1072        ptr += 2;
1073        if (ptr[1] != CHAR_QUESTION_MARK)
1074          {
1075          while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1076          if (*ptr != 0) ptr++;
1077          }
1078        }
1079    
1080      /* We have either (? or (* and not a condition */
1081    
1082      else
1083        {
1084        ptr += 2;
1085        if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */
1086    
1087        /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1088    
1089        if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1090            ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1091          {
1092          int term;
1093          const uschar *thisname;
1094          *count += 1;
1095          if (name == NULL && *count == lorn) return *count;
1096          term = *ptr++;
1097          if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1098          thisname = ptr;
1099          while (*ptr != term) ptr++;
1100          if (name != NULL && lorn == ptr - thisname &&
1101              strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1102            return *count;
1103          term++;
1104          }
1105        }
1106      }
1107    
1108    /* Past any initial parenthesis handling, scan for parentheses or vertical
1109    bars. */
1110    
1111    for (; *ptr != 0; ptr++)
1112      {
1113    /* Skip over backslashed characters and also entire \Q...\E */    /* Skip over backslashed characters and also entire \Q...\E */
1114    
1115    if (*ptr == CHAR_BACKSLASH)    if (*ptr == CHAR_BACKSLASH)
1116      {      {
1117      if (*(++ptr) == 0) return -1;      if (*(++ptr) == 0) goto FAIL_EXIT;
1118      if (*ptr == CHAR_Q) for (;;)      if (*ptr == CHAR_Q) for (;;)
1119        {        {
1120        while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};        while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1121        if (*ptr == 0) return -1;        if (*ptr == 0) goto FAIL_EXIT;
1122        if (*(++ptr) == CHAR_E) break;        if (*(++ptr) == CHAR_E) break;
1123        }        }
1124      continue;      continue;
# Line 1065  for (; *ptr != 0; ptr++) Line 1135  for (; *ptr != 0; ptr++)
1135      BOOL negate_class = FALSE;      BOOL negate_class = FALSE;
1136      for (;;)      for (;;)
1137        {        {
1138        int c = *(++ptr);        if (ptr[1] == CHAR_BACKSLASH)
       if (c == CHAR_BACKSLASH)  
1139          {          {
1140          if (ptr[1] == CHAR_E)          if (ptr[2] == CHAR_E)
1141            ptr++;            ptr+= 2;
1142          else if (strncmp((const char *)ptr+1,          else if (strncmp((const char *)ptr+2,
1143                   STR_Q STR_BACKSLASH STR_E, 3) == 0)                   STR_Q STR_BACKSLASH STR_E, 3) == 0)
1144            ptr += 3;            ptr += 4;
1145          else          else
1146            break;            break;
1147          }          }
1148        else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)        else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1149            {
1150          negate_class = TRUE;          negate_class = TRUE;
1151            ptr++;
1152            }
1153        else break;        else break;
1154        }        }
1155    
# Line 1093  for (; *ptr != 0; ptr++) Line 1165  for (; *ptr != 0; ptr++)
1165        if (*ptr == 0) return -1;        if (*ptr == 0) return -1;
1166        if (*ptr == CHAR_BACKSLASH)        if (*ptr == CHAR_BACKSLASH)
1167          {          {
1168          if (*(++ptr) == 0) return -1;          if (*(++ptr) == 0) goto FAIL_EXIT;
1169          if (*ptr == CHAR_Q) for (;;)          if (*ptr == CHAR_Q) for (;;)
1170            {            {
1171            while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};            while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1172            if (*ptr == 0) return -1;            if (*ptr == 0) goto FAIL_EXIT;
1173            if (*(++ptr) == CHAR_E) break;            if (*(++ptr) == CHAR_E) break;
1174            }            }
1175          continue;          continue;
# Line 1111  for (; *ptr != 0; ptr++) Line 1183  for (; *ptr != 0; ptr++)
1183    if (xmode && *ptr == CHAR_NUMBER_SIGN)    if (xmode && *ptr == CHAR_NUMBER_SIGN)
1184      {      {
1185      while (*(++ptr) != 0 && *ptr != CHAR_NL) {};      while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1186      if (*ptr == 0) return -1;      if (*ptr == 0) goto FAIL_EXIT;
1187      continue;      continue;
1188      }      }
1189    
1190    /* An opening parens must now be a real metacharacter */    /* Check for the special metacharacters */
1191    
1192    if (*ptr != CHAR_LEFT_PARENTHESIS) continue;    if (*ptr == CHAR_LEFT_PARENTHESIS)
   if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)  
1193      {      {
1194      count++;      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1195      if (name == NULL && count == lorn) return count;      if (rc > 0) return rc;
1196      continue;      if (*ptr == 0) goto FAIL_EXIT;
1197      }      }
1198    
1199    ptr += 2;    else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1200    if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */      {
1201        if (dup_parens && *count < hwm_count) *count = hwm_count;
1202        *ptrptr = ptr;
1203        return -1;
1204        }
1205    
1206    /* We have to disambiguate (?<! and (?<= from (?<name> */    else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1207        {
1208        if (*count > hwm_count) hwm_count = *count;
1209        *count = start_count;
1210        }
1211      }
1212    
1213    if ((*ptr != CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_EXCLAMATION_MARK ||  FAIL_EXIT:
1214        ptr[1] == CHAR_EQUALS_SIGN) && *ptr != CHAR_APOSTROPHE)  *ptrptr = ptr;
1215      continue;  return -1;
1216    }
1217    
   count++;  
1218    
1219    if (name == NULL && count == lorn) return count;  
1220    term = *ptr++;  
1221    if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;  /*************************************************
1222    thisname = ptr;  *       Find forward referenced subpattern       *
1223    while (*ptr != term) ptr++;  *************************************************/
1224    if (name != NULL && lorn == ptr - thisname &&  
1225        strncmp((const char *)name, (const char *)thisname, lorn) == 0)  /* This function scans along a pattern's text looking for capturing
1226      return count;  subpatterns, and counting them. If it finds a named pattern that matches the
1227    name it is given, it returns its number. Alternatively, if the name is NULL, it
1228    returns when it reaches a given numbered subpattern. This is used for forward
1229    references to subpatterns. We used to be able to start this scan from the
1230    current compiling point, using the current count value from cd->bracount, and
1231    do it all in a single loop, but the addition of the possibility of duplicate
1232    subpattern numbers means that we have to scan from the very start, in order to
1233    take account of such duplicates, and to use a recursive function to keep track
1234    of the different types of group.
1235    
1236    Arguments:
1237      cd           compile background data
1238      name         name to seek, or NULL if seeking a numbered subpattern
1239      lorn         name length, or subpattern number if name is NULL
1240      xmode        TRUE if we are in /x mode
1241    
1242    Returns:       the number of the found subpattern, or -1 if not found
1243    */
1244    
1245    static int
1246    find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1247    {
1248    uschar *ptr = (uschar *)cd->start_pattern;
1249    int count = 0;
1250    int rc;
1251    
1252    /* If the pattern does not start with an opening parenthesis, the first call
1253    to find_parens_sub() will scan right to the end (if necessary). However, if it
1254    does start with a parenthesis, find_parens_sub() will return when it hits the
1255    matching closing parens. That is why we have to have a loop. */
1256    
1257    for (;;)
1258      {
1259      rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1260      if (rc > 0 || *ptr++ == 0) break;
1261    }    }
1262    
1263  return -1;  return rc;
1264  }  }
1265    
1266    
1267    
1268    
1269  /*************************************************  /*************************************************
1270  *      Find first significant op code            *  *      Find first significant op code            *
1271  *************************************************/  *************************************************/
# Line 1216  for (;;) Line 1331  for (;;)
1331    
1332    
1333  /*************************************************  /*************************************************
1334  *        Find the fixed length of a pattern      *  *        Find the fixed length of a branch       *
1335  *************************************************/  *************************************************/
1336    
1337  /* Scan a pattern and compute the fixed length of subject that will match it,  /* Scan a branch and compute the fixed length of subject that will match it,
1338  if the length is fixed. This is needed for dealing with backward assertions.  if the length is fixed. This is needed for dealing with backward assertions.
1339  In UTF8 mode, the result is in characters rather than bytes.  In UTF8 mode, the result is in characters rather than bytes. The branch is
1340    temporarily terminated with OP_END when this function is called.
1341    
1342    This function is called when a backward assertion is encountered, so that if it
1343    fails, the error message can point to the correct place in the pattern.
1344    However, we cannot do this when the assertion contains subroutine calls,
1345    because they can be forward references. We solve this by remembering this case
1346    and doing the check at the end; a flag specifies which mode we are running in.
1347    
1348  Arguments:  Arguments:
1349    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
1350    options  the compiling options    options  the compiling options
1351      atend    TRUE if called when the pattern is complete
1352      cd       the "compile data" structure
1353    
1354  Returns:   the fixed length, or -1 if there is no fixed length,  Returns:   the fixed length,
1355                 or -1 if there is no fixed length,
1356               or -2 if \C was encountered               or -2 if \C was encountered
1357                 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1358  */  */
1359    
1360  static int  static int
1361  find_fixedlength(uschar *code, int options)  find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1362  {  {
1363  int length = -1;  int length = -1;
1364    
# Line 1245  branch, check the length against that of Line 1371  branch, check the length against that of
1371  for (;;)  for (;;)
1372    {    {
1373    int d;    int d;
1374      uschar *ce, *cs;
1375    register int op = *cc;    register int op = *cc;
1376    switch (op)    switch (op)
1377      {      {
# Line 1252  for (;;) Line 1379  for (;;)
1379      case OP_BRA:      case OP_BRA:
1380      case OP_ONCE:      case OP_ONCE:
1381      case OP_COND:      case OP_COND:
1382      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1383      if (d < 0) return d;      if (d < 0) return d;
1384      branchlength += d;      branchlength += d;
1385      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 1274  for (;;) Line 1401  for (;;)
1401      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
1402      branchlength = 0;      branchlength = 0;
1403      break;      break;
1404    
1405        /* A true recursion implies not fixed length, but a subroutine call may
1406        be OK. If the subroutine is a forward reference, we can't deal with
1407        it until the end of the pattern, so return -3. */
1408    
1409        case OP_RECURSE:
1410        if (!atend) return -3;
1411        cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1412        do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */
1413        if (cc > cs && cc < ce) return -1;                /* Recursion */
1414        d = find_fixedlength(cs + 2, options, atend, cd);
1415        if (d < 0) return d;
1416        branchlength += d;
1417        cc += 1 + LINK_SIZE;
1418        break;
1419    
1420      /* Skip over assertive subpatterns */      /* Skip over assertive subpatterns */
1421    
# Line 1311  for (;;) Line 1453  for (;;)
1453      branchlength++;      branchlength++;
1454      cc += 2;      cc += 2;
1455  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1456      if ((options & PCRE_UTF8) != 0)      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1457        {        cc += _pcre_utf8_table4[cc[-1] & 0x3f];
       while ((*cc & 0xc0) == 0x80) cc++;  
       }  
1458  #endif  #endif
1459      break;      break;
1460    
# Line 1325  for (;;) Line 1465  for (;;)
1465      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1466      cc += 4;      cc += 4;
1467  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1468      if ((options & PCRE_UTF8) != 0)      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1469        {        cc += _pcre_utf8_table4[cc[-1] & 0x3f];
       while((*cc & 0x80) == 0x80) cc++;  
       }  
1470  #endif  #endif
1471      break;      break;
1472    
# Line 1407  for (;;) Line 1545  for (;;)
1545    
1546    
1547  /*************************************************  /*************************************************
1548  *    Scan compiled regex for numbered bracket    *  *    Scan compiled regex for specific bracket    *
1549  *************************************************/  *************************************************/
1550    
1551  /* This little function scans through a compiled pattern until it finds a  /* This little function scans through a compiled pattern until it finds a
1552  capturing bracket with the given number.  capturing bracket with the given number, or, if the number is negative, an
1553    instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1554    so that it can be called from pcre_study() when finding the minimum matching
1555    length.
1556    
1557  Arguments:  Arguments:
1558    code        points to start of expression    code        points to start of expression
1559    utf8        TRUE in UTF-8 mode    utf8        TRUE in UTF-8 mode
1560    number      the required bracket number    number      the required bracket number or negative to find a lookbehind
1561    
1562  Returns:      pointer to the opcode for the bracket, or NULL if not found  Returns:      pointer to the opcode for the bracket, or NULL if not found
1563  */  */
1564    
1565  static const uschar *  const uschar *
1566  find_bracket(const uschar *code, BOOL utf8, int number)  _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1567  {  {
1568  for (;;)  for (;;)
1569    {    {
# Line 1434  for (;;) Line 1575  for (;;)
1575    the table is zero; the actual length is stored in the compiled code. */    the table is zero; the actual length is stored in the compiled code. */
1576    
1577    if (c == OP_XCLASS) code += GET(code, 1);    if (c == OP_XCLASS) code += GET(code, 1);
1578    
1579      /* Handle lookbehind (OP_REVERSE), which is what a negative number requests */
1580    
1581      else if (c == OP_REVERSE)
1582        {
1583        if (number < 0) return (uschar *)code;
1584        code += _pcre_OP_lengths[c];
1585        }
1586    
1587    /* Handle capturing bracket */    /* Handle capturing bracket */
1588    
# Line 1800  for (code = first_significant_code(code Line 1949  for (code = first_significant_code(code
1949      case OP_QUERY:      case OP_QUERY:
1950      case OP_MINQUERY:      case OP_MINQUERY:
1951      case OP_POSQUERY:      case OP_POSQUERY:
1952        if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
1953        break;
1954    
1955      case OP_UPTO:      case OP_UPTO:
1956      case OP_MINUPTO:      case OP_MINUPTO:
1957      case OP_POSUPTO:      case OP_POSUPTO:
1958      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
1959      break;      break;
1960  #endif  #endif
1961      }      }
# Line 3757  we set the flag only if there is a liter Line 3909  we set the flag only if there is a liter
3909    
3910        if (repeat_max == 0) goto END_REPEAT;        if (repeat_max == 0) goto END_REPEAT;
3911    
3912          /*--------------------------------------------------------------------*/
3913          /* This code is obsolete from release 8.00; the restriction was finally
3914          removed: */
3915    
3916        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
3917        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
3918    
3919        if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;        /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
3920          /*--------------------------------------------------------------------*/
3921    
3922        /* Combine the op_type with the repeat_type */        /* Combine the op_type with the repeat_type */
3923    
# Line 3907  we set the flag only if there is a liter Line 4064  we set the flag only if there is a liter
4064          goto END_REPEAT;          goto END_REPEAT;
4065          }          }
4066    
4067          /*--------------------------------------------------------------------*/
4068          /* This code is obsolete from release 8.00; the restriction was finally
4069          removed: */
4070    
4071        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
4072        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
4073    
4074        if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;        /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4075          /*--------------------------------------------------------------------*/
4076    
4077        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
4078          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
# Line 4225  we set the flag only if there is a liter Line 4387  we set the flag only if there is a liter
4387      if (possessive_quantifier)      if (possessive_quantifier)
4388        {        {
4389        int len;        int len;
4390        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||  
4391            *tempcode == OP_NOTEXACT)        if (*tempcode == OP_TYPEEXACT)
4392          tempcode += _pcre_OP_lengths[*tempcode] +          tempcode += _pcre_OP_lengths[*tempcode] +
4393            ((*tempcode == OP_TYPEEXACT &&            ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);
4394               (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);  
4395          else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
4396            {
4397            tempcode += _pcre_OP_lengths[*tempcode];
4398    #ifdef SUPPORT_UTF8
4399            if (utf8 && tempcode[-1] >= 0xc0)
4400              tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];
4401    #endif
4402            }
4403    
4404        len = code - tempcode;        len = code - tempcode;
4405        if (len > 0) switch (*tempcode)        if (len > 0) switch (*tempcode)
4406          {          {
# Line 4307  we set the flag only if there is a liter Line 4478  we set the flag only if there is a liter
4478          if (namelen == verbs[i].len &&          if (namelen == verbs[i].len &&
4479              strncmp((char *)name, vn, namelen) == 0)              strncmp((char *)name, vn, namelen) == 0)
4480            {            {
4481            *code = verbs[i].op;            /* Check for open captures before ACCEPT */
4482            if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;  
4483              if (verbs[i].op == OP_ACCEPT)
4484                {
4485                open_capitem *oc;
4486                cd->had_accept = TRUE;
4487                for (oc = cd->open_caps; oc != NULL; oc = oc->next)
4488                  {
4489                  *code++ = OP_CLOSE;
4490                  PUT2INC(code, 0, oc->number);
4491                  }
4492                }
4493              *code++ = verbs[i].op;
4494            break;            break;
4495            }            }
4496          vn += verbs[i].len + 1;          vn += verbs[i].len + 1;
# Line 4489  we set the flag only if there is a liter Line 4671  we set the flag only if there is a liter
4671    
4672          /* Search the pattern for a forward reference */          /* Search the pattern for a forward reference */
4673    
4674          else if ((i = find_parens(ptr, cd, name, namelen,          else if ((i = find_parens(cd, name, namelen,
4675                          (options & PCRE_EXTENDED) != 0)) > 0)                          (options & PCRE_EXTENDED) != 0)) > 0)
4676            {            {
4677            PUT2(code, 2+LINK_SIZE, i);            PUT2(code, 2+LINK_SIZE, i);
# Line 4788  we set the flag only if there is a liter Line 4970  we set the flag only if there is a liter
4970              recno = GET2(slot, 0);              recno = GET2(slot, 0);
4971              }              }
4972            else if ((recno =                /* Forward back reference */            else if ((recno =                /* Forward back reference */
4973                      find_parens(ptr, cd, name, namelen,                      find_parens(cd, name, namelen,
4974                        (options & PCRE_EXTENDED) != 0)) <= 0)                        (options & PCRE_EXTENDED) != 0)) <= 0)
4975              {              {
4976              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
# Line 4892  we set the flag only if there is a liter Line 5074  we set the flag only if there is a liter
5074            if (lengthptr == NULL)            if (lengthptr == NULL)
5075              {              {
5076              *code = OP_END;              *code = OP_END;
5077              if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);              if (recno != 0)
5078                  called = _pcre_find_bracket(cd->start_code, utf8, recno);
5079    
5080              /* Forward reference */              /* Forward reference */
5081    
5082              if (called == NULL)              if (called == NULL)
5083                {                {
5084                if (find_parens(ptr, cd, NULL, recno,                if (find_parens(cd, NULL, recno,
5085                      (options & PCRE_EXTENDED) != 0) < 0)                      (options & PCRE_EXTENDED) != 0) < 0)
5086                  {                  {
5087                  *errorcodeptr = ERR15;                  *errorcodeptr = ERR15;
# Line 5536  uschar *code = *codeptr; Line 5719  uschar *code = *codeptr;
5719  uschar *last_branch = code;  uschar *last_branch = code;
5720  uschar *start_bracket = code;  uschar *start_bracket = code;
5721  uschar *reverse_count = NULL;  uschar *reverse_count = NULL;
5722    open_capitem capitem;
5723    int capnumber = 0;
5724  int firstbyte, reqbyte;  int firstbyte, reqbyte;
5725  int branchfirstbyte, branchreqbyte;  int branchfirstbyte, branchreqbyte;
5726  int length;  int length;
# Line 5562  the code that abstracts option settings Line 5747  the code that abstracts option settings
5747  them global. It tests the value of length for (2 + 2*LINK_SIZE) in the  them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5748  pre-compile phase to find out whether anything has yet been compiled or not. */  pre-compile phase to find out whether anything has yet been compiled or not. */
5749    
5750    /* If this is a capturing subpattern, add to the chain of open capturing items
5751    so that we can detect them if (*ACCEPT) is encountered. */
5752    
5753    if (*code == OP_CBRA)
5754      {
5755      capnumber = GET2(code, 1 + LINK_SIZE);
5756      capitem.number = capnumber;
5757      capitem.next = cd->open_caps;
5758      cd->open_caps = &capitem;
5759      }
5760    
5761  /* Offset is set zero to mark that this bracket is still open */  /* Offset is set zero to mark that this bracket is still open */
5762    
5763  PUT(code, 1, 0);  PUT(code, 1, 0);
# Line 5656  for (;;) Line 5852  for (;;)
5852    
5853      /* If lookbehind, check that this branch matches a fixed-length string, and      /* If lookbehind, check that this branch matches a fixed-length string, and
5854      put the length into the OP_REVERSE item. Temporarily mark the end of the      put the length into the OP_REVERSE item. Temporarily mark the end of the
5855      branch with OP_END. */      branch with OP_END. If the branch contains OP_RECURSE, the result is -3
5856        because there may be forward references that we can't check here. Set a
5857        flag to cause another lookbehind check at the end. Why not do it all at the
5858        end? Because common, erroneous checks are picked up here and the offset of
5859        the problem can be shown. */
5860    
5861      if (lookbehind)      if (lookbehind)
5862        {        {
5863        int fixed_length;        int fixed_length;
5864        *code = OP_END;        *code = OP_END;
5865        fixed_length = find_fixedlength(last_branch, options);        fixed_length = find_fixedlength(last_branch, options, FALSE, cd);
5866        DPRINTF(("fixed length = %d\n", fixed_length));        DPRINTF(("fixed length = %d\n", fixed_length));
5867        if (fixed_length < 0)        if (fixed_length == -3)
5868            {
5869            cd->check_lookbehind = TRUE;
5870            }
5871          else if (fixed_length < 0)
5872          {          {
5873          *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;          *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5874          *ptrptr = ptr;          *ptrptr = ptr;
5875          return FALSE;          return FALSE;
5876          }          }
5877        PUT(reverse_count, 0, fixed_length);        else { PUT(reverse_count, 0, fixed_length); }
5878        }        }
5879      }      }
5880    
# Line 5697  for (;;) Line 5901  for (;;)
5901          }          }
5902        while (branch_length > 0);        while (branch_length > 0);
5903        }        }
5904    
5905        /* If it was a capturing subpattern, remove it from the chain. */
5906    
5907        if (capnumber > 0) cd->open_caps = cd->open_caps->next;
5908    
5909      /* Fill in the ket */      /* Fill in the ket */
5910    
# Line 6069  int length = 1;  /* For final END opcode Line 6277  int length = 1;  /* For final END opcode
6277  int firstbyte, reqbyte, newline;  int firstbyte, reqbyte, newline;
6278  int errorcode = 0;  int errorcode = 0;
6279  int skipatstart = 0;  int skipatstart = 0;
6280  #ifdef SUPPORT_UTF8  BOOL utf8 = (options & PCRE_UTF8) != 0;
 BOOL utf8;  
 #endif  
6281  size_t size;  size_t size;
6282  uschar *code;  uschar *code;
6283  const uschar *codestart;  const uschar *codestart;
# Line 6114  if (erroroffset == NULL) Line 6320  if (erroroffset == NULL)
6320    
6321  *erroroffset = 0;  *erroroffset = 0;
6322    
 /* Can't support UTF8 unless PCRE has been compiled to include the code. */  
   
 #ifdef SUPPORT_UTF8  
 utf8 = (options & PCRE_UTF8) != 0;  
 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&  
      (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)  
   {  
   errorcode = ERR44;  
   goto PCRE_EARLY_ERROR_RETURN2;  
   }  
 #else  
 if ((options & PCRE_UTF8) != 0)  
   {  
   errorcode = ERR32;  
   goto PCRE_EARLY_ERROR_RETURN;  
   }  
 #endif  
   
 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)  
   {  
   errorcode = ERR17;  
   goto PCRE_EARLY_ERROR_RETURN;  
   }  
   
6323  /* Set up pointers to the individual character tables */  /* Set up pointers to the individual character tables */
6324    
6325  if (tables == NULL) tables = _pcre_default_tables;  if (tables == NULL) tables = _pcre_default_tables;
# Line 6146  cd->fcc = tables + fcc_offset; Line 6328  cd->fcc = tables + fcc_offset;
6328  cd->cbits = tables + cbits_offset;  cd->cbits = tables + cbits_offset;
6329  cd->ctypes = tables + ctypes_offset;  cd->ctypes = tables + ctypes_offset;
6330    
6331    /* Check that all undefined public option bits are zero */
6332    
6333    if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
6334      {
6335      errorcode = ERR17;
6336      goto PCRE_EARLY_ERROR_RETURN;
6337      }
6338    
6339  /* Check for global one-time settings at the start of the pattern, and remember  /* Check for global one-time settings at the start of the pattern, and remember
6340  the offset for later. */  the offset for later. */
6341    
# Line 6155  while (ptr[skipatstart] == CHAR_LEFT_PAR Line 6345  while (ptr[skipatstart] == CHAR_LEFT_PAR
6345    int newnl = 0;    int newnl = 0;
6346    int newbsr = 0;    int newbsr = 0;
6347    
6348      if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
6349        { skipatstart += 7; options |= PCRE_UTF8; continue; }
6350    
6351    if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)    if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
6352      { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }      { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
6353    else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3)  == 0)    else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3)  == 0)
# Line 6178  while (ptr[skipatstart] == CHAR_LEFT_PAR Line 6371  while (ptr[skipatstart] == CHAR_LEFT_PAR
6371    else break;    else break;
6372    }    }
6373    
6374    /* Can't support UTF8 unless PCRE has been compiled to include the code. */
6375    
6376    #ifdef SUPPORT_UTF8
6377    if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
6378         (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
6379      {
6380      errorcode = ERR44;
6381      goto PCRE_EARLY_ERROR_RETURN2;
6382      }
6383    #else
6384    if (utf8)
6385      {
6386      errorcode = ERR32;
6387      goto PCRE_EARLY_ERROR_RETURN;
6388      }
6389    #endif
6390    
6391  /* Check validity of \R options. */  /* Check validity of \R options. */
6392    
6393  switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))  switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
# Line 6260  cd->end_pattern = (const uschar *)(patte Line 6470  cd->end_pattern = (const uschar *)(patte
6470  cd->req_varyopt = 0;  cd->req_varyopt = 0;
6471  cd->external_options = options;  cd->external_options = options;
6472  cd->external_flags = 0;  cd->external_flags = 0;
6473    cd->open_caps = NULL;
6474    
6475  /* Now do the pre-compile. On error, errorcode will be set non-zero, so we  /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
6476  don't need to look at the result of the function here. The initial options have  don't need to look at the result of the function here. The initial options have
# Line 6334  cd->start_code = codestart; Line 6545  cd->start_code = codestart;
6545  cd->hwm = cworkspace;  cd->hwm = cworkspace;
6546  cd->req_varyopt = 0;  cd->req_varyopt = 0;
6547  cd->had_accept = FALSE;  cd->had_accept = FALSE;
6548    cd->check_lookbehind = FALSE;
6549    cd->open_caps = NULL;
6550    
6551  /* Set up a starting, non-extracting bracket, then compile the expression. On  /* Set up a starting, non-extracting bracket, then compile the expression. On
6552  error, errorcode will be set non-zero, so we don't need to look at the result  error, errorcode will be set non-zero, so we don't need to look at the result
# Line 6372  while (errorcode == 0 && cd->hwm > cwork Line 6585  while (errorcode == 0 && cd->hwm > cwork
6585    cd->hwm -= LINK_SIZE;    cd->hwm -= LINK_SIZE;
6586    offset = GET(cd->hwm, 0);    offset = GET(cd->hwm, 0);
6587    recno = GET(codestart, offset);    recno = GET(codestart, offset);
6588    groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);    groupptr = _pcre_find_bracket(codestart, utf8, recno);
6589    if (groupptr == NULL) errorcode = ERR53;    if (groupptr == NULL) errorcode = ERR53;
6590      else PUT(((uschar *)codestart), offset, groupptr - codestart);      else PUT(((uschar *)codestart), offset, groupptr - codestart);
6591    }    }
# Line 6382  subpattern. */ Line 6595  subpattern. */
6595    
6596  if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;  if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6597    
6598    /* If there were any lookbehind assertions that contained OP_RECURSE
6599    (recursions or subroutine calls), a flag is set for them to be checked here,
6600    because they may contain forward references. Actual recursions can't be fixed
6601    length, but subroutine calls can. It is done like this so that those without
6602    OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
6603    exceptional ones forgo this. We scan the pattern to check that they are fixed
6604    length, and set their lengths. */
6605    
6606    if (cd->check_lookbehind)
6607      {
6608      uschar *cc = (uschar *)codestart;
6609    
6610      /* Loop, searching for OP_REVERSE items, and process those that do not have
6611      their length set. (Actually, it will also re-process any that have a length
6612      of zero, but that is a pathological case, and it does no harm.) When we find
6613      one, we temporarily terminate the branch it is in while we scan it. */
6614    
6615      for (cc = (uschar *)_pcre_find_bracket(codestart, utf8, -1);
6616           cc != NULL;
6617           cc = (uschar *)_pcre_find_bracket(cc, utf8, -1))
6618        {
6619        if (GET(cc, 1) == 0)
6620          {
6621          int fixed_length;
6622          uschar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
6623          int end_op = *be;
6624          *be = OP_END;
6625          fixed_length = find_fixedlength(cc, re->options, TRUE, cd);
6626          *be = end_op;
6627          DPRINTF(("fixed length = %d\n", fixed_length));
6628          if (fixed_length < 0)
6629            {
6630            errorcode = (fixed_length == -2)? ERR36 : ERR25;
6631            break;
6632            }
6633          PUT(cc, 1, fixed_length);
6634          }
6635        cc += 1 + LINK_SIZE;
6636        }
6637      }
6638    
6639  /* Failed to compile, or error while post-processing */  /* Failed to compile, or error while post-processing */
6640    
6641  if (errorcode != 0)  if (errorcode != 0)

Legend:
Removed from v.406  
changed lines
  Added in v.455

  ViewVC Help
Powered by ViewVC 1.1.5