/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 447 by ph10, Tue Sep 15 18:17:54 2009 UTC revision 496 by ph10, Tue Mar 2 19:11:17 2010 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2009 University of Cambridge             Copyright (c) 1997-2010 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 53  supporting internal functions that are n Line 53  supporting internal functions that are n
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
56  /* When DEBUG is defined, we need the pcre_printint() function, which is also  /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57  used by pcretest. DEBUG is not defined when building a production library. */  also used by pcretest. PCRE_DEBUG is not defined when building a production
58    library. */
59    
60  #ifdef DEBUG  #ifdef PCRE_DEBUG
61  #include "pcre_printint.src"  #include "pcre_printint.src"
62  #endif  #endif
63    
# Line 341  static const char error_texts[] = Line 342  static const char error_texts[] =
342    "number is too big\0"    "number is too big\0"
343    "subpattern name expected\0"    "subpattern name expected\0"
344    "digit expected after (?+\0"    "digit expected after (?+\0"
345    "] is an invalid data character in JavaScript compatibility mode";    "] is an invalid data character in JavaScript compatibility mode\0"
346      /* 65 */
347      "different names for subpatterns of the same number are not allowed";
348    
349    
350  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 1100  if (ptr[0] == CHAR_LEFT_PARENTHESIS) Line 1103  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1103        if (name != NULL && lorn == ptr - thisname &&        if (name != NULL && lorn == ptr - thisname &&
1104            strncmp((const char *)name, (const char *)thisname, lorn) == 0)            strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1105          return *count;          return *count;
1106        term++;        term++;
1107        }        }
1108      }      }
1109    }    }
# Line 1146  for (; *ptr != 0; ptr++) Line 1149  for (; *ptr != 0; ptr++)
1149            break;            break;
1150          }          }
1151        else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)        else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1152          {          {
1153          negate_class = TRUE;          negate_class = TRUE;
1154          ptr++;          ptr++;
1155          }          }
1156        else break;        else break;
1157        }        }
1158    
# Line 1315  for (;;) Line 1318  for (;;)
1318    
1319      case OP_CALLOUT:      case OP_CALLOUT:
1320      case OP_CREF:      case OP_CREF:
1321        case OP_NCREF:
1322      case OP_RREF:      case OP_RREF:
1323        case OP_NRREF:
1324      case OP_DEF:      case OP_DEF:
1325      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1326      break;      break;
# Line 1331  for (;;) Line 1336  for (;;)
1336    
1337    
1338  /*************************************************  /*************************************************
1339  *        Find the fixed length of a pattern      *  *        Find the fixed length of a branch       *
1340  *************************************************/  *************************************************/
1341    
1342  /* Scan a pattern and compute the fixed length of subject that will match it,  /* Scan a branch and compute the fixed length of subject that will match it,
1343  if the length is fixed. This is needed for dealing with backward assertions.  if the length is fixed. This is needed for dealing with backward assertions.
1344  In UTF8 mode, the result is in characters rather than bytes.  In UTF8 mode, the result is in characters rather than bytes. The branch is
1345    temporarily terminated with OP_END when this function is called.
1346    
1347    This function is called when a backward assertion is encountered, so that if it
1348    fails, the error message can point to the correct place in the pattern.
1349    However, we cannot do this when the assertion contains subroutine calls,
1350    because they can be forward references. We solve this by remembering this case
1351    and doing the check at the end; a flag specifies which mode we are running in.
1352    
1353  Arguments:  Arguments:
1354    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
1355    options  the compiling options    options  the compiling options
1356      atend    TRUE if called when the pattern is complete
1357      cd       the "compile data" structure
1358    
1359  Returns:   the fixed length, or -1 if there is no fixed length,  Returns:   the fixed length,
1360                 or -1 if there is no fixed length,
1361               or -2 if \C was encountered               or -2 if \C was encountered
1362                 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1363  */  */
1364    
1365  static int  static int
1366  find_fixedlength(uschar *code, int options)  find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1367  {  {
1368  int length = -1;  int length = -1;
1369    
# Line 1360  branch, check the length against that of Line 1376  branch, check the length against that of
1376  for (;;)  for (;;)
1377    {    {
1378    int d;    int d;
1379      uschar *ce, *cs;
1380    register int op = *cc;    register int op = *cc;
1381    switch (op)    switch (op)
1382      {      {
# Line 1367  for (;;) Line 1384  for (;;)
1384      case OP_BRA:      case OP_BRA:
1385      case OP_ONCE:      case OP_ONCE:
1386      case OP_COND:      case OP_COND:
1387      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1388      if (d < 0) return d;      if (d < 0) return d;
1389      branchlength += d;      branchlength += d;
1390      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 1390  for (;;) Line 1407  for (;;)
1407      branchlength = 0;      branchlength = 0;
1408      break;      break;
1409    
1410        /* A true recursion implies not fixed length, but a subroutine call may
1411        be OK. If the subroutine is a forward reference, we can't deal with
1412        it until the end of the pattern, so return -3. */
1413    
1414        case OP_RECURSE:
1415        if (!atend) return -3;
1416        cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1417        do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */
1418        if (cc > cs && cc < ce) return -1;                /* Recursion */
1419        d = find_fixedlength(cs + 2, options, atend, cd);
1420        if (d < 0) return d;
1421        branchlength += d;
1422        cc += 1 + LINK_SIZE;
1423        break;
1424    
1425      /* Skip over assertive subpatterns */      /* Skip over assertive subpatterns */
1426    
1427      case OP_ASSERT:      case OP_ASSERT:
# Line 1403  for (;;) Line 1435  for (;;)
1435    
1436      case OP_REVERSE:      case OP_REVERSE:
1437      case OP_CREF:      case OP_CREF:
1438        case OP_NCREF:
1439      case OP_RREF:      case OP_RREF:
1440        case OP_NRREF:
1441      case OP_DEF:      case OP_DEF:
1442      case OP_OPT:      case OP_OPT:
1443      case OP_CALLOUT:      case OP_CALLOUT:
# Line 1426  for (;;) Line 1460  for (;;)
1460      branchlength++;      branchlength++;
1461      cc += 2;      cc += 2;
1462  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1463      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1464        cc += _pcre_utf8_table4[cc[-1] & 0x3f];        cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1465  #endif  #endif
1466      break;      break;
# Line 1438  for (;;) Line 1472  for (;;)
1472      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1473      cc += 4;      cc += 4;
1474  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1475      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1476        cc += _pcre_utf8_table4[cc[-1] & 0x3f];        cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1477  #endif  #endif
1478      break;      break;
# Line 1518  for (;;) Line 1552  for (;;)
1552    
1553    
1554  /*************************************************  /*************************************************
1555  *    Scan compiled regex for numbered bracket    *  *    Scan compiled regex for specific bracket    *
1556  *************************************************/  *************************************************/
1557    
1558  /* This little function scans through a compiled pattern until it finds a  /* This little function scans through a compiled pattern until it finds a
1559  capturing bracket with the given number.  capturing bracket with the given number, or, if the number is negative, an
1560    instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1561    so that it can be called from pcre_study() when finding the minimum matching
1562    length.
1563    
1564  Arguments:  Arguments:
1565    code        points to start of expression    code        points to start of expression
1566    utf8        TRUE in UTF-8 mode    utf8        TRUE in UTF-8 mode
1567    number      the required bracket number    number      the required bracket number or negative to find a lookbehind
1568    
1569  Returns:      pointer to the opcode for the bracket, or NULL if not found  Returns:      pointer to the opcode for the bracket, or NULL if not found
1570  */  */
1571    
1572  static const uschar *  const uschar *
1573  find_bracket(const uschar *code, BOOL utf8, int number)  _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1574  {  {
1575  for (;;)  for (;;)
1576    {    {
# Line 1546  for (;;) Line 1583  for (;;)
1583    
1584    if (c == OP_XCLASS) code += GET(code, 1);    if (c == OP_XCLASS) code += GET(code, 1);
1585    
1586      /* Handle lookbehind (OP_REVERSE); returned only when number is negative */
1587    
1588      else if (c == OP_REVERSE)
1589        {
1590        if (number < 0) return (uschar *)code;
1591        code += _pcre_OP_lengths[c];
1592        }
1593    
1594    /* Handle capturing bracket */    /* Handle capturing bracket */
1595    
1596    else if (c == OP_CBRA)    else if (c == OP_CBRA)
# Line 1913  for (code = first_significant_code(code Line 1958  for (code = first_significant_code(code
1958      case OP_POSQUERY:      case OP_POSQUERY:
1959      if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];      if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
1960      break;      break;
1961    
1962      case OP_UPTO:      case OP_UPTO:
1963      case OP_MINUPTO:      case OP_MINUPTO:
1964      case OP_POSUPTO:      case OP_POSUPTO:
# Line 1950  static BOOL Line 1995  static BOOL
1995  could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,  could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1996    BOOL utf8)    BOOL utf8)
1997  {  {
1998  while (bcptr != NULL && bcptr->current >= code)  while (bcptr != NULL && bcptr->current_branch >= code)
1999    {    {
2000    if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;    if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8))
2001        return FALSE;
2002    bcptr = bcptr->outer;    bcptr = bcptr->outer;
2003    }    }
2004  return TRUE;  return TRUE;
# Line 2614  BOOL utf8 = FALSE; Line 2660  BOOL utf8 = FALSE;
2660  uschar *utf8_char = NULL;  uschar *utf8_char = NULL;
2661  #endif  #endif
2662    
2663  #ifdef DEBUG  #ifdef PCRE_DEBUG
2664  if (lengthptr != NULL) DPRINTF((">> start branch\n"));  if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2665  #endif  #endif
2666    
# Line 2673  for (;; ptr++) Line 2719  for (;; ptr++)
2719    
2720    if (lengthptr != NULL)    if (lengthptr != NULL)
2721      {      {
2722  #ifdef DEBUG  #ifdef PCRE_DEBUG
2723      if (code > cd->hwm) cd->hwm = code;                 /* High water info */      if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2724  #endif  #endif
2725      if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */      if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
# Line 3871  we set the flag only if there is a liter Line 3917  we set the flag only if there is a liter
3917    
3918        if (repeat_max == 0) goto END_REPEAT;        if (repeat_max == 0) goto END_REPEAT;
3919    
3920        /*--------------------------------------------------------------------*/        /*--------------------------------------------------------------------*/
3921        /* This code is obsolete from release 8.00; the restriction was finally        /* This code is obsolete from release 8.00; the restriction was finally
3922        removed: */        removed: */
3923    
3924        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
3925        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
3926    
3927        /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */        /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
3928        /*--------------------------------------------------------------------*/        /*--------------------------------------------------------------------*/
3929    
3930        /* Combine the op_type with the repeat_type */        /* Combine the op_type with the repeat_type */
3931    
# Line 4026  we set the flag only if there is a liter Line 4072  we set the flag only if there is a liter
4072          goto END_REPEAT;          goto END_REPEAT;
4073          }          }
4074    
4075        /*--------------------------------------------------------------------*/        /*--------------------------------------------------------------------*/
4076        /* This code is obsolete from release 8.00; the restriction was finally        /* This code is obsolete from release 8.00; the restriction was finally
4077        removed: */        removed: */
4078    
# Line 4034  we set the flag only if there is a liter Line 4080  we set the flag only if there is a liter
4080        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
4081    
4082        /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */        /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4083        /*--------------------------------------------------------------------*/        /*--------------------------------------------------------------------*/
4084    
4085        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
4086          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
# Line 4169  we set the flag only if there is a liter Line 4215  we set the flag only if there is a liter
4215            {            {
4216            /* In the pre-compile phase, we don't actually do the replication. We            /* In the pre-compile phase, we don't actually do the replication. We
4217            just adjust the length as if we had. Do some paranoid checks for            just adjust the length as if we had. Do some paranoid checks for
4218            potential integer overflow. */            potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
4219              integer type when available, otherwise double. */
4220    
4221            if (lengthptr != NULL)            if (lengthptr != NULL)
4222              {              {
4223              int delta = (repeat_min - 1)*length_prevgroup;              int delta = (repeat_min - 1)*length_prevgroup;
4224              if ((double)(repeat_min - 1)*(double)length_prevgroup >              if ((INT64_OR_DOUBLE)(repeat_min - 1)*
4225                                                              (double)INT_MAX ||                    (INT64_OR_DOUBLE)length_prevgroup >
4226                        (INT64_OR_DOUBLE)INT_MAX ||
4227                  OFLOW_MAX - *lengthptr < delta)                  OFLOW_MAX - *lengthptr < delta)
4228                {                {
4229                *errorcodeptr = ERR20;                *errorcodeptr = ERR20;
# Line 4221  we set the flag only if there is a liter Line 4269  we set the flag only if there is a liter
4269          just adjust the length as if we had. For each repetition we must add 1          just adjust the length as if we had. For each repetition we must add 1
4270          to the length for BRAZERO and for all but the last repetition we must          to the length for BRAZERO and for all but the last repetition we must
4271          add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some          add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
4272          paranoid checks to avoid integer overflow. */          paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
4273            a 64-bit integer type when available, otherwise double. */
4274    
4275          if (lengthptr != NULL && repeat_max > 0)          if (lengthptr != NULL && repeat_max > 0)
4276            {            {
4277            int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -            int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4278                        2 - 2*LINK_SIZE;   /* Last one doesn't nest */                        2 - 2*LINK_SIZE;   /* Last one doesn't nest */
4279            if ((double)repeat_max *            if ((INT64_OR_DOUBLE)repeat_max *
4280                  (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)                  (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4281                    > (double)INT_MAX ||                    > (INT64_OR_DOUBLE)INT_MAX ||
4282                OFLOW_MAX - *lengthptr < delta)                OFLOW_MAX - *lengthptr < delta)
4283              {              {
4284              *errorcodeptr = ERR20;              *errorcodeptr = ERR20;
# Line 4349  we set the flag only if there is a liter Line 4398  we set the flag only if there is a liter
4398      if (possessive_quantifier)      if (possessive_quantifier)
4399        {        {
4400        int len;        int len;
4401    
4402        if (*tempcode == OP_TYPEEXACT)        if (*tempcode == OP_TYPEEXACT)
4403          tempcode += _pcre_OP_lengths[*tempcode] +          tempcode += _pcre_OP_lengths[*tempcode] +
4404            ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);            ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);
4405    
4406        else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)        else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
4407          {          {
4408          tempcode += _pcre_OP_lengths[*tempcode];          tempcode += _pcre_OP_lengths[*tempcode];
# Line 4361  we set the flag only if there is a liter Line 4410  we set the flag only if there is a liter
4410          if (utf8 && tempcode[-1] >= 0xc0)          if (utf8 && tempcode[-1] >= 0xc0)
4411            tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];            tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];
4412  #endif  #endif
4413          }          }
4414    
4415        len = code - tempcode;        len = code - tempcode;
4416        if (len > 0) switch (*tempcode)        if (len > 0) switch (*tempcode)
4417          {          {
# Line 4381  we set the flag only if there is a liter Line 4430  we set the flag only if there is a liter
4430          case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;          case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4431          case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;          case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
4432    
4433            /* Because we are moving code along, we must ensure that any
4434            pending recursive references are updated. */
4435    
4436          default:          default:
4437            *code = OP_END;
4438            adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm);
4439          memmove(tempcode + 1+LINK_SIZE, tempcode, len);          memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4440          code += 1 + LINK_SIZE;          code += 1 + LINK_SIZE;
4441          len += 1 + LINK_SIZE;          len += 1 + LINK_SIZE;
# Line 4441  we set the flag only if there is a liter Line 4495  we set the flag only if there is a liter
4495              strncmp((char *)name, vn, namelen) == 0)              strncmp((char *)name, vn, namelen) == 0)
4496            {            {
4497            /* Check for open captures before ACCEPT */            /* Check for open captures before ACCEPT */
4498    
4499            if (verbs[i].op == OP_ACCEPT)            if (verbs[i].op == OP_ACCEPT)
4500              {              {
4501              open_capitem *oc;              open_capitem *oc;
4502              cd->had_accept = TRUE;              cd->had_accept = TRUE;
4503              for (oc = cd->open_caps; oc != NULL; oc = oc->next)              for (oc = cd->open_caps; oc != NULL; oc = oc->next)
4504                {                {
4505                *code++ = OP_CLOSE;                *code++ = OP_CLOSE;
4506                PUT2INC(code, 0, oc->number);                PUT2INC(code, 0, oc->number);
4507                }                }
4508              }              }
4509            *code++ = verbs[i].op;            *code++ = verbs[i].op;
4510            break;            break;
4511            }            }
# Line 4614  we set the flag only if there is a liter Line 4668  we set the flag only if there is a liter
4668            }            }
4669    
4670          /* Otherwise (did not start with "+" or "-"), start by looking for the          /* Otherwise (did not start with "+" or "-"), start by looking for the
4671          name. */          name. If we find a name, add one to the opcode to change OP_CREF or
4672            OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,
4673            except they record that the reference was originally to a name. The
4674            information is used to check duplicate names. */
4675    
4676          slot = cd->name_table;          slot = cd->name_table;
4677          for (i = 0; i < cd->names_found; i++)          for (i = 0; i < cd->names_found; i++)
# Line 4629  we set the flag only if there is a liter Line 4686  we set the flag only if there is a liter
4686            {            {
4687            recno = GET2(slot, 0);            recno = GET2(slot, 0);
4688            PUT2(code, 2+LINK_SIZE, recno);            PUT2(code, 2+LINK_SIZE, recno);
4689              code[1+LINK_SIZE]++;
4690            }            }
4691    
4692          /* Search the pattern for a forward reference */          /* Search the pattern for a forward reference */
# Line 4637  we set the flag only if there is a liter Line 4695  we set the flag only if there is a liter
4695                          (options & PCRE_EXTENDED) != 0)) > 0)                          (options & PCRE_EXTENDED) != 0)) > 0)
4696            {            {
4697            PUT2(code, 2+LINK_SIZE, i);            PUT2(code, 2+LINK_SIZE, i);
4698              code[1+LINK_SIZE]++;
4699            }            }
4700    
4701          /* If terminator == 0 it means that the name followed directly after          /* If terminator == 0 it means that the name followed directly after
# Line 4829  we set the flag only if there is a liter Line 4888  we set the flag only if there is a liter
4888                }                }
4889              }              }
4890    
4891            /* In the real compile, create the entry in the table */            /* In the real compile, create the entry in the table, maintaining
4892              alphabetical order. Duplicate names for different numbers are
4893              permitted only if PCRE_DUPNAMES is set. Duplicate names for the same
4894              number are always OK. (An existing number can be re-used if (?|
4895              appears in the pattern.) In either event, a duplicate name results in
4896              a duplicate entry in the table, even if the number is the same. This
4897              is because the number of names, and hence the table size, is computed
4898              in the pre-compile, and it affects various numbers and pointers which
4899              would all have to be modified, and the compiled code moved down, if
4900              duplicates with the same number were omitted from the table. This
4901              doesn't seem worth the hassle. However, *different* names for the
4902              same number are not permitted. */
4903    
4904            else            else
4905              {              {
4906                BOOL dupname = FALSE;
4907              slot = cd->name_table;              slot = cd->name_table;
4908    
4909              for (i = 0; i < cd->names_found; i++)              for (i = 0; i < cd->names_found; i++)
4910                {                {
4911                int crc = memcmp(name, slot+2, namelen);                int crc = memcmp(name, slot+2, namelen);
# Line 4841  we set the flag only if there is a liter Line 4913  we set the flag only if there is a liter
4913                  {                  {
4914                  if (slot[2+namelen] == 0)                  if (slot[2+namelen] == 0)
4915                    {                    {
4916                    if ((options & PCRE_DUPNAMES) == 0)                    if (GET2(slot, 0) != cd->bracount + 1 &&
4917                          (options & PCRE_DUPNAMES) == 0)
4918                      {                      {
4919                      *errorcodeptr = ERR43;                      *errorcodeptr = ERR43;
4920                      goto FAILED;                      goto FAILED;
4921                      }                      }
4922                      else dupname = TRUE;
4923                    }                    }
4924                  else crc = -1;      /* Current name is substring */                  else crc = -1;      /* Current name is a substring */
4925                  }                  }
4926    
4927                  /* Make space in the table and break the loop for an earlier
4928                  name. For a duplicate or later name, carry on. We do this for
4929                  duplicates so that in the simple case (when (?| is not used) they
4930                  are in order of their numbers. */
4931    
4932                if (crc < 0)                if (crc < 0)
4933                  {                  {
4934                  memmove(slot + cd->name_entry_size, slot,                  memmove(slot + cd->name_entry_size, slot,
4935                    (cd->names_found - i) * cd->name_entry_size);                    (cd->names_found - i) * cd->name_entry_size);
4936                  break;                  break;
4937                  }                  }
4938    
4939                  /* Continue the loop for a later or duplicate name */
4940    
4941                slot += cd->name_entry_size;                slot += cd->name_entry_size;
4942                }                }
4943    
4944                /* For non-duplicate names, check for a duplicate number before
4945                adding the new name. */
4946    
4947                if (!dupname)
4948                  {
4949                  uschar *cslot = cd->name_table;
4950                  for (i = 0; i < cd->names_found; i++)
4951                    {
4952                    if (cslot != slot)
4953                      {
4954                      if (GET2(cslot, 0) == cd->bracount + 1)
4955                        {
4956                        *errorcodeptr = ERR65;
4957                        goto FAILED;
4958                        }
4959                      }
4960                    else i--;
4961                    cslot += cd->name_entry_size;
4962                    }
4963                  }
4964    
4965              PUT2(slot, 0, cd->bracount + 1);              PUT2(slot, 0, cd->bracount + 1);
4966              memcpy(slot + 2, name, namelen);              memcpy(slot + 2, name, namelen);
4967              slot[2+namelen] = 0;              slot[2+namelen] = 0;
4968              }              }
4969            }            }
4970    
4971          /* In both cases, count the number of names we've encountered. */          /* In both pre-compile and compile, count the number of names we've
4972            encountered. */
4973    
         ptr++;                    /* Move past > or ' */  
4974          cd->names_found++;          cd->names_found++;
4975            ptr++;                    /* Move past > or ' */
4976          goto NUMBERED_GROUP;          goto NUMBERED_GROUP;
4977    
4978    
# Line 5036  we set the flag only if there is a liter Line 5141  we set the flag only if there is a liter
5141            if (lengthptr == NULL)            if (lengthptr == NULL)
5142              {              {
5143              *code = OP_END;              *code = OP_END;
5144              if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);              if (recno != 0)
5145                  called = _pcre_find_bracket(cd->start_code, utf8, recno);
5146    
5147              /* Forward reference */              /* Forward reference */
5148    
# Line 5048  we set the flag only if there is a liter Line 5154  we set the flag only if there is a liter
5154                  *errorcodeptr = ERR15;                  *errorcodeptr = ERR15;
5155                  goto FAILED;                  goto FAILED;
5156                  }                  }
5157    
5158                  /* Fudge the value of "called" so that when it is inserted as an
5159                  offset below, what is actually inserted is the reference number
5160                  of the group. */
5161    
5162                called = cd->start_code + recno;                called = cd->start_code + recno;
5163                PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);                PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
5164                }                }
# Line 5152  we set the flag only if there is a liter Line 5263  we set the flag only if there is a liter
5263              {              {
5264              cd->external_options = newoptions;              cd->external_options = newoptions;
5265              }              }
5266           else            else
5267              {              {
5268              if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))              if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
5269                {                {
# Line 5489  we set the flag only if there is a liter Line 5600  we set the flag only if there is a liter
5600    
5601        if (-c >= ESC_REF)        if (-c >= ESC_REF)
5602          {          {
5603            open_capitem *oc;
5604          recno = -c - ESC_REF;          recno = -c - ESC_REF;
5605    
5606          HANDLE_REFERENCE:    /* Come here from named backref handling */          HANDLE_REFERENCE:    /* Come here from named backref handling */
# Line 5498  we set the flag only if there is a liter Line 5610  we set the flag only if there is a liter
5610          PUT2INC(code, 0, recno);          PUT2INC(code, 0, recno);
5611          cd->backref_map |= (recno < 32)? (1 << recno) : 1;          cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5612          if (recno > cd->top_backref) cd->top_backref = recno;          if (recno > cd->top_backref) cd->top_backref = recno;
5613    
5614            /* Check to see if this back reference is recursive, that is, it
5615            is inside the group that it references. A flag is set so that the
5616            group can be made atomic. */
5617    
5618            for (oc = cd->open_caps; oc != NULL; oc = oc->next)
5619              {
5620              if (oc->number == recno)
5621                {
5622                oc->flag = TRUE;
5623                break;
5624                }
5625              }
5626          }          }
5627    
5628        /* So are Unicode property matches, if supported. */        /* So are Unicode property matches, if supported. */
# Line 5687  int branchfirstbyte, branchreqbyte; Line 5812  int branchfirstbyte, branchreqbyte;
5812  int length;  int length;
5813  int orig_bracount;  int orig_bracount;
5814  int max_bracount;  int max_bracount;
5815    int old_external_options = cd->external_options;
5816  branch_chain bc;  branch_chain bc;
5817    
5818  bc.outer = bcptr;  bc.outer = bcptr;
5819  bc.current = code;  bc.current_branch = code;
5820    
5821  firstbyte = reqbyte = REQ_UNSET;  firstbyte = reqbyte = REQ_UNSET;
5822    
# Line 5709  them global. It tests the value of lengt Line 5835  them global. It tests the value of lengt
5835  pre-compile phase to find out whether anything has yet been compiled or not. */  pre-compile phase to find out whether anything has yet been compiled or not. */
5836    
5837  /* If this is a capturing subpattern, add to the chain of open capturing items  /* If this is a capturing subpattern, add to the chain of open capturing items
5838  so that we can detect them if (*ACCEPT) is encountered. */  so that we can detect them if (*ACCEPT) is encountered. This is also used to
5839    detect groups that contain recursive back references to themselves. */
5840    
5841  if (*code == OP_CBRA)  if (*code == OP_CBRA)
5842    {    {
5843    capnumber = GET2(code, 1 + LINK_SIZE);    capnumber = GET2(code, 1 + LINK_SIZE);
5844    capitem.number = capnumber;    capitem.number = capnumber;
5845    capitem.next = cd->open_caps;    capitem.next = cd->open_caps;
5846    cd->open_caps = &capitem;    capitem.flag = FALSE;
5847    }    cd->open_caps = &capitem;
5848      }
5849    
5850  /* Offset is set zero to mark that this bracket is still open */  /* Offset is set zero to mark that this bracket is still open */
5851    
# Line 5763  for (;;) Line 5891  for (;;)
5891      return FALSE;      return FALSE;
5892      }      }
5893    
5894      /* If the external options have changed during this branch, it means that we
5895      are at the top level, and a leading option setting has been encountered. We
5896      need to re-set the original option values to take account of this so that,
5897      during the pre-compile phase, we know to allow for a re-set at the start of
5898      subsequent branches. */
5899    
5900      if (old_external_options != cd->external_options)
5901        oldims = cd->external_options & PCRE_IMS;
5902    
5903    /* Keep the highest bracket count in case (?| was used and some branch    /* Keep the highest bracket count in case (?| was used and some branch
5904    has fewer than the rest. */    has fewer than the rest. */
5905    
# Line 5813  for (;;) Line 5950  for (;;)
5950    
5951      /* If lookbehind, check that this branch matches a fixed-length string, and      /* If lookbehind, check that this branch matches a fixed-length string, and
5952      put the length into the OP_REVERSE item. Temporarily mark the end of the      put the length into the OP_REVERSE item. Temporarily mark the end of the
5953      branch with OP_END. */      branch with OP_END. If the branch contains OP_RECURSE, the result is -3
5954        because there may be forward references that we can't check here. Set a
5955        flag to cause another lookbehind check at the end. Why not do it all at the
5956        end? Because common, erroneous checks are picked up here and the offset of
5957        the problem can be shown. */
5958    
5959      if (lookbehind)      if (lookbehind)
5960        {        {
5961        int fixed_length;        int fixed_length;
5962        *code = OP_END;        *code = OP_END;
5963        fixed_length = find_fixedlength(last_branch, options);        fixed_length = find_fixedlength(last_branch, options, FALSE, cd);
5964        DPRINTF(("fixed length = %d\n", fixed_length));        DPRINTF(("fixed length = %d\n", fixed_length));
5965        if (fixed_length < 0)        if (fixed_length == -3)
5966            {
5967            cd->check_lookbehind = TRUE;
5968            }
5969          else if (fixed_length < 0)
5970          {          {
5971          *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;          *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5972          *ptrptr = ptr;          *ptrptr = ptr;
5973          return FALSE;          return FALSE;
5974          }          }
5975        PUT(reverse_count, 0, fixed_length);        else { PUT(reverse_count, 0, fixed_length); }
5976        }        }
5977      }      }
5978    
# Line 5854  for (;;) Line 5999  for (;;)
5999          }          }
6000        while (branch_length > 0);        while (branch_length > 0);
6001        }        }
   
     /* If it was a capturing subpattern, remove it from the chain. */  
   
     if (capnumber > 0) cd->open_caps = cd->open_caps->next;  
6002    
6003      /* Fill in the ket */      /* Fill in the ket */
6004    
# Line 5865  for (;;) Line 6006  for (;;)
6006      PUT(code, 1, code - start_bracket);      PUT(code, 1, code - start_bracket);
6007      code += 1 + LINK_SIZE;      code += 1 + LINK_SIZE;
6008    
6009      /* Resetting option if needed */      /* If it was a capturing subpattern, check to see if it contained any
6010        recursive back references. If so, we must wrap it in atomic brackets.
6011        In any event, remove the block from the chain. */
6012    
6013        if (capnumber > 0)
6014          {
6015          if (cd->open_caps->flag)
6016            {
6017            memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
6018              code - start_bracket);
6019            *start_bracket = OP_ONCE;
6020            code += 1 + LINK_SIZE;
6021            PUT(start_bracket, 1, code - start_bracket);
6022            *code = OP_KET;
6023            PUT(code, 1, code - start_bracket);
6024            code += 1 + LINK_SIZE;
6025            length += 2 + 2*LINK_SIZE;
6026            }
6027          cd->open_caps = cd->open_caps->next;
6028          }
6029    
6030        /* Reset options if needed. */
6031    
6032      if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS)      if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS)
6033        {        {
# Line 5914  for (;;) Line 6076  for (;;)
6076      {      {
6077      *code = OP_ALT;      *code = OP_ALT;
6078      PUT(code, 1, code - last_branch);      PUT(code, 1, code - last_branch);
6079      bc.current = last_branch = code;      bc.current_branch = last_branch = code;
6080      code += 1 + LINK_SIZE;      code += 1 + LINK_SIZE;
6081      }      }
6082    
# Line 6061  do { Line 6223  do {
6223       switch (*scode)       switch (*scode)
6224         {         {
6225         case OP_CREF:         case OP_CREF:
6226           case OP_NCREF:
6227         case OP_RREF:         case OP_RREF:
6228           case OP_NRREF:
6229         case OP_DEF:         case OP_DEF:
6230         return FALSE;         return FALSE;
6231    
# Line 6230  int length = 1;  /* For final END opcode Line 6394  int length = 1;  /* For final END opcode
6394  int firstbyte, reqbyte, newline;  int firstbyte, reqbyte, newline;
6395  int errorcode = 0;  int errorcode = 0;
6396  int skipatstart = 0;  int skipatstart = 0;
6397  #ifdef SUPPORT_UTF8  BOOL utf8 = (options & PCRE_UTF8) != 0;
 BOOL utf8;  
 #endif  
6398  size_t size;  size_t size;
6399  uschar *code;  uschar *code;
6400  const uschar *codestart;  const uschar *codestart;
# Line 6329  while (ptr[skipatstart] == CHAR_LEFT_PAR Line 6491  while (ptr[skipatstart] == CHAR_LEFT_PAR
6491  /* Can't support UTF8 unless PCRE has been compiled to include the code. */  /* Can't support UTF8 unless PCRE has been compiled to include the code. */
6492    
6493  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
 utf8 = (options & PCRE_UTF8) != 0;  
6494  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
6495       (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)       (*erroroffset = _pcre_valid_utf8((USPTR)pattern, -1)) >= 0)
6496    {    {
6497    errorcode = ERR44;    errorcode = ERR44;
6498    goto PCRE_EARLY_ERROR_RETURN2;    goto PCRE_EARLY_ERROR_RETURN2;
6499    }    }
6500  #else  #else
6501  if ((options & PCRE_UTF8) != 0)  if (utf8)
6502    {    {
6503    errorcode = ERR32;    errorcode = ERR32;
6504    goto PCRE_EARLY_ERROR_RETURN;    goto PCRE_EARLY_ERROR_RETURN;
# Line 6501  cd->start_code = codestart; Line 6662  cd->start_code = codestart;
6662  cd->hwm = cworkspace;  cd->hwm = cworkspace;
6663  cd->req_varyopt = 0;  cd->req_varyopt = 0;
6664  cd->had_accept = FALSE;  cd->had_accept = FALSE;
6665    cd->check_lookbehind = FALSE;
6666  cd->open_caps = NULL;  cd->open_caps = NULL;
6667    
6668  /* Set up a starting, non-extracting bracket, then compile the expression. On  /* Set up a starting, non-extracting bracket, then compile the expression. On
# Line 6527  if debugging, leave the test till after Line 6689  if debugging, leave the test till after
6689    
6690  *code++ = OP_END;  *code++ = OP_END;
6691    
6692  #ifndef DEBUG  #ifndef PCRE_DEBUG
6693  if (code - codestart > length) errorcode = ERR23;  if (code - codestart > length) errorcode = ERR23;
6694  #endif  #endif
6695    
# Line 6540  while (errorcode == 0 && cd->hwm > cwork Line 6702  while (errorcode == 0 && cd->hwm > cwork
6702    cd->hwm -= LINK_SIZE;    cd->hwm -= LINK_SIZE;
6703    offset = GET(cd->hwm, 0);    offset = GET(cd->hwm, 0);
6704    recno = GET(codestart, offset);    recno = GET(codestart, offset);
6705    groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);    groupptr = _pcre_find_bracket(codestart, utf8, recno);
6706    if (groupptr == NULL) errorcode = ERR53;    if (groupptr == NULL) errorcode = ERR53;
6707      else PUT(((uschar *)codestart), offset, groupptr - codestart);      else PUT(((uschar *)codestart), offset, groupptr - codestart);
6708    }    }
# Line 6550  subpattern. */ Line 6712  subpattern. */
6712    
6713  if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;  if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6714    
6715    /* If there were any lookbehind assertions that contained OP_RECURSE
6716    (recursions or subroutine calls), a flag is set for them to be checked here,
6717    because they may contain forward references. Actual recursions can't be fixed
6718    length, but subroutine calls can. It is done like this so that those without
6719    OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
6720    exceptional ones forgo this. We scan the pattern to check that they are fixed
6721    length, and set their lengths. */
6722    
6723    if (cd->check_lookbehind)
6724      {
6725      uschar *cc = (uschar *)codestart;
6726    
6727      /* Loop, searching for OP_REVERSE items, and process those that do not have
6728      their length set. (Actually, it will also re-process any that have a length
6729      of zero, but that is a pathological case, and it does no harm.) When we find
6730      one, we temporarily terminate the branch it is in while we scan it. */
6731    
6732      for (cc = (uschar *)_pcre_find_bracket(codestart, utf8, -1);
6733           cc != NULL;
6734           cc = (uschar *)_pcre_find_bracket(cc, utf8, -1))
6735        {
6736        if (GET(cc, 1) == 0)
6737          {
6738          int fixed_length;
6739          uschar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
6740          int end_op = *be;
6741          *be = OP_END;
6742          fixed_length = find_fixedlength(cc, re->options, TRUE, cd);
6743          *be = end_op;
6744          DPRINTF(("fixed length = %d\n", fixed_length));
6745          if (fixed_length < 0)
6746            {
6747            errorcode = (fixed_length == -2)? ERR36 : ERR25;
6748            break;
6749            }
6750          PUT(cc, 1, fixed_length);
6751          }
6752        cc += 1 + LINK_SIZE;
6753        }
6754      }
6755    
6756  /* Failed to compile, or error while post-processing */  /* Failed to compile, or error while post-processing */
6757    
6758  if (errorcode != 0)  if (errorcode != 0)
# Line 6610  if (reqbyte >= 0 && Line 6813  if (reqbyte >= 0 &&
6813  /* Print out the compiled data if debugging is enabled. This is never the  /* Print out the compiled data if debugging is enabled. This is never the
6814  case when building a production library. */  case when building a production library. */
6815    
6816  #ifdef DEBUG  #ifdef PCRE_DEBUG
   
6817  printf("Length = %d top_bracket = %d top_backref = %d\n",  printf("Length = %d top_bracket = %d top_backref = %d\n",
6818    length, re->top_bracket, re->top_backref);    length, re->top_bracket, re->top_backref);
6819    
# Line 6648  if (code - codestart > length) Line 6850  if (code - codestart > length)
6850    if (errorcodeptr != NULL) *errorcodeptr = ERR23;    if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6851    return NULL;    return NULL;
6852    }    }
6853  #endif   /* DEBUG */  #endif   /* PCRE_DEBUG */
6854    
6855  return (pcre *)re;  return (pcre *)re;
6856  }  }

Legend:
Removed from v.447  
changed lines
  Added in v.496

  ViewVC Help
Powered by ViewVC 1.1.5