/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 602 by ph10, Wed May 25 08:29:03 2011 UTC revision 604 by ph10, Thu Jun 2 19:04:54 2011 UTC
# Line 545  static const unsigned char ebcdic_charta Line 545  static const unsigned char ebcdic_charta
545  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
546    
547  static BOOL  static BOOL
548    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,    compile_regex(int, uschar **, const uschar **, int *, BOOL, BOOL, int, int *,
549      int *, int *, branch_chain *, compile_data *, int *);      int *, branch_chain *, compile_data *, int *);
550    
551    
552    
# Line 1403  does not. Line 1403  does not.
1403    
1404  Arguments:  Arguments:
1405    code         pointer to the start of the group    code         pointer to the start of the group
   options      pointer to external options  
   optbit       the option bit whose changing is significant, or  
                  zero if none are  
1406    skipassert   TRUE if certain assertions are to be skipped    skipassert   TRUE if certain assertions are to be skipped
1407    
1408  Returns:       pointer to the first significant opcode  Returns:       pointer to the first significant opcode
1409  */  */
1410    
1411  static const uschar*  static const uschar*
1412  first_significant_code(const uschar *code, int *options, int optbit,  first_significant_code(const uschar *code, BOOL skipassert)
   BOOL skipassert)  
1413  {  {
1414  for (;;)  for (;;)
1415    {    {
# Line 1468  and doing the check at the end; a flag s Line 1464  and doing the check at the end; a flag s
1464    
1465  Arguments:  Arguments:
1466    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
1467    options  the compiling options    utf8     TRUE in UTF-8 mode
1468    atend    TRUE if called when the pattern is complete    atend    TRUE if called when the pattern is complete
1469    cd       the "compile data" structure    cd       the "compile data" structure
1470    
# Line 1479  Returns:   the fixed length, Line 1475  Returns:   the fixed length,
1475  */  */
1476    
1477  static int  static int
1478  find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)  find_fixedlength(uschar *code, BOOL utf8, BOOL atend, compile_data *cd)
1479  {  {
1480  int length = -1;  int length = -1;
1481    
# Line 1496  for (;;) Line 1492  for (;;)
1492    register int op = *cc;    register int op = *cc;
1493    switch (op)    switch (op)
1494      {      {
1495        /* We only need to continue for OP_CBRA (normal capturing bracket) and
1496        OP_BRA (normal non-capturing bracket) because the other variants of these
1497        opcodes are all concerned with unlimited repeated groups, which of course
1498        are not of fixed length. They will cause a -1 response from the default
1499        case of this switch. */
1500    
1501      case OP_CBRA:      case OP_CBRA:
1502      case OP_BRA:      case OP_BRA:
1503      case OP_ONCE:      case OP_ONCE:
1504      case OP_COND:      case OP_COND:
1505      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), utf8, atend, cd);
1506      if (d < 0) return d;      if (d < 0) return d;
1507      branchlength += d;      branchlength += d;
1508      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 1509  for (;;) Line 1511  for (;;)
1511    
1512      /* Reached end of a branch; if it's a ket it is the end of a nested      /* Reached end of a branch; if it's a ket it is the end of a nested
1513      call. If it's ALT it is an alternation in a nested call. If it is      call. If it's ALT it is an alternation in a nested call. If it is
1514      END it's the end of the outer call. All can be handled by the same code. */      END it's the end of the outer call. All can be handled by the same code.
1515        Note that we must not include the OP_KETRxxx opcodes here, because they
1516        all imply an unlimited repeat. */
1517    
1518      case OP_ALT:      case OP_ALT:
1519      case OP_KET:      case OP_KET:
     case OP_KETRMAX:  
     case OP_KETRMIN:  
1520      case OP_END:      case OP_END:
1521      if (length < 0) length = branchlength;      if (length < 0) length = branchlength;
1522        else if (length != branchlength) return -1;        else if (length != branchlength) return -1;
# Line 1532  for (;;) Line 1534  for (;;)
1534      cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */      cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1535      do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */      do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */
1536      if (cc > cs && cc < ce) return -1;                /* Recursion */      if (cc > cs && cc < ce) return -1;                /* Recursion */
1537      d = find_fixedlength(cs + 2, options, atend, cd);      d = find_fixedlength(cs + 2, utf8, atend, cd);
1538      if (d < 0) return d;      if (d < 0) return d;
1539      branchlength += d;      branchlength += d;
1540      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
# Line 1575  for (;;) Line 1577  for (;;)
1577      case OP_CHAR:      case OP_CHAR:
1578      case OP_CHARI:      case OP_CHARI:
1579      case OP_NOT:      case OP_NOT:
1580      case OP_NOTI:      case OP_NOTI:
1581      branchlength++;      branchlength++;
1582      cc += 2;      cc += 2;
1583  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1584      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
       cc += _pcre_utf8_table4[cc[-1] & 0x3f];  
1585  #endif  #endif
1586      break;      break;
1587    
# Line 1591  for (;;) Line 1592  for (;;)
1592      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1593      cc += 4;      cc += 4;
1594  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1595      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
       cc += _pcre_utf8_table4[cc[-1] & 0x3f];  
1596  #endif  #endif
1597      break;      break;
1598    
# Line 1712  for (;;) Line 1712  for (;;)
1712    
1713    /* Handle capturing bracket */    /* Handle capturing bracket */
1714    
1715    else if (c == OP_CBRA)    else if (c == OP_CBRA || c == OP_SCBRA ||
1716               c == OP_CBRAPOS || c == OP_SCBRAPOS)
1717      {      {
1718      int n = GET2(code, 1+LINK_SIZE);      int n = GET2(code, 1+LINK_SIZE);
1719      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
# Line 1954  could_be_empty_branch(const uschar *code Line 1955  could_be_empty_branch(const uschar *code
1955    compile_data *cd)    compile_data *cd)
1956  {  {
1957  register int c;  register int c;
1958  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE);
1959       code < endcode;       code < endcode;
1960       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], TRUE))
1961    {    {
1962    const uschar *ccode;    const uschar *ccode;
1963    
# Line 1972  for (code = first_significant_code(code Line 1973  for (code = first_significant_code(code
1973      continue;      continue;
1974      }      }
1975    
   /* Groups with zero repeats can of course be empty; skip them. */  
   
   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)  
     {  
     code += _pcre_OP_lengths[c];  
     do code += GET(code, 1); while (*code == OP_ALT);  
     c = *code;  
     continue;  
     }  
   
1976    /* For a recursion/subroutine call, if its end has been reached, which    /* For a recursion/subroutine call, if its end has been reached, which
1977    implies a subroutine call, we can scan it. */    implies a subroutine call, we can scan it. */
1978    
# Line 2004  for (code = first_significant_code(code Line 1995  for (code = first_significant_code(code
1995      continue;      continue;
1996      }      }
1997    
1998      /* Groups with zero repeats can of course be empty; skip them. */
1999    
2000      if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2001          c == OP_BRAPOSZERO)
2002        {
2003        code += _pcre_OP_lengths[c];
2004        do code += GET(code, 1); while (*code == OP_ALT);
2005        c = *code;
2006        continue;
2007        }
2008    
2009      /* A nested group that is already marked as "could be empty" can just be
2010      skipped. */
2011    
2012      if (c == OP_SBRA  || c == OP_SBRAPOS ||
2013          c == OP_SCBRA || c == OP_SCBRAPOS)
2014        {
2015        do code += GET(code, 1); while (*code == OP_ALT);
2016        c = *code;
2017        continue;
2018        }
2019    
2020    /* For other groups, scan the branches. */    /* For other groups, scan the branches. */
2021    
2022    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)    if (c == OP_BRA  || c == OP_BRAPOS ||
2023          c == OP_CBRA || c == OP_CBRAPOS ||
2024          c == OP_ONCE || c == OP_COND)
2025      {      {
2026      BOOL empty_branch;      BOOL empty_branch;
2027      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 2135  for (code = first_significant_code(code Line 2150  for (code = first_significant_code(code
2150      case OP_KET:      case OP_KET:
2151      case OP_KETRMAX:      case OP_KETRMAX:
2152      case OP_KETRMIN:      case OP_KETRMIN:
2153        case OP_KETRPOS:
2154      case OP_ALT:      case OP_ALT:
2155      return TRUE;      return TRUE;
2156    
# Line 2682  if (next >= 0) switch(op_code) Line 2698  if (next >= 0) switch(op_code)
2698    return (c != cd->fcc[next]);  /* Non-UTF-8 mode */    return (c != cd->fcc[next]);  /* Non-UTF-8 mode */
2699    
2700    /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These    /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
2701    opcodes are not used for multi-byte characters, because they are coded using    opcodes are not used for multi-byte characters, because they are coded using
2702    an XCLASS instead. */    an XCLASS instead. */
2703    
2704    case OP_NOT:    case OP_NOT:
2705    return (c = *previous) == next;    return (c = *previous) == next;
2706    
2707    case OP_NOTI:    case OP_NOTI:
2708    if ((c = *previous) == next) return TRUE;    if ((c = *previous) == next) return TRUE;
2709  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2710    if (utf8)    if (utf8)
# Line 4201  for (;; ptr++) Line 4217  for (;; ptr++)
4217      if (*previous == OP_CHAR || *previous == OP_CHARI)      if (*previous == OP_CHAR || *previous == OP_CHARI)
4218        {        {
4219        op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;        op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;
4220    
4221        /* Deal with UTF-8 characters that take up more than one byte. It's        /* Deal with UTF-8 characters that take up more than one byte. It's
4222        easier to write this out separately than try to macrify it. Use c to        easier to write this out separately than try to macrify it. Use c to
4223        hold the length of the character in bytes, plus 0x80 to flag that it's a        hold the length of the character in bytes, plus 0x80 to flag that it's a
# Line 4246  for (;; ptr++) Line 4262  for (;; ptr++)
4262      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
4263      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
4264      character repeats by setting op_type to add a suitable offset into      character repeats by setting op_type to add a suitable offset into
4265      repeat_type. We can also test for auto-possessification. OP_NOT and OP_NOTI      repeat_type. We can also test for auto-possessification. OP_NOT and OP_NOTI
4266      are currently used only for single-byte chars. */      are currently used only for single-byte chars. */
4267    
4268      else if (*previous == OP_NOT || *previous == OP_NOTI)      else if (*previous == OP_NOT || *previous == OP_NOTI)
# Line 4483  for (;; ptr++) Line 4499  for (;; ptr++)
4499        }        }
4500    
4501      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
4502      cases. */      cases. Note that at this point we can encounter only the "basic" BRA and
4503        KET opcodes, as this is the place where they get converted into the more
4504        special varieties. */
4505    
4506      else if (*previous == OP_BRA  || *previous == OP_CBRA ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
4507               *previous == OP_ONCE || *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
4508        {        {
4509        register int i;        register int i;
       int ketoffset = 0;  
4510        int len = (int)(code - previous);        int len = (int)(code - previous);
4511        uschar *bralink = NULL;        uschar *bralink = NULL;
4512          uschar *brazeroptr = NULL;
4513    
4514        /* Repeating a DEFINE group is pointless */        /* Repeating a DEFINE group is pointless */
4515    
# Line 4501  for (;; ptr++) Line 4519  for (;; ptr++)
4519          goto FAILED;          goto FAILED;
4520          }          }
4521    
       /* If the maximum repeat count is unlimited, find the end of the bracket  
       by scanning through from the start, and compute the offset back to it  
       from the current code pointer. */  
   
       if (repeat_max == -1)  
         {  
         register uschar *ket = previous;  
         do ket += GET(ket, 1); while (*ket != OP_KET);  
         ketoffset = (int)(code - ket);  
         }  
   
4522        /* The case of a zero minimum is special because of the need to stick        /* The case of a zero minimum is special because of the need to stick
4523        OP_BRAZERO in front of it, and because the group appears once in the        OP_BRAZERO in front of it, and because the group appears once in the
4524        data, whereas in other cases it appears the minimum number of times. For        data, whereas in other cases it appears the minimum number of times. For
# Line 4553  for (;; ptr++) Line 4560  for (;; ptr++)
4560              *previous++ = OP_SKIPZERO;              *previous++ = OP_SKIPZERO;
4561              goto END_REPEAT;              goto END_REPEAT;
4562              }              }
4563              brazeroptr = previous;    /* Save for possessive optimizing */
4564            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
4565            }            }
4566    
# Line 4717  for (;; ptr++) Line 4725  for (;; ptr++)
4725            }            }
4726          }          }
4727    
4728        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. For
4729        can't just offset backwards from the current code point, because we        ONCE brackets, that's all we need to do.
4730        don't know if there's been an options resetting after the ket. The  
4731        correct offset was computed above.        Otherwise, if the quantifier was possessive, we convert the BRA code to
4732          the POS form, and the KET code to KETRPOS. (It turns out to be convenient
4733          at runtime to detect this kind of subpattern at both the start and at the
4734          end.) If the group is preceded by OP_BRAZERO, convert this to
4735          OP_BRAPOSZERO. Then cancel the possessive flag so that the default action
4736          below, of wrapping everything inside atomic brackets, does not happen.
4737    
4738        Then, when we are doing the actual compile phase, check to see whether        Then, when we are doing the actual compile phase, check to see whether
4739        this group is a non-atomic one that could match an empty string. If so,        this group is one that could match an empty string. If so, convert the
4740        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so        initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so that runtime
4741        that runtime checking can be done. [This check is also applied to        checking can be done. [This check is also applied to ONCE groups at
4742        atomic groups at runtime, but in a different way.] */        runtime, but in a different way.] */
4743    
4744        else        else
4745          {          {
4746          uschar *ketcode = code - ketoffset;          uschar *ketcode = code - 1 - LINK_SIZE;
4747          uschar *bracode = ketcode - GET(ketcode, 1);          uschar *bracode = ketcode - GET(ketcode, 1);
4748          *ketcode = OP_KETRMAX + repeat_type;  
4749          if (lengthptr == NULL && *bracode != OP_ONCE)          if (*bracode == OP_ONCE)
4750              *ketcode = OP_KETRMAX + repeat_type;
4751            else
4752            {            {
4753            uschar *scode = bracode;            if (possessive_quantifier)
           do  
4754              {              {
4755              if (could_be_empty_branch(scode, ketcode, utf8, cd))              *bracode += 1;                   /* Switch to xxxPOS opcodes */
4756                *ketcode = OP_KETRPOS;
4757                if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
4758                possessive_quantifier = FALSE;
4759                }
4760              else *ketcode = OP_KETRMAX + repeat_type;
4761    
4762              if (lengthptr == NULL)
4763                {
4764                uschar *scode = bracode;
4765                do
4766                {                {
4767                *bracode += OP_SBRA - OP_BRA;                if (could_be_empty_branch(scode, ketcode, utf8, cd))
4768                break;                  {
4769                    *bracode += OP_SBRA - OP_BRA;
4770                    break;
4771                    }
4772                  scode += GET(scode, 1);
4773                }                }
4774              scode += GET(scode, 1);              while (*scode == OP_ALT);
4775              }              }
           while (*scode == OP_ALT);  
4776            }            }
4777          }          }
4778        }        }
# Line 5714  for (;; ptr++) Line 5741  for (;; ptr++)
5741          is necessary to ensure we correctly detect the start of the pattern in          is necessary to ensure we correctly detect the start of the pattern in
5742          both phases.          both phases.
5743    
5744          If we are not at the pattern start, compile code to change the ims          If we are not at the pattern start, reset the greedy defaults and the
5745          options if this setting actually changes any of them, and reset the          case value for firstbyte and reqbyte. */
         greedy defaults and the case value for firstbyte and reqbyte. */  
5746    
5747          if (*ptr == CHAR_RIGHT_PARENTHESIS)          if (*ptr == CHAR_RIGHT_PARENTHESIS)
5748            {            {
# Line 5733  for (;; ptr++) Line 5759  for (;; ptr++)
5759              }              }
5760    
5761            /* Change options at this level, and pass them back for use            /* Change options at this level, and pass them back for use
5762            in subsequent branches. When not at the start of the pattern, this            in subsequent branches. */
           information is also necessary so that a resetting item can be  
           compiled at the end of a group (if we are in a group). */  
5763    
5764            *optionsptr = options = newoptions;            *optionsptr = options = newoptions;
5765            previous = NULL;       /* This item can't be repeated */            previous = NULL;       /* This item can't be repeated */
# Line 5773  for (;; ptr++) Line 5797  for (;; ptr++)
5797    
5798      /* Process nested bracketed regex. Assertions may not be repeated, but      /* Process nested bracketed regex. Assertions may not be repeated, but
5799      other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a      other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
5800      non-register variable in order to be able to pass its address because some      non-register variable (tempcode) in order to be able to pass its address
5801      compilers complain otherwise. Pass in a new setting for the ims options if      because some compilers complain otherwise. */
     they have changed. */  
5802    
5803      previous = (bravalue >= OP_ONCE)? code : NULL;      previous = (bravalue >= OP_ONCE)? code : NULL;
5804      *code = bravalue;      *code = bravalue;
# Line 5785  for (;; ptr++) Line 5808  for (;; ptr++)
5808    
5809      if (!compile_regex(      if (!compile_regex(
5810           newoptions,                   /* The complete new option state */           newoptions,                   /* The complete new option state */
          options & PCRE_IMS,           /* The previous ims option state */  
5811           &tempcode,                    /* Where to put code (updated) */           &tempcode,                    /* Where to put code (updated) */
5812           &ptr,                         /* Input pointer (updated) */           &ptr,                         /* Input pointer (updated) */
5813           errorcodeptr,                 /* Where to put an error message */           errorcodeptr,                 /* Where to put an error message */
# Line 6242  value of lengthptr distinguishes the two Line 6264  value of lengthptr distinguishes the two
6264    
6265  Arguments:  Arguments:
6266    options        option bits, including any changes for this subpattern    options        option bits, including any changes for this subpattern
   oldims         previous settings of ims option bits  
6267    codeptr        -> the address of the current code pointer    codeptr        -> the address of the current code pointer
6268    ptrptr         -> the address of the current pattern pointer    ptrptr         -> the address of the current pattern pointer
6269    errorcodeptr   -> pointer to error code variable    errorcodeptr   -> pointer to error code variable
# Line 6260  Returns:         TRUE on success Line 6281  Returns:         TRUE on success
6281  */  */
6282    
6283  static BOOL  static BOOL
6284  compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,  compile_regex(int options, uschar **codeptr, const uschar **ptrptr,
6285    int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,    int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
6286    int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,    int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
6287    int *lengthptr)    int *lengthptr)
# Line 6277  int branchfirstbyte, branchreqbyte; Line 6298  int branchfirstbyte, branchreqbyte;
6298  int length;  int length;
6299  int orig_bracount;  int orig_bracount;
6300  int max_bracount;  int max_bracount;
 int old_external_options = cd->external_options;  
6301  branch_chain bc;  branch_chain bc;
6302    
6303  bc.outer = bcptr;  bc.outer = bcptr;
# Line 6301  pre-compile phase to find out whether an Line 6321  pre-compile phase to find out whether an
6321    
6322  /* If this is a capturing subpattern, add to the chain of open capturing items  /* If this is a capturing subpattern, add to the chain of open capturing items
6323  so that we can detect them if (*ACCEPT) is encountered. This is also used to  so that we can detect them if (*ACCEPT) is encountered. This is also used to
6324  detect groups that contain recursive back references to themselves. */  detect groups that contain recursive back references to themselves. Note that
6325    only OP_CBRA need be tested here; changing this opcode to one of its variants,
6326    e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
6327    
6328  if (*code == OP_CBRA)  if (*code == OP_CBRA)
6329    {    {
# Line 6347  for (;;) Line 6369  for (;;)
6369      return FALSE;      return FALSE;
6370      }      }
6371    
   /* If the external options have changed during this branch, it means that we  
   are at the top level, and a leading option setting has been encountered. We  
   need to re-set the original option values to take account of this so that,  
   during the pre-compile phase, we know to allow for a re-set at the start of  
   subsequent branches. */  
   
   if (old_external_options != cd->external_options)  
     oldims = cd->external_options & PCRE_IMS;  
   
6372    /* Keep the highest bracket count in case (?| was used and some branch    /* Keep the highest bracket count in case (?| was used and some branch
6373    has fewer than the rest. */    has fewer than the rest. */
6374    
# Line 6416  for (;;) Line 6429  for (;;)
6429        {        {
6430        int fixed_length;        int fixed_length;
6431        *code = OP_END;        *code = OP_END;
6432        fixed_length = find_fixedlength(last_branch, options, FALSE, cd);        fixed_length = find_fixedlength(last_branch,  (options & PCRE_UTF8) != 0,
6433            FALSE, cd);
6434        DPRINTF(("fixed length = %d\n", fixed_length));        DPRINTF(("fixed length = %d\n", fixed_length));
6435        if (fixed_length == -3)        if (fixed_length == -3)
6436          {          {
# Line 6437  for (;;) Line 6451  for (;;)
6451    of offsets, with the field in the BRA item now becoming an offset to the    of offsets, with the field in the BRA item now becoming an offset to the
6452    first alternative. If there are no alternatives, it points to the end of the    first alternative. If there are no alternatives, it points to the end of the
6453    group. The length in the terminating ket is always the length of the whole    group. The length in the terminating ket is always the length of the whole
6454    bracketed item. If any of the ims options were changed inside the group,    bracketed item. Return leaving the pointer at the terminating char. */
   compile a resetting op-code following, except at the very end of the pattern.  
   Return leaving the pointer at the terminating char. */  
6455    
6456    if (*ptr != CHAR_VERTICAL_LINE)    if (*ptr != CHAR_VERTICAL_LINE)
6457      {      {
# Line 6564  of the more common cases more precisely. Line 6576  of the more common cases more precisely.
6576    
6577  Arguments:  Arguments:
6578    code           points to start of expression (the bracket)    code           points to start of expression (the bracket)
   options        points to the options setting  
6579    bracket_map    a bitmap of which brackets we are inside while testing; this    bracket_map    a bitmap of which brackets we are inside while testing; this
6580                    handles up to substring 31; after that we just have to take                    handles up to substring 31; after that we just have to take
6581                    the less precise approach                    the less precise approach
# Line 6574  Returns:     TRUE or FALSE Line 6585  Returns:     TRUE or FALSE
6585  */  */
6586    
6587  static BOOL  static BOOL
6588  is_anchored(register const uschar *code, int *options, unsigned int bracket_map,  is_anchored(register const uschar *code, unsigned int bracket_map,
6589    unsigned int backref_map)    unsigned int backref_map)
6590  {  {
6591  do {  do {
6592     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6593       options, PCRE_MULTILINE, FALSE);       FALSE);
6594     register int op = *scode;     register int op = *scode;
6595    
6596     /* Non-capturing brackets */     /* Non-capturing brackets */
6597    
6598     if (op == OP_BRA)     if (op == OP_BRA  || op == OP_BRAPOS ||
6599           op == OP_SBRA || op == OP_SBRAPOS)
6600       {       {
6601       if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;       if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;
6602       }       }
6603    
6604     /* Capturing brackets */     /* Capturing brackets */
6605    
6606     else if (op == OP_CBRA)     else if (op == OP_CBRA  || op == OP_CBRAPOS ||
6607                op == OP_SCBRA || op == OP_SCBRAPOS)
6608       {       {
6609       int n = GET2(scode, 1+LINK_SIZE);       int n = GET2(scode, 1+LINK_SIZE);
6610       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
6611       if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;       if (!is_anchored(scode, new_map, backref_map)) return FALSE;
6612       }       }
6613    
6614     /* Other brackets */     /* Other brackets */
6615    
6616     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
6617       {       {
6618       if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;       if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;
6619       }       }
6620    
6621     /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and     /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
# Line 6653  is_startline(const uschar *code, unsigne Line 6666  is_startline(const uschar *code, unsigne
6666  {  {
6667  do {  do {
6668     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6669       NULL, 0, FALSE);       FALSE);
6670     register int op = *scode;     register int op = *scode;
6671    
6672     /* If we are at the start of a conditional assertion group, *both* the     /* If we are at the start of a conditional assertion group, *both* the
# Line 6680  do { Line 6693  do {
6693         scode += 1 + LINK_SIZE;         scode += 1 + LINK_SIZE;
6694         break;         break;
6695         }         }
6696       scode = first_significant_code(scode, NULL, 0, FALSE);       scode = first_significant_code(scode, FALSE);
6697       op = *scode;       op = *scode;
6698       }       }
6699    
6700     /* Non-capturing brackets */     /* Non-capturing brackets */
6701    
6702     if (op == OP_BRA)     if (op == OP_BRA  || op == OP_BRAPOS ||
6703           op == OP_SBRA || op == OP_SBRAPOS)
6704       {       {
6705       if (!is_startline(scode, bracket_map, backref_map)) return FALSE;       if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6706       }       }
6707    
6708     /* Capturing brackets */     /* Capturing brackets */
6709    
6710     else if (op == OP_CBRA)     else if (op == OP_CBRA  || op == OP_CBRAPOS ||
6711                op == OP_SCBRA || op == OP_SCBRAPOS)
6712       {       {
6713       int n = GET2(scode, 1+LINK_SIZE);       int n = GET2(scode, 1+LINK_SIZE);
6714       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
# Line 6743  we return that char, otherwise -1. Line 6758  we return that char, otherwise -1.
6758    
6759  Arguments:  Arguments:
6760    code       points to start of expression (the bracket)    code       points to start of expression (the bracket)
   options    pointer to the options (used to check casing changes)  
6761    inassert   TRUE if in an assertion    inassert   TRUE if in an assertion
6762    
6763  Returns:     -1 or the fixed first char  Returns:     -1 or the fixed first char
6764  */  */
6765    
6766  static int  static int
6767  find_firstassertedchar(const uschar *code, int *options, BOOL inassert)  find_firstassertedchar(const uschar *code, BOOL inassert)
6768  {  {
6769  register int c = -1;  register int c = -1;
6770  do {  do {
6771     int d;     int d;
6772     const uschar *scode =     int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
6773       first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);               *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? 2:0;
6774       const uschar *scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
6775     register int op = *scode;     register int op = *scode;
6776    
6777     switch(op)     switch(op)
# Line 6765  do { Line 6780  do {
6780       return -1;       return -1;
6781    
6782       case OP_BRA:       case OP_BRA:
6783         case OP_BRAPOS:
6784       case OP_CBRA:       case OP_CBRA:
6785         case OP_SCBRA:
6786         case OP_CBRAPOS:
6787         case OP_SCBRAPOS:
6788       case OP_ASSERT:       case OP_ASSERT:
6789       case OP_ONCE:       case OP_ONCE:
6790       case OP_COND:       case OP_COND:
6791       if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)       if ((d = find_firstassertedchar(scode, op == OP_ASSERT)) < 0)
6792         return -1;         return -1;
6793       if (c < 0) c = d; else if (c != d) return -1;       if (c < 0) c = d; else if (c != d) return -1;
6794       break;       break;
6795    
6796       case OP_EXACT:       /* Fall through */       case OP_EXACT:
6797       scode += 2;       scode += 2;
6798         /* Fall through */
6799    
6800       case OP_CHAR:       case OP_CHAR:
      case OP_CHARI:  
6801       case OP_PLUS:       case OP_PLUS:
6802       case OP_MINPLUS:       case OP_MINPLUS:
6803       case OP_POSPLUS:       case OP_POSPLUS:
6804       if (!inassert) return -1;       if (!inassert) return -1;
6805       if (c < 0)       if (c < 0) c = scode[1];
6806         {         else if (c != scode[1]) return -1;
6807         c = scode[1];       break;
6808         if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;  
6809         }       case OP_EXACTI:
6810       else if (c != scode[1]) return -1;       scode += 2;
6811         /* Fall through */
6812    
6813         case OP_CHARI:
6814         case OP_PLUSI:
6815         case OP_MINPLUSI:
6816         case OP_POSPLUSI:
6817         if (!inassert) return -1;
6818         if (c < 0) c = scode[1] | REQ_CASELESS;
6819           else if (c != scode[1]) return -1;
6820       break;       break;
6821       }       }
6822    
# Line 6939  while (ptr[skipatstart] == CHAR_LEFT_PAR Line 6967  while (ptr[skipatstart] == CHAR_LEFT_PAR
6967    
6968  utf8 = (options & PCRE_UTF8) != 0;  utf8 = (options & PCRE_UTF8) != 0;
6969    
6970  /* Can't support UTF8 unless PCRE has been compiled to include the code. The  /* Can't support UTF8 unless PCRE has been compiled to include the code. The
6971  return of an error code from _pcre_valid_utf8() is a new feature, introduced in  return of an error code from _pcre_valid_utf8() is a new feature, introduced in
6972  release 8.13. The only use we make of it here is to adjust the offset value to  release 8.13. The only use we make of it here is to adjust the offset value to
6973  the end of the string for a short string error, for compatibility with previous  the end of the string for a short string error, for compatibility with previous
6974  versions. */  versions. */
6975    
6976  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 7063  outside can help speed up starting point Line 7091  outside can help speed up starting point
7091  ptr += skipatstart;  ptr += skipatstart;
7092  code = cworkspace;  code = cworkspace;
7093  *code = OP_BRA;  *code = OP_BRA;
7094  (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,  (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
7095    &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,    FALSE, 0, &firstbyte, &reqbyte, NULL, cd, &length);
   &length);  
7096  if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;  if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
7097    
7098  DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,  DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
# Line 7137  of the function here. */ Line 7164  of the function here. */
7164  ptr = (const uschar *)pattern + skipatstart;  ptr = (const uschar *)pattern + skipatstart;
7165  code = (uschar *)codestart;  code = (uschar *)codestart;
7166  *code = OP_BRA;  *code = OP_BRA;
7167  (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,  (void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0,
7168    &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);    &firstbyte, &reqbyte, NULL, cd, NULL);
7169  re->top_bracket = cd->bracount;  re->top_bracket = cd->bracount;
7170  re->top_backref = cd->top_backref;  re->top_backref = cd->top_backref;
7171  re->flags = cd->external_flags;  re->flags = cd->external_flags;
# Line 7204  if (cd->check_lookbehind) Line 7231  if (cd->check_lookbehind)
7231        uschar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);        uschar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
7232        int end_op = *be;        int end_op = *be;
7233        *be = OP_END;        *be = OP_END;
7234        fixed_length = find_fixedlength(cc, re->options, TRUE, cd);        fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE,
7235            cd);
7236        *be = end_op;        *be = end_op;
7237        DPRINTF(("fixed length = %d\n", fixed_length));        DPRINTF(("fixed length = %d\n", fixed_length));
7238        if (fixed_length < 0)        if (fixed_length < 0)
# Line 7243  start with ^. and also when all branches Line 7271  start with ^. and also when all branches
7271    
7272  if ((re->options & PCRE_ANCHORED) == 0)  if ((re->options & PCRE_ANCHORED) == 0)
7273    {    {
7274    int temp_options = re->options;   /* May get changed during these scans */    if (is_anchored(codestart, 0, cd->backref_map))
   if (is_anchored(codestart, &temp_options, 0, cd->backref_map))  
7275      re->options |= PCRE_ANCHORED;      re->options |= PCRE_ANCHORED;
7276    else    else
7277      {      {
7278      if (firstbyte < 0)      if (firstbyte < 0)
7279        firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);        firstbyte = find_firstassertedchar(codestart, FALSE);
7280      if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */      if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
7281        {        {
7282        int ch = firstbyte & 255;        int ch = firstbyte & 255;

Legend:
Removed from v.602  
changed lines
  Added in v.604

  ViewVC Help
Powered by ViewVC 1.1.5