/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 1363 by ph10, Tue Oct 1 16:54:40 2013 UTC | revision 1369 by ph10, Tue Oct 8 15:06:46 2013 UTC
# Line 879  return (*p == CHAR_RIGHT_CURLY_BRACKET); Line 879  return (*p == CHAR_RIGHT_CURLY_BRACKET);
879  *************************************************/  *************************************************/
880    
881  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
882  positive value for a simple escape such as \n, or 0 for a data character  positive value for a simple escape such as \n, or 0 for a data character which
883  which will be placed in chptr. A backreference to group n is returned as  will be placed in chptr. A backreference to group n is returned as negative n.
884  negative n. When UTF-8 is enabled, a positive value greater than 255 may  When UTF-8 is enabled, a positive value greater than 255 may be returned in
885  be returned in chptr.  chptr. On entry, ptr is pointing at the \. On exit, it is on the final
886  On entry,ptr is pointing at the \. On exit, it is on the final character of the  character of the escape sequence.
 escape sequence.  
887    
888  Arguments:  Arguments:
889    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
890    chptr          points to the data character    chptr          points to a returned data character
891    errorcodeptr   points to the errorcode variable    errorcodeptr   points to the errorcode variable
892    bracount       number of previous extracting brackets    bracount       number of previous extracting brackets
893    options        the options bits    options        the options bits
# Line 1092  else Line 1091  else
1091      break;      break;
1092    
1093      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
1094      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. Perl has changed
1095      the way Perl works seems to be as follows:      over the years. Nowadays \g{} for backreferences and \o{} for octal are
1096        recommended to avoid the ambiguities in the old syntax.
1097    
1098      Outside a character class, the digits are read as a decimal number. If the      Outside a character class, the digits are read as a decimal number. If the
1099      number is less than 10, or if there are that many previous extracting      number is less than 8 (used to be 10), or if there are that many previous
1100      left brackets, then it is a back reference. Otherwise, up to three octal      extracting left brackets, then it is a back reference. Otherwise, up to
1101      digits are read to form an escaped byte. Thus \123 is likely to be octal      three octal digits are read to form an escaped byte. Thus \123 is likely to
1102      123 (cf \0123, which is octal 012 followed by the literal 3). If the octal      be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1103      value is greater than 377, the least significant 8 bits are taken. Inside a      the octal value is greater than 377, the least significant 8 bits are
1104      character class, \ followed by a digit is always an octal number. */      taken. \8 and \9 are treated as the literal characters 8 and 9.
1105    
1106        Inside a character class, \ followed by a digit is always either a literal
1107        8 or 9 or an octal number. */
1108    
1109      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1110      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
# Line 1128  else Line 1131  else
1131          *errorcodeptr = ERR61;          *errorcodeptr = ERR61;
1132          break;          break;
1133          }          }
1134        if (s < 10 || s <= bracount)        if (s < 8 || s <= bracount)  /* Check for back reference */
1135          {          {
1136          escape = -s;          escape = -s;
1137          break;          break;
# Line 1136  else Line 1139  else
1139        ptr = oldptr;      /* Put the pointer back and fall through */        ptr = oldptr;      /* Put the pointer back and fall through */
1140        }        }
1141    
1142      /* Handle an octal number following \. If the first digit is 8 or 9, Perl      /* Handle a digit following \ when the number is not a back reference. If
1143      generates a binary zero byte and treats the digit as a following literal.      the first digit is 8 or 9, Perl used to generate a binary zero byte and
1144      Thus we have to pull back the pointer by one. */      then treat the digit as a following literal. At least by Perl 5.18 this
1145        changed so as not to insert the binary zero. */
1146      if ((c = *ptr) >= CHAR_8)  
1147        {      if ((c = *ptr) >= CHAR_8) break;
1148        ptr--;  
1149        c = 0;      /* Fall through with a digit less than 8 */
       break;  
       }  
1150    
1151      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
1152      larger first octal digit. The original code used just to take the least      larger first octal digit. The original code used just to take the least
# Line 1524  for (;;) Line 1525  for (;;)
1525    
1526      case OP_CALLOUT:      case OP_CALLOUT:
1527      case OP_CREF:      case OP_CREF:
1528      case OP_NCREF:      case OP_DNCREF:
1529      case OP_RREF:      case OP_RREF:
1530      case OP_NRREF:      case OP_DNRREF:
1531      case OP_DEF:      case OP_DEF:
1532      code += PRIV(OP_lengths)[*code];      code += PRIV(OP_lengths)[*code];
1533      break;      break;
# Line 1663  for (;;) Line 1664  for (;;)
1664      case OP_COMMIT:      case OP_COMMIT:
1665      case OP_CREF:      case OP_CREF:
1666      case OP_DEF:      case OP_DEF:
1667        case OP_DNCREF:
1668        case OP_DNRREF:
1669      case OP_DOLL:      case OP_DOLL:
1670      case OP_DOLLM:      case OP_DOLLM:
1671      case OP_EOD:      case OP_EOD:
1672      case OP_EODN:      case OP_EODN:
1673      case OP_FAIL:      case OP_FAIL:
     case OP_NCREF:  
     case OP_NRREF:  
1674      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
1675      case OP_PRUNE:      case OP_PRUNE:
1676      case OP_REVERSE:      case OP_REVERSE:
# Line 2650  switch(ptype) Line 2651  switch(ptype)
2651    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2652            PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;            PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2653    
2654      /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
2655      means that Perl space and POSIX space are now identical. PCRE was changed
2656      at release 8.34. */
2657    
2658    case PT_SPACE:    /* Perl space */    case PT_SPACE:    /* Perl space */
   return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||  
           c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)  
           == negated;  
   
2659    case PT_PXSPACE:  /* POSIX space */    case PT_PXSPACE:  /* POSIX space */
2660    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2661            c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||            c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
# Line 4627  for (;; ptr++) Line 4628  for (;; ptr++)
4628              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
4629              continue;              continue;
4630    
4631              /* Perl 5.004 onwards omits VT from \s, but we must preserve it              /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
4632              if it was previously set by something earlier in the character              5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
4633              class. Luckily, the value of CHAR_VT is 0x0b in both ASCII and              previously set by something earlier in the character class.
4634              EBCDIC, so we lazily just adjust the appropriate bit. */              Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
4635                we could just adjust the appropriate bit. From PCRE 8.34 we no
4636                longer treat \s and \S specially. */
4637    
4638              case ESC_s:              case ESC_s:
4639              classbits[0] |= cbits[cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
             classbits[1] |= cbits[cbit_space+1] & ~0x08;  
             for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];  
4640              continue;              continue;
4641    
4642              case ESC_S:              case ESC_S:
4643              should_flip_negation = TRUE;              should_flip_negation = TRUE;
4644              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */  
4645              continue;              continue;
4646    
4647              /* The rest apply in both UCP and non-UCP cases. */              /* The rest apply in both UCP and non-UCP cases. */
# Line 6031  for (;; ptr++) Line 6031  for (;; ptr++)
6031                 tempptr[2] == CHAR_LESS_THAN_SIGN))                 tempptr[2] == CHAR_LESS_THAN_SIGN))
6032            break;            break;
6033    
6034          /* Most other conditions use OP_CREF (a couple change to OP_RREF          /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6035          below), and all need to skip 1+IMM2_SIZE bytes at the start of the group. */          need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6036    
6037          code[1+LINK_SIZE] = OP_CREF;          code[1+LINK_SIZE] = OP_CREF;
6038          skipbytes = 1+IMM2_SIZE;          skipbytes = 1+IMM2_SIZE;
# Line 6048  for (;; ptr++) Line 6048  for (;; ptr++)
6048            }            }
6049    
6050          /* Check for a test for a named group's having been set, using the Perl          /* Check for a test for a named group's having been set, using the Perl
6051          syntax (?(<name>) or (?('name') */          syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6052            syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). As names may
6053            consist entirely of digits, there is scope for ambiguity. */
6054    
6055          else if (ptr[1] == CHAR_LESS_THAN_SIGN)          else if (ptr[1] == CHAR_LESS_THAN_SIGN)
6056            {            {
# Line 6065  for (;; ptr++) Line 6067  for (;; ptr++)
6067            terminator = CHAR_NULL;            terminator = CHAR_NULL;
6068            if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);            if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
6069            }            }
6070    
6071            /* When a name is one of a number of duplicates, a different opcode is
6072            used and it needs more memory. Unfortunately we cannot tell whether a
6073            name is a duplicate in the first pass, so we have to allow for more
6074            memory except when we know it is a relative numerical reference. */
6075    
6076            if (refsign < 0 && lengthptr != NULL) *lengthptr += IMM2_SIZE;
6077    
6078          /* We now expect to read a name; any thing else is an error */          /* We now expect to read a name (possibly all digits); any thing else
6079            is an error. In the case of all digits, also get it as a number. */
6080    
6081          if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)          if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
6082            {            {
# Line 6075  for (;; ptr++) Line 6085  for (;; ptr++)
6085            goto FAILED;            goto FAILED;
6086            }            }
6087    
         /* Read the name, but also get it as a number if it's all digits */  
   
6088          recno = 0;          recno = 0;
6089          name = ++ptr;          name = ++ptr;
6090          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
# Line 6087  for (;; ptr++) Line 6095  for (;; ptr++)
6095            }            }
6096          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
6097    
6098            /* Check the terminator */
6099    
6100          if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||          if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6101              *ptr++ != CHAR_RIGHT_PARENTHESIS)              *ptr++ != CHAR_RIGHT_PARENTHESIS)
6102            {            {
# Line 6122  for (;; ptr++) Line 6132  for (;; ptr++)
6132            }            }
6133    
6134          /* Otherwise (did not start with "+" or "-"), start by looking for the          /* Otherwise (did not start with "+" or "-"), start by looking for the
6135          name. If we find a name, add one to the opcode to change OP_CREF or          name. */
6136          OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,  
         except they record that the reference was originally to a name. The  
         information is used to check duplicate names. */  
   
6137          slot = cd->name_table;          slot = cd->name_table;
6138          for (i = 0; i < cd->names_found; i++)          for (i = 0; i < cd->names_found; i++)
6139            {            {
# Line 6134  for (;; ptr++) Line 6141  for (;; ptr++)
6141            slot += cd->name_entry_size;            slot += cd->name_entry_size;
6142            }            }
6143    
6144          /* Found the named subpattern */          /* Found the named subpattern. If the name is duplicated, add one to
6145            the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6146            appropriate data values. Otherwise, just insert the unique subpattern
6147            number. */
6148    
6149          if (i < cd->names_found)          if (i < cd->names_found)
6150            {            {
6151            recno = GET2(slot, 0);            int offset = i++;
6152            PUT2(code, 2+LINK_SIZE, recno);            int count = 1;
6153            code[1+LINK_SIZE]++;            recno = GET2(slot, 0);   /* Number from first found */
6154              for (; i < cd->names_found; i++)
6155                {
6156                slot += cd->name_entry_size;
6157                if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0) break;
6158                count++;
6159                }
6160              if (count > 1)
6161                {
6162                PUT2(code, 2+LINK_SIZE, offset);
6163                PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6164                skipbytes += IMM2_SIZE;
6165                code[1+LINK_SIZE]++;
6166                }
6167              else  /* Not a duplicated name */
6168                {
6169                PUT2(code, 2+LINK_SIZE, recno);
6170                }
6171            }            }
6172    
6173          /* If terminator == CHAR_NULL it means that the name followed directly          /* If terminator == CHAR_NULL it means that the name followed directly
# Line 7830  do { Line 7857  do {
7857       switch (*scode)       switch (*scode)
7858         {         {
7859         case OP_CREF:         case OP_CREF:
7860         case OP_NCREF:         case OP_DNCREF:
7861         case OP_RREF:         case OP_RREF:
7862         case OP_NRREF:         case OP_DNRREF:
7863         case OP_DEF:         case OP_DEF:
7864         return FALSE;         return FALSE;
7865    

Legend:
Removed from v.1363  
changed lines
  Added in v.1369

  ViewVC Help
Powered by ViewVC 1.1.5