/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 285 by ph10, Wed Dec 12 17:03:50 2007 UTC revision 327 by ph10, Sat Mar 8 19:38:30 2008 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2007 University of Cambridge             Copyright (c) 1997-2008 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 158  static const char verbnames[] = Line 158  static const char verbnames[] =
158    "SKIP\0"    "SKIP\0"
159    "THEN";    "THEN";
160    
161  static verbitem verbs[] = {  static const verbitem verbs[] = {
162    { 6, OP_ACCEPT },    { 6, OP_ACCEPT },
163    { 6, OP_COMMIT },    { 6, OP_COMMIT },
164    { 1, OP_FAIL },    { 1, OP_FAIL },
# Line 168  static verbitem verbs[] = { Line 168  static verbitem verbs[] = {
168    { 4, OP_THEN  }    { 4, OP_THEN  }
169  };  };
170    
171  static int verbcount = sizeof(verbs)/sizeof(verbitem);  static const int verbcount = sizeof(verbs)/sizeof(verbitem);
172    
173    
174  /* Tables of names of POSIX character classes and their lengths. The names are  /* Tables of names of POSIX character classes and their lengths. The names are
# Line 1531  for (code = first_significant_code(code Line 1531  for (code = first_significant_code(code
1531    const uschar *ccode;    const uschar *ccode;
1532    
1533    c = *code;    c = *code;
1534    
1535    /* Skip over forward assertions; the other assertions are skipped by    /* Skip over forward assertions; the other assertions are skipped by
1536    first_significant_code() with a TRUE final argument. */    first_significant_code() with a TRUE final argument. */
1537    
1538    if (c == OP_ASSERT)    if (c == OP_ASSERT)
1539      {      {
1540      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
1541      c = *code;      c = *code;
1542      continue;      continue;
1543      }      }
1544    
1545    /* Groups with zero repeats can of course be empty; skip them. */    /* Groups with zero repeats can of course be empty; skip them. */
1546    
# Line 1737  return TRUE; Line 1737  return TRUE;
1737  *************************************************/  *************************************************/
1738    
1739  /* This function is called when the sequence "[:" or "[." or "[=" is  /* This function is called when the sequence "[:" or "[." or "[=" is
1740  encountered in a character class. It checks whether this is followed by an  encountered in a character class. It checks whether this is followed by a
1741  optional ^ and then a sequence of letters, terminated by a matching ":]" or  sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1742  ".]" or "=]".  reach an unescaped ']' without the special preceding character, return FALSE.
1743    
1744    Originally, this function only recognized a sequence of letters between the
1745    terminators, but it seems that Perl recognizes any sequence of characters,
1746    though of course unknown POSIX names are subsequently rejected. Perl gives an
1747    "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1748    didn't consider this to be a POSIX class. Likewise for [:1234:].
1749    
1750    The problem in trying to be exactly like Perl is in the handling of escapes. We
1751    have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1752    class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1753    below handles the special case of \], but does not try to do any other escape
1754    processing. This makes it different from Perl for cases such as [:l\ower:]
1755    where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1756    "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
1757    I think.
1758    
1759  Argument:  Arguments:
1760    ptr      pointer to the initial [    ptr      pointer to the initial [
1761    endptr   where to return the end pointer    endptr   where to return the end pointer
   cd       pointer to compile data  
1762    
1763  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
1764  */  */
1765    
1766  static BOOL  static BOOL
1767  check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)  check_posix_syntax(const uschar *ptr, const uschar **endptr)
1768  {  {
1769  int terminator;          /* Don't combine these lines; the Solaris cc */  int terminator;          /* Don't combine these lines; the Solaris cc */
1770  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1771  if (*(++ptr) == '^') ptr++;  for (++ptr; *ptr != 0; ptr++)
 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;  
 if (*ptr == terminator && ptr[1] == ']')  
1772    {    {
1773    *endptr = ptr;    if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1774    return TRUE;      {
1775        if (*ptr == ']') return FALSE;
1776        if (*ptr == terminator && ptr[1] == ']')
1777          {
1778          *endptr = ptr;
1779          return TRUE;
1780          }
1781        }
1782    }    }
1783  return FALSE;  return FALSE;
1784  }  }
# Line 2094  if (next >= 0) switch(op_code) Line 2113  if (next >= 0) switch(op_code)
2113    /* For OP_NOT, "item" must be a single-byte character. */    /* For OP_NOT, "item" must be a single-byte character. */
2114    
2115    case OP_NOT:    case OP_NOT:
   if (next < 0) return FALSE;  /* Not a character */  
2116    if (item == next) return TRUE;    if (item == next) return TRUE;
2117    if ((options & PCRE_CASELESS) == 0) return FALSE;    if ((options & PCRE_CASELESS) == 0) return FALSE;
2118  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 2357  uschar classbits[32]; Line 2375  uschar classbits[32];
2375  BOOL class_utf8;  BOOL class_utf8;
2376  BOOL utf8 = (options & PCRE_UTF8) != 0;  BOOL utf8 = (options & PCRE_UTF8) != 0;
2377  uschar *class_utf8data;  uschar *class_utf8data;
2378    uschar *class_utf8data_base;
2379  uschar utf8_char[6];  uschar utf8_char[6];
2380  #else  #else
2381  BOOL utf8 = FALSE;  BOOL utf8 = FALSE;
# Line 2396  req_caseopt = ((options & PCRE_CASELESS) Line 2415  req_caseopt = ((options & PCRE_CASELESS)
2415  for (;; ptr++)  for (;; ptr++)
2416    {    {
2417    BOOL negate_class;    BOOL negate_class;
2418    BOOL should_flip_negation;    BOOL should_flip_negation;
2419    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2420    BOOL is_quantifier;    BOOL is_quantifier;
2421    BOOL is_recurse;    BOOL is_recurse;
# Line 2620  for (;; ptr++) Line 2639  for (;; ptr++)
2639      they are encountered at the top level, so we'll do that too. */      they are encountered at the top level, so we'll do that too. */
2640    
2641      if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&      if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2642          check_posix_syntax(ptr, &tempptr, cd))          check_posix_syntax(ptr, &tempptr))
2643        {        {
2644        *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;        *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2645        goto FAILED;        goto FAILED;
# Line 2645  for (;; ptr++) Line 2664  for (;; ptr++)
2664        else break;        else break;
2665        }        }
2666    
2667      /* If a class contains a negative special such as \S, we need to flip the      /* If a class contains a negative special such as \S, we need to flip the
2668      negation flag at the end, so that support for characters > 255 works      negation flag at the end, so that support for characters > 255 works
2669      correctly (they are all included in the class). */      correctly (they are all included in the class). */
2670    
2671      should_flip_negation = FALSE;      should_flip_negation = FALSE;
# Line 2668  for (;; ptr++) Line 2687  for (;; ptr++)
2687  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2688      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2689      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2690        class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
2691  #endif  #endif
2692    
2693      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
# Line 2683  for (;; ptr++) Line 2703  for (;; ptr++)
2703          {                           /* Braces are required because the */          {                           /* Braces are required because the */
2704          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
2705          }          }
2706    
2707          /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2708          data and reset the pointer. This is so that very large classes that
2709          contain a zillion UTF-8 characters no longer overwrite the work space
2710          (which is on the stack). */
2711    
2712          if (lengthptr != NULL)
2713            {
2714            *lengthptr += class_utf8data - class_utf8data_base;
2715            class_utf8data = class_utf8data_base;
2716            }
2717    
2718  #endif  #endif
2719    
2720        /* Inside \Q...\E everything is literal except \E */        /* Inside \Q...\E everything is literal except \E */
# Line 2706  for (;; ptr++) Line 2738  for (;; ptr++)
2738    
2739        if (c == '[' &&        if (c == '[' &&
2740            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2741            check_posix_syntax(ptr, &tempptr, cd))            check_posix_syntax(ptr, &tempptr))
2742          {          {
2743          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
2744          int posix_class, taboffset, tabopt;          int posix_class, taboffset, tabopt;
# Line 2723  for (;; ptr++) Line 2755  for (;; ptr++)
2755          if (*ptr == '^')          if (*ptr == '^')
2756            {            {
2757            local_negate = TRUE;            local_negate = TRUE;
2758            should_flip_negation = TRUE;  /* Note negative special */            should_flip_negation = TRUE;  /* Note negative special */
2759            ptr++;            ptr++;
2760            }            }
2761    
# Line 2826  for (;; ptr++) Line 2858  for (;; ptr++)
2858              continue;              continue;
2859    
2860              case ESC_D:              case ESC_D:
2861              should_flip_negation = TRUE;              should_flip_negation = TRUE;
2862              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2863              continue;              continue;
2864    
# Line 2835  for (;; ptr++) Line 2867  for (;; ptr++)
2867              continue;              continue;
2868    
2869              case ESC_W:              case ESC_W:
2870              should_flip_negation = TRUE;              should_flip_negation = TRUE;
2871              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2872              continue;              continue;
2873    
# Line 2845  for (;; ptr++) Line 2877  for (;; ptr++)
2877              continue;              continue;
2878    
2879              case ESC_S:              case ESC_S:
2880              should_flip_negation = TRUE;              should_flip_negation = TRUE;
2881              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2882              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2883              continue;              continue;
# Line 3348  we set the flag only if there is a liter Line 3380  we set the flag only if there is a liter
3380      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
3381    
3382      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
3383      extended class, with its own opcode, unless there was a negated special      extended class, with its own opcode, unless there was a negated special
3384      such as \S in the class, because in that case all characters > 255 are in      such as \S in the class, because in that case all characters > 255 are in
3385      the class, so any that were explicitly given as well can be ignored. If      the class, so any that were explicitly given as well can be ignored. If
3386      (when there are explicit characters > 255 that must be listed) there are no      (when there are explicit characters > 255 that must be listed) there are no
3387      characters < 256, we can omit the bitmap in the actual compiled code. */      characters < 256, we can omit the bitmap in the actual compiled code. */
3388    
# Line 3381  we set the flag only if there is a liter Line 3413  we set the flag only if there is a liter
3413        }        }
3414  #endif  #endif
3415    
3416      /* If there are no characters > 255, set the opcode to OP_CLASS or      /* If there are no characters > 255, set the opcode to OP_CLASS or
3417      OP_NCLASS, depending on whether the whole class was negated and whether      OP_NCLASS, depending on whether the whole class was negated and whether
3418      there were negative specials such as \S in the class. Then copy the 32-byte      there were negative specials such as \S in the class. Then copy the 32-byte
3419      map into the code vector, negating it if necessary. */      map into the code vector, negating it if necessary. */
3420    
3421      *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;      *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3422      if (negate_class)      if (negate_class)
3423        {        {
# Line 4030  we set the flag only if there is a liter Line 4062  we set the flag only if there is a liter
4062        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4063            *tempcode == OP_NOTEXACT)            *tempcode == OP_NOTEXACT)
4064          tempcode += _pcre_OP_lengths[*tempcode] +          tempcode += _pcre_OP_lengths[*tempcode] +
4065            ((*tempcode == OP_TYPEEXACT &&            ((*tempcode == OP_TYPEEXACT &&
4066               (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);               (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4067        len = code - tempcode;        len = code - tempcode;
4068        if (len > 0) switch (*tempcode)        if (len > 0) switch (*tempcode)
4069          {          {
# Line 4258  we set the flag only if there is a liter Line 4290  we set the flag only if there is a liter
4290              *errorcodeptr = ERR58;              *errorcodeptr = ERR58;
4291              goto FAILED;              goto FAILED;
4292              }              }
4293            recno = (refsign == '-')?            recno = (refsign == '-')?
4294              cd->bracount - recno + 1 : recno +cd->bracount;              cd->bracount - recno + 1 : recno +cd->bracount;
4295            if (recno <= 0 || recno > cd->final_bracount)            if (recno <= 0 || recno > cd->final_bracount)
4296              {              {
# Line 4337  we set the flag only if there is a liter Line 4369  we set the flag only if there is a liter
4369            }            }
4370    
4371          /* Check for the "name" actually being a subpattern number. We are          /* Check for the "name" actually being a subpattern number. We are
4372          in the second pass here, so final_bracount is set. */          in the second pass here, so final_bracount is set. */
4373    
4374          else if (recno > 0 && recno <= cd->final_bracount)          else if (recno > 0 && recno <= cd->final_bracount)
4375            {            {
# Line 4551  we set the flag only if there is a liter Line 4583  we set the flag only if there is a liter
4583              {              {
4584              *errorcodeptr = ERR62;              *errorcodeptr = ERR62;
4585              goto FAILED;              goto FAILED;
4586              }              }
4587            if (*ptr != terminator)            if (*ptr != terminator)
4588              {              {
4589              *errorcodeptr = ERR42;              *errorcodeptr = ERR42;
# Line 4565  we set the flag only if there is a liter Line 4597  we set the flag only if there is a liter
4597            recno = 0;            recno = 0;
4598            }            }
4599    
4600          /* In the real compile, seek the name in the table. We check the name          /* In the real compile, seek the name in the table. We check the name
4601          first, and then check that we have reached the end of the name in the          first, and then check that we have reached the end of the name in the
4602          table. That way, if the name that is longer than any in the table,          table. That way, if the name that is longer than any in the table,
4603          the comparison will fail without reading beyond the table entry. */          the comparison will fail without reading beyond the table entry. */
4604    
# Line 4576  we set the flag only if there is a liter Line 4608  we set the flag only if there is a liter
4608            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
4609              {              {
4610              if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&              if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
4611                  slot[2+namelen] == 0)                  slot[2+namelen] == 0)
4612                break;                break;
4613              slot += cd->name_entry_size;              slot += cd->name_entry_size;
4614              }              }
# Line 4614  we set the flag only if there is a liter Line 4646  we set the flag only if there is a liter
4646            {            {
4647            const uschar *called;            const uschar *called;
4648    
4649            if ((refsign = *ptr) == '+')            if ((refsign = *ptr) == '+')
4650              {              {
4651              ptr++;              ptr++;
4652              if ((digitab[*ptr] & ctype_digit) == 0)              if ((digitab[*ptr] & ctype_digit) == 0)
4653                {                {
4654                *errorcodeptr = ERR63;                *errorcodeptr = ERR63;
4655                goto FAILED;                goto FAILED;
4656                }                }
4657              }              }
4658            else if (refsign == '-')            else if (refsign == '-')
4659              {              {
4660              if ((digitab[ptr[1]] & ctype_digit) == 0)              if ((digitab[ptr[1]] & ctype_digit) == 0)
# Line 5788  to fill in forward references to subpatt Line 5820  to fill in forward references to subpatt
5820    
5821  uschar cworkspace[COMPILE_WORK_SIZE];  uschar cworkspace[COMPILE_WORK_SIZE];
5822    
   
5823  /* Set this early so that early errors get offset 0. */  /* Set this early so that early errors get offset 0. */
5824    
5825  ptr = (const uschar *)pattern;  ptr = (const uschar *)pattern;

Legend:
Removed from v.285  
changed lines
  Added in v.327

  ViewVC Help
Powered by ViewVC 1.1.5