/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 629 by ph10, Fri Jul 22 09:18:11 2011 UTC revision 640 by ph10, Mon Jul 25 10:50:28 2011 UTC
# Line 393  static const char error_texts[] = Line 393  static const char error_texts[] =
393    "internal error: previously-checked referenced subpattern not found\0"    "internal error: previously-checked referenced subpattern not found\0"
394    "DEFINE group contains more than one branch\0"    "DEFINE group contains more than one branch\0"
395    /* 55 */    /* 55 */
396    "repeating a DEFINE group is not allowed\0"    "repeating a DEFINE group is not allowed\0"  /** DEAD **/
397    "inconsistent NEWLINE options\0"    "inconsistent NEWLINE options\0"
398    "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"    "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
399    "a numbered reference must not be zero\0"    "a numbered reference must not be zero\0"
# Line 578  return s; Line 578  return s;
578    
579    
580  /*************************************************  /*************************************************
581    *            Check for counted repeat            *
582    *************************************************/
583    
584    /* This function is called when a '{' is encountered in a place where it might
585    start a quantifier. It looks ahead to see if it really is a quantifier or not.
586    It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
587    where the ddds are digits.
588    
589    Arguments:
590      p         pointer to the first char after '{'
591    
592    Returns:    TRUE or FALSE
593    */
594    
595    static BOOL
596    is_counted_repeat(const uschar *p)
597    {
598    if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  /* must start with at least one digit */
599    while ((digitab[*p] & ctype_digit) != 0) p++;  /* skip the rest of the first number */
600    if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;  /* form {ddd} */
601    
602    if (*p++ != CHAR_COMMA) return FALSE;  /* only a comma may follow the first number */
603    if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;  /* form {ddd,} */
604    
605    if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  /* second number needs at least one digit */
606    while ((digitab[*p] & ctype_digit) != 0) p++;  /* skip the rest of the second number */
607    
608    return (*p == CHAR_RIGHT_CURLY_BRACKET);  /* form {ddd,ddd} only if the brace closes here */
609    }
610    
611    
612    
613    /*************************************************
614  *            Handle escapes                      *  *            Handle escapes                      *
615  *************************************************/  *************************************************/
616    
# Line 648  else Line 681  else
681      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
682      break;      break;
683    
684      /* \g must be followed by one of a number of specific things:      /* In a character class, \g is just a literal "g". Outside a character
685        class, \g must be followed by one of a number of specific things:
686    
687      (1) A number, either plain or braced. If positive, it is an absolute      (1) A number, either plain or braced. If positive, it is an absolute
688      backreference. If negative, it is a relative backreference. This is a Perl      backreference. If negative, it is a relative backreference. This is a Perl
# Line 665  else Line 699  else
699      the -ESC_g code (cf \k). */      the -ESC_g code (cf \k). */
700    
701      case CHAR_g:      case CHAR_g:
702        if (isclass) break;
703      if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)      if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
704        {        {
705        c = -ESC_g;        c = -ESC_g;
# Line 886  else Line 921  else
921    }    }
922    
923  /* Perl supports \N{name} for character names, as well as plain \N for "not  /* Perl supports \N{name} for character names, as well as plain \N for "not
924  newline". PCRE does not support \N{name}. */  newline". PCRE does not support \N{name}. However, it does support
925    quantification such as \N{2,3}. */
926    
927  if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET)  if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
928         !is_counted_repeat(ptr+2))
929    *errorcodeptr = ERR37;    *errorcodeptr = ERR37;
930    
931  /* If PCRE_UCP is set, we change the values for \d etc. */  /* If PCRE_UCP is set, we change the values for \d etc. */
# Line 998  return -1; Line 1035  return -1;
1035    
1036    
1037  /*************************************************  /*************************************************
 *            Check for counted repeat            *  
 *************************************************/  
   
 /* This function is called when a '{' is encountered in a place where it might  
 start a quantifier. It looks ahead to see if it really is a quantifier or not.  
 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}  
 where the ddds are digits.  
   
 Arguments:  
   p         pointer to the first char after '{'  
   
 Returns:    TRUE or FALSE  
 */  
   
 static BOOL  
 is_counted_repeat(const uschar *p)  
 {  
 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  /* must start with at least one digit */
 while ((digitab[*p] & ctype_digit) != 0) p++;  /* skip the rest of the first number */
 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;  /* form {ddd} */
   
 if (*p++ != CHAR_COMMA) return FALSE;  /* only a comma may follow the first number */
 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;  /* form {ddd,} */
   
 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  /* second number needs at least one digit */
 while ((digitab[*p] & ctype_digit) != 0) p++;  /* skip the rest of the second number */
   
 return (*p == CHAR_RIGHT_CURLY_BRACKET);  /* form {ddd,ddd} only if the brace closes here */
 }  
   
   
   
 /*************************************************  
1038  *         Read repeat counts                     *  *         Read repeat counts                     *
1039  *************************************************/  *************************************************/
1040    
# Line 2288  where Perl recognizes it as the POSIX cl Line 2292  where Perl recognizes it as the POSIX cl
2292  "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,  "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2293  I think.  I think.
2294    
2295    A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2296    It seems that the appearance of a nested POSIX class supersedes an apparent
2297    external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2298    a digit. Unescaped square brackets may also appear as part of class
2299    names. For example, [:a[:abc]b:] gives unknown class "[:abc]b:]" in Perl.
2300    
2301  Arguments:  Arguments:
2302    ptr      pointer to the initial [    ptr      pointer to the initial [
2303    endptr   where to return the end pointer    endptr   where to return the end pointer
# Line 2302  int terminator;          /* Don't combin Line 2312  int terminator;          /* Don't combin
2312  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
2313  for (++ptr; *ptr != 0; ptr++)  for (++ptr; *ptr != 0; ptr++)
2314    {    {
2315    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2316        ptr++;
2317      else
2318      {      {
     if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;  
2319      if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)      if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2320        {        {
2321        *endptr = ptr;        *endptr = ptr;
2322        return TRUE;        return TRUE;
2323        }        }
2324        if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
2325             (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2326              ptr[1] == CHAR_EQUALS_SIGN) &&
2327            check_posix_syntax(ptr, endptr))
2328          return FALSE;
2329      }      }
2330    }    }
2331  return FALSE;  return FALSE;
# Line 3039  int greedy_default, greedy_non_default; Line 3055  int greedy_default, greedy_non_default;
3055  int firstbyte, reqbyte;  int firstbyte, reqbyte;
3056  int zeroreqbyte, zerofirstbyte;  int zeroreqbyte, zerofirstbyte;
3057  int req_caseopt, reqvary, tempreqvary;  int req_caseopt, reqvary, tempreqvary;
3058  int options = *optionsptr;  int options = *optionsptr;               /* May change dynamically */
3059  int after_manual_callout = 0;  int after_manual_callout = 0;
3060  int length_prevgroup = 0;  int length_prevgroup = 0;
3061  register int c;  register int c;
# Line 3057  uschar *previous_callout = NULL; Line 3073  uschar *previous_callout = NULL;
3073  uschar *save_hwm = NULL;  uschar *save_hwm = NULL;
3074  uschar classbits[32];  uschar classbits[32];
3075    
3076    /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3077    must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3078    dynamically as we process the pattern. */
3079    
3080  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3081  BOOL class_utf8;  BOOL class_utf8;
3082  BOOL utf8 = (options & PCRE_UTF8) != 0;  BOOL utf8 = (options & PCRE_UTF8) != 0;
# Line 3237  for (;; ptr++) Line 3257  for (;; ptr++)
3257      previous_callout = NULL;      previous_callout = NULL;
3258      }      }
3259    
3260    /* In extended mode, skip white space and comments */    /* In extended mode, skip white space and comments. */
3261    
3262    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
3263      {      {
# Line 4206  for (;; ptr++) Line 4226  for (;; ptr++)
4226      op_type = 0;                    /* Default single-char op codes */      op_type = 0;                    /* Default single-char op codes */
4227      possessive_quantifier = FALSE;  /* Default not possessive quantifier */      possessive_quantifier = FALSE;  /* Default not possessive quantifier */
4228    
4229      /* Save start of previous item, in case we have to move it up to make space      /* Save start of previous item, in case we have to move it up in order to
4230      for an inserted OP_ONCE for the additional '+' extension. */      insert something before it. */
4231    
4232      tempcode = previous;      tempcode = previous;
4233    
# Line 4550  for (;; ptr++) Line 4570  for (;; ptr++)
4570        }        }
4571    
4572      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
4573      cases. Note that at this point we can encounter only the "basic" BRA and      cases. Note that at this point we can encounter only the "basic" bracket
4574      KET opcodes, as this is the place where they get converted into the more      opcodes such as BRA and CBRA, as this is the place where they get converted
4575      special varieties. */      into the more special varieties such as BRAPOS and SBRA. A test for >=
4576        OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
4577        ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow
4578        repetition of assertions, but now it does, for Perl compatibility. */
4579    
4580      else if (*previous == OP_BRA  || *previous == OP_CBRA ||      else if (*previous >= OP_ASSERT && *previous <= OP_COND)
              *previous == OP_ONCE || *previous == OP_COND)  
4581        {        {
4582        register int i;        register int i;
4583        int len = (int)(code - previous);        int len = (int)(code - previous);
4584        uschar *bralink = NULL;        uschar *bralink = NULL;
4585        uschar *brazeroptr = NULL;        uschar *brazeroptr = NULL;
4586    
4587        /* Repeating a DEFINE group is pointless */        /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
4588          we just ignore the repeat. */
4589    
4590        if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)        if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
4591          {          goto END_REPEAT;
4592          *errorcodeptr = ERR55;  
4593          goto FAILED;        /* There is no sense in actually repeating assertions. The only potential
4594          }        use of repetition is in cases when the assertion is optional. Therefore,
4595          if the minimum is greater than zero, just ignore the repeat. If the
4596          maximum is not zero or one, set it to 1. */
4597    
4598          if (*previous < OP_ONCE)    /* Assertion */
4599            {
4600            if (repeat_min > 0) goto END_REPEAT;
4601            if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
4602            }
4603    
4604        /* The case of a zero minimum is special because of the need to stick        /* The case of a zero minimum is special because of the need to stick
4605        OP_BRAZERO in front of it, and because the group appears once in the        OP_BRAZERO in front of it, and because the group appears once in the
# Line 4588  for (;; ptr++) Line 4619  for (;; ptr++)
4619          **   goto END_REPEAT;          **   goto END_REPEAT;
4620          **   }          **   }
4621    
4622          However, that fails when a group is referenced as a subroutine from          However, that fails when a group or a subgroup within it is referenced
4623          elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it          as a subroutine from elsewhere in the pattern, so now we stick in
4624          so that it is skipped on execution. As we don't have a list of which          OP_SKIPZERO in front of it so that it is skipped on execution. As we
4625          groups are referenced, we cannot do this selectively.          don't have a list of which groups are referenced, we cannot do this
4626            selectively.
4627    
4628          If the maximum is 1 or unlimited, we just have to stick in the BRAZERO          If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
4629          and do no more at this point. However, we do need to adjust any          and do no more at this point. However, we do need to adjust any
# Line 5572  for (;; ptr++) Line 5604  for (;; ptr++)
5604    
5605            temp = cd->end_pattern;            temp = cd->end_pattern;
5606            cd->end_pattern = ptr;            cd->end_pattern = ptr;
5607            recno = find_parens(cd, name, namelen,            recno = find_parens(cd, name, namelen,
5608              (options & PCRE_EXTENDED) != 0, utf8);              (options & PCRE_EXTENDED) != 0, utf8);
5609            cd->end_pattern = temp;            cd->end_pattern = temp;
5610            if (recno < 0) recno = 0;    /* Forward ref; set dummy number */            if (recno < 0) recno = 0;    /* Forward ref; set dummy number */
# Line 5856  for (;; ptr++) Line 5888  for (;; ptr++)
5888        skipbytes = 2;        skipbytes = 2;
5889        }        }
5890    
5891      /* Process nested bracketed regex. Assertions may not be repeated, but      /* Process nested bracketed regex. Assertions used not to be repeatable,
5892      other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a      but this was changed for Perl compatibility, so all kinds can now be
5893      non-register variable (tempcode) in order to be able to pass its address      repeated. We copy code into a non-register variable (tempcode) in order to
5894      because some compilers complain otherwise. */      be able to pass its address because some compilers complain otherwise. */
5895    
5896      previous = (bravalue >= OP_ONCE)? code : NULL;      previous = code;                   /* For handling repetition */
5897      *code = bravalue;      *code = bravalue;
5898      tempcode = code;      tempcode = code;
5899      tempreqvary = cd->req_varyopt;     /* Save value before bracket */      tempreqvary = cd->req_varyopt;     /* Save value before bracket */
# Line 7068  if ((options & PCRE_UCP) != 0) Line 7100  if ((options & PCRE_UCP) != 0)
7100    
7101  /* Check validity of \R options. */  /* Check validity of \R options. */
7102    
7103  switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))  if ((options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) ==
7104         (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
7105    {    {
7106    case 0:    errorcode = ERR56;
7107    case PCRE_BSR_ANYCRLF:    goto PCRE_EARLY_ERROR_RETURN;
   case PCRE_BSR_UNICODE:  
   break;  
   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;  
7108    }    }
7109    
7110  /* Handle different types of newline. The three bits give seven cases. The  /* Handle different types of newline. The three bits give seven cases. The

Legend:
Removed from v.629  
changed lines
  Added in v.640

  ViewVC Help
Powered by ViewVC 1.1.5