/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 282 by ph10, Fri Dec 7 19:32:32 2007 UTC revision 336 by ph10, Sat Apr 12 15:59:03 2008 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2007 University of Cambridge             Copyright (c) 1997-2008 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 158  static const char verbnames[] = Line 158  static const char verbnames[] =
158    "SKIP\0"    "SKIP\0"
159    "THEN";    "THEN";
160    
161  static verbitem verbs[] = {  static const verbitem verbs[] = {
162    { 6, OP_ACCEPT },    { 6, OP_ACCEPT },
163    { 6, OP_COMMIT },    { 6, OP_COMMIT },
164    { 1, OP_FAIL },    { 1, OP_FAIL },
# Line 168  static verbitem verbs[] = { Line 168  static verbitem verbs[] = {
168    { 4, OP_THEN  }    { 4, OP_THEN  }
169  };  };
170    
171  static int verbcount = sizeof(verbs)/sizeof(verbitem);  static const int verbcount = sizeof(verbs)/sizeof(verbitem);
172    
173    
174  /* Tables of names of POSIX character classes and their lengths. The names are  /* Tables of names of POSIX character classes and their lengths. The names are
# Line 295  static const char error_texts[] = Line 295  static const char error_texts[] =
295    /* 55 */    /* 55 */
296    "repeating a DEFINE group is not allowed\0"    "repeating a DEFINE group is not allowed\0"
297    "inconsistent NEWLINE options\0"    "inconsistent NEWLINE options\0"
298    "\\g is not followed by a braced name or an optionally braced non-zero number\0"    "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
299    "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0"    "a numbered reference must not be zero\0"
300    "(*VERB) with an argument is not supported\0"    "(*VERB) with an argument is not supported\0"
301    /* 60 */    /* 60 */
302    "(*VERB) not recognized\0"    "(*VERB) not recognized\0"
303    "number is too big\0"    "number is too big\0"
304    "subpattern name expected\0"    "subpattern name expected\0"
305    "digit expected after (?+";    "digit expected after (?+\0"
306      "] is an invalid data character in JavaScript compatibility mode";
307    
308    
309  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 531  else Line 532  else
532      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
533      break;      break;
534    
535      /* \g must be followed by a number, either plain or braced. If positive, it      /* \g must be followed by one of a number of specific things:
536      is an absolute backreference. If negative, it is a relative backreference.  
537      This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a      (1) A number, either plain or braced. If positive, it is an absolute
538      reference to a named group. This is part of Perl's movement towards a      backreference. If negative, it is a relative backreference. This is a Perl
539      unified syntax for back references. As this is synonymous with \k{name}, we      5.10 feature.
540      fudge it up by pretending it really was \k. */  
541        (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
542        is part of Perl's movement towards a unified syntax for back references. As
543        this is synonymous with \k{name}, we fudge it up by pretending it really
544        was \k.
545    
546        (3) For Oniguruma compatibility we also support \g followed by a name or a
547        number either in angle brackets or in single quotes. However, these are
548        (possibly recursive) subroutine calls, _not_ backreferences. Just return
549        the -ESC_g code (cf \k). */
550    
551      case 'g':      case 'g':
552        if (ptr[1] == '<' || ptr[1] == '\'')
553          {
554          c = -ESC_g;
555          break;
556          }
557    
558        /* Handle the Perl-compatible cases */
559    
560      if (ptr[1] == '{')      if (ptr[1] == '{')
561        {        {
562        const uschar *p;        const uschar *p;
# Line 565  else Line 583  else
583      while ((digitab[ptr[1]] & ctype_digit) != 0)      while ((digitab[ptr[1]] & ctype_digit) != 0)
584        c = c * 10 + *(++ptr) - '0';        c = c * 10 + *(++ptr) - '0';
585    
586      if (c < 0)      if (c < 0)   /* Integer overflow */
587        {        {
588        *errorcodeptr = ERR61;        *errorcodeptr = ERR61;
589        break;        break;
590        }        }
591    
592      if (c == 0 || (braced && *(++ptr) != '}'))      if (braced && *(++ptr) != '}')
593        {        {
594        *errorcodeptr = ERR57;        *errorcodeptr = ERR57;
595        break;        break;
596        }        }
597    
598        if (c == 0)
599          {
600          *errorcodeptr = ERR58;
601          break;
602          }
603    
604      if (negated)      if (negated)
605        {        {
# Line 611  else Line 635  else
635        c -= '0';        c -= '0';
636        while ((digitab[ptr[1]] & ctype_digit) != 0)        while ((digitab[ptr[1]] & ctype_digit) != 0)
637          c = c * 10 + *(++ptr) - '0';          c = c * 10 + *(++ptr) - '0';
638        if (c < 0)        if (c < 0)    /* Integer overflow */
639          {          {
640          *errorcodeptr = ERR61;          *errorcodeptr = ERR61;
641          break;          break;
# Line 1531  for (code = first_significant_code(code Line 1555  for (code = first_significant_code(code
1555    const uschar *ccode;    const uschar *ccode;
1556    
1557    c = *code;    c = *code;
1558    
1559    /* Skip over forward assertions; the other assertions are skipped by    /* Skip over forward assertions; the other assertions are skipped by
1560    first_significant_code() with a TRUE final argument. */    first_significant_code() with a TRUE final argument. */
1561    
1562    if (c == OP_ASSERT)    if (c == OP_ASSERT)
1563      {      {
1564      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
1565      c = *code;      c = *code;
1566      continue;      continue;
1567      }      }
1568    
1569    /* Groups with zero repeats can of course be empty; skip them. */    /* Groups with zero repeats can of course be empty; skip them. */
1570    
1571    if (c == OP_BRAZERO || c == OP_BRAMINZERO)    if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1572      {      {
1573      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
1574      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
# Line 1737  return TRUE; Line 1761  return TRUE;
1761  *************************************************/  *************************************************/
1762    
1763  /* This function is called when the sequence "[:" or "[." or "[=" is  /* This function is called when the sequence "[:" or "[." or "[=" is
1764  encountered in a character class. It checks whether this is followed by an  encountered in a character class. It checks whether this is followed by a
1765  optional ^ and then a sequence of letters, terminated by a matching ":]" or  sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1766  ".]" or "=]".  reach an unescaped ']' without the special preceding character, return FALSE.
1767    
1768    Originally, this function only recognized a sequence of letters between the
1769    terminators, but it seems that Perl recognizes any sequence of characters,
1770    though of course unknown POSIX names are subsequently rejected. Perl gives an
1771    "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1772    didn't consider this to be a POSIX class. Likewise for [:1234:].
1773    
1774    The problem in trying to be exactly like Perl is in the handling of escapes. We
1775    have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1776    class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1777    below handles the special case of \], but does not try to do any other escape
1778    processing. This makes it different from Perl for cases such as [:l\ower:]
1779    where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1780    "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
1781    I think.
1782    
1783  Argument:  Arguments:
1784    ptr      pointer to the initial [    ptr      pointer to the initial [
1785    endptr   where to return the end pointer    endptr   where to return the end pointer
   cd       pointer to compile data  
1786    
1787  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
1788  */  */
1789    
1790  static BOOL  static BOOL
1791  check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)  check_posix_syntax(const uschar *ptr, const uschar **endptr)
1792  {  {
1793  int terminator;          /* Don't combine these lines; the Solaris cc */  int terminator;          /* Don't combine these lines; the Solaris cc */
1794  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1795  if (*(++ptr) == '^') ptr++;  for (++ptr; *ptr != 0; ptr++)
 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;  
 if (*ptr == terminator && ptr[1] == ']')  
1796    {    {
1797    *endptr = ptr;    if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1798    return TRUE;      {
1799        if (*ptr == ']') return FALSE;
1800        if (*ptr == terminator && ptr[1] == ']')
1801          {
1802          *endptr = ptr;
1803          return TRUE;
1804          }
1805        }
1806    }    }
1807  return FALSE;  return FALSE;
1808  }  }
# Line 1805  return -1; Line 1848  return -1;
1848  that is referenced. This means that groups can be replicated for fixed  that is referenced. This means that groups can be replicated for fixed
1849  repetition simply by copying (because the recursion is allowed to refer to  repetition simply by copying (because the recursion is allowed to refer to
1850  earlier groups that are outside the current group). However, when a group is  earlier groups that are outside the current group). However, when a group is
1851  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
1852  it, after it has been compiled. This means that any OP_RECURSE items within it  inserted before it, after it has been compiled. This means that any OP_RECURSE
1853  that refer to the group itself or any contained groups have to have their  items within it that refer to the group itself or any contained groups have to
1854  offsets adjusted. That is one of the jobs of this function. Before it is called,  have their offsets adjusted. That is one of the jobs of this function. Before it
1855  the partially compiled regex must be temporarily terminated with OP_END.  is called, the partially compiled regex must be temporarily terminated with
1856    OP_END.
1857    
1858  This function has been extended with the possibility of forward references for  This function has been extended with the possibility of forward references for
1859  recursions and subroutine calls. It must also check the list of such references  recursions and subroutine calls. It must also check the list of such references
# Line 1840  while ((ptr = (uschar *)find_recurse(ptr Line 1884  while ((ptr = (uschar *)find_recurse(ptr
1884    
1885    /* See if this recursion is on the forward reference list. If so, adjust the    /* See if this recursion is on the forward reference list. If so, adjust the
1886    reference. */    reference. */
1887    
1888    for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)    for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1889      {      {
1890      offset = GET(hc, 0);      offset = GET(hc, 0);
# Line 2094  if (next >= 0) switch(op_code) Line 2138  if (next >= 0) switch(op_code)
2138    /* For OP_NOT, "item" must be a single-byte character. */    /* For OP_NOT, "item" must be a single-byte character. */
2139    
2140    case OP_NOT:    case OP_NOT:
   if (next < 0) return FALSE;  /* Not a character */  
2141    if (item == next) return TRUE;    if (item == next) return TRUE;
2142    if ((options & PCRE_CASELESS) == 0) return FALSE;    if ((options & PCRE_CASELESS) == 0) return FALSE;
2143  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 2357  uschar classbits[32]; Line 2400  uschar classbits[32];
2400  BOOL class_utf8;  BOOL class_utf8;
2401  BOOL utf8 = (options & PCRE_UTF8) != 0;  BOOL utf8 = (options & PCRE_UTF8) != 0;
2402  uschar *class_utf8data;  uschar *class_utf8data;
2403    uschar *class_utf8data_base;
2404  uschar utf8_char[6];  uschar utf8_char[6];
2405  #else  #else
2406  BOOL utf8 = FALSE;  BOOL utf8 = FALSE;
# Line 2396  req_caseopt = ((options & PCRE_CASELESS) Line 2440  req_caseopt = ((options & PCRE_CASELESS)
2440  for (;; ptr++)  for (;; ptr++)
2441    {    {
2442    BOOL negate_class;    BOOL negate_class;
2443    BOOL should_flip_negation;    BOOL should_flip_negation;
2444    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2445    BOOL is_quantifier;    BOOL is_quantifier;
2446    BOOL is_recurse;    BOOL is_recurse;
# Line 2416  for (;; ptr++) Line 2460  for (;; ptr++)
2460    /* Get next byte in the pattern */    /* Get next byte in the pattern */
2461    
2462    c = *ptr;    c = *ptr;
2463    
2464    /* If we are in the pre-compile phase, accumulate the length used for the    /* If we are in the pre-compile phase, accumulate the length used for the
2465    previous cycle of this loop. */    previous cycle of this loop. */
2466    
# Line 2611  for (;; ptr++) Line 2655  for (;; ptr++)
2655      opcode is compiled. It may optionally have a bit map for characters < 256,      opcode is compiled. It may optionally have a bit map for characters < 256,
2656      but those above are explicitly listed afterwards. A flag byte tells      but those above are explicitly listed afterwards. A flag byte tells
2657      whether the bitmap is present, and whether this is a negated class or not.      whether the bitmap is present, and whether this is a negated class or not.
2658      */  
2659        In JavaScript compatibility mode, an isolated ']' causes an error. In
2660        default (Perl) mode, it is treated as a data character. */
2661    
2662        case ']':
2663        if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2664          {
2665          *errorcodeptr = ERR64;
2666          goto FAILED;
2667          }
2668        goto NORMAL_CHAR;
2669    
2670      case '[':      case '[':
2671      previous = code;      previous = code;
# Line 2620  for (;; ptr++) Line 2674  for (;; ptr++)
2674      they are encountered at the top level, so we'll do that too. */      they are encountered at the top level, so we'll do that too. */
2675    
2676      if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&      if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2677          check_posix_syntax(ptr, &tempptr, cd))          check_posix_syntax(ptr, &tempptr))
2678        {        {
2679        *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;        *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2680        goto FAILED;        goto FAILED;
# Line 2645  for (;; ptr++) Line 2699  for (;; ptr++)
2699        else break;        else break;
2700        }        }
2701    
2702      /* If a class contains a negative special such as \S, we need to flip the      /* If a class contains a negative special such as \S, we need to flip the
2703      negation flag at the end, so that support for characters > 255 works      negation flag at the end, so that support for characters > 255 works
2704      correctly (they are all included in the class). */      correctly (they are all included in the class). */
2705    
2706      should_flip_negation = FALSE;      should_flip_negation = FALSE;
# Line 2668  for (;; ptr++) Line 2722  for (;; ptr++)
2722  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2723      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2724      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2725        class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
2726  #endif  #endif
2727    
2728      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
# Line 2683  for (;; ptr++) Line 2738  for (;; ptr++)
2738          {                           /* Braces are required because the */          {                           /* Braces are required because the */
2739          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
2740          }          }
2741    
2742          /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2743          data and reset the pointer. This is so that very large classes that
2744          contain a zillion UTF-8 characters no longer overwrite the work space
2745          (which is on the stack). */
2746    
2747          if (lengthptr != NULL)
2748            {
2749            *lengthptr += class_utf8data - class_utf8data_base;
2750            class_utf8data = class_utf8data_base;
2751            }
2752    
2753  #endif  #endif
2754    
2755        /* Inside \Q...\E everything is literal except \E */        /* Inside \Q...\E everything is literal except \E */
# Line 2706  for (;; ptr++) Line 2773  for (;; ptr++)
2773    
2774        if (c == '[' &&        if (c == '[' &&
2775            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2776            check_posix_syntax(ptr, &tempptr, cd))            check_posix_syntax(ptr, &tempptr))
2777          {          {
2778          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
2779          int posix_class, taboffset, tabopt;          int posix_class, taboffset, tabopt;
# Line 2723  for (;; ptr++) Line 2790  for (;; ptr++)
2790          if (*ptr == '^')          if (*ptr == '^')
2791            {            {
2792            local_negate = TRUE;            local_negate = TRUE;
2793            should_flip_negation = TRUE;  /* Note negative special */            should_flip_negation = TRUE;  /* Note negative special */
2794            ptr++;            ptr++;
2795            }            }
2796    
# Line 2826  for (;; ptr++) Line 2893  for (;; ptr++)
2893              continue;              continue;
2894    
2895              case ESC_D:              case ESC_D:
2896              should_flip_negation = TRUE;              should_flip_negation = TRUE;
2897              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2898              continue;              continue;
2899    
# Line 2835  for (;; ptr++) Line 2902  for (;; ptr++)
2902              continue;              continue;
2903    
2904              case ESC_W:              case ESC_W:
2905              should_flip_negation = TRUE;              should_flip_negation = TRUE;
2906              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2907              continue;              continue;
2908    
# Line 2845  for (;; ptr++) Line 2912  for (;; ptr++)
2912              continue;              continue;
2913    
2914              case ESC_S:              case ESC_S:
2915              should_flip_negation = TRUE;              should_flip_negation = TRUE;
2916              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2917              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2918              continue;              continue;
# Line 3348  we set the flag only if there is a liter Line 3415  we set the flag only if there is a liter
3415      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
3416    
3417      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
3418      extended class, with its own opcode, unless there was a negated special      extended class, with its own opcode, unless there was a negated special
3419      such as \S in the class, because in that case all characters > 255 are in      such as \S in the class, because in that case all characters > 255 are in
3420      the class, so any that were explicitly given as well can be ignored. If      the class, so any that were explicitly given as well can be ignored. If
3421      (when there are explicit characters > 255 that must be listed) there are no      (when there are explicit characters > 255 that must be listed) there are no
3422      characters < 256, we can omit the bitmap in the actual compiled code. */      characters < 256, we can omit the bitmap in the actual compiled code. */
3423    
# Line 3381  we set the flag only if there is a liter Line 3448  we set the flag only if there is a liter
3448        }        }
3449  #endif  #endif
3450    
3451      /* If there are no characters > 255, set the opcode to OP_CLASS or      /* If there are no characters > 255, set the opcode to OP_CLASS or
3452      OP_NCLASS, depending on whether the whole class was negated and whether      OP_NCLASS, depending on whether the whole class was negated and whether
3453      there were negative specials such as \S in the class. Then copy the 32-byte      there were negative specials such as \S in the class. Then copy the 32-byte
3454      map into the code vector, negating it if necessary. */      map into the code vector, negating it if necessary. */
3455    
3456      *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;      *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3457      if (negate_class)      if (negate_class)
3458        {        {
# Line 3787  we set the flag only if there is a liter Line 3854  we set the flag only if there is a liter
3854    
3855        if (repeat_min == 0)        if (repeat_min == 0)
3856          {          {
3857          /* If the maximum is also zero, we just omit the group from the output          /* If the maximum is also zero, we used to just omit the group from the
3858          altogether. */          output altogether, like this:
   
         if (repeat_max == 0)  
           {  
           code = previous;  
           goto END_REPEAT;  
           }  
3859    
3860          /* If the maximum is 1 or unlimited, we just have to stick in the          ** if (repeat_max == 0)
3861          BRAZERO and do no more at this point. However, we do need to adjust          **   {
3862          any OP_RECURSE calls inside the group that refer to the group itself or          **   code = previous;
3863          any internal or forward referenced group, because the offset is from          **   goto END_REPEAT;
3864          the start of the whole regex. Temporarily terminate the pattern while          **   }
3865          doing this. */  
3866            However, that fails when a group is referenced as a subroutine from
3867            elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
3868            so that it is skipped on execution. As we don't have a list of which
3869            groups are referenced, we cannot do this selectively.
3870    
3871            If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
3872            and do no more at this point. However, we do need to adjust any
3873            OP_RECURSE calls inside the group that refer to the group itself or any
3874            internal or forward referenced group, because the offset is from the
3875            start of the whole regex. Temporarily terminate the pattern while doing
3876            this. */
3877    
3878          if (repeat_max <= 1)          if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
3879            {            {
3880            *code = OP_END;            *code = OP_END;
3881            adjust_recurse(previous, 1, utf8, cd, save_hwm);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3882            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3883            code++;            code++;
3884              if (repeat_max == 0)
3885                {
3886                *previous++ = OP_SKIPZERO;
3887                goto END_REPEAT;
3888                }
3889            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
3890            }            }
3891    
# Line 4029  we set the flag only if there is a liter Line 4106  we set the flag only if there is a liter
4106        int len;        int len;
4107        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4108            *tempcode == OP_NOTEXACT)            *tempcode == OP_NOTEXACT)
4109          tempcode += _pcre_OP_lengths[*tempcode];          tempcode += _pcre_OP_lengths[*tempcode] +
4110              ((*tempcode == OP_TYPEEXACT &&
4111                 (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4112        len = code - tempcode;        len = code - tempcode;
4113        if (len > 0) switch (*tempcode)        if (len > 0) switch (*tempcode)
4114          {          {
# Line 4081  we set the flag only if there is a liter Line 4160  we set the flag only if there is a liter
4160      bravalue = OP_CBRA;      bravalue = OP_CBRA;
4161      save_hwm = cd->hwm;      save_hwm = cd->hwm;
4162      reset_bracount = FALSE;      reset_bracount = FALSE;
4163    
4164      /* First deal with various "verbs" that can be introduced by '*'. */      /* First deal with various "verbs" that can be introduced by '*'. */
4165    
4166      if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)      if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
# Line 4256  we set the flag only if there is a liter Line 4335  we set the flag only if there is a liter
4335              *errorcodeptr = ERR58;              *errorcodeptr = ERR58;
4336              goto FAILED;              goto FAILED;
4337              }              }
4338            recno = (refsign == '-')?            recno = (refsign == '-')?
4339              cd->bracount - recno + 1 : recno +cd->bracount;              cd->bracount - recno + 1 : recno +cd->bracount;
4340            if (recno <= 0 || recno > cd->final_bracount)            if (recno <= 0 || recno > cd->final_bracount)
4341              {              {
# Line 4335  we set the flag only if there is a liter Line 4414  we set the flag only if there is a liter
4414            }            }
4415    
4416          /* Check for the "name" actually being a subpattern number. We are          /* Check for the "name" actually being a subpattern number. We are
4417          in the second pass here, so final_bracount is set. */          in the second pass here, so final_bracount is set. */
4418    
4419          else if (recno > 0 && recno <= cd->final_bracount)          else if (recno > 0 && recno <= cd->final_bracount)
4420            {            {
# Line 4533  we set the flag only if there is a liter Line 4612  we set the flag only if there is a liter
4612          references (?P=name) and recursion (?P>name), as well as falling          references (?P=name) and recursion (?P>name), as well as falling
4613          through from the Perl recursion syntax (?&name). We also come here from          through from the Perl recursion syntax (?&name). We also come here from
4614          the Perl \k<name> or \k'name' back reference syntax and the \k{name}          the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4615          .NET syntax. */          .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
4616    
4617          NAMED_REF_OR_RECURSE:          NAMED_REF_OR_RECURSE:
4618          name = ++ptr;          name = ++ptr;
# Line 4549  we set the flag only if there is a liter Line 4628  we set the flag only if there is a liter
4628              {              {
4629              *errorcodeptr = ERR62;              *errorcodeptr = ERR62;
4630              goto FAILED;              goto FAILED;
4631              }              }
4632            if (*ptr != terminator)            if (*ptr != terminator)
4633              {              {
4634              *errorcodeptr = ERR42;              *errorcodeptr = ERR42;
# Line 4563  we set the flag only if there is a liter Line 4642  we set the flag only if there is a liter
4642            recno = 0;            recno = 0;
4643            }            }
4644    
4645          /* In the real compile, seek the name in the table. We check the name          /* In the real compile, seek the name in the table. We check the name
4646          first, and then check that we have reached the end of the name in the          first, and then check that we have reached the end of the name in the
4647          table. That way, if the name is longer than any in the table,          table. That way, if the name is longer than any in the table,
4648          the comparison will fail without reading beyond the table entry. */          the comparison will fail without reading beyond the table entry. */
4649    
# Line 4574  we set the flag only if there is a liter Line 4653  we set the flag only if there is a liter
4653            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
4654              {              {
4655              if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&              if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
4656                  slot[2+namelen] == 0)                  slot[2+namelen] == 0)
4657                break;                break;
4658              slot += cd->name_entry_size;              slot += cd->name_entry_size;
4659              }              }
# Line 4611  we set the flag only if there is a liter Line 4690  we set the flag only if there is a liter
4690          case '5': case '6': case '7': case '8': case '9':   /* subroutine */          case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4691            {            {
4692            const uschar *called;            const uschar *called;
4693              terminator = ')';
4694    
4695              /* Come here from the \g<...> and \g'...' code (Oniguruma
4696              compatibility). However, the syntax has been checked to ensure that
4697              the ... are a (signed) number, so that neither ERR63 nor ERR29 will
4698              be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
4699              ever be taken. */
4700    
4701              HANDLE_NUMERICAL_RECURSION:
4702    
4703            if ((refsign = *ptr) == '+')            if ((refsign = *ptr) == '+')
4704              {              {
4705              ptr++;              ptr++;
4706              if ((digitab[*ptr] & ctype_digit) == 0)              if ((digitab[*ptr] & ctype_digit) == 0)
4707                {                {
4708                *errorcodeptr = ERR63;                *errorcodeptr = ERR63;
4709                goto FAILED;                goto FAILED;
4710                }                }
4711              }              }
4712            else if (refsign == '-')            else if (refsign == '-')
4713              {              {
4714              if ((digitab[ptr[1]] & ctype_digit) == 0)              if ((digitab[ptr[1]] & ctype_digit) == 0)
# Line 4632  we set the flag only if there is a liter Line 4720  we set the flag only if there is a liter
4720            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4721              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4722    
4723            if (*ptr != ')')            if (*ptr != terminator)
4724              {              {
4725              *errorcodeptr = ERR29;              *errorcodeptr = ERR29;
4726              goto FAILED;              goto FAILED;
# Line 5028  we set the flag only if there is a liter Line 5116  we set the flag only if there is a liter
5116      back references and those types that consume a character may be repeated.      back references and those types that consume a character may be repeated.
5117      We can test for values between ESC_b and ESC_Z for the latter; this may      We can test for values between ESC_b and ESC_Z for the latter; this may
5118      have to change if any new ones are ever created. */      have to change if any new ones are ever created. */
5119    
5120      case '\\':      case '\\':
5121      tempptr = ptr;      tempptr = ptr;
5122      c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);      c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
# Line 5055  we set the flag only if there is a liter Line 5143  we set the flag only if there is a liter
5143    
5144        zerofirstbyte = firstbyte;        zerofirstbyte = firstbyte;
5145        zeroreqbyte = reqbyte;        zeroreqbyte = reqbyte;
5146    
5147          /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5148          is a subroutine call by number (Oniguruma syntax). In fact, the value
5149          -ESC_g is returned only for these cases. So we don't need to check for <
5150          or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5151          -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5152          that is a synonym for a named back reference). */
5153    
5154          if (-c == ESC_g)
5155            {
5156            const uschar *p;
5157            save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
5158            terminator = (*(++ptr) == '<')? '>' : '\'';
5159    
5160            /* These two statements stop the compiler from warning about possibly
5161            unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5162            fact, because we actually check for a number below, the paths that
5163            would actually be in error are never taken. */
5164    
5165            skipbytes = 0;
5166            reset_bracount = FALSE;
5167    
5168            /* Test for a name */
5169    
5170            if (ptr[1] != '+' && ptr[1] != '-')
5171              {
5172              BOOL isnumber = TRUE;
5173              for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5174                {
5175                if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5176                if ((cd->ctypes[*p] & ctype_word) == 0) break;
5177                }
5178              if (*p != terminator)
5179                {
5180                *errorcodeptr = ERR57;
5181                break;
5182                }
5183              if (isnumber)
5184                {
5185                ptr++;
5186                goto HANDLE_NUMERICAL_RECURSION;
5187                }
5188              is_recurse = TRUE;
5189              goto NAMED_REF_OR_RECURSE;
5190              }
5191    
5192            /* Test a signed number in angle brackets or quotes. */
5193    
5194            p = ptr + 2;
5195            while ((digitab[*p] & ctype_digit) != 0) p++;
5196            if (*p != terminator)
5197              {
5198              *errorcodeptr = ERR57;
5199              break;
5200              }
5201            ptr++;
5202            goto HANDLE_NUMERICAL_RECURSION;
5203            }
5204    
5205        /* \k<name> or \k'name' is a back reference by name (Perl syntax).        /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5206        We also support \k{name} (.NET syntax) */        We also support \k{name} (.NET syntax) */
# Line 5786  to fill in forward references to subpatt Line 5932  to fill in forward references to subpatt
5932    
5933  uschar cworkspace[COMPILE_WORK_SIZE];  uschar cworkspace[COMPILE_WORK_SIZE];
5934    
   
5935  /* Set this early so that early errors get offset 0. */  /* Set this early so that early errors get offset 0. */
5936    
5937  ptr = (const uschar *)pattern;  ptr = (const uschar *)pattern;
# Line 6075  while (errorcode == 0 && cd->hwm > cwork Line 6220  while (errorcode == 0 && cd->hwm > cwork
6220    if (groupptr == NULL) errorcode = ERR53;    if (groupptr == NULL) errorcode = ERR53;
6221      else PUT(((uschar *)codestart), offset, groupptr - codestart);      else PUT(((uschar *)codestart), offset, groupptr - codestart);
6222    }    }
6223    
6224  /* Give an error if there's a back reference to a non-existent capturing  /* Give an error if there's a back reference to a non-existent capturing
6225  subpattern. */  subpattern. */
6226    

Legend:
Removed from v.282  
changed lines
  Added in v.336

  ViewVC Help
Powered by ViewVC 1.1.5