/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 265 by ph10, Wed Nov 14 11:35:48 2007 UTC revision 336 by ph10, Sat Apr 12 15:59:03 2008 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2007 University of Cambridge             Copyright (c) 1997-2008 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 158  static const char verbnames[] = Line 158  static const char verbnames[] =
158    "SKIP\0"    "SKIP\0"
159    "THEN";    "THEN";
160    
161  static verbitem verbs[] = {  static const verbitem verbs[] = {
162    { 6, OP_ACCEPT },    { 6, OP_ACCEPT },
163    { 6, OP_COMMIT },    { 6, OP_COMMIT },
164    { 1, OP_FAIL },    { 1, OP_FAIL },
# Line 168  static verbitem verbs[] = { Line 168  static verbitem verbs[] = {
168    { 4, OP_THEN  }    { 4, OP_THEN  }
169  };  };
170    
171  static int verbcount = sizeof(verbs)/sizeof(verbitem);  static const int verbcount = sizeof(verbs)/sizeof(verbitem);
172    
173    
174  /* Tables of names of POSIX character classes and their lengths. The names are  /* Tables of names of POSIX character classes and their lengths. The names are
# Line 241  static const char error_texts[] = Line 241  static const char error_texts[] =
241    /* 10 */    /* 10 */
242    "operand of unlimited repeat could match the empty string\0"  /** DEAD **/    "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
243    "internal error: unexpected repeat\0"    "internal error: unexpected repeat\0"
244    "unrecognized character after (?\0"    "unrecognized character after (? or (?-\0"
245    "POSIX named classes are supported only within a class\0"    "POSIX named classes are supported only within a class\0"
246    "missing )\0"    "missing )\0"
247    /* 15 */    /* 15 */
# Line 295  static const char error_texts[] = Line 295  static const char error_texts[] =
295    /* 55 */    /* 55 */
296    "repeating a DEFINE group is not allowed\0"    "repeating a DEFINE group is not allowed\0"
297    "inconsistent NEWLINE options\0"    "inconsistent NEWLINE options\0"
298    "\\g is not followed by a braced name or an optionally braced non-zero number\0"    "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
299    "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0"    "a numbered reference must not be zero\0"
300    "(*VERB) with an argument is not supported\0"    "(*VERB) with an argument is not supported\0"
301    /* 60 */    /* 60 */
302    "(*VERB) not recognized\0"    "(*VERB) not recognized\0"
303    "number is too big";    "number is too big\0"
304      "subpattern name expected\0"
305      "digit expected after (?+\0"
306      "] is an invalid data character in JavaScript compatibility mode";
307    
308    
309  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 496  ptr--;                            /* Set Line 499  ptr--;                            /* Set
499    
500  if (c == 0) *errorcodeptr = ERR1;  if (c == 0) *errorcodeptr = ERR1;
501    
502  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in  /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
503  a table. A non-zero result is something that can be returned immediately.  in a table. A non-zero result is something that can be returned immediately.
504  Otherwise further processing may be required. */  Otherwise further processing may be required. */
505    
506  #ifndef EBCDIC  /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
507  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < '0' || c > 'z') {}                           /* Not alphanumeric */
508  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - '0']) != 0) c = i;
509    
510  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
511  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
512  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
513  #endif  #endif
514    
# Line 529  else Line 532  else
532      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
533      break;      break;
534    
535      /* \g must be followed by a number, either plain or braced. If positive, it      /* \g must be followed by one of a number of specific things:
536      is an absolute backreference. If negative, it is a relative backreference.  
537      This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a      (1) A number, either plain or braced. If positive, it is an absolute
538      reference to a named group. This is part of Perl's movement towards a      backreference. If negative, it is a relative backreference. This is a Perl
539      unified syntax for back references. As this is synonymous with \k{name}, we      5.10 feature.
540      fudge it up by pretending it really was \k. */  
541        (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
542        is part of Perl's movement towards a unified syntax for back references. As
543        this is synonymous with \k{name}, we fudge it up by pretending it really
544        was \k.
545    
546        (3) For Oniguruma compatibility we also support \g followed by a name or a
547        number either in angle brackets or in single quotes. However, these are
548        (possibly recursive) subroutine calls, _not_ backreferences. Just return
549        the -ESC_g code (cf \k). */
550    
551      case 'g':      case 'g':
552        if (ptr[1] == '<' || ptr[1] == '\'')
553          {
554          c = -ESC_g;
555          break;
556          }
557    
558        /* Handle the Perl-compatible cases */
559    
560      if (ptr[1] == '{')      if (ptr[1] == '{')
561        {        {
562        const uschar *p;        const uschar *p;
# Line 563  else Line 583  else
583      while ((digitab[ptr[1]] & ctype_digit) != 0)      while ((digitab[ptr[1]] & ctype_digit) != 0)
584        c = c * 10 + *(++ptr) - '0';        c = c * 10 + *(++ptr) - '0';
585    
586      if (c < 0)      if (c < 0)   /* Integer overflow */
587        {        {
588        *errorcodeptr = ERR61;        *errorcodeptr = ERR61;
589        break;        break;
590        }        }
591    
592      if (c == 0 || (braced && *(++ptr) != '}'))      if (braced && *(++ptr) != '}')
593        {        {
594        *errorcodeptr = ERR57;        *errorcodeptr = ERR57;
595        break;        break;
596        }        }
597    
598        if (c == 0)
599          {
600          *errorcodeptr = ERR58;
601          break;
602          }
603    
604      if (negated)      if (negated)
605        {        {
# Line 609  else Line 635  else
635        c -= '0';        c -= '0';
636        while ((digitab[ptr[1]] & ctype_digit) != 0)        while ((digitab[ptr[1]] & ctype_digit) != 0)
637          c = c * 10 + *(++ptr) - '0';          c = c * 10 + *(++ptr) - '0';
638        if (c < 0)        if (c < 0)    /* Integer overflow */
639          {          {
640          *errorcodeptr = ERR61;          *errorcodeptr = ERR61;
641          break;          break;
# Line 722  else Line 748  else
748      break;      break;
749    
750      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
751      other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,      other alphanumeric following \ is an error if PCRE_EXTRA was set;
752      for Perl compatibility, it is a literal. This code looks a bit odd, but      otherwise, for Perl compatibility, it is a literal. This code looks a bit
753      there used to be some cases other than the default, and there may be again      odd, but there used to be some cases other than the default, and there may
754      in future, so I haven't "optimized" it. */      be again in future, so I haven't "optimized" it. */
755    
756      default:      default:
757      if ((options & PCRE_EXTRA) != 0) switch(c)      if ((options & PCRE_EXTRA) != 0) switch(c)
# Line 1506  for (;;) Line 1532  for (;;)
1532  can match the empty string or not. It is called from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1533  below and from compile_branch() when checking for an unlimited repeat of a  below and from compile_branch() when checking for an unlimited repeat of a
1534  group that can match nothing. Note that first_significant_code() skips over  group that can match nothing. Note that first_significant_code() skips over
1535  assertions. If we hit an unclosed bracket, we return "empty" - this means we've  backward and negative forward assertions when its final argument is TRUE. If we
1536  struck an inner bracket whose current branch will already have been scanned.  hit an unclosed bracket, we return "empty" - this means we've struck an inner
1537    bracket whose current branch will already have been scanned.
1538    
1539  Arguments:  Arguments:
1540    code        points to start of search    code        points to start of search
# Line 1529  for (code = first_significant_code(code Line 1556  for (code = first_significant_code(code
1556    
1557    c = *code;    c = *code;
1558    
1559      /* Skip over forward assertions; the other assertions are skipped by
1560      first_significant_code() with a TRUE final argument. */
1561    
1562      if (c == OP_ASSERT)
1563        {
1564        do code += GET(code, 1); while (*code == OP_ALT);
1565        c = *code;
1566        continue;
1567        }
1568    
1569    /* Groups with zero repeats can of course be empty; skip them. */    /* Groups with zero repeats can of course be empty; skip them. */
1570    
1571    if (c == OP_BRAZERO || c == OP_BRAMINZERO)    if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1572      {      {
1573      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
1574      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
# Line 1724  return TRUE; Line 1761  return TRUE;
1761  *************************************************/  *************************************************/
1762    
1763  /* This function is called when the sequence "[:" or "[." or "[=" is  /* This function is called when the sequence "[:" or "[." or "[=" is
1764  encountered in a character class. It checks whether this is followed by an  encountered in a character class. It checks whether this is followed by a
1765  optional ^ and then a sequence of letters, terminated by a matching ":]" or  sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1766  ".]" or "=]".  reach an unescaped ']' without the special preceding character, return FALSE.
1767    
1768    Originally, this function only recognized a sequence of letters between the
1769    terminators, but it seems that Perl recognizes any sequence of characters,
1770    though of course unknown POSIX names are subsequently rejected. Perl gives an
1771    "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1772    didn't consider this to be a POSIX class. Likewise for [:1234:].
1773    
1774    The problem in trying to be exactly like Perl is in the handling of escapes. We
1775    have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1776    class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1777    below handles the special case of \], but does not try to do any other escape
1778    processing. This makes it different from Perl for cases such as [:l\ower:]
1779    where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1780    "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
1781    I think.
1782    
1783  Argument:  Arguments:
1784    ptr      pointer to the initial [    ptr      pointer to the initial [
1785    endptr   where to return the end pointer    endptr   where to return the end pointer
   cd       pointer to compile data  
1786    
1787  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
1788  */  */
1789    
1790  static BOOL  static BOOL
1791  check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)  check_posix_syntax(const uschar *ptr, const uschar **endptr)
1792  {  {
1793  int terminator;          /* Don't combine these lines; the Solaris cc */  int terminator;          /* Don't combine these lines; the Solaris cc */
1794  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1795  if (*(++ptr) == '^') ptr++;  for (++ptr; *ptr != 0; ptr++)
 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;  
 if (*ptr == terminator && ptr[1] == ']')  
1796    {    {
1797    *endptr = ptr;    if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1798    return TRUE;      {
1799        if (*ptr == ']') return FALSE;
1800        if (*ptr == terminator && ptr[1] == ']')
1801          {
1802          *endptr = ptr;
1803          return TRUE;
1804          }
1805        }
1806    }    }
1807  return FALSE;  return FALSE;
1808  }  }
# Line 1792  return -1; Line 1848  return -1;
1848  that is referenced. This means that groups can be replicated for fixed  that is referenced. This means that groups can be replicated for fixed
1849  repetition simply by copying (because the recursion is allowed to refer to  repetition simply by copying (because the recursion is allowed to refer to
1850  earlier groups that are outside the current group). However, when a group is  earlier groups that are outside the current group). However, when a group is
1851  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
1852  it, after it has been compiled. This means that any OP_RECURSE items within it  inserted before it, after it has been compiled. This means that any OP_RECURSE
1853  that refer to the group itself or any contained groups have to have their  items within it that refer to the group itself or any contained groups have to
1854  offsets adjusted. That is one of the jobs of this function. Before it is called,  have their offsets adjusted. That is one of the jobs of this function. Before it
1855  the partially compiled regex must be temporarily terminated with OP_END.  is called, the partially compiled regex must be temporarily terminated with
1856    OP_END.
1857    
1858  This function has been extended with the possibility of forward references for  This function has been extended with the possibility of forward references for
1859  recursions and subroutine calls. It must also check the list of such references  recursions and subroutine calls. It must also check the list of such references
# Line 1827  while ((ptr = (uschar *)find_recurse(ptr Line 1884  while ((ptr = (uschar *)find_recurse(ptr
1884    
1885    /* See if this recursion is on the forward reference list. If so, adjust the    /* See if this recursion is on the forward reference list. If so, adjust the
1886    reference. */    reference. */
1887    
1888    for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)    for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1889      {      {
1890      offset = GET(hc, 0);      offset = GET(hc, 0);
# Line 2081  if (next >= 0) switch(op_code) Line 2138  if (next >= 0) switch(op_code)
2138    /* For OP_NOT, "item" must be a single-byte character. */    /* For OP_NOT, "item" must be a single-byte character. */
2139    
2140    case OP_NOT:    case OP_NOT:
   if (next < 0) return FALSE;  /* Not a character */  
2141    if (item == next) return TRUE;    if (item == next) return TRUE;
2142    if ((options & PCRE_CASELESS) == 0) return FALSE;    if ((options & PCRE_CASELESS) == 0) return FALSE;
2143  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 2344  uschar classbits[32]; Line 2400  uschar classbits[32];
2400  BOOL class_utf8;  BOOL class_utf8;
2401  BOOL utf8 = (options & PCRE_UTF8) != 0;  BOOL utf8 = (options & PCRE_UTF8) != 0;
2402  uschar *class_utf8data;  uschar *class_utf8data;
2403    uschar *class_utf8data_base;
2404  uschar utf8_char[6];  uschar utf8_char[6];
2405  #else  #else
2406  BOOL utf8 = FALSE;  BOOL utf8 = FALSE;
# Line 2383  req_caseopt = ((options & PCRE_CASELESS) Line 2440  req_caseopt = ((options & PCRE_CASELESS)
2440  for (;; ptr++)  for (;; ptr++)
2441    {    {
2442    BOOL negate_class;    BOOL negate_class;
2443    BOOL should_flip_negation;    BOOL should_flip_negation;
2444    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2445    BOOL is_quantifier;    BOOL is_quantifier;
2446    BOOL is_recurse;    BOOL is_recurse;
# Line 2403  for (;; ptr++) Line 2460  for (;; ptr++)
2460    /* Get next byte in the pattern */    /* Get next byte in the pattern */
2461    
2462    c = *ptr;    c = *ptr;
2463    
2464    /* If we are in the pre-compile phase, accumulate the length used for the    /* If we are in the pre-compile phase, accumulate the length used for the
2465    previous cycle of this loop. */    previous cycle of this loop. */
2466    
# Line 2598  for (;; ptr++) Line 2655  for (;; ptr++)
2655      opcode is compiled. It may optionally have a bit map for characters < 256,      opcode is compiled. It may optionally have a bit map for characters < 256,
2656      but those above are explicitly listed afterwards. A flag byte tells      but those above are explicitly listed afterwards. A flag byte tells
2657      whether the bitmap is present, and whether this is a negated class or not.      whether the bitmap is present, and whether this is a negated class or not.
2658      */  
2659        In JavaScript compatibility mode, an isolated ']' causes an error. In
2660        default (Perl) mode, it is treated as a data character. */
2661    
2662        case ']':
2663        if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2664          {
2665          *errorcodeptr = ERR64;
2666          goto FAILED;
2667          }
2668        goto NORMAL_CHAR;
2669    
2670      case '[':      case '[':
2671      previous = code;      previous = code;
# Line 2607  for (;; ptr++) Line 2674  for (;; ptr++)
2674      they are encountered at the top level, so we'll do that too. */      they are encountered at the top level, so we'll do that too. */
2675    
2676      if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&      if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2677          check_posix_syntax(ptr, &tempptr, cd))          check_posix_syntax(ptr, &tempptr))
2678        {        {
2679        *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;        *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2680        goto FAILED;        goto FAILED;
# Line 2632  for (;; ptr++) Line 2699  for (;; ptr++)
2699        else break;        else break;
2700        }        }
2701    
2702      /* If a class contains a negative special such as \S, we need to flip the      /* If a class contains a negative special such as \S, we need to flip the
2703      negation flag at the end, so that support for characters > 255 works      negation flag at the end, so that support for characters > 255 works
2704      correctly (they are all included in the class). */      correctly (they are all included in the class). */
2705    
2706      should_flip_negation = FALSE;      should_flip_negation = FALSE;
# Line 2655  for (;; ptr++) Line 2722  for (;; ptr++)
2722  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2723      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2724      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2725        class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
2726  #endif  #endif
2727    
2728      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
# Line 2670  for (;; ptr++) Line 2738  for (;; ptr++)
2738          {                           /* Braces are required because the */          {                           /* Braces are required because the */
2739          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
2740          }          }
2741    
2742          /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2743          data and reset the pointer. This is so that very large classes that
2744          contain a zillion UTF-8 characters no longer overwrite the work space
2745          (which is on the stack). */
2746    
2747          if (lengthptr != NULL)
2748            {
2749            *lengthptr += class_utf8data - class_utf8data_base;
2750            class_utf8data = class_utf8data_base;
2751            }
2752    
2753  #endif  #endif
2754    
2755        /* Inside \Q...\E everything is literal except \E */        /* Inside \Q...\E everything is literal except \E */
# Line 2693  for (;; ptr++) Line 2773  for (;; ptr++)
2773    
2774        if (c == '[' &&        if (c == '[' &&
2775            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2776            check_posix_syntax(ptr, &tempptr, cd))            check_posix_syntax(ptr, &tempptr))
2777          {          {
2778          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
2779          int posix_class, taboffset, tabopt;          int posix_class, taboffset, tabopt;
# Line 2710  for (;; ptr++) Line 2790  for (;; ptr++)
2790          if (*ptr == '^')          if (*ptr == '^')
2791            {            {
2792            local_negate = TRUE;            local_negate = TRUE;
2793            should_flip_negation = TRUE;  /* Note negative special */            should_flip_negation = TRUE;  /* Note negative special */
2794            ptr++;            ptr++;
2795            }            }
2796    
# Line 2785  for (;; ptr++) Line 2865  for (;; ptr++)
2865          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2866          if (*errorcodeptr != 0) goto FAILED;          if (*errorcodeptr != 0) goto FAILED;
2867    
2868          if (-c == ESC_b) c = '\b';       /* \b is backslash in a class */          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2869          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2870          else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */          else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2871          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
# Line 2813  for (;; ptr++) Line 2893  for (;; ptr++)
2893              continue;              continue;
2894    
2895              case ESC_D:              case ESC_D:
2896              should_flip_negation = TRUE;              should_flip_negation = TRUE;
2897              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2898              continue;              continue;
2899    
# Line 2822  for (;; ptr++) Line 2902  for (;; ptr++)
2902              continue;              continue;
2903    
2904              case ESC_W:              case ESC_W:
2905              should_flip_negation = TRUE;              should_flip_negation = TRUE;
2906              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2907              continue;              continue;
2908    
# Line 2832  for (;; ptr++) Line 2912  for (;; ptr++)
2912              continue;              continue;
2913    
2914              case ESC_S:              case ESC_S:
2915              should_flip_negation = TRUE;              should_flip_negation = TRUE;
2916              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2917              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2918              continue;              continue;
2919    
             case ESC_E: /* Perl ignores an orphan \E */  
             continue;  
   
2920              default:    /* Not recognized; fall through */              default:    /* Not recognized; fall through */
2921              break;      /* Need "default" setting to stop compiler warning. */              break;      /* Need "default" setting to stop compiler warning. */
2922              }              }
# Line 3074  for (;; ptr++) Line 3151  for (;; ptr++)
3151            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3152            if (*errorcodeptr != 0) goto FAILED;            if (*errorcodeptr != 0) goto FAILED;
3153    
3154            /* \b is backslash; \X is literal X; \R is literal R; any other            /* \b is backspace; \X is literal X; \R is literal R; any other
3155            special means the '-' was literal */            special means the '-' was literal */
3156    
3157            if (d < 0)            if (d < 0)
# Line 3338  we set the flag only if there is a liter Line 3415  we set the flag only if there is a liter
3415      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
3416    
3417      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
3418      extended class, with its own opcode, unless there was a negated special      extended class, with its own opcode, unless there was a negated special
3419      such as \S in the class, because in that case all characters > 255 are in      such as \S in the class, because in that case all characters > 255 are in
3420      the class, so any that were explicitly given as well can be ignored. If      the class, so any that were explicitly given as well can be ignored. If
3421      (when there are explicit characters > 255 that must be listed) there are no      (when there are explicit characters > 255 that must be listed) there are no
3422      characters < 256, we can omit the bitmap in the actual compiled code. */      characters < 256, we can omit the bitmap in the actual compiled code. */
3423    
# Line 3371  we set the flag only if there is a liter Line 3448  we set the flag only if there is a liter
3448        }        }
3449  #endif  #endif
3450    
3451      /* If there are no characters > 255, set the opcode to OP_CLASS or      /* If there are no characters > 255, set the opcode to OP_CLASS or
3452      OP_NCLASS, depending on whether the whole class was negated and whether      OP_NCLASS, depending on whether the whole class was negated and whether
3453      there were negative specials such as \S in the class. Then copy the 32-byte      there were negative specials such as \S in the class. Then copy the 32-byte
3454      map into the code vector, negating it if necessary. */      map into the code vector, negating it if necessary. */
3455    
3456      *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;      *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3457      if (negate_class)      if (negate_class)
3458        {        {
# Line 3777  we set the flag only if there is a liter Line 3854  we set the flag only if there is a liter
3854    
3855        if (repeat_min == 0)        if (repeat_min == 0)
3856          {          {
3857          /* If the maximum is also zero, we just omit the group from the output          /* If the maximum is also zero, we used to just omit the group from the
3858          altogether. */          output altogether, like this:
3859    
3860          if (repeat_max == 0)          ** if (repeat_max == 0)
3861            {          **   {
3862            code = previous;          **   code = previous;
3863            goto END_REPEAT;          **   goto END_REPEAT;
3864            }          **   }
3865    
3866          /* If the maximum is 1 or unlimited, we just have to stick in the          However, that fails when a group is referenced as a subroutine from
3867          BRAZERO and do no more at this point. However, we do need to adjust          elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
3868          any OP_RECURSE calls inside the group that refer to the group itself or          so that it is skipped on execution. As we don't have a list of which
3869          any internal or forward referenced group, because the offset is from          groups are referenced, we cannot do this selectively.
3870          the start of the whole regex. Temporarily terminate the pattern while  
3871          doing this. */          If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
3872            and do no more at this point. However, we do need to adjust any
3873            OP_RECURSE calls inside the group that refer to the group itself or any
3874            internal or forward referenced group, because the offset is from the
3875            start of the whole regex. Temporarily terminate the pattern while doing
3876            this. */
3877    
3878          if (repeat_max <= 1)          if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
3879            {            {
3880            *code = OP_END;            *code = OP_END;
3881            adjust_recurse(previous, 1, utf8, cd, save_hwm);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3882            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3883            code++;            code++;
3884              if (repeat_max == 0)
3885                {
3886                *previous++ = OP_SKIPZERO;
3887                goto END_REPEAT;
3888                }
3889            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
3890            }            }
3891    
# Line 4019  we set the flag only if there is a liter Line 4106  we set the flag only if there is a liter
4106        int len;        int len;
4107        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4108            *tempcode == OP_NOTEXACT)            *tempcode == OP_NOTEXACT)
4109          tempcode += _pcre_OP_lengths[*tempcode];          tempcode += _pcre_OP_lengths[*tempcode] +
4110              ((*tempcode == OP_TYPEEXACT &&
4111                 (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4112        len = code - tempcode;        len = code - tempcode;
4113        if (len > 0) switch (*tempcode)        if (len > 0) switch (*tempcode)
4114          {          {
# Line 4071  we set the flag only if there is a liter Line 4160  we set the flag only if there is a liter
4160      bravalue = OP_CBRA;      bravalue = OP_CBRA;
4161      save_hwm = cd->hwm;      save_hwm = cd->hwm;
4162      reset_bracount = FALSE;      reset_bracount = FALSE;
4163    
4164      /* First deal with various "verbs" that can be introduced by '*'. */      /* First deal with various "verbs" that can be introduced by '*'. */
4165    
4166      if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)      if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
# Line 4246  we set the flag only if there is a liter Line 4335  we set the flag only if there is a liter
4335              *errorcodeptr = ERR58;              *errorcodeptr = ERR58;
4336              goto FAILED;              goto FAILED;
4337              }              }
4338            if (refsign == '-')            recno = (refsign == '-')?
4339                cd->bracount - recno + 1 : recno +cd->bracount;
4340              if (recno <= 0 || recno > cd->final_bracount)
4341              {              {
4342              recno = cd->bracount - recno + 1;              *errorcodeptr = ERR15;
4343              if (recno <= 0)              goto FAILED;
               {  
               *errorcodeptr = ERR15;  
               goto FAILED;  
               }  
4344              }              }
           else recno += cd->bracount;  
4345            PUT2(code, 2+LINK_SIZE, recno);            PUT2(code, 2+LINK_SIZE, recno);
4346            break;            break;
4347            }            }
# Line 4327  we set the flag only if there is a liter Line 4413  we set the flag only if there is a liter
4413            skipbytes = 1;            skipbytes = 1;
4414            }            }
4415    
4416          /* Check for the "name" actually being a subpattern number. */          /* Check for the "name" actually being a subpattern number. We are
4417            in the second pass here, so final_bracount is set. */
4418    
4419          else if (recno > 0)          else if (recno > 0 && recno <= cd->final_bracount)
4420            {            {
4421            PUT2(code, 2+LINK_SIZE, recno);            PUT2(code, 2+LINK_SIZE, recno);
4422            }            }
# Line 4523  we set the flag only if there is a liter Line 4610  we set the flag only if there is a liter
4610    
4611          /* We come here from the Python syntax above that handles both          /* We come here from the Python syntax above that handles both
4612          references (?P=name) and recursion (?P>name), as well as falling          references (?P=name) and recursion (?P>name), as well as falling
4613          through from the Perl recursion syntax (?&name). */          through from the Perl recursion syntax (?&name). We also come here from
4614            the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4615            .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
4616    
4617          NAMED_REF_OR_RECURSE:          NAMED_REF_OR_RECURSE:
4618          name = ++ptr;          name = ++ptr;
# Line 4535  we set the flag only if there is a liter Line 4624  we set the flag only if there is a liter
4624    
4625          if (lengthptr != NULL)          if (lengthptr != NULL)
4626            {            {
4627              if (namelen == 0)
4628                {
4629                *errorcodeptr = ERR62;
4630                goto FAILED;
4631                }
4632            if (*ptr != terminator)            if (*ptr != terminator)
4633              {              {
4634              *errorcodeptr = ERR42;              *errorcodeptr = ERR42;
# Line 4548  we set the flag only if there is a liter Line 4642  we set the flag only if there is a liter
4642            recno = 0;            recno = 0;
4643            }            }
4644    
4645          /* In the real compile, seek the name in the table */          /* In the real compile, seek the name in the table. We check the name
4646            first, and then check that we have reached the end of the name in the
4647            table. That way, if the name is longer than any in the table,
4648            the comparison will fail without reading beyond the table entry. */
4649    
4650          else          else
4651            {            {
4652            slot = cd->name_table;            slot = cd->name_table;
4653            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
4654              {              {
4655              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;              if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
4656                    slot[2+namelen] == 0)
4657                  break;
4658              slot += cd->name_entry_size;              slot += cd->name_entry_size;
4659              }              }
4660    
# Line 4591  we set the flag only if there is a liter Line 4690  we set the flag only if there is a liter
4690          case '5': case '6': case '7': case '8': case '9':   /* subroutine */          case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4691            {            {
4692            const uschar *called;            const uschar *called;
4693              terminator = ')';
4694    
4695              /* Come here from the \g<...> and \g'...' code (Oniguruma
4696              compatibility). However, the syntax has been checked to ensure that
4697              the ... are a (signed) number, so that neither ERR63 nor ERR29 will
4698              be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
4699              ever be taken. */
4700    
4701              HANDLE_NUMERICAL_RECURSION:
4702    
4703            if ((refsign = *ptr) == '+') ptr++;            if ((refsign = *ptr) == '+')
4704                {
4705                ptr++;
4706                if ((digitab[*ptr] & ctype_digit) == 0)
4707                  {
4708                  *errorcodeptr = ERR63;
4709                  goto FAILED;
4710                  }
4711                }
4712            else if (refsign == '-')            else if (refsign == '-')
4713              {              {
4714              if ((digitab[ptr[1]] & ctype_digit) == 0)              if ((digitab[ptr[1]] & ctype_digit) == 0)
# Line 4604  we set the flag only if there is a liter Line 4720  we set the flag only if there is a liter
4720            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4721              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4722    
4723            if (*ptr != ')')            if (*ptr != terminator)
4724              {              {
4725              *errorcodeptr = ERR29;              *errorcodeptr = ERR29;
4726              goto FAILED;              goto FAILED;
# Line 5000  we set the flag only if there is a liter Line 5116  we set the flag only if there is a liter
5116      back references and those types that consume a character may be repeated.      back references and those types that consume a character may be repeated.
5117      We can test for values between ESC_b and ESC_Z for the latter; this may      We can test for values between ESC_b and ESC_Z for the latter; this may
5118      have to change if any new ones are ever created. */      have to change if any new ones are ever created. */
5119    
5120      case '\\':      case '\\':
5121      tempptr = ptr;      tempptr = ptr;
5122      c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);      c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
# Line 5027  we set the flag only if there is a liter Line 5143  we set the flag only if there is a liter
5143    
5144        zerofirstbyte = firstbyte;        zerofirstbyte = firstbyte;
5145        zeroreqbyte = reqbyte;        zeroreqbyte = reqbyte;
5146    
5147          /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5148          is a subroutine call by number (Oniguruma syntax). In fact, the value
5149          -ESC_g is returned only for these cases. So we don't need to check for <
5150          or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5151          -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5152          that is a synonym for a named back reference). */
5153    
5154          if (-c == ESC_g)
5155            {
5156            const uschar *p;
5157            save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
5158            terminator = (*(++ptr) == '<')? '>' : '\'';
5159    
5160            /* These two statements stop the compiler from warning about possibly
5161            unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5162            fact, because we actually check for a number below, the paths that
5163            would actually be in error are never taken. */
5164    
5165            skipbytes = 0;
5166            reset_bracount = FALSE;
5167    
5168            /* Test for a name */
5169    
5170            if (ptr[1] != '+' && ptr[1] != '-')
5171              {
5172              BOOL isnumber = TRUE;
5173              for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5174                {
5175                if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5176                if ((cd->ctypes[*p] & ctype_word) == 0) break;
5177                }
5178              if (*p != terminator)
5179                {
5180                *errorcodeptr = ERR57;
5181                break;
5182                }
5183              if (isnumber)
5184                {
5185                ptr++;
5186                goto HANDLE_NUMERICAL_RECURSION;
5187                }
5188              is_recurse = TRUE;
5189              goto NAMED_REF_OR_RECURSE;
5190              }
5191    
5192            /* Test a signed number in angle brackets or quotes. */
5193    
5194            p = ptr + 2;
5195            while ((digitab[*p] & ctype_digit) != 0) p++;
5196            if (*p != terminator)
5197              {
5198              *errorcodeptr = ERR57;
5199              break;
5200              }
5201            ptr++;
5202            goto HANDLE_NUMERICAL_RECURSION;
5203            }
5204    
5205        /* \k<name> or \k'name' is a back reference by name (Perl syntax).        /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5206        We also support \k{name} (.NET syntax) */        We also support \k{name} (.NET syntax) */
# Line 5758  to fill in forward references to subpatt Line 5932  to fill in forward references to subpatt
5932    
5933  uschar cworkspace[COMPILE_WORK_SIZE];  uschar cworkspace[COMPILE_WORK_SIZE];
5934    
   
5935  /* Set this early so that early errors get offset 0. */  /* Set this early so that early errors get offset 0. */
5936    
5937  ptr = (const uschar *)pattern;  ptr = (const uschar *)pattern;
# Line 5919  to compile parts of the pattern into; th Line 6092  to compile parts of the pattern into; th
6092  no longer needed, so hopefully this workspace will never overflow, though there  no longer needed, so hopefully this workspace will never overflow, though there
6093  is a test for its doing so. */  is a test for its doing so. */
6094    
6095  cd->bracount = 0;  cd->bracount = cd->final_bracount = 0;
6096  cd->names_found = 0;  cd->names_found = 0;
6097  cd->name_entry_size = 0;  cd->name_entry_size = 0;
6098  cd->name_table = NULL;  cd->name_table = NULL;
# Line 5996  field. Reset the bracket count and the n Line 6169  field. Reset the bracket count and the n
6169  field; this time it's used for remembering forward references to subpatterns.  field; this time it's used for remembering forward references to subpatterns.
6170  */  */
6171    
6172    cd->final_bracount = cd->bracount;  /* Save for checking forward references */
6173  cd->bracount = 0;  cd->bracount = 0;
6174  cd->names_found = 0;  cd->names_found = 0;
6175  cd->name_table = (uschar *)re + re->name_table_offset;  cd->name_table = (uschar *)re + re->name_table_offset;
# Line 6046  while (errorcode == 0 && cd->hwm > cwork Line 6220  while (errorcode == 0 && cd->hwm > cwork
6220    if (groupptr == NULL) errorcode = ERR53;    if (groupptr == NULL) errorcode = ERR53;
6221      else PUT(((uschar *)codestart), offset, groupptr - codestart);      else PUT(((uschar *)codestart), offset, groupptr - codestart);
6222    }    }
6223    
6224  /* Give an error if there's a back reference to a non-existent capturing  /* Give an error if there's a back reference to a non-existent capturing
6225  subpattern. */  subpattern. */
6226    

Legend:
Removed from v.265  
changed lines
  Added in v.336

  ViewVC Help
Powered by ViewVC 1.1.5