/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 305 by ph10, Sun Jan 20 20:07:32 2008 UTC revision 336 by ph10, Sat Apr 12 15:59:03 2008 UTC
# Line 158  static const char verbnames[] = Line 158  static const char verbnames[] =
158    "SKIP\0"    "SKIP\0"
159    "THEN";    "THEN";
160    
161  static verbitem verbs[] = {  static const verbitem verbs[] = {
162    { 6, OP_ACCEPT },    { 6, OP_ACCEPT },
163    { 6, OP_COMMIT },    { 6, OP_COMMIT },
164    { 1, OP_FAIL },    { 1, OP_FAIL },
# Line 168  static verbitem verbs[] = { Line 168  static verbitem verbs[] = {
168    { 4, OP_THEN  }    { 4, OP_THEN  }
169  };  };
170    
171  static int verbcount = sizeof(verbs)/sizeof(verbitem);  static const int verbcount = sizeof(verbs)/sizeof(verbitem);
172    
173    
174  /* Tables of names of POSIX character classes and their lengths. The names are  /* Tables of names of POSIX character classes and their lengths. The names are
# Line 295  static const char error_texts[] = Line 295  static const char error_texts[] =
295    /* 55 */    /* 55 */
296    "repeating a DEFINE group is not allowed\0"    "repeating a DEFINE group is not allowed\0"
297    "inconsistent NEWLINE options\0"    "inconsistent NEWLINE options\0"
298    "\\g is not followed by a braced name or an optionally braced non-zero number\0"    "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
299    "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0"    "a numbered reference must not be zero\0"
300    "(*VERB) with an argument is not supported\0"    "(*VERB) with an argument is not supported\0"
301    /* 60 */    /* 60 */
302    "(*VERB) not recognized\0"    "(*VERB) not recognized\0"
303    "number is too big\0"    "number is too big\0"
304    "subpattern name expected\0"    "subpattern name expected\0"
305    "digit expected after (?+";    "digit expected after (?+\0"
306      "] is an invalid data character in JavaScript compatibility mode";
307    
308    
309  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 531  else Line 532  else
532      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
533      break;      break;
534    
535      /* \g must be followed by a number, either plain or braced. If positive, it      /* \g must be followed by one of a number of specific things:
536      is an absolute backreference. If negative, it is a relative backreference.  
537      This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a      (1) A number, either plain or braced. If positive, it is an absolute
538      reference to a named group. This is part of Perl's movement towards a      backreference. If negative, it is a relative backreference. This is a Perl
539      unified syntax for back references. As this is synonymous with \k{name}, we      5.10 feature.
540      fudge it up by pretending it really was \k. */  
541        (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
542        is part of Perl's movement towards a unified syntax for back references. As
543        this is synonymous with \k{name}, we fudge it up by pretending it really
544        was \k.
545    
546        (3) For Oniguruma compatibility we also support \g followed by a name or a
547        number either in angle brackets or in single quotes. However, these are
548        (possibly recursive) subroutine calls, _not_ backreferences. Just return
549        the -ESC_g code (cf \k). */
550    
551      case 'g':      case 'g':
552        if (ptr[1] == '<' || ptr[1] == '\'')
553          {
554          c = -ESC_g;
555          break;
556          }
557    
558        /* Handle the Perl-compatible cases */
559    
560      if (ptr[1] == '{')      if (ptr[1] == '{')
561        {        {
562        const uschar *p;        const uschar *p;
# Line 565  else Line 583  else
583      while ((digitab[ptr[1]] & ctype_digit) != 0)      while ((digitab[ptr[1]] & ctype_digit) != 0)
584        c = c * 10 + *(++ptr) - '0';        c = c * 10 + *(++ptr) - '0';
585    
586      if (c < 0)      if (c < 0)   /* Integer overflow */
587        {        {
588        *errorcodeptr = ERR61;        *errorcodeptr = ERR61;
589        break;        break;
590        }        }
591    
592      if (c == 0 || (braced && *(++ptr) != '}'))      if (braced && *(++ptr) != '}')
593        {        {
594        *errorcodeptr = ERR57;        *errorcodeptr = ERR57;
595        break;        break;
596        }        }
597    
598        if (c == 0)
599          {
600          *errorcodeptr = ERR58;
601          break;
602          }
603    
604      if (negated)      if (negated)
605        {        {
# Line 611  else Line 635  else
635        c -= '0';        c -= '0';
636        while ((digitab[ptr[1]] & ctype_digit) != 0)        while ((digitab[ptr[1]] & ctype_digit) != 0)
637          c = c * 10 + *(++ptr) - '0';          c = c * 10 + *(++ptr) - '0';
638        if (c < 0)        if (c < 0)    /* Integer overflow */
639          {          {
640          *errorcodeptr = ERR61;          *errorcodeptr = ERR61;
641          break;          break;
# Line 1544  for (code = first_significant_code(code Line 1568  for (code = first_significant_code(code
1568    
1569    /* Groups with zero repeats can of course be empty; skip them. */    /* Groups with zero repeats can of course be empty; skip them. */
1570    
1571    if (c == OP_BRAZERO || c == OP_BRAMINZERO)    if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1572      {      {
1573      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
1574      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
# Line 1824  return -1; Line 1848  return -1;
1848  that is referenced. This means that groups can be replicated for fixed  that is referenced. This means that groups can be replicated for fixed
1849  repetition simply by copying (because the recursion is allowed to refer to  repetition simply by copying (because the recursion is allowed to refer to
1850  earlier groups that are outside the current group). However, when a group is  earlier groups that are outside the current group). However, when a group is
1851  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
1852  it, after it has been compiled. This means that any OP_RECURSE items within it  inserted before it, after it has been compiled. This means that any OP_RECURSE
1853  that refer to the group itself or any contained groups have to have their  items within it that refer to the group itself or any contained groups have to
1854  offsets adjusted. That is one of the jobs of this function. Before it is called,  have their offsets adjusted. That is one of the jobs of this function. Before it
1855  the partially compiled regex must be temporarily terminated with OP_END.  is called, the partially compiled regex must be temporarily terminated with
1856    OP_END.
1857    
1858  This function has been extended with the possibility of forward references for  This function has been extended with the possibility of forward references for
1859  recursions and subroutine calls. It must also check the list of such references  recursions and subroutine calls. It must also check the list of such references
# Line 1859  while ((ptr = (uschar *)find_recurse(ptr Line 1884  while ((ptr = (uschar *)find_recurse(ptr
1884    
1885    /* See if this recursion is on the forward reference list. If so, adjust the    /* See if this recursion is on the forward reference list. If so, adjust the
1886    reference. */    reference. */
1887    
1888    for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)    for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1889      {      {
1890      offset = GET(hc, 0);      offset = GET(hc, 0);
# Line 2113  if (next >= 0) switch(op_code) Line 2138  if (next >= 0) switch(op_code)
2138    /* For OP_NOT, "item" must be a single-byte character. */    /* For OP_NOT, "item" must be a single-byte character. */
2139    
2140    case OP_NOT:    case OP_NOT:
   if (next < 0) return FALSE;  /* Not a character */  
2141    if (item == next) return TRUE;    if (item == next) return TRUE;
2142    if ((options & PCRE_CASELESS) == 0) return FALSE;    if ((options & PCRE_CASELESS) == 0) return FALSE;
2143  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 2436  for (;; ptr++) Line 2460  for (;; ptr++)
2460    /* Get next byte in the pattern */    /* Get next byte in the pattern */
2461    
2462    c = *ptr;    c = *ptr;
2463    
2464    /* If we are in the pre-compile phase, accumulate the length used for the    /* If we are in the pre-compile phase, accumulate the length used for the
2465    previous cycle of this loop. */    previous cycle of this loop. */
2466    
# Line 2631  for (;; ptr++) Line 2655  for (;; ptr++)
2655      opcode is compiled. It may optionally have a bit map for characters < 256,      opcode is compiled. It may optionally have a bit map for characters < 256,
2656      but those above are explicitly listed afterwards. A flag byte tells      but those above are explicitly listed afterwards. A flag byte tells
2657      whether the bitmap is present, and whether this is a negated class or not.      whether the bitmap is present, and whether this is a negated class or not.
2658      */  
2659        In JavaScript compatibility mode, an isolated ']' causes an error. In
2660        default (Perl) mode, it is treated as a data character. */
2661    
2662        case ']':
2663        if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2664          {
2665          *errorcodeptr = ERR64;
2666          goto FAILED;
2667          }
2668        goto NORMAL_CHAR;
2669    
2670      case '[':      case '[':
2671      previous = code;      previous = code;
# Line 2688  for (;; ptr++) Line 2722  for (;; ptr++)
2722  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2723      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2724      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2725      class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */      class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
2726  #endif  #endif
2727    
2728      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
# Line 2704  for (;; ptr++) Line 2738  for (;; ptr++)
2738          {                           /* Braces are required because the */          {                           /* Braces are required because the */
2739          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
2740          }          }
2741    
2742        /* In the pre-compile phase, accumulate the length of any UTF-8 extra        /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2743        data and reset the pointer. This is so that very large classes that        data and reset the pointer. This is so that very large classes that
2744        contain a zillion UTF-8 characters no longer overwrite the work space        contain a zillion UTF-8 characters no longer overwrite the work space
2745        (which is on the stack). */        (which is on the stack). */
2746    
2747        if (lengthptr != NULL)        if (lengthptr != NULL)
2748          {          {
2749          *lengthptr += class_utf8data - class_utf8data_base;          *lengthptr += class_utf8data - class_utf8data_base;
2750          class_utf8data = class_utf8data_base;          class_utf8data = class_utf8data_base;
2751          }          }
2752    
2753  #endif  #endif
2754    
2755        /* Inside \Q...\E everything is literal except \E */        /* Inside \Q...\E everything is literal except \E */
# Line 3820  we set the flag only if there is a liter Line 3854  we set the flag only if there is a liter
3854    
3855        if (repeat_min == 0)        if (repeat_min == 0)
3856          {          {
3857          /* If the maximum is also zero, we just omit the group from the output          /* If the maximum is also zero, we used to just omit the group from the
3858          altogether. */          output altogether, like this:
3859    
3860          if (repeat_max == 0)          ** if (repeat_max == 0)
3861            {          **   {
3862            code = previous;          **   code = previous;
3863            goto END_REPEAT;          **   goto END_REPEAT;
3864            }          **   }
3865    
3866            However, that fails when a group is referenced as a subroutine from
3867            elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
3868            so that it is skipped on execution. As we don't have a list of which
3869            groups are referenced, we cannot do this selectively.
3870    
3871            If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
3872            and do no more at this point. However, we do need to adjust any
3873            OP_RECURSE calls inside the group that refer to the group itself or any
3874            internal or forward referenced group, because the offset is from the
3875            start of the whole regex. Temporarily terminate the pattern while doing
3876            this. */
3877    
3878          /* If the maximum is 1 or unlimited, we just have to stick in the          if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
         BRAZERO and do no more at this point. However, we do need to adjust  
         any OP_RECURSE calls inside the group that refer to the group itself or  
         any internal or forward referenced group, because the offset is from  
         the start of the whole regex. Temporarily terminate the pattern while  
         doing this. */  
   
         if (repeat_max <= 1)  
3879            {            {
3880            *code = OP_END;            *code = OP_END;
3881            adjust_recurse(previous, 1, utf8, cd, save_hwm);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3882            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3883            code++;            code++;
3884              if (repeat_max == 0)
3885                {
3886                *previous++ = OP_SKIPZERO;
3887                goto END_REPEAT;
3888                }
3889            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
3890            }            }
3891    
# Line 4116  we set the flag only if there is a liter Line 4160  we set the flag only if there is a liter
4160      bravalue = OP_CBRA;      bravalue = OP_CBRA;
4161      save_hwm = cd->hwm;      save_hwm = cd->hwm;
4162      reset_bracount = FALSE;      reset_bracount = FALSE;
4163    
4164      /* First deal with various "verbs" that can be introduced by '*'. */      /* First deal with various "verbs" that can be introduced by '*'. */
4165    
4166      if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)      if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
# Line 4568  we set the flag only if there is a liter Line 4612  we set the flag only if there is a liter
4612          references (?P=name) and recursion (?P>name), as well as falling          references (?P=name) and recursion (?P>name), as well as falling
4613          through from the Perl recursion syntax (?&name). We also come here from          through from the Perl recursion syntax (?&name). We also come here from
4614          the Perl \k<name> or \k'name' back reference syntax and the \k{name}          the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4615          .NET syntax. */          .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
4616    
4617          NAMED_REF_OR_RECURSE:          NAMED_REF_OR_RECURSE:
4618          name = ++ptr;          name = ++ptr;
# Line 4646  we set the flag only if there is a liter Line 4690  we set the flag only if there is a liter
4690          case '5': case '6': case '7': case '8': case '9':   /* subroutine */          case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4691            {            {
4692            const uschar *called;            const uschar *called;
4693              terminator = ')';
4694    
4695              /* Come here from the \g<...> and \g'...' code (Oniguruma
4696              compatibility). However, the syntax has been checked to ensure that
4697              the ... are a (signed) number, so that neither ERR63 nor ERR29 will
4698              be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
4699              ever be taken. */
4700    
4701              HANDLE_NUMERICAL_RECURSION:
4702    
4703            if ((refsign = *ptr) == '+')            if ((refsign = *ptr) == '+')
4704              {              {
# Line 4667  we set the flag only if there is a liter Line 4720  we set the flag only if there is a liter
4720            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4721              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4722    
4723            if (*ptr != ')')            if (*ptr != terminator)
4724              {              {
4725              *errorcodeptr = ERR29;              *errorcodeptr = ERR29;
4726              goto FAILED;              goto FAILED;
# Line 5063  we set the flag only if there is a liter Line 5116  we set the flag only if there is a liter
5116      back references and those types that consume a character may be repeated.      back references and those types that consume a character may be repeated.
5117      We can test for values between ESC_b and ESC_Z for the latter; this may      We can test for values between ESC_b and ESC_Z for the latter; this may
5118      have to change if any new ones are ever created. */      have to change if any new ones are ever created. */
5119    
5120      case '\\':      case '\\':
5121      tempptr = ptr;      tempptr = ptr;
5122      c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);      c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
# Line 5090  we set the flag only if there is a liter Line 5143  we set the flag only if there is a liter
5143    
5144        zerofirstbyte = firstbyte;        zerofirstbyte = firstbyte;
5145        zeroreqbyte = reqbyte;        zeroreqbyte = reqbyte;
5146    
5147          /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5148          is a subroutine call by number (Oniguruma syntax). In fact, the value
5149          -ESC_g is returned only for these cases. So we don't need to check for <
5150          or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5151          -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5152          that is a synonym for a named back reference). */
5153    
5154          if (-c == ESC_g)
5155            {
5156            const uschar *p;
5157            save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
5158            terminator = (*(++ptr) == '<')? '>' : '\'';
5159    
5160            /* These two statements stop the compiler from warning about possibly
5161            unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5162            fact, because we actually check for a number below, the paths that
5163            would actually be in error are never taken. */
5164    
5165            skipbytes = 0;
5166            reset_bracount = FALSE;
5167    
5168            /* Test for a name */
5169    
5170            if (ptr[1] != '+' && ptr[1] != '-')
5171              {
5172              BOOL isnumber = TRUE;
5173              for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5174                {
5175                if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5176                if ((cd->ctypes[*p] & ctype_word) == 0) break;
5177                }
5178              if (*p != terminator)
5179                {
5180                *errorcodeptr = ERR57;
5181                break;
5182                }
5183              if (isnumber)
5184                {
5185                ptr++;
5186                goto HANDLE_NUMERICAL_RECURSION;
5187                }
5188              is_recurse = TRUE;
5189              goto NAMED_REF_OR_RECURSE;
5190              }
5191    
5192            /* Test a signed number in angle brackets or quotes. */
5193    
5194            p = ptr + 2;
5195            while ((digitab[*p] & ctype_digit) != 0) p++;
5196            if (*p != terminator)
5197              {
5198              *errorcodeptr = ERR57;
5199              break;
5200              }
5201            ptr++;
5202            goto HANDLE_NUMERICAL_RECURSION;
5203            }
5204    
5205        /* \k<name> or \k'name' is a back reference by name (Perl syntax).        /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5206        We also support \k{name} (.NET syntax) */        We also support \k{name} (.NET syntax) */
# Line 6109  while (errorcode == 0 && cd->hwm > cwork Line 6220  while (errorcode == 0 && cd->hwm > cwork
6220    if (groupptr == NULL) errorcode = ERR53;    if (groupptr == NULL) errorcode = ERR53;
6221      else PUT(((uschar *)codestart), offset, groupptr - codestart);      else PUT(((uschar *)codestart), offset, groupptr - codestart);
6222    }    }
6223    
6224  /* Give an error if there's a back reference to a non-existent capturing  /* Give an error if there's a back reference to a non-existent capturing
6225  subpattern. */  subpattern. */
6226    

Legend:
Removed from v.305  
changed lines
  Added in v.336

  ViewVC Help
Powered by ViewVC 1.1.5