/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 149 by ph10, Mon Apr 16 15:28:08 2007 UTC revision 172 by ph10, Tue Jun 5 10:40:13 2007 UTC
# Line 87  static const short int escapes[] = { Line 87  static const short int escapes[] = {
87       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
88       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
89     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
90       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */       0,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
91  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
92  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
93     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
# Line 208  static const char *error_texts[] = { Line 208  static const char *error_texts[] = {
208    "malformed number or name after (?(",    "malformed number or name after (?(",
209    "conditional group contains more than two branches",    "conditional group contains more than two branches",
210    "assertion expected after (?(",    "assertion expected after (?(",
211    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )",
212    /* 30 */    /* 30 */
213    "unknown POSIX class name",    "unknown POSIX class name",
214    "POSIX collating elements are not supported",    "POSIX collating elements are not supported",
# Line 242  static const char *error_texts[] = { Line 242  static const char *error_texts[] = {
242    /* 55 */    /* 55 */
243    "repeating a DEFINE group is not allowed",    "repeating a DEFINE group is not allowed",
244    "inconsistent NEWLINE options",    "inconsistent NEWLINE options",
245    "\\g is not followed by an (optionally braced) non-zero number"    "\\g is not followed by a braced name or an optionally braced non-zero number",
246      "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
247  };  };
248    
249    
# Line 452  else Line 453  else
453    
454      /* \g must be followed by a number, either plain or braced. If positive, it      /* \g must be followed by a number, either plain or braced. If positive, it
455      is an absolute backreference. If negative, it is a relative backreference.      is an absolute backreference. If negative, it is a relative backreference.
456      This is a Perl 5.10 feature. */      This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
457        reference to a named group. This is part of Perl's movement towards a
458        unified syntax for back references. As this is synonymous with \k{name}, we
459        fudge it up by pretending it really was \k. */
460    
461      case 'g':      case 'g':
462      if (ptr[1] == '{')      if (ptr[1] == '{')
463        {        {
464          const uschar *p;
465          for (p = ptr+2; *p != 0 && *p != '}'; p++)
466            if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
467          if (*p != 0 && *p != '}')
468            {
469            c = -ESC_k;
470            break;
471            }
472        braced = TRUE;        braced = TRUE;
473        ptr++;        ptr++;
474        }        }
# Line 1370  for (code = first_significant_code(code Line 1382  for (code = first_significant_code(code
1382    
1383    c = *code;    c = *code;
1384    
1385      /* Groups with zero repeats can of course be empty; skip them. */
1386    
1387      if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1388        {
1389        code += _pcre_OP_lengths[c];
1390        do code += GET(code, 1); while (*code == OP_ALT);
1391        c = *code;
1392        continue;
1393        }
1394    
1395      /* For other groups, scan the branches. */
1396    
1397    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1398      {      {
1399      BOOL empty_branch;      BOOL empty_branch;
# Line 1386  for (code = first_significant_code(code Line 1410  for (code = first_significant_code(code
1410        }        }
1411      while (*code == OP_ALT);      while (*code == OP_ALT);
1412      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
1413        c = *code;
     /* Move past the KET and fudge things so that the increment in the "for"  
     above has no effect. */  
   
     c = OP_END;  
     code += 1 + LINK_SIZE - _pcre_OP_lengths[c];  
1414      continue;      continue;
1415      }      }
1416    
# Line 2095  for (;; ptr++) Line 2114  for (;; ptr++)
2114    int class_lastchar;    int class_lastchar;
2115    int newoptions;    int newoptions;
2116    int recno;    int recno;
2117      int refsign;
2118    int skipbytes;    int skipbytes;
2119    int subreqbyte;    int subreqbyte;
2120    int subfirstbyte;    int subfirstbyte;
# Line 3621  for (;; ptr++) Line 3641  for (;; ptr++)
3641    
3642          code[1+LINK_SIZE] = OP_CREF;          code[1+LINK_SIZE] = OP_CREF;
3643          skipbytes = 3;          skipbytes = 3;
3644            refsign = -1;
3645    
3646          /* Check for a test for recursion in a named group. */          /* Check for a test for recursion in a named group. */
3647    
# Line 3644  for (;; ptr++) Line 3665  for (;; ptr++)
3665            terminator = '\'';            terminator = '\'';
3666            ptr++;            ptr++;
3667            }            }
3668          else terminator = 0;          else
3669              {
3670              terminator = 0;
3671              if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3672              }
3673    
3674          /* We now expect to read a name; any thing else is an error */          /* We now expect to read a name; any thing else is an error */
3675    
# Line 3680  for (;; ptr++) Line 3705  for (;; ptr++)
3705          if (lengthptr != NULL) break;          if (lengthptr != NULL) break;
3706    
3707          /* In the real compile we do the work of looking for the actual          /* In the real compile we do the work of looking for the actual
3708          reference. */          reference. If the string started with "+" or "-" we require the rest to
3709            be digits, in which case recno will be set. */
3710    
3711            if (refsign > 0)
3712              {
3713              if (recno <= 0)
3714                {
3715                *errorcodeptr = ERR58;
3716                goto FAILED;
3717                }
3718              if (refsign == '-')
3719                {
3720                recno = cd->bracount - recno + 1;
3721                if (recno <= 0)
3722                  {
3723                  *errorcodeptr = ERR15;
3724                  goto FAILED;
3725                  }
3726                }
3727              else recno += cd->bracount;
3728              PUT2(code, 2+LINK_SIZE, recno);
3729              break;
3730              }
3731    
3732            /* Otherwise (did not start with "+" or "-"), start by looking for the
3733            name. */
3734    
3735          slot = cd->name_table;          slot = cd->name_table;
3736          for (i = 0; i < cd->names_found; i++)          for (i = 0; i < cd->names_found; i++)
# Line 3999  for (;; ptr++) Line 4049  for (;; ptr++)
4049    
4050    
4051          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4052            case '-': case '+':
4053          case '0': case '1': case '2': case '3': case '4':   /* Recursion or */          case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4054          case '5': case '6': case '7': case '8': case '9':   /* subroutine */          case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4055            {            {
4056            const uschar *called;            const uschar *called;
4057    
4058              if ((refsign = *ptr) == '+') ptr++;
4059              else if (refsign == '-')
4060                {
4061                if ((digitab[ptr[1]] & ctype_digit) == 0)
4062                  goto OTHER_CHAR_AFTER_QUERY;
4063                ptr++;
4064                }
4065    
4066            recno = 0;            recno = 0;
4067            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4068              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4069    
4070            if (*ptr != ')')            if (*ptr != ')')
4071              {              {
4072              *errorcodeptr = ERR29;              *errorcodeptr = ERR29;
4073              goto FAILED;              goto FAILED;
4074              }              }
4075    
4076              if (refsign == '-')
4077                {
4078                if (recno == 0)
4079                  {
4080                  *errorcodeptr = ERR58;
4081                  goto FAILED;
4082                  }
4083                recno = cd->bracount - recno + 1;
4084                if (recno <= 0)
4085                  {
4086                  *errorcodeptr = ERR15;
4087                  goto FAILED;
4088                  }
4089                }
4090              else if (refsign == '+')
4091                {
4092                if (recno == 0)
4093                  {
4094                  *errorcodeptr = ERR58;
4095                  goto FAILED;
4096                  }
4097                recno += cd->bracount;
4098                }
4099    
4100            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
4101    
4102            HANDLE_RECURSION:            HANDLE_RECURSION:
# Line 4084  for (;; ptr++) Line 4169  for (;; ptr++)
4169    
4170          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4171          default:              /* Other characters: check option setting */          default:              /* Other characters: check option setting */
4172            OTHER_CHAR_AFTER_QUERY:
4173          set = unset = 0;          set = unset = 0;
4174          optset = &set;          optset = &set;
4175    
# Line 4234  for (;; ptr++) Line 4320  for (;; ptr++)
4320      is on the bracket. */      is on the bracket. */
4321    
4322      /* If this is a conditional bracket, check that there are no more than      /* If this is a conditional bracket, check that there are no more than
4323      two branches in the group, or just one if it's a DEFINE group. */      two branches in the group, or just one if it's a DEFINE group. We do this
4324        in the real compile phase, not in the pre-pass, where the whole group may
4325        not be available. */
4326    
4327      if (bravalue == OP_COND)      if (bravalue == OP_COND && lengthptr == NULL)
4328        {        {
4329        uschar *tc = code;        uschar *tc = code;
4330        int condcount = 0;        int condcount = 0;
# Line 4396  for (;; ptr++) Line 4484  for (;; ptr++)
4484        zerofirstbyte = firstbyte;        zerofirstbyte = firstbyte;
4485        zeroreqbyte = reqbyte;        zeroreqbyte = reqbyte;
4486    
4487        /* \k<name> or \k'name' is a back reference by name (Perl syntax) */        /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4488          We also support \k{name} (.NET syntax) */
4489    
4490        if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\''))        if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
4491          {          {
4492          is_recurse = FALSE;          is_recurse = FALSE;
4493          terminator = (*(++ptr) == '<')? '>' : '\'';          terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
4494          goto NAMED_REF_OR_RECURSE;          goto NAMED_REF_OR_RECURSE;
4495          }          }
4496    
# Line 4567  This function is used during the pre-com Line 4656  This function is used during the pre-com
4656  out the amount of memory needed, as well as during the real compile phase. The  out the amount of memory needed, as well as during the real compile phase. The
4657  value of lengthptr distinguishes the two phases.  value of lengthptr distinguishes the two phases.
4658    
4659  Argument:  Arguments:
4660    options        option bits, including any changes for this subpattern    options        option bits, including any changes for this subpattern
4661    oldims         previous settings of ims option bits    oldims         previous settings of ims option bits
4662    codeptr        -> the address of the current code pointer    codeptr        -> the address of the current code pointer
# Line 4720  for (;;) Line 4809  for (;;)
4809        }        }
4810      }      }
4811    
4812    /* Reached end of expression, either ')' or end of pattern. Go back through    /* Reached end of expression, either ')' or end of pattern. In the real
4813    the alternative branches and reverse the chain of offsets, with the field in    compile phase, go back through the alternative branches and reverse the chain
4814    the BRA item now becoming an offset to the first alternative. If there are    of offsets, with the field in the BRA item now becoming an offset to the
4815    no alternatives, it points to the end of the group. The length in the    first alternative. If there are no alternatives, it points to the end of the
4816    terminating ket is always the length of the whole bracketed item. If any of    group. The length in the terminating ket is always the length of the whole
4817    the ims options were changed inside the group, compile a resetting op-code    bracketed item. If any of the ims options were changed inside the group,
4818    following, except at the very end of the pattern. Return leaving the pointer    compile a resetting op-code following, except at the very end of the pattern.
4819    at the terminating char. */    Return leaving the pointer at the terminating char. */
4820    
4821    if (*ptr != '|')    if (*ptr != '|')
4822      {      {
4823      int branch_length = code - last_branch;      if (lengthptr == NULL)
     do  
4824        {        {
4825        int prev_length = GET(last_branch, 1);        int branch_length = code - last_branch;
4826        PUT(last_branch, 1, branch_length);        do
4827        branch_length = prev_length;          {
4828        last_branch -= branch_length;          int prev_length = GET(last_branch, 1);
4829            PUT(last_branch, 1, branch_length);
4830            branch_length = prev_length;
4831            last_branch -= branch_length;
4832            }
4833          while (branch_length > 0);
4834        }        }
     while (branch_length > 0);  
4835    
4836      /* Fill in the ket */      /* Fill in the ket */
4837    
# Line 4766  for (;;) Line 4858  for (;;)
4858      return TRUE;      return TRUE;
4859      }      }
4860    
4861    /* Another branch follows; insert an "or" node. Its length field points back    /* Another branch follows. In the pre-compile phase, we can move the code
4862      pointer back to where it was for the start of the first branch. (That is,
4863      pretend that each branch is the only one.)
4864    
4865      In the real compile phase, insert an ALT node. Its length field points back
4866    to the previous branch while the bracket remains open. At the end the chain    to the previous branch while the bracket remains open. At the end the chain
4867    is reversed. It's done like this so that the start of the bracket has a    is reversed. It's done like this so that the start of the bracket has a
4868    zero offset until it is closed, making it possible to detect recursion. */    zero offset until it is closed, making it possible to detect recursion. */
4869    
4870    *code = OP_ALT;    if (lengthptr != NULL)
4871    PUT(code, 1, code - last_branch);      {
4872    bc.current = last_branch = code;      code = *codeptr + 1 + LINK_SIZE + skipbytes;
4873    code += 1 + LINK_SIZE;      length += 1 + LINK_SIZE;
4874        }
4875      else
4876        {
4877        *code = OP_ALT;
4878        PUT(code, 1, code - last_branch);
4879        bc.current = last_branch = code;
4880        code += 1 + LINK_SIZE;
4881        }
4882    
4883    ptr++;    ptr++;
   length += 1 + LINK_SIZE;  
4884    }    }
4885  /* Control never reaches here */  /* Control never reaches here */
4886  }  }
# Line 5138  cd->cbits = tables + cbits_offset; Line 5242  cd->cbits = tables + cbits_offset;
5242  cd->ctypes = tables + ctypes_offset;  cd->ctypes = tables + ctypes_offset;
5243    
5244  /* Handle different types of newline. The three bits give seven cases. The  /* Handle different types of newline. The three bits give seven cases. The
5245  current code allows for fixed one- or two-byte sequences, plus "any" and  current code allows for fixed one- or two-byte sequences, plus "any" and
5246  "anycrlf". */  "anycrlf". */
5247    
5248  switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))  switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
# Line 5149  switch (options & (PCRE_NEWLINE_CRLF | P Line 5253  switch (options & (PCRE_NEWLINE_CRLF | P
5253    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
5254         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5255    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
5256    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5257    default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;    default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5258    }    }
5259    

Legend:
Removed from v.149  
changed lines
  Added in v.172

  ViewVC Help
Powered by ViewVC 1.1.5