/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 171 by ph10, Mon Jun 4 14:28:58 2007 UTC | revision 176 by ph10, Mon Jun 11 13:48:37 2007 UTC
# Line 243  static const char *error_texts[] = { Line 243  static const char *error_texts[] = {
243    "repeating a DEFINE group is not allowed",    "repeating a DEFINE group is not allowed",
244    "inconsistent NEWLINE options",    "inconsistent NEWLINE options",
245    "\\g is not followed by a braced name or an optionally braced non-zero number",    "\\g is not followed by a braced name or an optionally braced non-zero number",
246    "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"    "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
247  };  };
248    
249    
# Line 374  static const unsigned char ebcdic_charta Line 374  static const unsigned char ebcdic_charta
374  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
375    
376  static BOOL  static BOOL
377    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
378      int *, branch_chain *, compile_data *, int *);      int *, int *, branch_chain *, compile_data *, int *);
379    
380    
381    
# Line 453  else Line 453  else
453    
454      /* \g must be followed by a number, either plain or braced. If positive, it      /* \g must be followed by a number, either plain or braced. If positive, it
455      is an absolute backreference. If negative, it is a relative backreference.      is an absolute backreference. If negative, it is a relative backreference.
456      This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a      This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
457      reference to a named group. This is part of Perl's movement towards a      reference to a named group. This is part of Perl's movement towards a
458      unified syntax for back references. As this is synonymous with \k{name}, we      unified syntax for back references. As this is synonymous with \k{name}, we
459      fudge it up by pretending it really was \k. */      fudge it up by pretending it really was \k. */
460    
461      case 'g':      case 'g':
# Line 464  else Line 464  else
464        const uschar *p;        const uschar *p;
465        for (p = ptr+2; *p != 0 && *p != '}'; p++)        for (p = ptr+2; *p != 0 && *p != '}'; p++)
466          if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;          if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
467        if (*p != 0 && *p != '}')        if (*p != 0 && *p != '}')
468          {          {
469          c = -ESC_k;          c = -ESC_k;
470          break;          break;
471          }          }
472        braced = TRUE;        braced = TRUE;
473        ptr++;        ptr++;
474        }        }
# Line 1381  for (code = first_significant_code(code Line 1381  for (code = first_significant_code(code
1381    const uschar *ccode;    const uschar *ccode;
1382    
1383    c = *code;    c = *code;
1384    
1385    /* Groups with zero repeats can of course be empty; skip them. */    /* Groups with zero repeats can of course be empty; skip them. */
1386    
1387    if (c == OP_BRAZERO || c == OP_BRAMINZERO)    if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1388      {      {
1389        code += _pcre_OP_lengths[c];
1390      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
1391      c = *code;      c = *code;
1392      continue;      continue;
1393      }      }
1394    
1395    /* For other groups, scan the branches. */    /* For other groups, scan the branches. */
1396    
1397    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1398      {      {
1399      BOOL empty_branch;      BOOL empty_branch;
# Line 1409  for (code = first_significant_code(code Line 1410  for (code = first_significant_code(code
1410        }        }
1411      while (*code == OP_ALT);      while (*code == OP_ALT);
1412      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
1413      c = *code;      c = *code;
1414      continue;      continue;
1415      }      }
1416    
# Line 2109  for (;; ptr++) Line 2110  for (;; ptr++)
2110    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2111    BOOL is_quantifier;    BOOL is_quantifier;
2112    BOOL is_recurse;    BOOL is_recurse;
2113      BOOL reset_bracount;
2114    int class_charcount;    int class_charcount;
2115    int class_lastchar;    int class_lastchar;
2116    int newoptions;    int newoptions;
2117    int recno;    int recno;
2118    int refsign;    int refsign;
2119    int skipbytes;    int skipbytes;
2120    int subreqbyte;    int subreqbyte;
2121    int subfirstbyte;    int subfirstbyte;
# Line 2678  for (;; ptr++) Line 2680  for (;; ptr++)
2680              unsigned int origd = d;              unsigned int origd = d;
2681              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
2682                {                {
2683                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= (unsigned int)c &&
2684                      ocd <= (unsigned int)d)
2685                    continue;                          /* Skip embedded ranges */
2686    
2687                if (occ < c  && ocd >= c - 1)        /* Extend the basic range */                if (occ < (unsigned int)c  &&
2688                      ocd >= (unsigned int)c - 1)      /* Extend the basic range */
2689                  {                                  /* if there is overlap,   */                  {                                  /* if there is overlap,   */
2690                  c = occ;                           /* noting that if occ < c */                  c = occ;                           /* noting that if occ < c */
2691                  continue;                          /* we can't have ocd > d  */                  continue;                          /* we can't have ocd > d  */
2692                  }                                  /* because a subrange is  */                  }                                  /* because a subrange is  */
2693                if (ocd > d && occ <= d + 1)         /* always shorter than    */                if (ocd > (unsigned int)d &&
2694                      occ <= (unsigned int)d + 1)      /* always shorter than    */
2695                  {                                  /* the basic range.       */                  {                                  /* the basic range.       */
2696                  d = ocd;                  d = ocd;
2697                  continue;                  continue;
# Line 3583  for (;; ptr++) Line 3589  for (;; ptr++)
3589      skipbytes = 0;      skipbytes = 0;
3590      bravalue = OP_CBRA;      bravalue = OP_CBRA;
3591      save_hwm = cd->hwm;      save_hwm = cd->hwm;
3592        reset_bracount = FALSE;
3593    
3594      if (*(++ptr) == '?')      if (*(++ptr) == '?')
3595        {        {
# Line 3605  for (;; ptr++) Line 3612  for (;; ptr++)
3612    
3613    
3614          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
3615            case '|':                 /* Reset capture count for each branch */
3616            reset_bracount = TRUE;
3617            /* Fall through */
3618    
3619            /* ------------------------------------------------------------ */
3620          case ':':                 /* Non-capturing bracket */          case ':':                 /* Non-capturing bracket */
3621          bravalue = OP_BRA;          bravalue = OP_BRA;
3622          ptr++;          ptr++;
# Line 3640  for (;; ptr++) Line 3652  for (;; ptr++)
3652    
3653          code[1+LINK_SIZE] = OP_CREF;          code[1+LINK_SIZE] = OP_CREF;
3654          skipbytes = 3;          skipbytes = 3;
3655          refsign = -1;          refsign = -1;
3656    
3657          /* Check for a test for recursion in a named group. */          /* Check for a test for recursion in a named group. */
3658    
# Line 3664  for (;; ptr++) Line 3676  for (;; ptr++)
3676            terminator = '\'';            terminator = '\'';
3677            ptr++;            ptr++;
3678            }            }
3679          else          else
3680            {            {
3681            terminator = 0;            terminator = 0;
3682            if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);            if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3683            }            }
3684    
3685          /* We now expect to read a name; any thing else is an error */          /* We now expect to read a name; any thing else is an error */
3686    
# Line 3706  for (;; ptr++) Line 3718  for (;; ptr++)
3718          /* In the real compile we do the work of looking for the actual          /* In the real compile we do the work of looking for the actual
3719          reference. If the string started with "+" or "-" we require the rest to          reference. If the string started with "+" or "-" we require the rest to
3720          be digits, in which case recno will be set. */          be digits, in which case recno will be set. */
3721    
3722          if (refsign > 0)          if (refsign > 0)
3723            {            {
3724            if (recno <= 0)            if (recno <= 0)
3725              {              {
3726              *errorcodeptr = ERR58;              *errorcodeptr = ERR58;
3727              goto FAILED;              goto FAILED;
3728              }              }
3729            if (refsign == '-')            if (refsign == '-')
3730              {              {
3731              recno = cd->bracount - recno + 1;              recno = cd->bracount - recno + 1;
3732              if (recno <= 0)              if (recno <= 0)
3733                {                {
3734                *errorcodeptr = ERR15;                *errorcodeptr = ERR15;
3735                goto FAILED;                goto FAILED;
3736                }                }
3737              }              }
3738            else recno += cd->bracount;            else recno += cd->bracount;
3739            PUT2(code, 2+LINK_SIZE, recno);            PUT2(code, 2+LINK_SIZE, recno);
3740            break;            break;
3741            }            }
3742    
3743          /* Otherwise (did not start with "+" or "-"), start by looking for the          /* Otherwise (did not start with "+" or "-"), start by looking for the
3744          name. */          name. */
3745    
3746          slot = cd->name_table;          slot = cd->name_table;
3747          for (i = 0; i < cd->names_found; i++)          for (i = 0; i < cd->names_found; i++)
3748            {            {
# Line 4055  for (;; ptr++) Line 4067  for (;; ptr++)
4067            const uschar *called;            const uschar *called;
4068    
4069            if ((refsign = *ptr) == '+') ptr++;            if ((refsign = *ptr) == '+') ptr++;
4070            else if (refsign == '-')            else if (refsign == '-')
4071              {              {
4072              if ((digitab[ptr[1]] & ctype_digit) == 0)              if ((digitab[ptr[1]] & ctype_digit) == 0)
4073                goto OTHER_CHAR_AFTER_QUERY;                goto OTHER_CHAR_AFTER_QUERY;
4074              ptr++;              ptr++;
4075              }              }
4076    
4077            recno = 0;            recno = 0;
4078            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4079              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
# Line 4071  for (;; ptr++) Line 4083  for (;; ptr++)
4083              *errorcodeptr = ERR29;              *errorcodeptr = ERR29;
4084              goto FAILED;              goto FAILED;
4085              }              }
4086    
4087            if (refsign == '-')            if (refsign == '-')
4088              {              {
4089              if (recno == 0)              if (recno == 0)
4090                {                {
4091                *errorcodeptr = ERR58;                *errorcodeptr = ERR58;
4092                goto FAILED;                goto FAILED;
4093                }                }
4094              recno = cd->bracount - recno + 1;              recno = cd->bracount - recno + 1;
4095              if (recno <= 0)              if (recno <= 0)
4096                {                {
4097                *errorcodeptr = ERR15;                *errorcodeptr = ERR15;
4098                goto FAILED;                goto FAILED;
4099                }                }
4100              }              }
4101            else if (refsign == '+')            else if (refsign == '+')
4102              {              {
# Line 4092  for (;; ptr++) Line 4104  for (;; ptr++)
4104                {                {
4105                *errorcodeptr = ERR58;                *errorcodeptr = ERR58;
4106                goto FAILED;                goto FAILED;
4107                }                }
4108              recno += cd->bracount;              recno += cd->bracount;
4109              }              }
4110    
4111            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
4112    
# Line 4168  for (;; ptr++) Line 4180  for (;; ptr++)
4180    
4181          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4182          default:              /* Other characters: check option setting */          default:              /* Other characters: check option setting */
4183          OTHER_CHAR_AFTER_QUERY:          OTHER_CHAR_AFTER_QUERY:
4184          set = unset = 0;          set = unset = 0;
4185          optset = &set;          optset = &set;
4186    
# Line 4303  for (;; ptr++) Line 4315  for (;; ptr++)
4315           errorcodeptr,                 /* Where to put an error message */           errorcodeptr,                 /* Where to put an error message */
4316           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
4317            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4318             reset_bracount,               /* True if (?| group */
4319           skipbytes,                    /* Skip over bracket number */           skipbytes,                    /* Skip over bracket number */
4320           &subfirstbyte,                /* For possible first char */           &subfirstbyte,                /* For possible first char */
4321           &subreqbyte,                  /* For possible last char */           &subreqbyte,                  /* For possible last char */
# Line 4319  for (;; ptr++) Line 4332  for (;; ptr++)
4332      is on the bracket. */      is on the bracket. */
4333    
4334      /* If this is a conditional bracket, check that there are no more than      /* If this is a conditional bracket, check that there are no more than
4335      two branches in the group, or just one if it's a DEFINE group. */      two branches in the group, or just one if it's a DEFINE group. We do this
4336        in the real compile phase, not in the pre-pass, where the whole group may
4337        not be available. */
4338    
4339      if (bravalue == OP_COND)      if (bravalue == OP_COND && lengthptr == NULL)
4340        {        {
4341        uschar *tc = code;        uschar *tc = code;
4342        int condcount = 0;        int condcount = 0;
# Line 4653  This function is used during the pre-com Line 4668  This function is used during the pre-com
4668  out the amount of memory needed, as well as during the real compile phase. The  out the amount of memory needed, as well as during the real compile phase. The
4669  value of lengthptr distinguishes the two phases.  value of lengthptr distinguishes the two phases.
4670    
4671  Argument:  Arguments:
4672    options        option bits, including any changes for this subpattern    options        option bits, including any changes for this subpattern
4673    oldims         previous settings of ims option bits    oldims         previous settings of ims option bits
4674    codeptr        -> the address of the current code pointer    codeptr        -> the address of the current code pointer
4675    ptrptr         -> the address of the current pattern pointer    ptrptr         -> the address of the current pattern pointer
4676    errorcodeptr   -> pointer to error code variable    errorcodeptr   -> pointer to error code variable
4677    lookbehind     TRUE if this is a lookbehind assertion    lookbehind     TRUE if this is a lookbehind assertion
4678      reset_bracount TRUE to reset the count for each branch
4679    skipbytes      skip this many bytes at start (for brackets and OP_COND)    skipbytes      skip this many bytes at start (for brackets and OP_COND)
4680    firstbyteptr   place to put the first required character, or a negative number    firstbyteptr   place to put the first required character, or a negative number
4681    reqbyteptr     place to put the last required character, or a negative number    reqbyteptr     place to put the last required character, or a negative number
# Line 4673  Returns:         TRUE on success Line 4689  Returns:         TRUE on success
4689    
4690  static BOOL  static BOOL
4691  compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,  compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
4692    int *errorcodeptr, BOOL lookbehind, int skipbytes, int *firstbyteptr,    int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
4693    int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr)    int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
4694      int *lengthptr)
4695  {  {
4696  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
4697  uschar *code = *codeptr;  uschar *code = *codeptr;
# Line 4684  uschar *reverse_count = NULL; Line 4701  uschar *reverse_count = NULL;
4701  int firstbyte, reqbyte;  int firstbyte, reqbyte;
4702  int branchfirstbyte, branchreqbyte;  int branchfirstbyte, branchreqbyte;
4703  int length;  int length;
4704    int orig_bracount;
4705    int max_bracount;
4706  branch_chain bc;  branch_chain bc;
4707    
4708  bc.outer = bcptr;  bc.outer = bcptr;
# Line 4712  code += 1 + LINK_SIZE + skipbytes; Line 4731  code += 1 + LINK_SIZE + skipbytes;
4731    
4732  /* Loop for each alternative branch */  /* Loop for each alternative branch */
4733    
4734    orig_bracount = max_bracount = cd->bracount;
4735  for (;;)  for (;;)
4736    {    {
4737      /* For a (?| group, reset the capturing bracket count so that each branch
4738      uses the same numbers. */
4739    
4740      if (reset_bracount) cd->bracount = orig_bracount;
4741    
4742    /* Handle a change of ims options at the start of the branch */    /* Handle a change of ims options at the start of the branch */
4743    
4744    if ((options & PCRE_IMS) != oldims)    if ((options & PCRE_IMS) != oldims)
# Line 4742  for (;;) Line 4767  for (;;)
4767      *ptrptr = ptr;      *ptrptr = ptr;
4768      return FALSE;      return FALSE;
4769      }      }
4770    
4771      /* Keep the highest bracket count in case (?| was used and some branch
4772      has fewer than the rest. */
4773    
4774      if (cd->bracount > max_bracount) max_bracount = cd->bracount;
4775    
4776    /* In the real compile phase, there is some post-processing to be done. */    /* In the real compile phase, there is some post-processing to be done. */
4777    
# Line 4806  for (;;) Line 4836  for (;;)
4836        }        }
4837      }      }
4838    
4839    /* Reached end of expression, either ')' or end of pattern. Go back through    /* Reached end of expression, either ')' or end of pattern. In the real
4840    the alternative branches and reverse the chain of offsets, with the field in    compile phase, go back through the alternative branches and reverse the chain
4841    the BRA item now becoming an offset to the first alternative. If there are    of offsets, with the field in the BRA item now becoming an offset to the
4842    no alternatives, it points to the end of the group. The length in the    first alternative. If there are no alternatives, it points to the end of the
4843    terminating ket is always the length of the whole bracketed item. If any of    group. The length in the terminating ket is always the length of the whole
4844    the ims options were changed inside the group, compile a resetting op-code    bracketed item. If any of the ims options were changed inside the group,
4845    following, except at the very end of the pattern. Return leaving the pointer    compile a resetting op-code following, except at the very end of the pattern.
4846    at the terminating char. */    Return leaving the pointer at the terminating char. */
4847    
4848    if (*ptr != '|')    if (*ptr != '|')
4849      {      {
4850      int branch_length = code - last_branch;      if (lengthptr == NULL)
     do  
4851        {        {
4852        int prev_length = GET(last_branch, 1);        int branch_length = code - last_branch;
4853        PUT(last_branch, 1, branch_length);        do
4854        branch_length = prev_length;          {
4855        last_branch -= branch_length;          int prev_length = GET(last_branch, 1);
4856            PUT(last_branch, 1, branch_length);
4857            branch_length = prev_length;
4858            last_branch -= branch_length;
4859            }
4860          while (branch_length > 0);
4861        }        }
     while (branch_length > 0);  
4862    
4863      /* Fill in the ket */      /* Fill in the ket */
4864    
# Line 4841  for (;;) Line 4874  for (;;)
4874        *code++ = oldims;        *code++ = oldims;
4875        length += 2;        length += 2;
4876        }        }
4877    
4878        /* Retain the highest bracket number, in case resetting was used. */
4879    
4880        cd->bracount = max_bracount;
4881    
4882      /* Set values to pass back */      /* Set values to pass back */
4883    
# Line 4852  for (;;) Line 4889  for (;;)
4889      return TRUE;      return TRUE;
4890      }      }
4891    
4892    /* Another branch follows; insert an "or" node. Its length field points back    /* Another branch follows. In the pre-compile phase, we can move the code
4893      pointer back to where it was for the start of the first branch. (That is,
4894      pretend that each branch is the only one.)
4895    
4896      In the real compile phase, insert an ALT node. Its length field points back
4897    to the previous branch while the bracket remains open. At the end the chain    to the previous branch while the bracket remains open. At the end the chain
4898    is reversed. It's done like this so that the start of the bracket has a    is reversed. It's done like this so that the start of the bracket has a
4899    zero offset until it is closed, making it possible to detect recursion. */    zero offset until it is closed, making it possible to detect recursion. */
4900    
4901    *code = OP_ALT;    if (lengthptr != NULL)
4902    PUT(code, 1, code - last_branch);      {
4903    bc.current = last_branch = code;      code = *codeptr + 1 + LINK_SIZE + skipbytes;
4904    code += 1 + LINK_SIZE;      length += 1 + LINK_SIZE;
4905        }
4906      else
4907        {
4908        *code = OP_ALT;
4909        PUT(code, 1, code - last_branch);
4910        bc.current = last_branch = code;
4911        code += 1 + LINK_SIZE;
4912        }
4913    
4914    ptr++;    ptr++;
   length += 1 + LINK_SIZE;  
4915    }    }
4916  /* Control never reaches here */  /* Control never reaches here */
4917  }  }
# Line 5304  outside can help speed up starting point Line 5353  outside can help speed up starting point
5353  code = cworkspace;  code = cworkspace;
5354  *code = OP_BRA;  *code = OP_BRA;
5355  (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,  (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5356    &code, &ptr, &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, &length);    &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5357      &length);
5358  if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;  if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5359    
5360  DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,  DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
# Line 5372  ptr = (const uschar *)pattern; Line 5422  ptr = (const uschar *)pattern;
5422  code = (uschar *)codestart;  code = (uschar *)codestart;
5423  *code = OP_BRA;  *code = OP_BRA;
5424  (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,  (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
5425    &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);    &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
5426  re->top_bracket = cd->bracount;  re->top_bracket = cd->bracount;
5427  re->top_backref = cd->top_backref;  re->top_backref = cd->top_backref;
5428    

Legend:
Removed from v.171  
changed lines
  Added in v.176

  ViewVC Help
Powered by ViewVC 1.1.5