/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 964 by ph10, Fri May 4 13:03:39 2012 UTC revision 994 by ph10, Tue Jul 10 14:29:26 2012 UTC
# Line 490  static const char error_texts[] = Line 490  static const char error_texts[] =
490    "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"    "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
491    "invalid UTF-16 string\0"    "invalid UTF-16 string\0"
492    /* 75 */    /* 75 */
493    "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"    "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
494      "character value in \\u.... sequence is too large\0"
495    ;    ;
496    
497  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 831  else Line 832  else
832            c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));            c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
833  #endif  #endif
834            }            }
835    
836    #ifdef COMPILE_PCRE8
837            if (c > (utf ? 0x10ffff : 0xff))
838    #else
839    #ifdef COMPILE_PCRE16
840            if (c > (utf ? 0x10ffff : 0xffff))
841    #endif
842    #endif
843              {
844              *errorcodeptr = ERR76;
845              }
846            else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
847          }          }
848        }        }
849      else      else
# Line 4518  for (;; ptr++) Line 4531  for (;; ptr++)
4531        LONE_SINGLE_CHARACTER:        LONE_SINGLE_CHARACTER:
4532    
4533        /* Only the value of 1 matters for class_single_char. */        /* Only the value of 1 matters for class_single_char. */
4534    
4535        if (class_single_char < 2) class_single_char++;        if (class_single_char < 2) class_single_char++;
4536    
4537        /* If class_charcount is 1, we saw precisely one character. As long as        /* If class_charcount is 1, we saw precisely one character. As long as
# Line 4813  for (;; ptr++) Line 4826  for (;; ptr++)
4826      if (*previous == OP_CHAR || *previous == OP_CHARI      if (*previous == OP_CHAR || *previous == OP_CHARI
4827          || *previous == OP_NOT || *previous == OP_NOTI)          || *previous == OP_NOT || *previous == OP_NOTI)
4828        {        {
4829        switch (*previous)        switch (*previous)
4830          {          {
4831          default: /* Make compiler happy. */          default: /* Make compiler happy. */
4832          case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;          case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;
# Line 5593  for (;; ptr++) Line 5606  for (;; ptr++)
5606        ptr++;        ptr++;
5607        while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;        while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
5608        namelen = (int)(ptr - name);        namelen = (int)(ptr - name);
5609    
5610        /* It appears that Perl allows any characters whatsoever, other than        /* It appears that Perl allows any characters whatsoever, other than
5611        a closing parenthesis, to appear in arguments, so we no longer insist on        a closing parenthesis, to appear in arguments, so we no longer insist on
5612        letters, digits, and underscores. */        letters, digits, and underscores. */
# Line 5607  for (;; ptr++) Line 5620  for (;; ptr++)
5620            {            {
5621            *errorcodeptr = ERR75;            *errorcodeptr = ERR75;
5622            goto FAILED;            goto FAILED;
5623            }            }
5624          }          }
5625    
5626        if (*ptr != CHAR_RIGHT_PARENTHESIS)        if (*ptr != CHAR_RIGHT_PARENTHESIS)
# Line 5623  for (;; ptr++) Line 5636  for (;; ptr++)
5636          if (namelen == verbs[i].len &&          if (namelen == verbs[i].len &&
5637              STRNCMP_UC_C8(name, vn, namelen) == 0)              STRNCMP_UC_C8(name, vn, namelen) == 0)
5638            {            {
5639              int setverb;
5640    
5641            /* Check for open captures before ACCEPT and convert it to            /* Check for open captures before ACCEPT and convert it to
5642            ASSERT_ACCEPT if in an assertion. */            ASSERT_ACCEPT if in an assertion. */
5643    
# Line 5640  for (;; ptr++) Line 5655  for (;; ptr++)
5655                *code++ = OP_CLOSE;                *code++ = OP_CLOSE;
5656                PUT2INC(code, 0, oc->number);                PUT2INC(code, 0, oc->number);
5657                }                }
5658              *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;              setverb = *code++ =
5659                  (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
5660    
5661              /* Do not set firstchar after *ACCEPT */              /* Do not set firstchar after *ACCEPT */
5662              if (firstchar == REQ_UNSET) firstchar = REQ_NONE;              if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
# Line 5655  for (;; ptr++) Line 5671  for (;; ptr++)
5671                *errorcodeptr = ERR66;                *errorcodeptr = ERR66;
5672                goto FAILED;                goto FAILED;
5673                }                }
5674              *code = verbs[i].op;              setverb = *code++ = verbs[i].op;
             if (*code++ == OP_THEN) cd->external_flags |= PCRE_HASTHEN;  
5675              }              }
5676    
5677            else            else
# Line 5666  for (;; ptr++) Line 5681  for (;; ptr++)
5681                *errorcodeptr = ERR59;                *errorcodeptr = ERR59;
5682                goto FAILED;                goto FAILED;
5683                }                }
5684              *code = verbs[i].op_arg;              setverb = *code++ = verbs[i].op_arg;
             if (*code++ == OP_THEN_ARG) cd->external_flags |= PCRE_HASTHEN;  
5685              *code++ = arglen;              *code++ = arglen;
5686              memcpy(code, arg, IN_UCHARS(arglen));              memcpy(code, arg, IN_UCHARS(arglen));
5687              code += arglen;              code += arglen;
5688              *code++ = 0;              *code++ = 0;
5689              }              }
5690    
5691              switch (setverb)
5692                {
5693                case OP_THEN:
5694                case OP_THEN_ARG:
5695                cd->external_flags |= PCRE_HASTHEN;
5696                break;
5697    
5698                case OP_PRUNE:
5699                case OP_PRUNE_ARG:
5700                case OP_SKIP:
5701                case OP_SKIP_ARG:
5702                cd->had_pruneorskip = TRUE;
5703                break;
5704                }
5705    
5706            break;  /* Found verb, exit loop */            break;  /* Found verb, exit loop */
5707            }            }
5708    
# Line 6859  for (;; ptr++) Line 6888  for (;; ptr++)
6888        /* For the rest (including \X when Unicode properties are supported), we        /* For the rest (including \X when Unicode properties are supported), we
6889        can obtain the OP value by negating the escape value in the default        can obtain the OP value by negating the escape value in the default
6890        situation when PCRE_UCP is not set. When it *is* set, we substitute        situation when PCRE_UCP is not set. When it *is* set, we substitute
6891        Unicode property tests. Note that \b and \B do a one-character        Unicode property tests. Note that \b and \B do a one-character
6892        lookbehind. */        lookbehind. */
6893    
6894        else        else
6895          {          {
6896          if ((-c == ESC_b || -c == ESC_B) && cd->max_lookbehind == 0)          if ((-c == ESC_b || -c == ESC_B) && cd->max_lookbehind == 0)
6897            cd->max_lookbehind = 1;            cd->max_lookbehind = 1;
6898  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
6899          if (-c >= ESC_DU && -c <= ESC_wu)          if (-c >= ESC_DU && -c <= ESC_wu)
6900            {            {
# Line 7173  for (;;) Line 7202  for (;;)
7202          *ptrptr = ptr;          *ptrptr = ptr;
7203          return FALSE;          return FALSE;
7204          }          }
7205        else        else
7206          {          {
7207          if (fixed_length > cd->max_lookbehind)          if (fixed_length > cd->max_lookbehind)
7208            cd->max_lookbehind = fixed_length;            cd->max_lookbehind = fixed_length;
7209          PUT(reverse_count, 0, fixed_length);          PUT(reverse_count, 0, fixed_length);
7210          }          }
7211        }        }
7212      }      }
# Line 7310  and the highest back reference was great Line 7339  and the highest back reference was great
7339  However, by keeping a bitmap of the first 31 back references, we can catch some  However, by keeping a bitmap of the first 31 back references, we can catch some
7340  of the more common cases more precisely.  of the more common cases more precisely.
7341    
7342    ... A second exception is when the .* appears inside an atomic group, because
7343    this prevents the number of characters it matches from being adjusted.
7344    
7345  Arguments:  Arguments:
7346    code           points to start of expression (the bracket)    code           points to start of expression (the bracket)
7347    bracket_map    a bitmap of which brackets we are inside while testing; this    bracket_map    a bitmap of which brackets we are inside while testing; this
7348                    handles up to substring 31; after that we just have to take                    handles up to substring 31; after that we just have to take
7349                    the less precise approach                    the less precise approach
7350    backref_map    the back reference bitmap    cd             points to the compile data block
7351      atomcount      atomic group level
7352    
7353  Returns:     TRUE or FALSE  Returns:     TRUE or FALSE
7354  */  */
7355    
7356  static BOOL  static BOOL
7357  is_anchored(register const pcre_uchar *code, unsigned int bracket_map,  is_anchored(register const pcre_uchar *code, unsigned int bracket_map,
7358    unsigned int backref_map)    compile_data *cd, int atomcount)
7359  {  {
7360  do {  do {
7361     const pcre_uchar *scode = first_significant_code(     const pcre_uchar *scode = first_significant_code(
# Line 7334  do { Line 7367  do {
7367     if (op == OP_BRA  || op == OP_BRAPOS ||     if (op == OP_BRA  || op == OP_BRAPOS ||
7368         op == OP_SBRA || op == OP_SBRAPOS)         op == OP_SBRA || op == OP_SBRAPOS)
7369       {       {
7370       if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;       if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
7371       }       }
7372    
7373     /* Capturing brackets */     /* Capturing brackets */
# Line 7344  do { Line 7377  do {
7377       {       {
7378       int n = GET2(scode, 1+LINK_SIZE);       int n = GET2(scode, 1+LINK_SIZE);
7379       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
7380       if (!is_anchored(scode, new_map, backref_map)) return FALSE;       if (!is_anchored(scode, new_map, cd, atomcount)) return FALSE;
7381         }
7382    
7383       /* Positive forward assertions and conditions */
7384    
7385       else if (op == OP_ASSERT || op == OP_COND)
7386         {
7387         if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
7388       }       }
7389    
7390     /* Other brackets */     /* Atomic groups */
7391    
7392     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_ONCE_NC ||     else if (op == OP_ONCE || op == OP_ONCE_NC)
             op == OP_COND)  
7393       {       {
7394       if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;       if (!is_anchored(scode, bracket_map, cd, atomcount + 1))
7395           return FALSE;
7396       }       }
7397    
7398     /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and     /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
7399     it isn't in brackets that are or may be referenced. */     it isn't in brackets that are or may be referenced or inside an atomic
7400       group. */
7401    
7402     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
7403               op == OP_TYPEPOSSTAR))               op == OP_TYPEPOSSTAR))
7404       {       {
7405       if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)       if (scode[1] != OP_ALLANY || (bracket_map & cd->backref_map) != 0 ||
7406             atomcount > 0 || cd->had_pruneorskip)
7407         return FALSE;         return FALSE;
7408       }       }
7409    
7410     /* Check for explicit anchoring */     /* Check for explicit anchoring */
7411    
7412     else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;     else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
7413    
7414     code += GET(code, 1);     code += GET(code, 1);
7415     }     }
7416  while (*code == OP_ALT);   /* Loop for each alternative */  while (*code == OP_ALT);   /* Loop for each alternative */
# Line 7385  return TRUE; Line 7428  return TRUE;
7428  matching and for non-DOTALL patterns that start with .* (which must start at  matching and for non-DOTALL patterns that start with .* (which must start at
7429  the beginning or after \n). As in the case of is_anchored() (see above), we  the beginning or after \n). As in the case of is_anchored() (see above), we
7430  have to take account of back references to capturing brackets that contain .*  have to take account of back references to capturing brackets that contain .*
7431  because in that case we can't make the assumption.  because in that case we can't make the assumption. Also, the appearance of .*
7432    inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
7433    count, because once again the assumption no longer holds.
7434    
7435  Arguments:  Arguments:
7436    code           points to start of expression (the bracket)    code           points to start of expression (the bracket)
7437    bracket_map    a bitmap of which brackets we are inside while testing; this    bracket_map    a bitmap of which brackets we are inside while testing; this
7438                    handles up to substring 31; after that we just have to take                    handles up to substring 31; after that we just have to take
7439                    the less precise approach                    the less precise approach
7440    backref_map    the back reference bitmap    cd             points to the compile data
7441      atomcount      atomic group level
7442    
7443  Returns:         TRUE or FALSE  Returns:         TRUE or FALSE
7444  */  */
7445    
7446  static BOOL  static BOOL
7447  is_startline(const pcre_uchar *code, unsigned int bracket_map,  is_startline(const pcre_uchar *code, unsigned int bracket_map,
7448    unsigned int backref_map)    compile_data *cd, int atomcount)
7449  {  {
7450  do {  do {
7451     const pcre_uchar *scode = first_significant_code(     const pcre_uchar *scode = first_significant_code(
# Line 7425  do { Line 7471  do {
7471         return FALSE;         return FALSE;
7472    
7473         default:     /* Assertion */         default:     /* Assertion */
7474         if (!is_startline(scode, bracket_map, backref_map)) return FALSE;         if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
7475         do scode += GET(scode, 1); while (*scode == OP_ALT);         do scode += GET(scode, 1); while (*scode == OP_ALT);
7476         scode += 1 + LINK_SIZE;         scode += 1 + LINK_SIZE;
7477         break;         break;
# Line 7439  do { Line 7485  do {
7485     if (op == OP_BRA  || op == OP_BRAPOS ||     if (op == OP_BRA  || op == OP_BRAPOS ||
7486         op == OP_SBRA || op == OP_SBRAPOS)         op == OP_SBRA || op == OP_SBRAPOS)
7487       {       {
7488       if (!is_startline(scode, bracket_map, backref_map)) return FALSE;       if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
7489       }       }
7490    
7491     /* Capturing brackets */     /* Capturing brackets */
# Line 7449  do { Line 7495  do {
7495       {       {
7496       int n = GET2(scode, 1+LINK_SIZE);       int n = GET2(scode, 1+LINK_SIZE);
7497       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
7498       if (!is_startline(scode, new_map, backref_map)) return FALSE;       if (!is_startline(scode, new_map, cd, atomcount)) return FALSE;
7499       }       }
7500    
7501     /* Other brackets */     /* Positive forward assertions */
7502    
7503       else if (op == OP_ASSERT)
7504         {
7505         if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
7506         }
7507    
7508       /* Atomic brackets */
7509    
7510     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_ONCE_NC)     else if (op == OP_ONCE || op == OP_ONCE_NC)
7511       {       {
7512       if (!is_startline(scode, bracket_map, backref_map)) return FALSE;       if (!is_startline(scode, bracket_map, cd, atomcount + 1)) return FALSE;
7513       }       }
7514    
7515     /* .* means "start at start or after \n" if it isn't in brackets that     /* .* means "start at start or after \n" if it isn't in atomic brackets or
7516     may be referenced. */     brackets that may be referenced, as long as the pattern does not contain
7517       *PRUNE or *SKIP, because these break the feature. Consider, for example,
7518       /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
7519       start of a line. */
7520    
7521     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
7522       {       {
7523       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;       if (scode[1] != OP_ANY || (bracket_map & cd->backref_map) != 0 ||
7524             atomcount > 0 || cd->had_pruneorskip)
7525           return FALSE;
7526       }       }
7527    
7528     /* Check for explicit circumflex */     /* Check for explicit circumflex; anything else gives a FALSE result. Note
7529       that atomic brackets OP_ONCE and OP_ONCE_NC are handled above by recursing
7530       with an incremented atomcount, so that a .* inside them gives FALSE,
7531       because the number of characters it matches cannot be adjusted there. */
7532    
7533     else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;     else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
7534    
# Line 7926  cd->start_code = codestart; Line 7987  cd->start_code = codestart;
7987  cd->hwm = (pcre_uchar *)(cd->start_workspace);  cd->hwm = (pcre_uchar *)(cd->start_workspace);
7988  cd->req_varyopt = 0;  cd->req_varyopt = 0;
7989  cd->had_accept = FALSE;  cd->had_accept = FALSE;
7990    cd->had_pruneorskip = FALSE;
7991  cd->check_lookbehind = FALSE;  cd->check_lookbehind = FALSE;
7992  cd->open_caps = NULL;  cd->open_caps = NULL;
7993    
# Line 8049  if (errorcode != 0) Line 8111  if (errorcode != 0)
8111    }    }
8112    
8113  /* If the anchored option was not passed, set the flag if we can determine that  /* If the anchored option was not passed, set the flag if we can determine that
8114  the pattern is anchored by virtue of ^ characters or \A or anything else (such  the pattern is anchored by virtue of ^ characters or \A or anything else, such
8115  as starting with .* when DOTALL is set).  as starting with non-atomic .* when DOTALL is set and there are no occurrences
8116    of *PRUNE or *SKIP.
8117    
8118  Otherwise, if we know what the first byte has to be, save it, because that  Otherwise, if we know what the first byte has to be, save it, because that
8119  speeds up unanchored matches no end. If not, see if we can set the  speeds up unanchored matches no end. If not, see if we can set the
8120  PCRE_STARTLINE flag. This is helpful for multiline matches when all branches  PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
8121  start with ^. and also when all branches start with .* for non-DOTALL matches.  start with ^, and also when all branches start with non-atomic .* for
8122  */  non-DOTALL matches when *PRUNE and *SKIP are not present. */
8123    
8124  if ((re->options & PCRE_ANCHORED) == 0)  if ((re->options & PCRE_ANCHORED) == 0)
8125    {    {
8126    if (is_anchored(codestart, 0, cd->backref_map))    if (is_anchored(codestart, 0, cd, 0)) re->options |= PCRE_ANCHORED;
     re->options |= PCRE_ANCHORED;  
8127    else    else
8128      {      {
8129      if (firstchar < 0)      if (firstchar < 0)
# Line 8098  if ((re->options & PCRE_ANCHORED) == 0) Line 8160  if ((re->options & PCRE_ANCHORED) == 0)
8160    
8161        re->flags |= PCRE_FIRSTSET;        re->flags |= PCRE_FIRSTSET;
8162        }        }
8163      else if (is_startline(codestart, 0, cd->backref_map))  
8164        re->flags |= PCRE_STARTLINE;      else if (is_startline(codestart, 0, cd, 0)) re->flags |= PCRE_STARTLINE;
8165      }      }
8166    }    }
8167    

Legend:
Removed from v.964  
changed lines
  Added in v.994

  ViewVC Help
Powered by ViewVC 1.1.5