/[pcre]/code/branches/pcre16/pcre_compile.c
ViewVC logotype

Diff of /code/branches/pcre16/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 654 by ph10, Tue Aug 2 11:00:40 2011 UTC | revision 745 by ph10, Mon Nov 14 11:41:03 2011 UTC
# Line 676  else Line 676  else
676    
677      case CHAR_l:      case CHAR_l:
678      case CHAR_L:      case CHAR_L:
679        *errorcodeptr = ERR37;
680        break;
681    
682      case CHAR_u:      case CHAR_u:
683        if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
684          {
685          /* In JavaScript, \u must be followed by four hexadecimal digits.
686          Otherwise it is a lowercase u letter. */
687          if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0
688               && (digitab[ptr[3]] & ctype_xdigit) != 0 && (digitab[ptr[4]] & ctype_xdigit) != 0)
689            {
690            c = 0;
691            for (i = 0; i < 4; ++i)
692              {
693              register int cc = *(++ptr);
694    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
695              if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
696              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
697    #else           /* EBCDIC coding */
698              if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
699              c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
700    #endif
701              }
702            }
703          }
704        else
705          *errorcodeptr = ERR37;
706        break;
707    
708      case CHAR_U:      case CHAR_U:
709      *errorcodeptr = ERR37;      /* In JavaScript, \U is an uppercase U letter. */
710        if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
711      break;      break;
712    
713      /* In a character class, \g is just a literal "g". Outside a character      /* In a character class, \g is just a literal "g". Outside a character
# Line 828  else Line 857  else
857      treated as a data character. */      treated as a data character. */
858    
859      case CHAR_x:      case CHAR_x:
860        if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
861          {
862          /* In JavaScript, \x must be followed by two hexadecimal digits.
863          Otherwise it is a lowercase x letter. */
864          if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0)
865            {
866            c = 0;
867            for (i = 0; i < 2; ++i)
868              {
869              register int cc = *(++ptr);
870    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
871              if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
872              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
873    #else           /* EBCDIC coding */
874              if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
875              c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
876    #endif
877              }
878            }
879          break;
880          }
881    
882      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
883        {        {
884        const uschar *pt = ptr + 2;        const uschar *pt = ptr + 2;
# Line 1506  for (;;) Line 1557  for (;;)
1557      case OP_CBRA:      case OP_CBRA:
1558      case OP_BRA:      case OP_BRA:
1559      case OP_ONCE:      case OP_ONCE:
1560        case OP_ONCE_NC:
1561      case OP_COND:      case OP_COND:
1562      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), utf8, atend, cd);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), utf8, atend, cd);
1563      if (d < 0) return d;      if (d < 0) return d;
# Line 1761  for (;;) Line 1813  for (;;)
1813        break;        break;
1814    
1815        case OP_THEN_ARG:        case OP_THEN_ARG:
1816        code += code[1+LINK_SIZE];        code += code[1];
1817        break;        break;
1818        }        }
1819    
# Line 1880  for (;;) Line 1932  for (;;)
1932        break;        break;
1933    
1934        case OP_THEN_ARG:        case OP_THEN_ARG:
1935        code += code[1+LINK_SIZE];        code += code[1];
1936        break;        break;
1937        }        }
1938    
# Line 2045  for (code = first_significant_code(code Line 2097  for (code = first_significant_code(code
2097    
2098    if (c == OP_BRA  || c == OP_BRAPOS ||    if (c == OP_BRA  || c == OP_BRAPOS ||
2099        c == OP_CBRA || c == OP_CBRAPOS ||        c == OP_CBRA || c == OP_CBRAPOS ||
2100        c == OP_ONCE || c == OP_COND)        c == OP_ONCE || c == OP_ONCE_NC ||
2101          c == OP_COND)
2102      {      {
2103      BOOL empty_branch;      BOOL empty_branch;
2104      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 2217  for (code = first_significant_code(code Line 2270  for (code = first_significant_code(code
2270      break;      break;
2271    
2272      case OP_THEN_ARG:      case OP_THEN_ARG:
2273      code += code[1+LINK_SIZE];      code += code[1];
2274      break;      break;
2275    
2276      /* None of the remaining opcodes are required to match a character. */      /* None of the remaining opcodes are required to match a character. */
# Line 2295  I think. Line 2348  I think.
2348  A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.  A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2349  It seems that the appearance of a nested POSIX class supersedes an apparent  It seems that the appearance of a nested POSIX class supersedes an apparent
2350  external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or  external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2351  a digit. Also, unescaped square brackets may also appear as part of class  a digit.
2352  names. For example, [:a[:abc]b:] gives unknown class "[:abc]b:]"in Perl.  
2353    In Perl, unescaped square brackets may also appear as part of class names. For
2354    example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2355    [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2356    seem right at all. PCRE does not allow closing square brackets in POSIX class
2357    names.
2358    
2359  Arguments:  Arguments:
2360    ptr      pointer to the initial [    ptr      pointer to the initial [
# Line 2314  for (++ptr; *ptr != 0; ptr++) Line 2372  for (++ptr; *ptr != 0; ptr++)
2372    {    {
2373    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2374      ptr++;      ptr++;
2375      else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2376    else    else
2377      {      {
2378      if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)      if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
# Line 3086  uschar *class_utf8data_base; Line 3145  uschar *class_utf8data_base;
3145  uschar utf8_char[6];  uschar utf8_char[6];
3146  #else  #else
3147  BOOL utf8 = FALSE;  BOOL utf8 = FALSE;
 uschar *utf8_char = NULL;  
3148  #endif  #endif
3149    
3150  #ifdef PCRE_DEBUG  #ifdef PCRE_DEBUG
# Line 3137  for (;; ptr++) Line 3195  for (;; ptr++)
3195    int subfirstbyte;    int subfirstbyte;
3196    int terminator;    int terminator;
3197    int mclength;    int mclength;
3198      int tempbracount;
3199    uschar mcbuffer[8];    uschar mcbuffer[8];
3200    
3201    /* Get next byte in the pattern */    /* Get next byte in the pattern */
# Line 4835  for (;; ptr++) Line 4894  for (;; ptr++)
4894          uschar *ketcode = code - 1 - LINK_SIZE;          uschar *ketcode = code - 1 - LINK_SIZE;
4895          uschar *bracode = ketcode - GET(ketcode, 1);          uschar *bracode = ketcode - GET(ketcode, 1);
4896    
4897          if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;          if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
4898          if (*bracode == OP_ONCE)              possessive_quantifier) *bracode = OP_BRA;
4899    
4900            if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
4901            *ketcode = OP_KETRMAX + repeat_type;            *ketcode = OP_KETRMAX + repeat_type;
4902          else          else
4903            {            {
# Line 5040  for (;; ptr++) Line 5101  for (;; ptr++)
5101                PUT2INC(code, 0, oc->number);                PUT2INC(code, 0, oc->number);
5102                }                }
5103              *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;              *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
5104    
5105                /* Do not set firstbyte after *ACCEPT */
5106                if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5107              }              }
5108    
5109            /* Handle other cases with/without an argument */            /* Handle other cases with/without an argument */
# Line 5052  for (;; ptr++) Line 5116  for (;; ptr++)
5116                goto FAILED;                goto FAILED;
5117                }                }
5118              *code = verbs[i].op;              *code = verbs[i].op;
5119              if (*code++ == OP_THEN)              if (*code++ == OP_THEN) cd->external_flags |= PCRE_HASTHEN;
               {  
               PUT(code, 0, code - bcptr->current_branch - 1);  
               code += LINK_SIZE;  
               }  
5120              }              }
5121    
5122            else            else
# Line 5067  for (;; ptr++) Line 5127  for (;; ptr++)
5127                goto FAILED;                goto FAILED;
5128                }                }
5129              *code = verbs[i].op_arg;              *code = verbs[i].op_arg;
5130              if (*code++ == OP_THEN_ARG)              if (*code++ == OP_THEN_ARG) cd->external_flags |= PCRE_HASTHEN;
               {  
               PUT(code, 0, code - bcptr->current_branch - 1);  
               code += LINK_SIZE;  
               }  
5131              *code++ = arglen;              *code++ = arglen;
5132              memcpy(code, arg, arglen);              memcpy(code, arg, arglen);
5133              code += arglen;              code += arglen;
# Line 5906  for (;; ptr++) Line 5962  for (;; ptr++)
5962      *code = bravalue;      *code = bravalue;
5963      tempcode = code;      tempcode = code;
5964      tempreqvary = cd->req_varyopt;        /* Save value before bracket */      tempreqvary = cd->req_varyopt;        /* Save value before bracket */
5965        tempbracount = cd->bracount;          /* Save value before bracket */
5966      length_prevgroup = 0;                 /* Initialize for pre-compile phase */      length_prevgroup = 0;                 /* Initialize for pre-compile phase */
5967    
5968      if (!compile_regex(      if (!compile_regex(
# Line 5928  for (;; ptr++) Line 5985  for (;; ptr++)
5985           ))           ))
5986        goto FAILED;        goto FAILED;
5987    
5988        /* If this was an atomic group and there are no capturing groups within it,
5989        generate OP_ONCE_NC instead of OP_ONCE. */
5990    
5991        if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
5992          *code = OP_ONCE_NC;
5993    
5994      if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)      if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
5995        cd->assert_depth -= 1;        cd->assert_depth -= 1;
5996    
5997      /* At the end of compiling, code is still pointing to the start of the      /* At the end of compiling, code is still pointing to the start of the
5998      group, while tempcode has been updated to point past the end of the group      group, while tempcode has been updated to point past the end of the group.
5999      and any option resetting that may follow it. The pattern pointer (ptr)      The pattern pointer (ptr) is on the bracket.
     is on the bracket. */  
6000    
6001      /* If this is a conditional bracket, check that there are no more than      If this is a conditional bracket, check that there are no more than
6002      two branches in the group, or just one if it's a DEFINE group. We do this      two branches in the group, or just one if it's a DEFINE group. We do this
6003      in the real compile phase, not in the pre-pass, where the whole group may      in the real compile phase, not in the pre-pass, where the whole group may
6004      not be available. */      not be available. */
# Line 6335  for (;; ptr++) Line 6397  for (;; ptr++)
6397        else firstbyte = reqbyte = REQ_NONE;        else firstbyte = reqbyte = REQ_NONE;
6398        }        }
6399    
6400      /* firstbyte was previously set; we can set reqbyte only the length is      /* firstbyte was previously set; we can set reqbyte only if the length is
6401      1 or the matching is caseful. */      1 or the matching is caseful. */
6402    
6403      else      else
# Line 6727  do { Line 6789  do {
6789    
6790     /* Other brackets */     /* Other brackets */
6791    
6792     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_ONCE_NC ||
6793                op == OP_COND)
6794       {       {
6795       if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;       if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;
6796       }       }
# Line 6831  do { Line 6894  do {
6894    
6895     /* Other brackets */     /* Other brackets */
6896    
6897     else if (op == OP_ASSERT || op == OP_ONCE)     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_ONCE_NC)
6898       {       {
6899       if (!is_startline(scode, bracket_map, backref_map)) return FALSE;       if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6900       }       }
# Line 6901  do { Line 6964  do {
6964       case OP_SCBRAPOS:       case OP_SCBRAPOS:
6965       case OP_ASSERT:       case OP_ASSERT:
6966       case OP_ONCE:       case OP_ONCE:
6967         case OP_ONCE_NC:
6968       case OP_COND:       case OP_COND:
6969       if ((d = find_firstassertedchar(scode, op == OP_ASSERT)) < 0)       if ((d = find_firstassertedchar(scode, op == OP_ASSERT)) < 0)
6970         return -1;         return -1;
# Line 7282  re->top_bracket = cd->bracount; Line 7346  re->top_bracket = cd->bracount;
7346  re->top_backref = cd->top_backref;  re->top_backref = cd->top_backref;
7347  re->flags = cd->external_flags;  re->flags = cd->external_flags;
7348    
7349  if (cd->had_accept) reqbyte = -1;   /* Must disable after (*ACCEPT) */  if (cd->had_accept) reqbyte = REQ_NONE;   /* Must disable after (*ACCEPT) */
7350    
7351  /* If not reached end of pattern on success, there's an excess bracket. */  /* If not reached end of pattern on success, there's an excess bracket. */
7352    

Legend:
Removed from v.654  
changed lines
  Added in v.745

  ViewVC Help
Powered by ViewVC 1.1.5