/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 533 by ph10, Wed Jun 2 19:02:41 2010 UTC | revision 556 by ph10, Tue Oct 26 11:06:44 2010 UTC
# Line 261  static const int posix_class_maps[] = { Line 261  static const int posix_class_maps[] = {
261    cbit_xdigit,-1,          0              /* xdigit */    cbit_xdigit,-1,          0              /* xdigit */
262  };  };
263    
264  /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class  /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
265  substitutes must be in the order of the names, defined above, and there are  substitutes must be in the order of the names, defined above, and there are
266  both positive and negative cases. NULL means no substitute. */  both positive and negative cases. NULL means no substitute. */
267    
268  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
# Line 272  static const uschar *substitutes[] = { Line 272  static const uschar *substitutes[] = {
272    (uschar *)"\\P{Xsp}",   /* \S */       /* NOTE: Xsp is Perl space */    (uschar *)"\\P{Xsp}",   /* \S */       /* NOTE: Xsp is Perl space */
273    (uschar *)"\\p{Xsp}",   /* \s */    (uschar *)"\\p{Xsp}",   /* \s */
274    (uschar *)"\\P{Xwd}",   /* \W */    (uschar *)"\\P{Xwd}",   /* \W */
275    (uschar *)"\\p{Xwd}"    /* \w */    (uschar *)"\\p{Xwd}"    /* \w */
276  };  };
277    
278  static const uschar *posix_substitutes[] = {  static const uschar *posix_substitutes[] = {
279    (uschar *)"\\p{L}",     /* alpha */    (uschar *)"\\p{L}",     /* alpha */
280    (uschar *)"\\p{Ll}",    /* lower */    (uschar *)"\\p{Ll}",    /* lower */
281    (uschar *)"\\p{Lu}",    /* upper */    (uschar *)"\\p{Lu}",    /* upper */
282    (uschar *)"\\p{Xan}",   /* alnum */    (uschar *)"\\p{Xan}",   /* alnum */
283    NULL,                   /* ascii */    NULL,                   /* ascii */
284    (uschar *)"\\h",        /* blank */    (uschar *)"\\h",        /* blank */
285    NULL,                   /* cntrl */    NULL,                   /* cntrl */
# Line 289  static const uschar *posix_substitutes[] Line 289  static const uschar *posix_substitutes[]
289    NULL,                   /* punct */    NULL,                   /* punct */
290    (uschar *)"\\p{Xps}",   /* space */    /* NOTE: Xps is POSIX space */    (uschar *)"\\p{Xps}",   /* space */    /* NOTE: Xps is POSIX space */
291    (uschar *)"\\p{Xwd}",   /* word */    (uschar *)"\\p{Xwd}",   /* word */
292    NULL,                   /* xdigit */    NULL,                   /* xdigit */
293    /* Negated cases */    /* Negated cases */
294    (uschar *)"\\P{L}",     /* ^alpha */    (uschar *)"\\P{L}",     /* ^alpha */
295    (uschar *)"\\P{Ll}",    /* ^lower */    (uschar *)"\\P{Ll}",    /* ^lower */
296    (uschar *)"\\P{Lu}",    /* ^upper */    (uschar *)"\\P{Lu}",    /* ^upper */
297    (uschar *)"\\P{Xan}",   /* ^alnum */    (uschar *)"\\P{Xan}",   /* ^alnum */
298    NULL,                   /* ^ascii */    NULL,                   /* ^ascii */
299    (uschar *)"\\H",        /* ^blank */    (uschar *)"\\H",        /* ^blank */
300    NULL,                   /* ^cntrl */    NULL,                   /* ^cntrl */
# Line 304  static const uschar *posix_substitutes[] Line 304  static const uschar *posix_substitutes[]
304    NULL,                   /* ^punct */    NULL,                   /* ^punct */
305    (uschar *)"\\P{Xps}",   /* ^space */   /* NOTE: Xps is POSIX space */    (uschar *)"\\P{Xps}",   /* ^space */   /* NOTE: Xps is POSIX space */
306    (uschar *)"\\P{Xwd}",   /* ^word */    (uschar *)"\\P{Xwd}",   /* ^word */
307    NULL                    /* ^xdigit */    NULL                    /* ^xdigit */
308  };  };
309  #define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))  #define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))
310  #endif  #endif
311    
312  #define STRING(a)  # a  #define STRING(a)  # a
313  #define XSTRING(s) STRING(s)  #define XSTRING(s) STRING(s)
# Line 407  static const char error_texts[] = Line 407  static const char error_texts[] =
407    /* 65 */    /* 65 */
408    "different names for subpatterns of the same number are not allowed\0"    "different names for subpatterns of the same number are not allowed\0"
409    "(*MARK) must have an argument\0"    "(*MARK) must have an argument\0"
410    "this version of PCRE is not compiled with PCRE_UCP support\0"    "this version of PCRE is not compiled with PCRE_UCP support\0"
411    ;    ;
412    
413  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 1110  Arguments: Line 1110  Arguments:
1110    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1111    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1112    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1113      utf8         TRUE if we are in UTF-8 mode
1114    count        pointer to the current capturing subpattern number (updated)    count        pointer to the current capturing subpattern number (updated)
1115    
1116  Returns:       the number of the named subpattern, or -1 if not found  Returns:       the number of the named subpattern, or -1 if not found
# Line 1117  Returns:       the number of the named s Line 1118  Returns:       the number of the named s
1118    
1119  static int  static int
1120  find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,  find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1121    BOOL xmode, int *count)    BOOL xmode, BOOL utf8, int *count)
1122  {  {
1123  uschar *ptr = *ptrptr;  uschar *ptr = *ptrptr;
1124  int start_count = *count;  int start_count = *count;
# Line 1129  dealing with. The very first call may no Line 1130  dealing with. The very first call may no
1130    
1131  if (ptr[0] == CHAR_LEFT_PARENTHESIS)  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1132    {    {
1133    if (ptr[1] == CHAR_QUESTION_MARK &&    /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1134        ptr[2] == CHAR_VERTICAL_LINE)  
1135      if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1136    
1137      /* Handle a normal, unnamed capturing parenthesis. */
1138    
1139      else if (ptr[1] != CHAR_QUESTION_MARK)
1140        {
1141        *count += 1;
1142        if (name == NULL && *count == lorn) return *count;
1143        ptr++;
1144        }
1145    
1146      /* All cases now have (? at the start. Remember when we are in a group
1147      where the parenthesis numbers are duplicated. */
1148    
1149      else if (ptr[2] == CHAR_VERTICAL_LINE)
1150      {      {
1151      ptr += 3;      ptr += 3;
1152      dup_parens = TRUE;      dup_parens = TRUE;
1153      }      }
1154    
1155    /* Handle a normal, unnamed capturing parenthesis */    /* Handle comments; all characters are allowed until a ket is reached. */
1156    
1157    else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)    else if (ptr[2] == CHAR_NUMBER_SIGN)
1158      {      {
1159      *count += 1;      for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1160      if (name == NULL && *count == lorn) return *count;      goto FAIL_EXIT;
     ptr++;  
1161      }      }
1162    
1163    /* Handle a condition. If it is an assertion, just carry on so that it    /* Handle a condition. If it is an assertion, just carry on so that it
1164    is processed as normal. If not, skip to the closing parenthesis of the    is processed as normal. If not, skip to the closing parenthesis of the
1165    condition (there can't be any nested parens. */    condition (there can't be any nested parens). */
1166    
1167    else if (ptr[2] == CHAR_LEFT_PARENTHESIS)    else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1168      {      {
# Line 1159  if (ptr[0] == CHAR_LEFT_PARENTHESIS) Line 1174  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1174        }        }
1175      }      }
1176    
1177    /* We have either (? or (* and not a condition */    /* Start with (? but not a condition. */
1178    
1179    else    else
1180      {      {
# Line 1264  for (; *ptr != 0; ptr++) Line 1279  for (; *ptr != 0; ptr++)
1279    
1280    if (xmode && *ptr == CHAR_NUMBER_SIGN)    if (xmode && *ptr == CHAR_NUMBER_SIGN)
1281      {      {
1282      while (*(++ptr) != 0 && *ptr != CHAR_NL) {};      ptr++;
1283        while (*ptr != 0)
1284          {
1285          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1286          ptr++;
1287    #ifdef SUPPORT_UTF8
1288          if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
1289    #endif
1290          }
1291      if (*ptr == 0) goto FAIL_EXIT;      if (*ptr == 0) goto FAIL_EXIT;
1292      continue;      continue;
1293      }      }
# Line 1273  for (; *ptr != 0; ptr++) Line 1296  for (; *ptr != 0; ptr++)
1296    
1297    if (*ptr == CHAR_LEFT_PARENTHESIS)    if (*ptr == CHAR_LEFT_PARENTHESIS)
1298      {      {
1299      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
1300      if (rc > 0) return rc;      if (rc > 0) return rc;
1301      if (*ptr == 0) goto FAIL_EXIT;      if (*ptr == 0) goto FAIL_EXIT;
1302      }      }
# Line 1281  for (; *ptr != 0; ptr++) Line 1304  for (; *ptr != 0; ptr++)
1304    else if (*ptr == CHAR_RIGHT_PARENTHESIS)    else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1305      {      {
1306      if (dup_parens && *count < hwm_count) *count = hwm_count;      if (dup_parens && *count < hwm_count) *count = hwm_count;
1307      *ptrptr = ptr;      goto FAIL_EXIT;
     return -1;  
1308      }      }
1309    
1310    else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)    else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
# Line 1320  Arguments: Line 1342  Arguments:
1342    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1343    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1344    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1345      utf8         TRUE if we are in UTF-8 mode
1346    
1347  Returns:       the number of the found subpattern, or -1 if not found  Returns:       the number of the found subpattern, or -1 if not found
1348  */  */
1349    
1350  static int  static int
1351  find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)  find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,
1352      BOOL utf8)
1353  {  {
1354  uschar *ptr = (uschar *)cd->start_pattern;  uschar *ptr = (uschar *)cd->start_pattern;
1355  int count = 0;  int count = 0;
# Line 1338  matching closing parens. That is why we Line 1362  matching closing parens. That is why we
1362    
1363  for (;;)  for (;;)
1364    {    {
1365    rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);    rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
1366    if (rc > 0 || *ptr++ == 0) break;    if (rc > 0 || *ptr++ == 0) break;
1367    }    }
1368    
# Line 1711  for (;;) Line 1735  for (;;)
1735        case OP_MARK:        case OP_MARK:
1736        case OP_PRUNE_ARG:        case OP_PRUNE_ARG:
1737        case OP_SKIP_ARG:        case OP_SKIP_ARG:
       case OP_THEN_ARG:  
1738        code += code[1];        code += code[1];
1739        break;        break;
1740    
1741          case OP_THEN_ARG:
1742          code += code[1+LINK_SIZE];
1743          break;
1744        }        }
1745    
1746      /* Add in the fixed length from the table */      /* Add in the fixed length from the table */
# Line 1814  for (;;) Line 1841  for (;;)
1841        case OP_MARK:        case OP_MARK:
1842        case OP_PRUNE_ARG:        case OP_PRUNE_ARG:
1843        case OP_SKIP_ARG:        case OP_SKIP_ARG:
       case OP_THEN_ARG:  
1844        code += code[1];        code += code[1];
1845        break;        break;
1846    
1847          case OP_THEN_ARG:
1848          code += code[1+LINK_SIZE];
1849          break;
1850        }        }
1851    
1852      /* Add in the fixed length from the table */      /* Add in the fixed length from the table */
# Line 2092  for (code = first_significant_code(code Line 2122  for (code = first_significant_code(code
2122      case OP_MARK:      case OP_MARK:
2123      case OP_PRUNE_ARG:      case OP_PRUNE_ARG:
2124      case OP_SKIP_ARG:      case OP_SKIP_ARG:
     case OP_THEN_ARG:  
2125      code += code[1];      code += code[1];
2126      break;      break;
2127    
2128        case OP_THEN_ARG:
2129        code += code[1+LINK_SIZE];
2130        break;
2131    
2132      /* None of the remaining opcodes are required to match a character. */      /* None of the remaining opcodes are required to match a character. */
2133    
2134      default:      default:
# Line 2407  Arguments: Line 2440  Arguments:
2440    ptype        the property type    ptype        the property type
2441    pdata        the data for the type    pdata        the data for the type
2442    negated      TRUE if it's a negated property (\P or \p{^)    negated      TRUE if it's a negated property (\P or \p{^)
2443    
2444  Returns:       TRUE if auto-possessifying is OK  Returns:       TRUE if auto-possessifying is OK
2445  */  */
2446    
2447  static BOOL  static BOOL
2448  check_char_prop(int c, int ptype, int pdata, BOOL negated)  check_char_prop(int c, int ptype, int pdata, BOOL negated)
# Line 2453  switch(ptype) Line 2486  switch(ptype)
2486            _pcre_ucp_gentype[prop->chartype] == ucp_N ||            _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2487            c == CHAR_UNDERSCORE) == negated;            c == CHAR_UNDERSCORE) == negated;
2488    }    }
2489  return FALSE;  return FALSE;
2490  }  }
2491  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2492    
# Line 2478  Returns:        TRUE if possessifying is Line 2511  Returns:        TRUE if possessifying is
2511  */  */
2512    
2513  static BOOL  static BOOL
2514  check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,  check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,
2515    int options, compile_data *cd)    int options, compile_data *cd)
2516  {  {
2517  int c, next;  int c, next;
# Line 2493  if ((options & PCRE_EXTENDED) != 0) Line 2526  if ((options & PCRE_EXTENDED) != 0)
2526      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2527      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
2528        {        {
2529        while (*(++ptr) != 0)        ptr++;
2530          while (*ptr != 0)
2531            {
2532          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2533            ptr++;
2534    #ifdef SUPPORT_UTF8
2535            if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2536    #endif
2537            }
2538        }        }
2539      else break;      else break;
2540      }      }
# Line 2530  if ((options & PCRE_EXTENDED) != 0) Line 2570  if ((options & PCRE_EXTENDED) != 0)
2570      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2571      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
2572        {        {
2573        while (*(++ptr) != 0)        ptr++;
2574          while (*ptr != 0)
2575            {
2576          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2577            ptr++;
2578    #ifdef SUPPORT_UTF8
2579            if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2580    #endif
2581            }
2582        }        }
2583      else break;      else break;
2584      }      }
# Line 2549  the next item is a character. */ Line 2596  the next item is a character. */
2596  if (next >= 0) switch(op_code)  if (next >= 0) switch(op_code)
2597    {    {
2598    case OP_CHAR:    case OP_CHAR:
2599  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2600    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
2601  #else  #else
2602    c = *previous;    c = *previous;
2603  #endif  #endif
2604    return c != next;    return c != next;
2605    
2606    /* For CHARNC (caseless character) we must check the other case. If we have    /* For CHARNC (caseless character) we must check the other case. If we have
2607    Unicode property support, we can use it to test the other case of    Unicode property support, we can use it to test the other case of
2608    high-valued characters. */    high-valued characters. */
2609    
2610    case OP_CHARNC:    case OP_CHARNC:
2611  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2612    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
2613  #else  #else
2614    c = *previous;    c = *previous;
2615  #endif  #endif
2616    if (c == next) return FALSE;    if (c == next) return FALSE;
2617  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2618    if (utf8)    if (utf8)
# Line 2603  if (next >= 0) switch(op_code) Line 2650  if (next >= 0) switch(op_code)
2650    else    else
2651  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF8 */
2652    return (c == cd->fcc[next]);  /* Non-UTF-8 mode */    return (c == cd->fcc[next]);  /* Non-UTF-8 mode */
2653    
2654    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
2655    When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */    When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
2656    
2657    case OP_DIGIT:    case OP_DIGIT:
2658    return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;    return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
# Line 2673  if (next >= 0) switch(op_code) Line 2720  if (next >= 0) switch(op_code)
2720  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2721    case OP_PROP:    case OP_PROP:
2722    return check_char_prop(next, previous[0], previous[1], FALSE);    return check_char_prop(next, previous[0], previous[1], FALSE);
2723    
2724    case OP_NOTPROP:    case OP_NOTPROP:
2725    return check_char_prop(next, previous[0], previous[1], TRUE);    return check_char_prop(next, previous[0], previous[1], TRUE);
2726  #endif  #endif
# Line 2683  if (next >= 0) switch(op_code) Line 2730  if (next >= 0) switch(op_code)
2730    }    }
2731    
2732    
2733  /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP  /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
2734  is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are  is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
2735  generated only when PCRE_UCP is *not* set, that is, when only ASCII  generated only when PCRE_UCP is *not* set, that is, when only ASCII
2736  characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are  characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
2737  replaced by OP_PROP codes when PCRE_UCP is set. */  replaced by OP_PROP codes when PCRE_UCP is set. */
2738    
2739  switch(op_code)  switch(op_code)
2740    {    {
2741    case OP_CHAR:    case OP_CHAR:
2742    case OP_CHARNC:    case OP_CHARNC:
2743  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2744    GETCHARTEST(c, previous);    GETCHARTEST(c, previous);
2745  #else  #else
2746    c = *previous;    c = *previous;
2747  #endif  #endif
2748    switch(-next)    switch(-next)
2749      {      {
2750      case ESC_d:      case ESC_d:
# Line 2761  switch(op_code) Line 2808  switch(op_code)
2808        default:        default:
2809        return -next == ESC_v;        return -next == ESC_v;
2810        }        }
2811    
2812      /* When PCRE_UCP is set, these values get generated for \d etc. Find      /* When PCRE_UCP is set, these values get generated for \d etc. Find
2813      their substitutions and process them. The result will always be either      their substitutions and process them. The result will always be either
2814      -ESC_p or -ESC_P. Then fall through to process those values. */      -ESC_p or -ESC_P. Then fall through to process those values. */
2815    
2816  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2817      case ESC_du:      case ESC_du:
2818      case ESC_DU:      case ESC_DU:
# Line 2780  switch(op_code) Line 2827  switch(op_code)
2827        if (temperrorcode != 0) return FALSE;        if (temperrorcode != 0) return FALSE;
2828        ptr++;    /* For compatibility */        ptr++;    /* For compatibility */
2829        }        }
2830      /* Fall through */      /* Fall through */
2831    
2832      case ESC_p:      case ESC_p:
2833      case ESC_P:      case ESC_P:
2834        {        {
2835        int ptype, pdata, errorcodeptr;        int ptype, pdata, errorcodeptr;
2836        BOOL negated;        BOOL negated;
2837    
2838        ptr--;      /* Make ptr point at the p or P */        ptr--;      /* Make ptr point at the p or P */
2839        ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);        ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
2840        if (ptype < 0) return FALSE;        if (ptype < 0) return FALSE;
2841        ptr++;      /* Point past the final curly ket */        ptr++;      /* Point past the final curly ket */
2842    
2843        /* If the property item is optional, we have to give up. (When generated        /* If the property item is optional, we have to give up. (When generated
2844        from \d etc by PCRE_UCP, this test will have been applied much earlier,        from \d etc by PCRE_UCP, this test will have been applied much earlier,
2845        to the original \d etc. At this point, ptr will point to a zero byte. */        to the original \d etc. At this point, ptr will point to a zero byte. */
2846    
2847        if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||        if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2848          strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)          strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2849            return FALSE;            return FALSE;
2850    
2851        /* Do the property check. */        /* Do the property check. */
2852    
2853        return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);        return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
2854        }        }
2855  #endif  #endif
2856    
2857      default:      default:
2858      return FALSE;      return FALSE;
2859      }      }
2860    
2861    /* In principle, support for Unicode properties should be integrated here as    /* In principle, support for Unicode properties should be integrated here as
2862    well. It means re-organizing the above code so as to get hold of the property    well. It means re-organizing the above code so as to get hold of the property
2863    values before switching on the op-code. However, I wonder how many patterns    values before switching on the op-code. However, I wonder how many patterns
2864    combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,    combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
2865    these op-codes are never generated.) */    these op-codes are never generated.) */
2866    
2867    case OP_DIGIT:    case OP_DIGIT:
2868    return next == -ESC_D || next == -ESC_s || next == -ESC_W ||    return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
# Line 2831  switch(op_code) Line 2878  switch(op_code)
2878    return next == -ESC_s || next == -ESC_h || next == -ESC_v;    return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2879    
2880    case OP_HSPACE:    case OP_HSPACE:
2881    return next == -ESC_S || next == -ESC_H || next == -ESC_d ||    return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
2882           next == -ESC_w || next == -ESC_v || next == -ESC_R;           next == -ESC_w || next == -ESC_v || next == -ESC_R;
2883    
2884    case OP_NOT_HSPACE:    case OP_NOT_HSPACE:
2885    return next == -ESC_h;    return next == -ESC_h;
2886    
2887    /* Can't have \S in here because VT matches \S (Perl anomaly) */    /* Can't have \S in here because VT matches \S (Perl anomaly) */
2888    case OP_ANYNL:    case OP_ANYNL:
2889    case OP_VSPACE:    case OP_VSPACE:
2890    return next == -ESC_V || next == -ESC_d || next == -ESC_w;    return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2891    
# Line 2846  switch(op_code) Line 2893  switch(op_code)
2893    return next == -ESC_v || next == -ESC_R;    return next == -ESC_v || next == -ESC_R;
2894    
2895    case OP_WORDCHAR:    case OP_WORDCHAR:
2896    return next == -ESC_W || next == -ESC_s || next == -ESC_h ||    return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
2897           next == -ESC_v || next == -ESC_R;           next == -ESC_v || next == -ESC_R;
2898    
2899    case OP_NOT_WORDCHAR:    case OP_NOT_WORDCHAR:
# Line 2982  for (;; ptr++) Line 3029  for (;; ptr++)
3029    
3030    c = *ptr;    c = *ptr;
3031    
3032    /* If we are at the end of a nested substitution, revert to the outer level    /* If we are at the end of a nested substitution, revert to the outer level
3033    string. Nesting only happens one level deep. */    string. Nesting only happens one level deep. */
3034    
3035    if (c == 0 && nestptr != NULL)    if (c == 0 && nestptr != NULL)
# Line 3104  for (;; ptr++) Line 3151  for (;; ptr++)
3151      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
3152      if (c == CHAR_NUMBER_SIGN)      if (c == CHAR_NUMBER_SIGN)
3153        {        {
3154        while (*(++ptr) != 0)        ptr++;
3155          while (*ptr != 0)
3156          {          {
3157          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3158            ptr++;
3159    #ifdef SUPPORT_UTF8
3160            if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
3161    #endif
3162          }          }
3163        if (*ptr != 0) continue;        if (*ptr != 0) continue;
3164    
# Line 3289  for (;; ptr++) Line 3341  for (;; ptr++)
3341          {                           /* Braces are required because the */          {                           /* Braces are required because the */
3342          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
3343          }          }
3344    
3345        /* In the pre-compile phase, accumulate the length of any UTF-8 extra        /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3346        data and reset the pointer. This is so that very large classes that        data and reset the pointer. This is so that very large classes that
3347        contain a zillion UTF-8 characters no longer overwrite the work space        contain a zillion UTF-8 characters no longer overwrite the work space
# Line 3358  for (;; ptr++) Line 3410  for (;; ptr++)
3410    
3411          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3412            posix_class = 0;            posix_class = 0;
3413    
3414          /* When PCRE_UCP is set, some of the POSIX classes are converted to          /* When PCRE_UCP is set, some of the POSIX classes are converted to
3415          different escape sequences that use Unicode properties. */          different escape sequences that use Unicode properties. */
3416    
3417  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3418          if ((options & PCRE_UCP) != 0)          if ((options & PCRE_UCP) != 0)
3419            {            {
3420            int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);            int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
3421            if (posix_substitutes[pc] != NULL)            if (posix_substitutes[pc] != NULL)
3422              {              {
3423              nestptr = tempptr + 1;              nestptr = tempptr + 1;
3424              ptr = posix_substitutes[pc] - 1;              ptr = posix_substitutes[pc] - 1;
3425              continue;              continue;
3426              }              }
3427            }            }
3428  #endif  #endif
3429          /* In the non-UCP case, we build the bit map for the POSIX class in a          /* In the non-UCP case, we build the bit map for the POSIX class in a
3430          chunk of local store because we may be adding and subtracting from it,          chunk of local store because we may be adding and subtracting from it,
3431          and we don't want to subtract bits that may be in the main map already.          and we don't want to subtract bits that may be in the main map already.
# Line 3460  for (;; ptr++) Line 3512  for (;; ptr++)
3512              case ESC_SU:              case ESC_SU:
3513              nestptr = ptr;              nestptr = ptr;
3514              ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */              ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
3515              class_charcount -= 2;                /* Undo! */              class_charcount -= 2;                /* Undo! */
3516              continue;              continue;
3517  #endif  #endif
3518              case ESC_d:              case ESC_d:
# Line 3481  for (;; ptr++) Line 3533  for (;; ptr++)
3533              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3534              continue;              continue;
3535    
3536                /* Perl 5.004 onwards omits VT from \s, but we must preserve it
3537                if it was previously set by something earlier in the character
3538                class. */
3539    
3540              case ESC_s:              case ESC_s:
3541              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];              classbits[0] |= cbits[cbit_space];
3542              classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= cbits[cbit_space+1] & ~0x08;
3543                for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3544              continue;              continue;
3545    
3546              case ESC_S:              case ESC_S:
# Line 3911  for (;; ptr++) Line 3968  for (;; ptr++)
3968      can cause firstbyte to be set. Otherwise, there can be no first char if      can cause firstbyte to be set. Otherwise, there can be no first char if
3969      this item is first, whatever repeat count may follow. In the case of      this item is first, whatever repeat count may follow. In the case of
3970      reqbyte, save the previous value for reinstating. */      reqbyte, save the previous value for reinstating. */
3971    
3972  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3973      if (class_charcount == 1 && !class_utf8 &&      if (class_charcount == 1 && !class_utf8 &&
3974        (!utf8 || !negate_class || class_lastchar < 128))        (!utf8 || !negate_class || class_lastchar < 128))
# Line 3991  for (;; ptr++) Line 4048  for (;; ptr++)
4048        }        }
4049  #endif  #endif
4050    
4051      /* If there are no characters > 255, or they are all to be included or      /* If there are no characters > 255, or they are all to be included or
4052      excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the      excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
4053      whole class was negated and whether there were negative specials such as \S      whole class was negated and whether there were negative specials such as \S
4054      (non-UCP) in the class. Then copy the 32-byte map into the code vector,      (non-UCP) in the class. Then copy the 32-byte map into the code vector,
# Line 4795  for (;; ptr++) Line 4852  for (;; ptr++)
4852                *errorcodeptr = ERR66;                *errorcodeptr = ERR66;
4853                goto FAILED;                goto FAILED;
4854                }                }
4855              *code++ = verbs[i].op;              *code = verbs[i].op;
4856                if (*code++ == OP_THEN)
4857                  {
4858                  PUT(code, 0, code - bcptr->current_branch - 1);
4859                  code += LINK_SIZE;
4860                  }
4861              }              }
4862    
4863            else            else
# Line 4805  for (;; ptr++) Line 4867  for (;; ptr++)
4867                *errorcodeptr = ERR59;                *errorcodeptr = ERR59;
4868                goto FAILED;                goto FAILED;
4869                }                }
4870              *code++ = verbs[i].op_arg;              *code = verbs[i].op_arg;
4871                if (*code++ == OP_THEN_ARG)
4872                  {
4873                  PUT(code, 0, code - bcptr->current_branch - 1);
4874                  code += LINK_SIZE;
4875                  }
4876              *code++ = arglen;              *code++ = arglen;
4877              memcpy(code, arg, arglen);              memcpy(code, arg, arglen);
4878              code += arglen;              code += arglen;
# Line 4999  for (;; ptr++) Line 5066  for (;; ptr++)
5066          /* Search the pattern for a forward reference */          /* Search the pattern for a forward reference */
5067    
5068          else if ((i = find_parens(cd, name, namelen,          else if ((i = find_parens(cd, name, namelen,
5069                          (options & PCRE_EXTENDED) != 0)) > 0)                          (options & PCRE_EXTENDED) != 0, utf8)) > 0)
5070            {            {
5071            PUT2(code, 2+LINK_SIZE, i);            PUT2(code, 2+LINK_SIZE, i);
5072            code[1+LINK_SIZE]++;            code[1+LINK_SIZE]++;
# Line 5345  for (;; ptr++) Line 5412  for (;; ptr++)
5412              }              }
5413            else if ((recno =                /* Forward back reference */            else if ((recno =                /* Forward back reference */
5414                      find_parens(cd, name, namelen,                      find_parens(cd, name, namelen,
5415                        (options & PCRE_EXTENDED) != 0)) <= 0)                        (options & PCRE_EXTENDED) != 0, utf8)) <= 0)
5416              {              {
5417              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
5418              goto FAILED;              goto FAILED;
# Line 5456  for (;; ptr++) Line 5523  for (;; ptr++)
5523              if (called == NULL)              if (called == NULL)
5524                {                {
5525                if (find_parens(cd, NULL, recno,                if (find_parens(cd, NULL, recno,
5526                      (options & PCRE_EXTENDED) != 0) < 0)                      (options & PCRE_EXTENDED) != 0, utf8) < 0)
5527                  {                  {
5528                  *errorcodeptr = ERR15;                  *errorcodeptr = ERR15;
5529                  goto FAILED;                  goto FAILED;
# Line 5795  for (;; ptr++) Line 5862  for (;; ptr++)
5862    
5863      /* ===================================================================*/      /* ===================================================================*/
5864      /* Handle metasequences introduced by \. For ones like \d, the ESC_ values      /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5865      are arranged to be the negation of the corresponding OP_values in the      are arranged to be the negation of the corresponding OP_values in the
5866      default case when PCRE_UCP is not set. For the back references, the values      default case when PCRE_UCP is not set. For the back references, the values
5867      are ESC_REF plus the reference number. Only back references and those types      are ESC_REF plus the reference number. Only back references and those types
5868      that consume a character may be repeated. We can test for values between      that consume a character may be repeated. We can test for values between
# Line 5973  for (;; ptr++) Line 6040  for (;; ptr++)
6040            ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */            ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
6041            }            }
6042          else          else
6043  #endif  #endif
6044            {            {
6045            previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;            previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
6046            *code++ = -c;            *code++ = -c;
6047            }            }
6048          }          }
6049        continue;        continue;
6050        }        }
# Line 6809  while (ptr[skipatstart] == CHAR_LEFT_PAR Line 6876  while (ptr[skipatstart] == CHAR_LEFT_PAR
6876      options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;      options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
6877    else break;    else break;
6878    }    }
6879    
6880  utf8 = (options & PCRE_UTF8) != 0;  utf8 = (options & PCRE_UTF8) != 0;
6881    
6882  /* Can't support UTF8 unless PCRE has been compiled to include the code. */  /* Can't support UTF8 unless PCRE has been compiled to include the code. */
# Line 6835  if (utf8) Line 6902  if (utf8)
6902  if ((options & PCRE_UCP) != 0)  if ((options & PCRE_UCP) != 0)
6903    {    {
6904    errorcode = ERR67;    errorcode = ERR67;
6905    goto PCRE_EARLY_ERROR_RETURN;    goto PCRE_EARLY_ERROR_RETURN;
6906    }    }
6907  #endif  #endif
6908    
6909  /* Check validity of \R options. */  /* Check validity of \R options. */

Legend:
Removed from v.533  
changed lines
  Added in v.556

  ViewVC Help
Powered by ViewVC 1.1.5