/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 978 by ph10, Sun Jun 17 16:55:07 2012 UTC revision 1045 by ph10, Sun Sep 23 16:50:00 2012 UTC
# Line 68  COMPILE_PCREx macro will already be appr Line 68  COMPILE_PCREx macro will already be appr
68    
69  /* Macro for setting individual bits in class bitmaps. */  /* Macro for setting individual bits in class bitmaps. */
70    
71  #define SETBIT(a,b) a[b/8] |= (1 << (b%8))  #define SETBIT(a,b) a[(b)/8] |= (1 << ((b)&7))
72    
73  /* Maximum length value to check against when making sure that the integer that  /* Maximum length value to check against when making sure that the integer that
74  holds the compiled pattern length does not overflow. We make it a bit less than  holds the compiled pattern length does not overflow. We make it a bit less than
# Line 77  to check them every time. */ Line 77  to check them every time. */
77    
78  #define OFLOW_MAX (INT_MAX - 20)  #define OFLOW_MAX (INT_MAX - 20)
79    
80    /* Definitions to allow mutual recursion */
81    
82    static int
83      add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
84        const pcre_uint32 *, unsigned int);
85    
86    static BOOL
87      compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL,
88        int, int, int *, int *, branch_chain *, compile_data *, int *);
89    
90    
91    
92  /*************************************************  /*************************************************
93  *      Code parameters and static tables         *  *      Code parameters and static tables         *
# Line 631  static const pcre_uint8 ebcdic_chartab[] Line 642  static const pcre_uint8 ebcdic_chartab[]
642  #endif  #endif
643    
644    
 /* Definition to allow mutual recursion */  
   
 static BOOL  
   compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,  
     int *, int *, branch_chain *, compile_data *, int *);  
   
645    
646    
647  /*************************************************  /*************************************************
# Line 789  else if ((i = escapes[c - CHAR_0]) != 0) Line 794  else if ((i = escapes[c - CHAR_0]) != 0)
794    
795  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
796  /* Not alphanumeric */  /* Not alphanumeric */
797  else if (c < 'a' || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}  else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
798  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
799  #endif  #endif
800    
# Line 832  else Line 837  else
837            c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));            c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
838  #endif  #endif
839            }            }
840    
841  #ifdef COMPILE_PCRE8  #ifdef COMPILE_PCRE8
842          if (c > (utf ? 0x10ffff : 0xff))          if (c > (utf ? 0x10ffff : 0xff))
843  #else  #else
# Line 841  else Line 846  else
846  #endif  #endif
847  #endif  #endif
848            {            {
849            *errorcodeptr = ERR76;            *errorcodeptr = ERR76;
850            }            }
851          else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;          else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
852          }          }
# Line 2871  PUT(previous_callout, 2 + LINK_SIZE, len Line 2876  PUT(previous_callout, 2 + LINK_SIZE, len
2876  *************************************************/  *************************************************/
2877    
2878  /* This function is passed the start and end of a class range, in UTF-8 mode  /* This function is passed the start and end of a class range, in UTF-8 mode
2879  with UCP support. It searches up the characters, looking for internal ranges of  with UCP support. It searches up the characters, looking for ranges of
2880  characters in the "other" case. Each call returns the next one, updating the  characters in the "other" case. Each call returns the next one, updating the
2881  start address.  start address. A character with multiple other cases is returned on its own
2882    with a special return value.
2883    
2884  Arguments:  Arguments:
2885    cptr        points to starting character value; updated    cptr        points to starting character value; updated
# Line 2881  Arguments: Line 2887  Arguments:
2887    ocptr       where to put start of othercase range    ocptr       where to put start of othercase range
2888    odptr       where to put end of othercase range    odptr       where to put end of othercase range
2889    
2890  Yield:        TRUE when range returned; FALSE when no more  Yield:        -1 when no more
2891                   0 when a range is returned
2892                  >0 the CASESET offset for char with multiple other cases
2893                    in this case, ocptr contains the original
2894  */  */
2895    
2896  static BOOL  static int
2897  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2898    unsigned int *odptr)    unsigned int *odptr)
2899  {  {
2900  unsigned int c, othercase, next;  unsigned int c, othercase, next;
2901    int co;
2902    
2903    /* Find the first character that has an other case. If it has multiple other
2904    cases, return its case offset value. */
2905    
2906  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
2907    { if ((othercase = UCD_OTHERCASE(c)) != c) break; }    {
2908      if ((co = UCD_CASESET(c)) != 0)
2909        {
2910        *ocptr = c++;   /* Character that has the set */
2911        *cptr = c;      /* Rest of input range */
2912        return co;
2913        }
2914      if ((othercase = UCD_OTHERCASE(c)) != c) break;
2915      }
2916    
2917  if (c > d) return FALSE;  if (c > d) return -1;  /* Reached end of range */
2918    
2919  *ocptr = othercase;  *ocptr = othercase;
2920  next = othercase + 1;  next = othercase + 1;
# Line 2904  for (++c; c <= d; c++) Line 2925  for (++c; c <= d; c++)
2925    next++;    next++;
2926    }    }
2927    
2928  *odptr = next - 1;  *odptr = next - 1;     /* End of othercase range */
2929  *cptr = c;  *cptr = c;             /* Rest of input range */
2930    return 0;
 return TRUE;  
2931  }  }
2932    
2933    
# Line 3168  if (next >= 0) switch(op_code) Line 3188  if (next >= 0) switch(op_code)
3188    case OP_NOT_HSPACE:    case OP_NOT_HSPACE:
3189    switch(next)    switch(next)
3190      {      {
3191      case 0x09:      HSPACE_CASES:
     case 0x20:  
     case 0xa0:  
     case 0x1680:  
     case 0x180e:  
     case 0x2000:  
     case 0x2001:  
     case 0x2002:  
     case 0x2003:  
     case 0x2004:  
     case 0x2005:  
     case 0x2006:  
     case 0x2007:  
     case 0x2008:  
     case 0x2009:  
     case 0x200A:  
     case 0x202f:  
     case 0x205f:  
     case 0x3000:  
3192      return op_code == OP_NOT_HSPACE;      return op_code == OP_NOT_HSPACE;
3193    
3194      default:      default:
3195      return op_code != OP_NOT_HSPACE;      return op_code != OP_NOT_HSPACE;
3196      }      }
# Line 3197  if (next >= 0) switch(op_code) Line 3200  if (next >= 0) switch(op_code)
3200    case OP_NOT_VSPACE:    case OP_NOT_VSPACE:
3201    switch(next)    switch(next)
3202      {      {
3203      case 0x0a:      VSPACE_CASES:
     case 0x0b:  
     case 0x0c:  
     case 0x0d:  
     case 0x85:  
     case 0x2028:  
     case 0x2029:  
3204      return op_code == OP_NOT_VSPACE;      return op_code == OP_NOT_VSPACE;
3205    
3206      default:      default:
3207      return op_code != OP_NOT_VSPACE;      return op_code != OP_NOT_VSPACE;
3208      }      }
# Line 3261  switch(op_code) Line 3259  switch(op_code)
3259      case ESC_H:      case ESC_H:
3260      switch(c)      switch(c)
3261        {        {
3262        case 0x09:        HSPACE_CASES:
       case 0x20:  
       case 0xa0:  
       case 0x1680:  
       case 0x180e:  
       case 0x2000:  
       case 0x2001:  
       case 0x2002:  
       case 0x2003:  
       case 0x2004:  
       case 0x2005:  
       case 0x2006:  
       case 0x2007:  
       case 0x2008:  
       case 0x2009:  
       case 0x200A:  
       case 0x202f:  
       case 0x205f:  
       case 0x3000:  
3263        return -next != ESC_h;        return -next != ESC_h;
3264    
3265        default:        default:
3266        return -next == ESC_h;        return -next == ESC_h;
3267        }        }
# Line 3289  switch(op_code) Line 3270  switch(op_code)
3270      case ESC_V:      case ESC_V:
3271      switch(c)      switch(c)
3272        {        {
3273        case 0x0a:        VSPACE_CASES:
       case 0x0b:  
       case 0x0c:  
       case 0x0d:  
       case 0x85:  
       case 0x2028:  
       case 0x2029:  
3274        return -next != ESC_v;        return -next != ESC_v;
3275    
3276        default:        default:
3277        return -next == ESC_v;        return -next == ESC_v;
3278        }        }
# Line 3401  switch(op_code) Line 3377  switch(op_code)
3377    
3378    
3379  /*************************************************  /*************************************************
3380    *        Add a character or range to a class     *
3381    *************************************************/
3382    
3383    /* This function packages up the logic of adding a character or range of
3384    characters to a class. The character values in the arguments will be within the
3385    valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
3386    mutually recursive with the function immediately below.
3387    
3388    Arguments:
3389      classbits     the bit map for characters < 256
3390      uchardptr     points to the pointer for extra data
3391      options       the options word
3392      cd            contains pointers to tables etc.
3393      start         start of range character
3394      end           end of range character
3395    
3396    Returns:        the number of < 256 characters added
3397                    the pointer to extra data is updated
3398    */
3399    
3400    static int
3401    add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
3402      compile_data *cd, unsigned int start, unsigned int end)
3403    {
3404    unsigned int c;
3405    int n8 = 0;
3406    
3407    /* If caseless matching is required, scan the range and process alternate
3408    cases. In Unicode, there are 8-bit characters that have alternate cases that
3409    are greater than 255 and vice-versa. Sometimes we can just extend the original
3410    range. */
3411    
3412    if ((options & PCRE_CASELESS) != 0)
3413      {
3414    #ifdef SUPPORT_UCP
3415      if ((options & PCRE_UTF8) != 0)
3416        {
3417        int rc;
3418        unsigned int oc, od;
3419    
3420        options &= ~PCRE_CASELESS;   /* Remove for recursive calls */
3421        c = start;
3422    
3423        while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
3424          {
3425          /* Handle a single character that has more than one other case. */
3426    
3427          if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
3428            PRIV(ucd_caseless_sets) + rc, oc);
3429    
3430          /* Do nothing if the other case range is within the original range. */
3431    
3432          else if (oc >= start && od <= end) continue;
3433    
3434          /* Extend the original range if there is overlap, noting that if oc < c, we
3435          can't have od > end because a subrange is always shorter than the basic
3436          range. Otherwise, use a recursive call to add the additional range. */
3437    
3438          else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
3439          else if (od > end && oc <= end + 1) end = od;       /* Extend upwards */
3440          else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
3441          }
3442        }
3443      else
3444    #endif  /* SUPPORT_UCP */
3445    
3446      /* Not UTF-mode, or no UCP */
3447    
3448      for (c = start; c <= end && c < 256; c++)
3449        {
3450        SETBIT(classbits, cd->fcc[c]);
3451        n8++;
3452        }
3453      }
3454    
3455    /* Now handle the original range. Adjust the final value according to the bit
3456    length - this means that the same lists of (e.g.) horizontal spaces can be used
3457    in all cases. */
3458    
3459    #ifdef COMPILE_PCRE8
3460    #ifdef SUPPORT_UTF
3461      if ((options & PCRE_UTF8) == 0)
3462    #endif
3463      if (end > 0xff) end = 0xff;
3464    #endif
3465    
3466    #ifdef COMPILE_PCRE16
3467    #ifdef SUPPORT_UTF
3468      if ((options & PCRE_UTF16) == 0)
3469    #endif
3470      if (end > 0xffff) end = 0xffff;
3471    #endif
3472    
3473    /* If all characters are less than 256, use the bit map. Otherwise use extra
3474    data. */
3475    
3476    if (end < 0x100)
3477      {
3478      for (c = start; c <= end; c++)
3479        {
3480        n8++;
3481        SETBIT(classbits, c);
3482        }
3483      }
3484    
3485    else
3486      {
3487      pcre_uchar *uchardata = *uchardptr;
3488    
3489    #ifdef SUPPORT_UTF
3490      if ((options & PCRE_UTF8) != 0)  /* All UTFs use the same flag bit */
3491        {
3492        if (start < end)
3493          {
3494          *uchardata++ = XCL_RANGE;
3495          uchardata += PRIV(ord2utf)(start, uchardata);
3496          uchardata += PRIV(ord2utf)(end, uchardata);
3497          }
3498        else if (start == end)
3499          {
3500          *uchardata++ = XCL_SINGLE;
3501          uchardata += PRIV(ord2utf)(start, uchardata);
3502          }
3503        }
3504      else
3505    #endif  /* SUPPORT_UTF */
3506    
3507      /* Without UTF support, character values are constrained by the bit length,
3508      and can only be > 256 for 16-bit and 32-bit libraries. */
3509    
3510    #ifdef COMPILE_PCRE8
3511        {}
3512    #else
3513      if (start < end)
3514        {
3515        *uchardata++ = XCL_RANGE;
3516        *uchardata++ = start;
3517        *uchardata++ = end;
3518        }
3519      else if (start == end)
3520        {
3521        *uchardata++ = XCL_SINGLE;
3522        *uchardata++ = start;
3523        }
3524    #endif
3525    
3526      *uchardptr = uchardata;   /* Update extra data pointer */
3527      }
3528    
3529    return n8;    /* Number of 8-bit characters */
3530    }
3531    
3532    
3533    
3534    
3535    /*************************************************
3536    *        Add a list of characters to a class     *
3537    *************************************************/
3538    
3539    /* This function is used for adding a list of case-equivalent characters to a
3540    class, and also for adding a list of horizontal or vertical whitespace. If the
3541    list is in order (which it should be), ranges of characters are detected and
3542    handled appropriately. This function is mutually recursive with the function
3543    above.
3544    
3545    Arguments:
3546      classbits     the bit map for characters < 256
3547      uchardptr     points to the pointer for extra data
3548      options       the options word
3549      cd            contains pointers to tables etc.
3550      p             points to row of 32-bit values, terminated by NOTACHAR
3551      except        character to omit; this is used when adding lists of
3552                      case-equivalent characters to avoid including the one we
3553                      already know about
3554    
3555    Returns:        the number of < 256 characters added
3556                    the pointer to extra data is updated
3557    */
3558    
3559    static int
3560    add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
3561      compile_data *cd, const pcre_uint32 *p, unsigned int except)
3562    {
3563    int n8 = 0;
3564    while (p[0] < NOTACHAR)
3565      {
3566      int n = 0;
3567      if (p[0] != except)
3568        {
3569        while(p[n+1] == p[0] + n + 1) n++;
3570        n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
3571        }
3572      p += n + 1;
3573      }
3574    return n8;
3575    }
3576    
3577    
3578    
3579    /*************************************************
3580    *    Add characters not in a list to a class     *
3581    *************************************************/
3582    
3583    /* This function is used for adding the complement of a list of horizontal or
3584    vertical whitespace to a class. The list must be in order.
3585    
3586    Arguments:
3587      classbits     the bit map for characters < 256
3588      uchardptr     points to the pointer for extra data
3589      options       the options word
3590      cd            contains pointers to tables etc.
3591      p             points to row of 32-bit values, terminated by NOTACHAR
3592    
3593    Returns:        the number of < 256 characters added
3594                    the pointer to extra data is updated
3595    */
3596    
3597    static int
3598    add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
3599      int options, compile_data *cd, const pcre_uint32 *p)
3600    {
3601    int n8 = 0;
3602    if (p[0] > 0)
3603      n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
3604    while (p[0] < NOTACHAR)
3605      {
3606      while (p[1] == p[0] + 1) p++;
3607      n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
3608        (p[1] == NOTACHAR)? 0x10ffff : p[1] - 1);
3609      p++;
3610      }
3611    return n8;
3612    }
3613    
3614    
3615    
3616    /*************************************************
3617  *           Compile one branch                   *  *           Compile one branch                   *
3618  *************************************************/  *************************************************/
3619    
# Line 3518  for (;; ptr++) Line 3731  for (;; ptr++)
3731    BOOL is_recurse;    BOOL is_recurse;
3732    BOOL reset_bracount;    BOOL reset_bracount;
3733    int class_has_8bitchar;    int class_has_8bitchar;
3734    int class_single_char;    int class_one_char;
3735    int newoptions;    int newoptions;
3736    int recno;    int recno;
3737    int refsign;    int refsign;
# Line 3816  for (;; ptr++) Line 4029  for (;; ptr++)
4029    
4030      should_flip_negation = FALSE;      should_flip_negation = FALSE;
4031    
4032      /* For optimization purposes, we track some properties of the class.      /* For optimization purposes, we track some properties of the class:
4033      class_has_8bitchar will be non-zero, if the class contains at least one      class_has_8bitchar will be non-zero if the class contains at least one <
4034      < 256 character. class_single_char will be 1 if the class contains only      256 character; class_one_char will be 1 if the class contains just one
4035      a single character. */      character. */
4036    
4037      class_has_8bitchar = 0;      class_has_8bitchar = 0;
4038      class_single_char = 0;      class_one_char = 0;
4039    
4040      /* Initialize the 32-char bit map to all zeros. We build the map in a      /* Initialize the 32-char bit map to all zeros. We build the map in a
4041      temporary bit of memory, in case the class contains only 1 character (less      temporary bit of memory, in case the class contains fewer than two
4042      than 256), because in that case the compiled code doesn't use the bit map.      8-bit characters because in that case the compiled code doesn't use the bit
4043      */      map. */
4044    
4045      memset(classbits, 0, 32 * sizeof(pcre_uint8));      memset(classbits, 0, 32 * sizeof(pcre_uint8));
4046    
4047  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4048      xclass = FALSE;                           /* No chars >= 256 */      xclass = FALSE;
4049      class_uchardata = code + LINK_SIZE + 2;   /* For UTF-8 items */      class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
4050      class_uchardata_base = class_uchardata;   /* For resetting in pass 1 */      class_uchardata_base = class_uchardata;   /* Save the start */
4051  #endif  #endif
4052    
4053      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
# Line 3856  for (;; ptr++) Line 4069  for (;; ptr++)
4069        /* In the pre-compile phase, accumulate the length of any extra        /* In the pre-compile phase, accumulate the length of any extra
4070        data and reset the pointer. This is so that very large classes that        data and reset the pointer. This is so that very large classes that
4071        contain a zillion > 255 characters no longer overwrite the work space        contain a zillion > 255 characters no longer overwrite the work space
4072        (which is on the stack). */        (which is on the stack). We have to remember that there was XCLASS data,
4073          however. */
4074    
4075        if (lengthptr != NULL)        if (lengthptr != NULL && class_uchardata > class_uchardata_base)
4076          {          {
4077            xclass = TRUE;
4078          *lengthptr += class_uchardata - class_uchardata_base;          *lengthptr += class_uchardata - class_uchardata_base;
4079          class_uchardata = class_uchardata_base;          class_uchardata = class_uchardata_base;
4080          }          }
# Line 3961  for (;; ptr++) Line 4176  for (;; ptr++)
4176              for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];              for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
4177            }            }
4178    
4179          /* Not see if we need to remove any special characters. An option          /* Now see if we need to remove any special characters. An option
4180          value of 1 removes vertical space and 2 removes underscore. */          value of 1 removes vertical space and 2 removes underscore. */
4181    
4182          if (tabopt < 0) tabopt = -tabopt;          if (tabopt < 0) tabopt = -tabopt;
# Line 3977  for (;; ptr++) Line 4192  for (;; ptr++)
4192            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
4193    
4194          ptr = tempptr + 1;          ptr = tempptr + 1;
4195          /* Every class contains at least one < 256 characters. */          /* Every class contains at least one < 256 character. */
4196          class_has_8bitchar = 1;          class_has_8bitchar = 1;
4197          /* Every class contains at least two characters. */          /* Every class contains at least two characters. */
4198          class_single_char = 2;          class_one_char = 2;
4199          continue;    /* End of POSIX syntax handling */          continue;    /* End of POSIX syntax handling */
4200          }          }
4201    
# Line 3988  for (;; ptr++) Line 4203  for (;; ptr++)
4203        of the specials, which just set a flag. The sequence \b is a special        of the specials, which just set a flag. The sequence \b is a special
4204        case. Inside a class (and only there) it is treated as backspace. We        case. Inside a class (and only there) it is treated as backspace. We
4205        assume that other escapes have more than one character in them, so        assume that other escapes have more than one character in them, so
4206        speculatively set both class_has_8bitchar and class_single_char bigger        speculatively set both class_has_8bitchar and class_one_char bigger
4207        than one. Unrecognized escapes fall through and are either treated        than one. Unrecognized escapes fall through and are either treated
4208        as literal characters (by default), or are faulted if        as literal characters (by default), or are faulted if
4209        PCRE_EXTRA is set. */        PCRE_EXTRA is set. */
# Line 4021  for (;; ptr++) Line 4236  for (;; ptr++)
4236            /* Every class contains at least two < 256 characters. */            /* Every class contains at least two < 256 characters. */
4237            class_has_8bitchar++;            class_has_8bitchar++;
4238            /* Every class contains at least two characters. */            /* Every class contains at least two characters. */
4239            class_single_char += 2;            class_one_char += 2;
4240    
4241            switch (-c)            switch (-c)
4242              {              {
# Line 4057  for (;; ptr++) Line 4272  for (;; ptr++)
4272    
4273              /* Perl 5.004 onwards omits VT from \s, but we must preserve it              /* Perl 5.004 onwards omits VT from \s, but we must preserve it
4274              if it was previously set by something earlier in the character              if it was previously set by something earlier in the character
4275              class. */              class. Luckily, the value of CHAR_VT is 0x0b in both ASCII and
4276                EBCDIC, so we lazily just adjust the appropriate bit. */
4277    
4278              case ESC_s:              case ESC_s:
4279              classbits[0] |= cbits[cbit_space];              classbits[0] |= cbits[cbit_space];
# Line 4070  for (;; ptr++) Line 4286  for (;; ptr++)
4286              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
4287              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
4288              continue;              continue;
4289    
4290                /* The rest apply in both UCP and non-UCP cases. */
4291    
4292              case ESC_h:              case ESC_h:
4293              SETBIT(classbits, 0x09); /* VT */              (void)add_list_to_class(classbits, &class_uchardata, options, cd,
4294              SETBIT(classbits, 0x20); /* SPACE */                PRIV(hspace_list), NOTACHAR);
             SETBIT(classbits, 0xa0); /* NSBP */  
 #ifndef COMPILE_PCRE8  
             xclass = TRUE;  
             *class_uchardata++ = XCL_SINGLE;  
             *class_uchardata++ = 0x1680;  
             *class_uchardata++ = XCL_SINGLE;  
             *class_uchardata++ = 0x180e;  
             *class_uchardata++ = XCL_RANGE;  
             *class_uchardata++ = 0x2000;  
             *class_uchardata++ = 0x200a;  
             *class_uchardata++ = XCL_SINGLE;  
             *class_uchardata++ = 0x202f;  
             *class_uchardata++ = XCL_SINGLE;  
             *class_uchardata++ = 0x205f;  
             *class_uchardata++ = XCL_SINGLE;  
             *class_uchardata++ = 0x3000;  
 #elif defined SUPPORT_UTF  
             if (utf)  
               {  
               xclass = TRUE;  
               *class_uchardata++ = XCL_SINGLE;  
               class_uchardata += PRIV(ord2utf)(0x1680, class_uchardata);  
               *class_uchardata++ = XCL_SINGLE;  
               class_uchardata += PRIV(ord2utf)(0x180e, class_uchardata);  
               *class_uchardata++ = XCL_RANGE;  
               class_uchardata += PRIV(ord2utf)(0x2000, class_uchardata);  
               class_uchardata += PRIV(ord2utf)(0x200a, class_uchardata);  
               *class_uchardata++ = XCL_SINGLE;  
               class_uchardata += PRIV(ord2utf)(0x202f, class_uchardata);  
               *class_uchardata++ = XCL_SINGLE;  
               class_uchardata += PRIV(ord2utf)(0x205f, class_uchardata);  
               *class_uchardata++ = XCL_SINGLE;  
               class_uchardata += PRIV(ord2utf)(0x3000, class_uchardata);  
               }  
 #endif  
4295              continue;              continue;
4296    
4297              case ESC_H:              case ESC_H:
4298              for (c = 0; c < 32; c++)              (void)add_not_list_to_class(classbits, &class_uchardata, options,
4299                {                cd, PRIV(hspace_list));
               int x = 0xff;  
               switch (c)  
                 {  
                 case 0x09/8: x ^= 1 << (0x09%8); break;  
                 case 0x20/8: x ^= 1 << (0x20%8); break;  
                 case 0xa0/8: x ^= 1 << (0xa0%8); break;  
                 default: break;  
                 }  
               classbits[c] |= x;  
               }  
 #ifndef COMPILE_PCRE8  
             xclass = TRUE;  
             *class_uchardata++ = XCL_RANGE;  
             *class_uchardata++ = 0x0100;  
             *class_uchardata++ = 0x167f;  
             *class_uchardata++ = XCL_RANGE;  
             *class_uchardata++ = 0x1681;  
             *class_uchardata++ = 0x180d;  
             *class_uchardata++ = XCL_RANGE;  
             *class_uchardata++ = 0x180f;  
             *class_uchardata++ = 0x1fff;  
             *class_uchardata++ = XCL_RANGE;  
             *class_uchardata++ = 0x200b;  
             *class_uchardata++ = 0x202e;  
             *class_uchardata++ = XCL_RANGE;  
             *class_uchardata++ = 0x2030;  
             *class_uchardata++ = 0x205e;  
             *class_uchardata++ = XCL_RANGE;  
             *class_uchardata++ = 0x2060;  
             *class_uchardata++ = 0x2fff;  
             *class_uchardata++ = XCL_RANGE;  
             *class_uchardata++ = 0x3001;  
 #ifdef SUPPORT_UTF  
             if (utf)  
               class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);  
             else  
 #endif  
               *class_uchardata++ = 0xffff;  
 #elif defined SUPPORT_UTF  
             if (utf)  
               {  
               xclass = TRUE;  
               *class_uchardata++ = XCL_RANGE;  
               class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);  
               class_uchardata += PRIV(ord2utf)(0x167f, class_uchardata);  
               *class_uchardata++ = XCL_RANGE;  
               class_uchardata += PRIV(ord2utf)(0x1681, class_uchardata);  
               class_uchardata += PRIV(ord2utf)(0x180d, class_uchardata);  
               *class_uchardata++ = XCL_RANGE;  
               class_uchardata += PRIV(ord2utf)(0x180f, class_uchardata);  
               class_uchardata += PRIV(ord2utf)(0x1fff, class_uchardata);  
               *class_uchardata++ = XCL_RANGE;  
               class_uchardata += PRIV(ord2utf)(0x200b, class_uchardata);  
               class_uchardata += PRIV(ord2utf)(0x202e, class_uchardata);  
               *class_uchardata++ = XCL_RANGE;  
               class_uchardata += PRIV(ord2utf)(0x2030, class_uchardata);  
               class_uchardata += PRIV(ord2utf)(0x205e, class_uchardata);  
               *class_uchardata++ = XCL_RANGE;  
               class_uchardata += PRIV(ord2utf)(0x2060, class_uchardata);  
               class_uchardata += PRIV(ord2utf)(0x2fff, class_uchardata);  
               *class_uchardata++ = XCL_RANGE;  
               class_uchardata += PRIV(ord2utf)(0x3001, class_uchardata);  
               class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);  
               }  
 #endif  
4300              continue;              continue;
4301    
4302              case ESC_v:              case ESC_v:
4303              SETBIT(classbits, 0x0a); /* LF */              (void)add_list_to_class(classbits, &class_uchardata, options, cd,
4304              SETBIT(classbits, 0x0b); /* VT */                PRIV(vspace_list), NOTACHAR);
             SETBIT(classbits, 0x0c); /* FF */  
             SETBIT(classbits, 0x0d); /* CR */  
             SETBIT(classbits, 0x85); /* NEL */  
 #ifndef COMPILE_PCRE8  
             xclass = TRUE;  
             *class_uchardata++ = XCL_RANGE;  
             *class_uchardata++ = 0x2028;  
             *class_uchardata++ = 0x2029;  
 #elif defined SUPPORT_UTF  
             if (utf)  
               {  
               xclass = TRUE;  
               *class_uchardata++ = XCL_RANGE;  
               class_uchardata += PRIV(ord2utf)(0x2028, class_uchardata);  
               class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata);  
               }  
 #endif  
4305              continue;              continue;
4306    
4307              case ESC_V:              case ESC_V:
4308              for (c = 0; c < 32; c++)              (void)add_not_list_to_class(classbits, &class_uchardata, options,
4309                {                cd, PRIV(vspace_list));
               int x = 0xff;  
               switch (c)  
                 {  
                 case 0x0a/8: x ^= 1 << (0x0a%8);  
                              x ^= 1 << (0x0b%8);  
                              x ^= 1 << (0x0c%8);  
                              x ^= 1 << (0x0d%8);  
                              break;  
                 case 0x85/8: x ^= 1 << (0x85%8); break;  
                 default: break;  
                 }  
               classbits[c] |= x;  
               }  
   
 #ifndef COMPILE_PCRE8  
             xclass = TRUE;  
             *class_uchardata++ = XCL_RANGE;  
             *class_uchardata++ = 0x0100;  
             *class_uchardata++ = 0x2027;  
             *class_uchardata++ = XCL_RANGE;  
             *class_uchardata++ = 0x202a;  
 #ifdef SUPPORT_UTF  
             if (utf)  
               class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);  
             else  
 #endif  
               *class_uchardata++ = 0xffff;  
 #elif defined SUPPORT_UTF  
             if (utf)  
               {  
               xclass = TRUE;  
               *class_uchardata++ = XCL_RANGE;  
               class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);  
               class_uchardata += PRIV(ord2utf)(0x2027, class_uchardata);  
               *class_uchardata++ = XCL_RANGE;  
               class_uchardata += PRIV(ord2utf)(0x202a, class_uchardata);  
               class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);  
               }  
 #endif  
4310              continue;              continue;
4311    
4312  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
# Line 4255  for (;; ptr++) Line 4317  for (;; ptr++)
4317                int pdata;                int pdata;
4318                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4319                if (ptype < 0) goto FAILED;                if (ptype < 0) goto FAILED;
               xclass = TRUE;  
4320                *class_uchardata++ = ((-c == ESC_p) != negated)?                *class_uchardata++ = ((-c == ESC_p) != negated)?
4321                  XCL_PROP : XCL_NOTPROP;                  XCL_PROP : XCL_NOTPROP;
4322                *class_uchardata++ = ptype;                *class_uchardata++ = ptype;
# Line 4275  for (;; ptr++) Line 4336  for (;; ptr++)
4336                goto FAILED;                goto FAILED;
4337                }                }
4338              class_has_8bitchar--;    /* Undo the speculative increase. */              class_has_8bitchar--;    /* Undo the speculative increase. */
4339              class_single_char -= 2;  /* Undo the speculative increase. */              class_one_char -= 2;     /* Undo the speculative increase. */
4340              c = *ptr;                /* Get the final character and fall through */              c = *ptr;                /* Get the final character and fall through */
4341              break;              break;
4342              }              }
4343            }            }
4344    
4345          /* Fall through if we have a single character (c >= 0). This may be          /* Fall through if the escape just defined a single character (c >= 0).
4346          greater than 256. */          This may be greater than 256. */
4347    
4348          }   /* End of backslash handling */          }   /* End of backslash handling */
4349    
4350        /* A single character may be followed by '-' to form a range. However,        /* A character may be followed by '-' to form a range. However, Perl does
4351        Perl does not permit ']' to be the end of the range. A '-' character        not permit ']' to be the end of the range. A '-' character at the end is
4352        at the end is treated as a literal. Perl ignores orphaned \E sequences        treated as a literal. Perl ignores orphaned \E sequences entirely. The
4353        entirely. The code for handling \Q and \E is messy. */        code for handling \Q and \E is messy. */
4354    
4355        CHECK_RANGE:        CHECK_RANGE:
4356        while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)        while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
# Line 4297  for (;; ptr++) Line 4358  for (;; ptr++)
4358          inescq = FALSE;          inescq = FALSE;
4359          ptr += 2;          ptr += 2;
4360          }          }
   
4361        oldptr = ptr;        oldptr = ptr;
4362    
4363        /* Remember \r or \n */        /* Remember if \r or \n were explicitly used */
4364    
4365        if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;        if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4366    
# Line 4323  for (;; ptr++) Line 4383  for (;; ptr++)
4383            inescq = TRUE;            inescq = TRUE;
4384            break;            break;
4385            }            }
4386    
4387            /* Minus (hyphen) at the end of a class is treated as a literal, so put
4388            back the pointer and jump to handle the character that preceded it. */
4389    
4390          if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))          if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
4391            {            {
4392            ptr = oldptr;            ptr = oldptr;
4393            goto LONE_SINGLE_CHARACTER;            goto CLASS_SINGLE_CHARACTER;
4394            }            }
4395    
4396            /* Otherwise, we have a potential range; pick up the next character */
4397    
4398  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
4399          if (utf)          if (utf)
# Line 4348  for (;; ptr++) Line 4413  for (;; ptr++)
4413            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
4414            if (*errorcodeptr != 0) goto FAILED;            if (*errorcodeptr != 0) goto FAILED;
4415    
4416            /* \b is backspace; any other special means the '-' was literal */            /* \b is backspace; any other special means the '-' was literal. */
4417    
4418            if (d < 0)            if (d < 0)
4419              {              {
4420              if (d == -ESC_b) d = CHAR_BS; else              if (d == -ESC_b) d = CHAR_BS; else
4421                {                {
4422                ptr = oldptr;                ptr = oldptr;
4423                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto CLASS_SINGLE_CHARACTER;  /* A few lines below */
4424                }                }
4425              }              }
4426            }            }
4427    
4428          /* Check that the two values are in the correct order. Optimize          /* Check that the two values are in the correct order. Optimize
4429          one-character ranges */          one-character ranges. */
4430    
4431          if (d < c)          if (d < c)
4432            {            {
4433            *errorcodeptr = ERR8;            *errorcodeptr = ERR8;
4434            goto FAILED;            goto FAILED;
4435            }            }
4436            if (d == c) goto CLASS_SINGLE_CHARACTER;  /* A few lines below */
4437    
4438          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          /* We have found a character range, so single character optimizations
4439            cannot be done anymore. Any value greater than 1 indicates that there
4440            is more than one character. */
4441    
4442            class_one_char = 2;
4443    
4444          /* Remember \r or \n */          /* Remember an explicit \r or \n, and add the range to the class. */
4445    
4446          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4447    
4448          /* Since we found a character range, single character optimizations          class_has_8bitchar +=
4449          cannot be done anymore. */            add_to_class(classbits, &class_uchardata, options, cd, c, d);
4450          class_single_char = 2;  
   
         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless  
         matching, we have to use an XCLASS with extra data items. Caseless  
         matching for characters > 127 is available only if UCP support is  
         available. */  
   
 #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)  
         if ((d > 255) || (utf && ((options & PCRE_CASELESS) != 0 && d > 127)))  
 #elif defined  SUPPORT_UTF  
         if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))  
 #elif !(defined COMPILE_PCRE8)  
         if (d > 255)  
 #endif  
 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)  
           {  
           xclass = TRUE;  
   
           /* With UCP support, we can find the other case equivalents of  
           the relevant characters. There may be several ranges. Optimize how  
           they fit with the basic range. */  
   
 #ifdef SUPPORT_UCP  
 #ifndef COMPILE_PCRE8  
           if (utf && (options & PCRE_CASELESS) != 0)  
 #else  
           if ((options & PCRE_CASELESS) != 0)  
 #endif  
             {  
             unsigned int occ, ocd;  
             unsigned int cc = c;  
             unsigned int origd = d;  
             while (get_othercase_range(&cc, origd, &occ, &ocd))  
               {  
               if (occ >= (unsigned int)c &&  
                   ocd <= (unsigned int)d)  
                 continue;                          /* Skip embedded ranges */  
   
               if (occ < (unsigned int)c  &&  
                   ocd >= (unsigned int)c - 1)      /* Extend the basic range */  
                 {                                  /* if there is overlap,   */  
                 c = occ;                           /* noting that if occ < c */  
                 continue;                          /* we can't have ocd > d  */  
                 }                                  /* because a subrange is  */  
               if (ocd > (unsigned int)d &&  
                   occ <= (unsigned int)d + 1)      /* always shorter than    */  
                 {                                  /* the basic range.       */  
                 d = ocd;  
                 continue;  
                 }  
   
               if (occ == ocd)  
                 {  
                 *class_uchardata++ = XCL_SINGLE;  
                 }  
               else  
                 {  
                 *class_uchardata++ = XCL_RANGE;  
                 class_uchardata += PRIV(ord2utf)(occ, class_uchardata);  
                 }  
               class_uchardata += PRIV(ord2utf)(ocd, class_uchardata);  
               }  
             }  
 #endif  /* SUPPORT_UCP */  
   
           /* Now record the original range, possibly modified for UCP caseless  
           overlapping ranges. */  
   
           *class_uchardata++ = XCL_RANGE;  
 #ifdef SUPPORT_UTF  
 #ifndef COMPILE_PCRE8  
           if (utf)  
             {  
             class_uchardata += PRIV(ord2utf)(c, class_uchardata);  
             class_uchardata += PRIV(ord2utf)(d, class_uchardata);  
             }  
           else  
             {  
             *class_uchardata++ = c;  
             *class_uchardata++ = d;  
             }  
 #else  
           class_uchardata += PRIV(ord2utf)(c, class_uchardata);  
           class_uchardata += PRIV(ord2utf)(d, class_uchardata);  
 #endif  
 #else /* SUPPORT_UTF */  
           *class_uchardata++ = c;  
           *class_uchardata++ = d;  
 #endif /* SUPPORT_UTF */  
   
           /* With UCP support, we are done. Without UCP support, there is no  
           caseless matching for UTF characters > 127; we can use the bit map  
           for the smaller ones. As for 16 bit characters without UTF, we  
           can still use  */  
   
 #ifdef SUPPORT_UCP  
 #ifndef COMPILE_PCRE8  
           if (utf)  
 #endif  
             continue;    /* With next character in the class */  
 #endif  /* SUPPORT_UCP */  
   
 #if defined SUPPORT_UTF && !defined(SUPPORT_UCP) && !(defined COMPILE_PCRE8)  
           if (utf)  
             {  
             if ((options & PCRE_CASELESS) == 0 || c > 127) continue;  
             /* Adjust upper limit and fall through to set up the map */  
             d = 127;  
             }  
           else  
             {  
             if (c > 255) continue;  
             /* Adjust upper limit and fall through to set up the map */  
             d = 255;  
             }  
 #elif defined SUPPORT_UTF && !defined(SUPPORT_UCP)  
           if ((options & PCRE_CASELESS) == 0 || c > 127) continue;  
           /* Adjust upper limit and fall through to set up the map */  
           d = 127;  
 #else  
           if (c > 255) continue;  
           /* Adjust upper limit and fall through to set up the map */  
           d = 255;  
 #endif  /* SUPPORT_UTF && !SUPPORT_UCP && !COMPILE_PCRE8 */  
           }  
 #endif  /* SUPPORT_UTF || !COMPILE_PCRE8 */  
   
         /* We use the bit map for 8 bit mode, or when the characters fall  
         partially or entirely to [0-255] ([0-127] for UCP) ranges. */  
   
         class_has_8bitchar = 1;  
   
         /* We can save a bit of time by skipping this in the pre-compile. */  
   
         if (lengthptr == NULL) for (; c <= d; c++)  
           {  
           classbits[c/8] |= (1 << (c&7));  
           if ((options & PCRE_CASELESS) != 0)  
             {  
             int uc = cd->fcc[c]; /* flip case */  
             classbits[uc/8] |= (1 << (uc&7));  
             }  
           }  
   
4451          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
4452          }          }
4453    
4454        /* Handle a lone single character - we can get here for a normal        /* Handle a single character - we can get here for a normal non-escape
4455        non-escape char, or after \ that introduces a single character or for an        char, or after \ that introduces a single character or for an apparent
4456        apparent range that isn't. */        range that isn't. Only the value 1 matters for class_one_char, so don't
4457          increase it if it is already 2 or more ... just in case there's a class
4458        LONE_SINGLE_CHARACTER:        with a zillion characters in it. */
4459    
4460        /* Only the value of 1 matters for class_single_char. */        CLASS_SINGLE_CHARACTER:
4461          if (class_one_char < 2) class_one_char++;
4462        if (class_single_char < 2) class_single_char++;  
4463          /* If class_one_char is 1, we have the first single character in the
4464          class, and there have been no prior ranges, or XCLASS items generated by
4465          escapes. If this is the final character in the class, we can optimize by
4466          turning the item into a 1-character OP_CHAR[I] if it's positive, or
4467          OP_NOT[I] if it's negative. In the positive case, it can cause firstchar
4468          to be set. Otherwise, there can be no first char if this item is first,
4469          whatever repeat count may follow. In the case of reqchar, save the
4470          previous value for reinstating. */
4471    
4472        /* If class_charcount is 1, we saw precisely one character. As long as        if (class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
       there was no use of \p or \P, in other words, no use of any XCLASS  
       features, we can optimize.  
   
       The optimization throws away the bit map. We turn the item into a  
       1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.  
       In the positive case, it can cause firstchar to be set. Otherwise, there  
       can be no first char if this item is first, whatever repeat count may  
       follow. In the case of reqchar, save the previous value for reinstating. */  
   
       if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)  
4473          {          {
4474          ptr++;          ptr++;
4475          zeroreqchar = reqchar;          zeroreqchar = reqchar;
# Line 4577  for (;; ptr++) Line 4502  for (;; ptr++)
4502            }            }
4503          goto ONE_CHAR;          goto ONE_CHAR;
4504          }       /* End of 1-char optimization */          }       /* End of 1-char optimization */
4505    
4506        /* Handle a character that cannot go in the bit map. */        /* There is more than one character in the class, or an XCLASS item
4507          has been generated. Add this character to the class. */
4508  #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)  
4509        if ((c > 255) || (utf && ((options & PCRE_CASELESS) != 0 && c > 127)))        class_has_8bitchar +=
4510  #elif defined SUPPORT_UTF          add_to_class(classbits, &class_uchardata, options, cd, c, c);
       if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))  
 #elif !(defined COMPILE_PCRE8)  
       if (c > 255)  
 #endif  
   
 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)  
         {  
         xclass = TRUE;  
         *class_uchardata++ = XCL_SINGLE;  
 #ifdef SUPPORT_UTF  
 #ifndef COMPILE_PCRE8  
         /* In non 8 bit mode, we can get here even if we are not in UTF mode. */  
         if (!utf)  
           *class_uchardata++ = c;  
         else  
 #endif  
           class_uchardata += PRIV(ord2utf)(c, class_uchardata);  
 #else /* SUPPORT_UTF */  
         *class_uchardata++ = c;  
 #endif /* SUPPORT_UTF */  
   
 #ifdef SUPPORT_UCP  
 #ifdef COMPILE_PCRE8  
         if ((options & PCRE_CASELESS) != 0)  
 #else  
         /* In non 8 bit mode, we can get here even if we are not in UTF mode. */  
         if (utf && (options & PCRE_CASELESS) != 0)  
 #endif  
           {  
           unsigned int othercase;  
           if ((int)(othercase = UCD_OTHERCASE(c)) != c)  
             {  
             *class_uchardata++ = XCL_SINGLE;  
             class_uchardata += PRIV(ord2utf)(othercase, class_uchardata);  
             }  
           }  
 #endif  /* SUPPORT_UCP */  
   
         }  
       else  
 #endif  /* SUPPORT_UTF || COMPILE_PCRE16 */  
   
       /* Handle a single-byte character */  
         {  
         class_has_8bitchar = 1;  
         classbits[c/8] |= (1 << (c&7));  
         if ((options & PCRE_CASELESS) != 0)  
           {  
           c = cd->fcc[c]; /* flip case */  
           classbits[c/8] |= (1 << (c&7));  
           }  
         }  
4511        }        }
4512    
4513      /* Loop until ']' reached. This "while" is the end of the "do" far above.      /* Loop until ']' reached. This "while" is the end of the "do" far above.
# Line 4654  for (;; ptr++) Line 4527  for (;; ptr++)
4527        goto FAILED;        goto FAILED;
4528        }        }
4529    
4530        /* We will need an XCLASS if data has been placed in class_uchardata. In
4531        the second phase this is a sufficient test. However, in the pre-compile
4532        phase, class_uchardata gets emptied to prevent workspace overflow, so
4533        only if the very last character in the class needs XCLASS will it contain
4534        anything at this point. For this reason, xclass gets set TRUE above when
4535        class_uchardata is emptied, and that's why this code is the way it is here
4536        instead of just doing a test on class_uchardata below. */
4537    
4538    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4539        if (class_uchardata > class_uchardata_base) xclass = TRUE;
4540    #endif
4541    
4542      /* If this is the first thing in the branch, there can be no first char      /* If this is the first thing in the branch, there can be no first char
4543      setting, whatever the repeat count. Any reqchar setting must remain      setting, whatever the repeat count. Any reqchar setting must remain
4544      unchanged after any kind of repeat. */      unchanged after any kind of repeat. */
# Line 5636  for (;; ptr++) Line 5521  for (;; ptr++)
5521          if (namelen == verbs[i].len &&          if (namelen == verbs[i].len &&
5522              STRNCMP_UC_C8(name, vn, namelen) == 0)              STRNCMP_UC_C8(name, vn, namelen) == 0)
5523            {            {
5524              int setverb;
5525    
5526            /* Check for open captures before ACCEPT and convert it to            /* Check for open captures before ACCEPT and convert it to
5527            ASSERT_ACCEPT if in an assertion. */            ASSERT_ACCEPT if in an assertion. */
5528    
# Line 5653  for (;; ptr++) Line 5540  for (;; ptr++)
5540                *code++ = OP_CLOSE;                *code++ = OP_CLOSE;
5541                PUT2INC(code, 0, oc->number);                PUT2INC(code, 0, oc->number);
5542                }                }
5543              *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;              setverb = *code++ =
5544                  (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
5545    
5546              /* Do not set firstchar after *ACCEPT */              /* Do not set firstchar after *ACCEPT */
5547              if (firstchar == REQ_UNSET) firstchar = REQ_NONE;              if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
# Line 5668  for (;; ptr++) Line 5556  for (;; ptr++)
5556                *errorcodeptr = ERR66;                *errorcodeptr = ERR66;
5557                goto FAILED;                goto FAILED;
5558                }                }
5559              *code = verbs[i].op;              setverb = *code++ = verbs[i].op;
             if (*code++ == OP_THEN) cd->external_flags |= PCRE_HASTHEN;  
5560              }              }
5561    
5562            else            else
# Line 5679  for (;; ptr++) Line 5566  for (;; ptr++)
5566                *errorcodeptr = ERR59;                *errorcodeptr = ERR59;
5567                goto FAILED;                goto FAILED;
5568                }                }
5569              *code = verbs[i].op_arg;              setverb = *code++ = verbs[i].op_arg;
             if (*code++ == OP_THEN_ARG) cd->external_flags |= PCRE_HASTHEN;  
5570              *code++ = arglen;              *code++ = arglen;
5571              memcpy(code, arg, IN_UCHARS(arglen));              memcpy(code, arg, IN_UCHARS(arglen));
5572              code += arglen;              code += arglen;
5573              *code++ = 0;              *code++ = 0;
5574              }              }
5575    
5576              switch (setverb)
5577                {
5578                case OP_THEN:
5579                case OP_THEN_ARG:
5580                cd->external_flags |= PCRE_HASTHEN;
5581                break;
5582    
5583                case OP_PRUNE:
5584                case OP_PRUNE_ARG:
5585                case OP_SKIP:
5586                case OP_SKIP_ARG:
5587                cd->had_pruneorskip = TRUE;
5588                break;
5589                }
5590    
5591            break;  /* Found verb, exit loop */            break;  /* Found verb, exit loop */
5592            }            }
5593    
# Line 7323  and the highest back reference was great Line 7224  and the highest back reference was great
7224  However, by keeping a bitmap of the first 31 back references, we can catch some  However, by keeping a bitmap of the first 31 back references, we can catch some
7225  of the more common cases more precisely.  of the more common cases more precisely.
7226    
7227    ... A second exception is when the .* appears inside an atomic group, because
7228    this prevents the number of characters it matches from being adjusted.
7229    
7230  Arguments:  Arguments:
7231    code           points to start of expression (the bracket)    code           points to start of expression (the bracket)
7232    bracket_map    a bitmap of which brackets we are inside while testing; this    bracket_map    a bitmap of which brackets we are inside while testing; this
7233                    handles up to substring 31; after that we just have to take                    handles up to substring 31; after that we just have to take
7234                    the less precise approach                    the less precise approach
7235    backref_map    the back reference bitmap    cd             points to the compile data block
7236      atomcount      atomic group level
7237    
7238  Returns:     TRUE or FALSE  Returns:     TRUE or FALSE
7239  */  */
7240    
7241  static BOOL  static BOOL
7242  is_anchored(register const pcre_uchar *code, unsigned int bracket_map,  is_anchored(register const pcre_uchar *code, unsigned int bracket_map,
7243    unsigned int backref_map)    compile_data *cd, int atomcount)
7244  {  {
7245  do {  do {
7246     const pcre_uchar *scode = first_significant_code(     const pcre_uchar *scode = first_significant_code(
# Line 7347  do { Line 7252  do {
7252     if (op == OP_BRA  || op == OP_BRAPOS ||     if (op == OP_BRA  || op == OP_BRAPOS ||
7253         op == OP_SBRA || op == OP_SBRAPOS)         op == OP_SBRA || op == OP_SBRAPOS)
7254       {       {
7255       if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;       if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
7256       }       }
7257    
7258     /* Capturing brackets */     /* Capturing brackets */
# Line 7357  do { Line 7262  do {
7262       {       {
7263       int n = GET2(scode, 1+LINK_SIZE);       int n = GET2(scode, 1+LINK_SIZE);
7264       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
7265       if (!is_anchored(scode, new_map, backref_map)) return FALSE;       if (!is_anchored(scode, new_map, cd, atomcount)) return FALSE;
7266       }       }
7267    
7268     /* Other brackets */     /* Positive forward assertions and conditions */
7269    
7270     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_ONCE_NC ||     else if (op == OP_ASSERT || op == OP_COND)
             op == OP_COND)  
7271       {       {
7272       if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;       if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
7273         }
7274    
7275       /* Atomic groups */
7276    
7277       else if (op == OP_ONCE || op == OP_ONCE_NC)
7278         {
7279         if (!is_anchored(scode, bracket_map, cd, atomcount + 1))
7280           return FALSE;
7281       }       }
7282    
7283     /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and     /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
7284     it isn't in brackets that are or may be referenced. */     it isn't in brackets that are or may be referenced or inside an atomic
7285       group. */
7286    
7287     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
7288               op == OP_TYPEPOSSTAR))               op == OP_TYPEPOSSTAR))
7289       {       {
7290       if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)       if (scode[1] != OP_ALLANY || (bracket_map & cd->backref_map) != 0 ||
7291             atomcount > 0 || cd->had_pruneorskip)
7292         return FALSE;         return FALSE;
7293       }       }
7294    
7295     /* Check for explicit anchoring */     /* Check for explicit anchoring */
7296    
7297     else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;     else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
7298    
7299     code += GET(code, 1);     code += GET(code, 1);
7300     }     }
7301  while (*code == OP_ALT);   /* Loop for each alternative */  while (*code == OP_ALT);   /* Loop for each alternative */
# Line 7398  return TRUE; Line 7313  return TRUE;
7313  matching and for non-DOTALL patterns that start with .* (which must start at  matching and for non-DOTALL patterns that start with .* (which must start at
7314  the beginning or after \n). As in the case of is_anchored() (see above), we  the beginning or after \n). As in the case of is_anchored() (see above), we
7315  have to take account of back references to capturing brackets that contain .*  have to take account of back references to capturing brackets that contain .*
7316  because in that case we can't make the assumption.  because in that case we can't make the assumption. Also, the appearance of .*
7317    inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
7318    count, because once again the assumption no longer holds.
7319    
7320  Arguments:  Arguments:
7321    code           points to start of expression (the bracket)    code           points to start of expression (the bracket)
7322    bracket_map    a bitmap of which brackets we are inside while testing; this    bracket_map    a bitmap of which brackets we are inside while testing; this
7323                    handles up to substring 31; after that we just have to take                    handles up to substring 31; after that we just have to take
7324                    the less precise approach                    the less precise approach
7325    backref_map    the back reference bitmap    cd             points to the compile data
7326      atomcount      atomic group level
7327    
7328  Returns:         TRUE or FALSE  Returns:         TRUE or FALSE
7329  */  */
7330    
7331  static BOOL  static BOOL
7332  is_startline(const pcre_uchar *code, unsigned int bracket_map,  is_startline(const pcre_uchar *code, unsigned int bracket_map,
7333    unsigned int backref_map)    compile_data *cd, int atomcount)
7334  {  {
7335  do {  do {
7336     const pcre_uchar *scode = first_significant_code(     const pcre_uchar *scode = first_significant_code(
# Line 7438  do { Line 7356  do {
7356         return FALSE;         return FALSE;
7357    
7358         default:     /* Assertion */         default:     /* Assertion */
7359         if (!is_startline(scode, bracket_map, backref_map)) return FALSE;         if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
7360         do scode += GET(scode, 1); while (*scode == OP_ALT);         do scode += GET(scode, 1); while (*scode == OP_ALT);
7361         scode += 1 + LINK_SIZE;         scode += 1 + LINK_SIZE;
7362         break;         break;
# Line 7452  do { Line 7370  do {
7370     if (op == OP_BRA  || op == OP_BRAPOS ||     if (op == OP_BRA  || op == OP_BRAPOS ||
7371         op == OP_SBRA || op == OP_SBRAPOS)         op == OP_SBRA || op == OP_SBRAPOS)
7372       {       {
7373       if (!is_startline(scode, bracket_map, backref_map)) return FALSE;       if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
7374       }       }
7375    
7376     /* Capturing brackets */     /* Capturing brackets */
# Line 7462  do { Line 7380  do {
7380       {       {
7381       int n = GET2(scode, 1+LINK_SIZE);       int n = GET2(scode, 1+LINK_SIZE);
7382       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
7383       if (!is_startline(scode, new_map, backref_map)) return FALSE;       if (!is_startline(scode, new_map, cd, atomcount)) return FALSE;
7384       }       }
7385    
7386     /* Other brackets */     /* Positive forward assertions */
7387    
7388       else if (op == OP_ASSERT)
7389         {
7390         if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
7391         }
7392    
7393       /* Atomic brackets */
7394    
7395     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_ONCE_NC)     else if (op == OP_ONCE || op == OP_ONCE_NC)
7396       {       {
7397       if (!is_startline(scode, bracket_map, backref_map)) return FALSE;       if (!is_startline(scode, bracket_map, cd, atomcount + 1)) return FALSE;
7398       }       }
7399    
7400     /* .* means "start at start or after \n" if it isn't in brackets that     /* .* means "start at start or after \n" if it isn't in atomic brackets or
7401     may be referenced. */     brackets that may be referenced, as long as the pattern does not contain
7402       *PRUNE or *SKIP, because these break the feature. Consider, for example,
7403       /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
7404       start of a line. */
7405    
7406     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
7407       {       {
7408       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;       if (scode[1] != OP_ANY || (bracket_map & cd->backref_map) != 0 ||
7409             atomcount > 0 || cd->had_pruneorskip)
7410           return FALSE;
7411       }       }
7412    
7413     /* Check for explicit circumflex */     /* Check for explicit circumflex; anything else gives a FALSE result. Note
7414       that atomic brackets OP_ONCE and OP_ONCE_NC are handled above by recursing
7415       with atomcount incremented, so that .* inside them gives FALSE, because the
7416       number of characters matched by .* cannot be adjusted inside them. */
7417    
7418     else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;     else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
7419    
# Line 7939  cd->start_code = codestart; Line 7872  cd->start_code = codestart;
7872  cd->hwm = (pcre_uchar *)(cd->start_workspace);  cd->hwm = (pcre_uchar *)(cd->start_workspace);
7873  cd->req_varyopt = 0;  cd->req_varyopt = 0;
7874  cd->had_accept = FALSE;  cd->had_accept = FALSE;
7875    cd->had_pruneorskip = FALSE;
7876  cd->check_lookbehind = FALSE;  cd->check_lookbehind = FALSE;
7877  cd->open_caps = NULL;  cd->open_caps = NULL;
7878    
# Line 8062  if (errorcode != 0) Line 7996  if (errorcode != 0)
7996    }    }
7997    
7998  /* If the anchored option was not passed, set the flag if we can determine that  /* If the anchored option was not passed, set the flag if we can determine that
7999  the pattern is anchored by virtue of ^ characters or \A or anything else (such  the pattern is anchored by virtue of ^ characters or \A or anything else, such
8000  as starting with .* when DOTALL is set).  as starting with non-atomic .* when DOTALL is set and there are no occurrences
8001    of *PRUNE or *SKIP.
8002    
8003  Otherwise, if we know what the first byte has to be, save it, because that  Otherwise, if we know what the first byte has to be, save it, because that
8004  speeds up unanchored matches no end. If not, see if we can set the  speeds up unanchored matches no end. If not, see if we can set the
8005  PCRE_STARTLINE flag. This is helpful for multiline matches when all branches  PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
8006  start with ^, and also when all branches start with .* for non-DOTALL matches.  start with ^, and also when all branches start with non-atomic .* for
8007  */  non-DOTALL matches when *PRUNE and *SKIP are not present. */
8008    
8009  if ((re->options & PCRE_ANCHORED) == 0)  if ((re->options & PCRE_ANCHORED) == 0)
8010    {    {
8011    if (is_anchored(codestart, 0, cd->backref_map))    if (is_anchored(codestart, 0, cd, 0)) re->options |= PCRE_ANCHORED;
     re->options |= PCRE_ANCHORED;  
8012    else    else
8013      {      {
8014      if (firstchar < 0)      if (firstchar < 0)
# Line 8111  if ((re->options & PCRE_ANCHORED) == 0) Line 8045  if ((re->options & PCRE_ANCHORED) == 0)
8045    
8046        re->flags |= PCRE_FIRSTSET;        re->flags |= PCRE_FIRSTSET;
8047        }        }
8048      else if (is_startline(codestart, 0, cd->backref_map))  
8049        re->flags |= PCRE_STARTLINE;      else if (is_startline(codestart, 0, cd, 0)) re->flags |= PCRE_STARTLINE;
8050      }      }
8051    }    }
8052    

Legend:
Removed from v.978  
changed lines
  Added in v.1045

  ViewVC Help
Powered by ViewVC 1.1.5