/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 1041 by ph10, Sun Sep 16 10:16:27 2012 UTC revision 1045 by ph10, Sun Sep 23 16:50:00 2012 UTC
# Line 68  COMPILE_PCREx macro will already be appr Line 68  COMPILE_PCREx macro will already be appr
68    
69  /* Macro for setting individual bits in class bitmaps. */  /* Macro for setting individual bits in class bitmaps. */
70    
71  #define SETBIT(a,b) a[b/8] |= (1 << (b%8))  #define SETBIT(a,b) a[(b)/8] |= (1 << ((b)&7))
72    
73  /* Maximum length value to check against when making sure that the integer that  /* Maximum length value to check against when making sure that the integer that
74  holds the compiled pattern length does not overflow. We make it a bit less than  holds the compiled pattern length does not overflow. We make it a bit less than
# Line 77  to check them every time. */ Line 77  to check them every time. */
77    
78  #define OFLOW_MAX (INT_MAX - 20)  #define OFLOW_MAX (INT_MAX - 20)
79    
80    /* Definitions to allow mutual recursion. These prototypes let code that
        appears earlier in the file call functions that are defined later on. In
        particular, add_to_class() calls add_list_to_class(), which is defined
        after it and which in turn calls add_to_class(). */
81    
82    static int
83      add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
84        const pcre_uint32 *, unsigned int);
85    
86    static BOOL
87      compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL,
88        int, int, int *, int *, branch_chain *, compile_data *, int *);
89    
90    
91    
92  /*************************************************  /*************************************************
93  *      Code parameters and static tables         *  *      Code parameters and static tables         *
# Line 631  static const pcre_uint8 ebcdic_chartab[] Line 642  static const pcre_uint8 ebcdic_chartab[]
642  #endif  #endif
643    
644    
 /* Definition to allow mutual recursion */  
   
 static BOOL  
   compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,  
     int *, int *, branch_chain *, compile_data *, int *);  
   
645    
646    
647  /*************************************************  /*************************************************
# Line 2871  PUT(previous_callout, 2 + LINK_SIZE, len Line 2876  PUT(previous_callout, 2 + LINK_SIZE, len
2876  *************************************************/  *************************************************/
2877    
2878  /* This function is passed the start and end of a class range, in UTF-8 mode  /* This function is passed the start and end of a class range, in UTF-8 mode
2879  with UCP support. It searches up the characters, looking for internal ranges of  with UCP support. It searches up the characters, looking for ranges of
2880  characters in the "other" case. Each call returns the next one, updating the  characters in the "other" case. Each call returns the next one, updating the
2881  start address.  start address. A character with multiple other cases is returned on its own
2882    with a special return value.
2883    
2884  Arguments:  Arguments:
2885    cptr        points to starting character value; updated    cptr        points to starting character value; updated
# Line 2881  Arguments: Line 2887  Arguments:
2887    ocptr       where to put start of othercase range    ocptr       where to put start of othercase range
2888    odptr       where to put end of othercase range    odptr       where to put end of othercase range
2889    
2890  Yield:        TRUE when range returned; FALSE when no more  Yield:        -1 when no more
2891                   0 when a range is returned
2892                  >0 the CASESET offset for char with multiple other cases
2893                    in this case, ocptr contains the original
2894  */  */
2895    
2896  static BOOL  static int
2897  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2898    unsigned int *odptr)    unsigned int *odptr)
2899  {  {
2900  unsigned int c, othercase, next;  unsigned int c, othercase, next;
2901    int co;
2902    
2903    /* Find the first character that has an other case. If it has multiple other
2904    cases, return its case offset value. */
2905    
2906  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
2907    { if ((othercase = UCD_OTHERCASE(c)) != c) break; }    {
2908      if ((co = UCD_CASESET(c)) != 0)
2909        {
2910        *ocptr = c++;   /* Character that has the set */
2911        *cptr = c;      /* Rest of input range */
2912        return co;
2913        }
2914      if ((othercase = UCD_OTHERCASE(c)) != c) break;
2915      }
2916    
2917  if (c > d) return FALSE;  if (c > d) return -1;  /* Reached end of range */
2918    
2919  *ocptr = othercase;  *ocptr = othercase;
2920  next = othercase + 1;  next = othercase + 1;
# Line 2904  for (++c; c <= d; c++) Line 2925  for (++c; c <= d; c++)
2925    next++;    next++;
2926    }    }
2927    
2928  *odptr = next - 1;  *odptr = next - 1;     /* End of othercase range */
2929  *cptr = c;  *cptr = c;             /* Rest of input range */
2930    return 0;
 return TRUE;  
2931  }  }
2932    
2933    
# Line 3357  switch(op_code) Line 3377  switch(op_code)
3377    
3378    
3379  /*************************************************  /*************************************************
3380    *        Add a character or range to a class     *
3381    *************************************************/
3382    
3383    /* This function packages up the logic of adding a character or range of
3384    characters to a class. The character values in the arguments will be within the
3385    valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
3386    mutually recursive with the function immediately below (add_list_to_class),
        which is called for a character that has more than one other case.

        Summary of behaviour:

          . Caseless, UTF mode with UCP support: get_othercase_range() is called
            repeatedly to find other-case ranges. A range that lies inside the
            original is ignored, one that overlaps or adjoins the original extends
            it, and any other range is added by a recursive call made with
            PCRE_CASELESS removed from the options.

          . Caseless, otherwise: for every character in the range that is below
            256, the bit for cd->fcc[c] is set in the bit map.

          . The original range is then added. In the 8-bit and 16-bit libraries,
            when UTF mode is not set, "end" is first reduced to 0xff or 0xffff
            respectively. If "end" is then below 0x100, bits are set in the bit
            map; otherwise the whole range (including any part below 256) is
            written to the extra data as XCL_RANGE or XCL_SINGLE. Nothing is
            written when start > end.
3387    
3388    Arguments:
3389      classbits     the bit map for characters < 256
3390      uchardptr     points to the pointer for extra data
3391      options       the options word
3392      cd            contains pointers to tables etc.
3393      start         start of range character
3394      end           end of range character
3395    
3396    Returns:        the number of < 256 characters added
3397                    the pointer to extra data is updated

        The count is incremented each time a bit is set, so in caseless non-UTF mode
        a character is counted both for its cd->fcc[] bit and for its own bit.
3398    */
3399    
3400    static int
3401    add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
3402      compile_data *cd, unsigned int start, unsigned int end)
3403    {
3404    unsigned int c;
3405    int n8 = 0;
3406    
3407    /* If caseless matching is required, scan the range and process alternate
3408    cases. In Unicode, there are 8-bit characters that have alternate cases that
3409    are greater than 255 and vice-versa. Sometimes we can just extend the original
3410    range. */
3411    
3412    if ((options & PCRE_CASELESS) != 0)
3413      {
3414    #ifdef SUPPORT_UCP
3415      if ((options & PCRE_UTF8) != 0)
3416        {
3417        int rc;
3418        unsigned int oc, od;
3419    
3420        options &= ~PCRE_CASELESS;   /* Remove for recursive calls */
3421        c = start;
3422    
3423        while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
3424          {
3425          /* Handle a single character that has more than one other case. In this
              case rc is the offset into the caseless sets table and oc is the
              character itself, which is passed as the one to omit. */
3426    
3427          if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
3428            PRIV(ucd_caseless_sets) + rc, oc);
3429    
3430          /* Do nothing if the other case range is within the original range. */
3431    
3432          else if (oc >= start && od <= end) continue;
3433    
3434          /* Extend the original range if there is overlap, noting that if oc < c, we
3435          can't have od > end because a subrange is always shorter than the basic
3436          range. Otherwise, use a recursive call to add the additional range. */
3437    
3438          else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
3439          else if (od > end && oc <= end + 1) end = od;       /* Extend upwards */
3440          else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
3441          }
3442        }
3443      else
3444    #endif  /* SUPPORT_UCP */
3445    
3446      /* Not UTF-mode, or no UCP. Set the bit for cd->fcc[c] for each character in
          the range that is below 256. When SUPPORT_UCP is defined, this loop is the
          body of the "else" above. */
3447    
3448      for (c = start; c <= end && c < 256; c++)
3449        {
3450        SETBIT(classbits, cd->fcc[c]);
3451        n8++;
3452        }
3453      }
3454    
3455    /* Now handle the original range. Adjust the final value according to the bit
3456    length - this means that the same lists of (e.g.) horizontal spaces can be used
3457    in all cases. The adjustment is skipped when UTF support is compiled in and
        the UTF option bit is set. */
3458    
3459    #ifdef COMPILE_PCRE8
3460    #ifdef SUPPORT_UTF
3461      if ((options & PCRE_UTF8) == 0)
3462    #endif
3463      if (end > 0xff) end = 0xff;
3464    #endif
3465    
3466    #ifdef COMPILE_PCRE16
3467    #ifdef SUPPORT_UTF
3468      if ((options & PCRE_UTF16) == 0)
3469    #endif
3470      if (end > 0xffff) end = 0xffff;
3471    #endif
3472    
3473    /* If all characters are less than 256, use the bit map. Otherwise use extra
3474    data. A range that starts below 256 but ends at 256 or above is therefore
        written entirely to the extra data. If start > end (which can happen after
        the adjustment above), neither path adds anything. */
3475    
3476    if (end < 0x100)
3477      {
3478      for (c = start; c <= end; c++)
3479        {
3480        n8++;
3481        SETBIT(classbits, c);
3482        }
3483      }
3484    
3485    else
3486      {
3487      pcre_uchar *uchardata = *uchardptr;
3488    
3489    #ifdef SUPPORT_UTF
3490      if ((options & PCRE_UTF8) != 0)  /* All UTFs use the same flag bit */
3491        {
3492        if (start < end)
3493          {
3494          *uchardata++ = XCL_RANGE;
3495          uchardata += PRIV(ord2utf)(start, uchardata);
3496          uchardata += PRIV(ord2utf)(end, uchardata);
3497          }
3498        else if (start == end)
3499          {
3500          *uchardata++ = XCL_SINGLE;
3501          uchardata += PRIV(ord2utf)(start, uchardata);
3502          }
3503        }
3504      else
3505    #endif  /* SUPPORT_UTF */
3506    
3507      /* Without UTF support, character values are constrained by the bit length,
3508      and can only be > 256 for 16-bit and 32-bit libraries. */
3509    
3510    #ifdef COMPILE_PCRE8
3511        {}   /* Nothing to write; serves as the "else" body when SUPPORT_UTF is defined */
3512    #else
3513      if (start < end)
3514        {
3515        *uchardata++ = XCL_RANGE;
3516        *uchardata++ = start;
3517        *uchardata++ = end;
3518        }
3519      else if (start == end)
3520        {
3521        *uchardata++ = XCL_SINGLE;
3522        *uchardata++ = start;
3523        }
3524    #endif
3525    
3526      *uchardptr = uchardata;   /* Update extra data pointer */
3527      }
3528    
3529    return n8;    /* Number of 8-bit characters */
3530    }
3531    
3532    
3533    
3534    
3535    /*************************************************
3536    *        Add a list of characters to a class     *
3537    *************************************************/
3538    
3539    /* This function is used for adding a list of case-equivalent characters to a
3540    class, and also for adding a list of horizontal or vertical whitespace. If the
3541    list is in order (which it should be), ranges of characters are detected and
3542    handled appropriately. This function is mutually recursive with the function
3543    above (add_to_class), which is called once for each run of consecutive values
        in the list.

        The "except" test is applied only to the first value of each run, so it
        omits the character only when that character starts a run; such an entry is
        skipped on its own and scanning resumes at the next entry.
3544    
3545    Arguments:
3546      classbits     the bit map for characters < 256
3547      uchardptr     points to the pointer for extra data
3548      options       the options word
3549      cd            contains pointers to tables etc.
3550      p             points to row of 32-bit values, terminated by NOTACHAR
3551      except        character to omit; this is used when adding lists of
3552                      case-equivalent characters to avoid including the one we
3553                      already know about
                        (the whitespace callers pass NOTACHAR, so nothing is omitted)
3554    
3555    Returns:        the number of < 256 characters added
3556                    the pointer to extra data is updated
                      (this is the sum of the values returned by add_to_class)
3557    */
3558    
3559    static int
3560    add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
3561      compile_data *cd, const pcre_uint32 *p, unsigned int except)
3562    {
3563    int n8 = 0;
3564    while (p[0] < NOTACHAR)
3565      {
3566      int n = 0;   /* Number of further consecutive values that follow p[0] */
3567      if (p[0] != except)
3568        {
3569        while(p[n+1] == p[0] + n + 1) n++;   /* Extend over consecutive values */
3570        n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
3571        }
3572      p += n + 1;   /* Move past the run, or past the single omitted value */
3573      }
3574    return n8;
3575    }
3576    
3577    
3578    
3579    /*************************************************
3580    *    Add characters not in a list to a class     *
3581    *************************************************/
3582    
3583    /* This function is used for adding the complement of a list of horizontal or
3584    vertical whitespace to a class. The list must be in order.

        The characters from 0 up to one less than the first list value are added
        first (if the first value is not 0). Then, for each run of consecutive list
        values, the gap that follows the run is added: it ends one before the next
        list value or, after the final value, at 0x10ffff. All the additions are
        made by calling add_to_class(), which reduces the upper limit to suit the
        library's character width when UTF mode is not set.

        NOTE(review): an empty list (p[0] == NOTACHAR) would add the single range
        0 to NOTACHAR-1; presumably callers never pass an empty list - confirm.
3585    
3586    Arguments:
3587      classbits     the bit map for characters < 256
3588      uchardptr     points to the pointer for extra data
3589      options       the options word
3590      cd            contains pointers to tables etc.
3591      p             points to row of 32-bit values, terminated by NOTACHAR
3592    
3593    Returns:        the number of < 256 characters added
3594                    the pointer to extra data is updated
                      (this is the sum of the values returned by add_to_class)
3595    */
3596    
3597    static int
3598    add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
3599      int options, compile_data *cd, const pcre_uint32 *p)
3600    {
3601    int n8 = 0;
3602    if (p[0] > 0)   /* Add everything below the first list value */
3603      n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
3604    while (p[0] < NOTACHAR)
3605      {
3606      while (p[1] == p[0] + 1) p++;   /* Move to the last value of a consecutive run */
3607      n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
3608        (p[1] == NOTACHAR)? 0x10ffff : p[1] - 1);
3609      p++;
3610      }
3611    return n8;
3612    }
3613    
3614    
3615    
3616    /*************************************************
3617  *           Compile one branch                   *  *           Compile one branch                   *
3618  *************************************************/  *************************************************/
3619    
# Line 3474  for (;; ptr++) Line 3731  for (;; ptr++)
3731    BOOL is_recurse;    BOOL is_recurse;
3732    BOOL reset_bracount;    BOOL reset_bracount;
3733    int class_has_8bitchar;    int class_has_8bitchar;
3734    int class_single_char;    int class_one_char;
3735    int newoptions;    int newoptions;
3736    int recno;    int recno;
3737    int refsign;    int refsign;
# Line 3772  for (;; ptr++) Line 4029  for (;; ptr++)
4029    
4030      should_flip_negation = FALSE;      should_flip_negation = FALSE;
4031    
4032      /* For optimization purposes, we track some properties of the class.      /* For optimization purposes, we track some properties of the class:
4033      class_has_8bitchar will be non-zero, if the class contains at least one      class_has_8bitchar will be non-zero if the class contains at least one <
4034      < 256 character. class_single_char will be 1 if the class contains only      256 character; class_one_char will be 1 if the class contains just one
4035      a single character. */      character. */
4036    
4037      class_has_8bitchar = 0;      class_has_8bitchar = 0;
4038      class_single_char = 0;      class_one_char = 0;
4039    
4040      /* Initialize the 32-char bit map to all zeros. We build the map in a      /* Initialize the 32-char bit map to all zeros. We build the map in a
4041      temporary bit of memory, in case the class contains only 1 character (less      temporary bit of memory, in case the class contains fewer than two
4042      than 256), because in that case the compiled code doesn't use the bit map.      8-bit characters because in that case the compiled code doesn't use the bit
4043      */      map. */
4044    
4045      memset(classbits, 0, 32 * sizeof(pcre_uint8));      memset(classbits, 0, 32 * sizeof(pcre_uint8));
4046    
4047  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4048      xclass = FALSE;                           /* No chars >= 256 */      xclass = FALSE;
4049      class_uchardata = code + LINK_SIZE + 2;   /* For UTF-8 items */      class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
4050      class_uchardata_base = class_uchardata;   /* For resetting in pass 1 */      class_uchardata_base = class_uchardata;   /* Save the start */
4051  #endif  #endif
4052    
4053      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
# Line 3812  for (;; ptr++) Line 4069  for (;; ptr++)
4069        /* In the pre-compile phase, accumulate the length of any extra        /* In the pre-compile phase, accumulate the length of any extra
4070        data and reset the pointer. This is so that very large classes that        data and reset the pointer. This is so that very large classes that
4071        contain a zillion > 255 characters no longer overwrite the work space        contain a zillion > 255 characters no longer overwrite the work space
4072        (which is on the stack). */        (which is on the stack). We have to remember that there was XCLASS data,
4073          however. */
4074    
4075        if (lengthptr != NULL)        if (lengthptr != NULL && class_uchardata > class_uchardata_base)
4076          {          {
4077            xclass = TRUE;
4078          *lengthptr += class_uchardata - class_uchardata_base;          *lengthptr += class_uchardata - class_uchardata_base;
4079          class_uchardata = class_uchardata_base;          class_uchardata = class_uchardata_base;
4080          }          }
# Line 3917  for (;; ptr++) Line 4176  for (;; ptr++)
4176              for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];              for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
4177            }            }
4178    
4179          /* Not see if we need to remove any special characters. An option          /* Now see if we need to remove any special characters. An option
4180          value of 1 removes vertical space and 2 removes underscore. */          value of 1 removes vertical space and 2 removes underscore. */
4181    
4182          if (tabopt < 0) tabopt = -tabopt;          if (tabopt < 0) tabopt = -tabopt;
# Line 3933  for (;; ptr++) Line 4192  for (;; ptr++)
4192            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];            for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
4193    
4194          ptr = tempptr + 1;          ptr = tempptr + 1;
4195          /* Every class contains at least one < 256 characters. */          /* Every class contains at least one < 256 character. */
4196          class_has_8bitchar = 1;          class_has_8bitchar = 1;
4197          /* Every class contains at least two characters. */          /* Every class contains at least two characters. */
4198          class_single_char = 2;          class_one_char = 2;
4199          continue;    /* End of POSIX syntax handling */          continue;    /* End of POSIX syntax handling */
4200          }          }
4201    
# Line 3944  for (;; ptr++) Line 4203  for (;; ptr++)
4203        of the specials, which just set a flag. The sequence \b is a special        of the specials, which just set a flag. The sequence \b is a special
4204        case. Inside a class (and only there) it is treated as backspace. We        case. Inside a class (and only there) it is treated as backspace. We
4205        assume that other escapes have more than one character in them, so        assume that other escapes have more than one character in them, so
4206        speculatively set both class_has_8bitchar and class_single_char bigger        speculatively set both class_has_8bitchar and class_one_char bigger
4207        than one. Unrecognized escapes fall through and are either treated        than one. Unrecognized escapes fall through and are either treated
4208        as literal characters (by default), or are faulted if        as literal characters (by default), or are faulted if
4209        PCRE_EXTRA is set. */        PCRE_EXTRA is set. */
# Line 3977  for (;; ptr++) Line 4236  for (;; ptr++)
4236            /* Every class contains at least two < 256 characters. */            /* Every class contains at least two < 256 characters. */
4237            class_has_8bitchar++;            class_has_8bitchar++;
4238            /* Every class contains at least two characters. */            /* Every class contains at least two characters. */
4239            class_single_char += 2;            class_one_char += 2;
4240    
4241            switch (-c)            switch (-c)
4242              {              {
# Line 4027  for (;; ptr++) Line 4286  for (;; ptr++)
4286              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
4287              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
4288              continue;              continue;
4289    
4290                /* The rest apply in both UCP and non-UCP cases. */
4291    
4292              case ESC_h:              case ESC_h:
4293              SETBIT(classbits, CHAR_HT);              (void)add_list_to_class(classbits, &class_uchardata, options, cd,
4294              SETBIT(classbits, CHAR_SPACE);                PRIV(hspace_list), NOTACHAR);
 #ifndef EBCDIC  
             SETBIT(classbits, 0xa0); /* NSBP */  
 #ifndef COMPILE_PCRE8  
             xclass = TRUE;  
             *class_uchardata++ = XCL_SINGLE;  
             *class_uchardata++ = 0x1680;  
             *class_uchardata++ = XCL_SINGLE;  
             *class_uchardata++ = 0x180e;  
             *class_uchardata++ = XCL_RANGE;  
             *class_uchardata++ = 0x2000;  
             *class_uchardata++ = 0x200a;  
             *class_uchardata++ = XCL_SINGLE;  
             *class_uchardata++ = 0x202f;  
             *class_uchardata++ = XCL_SINGLE;  
             *class_uchardata++ = 0x205f;  
             *class_uchardata++ = XCL_SINGLE;  
             *class_uchardata++ = 0x3000;  
 #elif defined SUPPORT_UTF  
             if (utf)  
               {  
               xclass = TRUE;  
               *class_uchardata++ = XCL_SINGLE;  
               class_uchardata += PRIV(ord2utf)(0x1680, class_uchardata);  
               *class_uchardata++ = XCL_SINGLE;  
               class_uchardata += PRIV(ord2utf)(0x180e, class_uchardata);  
               *class_uchardata++ = XCL_RANGE;  
               class_uchardata += PRIV(ord2utf)(0x2000, class_uchardata);  
               class_uchardata += PRIV(ord2utf)(0x200a, class_uchardata);  
               *class_uchardata++ = XCL_SINGLE;  
               class_uchardata += PRIV(ord2utf)(0x202f, class_uchardata);  
               *class_uchardata++ = XCL_SINGLE;  
               class_uchardata += PRIV(ord2utf)(0x205f, class_uchardata);  
               *class_uchardata++ = XCL_SINGLE;  
               class_uchardata += PRIV(ord2utf)(0x3000, class_uchardata);  
               }  
 #endif  
 #endif  /* Not EBCDIC */  
4295              continue;              continue;
4296    
4297              case ESC_H:              case ESC_H:
4298              for (c = 0; c < 32; c++)              (void)add_not_list_to_class(classbits, &class_uchardata, options,
4299                {                cd, PRIV(hspace_list));
               int x = 0xff;  
               switch (c)  
                 {  
                 case CHAR_HT/8:    x ^= 1 << (CHAR_HT%8); break;  
                 case CHAR_SPACE/8: x ^= 1 << (CHAR_SPACE%8); break;  
 #ifndef EBCDIC  
                 case 0xa0/8: x ^= 1 << (0xa0%8); break;  /* NSBSP */  
 #endif  
                 default: break;  
                 }  
               classbits[c] |= x;  
               }  
 #ifndef EBCDIC  
 #ifndef COMPILE_PCRE8  
             xclass = TRUE;  
             *class_uchardata++ = XCL_RANGE;  
             *class_uchardata++ = 0x0100;  
             *class_uchardata++ = 0x167f;  
             *class_uchardata++ = XCL_RANGE;  
             *class_uchardata++ = 0x1681;  
             *class_uchardata++ = 0x180d;  
             *class_uchardata++ = XCL_RANGE;  
             *class_uchardata++ = 0x180f;  
             *class_uchardata++ = 0x1fff;  
             *class_uchardata++ = XCL_RANGE;  
             *class_uchardata++ = 0x200b;  
             *class_uchardata++ = 0x202e;  
             *class_uchardata++ = XCL_RANGE;  
             *class_uchardata++ = 0x2030;  
             *class_uchardata++ = 0x205e;  
             *class_uchardata++ = XCL_RANGE;  
             *class_uchardata++ = 0x2060;  
             *class_uchardata++ = 0x2fff;  
             *class_uchardata++ = XCL_RANGE;  
             *class_uchardata++ = 0x3001;  
 #ifdef SUPPORT_UTF  
             if (utf)  
               class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);  
             else  
 #endif   /* SUPPORT_UTF */  
               *class_uchardata++ = 0xffff;  
 #elif defined SUPPORT_UTF  
             if (utf)  
               {  
               xclass = TRUE;  
               *class_uchardata++ = XCL_RANGE;  
               class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);  
               class_uchardata += PRIV(ord2utf)(0x167f, class_uchardata);  
               *class_uchardata++ = XCL_RANGE;  
               class_uchardata += PRIV(ord2utf)(0x1681, class_uchardata);  
               class_uchardata += PRIV(ord2utf)(0x180d, class_uchardata);  
               *class_uchardata++ = XCL_RANGE;  
               class_uchardata += PRIV(ord2utf)(0x180f, class_uchardata);  
               class_uchardata += PRIV(ord2utf)(0x1fff, class_uchardata);  
               *class_uchardata++ = XCL_RANGE;  
               class_uchardata += PRIV(ord2utf)(0x200b, class_uchardata);  
               class_uchardata += PRIV(ord2utf)(0x202e, class_uchardata);  
               *class_uchardata++ = XCL_RANGE;  
               class_uchardata += PRIV(ord2utf)(0x2030, class_uchardata);  
               class_uchardata += PRIV(ord2utf)(0x205e, class_uchardata);  
               *class_uchardata++ = XCL_RANGE;  
               class_uchardata += PRIV(ord2utf)(0x2060, class_uchardata);  
               class_uchardata += PRIV(ord2utf)(0x2fff, class_uchardata);  
               *class_uchardata++ = XCL_RANGE;  
               class_uchardata += PRIV(ord2utf)(0x3001, class_uchardata);  
               class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);  
               }  
 #endif  
 #endif  /* Not EBCDIC */  
4300              continue;              continue;
4301    
4302              case ESC_v:              case ESC_v:
4303              SETBIT(classbits, CHAR_LF);              (void)add_list_to_class(classbits, &class_uchardata, options, cd,
4304              SETBIT(classbits, CHAR_VT);                PRIV(vspace_list), NOTACHAR);
             SETBIT(classbits, CHAR_FF);  
             SETBIT(classbits, CHAR_CR);  
             SETBIT(classbits, CHAR_NEL);  
 #ifndef EBCDIC  
 #ifndef COMPILE_PCRE8  
             xclass = TRUE;  
             *class_uchardata++ = XCL_RANGE;  
             *class_uchardata++ = 0x2028;  
             *class_uchardata++ = 0x2029;  
 #elif defined SUPPORT_UTF  
             if (utf)  
               {  
               xclass = TRUE;  
               *class_uchardata++ = XCL_RANGE;  
               class_uchardata += PRIV(ord2utf)(0x2028, class_uchardata);  
               class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata);  
               }  
 #endif  
 #endif  /* Not EBCDIC */  
4305              continue;              continue;
4306    
4307              case ESC_V:              case ESC_V:
4308              for (c = 0; c < 32; c++)              (void)add_not_list_to_class(classbits, &class_uchardata, options,
4309                {                cd, PRIV(vspace_list));
               int x = 0xff;  
               switch (c)  
                 {  
                 case CHAR_LF/8: x ^= 1 << (CHAR_LF%8);  
                                 x ^= 1 << (CHAR_VT%8);  
                                 x ^= 1 << (CHAR_FF%8);  
                                 x ^= 1 << (CHAR_CR%8);  
                                 break;  
                 case CHAR_NEL/8: x ^= 1 << (CHAR_NEL%8); break;  
                 default: break;  
                 }  
               classbits[c] |= x;  
               }  
   
 #ifndef EBCDIC  
 #ifndef COMPILE_PCRE8  
             xclass = TRUE;  
             *class_uchardata++ = XCL_RANGE;  
             *class_uchardata++ = 0x0100;  
             *class_uchardata++ = 0x2027;  
             *class_uchardata++ = XCL_RANGE;  
             *class_uchardata++ = 0x202a;  
 #ifdef SUPPORT_UTF  
             if (utf)  
               class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);  
             else  
 #endif  
               *class_uchardata++ = 0xffff;  
 #elif defined SUPPORT_UTF  
             if (utf)  
               {  
               xclass = TRUE;  
               *class_uchardata++ = XCL_RANGE;  
               class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);  
               class_uchardata += PRIV(ord2utf)(0x2027, class_uchardata);  
               *class_uchardata++ = XCL_RANGE;  
               class_uchardata += PRIV(ord2utf)(0x202a, class_uchardata);  
               class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);  
               }  
 #endif  
 #endif  /* Not EBCDIC */  
4310              continue;              continue;
4311    
4312  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
# Line 4222  for (;; ptr++) Line 4317  for (;; ptr++)
4317                int pdata;                int pdata;
4318                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4319                if (ptype < 0) goto FAILED;                if (ptype < 0) goto FAILED;
               xclass = TRUE;  
4320                *class_uchardata++ = ((-c == ESC_p) != negated)?                *class_uchardata++ = ((-c == ESC_p) != negated)?
4321                  XCL_PROP : XCL_NOTPROP;                  XCL_PROP : XCL_NOTPROP;
4322                *class_uchardata++ = ptype;                *class_uchardata++ = ptype;
# Line 4242  for (;; ptr++) Line 4336  for (;; ptr++)
4336                goto FAILED;                goto FAILED;
4337                }                }
4338              class_has_8bitchar--;    /* Undo the speculative increase. */              class_has_8bitchar--;    /* Undo the speculative increase. */
4339              class_single_char -= 2;  /* Undo the speculative increase. */              class_one_char -= 2;     /* Undo the speculative increase. */
4340              c = *ptr;                /* Get the final character and fall through */              c = *ptr;                /* Get the final character and fall through */
4341              break;              break;
4342              }              }
4343            }            }
4344    
4345          /* Fall through if we have a single character (c >= 0). This may be          /* Fall through if the escape just defined a single character (c >= 0).
4346          greater than 256. */          This may be greater than 256. */
4347    
4348          }   /* End of backslash handling */          }   /* End of backslash handling */
4349    
4350        /* A single character may be followed by '-' to form a range. However,        /* A character may be followed by '-' to form a range. However, Perl does
4351        Perl does not permit ']' to be the end of the range. A '-' character        not permit ']' to be the end of the range. A '-' character at the end is
4352        at the end is treated as a literal. Perl ignores orphaned \E sequences        treated as a literal. Perl ignores orphaned \E sequences entirely. The
4353        entirely. The code for handling \Q and \E is messy. */        code for handling \Q and \E is messy. */
4354    
4355        CHECK_RANGE:        CHECK_RANGE:
4356        while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)        while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
# Line 4264  for (;; ptr++) Line 4358  for (;; ptr++)
4358          inescq = FALSE;          inescq = FALSE;
4359          ptr += 2;          ptr += 2;
4360          }          }
   
4361        oldptr = ptr;        oldptr = ptr;
4362    
4363        /* Remember \r or \n */        /* Remember if \r or \n were explicitly used */
4364    
4365        if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;        if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4366    
# Line 4290  for (;; ptr++) Line 4383  for (;; ptr++)
4383            inescq = TRUE;            inescq = TRUE;
4384            break;            break;
4385            }            }
4386    
4387            /* Minus (hyphen) at the end of a class is treated as a literal, so put
4388            back the pointer and jump to handle the character that preceded it. */
4389    
4390          if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))          if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
4391            {            {
4392            ptr = oldptr;            ptr = oldptr;
4393            goto LONE_SINGLE_CHARACTER;            goto CLASS_SINGLE_CHARACTER;
4394            }            }
4395    
4396            /* Otherwise, we have a potential range; pick up the next character */
4397    
4398  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
4399          if (utf)          if (utf)
# Line 4315  for (;; ptr++) Line 4413  for (;; ptr++)
4413            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
4414            if (*errorcodeptr != 0) goto FAILED;            if (*errorcodeptr != 0) goto FAILED;
4415    
4416            /* \b is backspace; any other special means the '-' was literal */            /* \b is backspace; any other special means the '-' was literal. */
4417    
4418            if (d < 0)            if (d < 0)
4419              {              {
4420              if (d == -ESC_b) d = CHAR_BS; else              if (d == -ESC_b) d = CHAR_BS; else
4421                {                {
4422                ptr = oldptr;                ptr = oldptr;
4423                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto CLASS_SINGLE_CHARACTER;  /* A few lines below */
4424                }                }
4425              }              }
4426            }            }
4427    
4428          /* Check that the two values are in the correct order. Optimize          /* Check that the two values are in the correct order. Optimize
4429          one-character ranges */          one-character ranges. */
4430    
4431          if (d < c)          if (d < c)
4432            {            {
4433            *errorcodeptr = ERR8;            *errorcodeptr = ERR8;
4434            goto FAILED;            goto FAILED;
4435            }            }
4436            if (d == c) goto CLASS_SINGLE_CHARACTER;  /* A few lines below */
4437    
4438          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          /* We have found a character range, so single character optimizations
4439            cannot be done anymore. Any value greater than 1 indicates that there
4440            is more than one character. */
4441    
4442            class_one_char = 2;
4443    
4444          /* Remember \r or \n */          /* Remember an explicit \r or \n, and add the range to the class. */
4445    
4446          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4447    
4448          /* Since we found a character range, single character optimizations          class_has_8bitchar +=
4449          cannot be done anymore. */            add_to_class(classbits, &class_uchardata, options, cd, c, d);
4450          class_single_char = 2;  
   
         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless  
         matching, we have to use an XCLASS with extra data items. Caseless  
         matching for characters > 127 is available only if UCP support is  
         available. */  
   
 #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)  
         if ((d > 255) || (utf && ((options & PCRE_CASELESS) != 0 && d > 127)))  
 #elif defined  SUPPORT_UTF  
         if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))  
 #elif !(defined COMPILE_PCRE8)  
         if (d > 255)  
 #endif  
 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)  
           {  
           xclass = TRUE;  
   
           /* With UCP support, we can find the other case equivalents of  
           the relevant characters. There may be several ranges. Optimize how  
           they fit with the basic range. */  
   
 #ifdef SUPPORT_UCP  
 #ifndef COMPILE_PCRE8  
           if (utf && (options & PCRE_CASELESS) != 0)  
 #else  
           if ((options & PCRE_CASELESS) != 0)  
 #endif  
             {  
             unsigned int occ, ocd;  
             unsigned int cc = c;  
             unsigned int origd = d;  
             while (get_othercase_range(&cc, origd, &occ, &ocd))  
               {  
               if (occ >= (unsigned int)c &&  
                   ocd <= (unsigned int)d)  
                 continue;                          /* Skip embedded ranges */  
   
               if (occ < (unsigned int)c  &&  
                   ocd >= (unsigned int)c - 1)      /* Extend the basic range */  
                 {                                  /* if there is overlap,   */  
                 c = occ;                           /* noting that if occ < c */  
                 continue;                          /* we can't have ocd > d  */  
                 }                                  /* because a subrange is  */  
               if (ocd > (unsigned int)d &&  
                   occ <= (unsigned int)d + 1)      /* always shorter than    */  
                 {                                  /* the basic range.       */  
                 d = ocd;  
                 continue;  
                 }  
   
               if (occ == ocd)  
                 {  
                 *class_uchardata++ = XCL_SINGLE;  
                 }  
               else  
                 {  
                 *class_uchardata++ = XCL_RANGE;  
                 class_uchardata += PRIV(ord2utf)(occ, class_uchardata);  
                 }  
               class_uchardata += PRIV(ord2utf)(ocd, class_uchardata);  
               }  
             }  
 #endif  /* SUPPORT_UCP */  
   
           /* Now record the original range, possibly modified for UCP caseless  
           overlapping ranges. */  
   
           *class_uchardata++ = XCL_RANGE;  
 #ifdef SUPPORT_UTF  
 #ifndef COMPILE_PCRE8  
           if (utf)  
             {  
             class_uchardata += PRIV(ord2utf)(c, class_uchardata);  
             class_uchardata += PRIV(ord2utf)(d, class_uchardata);  
             }  
           else  
             {  
             *class_uchardata++ = c;  
             *class_uchardata++ = d;  
             }  
 #else  
           class_uchardata += PRIV(ord2utf)(c, class_uchardata);  
           class_uchardata += PRIV(ord2utf)(d, class_uchardata);  
 #endif  
 #else /* SUPPORT_UTF */  
           *class_uchardata++ = c;  
           *class_uchardata++ = d;  
 #endif /* SUPPORT_UTF */  
   
           /* With UCP support, we are done. Without UCP support, there is no  
           caseless matching for UTF characters > 127; we can use the bit map  
           for the smaller ones. As for 16 bit characters without UTF, we  
           can still use  */  
   
 #ifdef SUPPORT_UCP  
 #ifndef COMPILE_PCRE8  
           if (utf)  
 #endif  
             continue;    /* With next character in the class */  
 #endif  /* SUPPORT_UCP */  
   
 #if defined SUPPORT_UTF && !defined(SUPPORT_UCP) && !(defined COMPILE_PCRE8)  
           if (utf)  
             {  
             if ((options & PCRE_CASELESS) == 0 || c > 127) continue;  
             /* Adjust upper limit and fall through to set up the map */  
             d = 127;  
             }  
           else  
             {  
             if (c > 255) continue;  
             /* Adjust upper limit and fall through to set up the map */  
             d = 255;  
             }  
 #elif defined SUPPORT_UTF && !defined(SUPPORT_UCP)  
           if ((options & PCRE_CASELESS) == 0 || c > 127) continue;  
           /* Adjust upper limit and fall through to set up the map */  
           d = 127;  
 #else  
           if (c > 255) continue;  
           /* Adjust upper limit and fall through to set up the map */  
           d = 255;  
 #endif  /* SUPPORT_UTF && !SUPPORT_UCP && !COMPILE_PCRE8 */  
           }  
 #endif  /* SUPPORT_UTF || !COMPILE_PCRE8 */  
   
         /* We use the bit map for 8 bit mode, or when the characters fall  
         partially or entirely to [0-255] ([0-127] for UCP) ranges. */  
   
         class_has_8bitchar = 1;  
   
         /* We can save a bit of time by skipping this in the pre-compile. */  
   
         if (lengthptr == NULL) for (; c <= d; c++)  
           {  
           classbits[c/8] |= (1 << (c&7));  
           if ((options & PCRE_CASELESS) != 0)  
             {  
             int uc = cd->fcc[c]; /* flip case */  
             classbits[uc/8] |= (1 << (uc&7));  
             }  
           }  
   
4451          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
4452          }          }
4453    
4454        /* Handle a lone single character - we can get here for a normal        /* Handle a single character - we can get here for a normal non-escape
4455        non-escape char, or after \ that introduces a single character or for an        char, or after \ that introduces a single character or for an apparent
4456        apparent range that isn't. */        range that isn't. Only the value 1 matters for class_one_char, so don't
4457          increase it if it is already 2 or more ... just in case there's a class
4458        LONE_SINGLE_CHARACTER:        with a zillion characters in it. */
4459    
4460        /* Only the value of 1 matters for class_single_char. */        CLASS_SINGLE_CHARACTER:
4461          if (class_one_char < 2) class_one_char++;
4462        if (class_single_char < 2) class_single_char++;  
4463          /* If class_one_char is 1, we have the first single character in the
4464        /* If class_charcount is 1, we saw precisely one character. As long as        class, and there have been no prior ranges, or XCLASS items generated by
4465        there was no use of \p or \P, in other words, no use of any XCLASS        escapes. If this is the final character in the class, we can optimize by
4466        features, we can optimize.        turning the item into a 1-character OP_CHAR[I] if it's positive, or
4467          OP_NOT[I] if it's negative. In the positive case, it can cause firstchar
4468        The optimization throws away the bit map. We turn the item into a        to be set. Otherwise, there can be no first char if this item is first,
4469        1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.        whatever repeat count may follow. In the case of reqchar, save the
4470        In the positive case, it can cause firstchar to be set. Otherwise, there        previous value for reinstating. */
       can be no first char if this item is first, whatever repeat count may  
       follow. In the case of reqchar, save the previous value for reinstating. */  
4471    
4472        if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)        if (class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
4473          {          {
4474          ptr++;          ptr++;
4475          zeroreqchar = reqchar;          zeroreqchar = reqchar;
# Line 4544  for (;; ptr++) Line 4502  for (;; ptr++)
4502            }            }
4503          goto ONE_CHAR;          goto ONE_CHAR;
4504          }       /* End of 1-char optimization */          }       /* End of 1-char optimization */
4505    
4506        /* Handle a character that cannot go in the bit map. */        /* There is more than one character in the class, or an XCLASS item
4507          has been generated. Add this character to the class. */
4508  #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)  
4509        if ((c > 255) || (utf && ((options & PCRE_CASELESS) != 0 && c > 127)))        class_has_8bitchar +=
4510  #elif defined SUPPORT_UTF          add_to_class(classbits, &class_uchardata, options, cd, c, c);
       if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))  
 #elif !(defined COMPILE_PCRE8)  
       if (c > 255)  
 #endif  
   
 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)  
         {  
         xclass = TRUE;  
         *class_uchardata++ = XCL_SINGLE;  
 #ifdef SUPPORT_UTF  
 #ifndef COMPILE_PCRE8  
         /* In non 8 bit mode, we can get here even if we are not in UTF mode. */  
         if (!utf)  
           *class_uchardata++ = c;  
         else  
 #endif  
           class_uchardata += PRIV(ord2utf)(c, class_uchardata);  
 #else /* SUPPORT_UTF */  
         *class_uchardata++ = c;  
 #endif /* SUPPORT_UTF */  
   
 #ifdef SUPPORT_UCP  
 #ifdef COMPILE_PCRE8  
         if ((options & PCRE_CASELESS) != 0)  
 #else  
         /* In non 8 bit mode, we can get here even if we are not in UTF mode. */  
         if (utf && (options & PCRE_CASELESS) != 0)  
 #endif  
           {  
           unsigned int othercase;  
           if ((int)(othercase = UCD_OTHERCASE(c)) != c)  
             {  
             *class_uchardata++ = XCL_SINGLE;  
             class_uchardata += PRIV(ord2utf)(othercase, class_uchardata);  
             }  
           }  
 #endif  /* SUPPORT_UCP */  
   
         }  
       else  
 #endif  /* SUPPORT_UTF || COMPILE_PCRE16 */  
   
       /* Handle a single-byte character */  
         {  
         class_has_8bitchar = 1;  
         classbits[c/8] |= (1 << (c&7));  
         if ((options & PCRE_CASELESS) != 0)  
           {  
           c = cd->fcc[c]; /* flip case */  
           classbits[c/8] |= (1 << (c&7));  
           }  
         }  
4511        }        }
4512    
4513      /* Loop until ']' reached. This "while" is the end of the "do" far above.      /* Loop until ']' reached. This "while" is the end of the "do" far above.
# Line 4621  for (;; ptr++) Line 4527  for (;; ptr++)
4527        goto FAILED;        goto FAILED;
4528        }        }
4529    
4530        /* We will need an XCLASS if data has been placed in class_uchardata. In
4531        the second phase this is a sufficient test. However, in the pre-compile
4532        phase, class_uchardata gets emptied to prevent workspace overflow, so
4533        only if the very last character in the class needs XCLASS will it contain
4534        anything at this point. For this reason, xclass gets set TRUE above when
4535        class_uchardata is emptied, and that's why this code is the way it is here
4536        instead of just doing a test on class_uchardata below. */
4537    
4538    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4539        if (class_uchardata > class_uchardata_base) xclass = TRUE;
4540    #endif
4541    
4542      /* If this is the first thing in the branch, there can be no first char      /* If this is the first thing in the branch, there can be no first char
4543      setting, whatever the repeat count. Any reqchar setting must remain      setting, whatever the repeat count. Any reqchar setting must remain
4544      unchanged after any kind of repeat. */      unchanged after any kind of repeat. */

Legend:
Removed from v.1041  
changed lines
  Added in v.1045

  ViewVC Help
Powered by ViewVC 1.1.5