/[pcre]/code/branches/pcre16/pcre_compile.c
ViewVC logotype

Diff of /code/branches/pcre16/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 773 by zherczeg, Mon Nov 28 20:39:30 2011 UTC revision 774 by zherczeg, Thu Dec 1 06:08:45 2011 UTC
# Line 97  overrun before it actually does run off Line 97  overrun before it actually does run off
97    
98  #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)  #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
99    
100    /* Private flags added to firstchar and reqchar. */
101    
102    #define REQ_CASELESS   0x10000000l      /* Indicates caselessness */
103    #define REQ_VARY       0x20000000l      /* Reqchar followed non-literal item */
104    
105  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
106  are simple data values; negative values are for special things like \d and so  are simple data values; negative values are for special things like \d and so
# Line 484  For convenience, we use the same bit def Line 488  For convenience, we use the same bit def
488    
489  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
490    
491    /* Using a simple comparison for decimal numbers rather than a memory read
492    is much faster, and the resulting code is simpler (the compiler turns it
493    into a subtraction and unsigned comparison). */
494    
495    #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
496    
497  #ifndef EBCDIC  #ifndef EBCDIC
498    
499  /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in  /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
500  UTF-8 mode. */  UTF-8 mode. */
501    
502  static const unsigned char digitab[] =  static const pcre_uint8 digitab[] =
503    {    {
504    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
505    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
# Line 528  static const unsigned char digitab[] = Line 538  static const unsigned char digitab[] =
538    
539  /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */  /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
540    
541  static const unsigned char digitab[] =  static const pcre_uint8 digitab[] =
542    {    {
543    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
544    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
# Line 563  static const unsigned char digitab[] = Line 573  static const unsigned char digitab[] =
573    0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */    0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
574    0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */    0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
575    
576  static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */  static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
577    0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */    0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
578    0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */    0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
579    0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */    0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
# Line 651  Returns:    TRUE or FALSE Line 661  Returns:    TRUE or FALSE
661  static BOOL  static BOOL
662  is_counted_repeat(const pcre_uchar *p)  is_counted_repeat(const pcre_uchar *p)
663  {  {
664  if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  if (!IS_DIGIT(*p)) return FALSE;
665  while ((digitab[*p] & ctype_digit) != 0) p++;  p++;
666    while (IS_DIGIT(*p)) p++;
667  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
668    
669  if (*p++ != CHAR_COMMA) return FALSE;  if (*p++ != CHAR_COMMA) return FALSE;
670  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
671    
672  if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  if (!IS_DIGIT(*p)) return FALSE;
673  while ((digitab[*p] & ctype_digit) != 0) p++;  p++;
674    while (IS_DIGIT(*p)) p++;
675    
676  return (*p == CHAR_RIGHT_CURLY_BRACKET);  return (*p == CHAR_RIGHT_CURLY_BRACKET);
677  }  }
# Line 710  in a table. A non-zero result is somethi Line 722  in a table. A non-zero result is somethi
722  Otherwise further processing may be required. */  Otherwise further processing may be required. */
723    
724  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
725  else if (c < CHAR_0 || c > CHAR_z) {}                     /* Not alphanumeric */  /* Not alphanumeric */
726    else if (c < CHAR_0 || c > CHAR_z) {}
727  else if ((i = escapes[c - CHAR_0]) != 0) c = i;  else if ((i = escapes[c - CHAR_0]) != 0) c = i;
728    
729  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
730  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */  /* Not alphanumeric */
731    else if (c < 'a' || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
732  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
733  #endif  #endif
734    
# Line 740  else Line 754  else
754        {        {
755        /* In JavaScript, \u must be followed by four hexadecimal numbers.        /* In JavaScript, \u must be followed by four hexadecimal numbers.
756        Otherwise it is a lowercase u letter. */        Otherwise it is a lowercase u letter. */
757        if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0        if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
758             && (digitab[ptr[3]] & ctype_xdigit) != 0 && (digitab[ptr[4]] & ctype_xdigit) != 0)          && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
759            && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
760            && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
761          {          {
762          c = 0;          c = 0;
763          for (i = 0; i < 4; ++i)          for (i = 0; i < 4; ++i)
# Line 797  else Line 813  else
813        {        {
814        const pcre_uchar *p;        const pcre_uchar *p;
815        for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)        for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
816          if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;          if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
817        if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)        if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
818          {          {
819          c = -ESC_k;          c = -ESC_k;
# Line 815  else Line 831  else
831        }        }
832      else negated = FALSE;      else negated = FALSE;
833    
834        /* The integer range is limited by the machine's int representation. */
835      c = 0;      c = 0;
836      while ((digitab[ptr[1]] & ctype_digit) != 0)      while (IS_DIGIT(ptr[1]))
837          {
838          if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
839            {
840            c = -1;
841            break;
842            }
843        c = c * 10 + *(++ptr) - CHAR_0;        c = c * 10 + *(++ptr) - CHAR_0;
844          }
845      if (c < 0)   /* Integer overflow */      if (((unsigned int)c) > INT_MAX) /* Integer overflow */
846        {        {
847          while (IS_DIGIT(ptr[1]))
848            ptr++;
849        *errorcodeptr = ERR61;        *errorcodeptr = ERR61;
850        break;        break;
851        }        }
# Line 868  else Line 893  else
893      if (!isclass)      if (!isclass)
894        {        {
895        oldptr = ptr;        oldptr = ptr;
896          /* The integer range is limited by the machine's int representation. */
897        c -= CHAR_0;        c -= CHAR_0;
898        while ((digitab[ptr[1]] & ctype_digit) != 0)        while (IS_DIGIT(ptr[1]))
899            {
900            if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
901              {
902              c = -1;
903              break;
904              }
905          c = c * 10 + *(++ptr) - CHAR_0;          c = c * 10 + *(++ptr) - CHAR_0;
906        if (c < 0)    /* Integer overflow */          }
907          if (((unsigned int)c) > INT_MAX) /* Integer overflow */
908          {          {
909            while (IS_DIGIT(ptr[1]))
910              ptr++;
911          *errorcodeptr = ERR61;          *errorcodeptr = ERR61;
912          break;          break;
913          }          }
# Line 905  else Line 940  else
940      c -= CHAR_0;      c -= CHAR_0;
941      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
942          c = c * 8 + *(++ptr) - CHAR_0;          c = c * 8 + *(++ptr) - CHAR_0;
943      if (!utf8 && c > 255) *errorcodeptr = ERR51;      if (!utf8 && c > 0xff) *errorcodeptr = ERR51;
944      break;      break;
945    
946      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \x is complicated. \x{ddd} is a character number which can be greater
# Line 917  else Line 952  else
952        {        {
953        /* In JavaScript, \x must be followed by two hexadecimal numbers.        /* In JavaScript, \x must be followed by two hexadecimal numbers.
954        Otherwise it is a lowercase x letter. */        Otherwise it is a lowercase x letter. */
955        if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0)        if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
956            && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
957          {          {
958          c = 0;          c = 0;
959          for (i = 0; i < 2; ++i)          for (i = 0; i < 2; ++i)
# Line 941  else Line 977  else
977        int count = 0;        int count = 0;
978    
979        c = 0;        c = 0;
980        while ((digitab[*pt] & ctype_xdigit) != 0)        while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)
981          {          {
982          register int cc = *pt++;          register int cc = *pt++;
983          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
# Line 958  else Line 994  else
994    
995        if (*pt == CHAR_RIGHT_CURLY_BRACKET)        if (*pt == CHAR_RIGHT_CURLY_BRACKET)
996          {          {
997          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;  #ifdef COMPILE_PCRE8
998            if (c < 0 || count > (utf8? 8:2)) *errorcodeptr = ERR34;
999    #else
1000    #ifdef COMPILE_PCRE16
1001            if (c < 0 || count > (utf8? 8:4)) *errorcodeptr = ERR34;
1002    #endif
1003    #endif
1004          ptr = pt;          ptr = pt;
1005          break;          break;
1006          }          }
# Line 970  else Line 1012  else
1012      /* Read just a single-byte hex-defined char */      /* Read just a single-byte hex-defined char */
1013    
1014      c = 0;      c = 0;
1015      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1016        {        {
1017        int cc;                                  /* Some compilers don't like */        int cc;                                  /* Some compilers don't like */
1018        cc = *(++ptr);                           /* ++ in initializers */        cc = *(++ptr);                           /* ++ in initializers */
# Line 1169  int max = -1; Line 1211  int max = -1;
1211  /* Read the minimum value and do a paranoid check: a negative value indicates  /* Read the minimum value and do a paranoid check: a negative value indicates
1212  an integer overflow. */  an integer overflow. */
1213    
1214  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;  while (IS_DIGIT(*p)) min = min * 10 + *p++ - CHAR_0;
1215  if (min < 0 || min > 65535)  if (min < 0 || min > 65535)
1216    {    {
1217    *errorcodeptr = ERR5;    *errorcodeptr = ERR5;
# Line 1184  if (*p == CHAR_RIGHT_CURLY_BRACKET) max Line 1226  if (*p == CHAR_RIGHT_CURLY_BRACKET) max
1226    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1227      {      {
1228      max = 0;      max = 0;
1229      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;      while(IS_DIGIT(*p)) max = max * 10 + *p++ - CHAR_0;
1230      if (max < 0 || max > 65535)      if (max < 0 || max > 65535)
1231        {        {
1232        *errorcodeptr = ERR5;        *errorcodeptr = ERR5;
# Line 3258  Arguments: Line 3300  Arguments:
3300    codeptr        points to the pointer to the current code point    codeptr        points to the pointer to the current code point
3301    ptrptr         points to the current pattern pointer    ptrptr         points to the current pattern pointer
3302    errorcodeptr   points to error code variable    errorcodeptr   points to error code variable
3303    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    firstcharptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
3304    reqbyteptr     set to the last literal character required, else < 0    reqcharptr     set to the last literal character required, else < 0
3305    bcptr          points to current branch chain    bcptr          points to current branch chain
3306    cond_depth     conditional nesting depth    cond_depth     conditional nesting depth
3307    cd             contains pointers to tables etc.    cd             contains pointers to tables etc.
# Line 3272  Returns:         TRUE on success Line 3314  Returns:         TRUE on success
3314    
3315  static BOOL  static BOOL
3316  compile_branch(int *optionsptr, pcre_uchar **codeptr,  compile_branch(int *optionsptr, pcre_uchar **codeptr,
3317    const pcre_uchar **ptrptr, int *errorcodeptr, int *firstbyteptr,    const pcre_uchar **ptrptr, int *errorcodeptr, pcre_int32 *firstcharptr,
3318    int *reqbyteptr, branch_chain *bcptr, int cond_depth, compile_data *cd,    pcre_int32 *reqcharptr, branch_chain *bcptr, int cond_depth,
3319    int *lengthptr)    compile_data *cd, int *lengthptr)
3320  {  {
3321  int repeat_type, op_type;  int repeat_type, op_type;
3322  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
3323  int bravalue = 0;  int bravalue = 0;
3324  int greedy_default, greedy_non_default;  int greedy_default, greedy_non_default;
3325  int firstbyte, reqbyte;  pcre_int32 firstchar, reqchar;
3326  int zeroreqbyte, zerofirstbyte;  pcre_int32 zeroreqchar, zerofirstchar;
3327  int req_caseopt, reqvary, tempreqvary;  pcre_int32 req_caseopt, reqvary, tempreqvary;
3328  int options = *optionsptr;               /* May change dynamically */  int options = *optionsptr;               /* May change dynamically */
3329  int after_manual_callout = 0;  int after_manual_callout = 0;
3330  int length_prevgroup = 0;  int length_prevgroup = 0;
# Line 3292  pcre_uchar *last_code = code; Line 3334  pcre_uchar *last_code = code;
3334  pcre_uchar *orig_code = code;  pcre_uchar *orig_code = code;
3335  pcre_uchar *tempcode;  pcre_uchar *tempcode;
3336  BOOL inescq = FALSE;  BOOL inescq = FALSE;
3337  BOOL groupsetfirstbyte = FALSE;  BOOL groupsetfirstchar = FALSE;
3338  const pcre_uchar *ptr = *ptrptr;  const pcre_uchar *ptr = *ptrptr;
3339  const pcre_uchar *tempptr;  const pcre_uchar *tempptr;
3340  const pcre_uchar *nestptr = NULL;  const pcre_uchar *nestptr = NULL;
# Line 3331  greedy_non_default = greedy_default ^ 1; Line 3373  greedy_non_default = greedy_default ^ 1;
3373    
3374  /* Initialize no first byte, no required byte. REQ_UNSET means "no char  /* Initialize no first byte, no required byte. REQ_UNSET means "no char
3375  matching encountered yet". It gets changed to REQ_NONE if we hit something that  matching encountered yet". It gets changed to REQ_NONE if we hit something that
3376  matches a non-fixed char first char; reqbyte just remains unset if we never  matches a non-fixed char first char; reqchar just remains unset if we never
3377  find one.  find one.
3378    
3379  When we hit a repeat whose minimum is zero, we may have to adjust these values  When we hit a repeat whose minimum is zero, we may have to adjust these values
3380  to take the zero repeat into account. This is implemented by setting them to  to take the zero repeat into account. This is implemented by setting them to
3381  zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual  zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
3382  item types that can be repeated set these backoff variables appropriately. */  item types that can be repeated set these backoff variables appropriately. */
3383    
3384  firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;  firstchar = reqchar = zerofirstchar = zeroreqchar = REQ_UNSET;
3385    
3386  /* The variable req_caseopt contains either the REQ_CASELESS value or zero,  /* The variable req_caseopt contains either the REQ_CASELESS value
3387  according to the current setting of the caseless flag. REQ_CASELESS is a bit  or zero, according to the current setting of the caseless flag. The
3388  value > 255. It is added into the firstbyte or reqbyte variables to record the  REQ_CASELESS leaves the lower 28 bits empty. It is added into the
3389  case status of the value. This is used only for ASCII characters. */  firstchar or reqchar variables to record the case status of the
3390    value. This is used only for ASCII characters. */
3391    
3392  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
3393    
3394  /* Switch on next character until the end of the branch */  /* Switch on next character until the end of the branch */
3395    
# Line 3364  for (;; ptr++) Line 3407  for (;; ptr++)
3407    int recno;    int recno;
3408    int refsign;    int refsign;
3409    int skipbytes;    int skipbytes;
3410    int subreqbyte;    int subreqchar;
3411    int subfirstbyte;    int subfirstchar;
3412    int terminator;    int terminator;
3413    int mclength;    int mclength;
3414    int tempbracount;    int tempbracount;
# Line 3528  for (;; ptr++) Line 3571  for (;; ptr++)
3571      case 0:                        /* The branch terminates at string end */      case 0:                        /* The branch terminates at string end */
3572      case CHAR_VERTICAL_LINE:       /* or | or ) */      case CHAR_VERTICAL_LINE:       /* or | or ) */
3573      case CHAR_RIGHT_PARENTHESIS:      case CHAR_RIGHT_PARENTHESIS:
3574      *firstbyteptr = firstbyte;      *firstcharptr = firstchar;
3575      *reqbyteptr = reqbyte;      *reqcharptr = reqchar;
3576      *codeptr = code;      *codeptr = code;
3577      *ptrptr = ptr;      *ptrptr = ptr;
3578      if (lengthptr != NULL)      if (lengthptr != NULL)
# Line 3553  for (;; ptr++) Line 3596  for (;; ptr++)
3596      previous = NULL;      previous = NULL;
3597      if ((options & PCRE_MULTILINE) != 0)      if ((options & PCRE_MULTILINE) != 0)
3598        {        {
3599        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3600        *code++ = OP_CIRCM;        *code++ = OP_CIRCM;
3601        }        }
3602      else *code++ = OP_CIRC;      else *code++ = OP_CIRC;
# Line 3565  for (;; ptr++) Line 3608  for (;; ptr++)
3608      break;      break;
3609    
3610      /* There can never be a first char if '.' is first, whatever happens about      /* There can never be a first char if '.' is first, whatever happens about
3611      repeats. The value of reqbyte doesn't change either. */      repeats. The value of reqchar doesn't change either. */
3612    
3613      case CHAR_DOT:      case CHAR_DOT:
3614      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;      if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3615      zerofirstbyte = firstbyte;      zerofirstchar = firstchar;
3616      zeroreqbyte = reqbyte;      zeroreqchar = reqchar;
3617      previous = code;      previous = code;
3618      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3619      break;      break;
# Line 3644  for (;; ptr++) Line 3687  for (;; ptr++)
3687          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3688        {        {
3689        *code++ = negate_class? OP_ALLANY : OP_FAIL;        *code++ = negate_class? OP_ALLANY : OP_FAIL;
3690        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3691        zerofirstbyte = firstbyte;        zerofirstchar = firstchar;
3692        break;        break;
3693        }        }
3694    
# Line 4335  for (;; ptr++) Line 4378  for (;; ptr++)
4378      The optimization throws away the bit map. We turn the item into a      The optimization throws away the bit map. We turn the item into a
4379      1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.      1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
4380      Note that OP_NOT[I] does not support multibyte characters. In the positive      Note that OP_NOT[I] does not support multibyte characters. In the positive
4381      case, it can cause firstbyte to be set. Otherwise, there can be no first      case, it can cause firstchar to be set. Otherwise, there can be no first
4382      char if this item is first, whatever repeat count may follow. In the case      char if this item is first, whatever repeat count may follow. In the case
4383      of reqbyte, save the previous value for reinstating. */      of reqchar, save the previous value for reinstating. */
4384    
4385  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
4386      if (class_charcount == 1 && !xclass &&      if (class_charcount == 1 && !xclass &&
# Line 4348  for (;; ptr++) Line 4391  for (;; ptr++)
4391      if (class_charcount == 1 && !xclass)      if (class_charcount == 1 && !xclass)
4392  #endif  #endif
4393        {        {
4394        zeroreqbyte = reqbyte;        zeroreqchar = reqchar;
4395    
4396        /* The OP_NOT[I] opcodes work on one-byte characters only. */        /* The OP_NOT[I] opcodes work on one-byte characters only. */
4397    
4398        if (negate_class)        if (negate_class)
4399          {          {
4400          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;          if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4401          zerofirstbyte = firstbyte;          zerofirstchar = firstchar;
4402          *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;          *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4403          *code++ = class_lastchar;          *code++ = class_lastchar;
4404          break;          break;
# Line 4378  for (;; ptr++) Line 4421  for (;; ptr++)
4421    
4422      /* The general case - not the one-char optimization. If this is the first      /* The general case - not the one-char optimization. If this is the first
4423      thing in the branch, there can be no first char setting, whatever the      thing in the branch, there can be no first char setting, whatever the
4424      repeat count. Any reqbyte setting must remain unchanged after any kind of      repeat count. Any reqchar setting must remain unchanged after any kind of
4425      repeat. */      repeat. */
4426    
4427      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;      if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4428      zerofirstbyte = firstbyte;      zerofirstchar = firstchar;
4429      zeroreqbyte = reqbyte;      zeroreqchar = reqchar;
4430    
4431      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
4432      extended class, with its own opcode, unless there was a negated special      extended class, with its own opcode, unless there was a negated special
# Line 4476  for (;; ptr++) Line 4519  for (;; ptr++)
4519    
4520      if (repeat_min == 0)      if (repeat_min == 0)
4521        {        {
4522        firstbyte = zerofirstbyte;    /* Adjust for zero repeat */        firstchar = zerofirstchar;    /* Adjust for zero repeat */
4523        reqbyte = zeroreqbyte;        /* Ditto */        reqchar = zeroreqchar;        /* Ditto */
4524        }        }
4525    
4526      /* Remember whether this is a variable length repeat */      /* Remember whether this is a variable length repeat */
# Line 4542  for (;; ptr++) Line 4585  for (;; ptr++)
4585    
4586      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
4587      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
4588      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqchar - it might not be if a sequence such as x{3} is
4589      the first thing in a branch because the x will have gone into firstbyte      the first thing in a branch because the x will have gone into firstchar
4590      instead.  */      instead.  */
4591    
4592      if (*previous == OP_CHAR || *previous == OP_CHARI)      if (*previous == OP_CHAR || *previous == OP_CHARI)
# Line 4572  for (;; ptr++) Line 4615  for (;; ptr++)
4615    
4616          {          {
4617          c = code[-1];          c = code[-1];
4618          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqchar = c | req_caseopt | cd->req_varyopt;
4619          }          }
4620    
4621        /* If the repetition is unlimited, it pays to see if the next thing on        /* If the repetition is unlimited, it pays to see if the next thing on
# Line 4971  for (;; ptr++) Line 5014  for (;; ptr++)
5014    
5015            else            else
5016              {              {
5017              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;              if (groupsetfirstchar && reqchar < 0) reqchar = firstchar;
5018              for (i = 1; i < repeat_min; i++)              for (i = 1; i < repeat_min; i++)
5019                {                {
5020                pcre_uchar *hc;                pcre_uchar *hc;
# Line 5274  for (;; ptr++) Line 5317  for (;; ptr++)
5317        }        }
5318    
5319      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
5320      "follows varying string" flag for subsequently encountered reqbytes if      "follows varying string" flag for subsequently encountered reqchars if
5321      it isn't already set and we have just passed a varying length item. */      it isn't already set and we have just passed a varying length item. */
5322    
5323      END_REPEAT:      END_REPEAT:
# Line 5352  for (;; ptr++) Line 5395  for (;; ptr++)
5395                }                }
5396              *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;              *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
5397    
5398              /* Do not set firstbyte after *ACCEPT */              /* Do not set firstchar after *ACCEPT */
5399              if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;              if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
5400              }              }
5401    
5402            /* Handle other cases with/without an argument */            /* Handle other cases with/without an argument */
# Line 5506  for (;; ptr++) Line 5549  for (;; ptr++)
5549          while ((cd->ctypes[*ptr] & ctype_word) != 0)          while ((cd->ctypes[*ptr] & ctype_word) != 0)
5550            {            {
5551            if (recno >= 0)            if (recno >= 0)
5552              recno = ((digitab[*ptr] & ctype_digit) != 0)?              recno = (IS_DIGIT(*ptr))? recno * 10 + *ptr - CHAR_0 : -1;
               recno * 10 + *ptr - CHAR_0 : -1;  
5553            ptr++;            ptr++;
5554            }            }
5555          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
# Line 5597  for (;; ptr++) Line 5639  for (;; ptr++)
5639            recno = 0;            recno = 0;
5640            for (i = 1; i < namelen; i++)            for (i = 1; i < namelen; i++)
5641              {              {
5642              if ((digitab[name[i]] & ctype_digit) == 0)              if (!IS_DIGIT(name[i]))
5643                {                {
5644                *errorcodeptr = ERR15;                *errorcodeptr = ERR15;
5645                goto FAILED;                goto FAILED;
# Line 5697  for (;; ptr++) Line 5739  for (;; ptr++)
5739          *code++ = OP_CALLOUT;          *code++ = OP_CALLOUT;
5740            {            {
5741            int n = 0;            int n = 0;
5742            while ((digitab[*(++ptr)] & ctype_digit) != 0)            ptr++;
5743              n = n * 10 + *ptr - CHAR_0;            while(IS_DIGIT(*ptr))
5744                n = n * 10 + *ptr++ - CHAR_0;
5745            if (*ptr != CHAR_RIGHT_PARENTHESIS)            if (*ptr != CHAR_RIGHT_PARENTHESIS)
5746              {              {
5747              *errorcodeptr = ERR39;              *errorcodeptr = ERR39;
# Line 5981  for (;; ptr++) Line 6024  for (;; ptr++)
6024            if ((refsign = *ptr) == CHAR_PLUS)            if ((refsign = *ptr) == CHAR_PLUS)
6025              {              {
6026              ptr++;              ptr++;
6027              if ((digitab[*ptr] & ctype_digit) == 0)              if (!IS_DIGIT(*ptr))
6028                {                {
6029                *errorcodeptr = ERR63;                *errorcodeptr = ERR63;
6030                goto FAILED;                goto FAILED;
# Line 5989  for (;; ptr++) Line 6032  for (;; ptr++)
6032              }              }
6033            else if (refsign == CHAR_MINUS)            else if (refsign == CHAR_MINUS)
6034              {              {
6035              if ((digitab[ptr[1]] & ctype_digit) == 0)              if (!IS_DIGIT(ptr[1]))
6036                goto OTHER_CHAR_AFTER_QUERY;                goto OTHER_CHAR_AFTER_QUERY;
6037              ptr++;              ptr++;
6038              }              }
6039    
6040            recno = 0;            recno = 0;
6041            while((digitab[*ptr] & ctype_digit) != 0)            while(IS_DIGIT(*ptr))
6042              recno = recno * 10 + *ptr++ - CHAR_0;              recno = recno * 10 + *ptr++ - CHAR_0;
6043    
6044            if (*ptr != terminator)            if (*ptr != terminator)
# Line 6093  for (;; ptr++) Line 6136  for (;; ptr++)
6136    
6137          /* Can't determine a first byte now */          /* Can't determine a first byte now */
6138    
6139          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;          if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
6140          continue;          continue;
6141    
6142    
# Line 6150  for (;; ptr++) Line 6193  for (;; ptr++)
6193          both phases.          both phases.
6194    
6195          If we are not at the pattern start, reset the greedy defaults and the          If we are not at the pattern start, reset the greedy defaults and the
6196          case value for firstbyte and reqbyte. */          case value for firstchar and reqchar. */
6197    
6198          if (*ptr == CHAR_RIGHT_PARENTHESIS)          if (*ptr == CHAR_RIGHT_PARENTHESIS)
6199            {            {
# Line 6163  for (;; ptr++) Line 6206  for (;; ptr++)
6206              {              {
6207              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
6208              greedy_non_default = greedy_default ^ 1;              greedy_non_default = greedy_default ^ 1;
6209              req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;              req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
6210              }              }
6211    
6212            /* Change options at this level, and pass them back for use            /* Change options at this level, and pass them back for use
# Line 6226  for (;; ptr++) Line 6269  for (;; ptr++)
6269           skipbytes,                       /* Skip over bracket number */           skipbytes,                       /* Skip over bracket number */
6270           cond_depth +           cond_depth +
6271             ((bravalue == OP_COND)?1:0),   /* Depth of condition subpatterns */             ((bravalue == OP_COND)?1:0),   /* Depth of condition subpatterns */
6272           &subfirstbyte,                   /* For possible first char */           &subfirstchar,                   /* For possible first char */
6273           &subreqbyte,                     /* For possible last char */           &subreqchar,                     /* For possible last char */
6274           bcptr,                           /* Current branch chain */           bcptr,                           /* Current branch chain */
6275           cd,                              /* Tables block */           cd,                              /* Tables block */
6276           (lengthptr == NULL)? NULL :      /* Actual compile phase */           (lengthptr == NULL)? NULL :      /* Actual compile phase */
# Line 6278  for (;; ptr++) Line 6321  for (;; ptr++)
6321          }          }
6322    
6323        /* A "normal" conditional group. If there is just one branch, we must not        /* A "normal" conditional group. If there is just one branch, we must not
6324        make use of its firstbyte or reqbyte, because this is equivalent to an        make use of its firstchar or reqchar, because this is equivalent to an
6325        empty second branch. */        empty second branch. */
6326    
6327        else        else
# Line 6288  for (;; ptr++) Line 6331  for (;; ptr++)
6331            *errorcodeptr = ERR27;            *errorcodeptr = ERR27;
6332            goto FAILED;            goto FAILED;
6333            }            }
6334          if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;          if (condcount == 1) subfirstchar = subreqchar = REQ_NONE;
6335          }          }
6336        }        }
6337    
# Line 6332  for (;; ptr++) Line 6375  for (;; ptr++)
6375      /* Handle updating of the required and first characters for other types of      /* Handle updating of the required and first characters for other types of
6376      group. Update for normal brackets of all kinds, and conditions with two      group. Update for normal brackets of all kinds, and conditions with two
6377      branches (see code above). If the bracket is followed by a quantifier with      branches (see code above). If the bracket is followed by a quantifier with
6378      zero repeat, we have to back off. Hence the definition of zeroreqbyte and      zero repeat, we have to back off. Hence the definition of zeroreqchar and
6379      zerofirstbyte outside the main loop so that they can be accessed for the      zerofirstchar outside the main loop so that they can be accessed for the
6380      back off. */      back off. */
6381    
6382      zeroreqbyte = reqbyte;      zeroreqchar = reqchar;
6383      zerofirstbyte = firstbyte;      zerofirstchar = firstchar;
6384      groupsetfirstbyte = FALSE;      groupsetfirstchar = FALSE;
6385    
6386      if (bravalue >= OP_ONCE)      if (bravalue >= OP_ONCE)
6387        {        {
6388        /* If we have not yet set a firstbyte in this branch, take it from the        /* If we have not yet set a firstchar in this branch, take it from the
6389        subpattern, remembering that it was set here so that a repeat of more        subpattern, remembering that it was set here so that a repeat of more
6390        than one can replicate it as reqbyte if necessary. If the subpattern has        than one can replicate it as reqchar if necessary. If the subpattern has
6391        no firstbyte, set "none" for the whole branch. In both cases, a zero        no firstchar, set "none" for the whole branch. In both cases, a zero
6392        repeat forces firstbyte to "none". */        repeat forces firstchar to "none". */
6393    
6394        if (firstbyte == REQ_UNSET)        if (firstchar == REQ_UNSET)
6395          {          {
6396          if (subfirstbyte >= 0)          if (subfirstchar >= 0)
6397            {            {
6398            firstbyte = subfirstbyte;            firstchar = subfirstchar;
6399            groupsetfirstbyte = TRUE;            groupsetfirstchar = TRUE;
6400            }            }
6401          else firstbyte = REQ_NONE;          else firstchar = REQ_NONE;
6402          zerofirstbyte = REQ_NONE;          zerofirstchar = REQ_NONE;
6403          }          }
6404    
6405        /* If firstbyte was previously set, convert the subpattern's firstbyte        /* If firstchar was previously set, convert the subpattern's firstchar
6406        into reqbyte if there wasn't one, using the vary flag that was in        into reqchar if there wasn't one, using the vary flag that was in
6407        existence beforehand. */        existence beforehand. */
6408    
6409        else if (subfirstbyte >= 0 && subreqbyte < 0)        else if (subfirstchar >= 0 && subreqchar < 0)
6410          subreqbyte = subfirstbyte | tempreqvary;          subreqchar = subfirstchar | tempreqvary;
6411    
6412        /* If the subpattern set a required byte (or set a first byte that isn't        /* If the subpattern set a required byte (or set a first byte that isn't
6413        really the first byte - see above), set it. */        really the first byte - see above), set it. */
6414    
6415        if (subreqbyte >= 0) reqbyte = subreqbyte;        if (subreqchar >= 0) reqchar = subreqchar;
6416        }        }
6417    
6418      /* For a forward assertion, we take the reqbyte, if set. This can be      /* For a forward assertion, we take the reqchar, if set. This can be
6419      helpful if the pattern that follows the assertion doesn't set a different      helpful if the pattern that follows the assertion doesn't set a different
6420      char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte      char. For example, it's useful for /(?=abcde).+/. We can't set firstchar
6421      for an assertion, however because it leads to incorrect effect for patterns      for an assertion, however because it leads to incorrect effect for patterns
6422      such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead      such as /(?=a)a.+/ when the "real" "a" would then become a reqchar instead
6423      of a firstbyte. This is overcome by a scan at the end if there's no      of a firstchar. This is overcome by a scan at the end if there's no
6424      firstbyte, looking for an asserted first char. */      firstchar, looking for an asserted first char. */
6425    
6426      else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;      else if (bravalue == OP_ASSERT && subreqchar >= 0) reqchar = subreqchar;
6427      break;     /* End of processing '(' */      break;     /* End of processing '(' */
6428    
6429    
# Line 6413  for (;; ptr++) Line 6456  for (;; ptr++)
6456        /* For metasequences that actually match a character, we disable the        /* For metasequences that actually match a character, we disable the
6457        setting of a first character if it hasn't already been set. */        setting of a first character if it hasn't already been set. */
6458    
6459        if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)        if (firstchar == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
6460          firstbyte = REQ_NONE;          firstchar = REQ_NONE;
6461    
6462        /* Set values to reset to if this is followed by a zero repeat. */        /* Set values to reset to if this is followed by a zero repeat. */
6463    
6464        zerofirstbyte = firstbyte;        zerofirstchar = firstchar;
6465        zeroreqbyte = reqbyte;        zeroreqchar = reqchar;
6466    
6467        /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'        /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
6468        is a subroutine call by number (Oniguruma syntax). In fact, the value        is a subroutine call by number (Oniguruma syntax). In fact, the value
# Line 6470  for (;; ptr++) Line 6513  for (;; ptr++)
6513          /* Test a signed number in angle brackets or quotes. */          /* Test a signed number in angle brackets or quotes. */
6514    
6515          p = ptr + 2;          p = ptr + 2;
6516          while ((digitab[*p] & ctype_digit) != 0) p++;          while (IS_DIGIT(*p)) p++;
6517          if (*p != terminator)          if (*p != terminator)
6518            {            {
6519            *errorcodeptr = ERR57;            *errorcodeptr = ERR57;
# Line 6498  for (;; ptr++) Line 6541  for (;; ptr++)
6541          goto NAMED_REF_OR_RECURSE;          goto NAMED_REF_OR_RECURSE;
6542          }          }
6543    
6544        /* Back references are handled specially; must disable firstbyte if        /* Back references are handled specially; must disable firstchar if
6545        not set to cope with cases like (?=(\w+))\1: which would otherwise set        not set to cope with cases like (?=(\w+))\1: which would otherwise set
6546        ':' later. */        ':' later. */
6547    
# Line 6508  for (;; ptr++) Line 6551  for (;; ptr++)
6551          recno = -c - ESC_REF;          recno = -c - ESC_REF;
6552    
6553          HANDLE_REFERENCE:    /* Come here from named backref handling */          HANDLE_REFERENCE:    /* Come here from named backref handling */
6554          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;          if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
6555          previous = code;          previous = code;
6556          *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;          *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
6557          PUT2INC(code, 0, recno);          PUT2INC(code, 0, recno);
# Line 6631  for (;; ptr++) Line 6674  for (;; ptr++)
6674    
6675      /* Set the first and required bytes appropriately. If no previous first      /* Set the first and required bytes appropriately. If no previous first
6676      byte, set it from this character, but revert to none on a zero repeat.      byte, set it from this character, but revert to none on a zero repeat.
6677      Otherwise, leave the firstbyte value alone, and don't change it on a zero      Otherwise, leave the firstchar value alone, and don't change it on a zero
6678      repeat. */      repeat. */
6679    
6680      if (firstbyte == REQ_UNSET)      if (firstchar == REQ_UNSET)
6681        {        {
6682        zerofirstbyte = REQ_NONE;        zerofirstchar = REQ_NONE;
6683        zeroreqbyte = reqbyte;        zeroreqchar = reqchar;
6684    
6685        /* If the character is more than one byte long, we can set firstbyte        /* If the character is more than one byte long, we can set firstchar
6686        only if it is not to be matched caselessly. */        only if it is not to be matched caselessly. */
6687    
6688        if (mclength == 1 || req_caseopt == 0)        if (mclength == 1 || req_caseopt == 0)
6689          {          {
6690          firstbyte = mcbuffer[0] | req_caseopt;          firstchar = mcbuffer[0] | req_caseopt;
6691          if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;          if (mclength != 1) reqchar = code[-1] | cd->req_varyopt;
6692          }          }
6693        else firstbyte = reqbyte = REQ_NONE;        else firstchar = reqchar = REQ_NONE;
6694        }        }
6695    
6696      /* firstbyte was previously set; we can set reqbyte only if the length is      /* firstchar was previously set; we can set reqchar only if the length is
6697      1 or the matching is caseful. */      1 or the matching is caseful. */
6698    
6699      else      else
6700        {        {
6701        zerofirstbyte = firstbyte;        zerofirstchar = firstchar;
6702        zeroreqbyte = reqbyte;        zeroreqchar = reqchar;
6703        if (mclength == 1 || req_caseopt == 0)        if (mclength == 1 || req_caseopt == 0)
6704          reqbyte = code[-1] | req_caseopt | cd->req_varyopt;          reqchar = code[-1] | req_caseopt | cd->req_varyopt;
6705        }        }
6706    
6707      break;            /* End of literal character handling */      break;            /* End of literal character handling */
# Line 6698  Arguments: Line 6741  Arguments:
6741    reset_bracount TRUE to reset the count for each branch    reset_bracount TRUE to reset the count for each branch
6742    skipbytes      skip this many bytes at start (for brackets and OP_COND)    skipbytes      skip this many bytes at start (for brackets and OP_COND)
6743    cond_depth     depth of nesting for conditional subpatterns    cond_depth     depth of nesting for conditional subpatterns
6744    firstbyteptr   place to put the first required character, or a negative number    firstcharptr   place to put the first required character, or a negative number
6745    reqbyteptr     place to put the last required character, or a negative number    reqcharptr     place to put the last required character, or a negative number
6746    bcptr          pointer to the chain of currently open branches    bcptr          pointer to the chain of currently open branches
6747    cd             points to the data block with tables pointers etc.    cd             points to the data block with tables pointers etc.
6748    lengthptr      NULL during the real compile phase    lengthptr      NULL during the real compile phase
# Line 6711  Returns:         TRUE on success Line 6754  Returns:         TRUE on success
6754  static BOOL  static BOOL
6755  compile_regex(int options, pcre_uchar **codeptr, const pcre_uchar **ptrptr,  compile_regex(int options, pcre_uchar **codeptr, const pcre_uchar **ptrptr,
6756    int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,    int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
6757    int cond_depth, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,    int cond_depth, pcre_int32 *firstcharptr, pcre_int32 *reqcharptr,
6758    compile_data *cd, int *lengthptr)    branch_chain *bcptr, compile_data *cd, int *lengthptr)
6759  {  {
6760  const pcre_uchar *ptr = *ptrptr;  const pcre_uchar *ptr = *ptrptr;
6761  pcre_uchar *code = *codeptr;  pcre_uchar *code = *codeptr;
# Line 6721  pcre_uchar *start_bracket = code; Line 6764  pcre_uchar *start_bracket = code;
6764  pcre_uchar *reverse_count = NULL;  pcre_uchar *reverse_count = NULL;
6765  open_capitem capitem;  open_capitem capitem;
6766  int capnumber = 0;  int capnumber = 0;
6767  int firstbyte, reqbyte;  pcre_int32 firstchar, reqchar;
6768  int branchfirstbyte, branchreqbyte;  pcre_int32 branchfirstchar, branchreqchar;
6769  int length;  int length;
6770  int orig_bracount;  int orig_bracount;
6771  int max_bracount;  int max_bracount;
# Line 6731  branch_chain bc; Line 6774  branch_chain bc;
6774  bc.outer = bcptr;  bc.outer = bcptr;
6775  bc.current_branch = code;  bc.current_branch = code;
6776    
6777  firstbyte = reqbyte = REQ_UNSET;  firstchar = reqchar = REQ_UNSET;
6778    
6779  /* Accumulate the length for use in the pre-compile phase. Start with the  /* Accumulate the length for use in the pre-compile phase. Start with the
6780  length of the BRA and KET and any extra bytes that are required at the  length of the BRA and KET and any extra bytes that are required at the
# Line 6790  for (;;) Line 6833  for (;;)
6833    /* Now compile the branch; in the pre-compile phase its length gets added    /* Now compile the branch; in the pre-compile phase its length gets added
6834    into the length. */    into the length. */
6835    
6836    if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,    if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstchar,
6837          &branchreqbyte, &bc, cond_depth, cd,          &branchreqchar, &bc, cond_depth, cd,
6838          (lengthptr == NULL)? NULL : &length))          (lengthptr == NULL)? NULL : &length))
6839      {      {
6840      *ptrptr = ptr;      *ptrptr = ptr;
# Line 6807  for (;;) Line 6850  for (;;)
6850    
6851    if (lengthptr == NULL)    if (lengthptr == NULL)
6852      {      {
6853      /* If this is the first branch, the firstbyte and reqbyte values for the      /* If this is the first branch, the firstchar and reqchar values for the
6854      branch become the values for the regex. */      branch become the values for the regex. */
6855    
6856      if (*last_branch != OP_ALT)      if (*last_branch != OP_ALT)
6857        {        {
6858        firstbyte = branchfirstbyte;        firstchar = branchfirstchar;
6859        reqbyte = branchreqbyte;        reqchar = branchreqchar;
6860        }        }
6861    
6862      /* If this is not the first branch, the first char and reqbyte have to      /* If this is not the first branch, the first char and reqchar have to
6863      match the values from all the previous branches, except that if the      match the values from all the previous branches, except that if the
6864      previous value for reqbyte didn't have REQ_VARY set, it can still match,      previous value for reqchar didn't have REQ_VARY set, it can still match,
6865      and we set REQ_VARY for the regex. */      and we set REQ_VARY for the regex. */
6866    
6867      else      else
6868        {        {
6869        /* If we previously had a firstbyte, but it doesn't match the new branch,        /* If we previously had a firstchar, but it doesn't match the new branch,
6870        we have to abandon the firstbyte for the regex, but if there was        we have to abandon the firstchar for the regex, but if there was
6871        previously no reqbyte, it takes on the value of the old firstbyte. */        previously no reqchar, it takes on the value of the old firstchar. */
6872    
6873        if (firstbyte >= 0 && firstbyte != branchfirstbyte)        if (firstchar >= 0 && firstchar != branchfirstchar)
6874          {          {
6875          if (reqbyte < 0) reqbyte = firstbyte;          if (reqchar < 0) reqchar = firstchar;
6876          firstbyte = REQ_NONE;          firstchar = REQ_NONE;
6877          }          }
6878    
6879        /* If we (now or from before) have no firstbyte, a firstbyte from the        /* If we (now or from before) have no firstchar, a firstchar from the
6880        branch becomes a reqbyte if there isn't a branch reqbyte. */        branch becomes a reqchar if there isn't a branch reqchar. */
6881    
6882        if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)        if (firstchar < 0 && branchfirstchar >= 0 && branchreqchar < 0)
6883            branchreqbyte = branchfirstbyte;            branchreqchar = branchfirstchar;
6884    
6885        /* Now ensure that the reqbytes match */        /* Now ensure that the reqchars match */
6886    
6887        if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))        if ((reqchar & ~REQ_VARY) != (branchreqchar & ~REQ_VARY))
6888          reqbyte = REQ_NONE;          reqchar = REQ_NONE;
6889        else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */        else reqchar |= branchreqchar;   /* To "or" REQ_VARY */
6890        }        }
6891    
6892      /* If lookbehind, check that this branch matches a fixed-length string, and      /* If lookbehind, check that this branch matches a fixed-length string, and
# Line 6933  for (;;) Line 6976  for (;;)
6976    
6977      *codeptr = code;      *codeptr = code;
6978      *ptrptr = ptr;      *ptrptr = ptr;
6979      *firstbyteptr = firstbyte;      *firstcharptr = firstchar;
6980      *reqbyteptr = reqbyte;      *reqcharptr = reqchar;
6981      if (lengthptr != NULL)      if (lengthptr != NULL)
6982        {        {
6983        if (OFLOW_MAX - *lengthptr < length)        if (OFLOW_MAX - *lengthptr < length)
# Line 7313  pcre16_compile2(PCRE_SPTR16 pattern, int Line 7356  pcre16_compile2(PCRE_SPTR16 pattern, int
7356  {  {
7357  real_pcre *re;  real_pcre *re;
7358  int length = 1;  /* For final END opcode */  int length = 1;  /* For final END opcode */
7359  int firstbyte, reqbyte, newline;  pcre_int32 firstchar, reqchar;
7360    int newline;
7361  int errorcode = 0;  int errorcode = 0;
7362  int skipatstart = 0;  int skipatstart = 0;
7363  BOOL utf8;  BOOL utf8;
# Line 7541  ptr += skipatstart; Line 7585  ptr += skipatstart;
7585  code = cworkspace;  code = cworkspace;
7586  *code = OP_BRA;  *code = OP_BRA;
7587  (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,  (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
7588    FALSE, 0, 0, &firstbyte, &reqbyte, NULL, cd, &length);    FALSE, 0, 0, &firstchar, &reqchar, NULL, cd, &length);
7589  if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;  if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
7590    
7591  DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,  DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
# Line 7578  re->size = (int)size; Line 7622  re->size = (int)size;
7622  re->options = cd->external_options;  re->options = cd->external_options;
7623  re->flags = cd->external_flags;  re->flags = cd->external_flags;
7624  re->dummy1 = 0;  re->dummy1 = 0;
7625  re->first_byte = 0;  re->first_char = 0;
7626  re->req_byte = 0;  re->req_char = 0;
7627  re->name_table_offset = sizeof(real_pcre) / sizeof(pcre_uchar);  re->name_table_offset = sizeof(real_pcre) / sizeof(pcre_uchar);
7628  re->name_entry_size = cd->name_entry_size;  re->name_entry_size = cd->name_entry_size;
7629  re->name_count = cd->names_found;  re->name_count = cd->names_found;
# Line 7615  ptr = (const pcre_uchar *)pattern + skip Line 7659  ptr = (const pcre_uchar *)pattern + skip
7659  code = (pcre_uchar *)codestart;  code = (pcre_uchar *)codestart;
7660  *code = OP_BRA;  *code = OP_BRA;
7661  (void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0,  (void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0,
7662    &firstbyte, &reqbyte, NULL, cd, NULL);    &firstchar, &reqchar, NULL, cd, NULL);
7663  re->top_bracket = cd->bracount;  re->top_bracket = cd->bracount;
7664  re->top_backref = cd->top_backref;  re->top_backref = cd->top_backref;
7665  re->flags = cd->external_flags;  re->flags = cd->external_flags;
7666    
7667  if (cd->had_accept) reqbyte = REQ_NONE;   /* Must disable after (*ACCEPT) */  if (cd->had_accept) reqchar = REQ_NONE;   /* Must disable after (*ACCEPT) */
7668    
7669  /* If not reached end of pattern on success, there's an excess bracket. */  /* If not reached end of pattern on success, there's an excess bracket. */
7670    
# Line 7726  if ((re->options & PCRE_ANCHORED) == 0) Line 7770  if ((re->options & PCRE_ANCHORED) == 0)
7770      re->options |= PCRE_ANCHORED;      re->options |= PCRE_ANCHORED;
7771    else    else
7772      {      {
7773      if (firstbyte < 0)      if (firstchar < 0)
7774        firstbyte = find_firstassertedchar(codestart, FALSE);        firstchar = find_firstassertedchar(codestart, FALSE);
7775      if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */      if (firstchar >= 0)   /* Remove caseless flag for non-caseable chars */
7776        {        {
7777        int ch = firstbyte & 255;  #ifdef COMPILE_PCRE8
7778        re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&        re->first_char = firstchar & 0xff;
7779           cd->fcc[ch] == ch)? ch : firstbyte;  #else
7780    #ifdef COMPILE_PCRE16
7781          re->first_char = firstchar & 0xffff;
7782    #endif
7783    #endif
7784          if ((firstchar & REQ_CASELESS) != 0 && MAX_255(re->first_char)
7785            && cd->fcc[re->first_char] != re->first_char)
7786            re->flags |= PCRE_FCH_CASELESS;
7787    
7788        re->flags |= PCRE_FIRSTSET;        re->flags |= PCRE_FIRSTSET;
7789        }        }
7790      else if (is_startline(codestart, 0, cd->backref_map))      else if (is_startline(codestart, 0, cd->backref_map))
# Line 7744  if ((re->options & PCRE_ANCHORED) == 0) Line 7796  if ((re->options & PCRE_ANCHORED) == 0)
7796  variable length item in the regex. Remove the caseless flag for non-caseable  variable length item in the regex. Remove the caseless flag for non-caseable
7797  bytes. */  bytes. */
7798    
7799  if (reqbyte >= 0 &&  if (reqchar >= 0 &&
7800       ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))       ((re->options & PCRE_ANCHORED) == 0 || (reqchar & REQ_VARY) != 0))
7801    {    {
7802    int ch = reqbyte & 255;  #ifdef COMPILE_PCRE8
7803    re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&    re->req_char = reqchar & 0xff;
7804      cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;  #else
7805    #ifdef COMPILE_PCRE16
7806      re->req_char = reqchar & 0xffff;
7807    #endif
7808    #endif
7809      if ((reqchar & REQ_CASELESS) != 0 && MAX_255(re->req_char)
7810        && cd->fcc[re->req_char] != re->req_char)
7811        re->flags |= PCRE_RCH_CASELESS;
7812    
7813    re->flags |= PCRE_REQCHSET;    re->flags |= PCRE_REQCHSET;
7814    }    }
7815    
# Line 7764  printf("Options=%08x\n", re->options); Line 7824  printf("Options=%08x\n", re->options);
7824    
7825  if ((re->flags & PCRE_FIRSTSET) != 0)  if ((re->flags & PCRE_FIRSTSET) != 0)
7826    {    {
7827    int ch = re->first_byte & 255;    pcre_uchar ch = re->first_char;
7828    const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?    const char *caseless =
7829      "" : " (caseless)";      ((re->flags & PCRE_FCH_CASELESS) == 0)? "" : " (caseless)";
7830    if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);    if (PRINTABLE(ch)) printf("First char = %c%s\n", ch, caseless);
7831      else printf("First char = \\x%02x%s\n", ch, caseless);      else printf("First char = \\x%02x%s\n", ch, caseless);
7832    }    }
7833    
7834  if ((re->flags & PCRE_REQCHSET) != 0)  if ((re->flags & PCRE_REQCHSET) != 0)
7835    {    {
7836    int ch = re->req_byte & 255;    pcre_uchar ch = re->req_char;
7837    const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?    const char *caseless =
7838      "" : " (caseless)";      ((re->flags & PCRE_RCH_CASELESS) == 0)? "" : " (caseless)";
7839    if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);    if (PRINTABLE(ch)) printf("Req char = %c%s\n", ch, caseless);
7840      else printf("Req char = \\x%02x%s\n", ch, caseless);      else printf("Req char = \\x%02x%s\n", ch, caseless);
7841    }    }
7842    

Legend:
Removed from v.773  
changed lines
  Added in v.774

  ViewVC Help
Powered by ViewVC 1.1.5