/[pcre]/code/trunk/pcre.c
ViewVC logotype

Diff of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 68 by nigel, Sat Feb 24 21:40:13 2007 UTC revision 69 by nigel, Sat Feb 24 21:40:18 2007 UTC
# Line 113  static const short int escapes[] = { Line 113  static const short int escapes[] = {
113      0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */      0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
114    '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */    '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
115      0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */      0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */
116      0,      0,  ESC_r, -ESC_s,  ESC_t,      0,      0, -ESC_w,   /* p - w */      0,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */
117      0,      0, -ESC_z                                            /* x - z */      0,      0, -ESC_z                                            /* x - z */
118  };  };
119    
# Line 150  static const int posix_class_maps[] = { Line 150  static const int posix_class_maps[] = {
150    cbit_xdigit,-1,         -1              /* xdigit */    cbit_xdigit,-1,         -1              /* xdigit */
151  };  };
152    
153    /* Table to identify ASCII digits and hex digits. This is used when compiling
154    patterns. Note that the tables in chartables are dependent on the locale, and
155    may mark arbitrary characters as digits - but the PCRE compiling code expects
156    to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
157    a private table here. It costs 256 bytes, but it is a lot faster than doing
158    character value tests (at least in some simple cases I timed), and in some
159    applications one wants PCRE to compile efficiently as well as match
160    efficiently.
161    
162    For convenience, we use the same bit definitions as in chartables:
163    
164      0x04   decimal digit
165      0x08   hexadecimal digit
166    
167    Then we can use ctype_digit and ctype_xdigit in the code. */
168    
169    static const unsigned char digitab[] =   /* 256 entries: one flag byte per character value */
170      {
171      0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
172      0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
173      0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
174      0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
175      0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
176      0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
177      0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  : 0x04|0x08, decimal and hex digit */
178      0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  : only 8 and 9 are flagged */
179      0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  : A-F are hex digits only */
180      0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
181      0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
182      0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
183      0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  : a-f are hex digits only */
184      0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
185      0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
186      0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
187      0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
188      0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
189      0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
190      0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
191      0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
192      0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
193      0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
194      0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
195      0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
196      0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
197      0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
198      0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
199      0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
200      0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
201      0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
202      0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
203    
204  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
205    
# Line 315  tables. */ Line 365  tables. */
365  /* These are the breakpoints for different numbers of bytes in a UTF-8  /* These are the breakpoints for different numbers of bytes in a UTF-8
366  character. */  character. */
367    
368  static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};  static const int utf8_table1[] =
369      { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};  /* upper limits for 1- to 6-byte characters */
370    
371  /* These are the indicator bits and the mask for the data bits to set in the  /* These are the indicator bits and the mask for the data bits to set in the
372  first byte of a character, indexed by the number of additional bytes. */  first byte of a character, indexed by the number of additional bytes. */
373    
374  static int utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};  static const int utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};  /* first-byte indicator bits */
375  static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};  static const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};  /* first-byte data-bit masks */
376    
377  /* Table of the number of extra characters, indexed by the first character  /* Table of the number of extra characters, indexed by the first character
378  masked with 0x3f. The highest number for a valid UTF-8 character is in fact  masked with 0x3f. The highest number for a valid UTF-8 character is in fact
379  0x3d. */  0x3d. */
380    
381  static uschar utf8_table4[] = {  static const uschar utf8_table4[] = {
382    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
383    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
384    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
# Line 686  else Line 737  else
737        {        {
738        oldptr = ptr;        oldptr = ptr;
739        c -= '0';        c -= '0';
740        while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)        while ((digitab[ptr[1]] & ctype_digit) != 0)
741          c = c * 10 + *(++ptr) - '0';          c = c * 10 + *(++ptr) - '0';
742        if (c < 10 || c <= bracount)        if (c < 10 || c <= bracount)
743          {          {
# Line 712  else Line 763  else
763    
764      case '0':      case '0':
765      c -= '0';      c -= '0';
766      while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
       ptr[1] != '8' && ptr[1] != '9')  
767          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
768      c &= 255;     /* Take least significant 8 bits */      c &= 255;     /* Take least significant 8 bits */
769      break;      break;
# Line 728  else Line 778  else
778        const uschar *pt = ptr + 2;        const uschar *pt = ptr + 2;
779        register int count = 0;        register int count = 0;
780        c = 0;        c = 0;
781        while ((cd->ctypes[*pt] & ctype_xdigit) != 0)        while ((digitab[*pt] & ctype_xdigit) != 0)
782          {          {
783            int cc = *pt++;
784            if (cc >= 'a') cc -= 32;            /* Convert to upper case */
785          count++;          count++;
786          c = c * 16 + cd->lcc[*pt] -          c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
           (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');  
         pt++;  
787          }          }
788        if (*pt == '}')        if (*pt == '}')
789          {          {
# Line 749  else Line 799  else
799      /* Read just a single hex char */      /* Read just a single hex char */
800    
801      c = 0;      c = 0;
802      while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
803        {        {
804        ptr++;        int cc = *(++ptr);
805        c = c * 16 + cd->lcc[*ptr] -        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
806          (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
807        }        }
808      break;      break;
809    
# Line 767  else Line 817  else
817        return 0;        return 0;
818        }        }
819    
820      /* A letter is upper-cased; then the 0x40 bit is flipped */      /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
821        is ASCII-specific, but then the whole concept of \cx is ASCII-specific. */
822    
823      if (c >= 'a' && c <= 'z') c = cd->fcc[c];      if (c >= 'a' && c <= 'z') c -= 32;
824      c ^= 0x40;      c ^= 0x40;
825      break;      break;
826    
# Line 815  Returns:    TRUE or FALSE Line 866  Returns:    TRUE or FALSE
866  static BOOL  static BOOL
867  is_counted_repeat(const uschar *p, compile_data *cd)  is_counted_repeat(const uschar *p, compile_data *cd)
868  {  {
869  if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;  if ((digitab[*p++] & ctype_digit) == 0) return FALSE;   /* bitwise test: first char must be a decimal digit (a logical && would also accept a-f/A-F) */
870  while ((cd->ctypes[*p] & ctype_digit) != 0) p++;  while ((digitab[*p] & ctype_digit) != 0) p++;
871  if (*p == '}') return TRUE;  if (*p == '}') return TRUE;   /* form {n} */
872    
873  if (*p++ != ',') return FALSE;  if (*p++ != ',') return FALSE;
874  if (*p == '}') return TRUE;  if (*p == '}') return TRUE;   /* form {n,} */
875    
876  if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;  if ((digitab[*p++] & ctype_digit) == 0) return FALSE;   /* bitwise test: the maximum must also start with a decimal digit */
877  while ((cd->ctypes[*p] & ctype_digit) != 0) p++;  while ((digitab[*p] & ctype_digit) != 0) p++;
878    
879  return (*p == '}');  return (*p == '}');   /* form {n,m} */
880  }  }
881    
# Line 856  read_repeat_counts(const uschar *p, int Line 908  read_repeat_counts(const uschar *p, int
908  int min = 0;  int min = 0;
909  int max = -1;  int max = -1;
910    
911  while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
912    
913  if (*p == '}') max = min; else  if (*p == '}') max = min; else
914    {    {
915    if (*(++p) != '}')    if (*(++p) != '}')
916      {      {
917      max = 0;      max = 0;
918      while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
919      if (max < min)      if (max < min)
920        {        {
921        *errorptr = ERR4;        *errorptr = ERR4;
# Line 2570  for (;; ptr++) Line 2622  for (;; ptr++)
2622            ptr += 3;            ptr += 3;
2623            }            }
2624    
2625          /* Condition to test for a numbered subpattern match */          /* Condition to test for a numbered subpattern match. We know that
2626            if a digit follows ( then there will just be digits until ) because
2627            the syntax was checked in the first pass. */
2628    
2629          else if ((cd->ctypes[ptr[1]] & ctype_digit) != 0)          else if ((digitab[ptr[1]] && ctype_digit) != 0)
2630            {            {
2631            int condref;                 /* Don't amalgamate; some compilers */            int condref;                 /* Don't amalgamate; some compilers */
2632            condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */            condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */
# Line 2625  for (;; ptr++) Line 2679  for (;; ptr++)
2679          *code++ = OP_CALLOUT;          *code++ = OP_CALLOUT;
2680            {            {
2681            int n = 0;            int n = 0;
2682            while ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)            while ((digitab[*(++ptr)] & ctype_digit) != 0)
2683              n = n * 10 + *ptr - '0';              n = n * 10 + *ptr - '0';
2684            if (n > 255)            if (n > 255)
2685              {              {
# Line 2725  for (;; ptr++) Line 2779  for (;; ptr++)
2779            {            {
2780            const uschar *called;            const uschar *called;
2781            recno = 0;            recno = 0;
2782              while((digitab[*ptr] & ctype_digit) != 0)
           while ((cd->ctypes[*ptr] & ctype_digit) != 0)  
2783              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
2784    
2785            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
# Line 4164  while ((c = *(++ptr)) != 0) Line 4217  while ((c = *(++ptr)) != 0)
4217          case '5': case '6': case '7': case '8': case '9':          case '5': case '6': case '7': case '8': case '9':
4218          ptr += 2;          ptr += 2;
4219          if (c != 'R')          if (c != 'R')
4220            while ((compile_block.ctypes[*(++ptr)] & ctype_digit) != 0);            while ((digitab[*(++ptr)] & ctype_digit) != 0);
4221          if (*ptr != ')')          if (*ptr != ')')
4222            {            {
4223            *errorptr = ERR29;            *errorptr = ERR29;
# Line 4190  while ((c = *(++ptr)) != 0) Line 4243  while ((c = *(++ptr)) != 0)
4243    
4244          case 'C':          case 'C':
4245          ptr += 2;          ptr += 2;
4246          while ((compile_block.ctypes[*(++ptr)] & ctype_digit) != 0);          while ((digitab[*(++ptr)] & ctype_digit) != 0);
4247          if (*ptr != ')')          if (*ptr != ')')
4248            {            {
4249            *errorptr = ERR39;            *errorptr = ERR39;
# Line 4257  while ((c = *(++ptr)) != 0) Line 4310  while ((c = *(++ptr)) != 0)
4310            ptr += 4;            ptr += 4;
4311            length += 3;            length += 3;
4312            }            }
4313          else if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)          else if ((digitab[ptr[3]] & ctype_digit) != 0)
4314            {            {
4315            ptr += 4;            ptr += 4;
4316            length += 3;            length += 3;
4317            while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;            while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
4318            if (*ptr != ')')            if (*ptr != ')')
4319              {              {
4320              *errorptr = ERR26;              *errorptr = ERR26;
# Line 5171  for (;;) Line 5224  for (;;)
5224    
5225      case OP_REVERSE:      case OP_REVERSE:
5226  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
5227      c = GET(ecode,1);      if (md->utf8)
     for (i = 0; i < c; i++)  
5228        {        {
5229        eptr--;        c = GET(ecode,1);
5230        BACKCHAR(eptr)        for (i = 0; i < c; i++)
5231            {
5232            eptr--;
5233            if (eptr < md->start_subject) return MATCH_NOMATCH;
5234            BACKCHAR(eptr)
5235            }
5236        }        }
5237  #else      else
     eptr -= GET(ecode,1);  
5238  #endif  #endif
5239    
5240      if (eptr < md->start_subject) return MATCH_NOMATCH;      /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
5241    
5242          {
5243          eptr -= GET(ecode,1);
5244          if (eptr < md->start_subject) return MATCH_NOMATCH;
5245          }
5246    
5247        /* Skip to next op code */
5248    
5249      ecode += 1 + LINK_SIZE;      ecode += 1 + LINK_SIZE;
5250      break;      break;
5251    
# Line 5999  for (;;) Line 6063  for (;;)
6063                }                }
6064              eptr += len;              eptr += len;
6065              }              }
6066            while (eptr >= pp)            for (;;)
6067              {              {
6068              if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=              if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6069                   MATCH_NOMATCH) return rrc;                   MATCH_NOMATCH) return rrc;
6070              BACKCHAR(eptr)              if (eptr-- == pp) break;        /* Stop if tried at original pos */
6071                BACKCHAR(eptr);
6072              }              }
6073            }            }
6074          else          else
# Line 6111  for (;;) Line 6176  for (;;)
6176            if (!match_xclass(c, data)) break;            if (!match_xclass(c, data)) break;
6177            eptr += len;            eptr += len;
6178            }            }
6179          while (eptr >= pp)          for(;;)
6180            {            {
6181            if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=            if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6182                 MATCH_NOMATCH) return rrc;                 MATCH_NOMATCH) return rrc;
6183              if (eptr-- == pp) break;        /* Stop if tried at original pos */
6184            BACKCHAR(eptr)            BACKCHAR(eptr)
6185            }            }
6186          return MATCH_NOMATCH;          return MATCH_NOMATCH;
# Line 6490  for (;;) Line 6556  for (;;)
6556              if (c == d) break;              if (c == d) break;
6557              eptr += len;              eptr += len;
6558              }              }
6559            while (eptr >= pp)            for(;;)
6560              {              {
6561              if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=              if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6562                   MATCH_NOMATCH) return rrc;                   MATCH_NOMATCH) return rrc;
6563              eptr--;              if (eptr-- == pp) break;        /* Stop if tried at original pos */
6564              BACKCHAR(eptr);              BACKCHAR(eptr);
6565              }              }
6566            }            }
# Line 6595  for (;;) Line 6661  for (;;)
6661              if (c == d) break;              if (c == d) break;
6662              eptr += len;              eptr += len;
6663              }              }
6664            while (eptr >= pp)            for(;;)
6665              {              {
6666              if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=              if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6667                  MATCH_NOMATCH) return rrc;                  MATCH_NOMATCH) return rrc;
6668              eptr--;              if (eptr-- == pp) break;        /* Stop if tried at original pos */
6669              BACKCHAR(eptr);              BACKCHAR(eptr);
6670              }              }
6671            }            }
# Line 7053  for (;;) Line 7119  for (;;)
7119    
7120          /* eptr is now past the end of the maximum run */          /* eptr is now past the end of the maximum run */
7121    
7122          while (eptr >= pp)          for(;;)
7123            {            {
7124            if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=            if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
7125                 MATCH_NOMATCH) return rrc;                 MATCH_NOMATCH) return rrc;
7126              if (eptr-- == pp) break;        /* Stop if tried at original pos */
7127            BACKCHAR(eptr);            BACKCHAR(eptr);
7128            }            }
7129          }          }

Legend:
Removed from v.68  
changed lines
  Added in v.69

  ViewVC Help
Powered by ViewVC 1.1.5