/[pcre]/code/trunk/pcre.c
ViewVC logotype

Diff of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 47 by nigel, Sat Feb 24 21:39:29 2007 UTC revision 51 by nigel, Sat Feb 24 21:39:37 2007 UTC
# Line 66  not be set greater than 200. */ Line 66  not be set greater than 200. */
66  #define BRASTACK_SIZE 200  #define BRASTACK_SIZE 200
67    
68    
69    /* The number of bytes in a literal character string above which we can't add
70    any more is different when UTF-8 characters may be encountered. */
71    
72    #ifdef SUPPORT_UTF8
73    #define MAXLIT 250
74    #else
75    #define MAXLIT 255
76    #endif
77    
78    
79  /* Min and max values for the common repeats; for the maxima, 0 => infinity */  /* Min and max values for the common repeats; for the maxima, 0 => infinity */
80    
81  static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };  static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
# Line 176  void  (*pcre_free)(void *) = free; Line 186  void  (*pcre_free)(void *) = free;
186    
187    
188    
189    /*************************************************
190    *    Macros and tables for character handling    *
191    *************************************************/
192    
193    /* When UTF-8 encoding is being used, a character is no longer just a single
194    byte. The macros for character handling generate simple sequences when used in
195    byte-mode, and more complicated ones for UTF-8 characters. */
196    
/* Character-fetching macros.

Each macro expands to complete statements (or to nothing), so the call sites
invoke them WITHOUT a trailing semicolon; that shape must be preserved. When
SUPPORT_UTF8 is defined, every macro expects a pointer called "md" to be in
scope at the point of use, with members "utf8" (non-zero in UTF-8 mode) and
"start_subject" (the first byte of the subject).

  GETCHARINC(c, eptr)       fetch the character at eptr into c and advance
                            eptr past it
  GETCHARLEN(c, eptr, len)  fetch the character at eptr into c without moving
                            eptr; in UTF-8 builds len is set to the number of
                            bytes the character occupies (the byte-mode version
                            leaves len untouched, so callers pre-set it to 1)
  BACKCHAR(eptr)            if eptr is inside a multi-byte character, move it
                            back to that character's first byte */

#ifndef SUPPORT_UTF8
#define GETCHARINC(c, eptr) c = *eptr++;
#define GETCHARLEN(c, eptr, len) c = *eptr;
#define BACKCHAR(eptr)

#else   /* SUPPORT_UTF8 */

/* Get the next UTF-8 character, advancing the pointer. The lead byte carries
the HIGH-order bits of the value and each continuation byte supplies the next
six bits below them, so the shift starts at 6*a and counts down. */

#define GETCHARINC(c, eptr) \
  c = *eptr++; \
  if (md->utf8 && (c & 0xc0) == 0xc0) \
    { \
    int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int s = 6*a;                    /* Shift for the lead byte's bits */ \
    c = (c & utf8_table3[a]) << s;  /* High order bits from first byte */ \
    while (a-- > 0) \
      { \
      s -= 6; \
      c |= (*eptr++ & 0x3f) << s; \
      } \
    }

/* Get the next UTF-8 character, not advancing the pointer, setting length */

#define GETCHARLEN(c, eptr, len) \
  c = *eptr; \
  len = 1; \
  if (md->utf8 && (c & 0xc0) == 0xc0) \
    { \
    int i; \
    int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int s = 6*a;                    /* Shift for the lead byte's bits */ \
    c = (c & utf8_table3[a]) << s;  /* High order bits from first byte */ \
    for (i = 1; i <= a; i++) \
      { \
      s -= 6; \
      c |= (eptr[i] & 0x3f) << s; \
      } \
    len += a; \
    }

/* If the pointer is not at the start of a character, move it back until
it is. This applies only in UTF-8 mode: in byte mode a byte of the form
10xxxxxx is an ordinary character and must not be skipped. The scan never
inspects or moves to a byte before the start of the subject. */

#define BACKCHAR(eptr) \
  while (md->utf8 && eptr > md->start_subject && (*eptr & 0xc0) == 0x80) \
    eptr--;

#endif
245    
246    
247    
248  /*************************************************  /*************************************************
249  *             Default character tables           *  *             Default character tables           *
# Line 191  tables. */ Line 259  tables. */
259    
260    
261    
262    #ifdef SUPPORT_UTF8
263    /*************************************************
264    *           Tables for UTF-8 support             *
265    *************************************************/
266    
267    /* These are the breakpoints for different numbers of bytes in a UTF-8
268    character. */
269    
270    static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
271    
272    /* These are the indicator bits and the mask for the data bits to set in the
273    first byte of a character, indexed by the number of additional bytes. */
274    
275    static int utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
276    static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
277    
278    /* Table of the number of extra characters, indexed by the first character
279    masked with 0x3f. The highest number for a valid UTF-8 character is in fact
280    0x3d. */
281    
282    static uschar utf8_table4[] = {
283      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
284      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
285      2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
286      3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
287    
288    
289    /*************************************************
290    *       Convert character value to UTF-8         *
291    *************************************************/
292    
293    /* This function takes an integer value in the range 0 - 0x7fffffff
294    and encodes it as a UTF-8 character in 0 to 6 bytes.
295    
296    Arguments:
297      cvalue     the character value
298      buffer     pointer to buffer for result - at least 6 bytes long
299    
300    Returns:     number of characters placed in the buffer
301    */
302    
303    static int
304    ord2utf8(int cvalue, uschar *buffer)
305    {
306    register int i, j;
307    for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
308      if (cvalue <= utf8_table1[i]) break;
309    *buffer++ = utf8_table2[i] | (cvalue & utf8_table3[i]);
310    cvalue >>= 6 - i;
311    for (j = 0; j < i; j++)
312      {
313      *buffer++ = 0x80 | (cvalue & 0x3f);
314      cvalue >>= 6;
315      }
316    return i + 1;
317    }
318    #endif
319    
320    
321    
322  /*************************************************  /*************************************************
323  *          Return version string                 *  *          Return version string                 *
324  *************************************************/  *************************************************/
# Line 349  while (length-- > 0) Line 477  while (length-- > 0)
477    
478  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
479  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
480  encodes one of the more complicated things such as \d. On entry, ptr is  encodes one of the more complicated things such as \d. When UTF-8 is enabled,
481  pointing at the \. On exit, it is on the final character of the escape  a positive value greater than 255 may be returned. On entry, ptr is pointing at
482  sequence.  the \. On exit, it is on the final character of the escape sequence.
483    
484  Arguments:  Arguments:
485    ptrptr     points to the pattern position pointer    ptrptr     points to the pattern position pointer
# Line 373  check_escape(const uschar **ptrptr, cons Line 501  check_escape(const uschar **ptrptr, cons
501  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
502  int c, i;  int c, i;
503    
504  c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */  /* If backslash is at the end of the pattern, it's an error. */
505    
506    c = *(++ptr);
507  if (c == 0) *errorptr = ERR1;  if (c == 0) *errorptr = ERR1;
508    
509  /* Digits or letters may have special meaning; all others are literals. */  /* Digits or letters may have special meaning; all others are literals. */
# Line 433  else Line 563  else
563        }        }
564    
565      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
566      larger first octal digit */      larger first octal digit. */
567    
568      case '0':      case '0':
569      c -= '0';      c -= '0';
570      while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&      while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
571        ptr[1] != '8' && ptr[1] != '9')        ptr[1] != '8' && ptr[1] != '9')
572          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
573        c &= 255;     /* Take least significant 8 bits */
574      break;      break;
575    
576      /* Special escapes not starting with a digit are straightforward */      /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
577        which can be greater than 0xff, but only if the ddd are hex digits. */
578    
579      case 'x':      case 'x':
580    #ifdef SUPPORT_UTF8
581        if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
582          {
583          const uschar *pt = ptr + 2;
584          register int count = 0;
585          c = 0;
586          while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
587            {
588            count++;
589            c = c * 16 + cd->lcc[*pt] -
590              (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
591            pt++;
592            }
593          if (*pt == '}')
594            {
595            if (c < 0 || count > 8) *errorptr = ERR34;
596            ptr = pt;
597            break;
598            }
599          /* If the sequence of hex digits does not end with '}', then we don't
600          recognize this construct; fall through to the normal \x handling. */
601          }
602    #endif
603    
604        /* Read just a single hex char */
605    
606      c = 0;      c = 0;
607      while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
608        {        {
# Line 454  else Line 612  else
612        }        }
613      break;      break;
614    
615        /* Other special escapes not starting with a digit are straightforward */
616    
617      case 'c':      case 'c':
618      c = *(++ptr);      c = *(++ptr);
619      if (c == 0)      if (c == 0)
# Line 591  if the length is fixed. This is needed f Line 751  if the length is fixed. This is needed f
751    
752  Arguments:  Arguments:
753    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
754      options  the compiling options
755    
756  Returns:   the fixed length, or -1 if there is no fixed length  Returns:   the fixed length, or -1 if there is no fixed length
757  */  */
758    
759  static int  static int
760  find_fixedlength(uschar *code)  find_fixedlength(uschar *code, int options)
761  {  {
762  int length = -1;  int length = -1;
763    
# Line 617  for (;;) Line 778  for (;;)
778      case OP_BRA:      case OP_BRA:
779      case OP_ONCE:      case OP_ONCE:
780      case OP_COND:      case OP_COND:
781      d = find_fixedlength(cc);      d = find_fixedlength(cc, options);
782      if (d < 0) return -1;      if (d < 0) return -1;
783      branchlength += d;      branchlength += d;
784      do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);      do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
# Line 671  for (;;) Line 832  for (;;)
832      cc++;      cc++;
833      break;      break;
834    
835      /* Handle char strings */      /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
836        This requires a scan of the string, unfortunately. We assume valid UTF-8
837        strings, so all we do is reduce the length by one for each byte whose bits
838        are 10xxxxxx (a continuation byte). */
839    
840      case OP_CHARS:      case OP_CHARS:
841      branchlength += *(++cc);      branchlength += *(++cc);
842    #ifdef SUPPORT_UTF8
843        for (d = 1; d <= *cc; d++)
844          if ((cc[d] & 0xc0) == 0x80) branchlength--;
845    #endif
846      cc += *cc + 1;      cc += *cc + 1;
847      break;      break;
848    
# Line 1054  for (;; ptr++) Line 1222  for (;; ptr++)
1222              goto FAILED;              goto FAILED;
1223              }              }
1224            }            }
1225          /* Fall through if single character */  
1226            /* Fall through if single character, but don't at present allow
1227            chars > 255 in UTF-8 mode. */
1228    
1229    #ifdef SUPPORT_UTF8
1230            if (c > 255)
1231              {
1232              *errorptr = ERR33;
1233              goto FAILED;
1234              }
1235    #endif
1236          }          }
1237    
1238        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
# Line 1074  for (;; ptr++) Line 1252  for (;; ptr++)
1252            }            }
1253    
1254          /* The second part of a range can be a single-character escape, but          /* The second part of a range can be a single-character escape, but
1255          not any of the other escapes. */          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1256            in such circumstances. */
1257    
1258          if (d == '\\')          if (d == '\\')
1259            {            {
1260              const uschar *oldptr = ptr;
1261            d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);            d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1262    
1263    #ifdef SUPPORT_UTF8
1264              if (d > 255)
1265                {
1266                *errorptr = ERR33;
1267                goto FAILED;
1268                }
1269    #endif
1270              /* \b is backspace; any other special means the '-' was literal */
1271    
1272            if (d < 0)            if (d < 0)
1273              {              {
1274              if (d == -ESC_b) d = '\b'; else              if (d == -ESC_b) d = '\b'; else
1275                {                {
1276                *errorptr = ERR7;                ptr = oldptr - 2;
1277                goto FAILED;                goto SINGLE_CHARACTER;  /* A few lines below */
1278                }                }
1279              }              }
1280            }            }
# Line 1112  for (;; ptr++) Line 1302  for (;; ptr++)
1302        /* Handle a lone single character - we can get here for a normal        /* Handle a lone single character - we can get here for a normal
1303        non-escape char, or after \ that introduces a single character. */        non-escape char, or after \ that introduces a single character. */
1304    
1305          SINGLE_CHARACTER:
1306    
1307        class [c/8] |= (1 << (c&7));        class [c/8] |= (1 << (c&7));
1308        if ((options & PCRE_CASELESS) != 0)        if ((options & PCRE_CASELESS) != 0)
1309          {          {
# Line 1562  for (;; ptr++) Line 1754  for (;; ptr++)
1754            {            {
1755            condref = *ptr - '0';            condref = *ptr - '0';
1756            while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';            while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1757              if (condref == 0)
1758                {
1759                *errorptr = ERR35;
1760                goto FAILED;
1761                }
1762            ptr++;            ptr++;
1763            }            }
1764          else ptr--;          else ptr--;
# Line 1829  for (;; ptr++) Line 2026  for (;; ptr++)
2026          tempptr = ptr;          tempptr = ptr;
2027          c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);          c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
2028          if (c < 0) { ptr = tempptr; break; }          if (c < 0) { ptr = tempptr; break; }
2029    
2030            /* If a character is > 127 in UTF-8 mode, we have to turn it into
2031            two or more characters in the UTF-8 encoding. */
2032    
2033    #ifdef SUPPORT_UTF8
2034            if (c > 127 && (options & PCRE_UTF8) != 0)
2035              {
2036              uschar buffer[8];
2037              int len = ord2utf8(c, buffer);
2038              for (c = 0; c < len; c++) *code++ = buffer[c];
2039              length += len;
2040              continue;
2041              }
2042    #endif
2043          }          }
2044    
2045        /* Ordinary character or single-char escape */        /* Ordinary character or single-char escape */
# Line 1839  for (;; ptr++) Line 2050  for (;; ptr++)
2050    
2051      /* This "while" is the end of the "do" above. */      /* This "while" is the end of the "do" above. */
2052    
2053      while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);      while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
2054    
2055      /* Update the last character and the count of literals */      /* Update the last character and the count of literals */
2056    
# Line 1851  for (;; ptr++) Line 2062  for (;; ptr++)
2062      the next state. */      the next state. */
2063    
2064      previous[1] = length;      previous[1] = length;
2065      if (length < 255) ptr--;      if (length < MAXLIT) ptr--;
2066      break;      break;
2067      }      }
2068    }                   /* end of big loop */    }                   /* end of big loop */
# Line 1889  Argument: Line 2100  Argument:
2100    ptrptr      -> the address of the current pattern pointer    ptrptr      -> the address of the current pattern pointer
2101    errorptr    -> pointer to error message    errorptr    -> pointer to error message
2102    lookbehind  TRUE if this is a lookbehind assertion    lookbehind  TRUE if this is a lookbehind assertion
2103    condref     > 0 for OPT_CREF setting at start of conditional group    condref     >= 0 for OPT_CREF setting at start of conditional group
2104    reqchar     -> place to put the last required character, or a negative number    reqchar     -> place to put the last required character, or a negative number
2105    countlits   -> place to put the shortest literal count of any branch    countlits   -> place to put the shortest literal count of any branch
2106    cd          points to the data block with tables pointers    cd          points to the data block with tables pointers
# Line 1917  code += 3; Line 2128  code += 3;
2128  /* At the start of a reference-based conditional group, insert the reference  /* At the start of a reference-based conditional group, insert the reference
2129  number as an OP_CREF item. */  number as an OP_CREF item. */
2130    
2131  if (condref > 0)  if (condref >= 0)
2132    {    {
2133    *code++ = OP_CREF;    *code++ = OP_CREF;
2134    *code++ = condref;    *code++ = condref;
# Line 1989  for (;;) Line 2200  for (;;)
2200    if (lookbehind)    if (lookbehind)
2201      {      {
2202      *code = OP_END;      *code = OP_END;
2203      length = find_fixedlength(last_branch);      length = find_fixedlength(last_branch, options);
2204      DPRINTF(("fixed length = %d\n", length));      DPRINTF(("fixed length = %d\n", length));
2205      if (length < 0)      if (length < 0)
2206        {        {
# Line 2280  uschar bralenstack[BRASTACK_SIZE]; Line 2491  uschar bralenstack[BRASTACK_SIZE];
2491  uschar *code_base, *code_end;  uschar *code_base, *code_end;
2492  #endif  #endif
2493    
2494    /* Can't support UTF8 unless PCRE has been compiled to include the code. */
2495    
2496    #ifndef SUPPORT_UTF8
2497    if ((options & PCRE_UTF8) != 0)
2498      {
2499      *errorptr = ERR32;
2500      return NULL;
2501      }
2502    #endif
2503    
2504  /* We can't pass back an error message if errorptr is NULL; I guess the best we  /* We can't pass back an error message if errorptr is NULL; I guess the best we
2505  can do is just return NULL. */  can do is just return NULL. */
2506    
# Line 2775  while ((c = *(++ptr)) != 0) Line 2996  while ((c = *(++ptr)) != 0)
2996            &compile_block);            &compile_block);
2997          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2998          if (c < 0) { ptr = saveptr; break; }          if (c < 0) { ptr = saveptr; break; }
2999    
3000    #ifdef SUPPORT_UTF8
3001            if (c > 127 && (options & PCRE_UTF8) != 0)
3002              {
3003              int i;
3004              for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
3005                if (c <= utf8_table1[i]) break;
3006              runlength += i;
3007              }
3008    #endif
3009          }          }
3010    
3011        /* Ordinary character or single-char escape */        /* Ordinary character or single-char escape */
# Line 2784  while ((c = *(++ptr)) != 0) Line 3015  while ((c = *(++ptr)) != 0)
3015    
3016      /* This "while" is the end of the "do" above. */      /* This "while" is the end of the "do" above. */
3017    
3018      while (runlength < 255 &&      while (runlength < MAXLIT &&
3019        (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);        (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
3020    
3021      ptr--;      ptr--;
# Line 3429  for (;;) Line 3660  for (;;)
3660    
3661      /* Move the subject pointer back. This occurs only at the start of      /* Move the subject pointer back. This occurs only at the start of
3662      each branch of a lookbehind assertion. If we are too close to the start to      each branch of a lookbehind assertion. If we are too close to the start to
3663      move back, this match function fails. */      move back, this match function fails. When working with UTF-8 we move
3664        back a number of characters, not bytes. */
3665    
3666      case OP_REVERSE:      case OP_REVERSE:
3667    #ifdef SUPPORT_UTF8
3668        c = (ecode[1] << 8) + ecode[2];
3669        for (i = 0; i < c; i++)
3670          {
3671          eptr--;
3672          BACKCHAR(eptr)
3673          }
3674    #else
3675      eptr -= (ecode[1] << 8) + ecode[2];      eptr -= (ecode[1] << 8) + ecode[2];
3676    #endif
3677    
3678      if (eptr < md->start_subject) return FALSE;      if (eptr < md->start_subject) return FALSE;
3679      ecode += 3;      ecode += 3;
3680      break;      break;
# Line 3752  for (;;) Line 3994  for (;;)
3994      if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')      if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
3995        return FALSE;        return FALSE;
3996      if (eptr++ >= md->end_subject) return FALSE;      if (eptr++ >= md->end_subject) return FALSE;
3997    #ifdef SUPPORT_UTF8
3998        if (md->utf8)
3999          while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4000    #endif
4001      ecode++;      ecode++;
4002      break;      break;
4003    
# Line 3953  for (;;) Line 4199  for (;;)
4199        for (i = 1; i <= min; i++)        for (i = 1; i <= min; i++)
4200          {          {
4201          if (eptr >= md->end_subject) return FALSE;          if (eptr >= md->end_subject) return FALSE;
4202          c = *eptr++;          GETCHARINC(c, eptr)         /* Get character; increment eptr */
4203    
4204    #ifdef SUPPORT_UTF8
4205            /* We do not yet support class members > 255 */
4206            if (c > 255) return FALSE;
4207    #endif
4208    
4209          if ((data[c/8] & (1 << (c&7))) != 0) continue;          if ((data[c/8] & (1 << (c&7))) != 0) continue;
4210          return FALSE;          return FALSE;
4211          }          }
# Line 3973  for (;;) Line 4225  for (;;)
4225            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))            if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4226              return TRUE;              return TRUE;
4227            if (i >= max || eptr >= md->end_subject) return FALSE;            if (i >= max || eptr >= md->end_subject) return FALSE;
4228            c = *eptr++;            GETCHARINC(c, eptr)       /* Get character; increment eptr */
4229    
4230    #ifdef SUPPORT_UTF8
4231              /* We do not yet support class members > 255 */
4232              if (c > 255) return FALSE;
4233    #endif
4234            if ((data[c/8] & (1 << (c&7))) != 0) continue;            if ((data[c/8] & (1 << (c&7))) != 0) continue;
4235            return FALSE;            return FALSE;
4236            }            }
# Line 3985  for (;;) Line 4242  for (;;)
4242        else        else
4243          {          {
4244          const uschar *pp = eptr;          const uschar *pp = eptr;
4245          for (i = min; i < max; eptr++, i++)          int len = 1;
4246            for (i = min; i < max; i++)
4247            {            {
4248            if (eptr >= md->end_subject) break;            if (eptr >= md->end_subject) break;
4249            c = *eptr;            GETCHARLEN(c, eptr, len)  /* Get character, set length if UTF-8 */
4250            if ((data[c/8] & (1 << (c&7))) != 0) continue;  
4251            break;  #ifdef SUPPORT_UTF8
4252              /* We do not yet support class members > 255 */
4253              if (c > 255) break;
4254    #endif
4255              if ((data[c/8] & (1 << (c&7))) == 0) break;
4256              eptr += len;
4257            }            }
4258    
4259          while (eptr >= pp)          while (eptr >= pp)
4260              {
4261            if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))            if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4262              return TRUE;              return TRUE;
4263    
4264    #ifdef SUPPORT_UTF8
4265              BACKCHAR(eptr)
4266    #endif
4267              }
4268          return FALSE;          return FALSE;
4269          }          }
4270        }        }
# Line 4315  for (;;) Line 4584  for (;;)
4584    
4585      /* First, ensure the minimum number of matches are present. Use inline      /* First, ensure the minimum number of matches are present. Use inline
4586      code for maximizing the speed, and do the type test once at the start      code for maximizing the speed, and do the type test once at the start
4587      (i.e. keep it out of the loop). Also test that there are at least the      (i.e. keep it out of the loop). Also we can test that there are at least
4588      minimum number of characters before we start. */      the minimum number of bytes before we start, except when doing '.' in
4589        UTF8 mode. Leave the test in in all cases; in the special case we have
4590        to test after each character. */
4591    
4592      if (min > md->end_subject - eptr) return FALSE;      if (min > md->end_subject - eptr) return FALSE;
4593      if (min > 0) switch(ctype)      if (min > 0) switch(ctype)
4594        {        {
4595        case OP_ANY:        case OP_ANY:
4596    #ifdef SUPPORT_UTF8
4597          if (md->utf8)
4598            {
4599            for (i = 1; i <= min; i++)
4600              {
4601              if (eptr >= md->end_subject ||
4602                 (*eptr++ == '\n' && (ims & PCRE_DOTALL) == 0))
4603                return FALSE;
4604              while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4605              }
4606            break;
4607            }
4608    #endif
4609          /* Non-UTF8 can be faster */
4610        if ((ims & PCRE_DOTALL) == 0)        if ((ims & PCRE_DOTALL) == 0)
4611          { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }          { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
4612        else eptr += min;        else eptr += min;
# Line 4379  for (;;) Line 4664  for (;;)
4664            {            {
4665            case OP_ANY:            case OP_ANY:
4666            if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;            if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;
4667    #ifdef SUPPORT_UTF8
4668              if (md->utf8)
4669                while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4670    #endif
4671            break;            break;
4672    
4673            case OP_NOT_DIGIT:            case OP_NOT_DIGIT:
# Line 4418  for (;;) Line 4707  for (;;)
4707        switch(ctype)        switch(ctype)
4708          {          {
4709          case OP_ANY:          case OP_ANY:
4710    
4711            /* Special code is required for UTF8, but when the maximum is unlimited
4712            we don't need it. */
4713    
4714    #ifdef SUPPORT_UTF8
4715            if (md->utf8 && max < INT_MAX)
4716              {
4717              if ((ims & PCRE_DOTALL) == 0)
4718                {
4719                for (i = min; i < max; i++)
4720                  {
4721                  if (eptr >= md->end_subject || *eptr++ == '\n') break;
4722                  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4723                  }
4724                }
4725              else
4726                {
4727                for (i = min; i < max; i++)
4728                  {
4729                  eptr++;
4730                  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4731                  }
4732                }
4733              break;
4734              }
4735    #endif
4736            /* Non-UTF8 can be faster */
4737          if ((ims & PCRE_DOTALL) == 0)          if ((ims & PCRE_DOTALL) == 0)
4738            {            {
4739            for (i = min; i < max; i++)            for (i = min; i < max; i++)
# Line 4490  for (;;) Line 4806  for (;;)
4806          }          }
4807    
4808        while (eptr >= pp)        while (eptr >= pp)
4809            {
4810          if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))          if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4811            return TRUE;            return TRUE;
4812    #ifdef SUPPORT_UTF8
4813            if (md->utf8)
4814              while (eptr > pp && (*eptr & 0xc0) == 0x80) eptr--;
4815    #endif
4816            }
4817        return FALSE;        return FALSE;
4818        }        }
4819      /* Control never gets here */      /* Control never gets here */
# Line 4572  match_block.end_subject = match_block.st Line 4894  match_block.end_subject = match_block.st
4894  end_subject = match_block.end_subject;  end_subject = match_block.end_subject;
4895    
4896  match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;  match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4897    match_block.utf8 = (re->options & PCRE_UTF8) != 0;
4898    
4899  match_block.notbol = (options & PCRE_NOTBOL) != 0;  match_block.notbol = (options & PCRE_NOTBOL) != 0;
4900  match_block.noteol = (options & PCRE_NOTEOL) != 0;  match_block.noteol = (options & PCRE_NOTEOL) != 0;

Legend:
Removed from v.47  
changed lines
  Added in v.51

  ViewVC Help
Powered by ViewVC 1.1.5