/[pcre]/code/branches/pcre16/pcre_study.c
ViewVC logotype

Diff of /code/branches/pcre16/pcre_study.c

Parent Directory | Revision Log | View Patch

revision 780 by zherczeg, Mon Nov 28 20:39:30 2011 UTC revision 781 by zherczeg, Sat Dec 3 07:58:30 2011 UTC
# Line 82  find_minlength(const pcre_uchar *code, c Line 82  find_minlength(const pcre_uchar *code, c
82    int recurse_depth)    int recurse_depth)
83  {  {
84  int length = -1;  int length = -1;
85  BOOL utf8 = (options & PCRE_UTF8) != 0;  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
86    BOOL utf = (options & PCRE_UTF8) != 0;
87  BOOL had_recurse = FALSE;  BOOL had_recurse = FALSE;
88  register int branchlength = 0;  register int branchlength = 0;
89  register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;  register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
# Line 224  for (;;) Line 225  for (;;)
225      branchlength++;      branchlength++;
226      cc += 2;      cc += 2;
227  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
228      if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];      if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
229  #endif  #endif
230      break;      break;
231    
# Line 245  for (;;) Line 246  for (;;)
246      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
247      cc += 2 + IMM2_SIZE;      cc += 2 + IMM2_SIZE;
248  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
249      if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];      if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
250  #endif  #endif
251      break;      break;
252    
# Line 293  for (;;) Line 294  for (;;)
294    
295      case OP_ANYBYTE:      case OP_ANYBYTE:
296  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
297      if (utf8) return -1;      if (utf) return -1;
298  #endif  #endif
299      branchlength++;      branchlength++;
300      cc++;      cc++;
# Line 374  for (;;) Line 375  for (;;)
375      case OP_REFI:      case OP_REFI:
376      if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)      if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
377        {        {
378        ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf8, GET2(cc, 1));        ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
379        if (cs == NULL) return -2;        if (cs == NULL) return -2;
380        do ce += GET(ce, 1); while (*ce == OP_ALT);        do ce += GET(ce, 1); while (*ce == OP_ALT);
381        if (cc > cs && cc < ce)        if (cc > cs && cc < ce)
# Line 486  for (;;) Line 487  for (;;)
487    
488      cc += PRIV(OP_lengths)[op];      cc += PRIV(OP_lengths)[op];
489  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
490      if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];      if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
491  #endif  #endif
492      break;      break;
493    
# Line 537  Arguments: Line 538  Arguments:
538    p             points to the character    p             points to the character
539    caseless      the caseless flag    caseless      the caseless flag
540    cd            the block with char table pointers    cd            the block with char table pointers
541    utf8          TRUE for UTF-8 mode    utf           TRUE for UTF-8 / UTF-16 mode
542    
543  Returns:        pointer after the character  Returns:        pointer after the character
544  */  */
545    
546  static const pcre_uchar *  static const pcre_uchar *
547  set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,  set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
548    compile_data *cd, BOOL utf8)    compile_data *cd, BOOL utf)
549  {  {
550  unsigned int c = *p;  unsigned int c = *p;
551    
552  SET_BIT(c);  SET_BIT(c);
553    
554  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
555  if (utf8 && c > 127)  if (utf && c > 127)
556    {    {
557    GETCHARINC(c, p);    GETCHARINC(c, p);
558  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
559    if (caseless)    if (caseless)
560      {      {
561      pcre_uint8 buff[8];      pcre_uchar buff[6];
562      c = UCD_OTHERCASE(c);      c = UCD_OTHERCASE(c);
563      (void)PRIV(ord2utf8)(c, buff);      (void)PRIV(ord2utf)(c, buff);
564      SET_BIT(buff[0]);      SET_BIT(buff[0]);
565      }      }
566  #endif  #endif
# Line 607  for (c = 128; c < 256; c++) Line 608  for (c = 128; c < 256; c++)
608    {    {
609    if ((cd->cbits[c/8] & (1 << (c&7))) != 0)    if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
610      {      {
611      pcre_uint8 buff[8];      pcre_uchar buff[6];
612      (void)PRIV(ord2utf8)(c, buff);      (void)PRIV(ord2utf)(c, buff);
613      SET_BIT(buff[0]);      SET_BIT(buff[0]);
614      }      }
615    }    }
# Line 663  function fails unless the result is SSB_ Line 664  function fails unless the result is SSB_
664  Arguments:  Arguments:
665    code         points to an expression    code         points to an expression
666    start_bits   points to a 32-byte table, initialized to 0    start_bits   points to a 32-byte table, initialized to 0
667    utf8         TRUE if in UTF-8 mode    utf          TRUE if in UTF-8 / UTF-16 mode
668    cd           the block with char table pointers    cd           the block with char table pointers
669    
670  Returns:       SSB_FAIL     => Failed to find any starting bytes  Returns:       SSB_FAIL     => Failed to find any starting bytes
# Line 673  Returns:       SSB_FAIL     => Failed to Line 674  Returns:       SSB_FAIL     => Failed to
674  */  */
675    
676  static int  static int
677  set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf8,  set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
678    compile_data *cd)    compile_data *cd)
679  {  {
680  register int c;  register int c;
681  int yield = SSB_DONE;  int yield = SSB_DONE;
682  int table_limit = utf8? 16:32;  int table_limit = utf? 16:32;
683    
684  #if 0  #if 0
685  /* ========================================================================= */  /* ========================================================================= */
# Line 817  do Line 818  do
818        case OP_ONCE:        case OP_ONCE:
819        case OP_ONCE_NC:        case OP_ONCE_NC:
820        case OP_ASSERT:        case OP_ASSERT:
821        rc = set_start_bits(tcode, start_bits, utf8, cd);        rc = set_start_bits(tcode, start_bits, utf, cd);
822        if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;        if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
823        if (rc == SSB_DONE) try_next = FALSE; else        if (rc == SSB_DONE) try_next = FALSE; else
824          {          {
# Line 864  do Line 865  do
865        case OP_BRAZERO:        case OP_BRAZERO:
866        case OP_BRAMINZERO:        case OP_BRAMINZERO:
867        case OP_BRAPOSZERO:        case OP_BRAPOSZERO:
868        rc = set_start_bits(++tcode, start_bits, utf8, cd);        rc = set_start_bits(++tcode, start_bits, utf, cd);
869        if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;        if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
870  /* =========================================================================  /* =========================================================================
871        See the comment at the head of this function concerning the next line,        See the comment at the head of this function concerning the next line,
# Line 891  do Line 892  do
892        case OP_QUERY:        case OP_QUERY:
893        case OP_MINQUERY:        case OP_MINQUERY:
894        case OP_POSQUERY:        case OP_POSQUERY:
895        tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);        tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
896        break;        break;
897    
898        case OP_STARI:        case OP_STARI:
# Line 900  do Line 901  do
901        case OP_QUERYI:        case OP_QUERYI:
902        case OP_MINQUERYI:        case OP_MINQUERYI:
903        case OP_POSQUERYI:        case OP_POSQUERYI:
904        tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);        tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
905        break;        break;
906    
907        /* Single-char upto sets the bit and tries the next */        /* Single-char upto sets the bit and tries the next */
# Line 908  do Line 909  do
909        case OP_UPTO:        case OP_UPTO:
910        case OP_MINUPTO:        case OP_MINUPTO:
911        case OP_POSUPTO:        case OP_POSUPTO:
912        tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf8);        tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
913        break;        break;
914    
915        case OP_UPTOI:        case OP_UPTOI:
916        case OP_MINUPTOI:        case OP_MINUPTOI:
917        case OP_POSUPTOI:        case OP_POSUPTOI:
918        tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf8);        tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
919        break;        break;
920    
921        /* At least one single char sets the bit and stops */        /* At least one single char sets the bit and stops */
# Line 926  do Line 927  do
927        case OP_PLUS:        case OP_PLUS:
928        case OP_MINPLUS:        case OP_MINPLUS:
929        case OP_POSPLUS:        case OP_POSPLUS:
930        (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);        (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
931        try_next = FALSE;        try_next = FALSE;
932        break;        break;
933    
# Line 937  do Line 938  do
938        case OP_PLUSI:        case OP_PLUSI:
939        case OP_MINPLUSI:        case OP_MINPLUSI:
940        case OP_POSPLUSI:        case OP_POSPLUSI:
941        (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);        (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
942        try_next = FALSE;        try_next = FALSE;
943        break;        break;
944    
# Line 950  do Line 951  do
951        case OP_HSPACE:        case OP_HSPACE:
952        SET_BIT(0x09);        SET_BIT(0x09);
953        SET_BIT(0x20);        SET_BIT(0x20);
954        if (utf8)        if (utf)
955          {          {
956          SET_BIT(0xC2);  /* For U+00A0 */          SET_BIT(0xC2);  /* For U+00A0 */
957          SET_BIT(0xE1);  /* For U+1680, U+180E */          SET_BIT(0xE1);  /* For U+1680, U+180E */
# Line 967  do Line 968  do
968        SET_BIT(0x0B);        SET_BIT(0x0B);
969        SET_BIT(0x0C);        SET_BIT(0x0C);
970        SET_BIT(0x0D);        SET_BIT(0x0D);
971        if (utf8)        if (utf)
972          {          {
973          SET_BIT(0xC2);  /* For U+0085 */          SET_BIT(0xC2);  /* For U+0085 */
974          SET_BIT(0xE2);  /* For U+2028, U+2029 */          SET_BIT(0xE2);  /* For U+2028, U+2029 */
# Line 1057  do Line 1058  do
1058          case OP_HSPACE:          case OP_HSPACE:
1059          SET_BIT(0x09);          SET_BIT(0x09);
1060          SET_BIT(0x20);          SET_BIT(0x20);
1061          if (utf8)          if (utf)
1062            {            {
1063            SET_BIT(0xC2);  /* For U+00A0 */            SET_BIT(0xC2);  /* For U+00A0 */
1064            SET_BIT(0xE1);  /* For U+1680, U+180E */            SET_BIT(0xE1);  /* For U+1680, U+180E */
# Line 1073  do Line 1074  do
1074          SET_BIT(0x0B);          SET_BIT(0x0B);
1075          SET_BIT(0x0C);          SET_BIT(0x0C);
1076          SET_BIT(0x0D);          SET_BIT(0x0D);
1077          if (utf8)          if (utf)
1078            {            {
1079            SET_BIT(0xC2);  /* For U+0085 */            SET_BIT(0xC2);  /* For U+0085 */
1080            SET_BIT(0xE2);  /* For U+2028, U+2029 */            SET_BIT(0xE2);  /* For U+2028, U+2029 */
# Line 1126  do Line 1127  do
1127    
1128        case OP_NCLASS:        case OP_NCLASS:
1129  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1130        if (utf8)        if (utf)
1131          {          {
1132          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
1133          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
# Line 1147  do Line 1148  do
1148          characters in the range 128 - 255. */          characters in the range 128 - 255. */
1149    
1150  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1151          if (utf8)          if (utf)
1152            {            {
1153            for (c = 0; c < 16; c++) start_bits[c] |= map[c];            for (c = 0; c < 16; c++) start_bits[c] |= map[c];
1154            for (c = 128; c < 256; c++)            for (c = 128; c < 256; c++)

Legend:
Removed from v.780  
changed lines
  Added in v.781

  ViewVC Help
Powered by ViewVC 1.1.5