/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory | Revision Log | View Patch

revision 523 by ph10, Sun May 23 18:48:54 2010 UTC revision 538 by ph10, Wed Jun 9 19:30:57 2010 UTC
# Line 48  supporting functions. */ Line 48  supporting functions. */
48    
49  #include "pcre_internal.h"  #include "pcre_internal.h"
50    
51    #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
52    
53  /* Returns from set_start_bits() */  /* Returns from set_start_bits() */
54    
# Line 440  for (;;) Line 441  for (;;)
441  *      Set a bit and maybe its alternate case    *  *      Set a bit and maybe its alternate case    *
442  *************************************************/  *************************************************/
443    
444  /* Given a character, set its first byte's bit in the table, and also the  /* Given a character, set its first byte's bit in the table, and also the
445  corresponding bit for the other version of a letter if we are caseless. In  corresponding bit for the other version of a letter if we are caseless. In
446  UTF-8 mode, for characters greater than 127, we can only do the caseless thing  UTF-8 mode, for characters greater than 127, we can only do the caseless thing
447  when Unicode property support is available.  when Unicode property support is available.
# Line 450  Arguments: Line 451  Arguments:
451    p             points to the character    p             points to the character
452    caseless      the caseless flag    caseless      the caseless flag
453    cd            the block with char table pointers    cd            the block with char table pointers
454    utf8          TRUE for UTF-8 mode    utf8          TRUE for UTF-8 mode
455    
456  Returns:        pointer after the character  Returns:        pointer after the character
457  */  */
# Line 460  set_table_bit(uschar *start_bits, const Line 461  set_table_bit(uschar *start_bits, const
461    compile_data *cd, BOOL utf8)    compile_data *cd, BOOL utf8)
462  {  {
463  unsigned int c = *p;  unsigned int c = *p;
464  start_bits[c/8] |= (1 << (c&7));  
465    SET_BIT(c);
466    
467  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
468  if (utf8 && c > 127)  if (utf8 && c > 127)
# Line 469  if (utf8 && c > 127) Line 471  if (utf8 && c > 127)
471  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
472    if (caseless)    if (caseless)
473      {      {
474      uschar buff[8];      uschar buff[8];
475      c = UCD_OTHERCASE(c);      c = UCD_OTHERCASE(c);
476      (void)_pcre_ord2utf8(c, buff);      (void)_pcre_ord2utf8(c, buff);
477      c = buff[0];      SET_BIT(buff[0]);
478      start_bits[c/8] |= (1 << (c&7));      }
479      }  #endif
 #endif  
480    return p;    return p;
481    }    }
482  #endif  #endif
483    
484  /* Not UTF-8 mode, or character is less than 128. */  /* Not UTF-8 mode, or character is less than 128. */
485    
486  if (caseless && (cd->ctypes[c] & ctype_letter) != 0)  if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
   start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));  
487  return p + 1;  return p + 1;
488  }  }
489    
# Line 519  set_start_bits(const uschar *code, uscha Line 519  set_start_bits(const uschar *code, uscha
519  {  {
520  register int c;  register int c;
521  int yield = SSB_DONE;  int yield = SSB_DONE;
522    int table_limit = utf8? 16:32;
523    
524  #if 0  #if 0
525  /* ========================================================================= */  /* ========================================================================= */
# Line 667  do Line 668  do
668        try_next = FALSE;        try_next = FALSE;
669        break;        break;
670    
671        /* Single character types set the bits and stop. Note that if PCRE_UCP        /* Special spacing and line-terminating items. These recognize specific
672        is set, we do not see these op codes because \d etc are converted to        lists of characters. The difference between VSPACE and ANYNL is that the
673        properties. Therefore, these apply in the case when only ASCII characters        latter can match the two-character CRLF sequence, but that is not
674        are recognized to match the types. */        relevant for finding the first character, so their code here is
675          identical. */
676    
677          case OP_HSPACE:
678          SET_BIT(0x09);
679          SET_BIT(0x20);
680          if (utf8)
681            {
682            SET_BIT(0xC2);  /* For U+00A0 */
683            SET_BIT(0xE1);  /* For U+1680, U+180E */
684            SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
685            SET_BIT(0xE3);  /* For U+3000 */
686            }
687          else SET_BIT(0xA0);
688          try_next = FALSE;
689          break;
690    
691          case OP_ANYNL:
692          case OP_VSPACE:
693          SET_BIT(0x0A);
694          SET_BIT(0x0B);
695          SET_BIT(0x0C);
696          SET_BIT(0x0D);
697          if (utf8)
698            {
699            SET_BIT(0xC2);  /* For U+0085 */
700            SET_BIT(0xE2);  /* For U+2028, U+2029 */
701            }
702          else SET_BIT(0x85);
703          try_next = FALSE;
704          break;
705    
706          /* Single character types set the bits and stop. Note that if PCRE_UCP
707          is set, we do not see these op codes because \d etc are converted to
708          properties. Therefore, these apply in the case when only ASCII characters
709          are recognized to match the types. In UTF-8 mode, we must restrict
710          ourselves to bytes less than 128, as otherwise there can be confusion
711          with bytes in the middle of UTF-8 characters. (In a "traditional"
712          environment, the tables will only recognize ASCII characters anyway, but
713          in at least one Windows environment, bits for some higher bytes were set in
714          the tables.) */
715    
716        case OP_NOT_DIGIT:        case OP_NOT_DIGIT:
717        for (c = 0; c < 32; c++)        for (c = 0; c < table_limit; c++)
718          start_bits[c] |= ~cd->cbits[c+cbit_digit];          start_bits[c] |= ~cd->cbits[c+cbit_digit];
719        try_next = FALSE;        try_next = FALSE;
720        break;        break;
721    
722        case OP_DIGIT:        case OP_DIGIT:
723        for (c = 0; c < 32; c++)        for (c = 0; c < table_limit; c++)
724          start_bits[c] |= cd->cbits[c+cbit_digit];          start_bits[c] |= cd->cbits[c+cbit_digit];
725        try_next = FALSE;        try_next = FALSE;
726        break;        break;
# Line 688  do Line 729  do
729        discard it. */        discard it. */
730    
731        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
732        for (c = 0; c < 32; c++)        for (c = 0; c < table_limit; c++)
733          {          {
734          int d = cd->cbits[c+cbit_space];          int d = cd->cbits[c+cbit_space];
735          if (c == 1) d &= ~0x08;          if (c == 1) d &= ~0x08;
# Line 701  do Line 742  do
742        discard it. */        discard it. */
743    
744        case OP_WHITESPACE:        case OP_WHITESPACE:
745        for (c = 0; c < 32; c++)        for (c = 0; c < table_limit; c++)
746          {          {
747          int d = cd->cbits[c+cbit_space];          int d = cd->cbits[c+cbit_space];
748          if (c == 1) d &= ~0x08;          if (c == 1) d &= ~0x08;
# Line 711  do Line 752  do
752        break;        break;
753    
754        case OP_NOT_WORDCHAR:        case OP_NOT_WORDCHAR:
755        for (c = 0; c < 32; c++)        for (c = 0; c < table_limit; c++)
756          start_bits[c] |= ~cd->cbits[c+cbit_word];          start_bits[c] |= ~cd->cbits[c+cbit_word];
757        try_next = FALSE;        try_next = FALSE;
758        break;        break;
759    
760        case OP_WORDCHAR:        case OP_WORDCHAR:
761        for (c = 0; c < 32; c++)        for (c = 0; c < table_limit; c++)
762          start_bits[c] |= cd->cbits[c+cbit_word];          start_bits[c] |= cd->cbits[c+cbit_word];
763        try_next = FALSE;        try_next = FALSE;
764        break;        break;
# Line 727  do Line 768  do
768    
769        case OP_TYPEPLUS:        case OP_TYPEPLUS:
770        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
771          case OP_TYPEPOSPLUS:
772        tcode++;        tcode++;
773        break;        break;
774    
# Line 755  do Line 797  do
797          case OP_ALLANY:          case OP_ALLANY:
798          return SSB_FAIL;          return SSB_FAIL;
799    
800            case OP_HSPACE:
801            SET_BIT(0x09);
802            SET_BIT(0x20);
803            if (utf8)
804              {
805              SET_BIT(0xC2);  /* For U+00A0 */
806              SET_BIT(0xE1);  /* For U+1680, U+180E */
807              SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
808              SET_BIT(0xE3);  /* For U+3000 */
809              }
810            else SET_BIT(0xA0);
811            break;
812    
813            case OP_ANYNL:
814            case OP_VSPACE:
815            SET_BIT(0x0A);
816            SET_BIT(0x0B);
817            SET_BIT(0x0C);
818            SET_BIT(0x0D);
819            if (utf8)
820              {
821              SET_BIT(0xC2);  /* For U+0085 */
822              SET_BIT(0xE2);  /* For U+2028, U+2029 */
823              }
824            else SET_BIT(0x85);
825            break;
826    
827          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
828          for (c = 0; c < 32; c++)          for (c = 0; c < table_limit; c++)
829            start_bits[c] |= ~cd->cbits[c+cbit_digit];            start_bits[c] |= ~cd->cbits[c+cbit_digit];
830          break;          break;
831    
832          case OP_DIGIT:          case OP_DIGIT:
833          for (c = 0; c < 32; c++)          for (c = 0; c < table_limit; c++)
834            start_bits[c] |= cd->cbits[c+cbit_digit];            start_bits[c] |= cd->cbits[c+cbit_digit];
835          break;          break;
836    
# Line 769  do Line 838  do
838          discard it. */          discard it. */
839    
840          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
841          for (c = 0; c < 32; c++)          for (c = 0; c < table_limit; c++)
842            {            {
843            int d = cd->cbits[c+cbit_space];            int d = cd->cbits[c+cbit_space];
844            if (c == 1) d &= ~0x08;            if (c == 1) d &= ~0x08;
# Line 781  do Line 850  do
850          discard it. */          discard it. */
851    
852          case OP_WHITESPACE:          case OP_WHITESPACE:
853          for (c = 0; c < 32; c++)          for (c = 0; c < table_limit; c++)
854            {            {
855            int d = cd->cbits[c+cbit_space];            int d = cd->cbits[c+cbit_space];
856            if (c == 1) d &= ~0x08;            if (c == 1) d &= ~0x08;
# Line 790  do Line 859  do
859          break;          break;
860    
861          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
862          for (c = 0; c < 32; c++)          for (c = 0; c < table_limit; c++)
863            start_bits[c] |= ~cd->cbits[c+cbit_word];            start_bits[c] |= ~cd->cbits[c+cbit_word];
864          break;          break;
865    
866          case OP_WORDCHAR:          case OP_WORDCHAR:
867          for (c = 0; c < 32; c++)          for (c = 0; c < table_limit; c++)
868            start_bits[c] |= cd->cbits[c+cbit_word];            start_bits[c] |= cd->cbits[c+cbit_word];
869          break;          break;
870          }          }

Legend:
Removed from v.523  
changed lines
  Added in v.538

  ViewVC Help
Powered by ViewVC 1.1.5