/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 538 by ph10, Wed Jun 9 19:30:57 2010 UTC revision 550 by ph10, Sun Oct 10 16:24:11 2010 UTC
# Line 419  for (;;) Line 419  for (;;)
419      case OP_MARK:      case OP_MARK:
420      case OP_PRUNE_ARG:      case OP_PRUNE_ARG:
421      case OP_SKIP_ARG:      case OP_SKIP_ARG:
     case OP_THEN_ARG:  
422      cc += _pcre_OP_lengths[op] + cc[1];      cc += _pcre_OP_lengths[op] + cc[1];
423      break;      break;
424    
425        case OP_THEN_ARG:
426        cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
427        break;
428    
429      /* For the record, these are the opcodes that are matched by "default":      /* For the record, these are the opcodes that are matched by "default":
430      OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,      OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
431      OP_THEN. */      OP_THEN. */
# Line 490  return p + 1; Line 493  return p + 1;
493    
494    
495  /*************************************************  /*************************************************
496    *     Set bits for a positive character type     *
497    *************************************************/
498    
499    /* This function sets starting bits for a character type. In UTF-8 mode, we can
500    only do a direct setting for bytes less than 128, as otherwise there can be
501    confusion with bytes in the middle of UTF-8 characters. In a "traditional"
502    environment, the tables will only recognize ASCII characters anyway, but in at
503    least one Windows environment, bits for some higher bytes were set in the tables.
504    So we deal with that case by considering the UTF-8 encoding.
505    
506    Arguments:
507      start_bits     the starting bitmap
508      cbit_type      the type of character wanted
509      table_limit    32 for non-UTF-8; 16 for UTF-8
510      cd             the block with char table pointers
511    
512    Returns:         nothing
513    */
514    
515    static void
516    set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
517      compile_data *cd)
518    {
519    register int c;
520    for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
521    if (table_limit == 32) return;
522    for (c = 128; c < 256; c++)
523      {
524      if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
525        {
526        uschar buff[8];
527        (void)_pcre_ord2utf8(c, buff);
528        SET_BIT(buff[0]);
529        }
530      }
531    }
532    
533    
534    /*************************************************
535    *     Set bits for a negative character type     *
536    *************************************************/
537    
538    /* This function sets starting bits for a negative character type such as \D.
539    In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
540    otherwise there can be confusion with bytes in the middle of UTF-8 characters.
541    Unlike in the positive case, where we can set appropriate starting bits for
542    specific high-valued UTF-8 characters, in this case we have to set the bits for
543    all high-valued characters. The lowest is 0xc2, but we overkill by starting at
544    0xc0 (192) for simplicity.
545    
546    Arguments:
547      start_bits     the starting bitmap
548      cbit_type      the type of character wanted
549      table_limit    32 for non-UTF-8; 16 for UTF-8
550      cd             the block with char table pointers
551    
552    Returns:         nothing
553    */
554    
555    static void
556    set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
557      compile_data *cd)
558    {
559    register int c;
560    for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
561    if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
562    }
563    
564    
565    
566    /*************************************************
567  *          Create bitmap of starting bytes       *  *          Create bitmap of starting bytes       *
568  *************************************************/  *************************************************/
569    
# Line 679  do Line 753  do
753        SET_BIT(0x20);        SET_BIT(0x20);
754        if (utf8)        if (utf8)
755          {          {
756          SET_BIT(0xC2);  /* For U+00A0 */          SET_BIT(0xC2);  /* For U+00A0 */
757          SET_BIT(0xE1);  /* For U+1680, U+180E */          SET_BIT(0xE1);  /* For U+1680, U+180E */
758          SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */          SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
759          SET_BIT(0xE3);  /* For U+3000 */          SET_BIT(0xE3);  /* For U+3000 */
# Line 694  do Line 768  do
768        SET_BIT(0x0B);        SET_BIT(0x0B);
769        SET_BIT(0x0C);        SET_BIT(0x0C);
770        SET_BIT(0x0D);        SET_BIT(0x0D);
771        if (utf8)        if (utf8)
772          {          {
773          SET_BIT(0xC2);  /* For U+0085 */          SET_BIT(0xC2);  /* For U+0085 */
774          SET_BIT(0xE2);  /* For U+2028, U+2029 */          SET_BIT(0xE2);  /* For U+2028, U+2029 */
775          }          }
776        else SET_BIT(0x85);        else SET_BIT(0x85);
777        try_next = FALSE;        try_next = FALSE;
778        break;        break;
779    
780        /* Single character types set the bits and stop. Note that if PCRE_UCP        /* Single character types set the bits and stop. Note that if PCRE_UCP
781        is set, we do not see these op codes because \d etc are converted to        is set, we do not see these op codes because \d etc are converted to
782        properties. Therefore, these apply in the case when only ASCII characters        properties. Therefore, these apply in the case when only characters less
783        are recognized to match the types. In UTF-8 mode, we must restrict        than 256 are recognized to match the types. */
       ourselves to bytes less than 128, as otherwise there can be confusion  
       with bytes in the middle of UTF-8 characters. (In a "traditional"  
       environment, the tables will only recognize ASCII characters anyway, but  
       in at least one Windows environment, some higher bytes bits were set in  
       the tables.) */  
784    
785        case OP_NOT_DIGIT:        case OP_NOT_DIGIT:
786        for (c = 0; c < table_limit; c++)        set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
         start_bits[c] |= ~cd->cbits[c+cbit_digit];  
787        try_next = FALSE;        try_next = FALSE;
788        break;        break;
789    
790        case OP_DIGIT:        case OP_DIGIT:
791        for (c = 0; c < table_limit; c++)        set_type_bits(start_bits, cbit_digit, table_limit, cd);
         start_bits[c] |= cd->cbits[c+cbit_digit];  
792        try_next = FALSE;        try_next = FALSE;
793        break;        break;
794    
795        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to
796        discard it. */        ensure it is set as not whitespace. */
797    
798        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
799        for (c = 0; c < table_limit; c++)        set_nottype_bits(start_bits, cbit_space, table_limit, cd);
800          {        start_bits[1] |= 0x08;
         int d = cd->cbits[c+cbit_space];  
         if (c == 1) d &= ~0x08;  
         start_bits[c] |= ~d;  
         }  
801        try_next = FALSE;        try_next = FALSE;
802        break;        break;
803    
804        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to
805        discard it. */        avoid setting it from the table. */
806    
807        case OP_WHITESPACE:        case OP_WHITESPACE:
808        for (c = 0; c < table_limit; c++)        c = start_bits[1];    /* Save in case it was already set */
809          {        set_type_bits(start_bits, cbit_space, table_limit, cd);
810          int d = cd->cbits[c+cbit_space];        start_bits[1] = (start_bits[1] & ~0x08) | c;
         if (c == 1) d &= ~0x08;  
         start_bits[c] |= d;  
         }  
811        try_next = FALSE;        try_next = FALSE;
812        break;        break;
813    
814        case OP_NOT_WORDCHAR:        case OP_NOT_WORDCHAR:
815        for (c = 0; c < table_limit; c++)        set_nottype_bits(start_bits, cbit_word, table_limit, cd);
         start_bits[c] |= ~cd->cbits[c+cbit_word];  
816        try_next = FALSE;        try_next = FALSE;
817        break;        break;
818    
819        case OP_WORDCHAR:        case OP_WORDCHAR:
820        for (c = 0; c < table_limit; c++)        set_type_bits(start_bits, cbit_word, table_limit, cd);
         start_bits[c] |= cd->cbits[c+cbit_word];  
821        try_next = FALSE;        try_next = FALSE;
822        break;        break;
823    
# Line 802  do Line 860  do
860          SET_BIT(0x20);          SET_BIT(0x20);
861          if (utf8)          if (utf8)
862            {            {
863            SET_BIT(0xC2);  /* For U+00A0 */            SET_BIT(0xC2);  /* For U+00A0 */
864            SET_BIT(0xE1);  /* For U+1680, U+180E */            SET_BIT(0xE1);  /* For U+1680, U+180E */
865            SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */            SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
866            SET_BIT(0xE3);  /* For U+3000 */            SET_BIT(0xE3);  /* For U+3000 */
# Line 816  do Line 874  do
874          SET_BIT(0x0B);          SET_BIT(0x0B);
875          SET_BIT(0x0C);          SET_BIT(0x0C);
876          SET_BIT(0x0D);          SET_BIT(0x0D);
877          if (utf8)          if (utf8)
878            {            {
879            SET_BIT(0xC2);  /* For U+0085 */            SET_BIT(0xC2);  /* For U+0085 */
880            SET_BIT(0xE2);  /* For U+2028, U+2029 */            SET_BIT(0xE2);  /* For U+2028, U+2029 */
881            }            }
882          else SET_BIT(0x85);          else SET_BIT(0x85);
883          break;          break;
884    
885          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
886          for (c = 0; c < table_limit; c++)          set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
           start_bits[c] |= ~cd->cbits[c+cbit_digit];  
887          break;          break;
888    
889          case OP_DIGIT:          case OP_DIGIT:
890          for (c = 0; c < table_limit; c++)          set_type_bits(start_bits, cbit_digit, table_limit, cd);
           start_bits[c] |= cd->cbits[c+cbit_digit];  
891          break;          break;
892    
893          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we have to
894          discard it. */          ensure it gets set as not whitespace. */
895    
896          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
897          for (c = 0; c < table_limit; c++)          set_nottype_bits(start_bits, cbit_space, table_limit, cd);
898            {          start_bits[1] |= 0x08;
           int d = cd->cbits[c+cbit_space];  
           if (c == 1) d &= ~0x08;  
           start_bits[c] |= ~d;  
           }  
899          break;          break;
900    
901          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we have to
902          discard it. */          avoid setting it. */
903    
904          case OP_WHITESPACE:          case OP_WHITESPACE:
905          for (c = 0; c < table_limit; c++)          c = start_bits[1];    /* Save in case it was already set */
906            {          set_type_bits(start_bits, cbit_space, table_limit, cd);
907            int d = cd->cbits[c+cbit_space];          start_bits[1] = (start_bits[1] & ~0x08) | c;
           if (c == 1) d &= ~0x08;  
           start_bits[c] |= d;  
           }  
908          break;          break;
909    
910          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
911          for (c = 0; c < table_limit; c++)          set_nottype_bits(start_bits, cbit_word, table_limit, cd);
           start_bits[c] |= ~cd->cbits[c+cbit_word];  
912          break;          break;
913    
914          case OP_WORDCHAR:          case OP_WORDCHAR:
915          for (c = 0; c < table_limit; c++)          set_type_bits(start_bits, cbit_word, table_limit, cd);
           start_bits[c] |= cd->cbits[c+cbit_word];  
916          break;          break;
917          }          }
918    

Legend:
Removed from v.538  
changed lines
  Added in v.550

  ViewVC Help
Powered by ViewVC 1.1.5