/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 538 by ph10, Wed Jun 9 19:30:57 2010 UTC revision 602 by ph10, Wed May 25 08:29:03 2011 UTC
# Line 160  for (;;) Line 160  for (;;)
160      case OP_RREF:      case OP_RREF:
161      case OP_NRREF:      case OP_NRREF:
162      case OP_DEF:      case OP_DEF:
     case OP_OPT:  
163      case OP_CALLOUT:      case OP_CALLOUT:
164      case OP_SOD:      case OP_SOD:
165      case OP_SOM:      case OP_SOM:
166      case OP_EOD:      case OP_EOD:
167      case OP_EODN:      case OP_EODN:
168      case OP_CIRC:      case OP_CIRC:
169        case OP_CIRCM:
170      case OP_DOLL:      case OP_DOLL:
171        case OP_DOLLM:
172      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
173      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
174      cc += _pcre_OP_lengths[*cc];      cc += _pcre_OP_lengths[*cc];
# Line 186  for (;;) Line 187  for (;;)
187      /* Handle literal characters and + repetitions */      /* Handle literal characters and + repetitions */
188    
189      case OP_CHAR:      case OP_CHAR:
190      case OP_CHARNC:      case OP_CHARI:
191      case OP_NOT:      case OP_NOT:
192        case OP_NOTI:
193      case OP_PLUS:      case OP_PLUS:
194      case OP_MINPLUS:      case OP_MINPLUS:
195      case OP_POSPLUS:      case OP_POSPLUS:
# Line 337  for (;;) Line 339  for (;;)
339      that case we must set the minimum length to zero. */      that case we must set the minimum length to zero. */
340    
341      case OP_REF:      case OP_REF:
342        case OP_REFI:
343      if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)      if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
344        {        {
345        ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));        ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
# Line 391  for (;;) Line 394  for (;;)
394    
395      /* Anything else does not or need not match a character. We can get the      /* Anything else does not or need not match a character. We can get the
396      item's length from the table, but for those that can match zero occurrences      item's length from the table, but for those that can match zero occurrences
397      of a character, we must take special action for UTF-8 characters. */      of a character, we must take special action for UTF-8 characters. As it
398        happens, the "NOT" versions of these opcodes are used at present only for
399        ASCII characters, so they could be omitted from this list. However, in
400        future that may change, so we leave them in this special case. */
401    
402      case OP_UPTO:      case OP_UPTO:
403        case OP_UPTOI:
404      case OP_NOTUPTO:      case OP_NOTUPTO:
405        case OP_NOTUPTOI:
406      case OP_MINUPTO:      case OP_MINUPTO:
407        case OP_MINUPTOI:
408      case OP_NOTMINUPTO:      case OP_NOTMINUPTO:
409        case OP_NOTMINUPTOI:
410      case OP_POSUPTO:      case OP_POSUPTO:
411        case OP_POSUPTOI:
412        case OP_NOTPOSUPTO:
413        case OP_NOTPOSUPTOI:
414    
415      case OP_STAR:      case OP_STAR:
416        case OP_STARI:
417        case OP_NOTSTAR:
418        case OP_NOTSTARI:
419      case OP_MINSTAR:      case OP_MINSTAR:
420        case OP_MINSTARI:
421      case OP_NOTMINSTAR:      case OP_NOTMINSTAR:
422        case OP_NOTMINSTARI:
423      case OP_POSSTAR:      case OP_POSSTAR:
424        case OP_POSSTARI:
425      case OP_NOTPOSSTAR:      case OP_NOTPOSSTAR:
426        case OP_NOTPOSSTARI:
427    
428      case OP_QUERY:      case OP_QUERY:
429        case OP_QUERYI:
430        case OP_NOTQUERY:
431        case OP_NOTQUERYI:
432      case OP_MINQUERY:      case OP_MINQUERY:
433        case OP_MINQUERYI:
434      case OP_NOTMINQUERY:      case OP_NOTMINQUERY:
435        case OP_NOTMINQUERYI:
436      case OP_POSQUERY:      case OP_POSQUERY:
437        case OP_POSQUERYI:
438      case OP_NOTPOSQUERY:      case OP_NOTPOSQUERY:
439        case OP_NOTPOSQUERYI:
440    
441      cc += _pcre_OP_lengths[op];      cc += _pcre_OP_lengths[op];
442  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
443      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
# Line 419  for (;;) Line 449  for (;;)
449      case OP_MARK:      case OP_MARK:
450      case OP_PRUNE_ARG:      case OP_PRUNE_ARG:
451      case OP_SKIP_ARG:      case OP_SKIP_ARG:
     case OP_THEN_ARG:  
452      cc += _pcre_OP_lengths[op] + cc[1];      cc += _pcre_OP_lengths[op] + cc[1];
453      break;      break;
454    
455        case OP_THEN_ARG:
456        cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
457        break;
458    
459      /* For the record, these are the opcodes that are matched by "default":      /* For the record, these are the opcodes that are matched by "default":
460      OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,      OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
461      OP_THEN. */      OP_THEN. */
# Line 490  return p + 1; Line 523  return p + 1;
523    
524    
525  /*************************************************  /*************************************************
526    *     Set bits for a positive character type     *
527    *************************************************/
528    
529    /* This function sets starting bits for a character type. In UTF-8 mode, we can
530    only do a direct setting for bytes less than 128, as otherwise there can be
531    confusion with bytes in the middle of UTF-8 characters. In a "traditional"
532    environment, the tables will only recognize ASCII characters anyway, but in at
533    least one Windows environment, some higher-byte bits were set in the tables.
534    So we deal with that case by considering the UTF-8 encoding.
535    
536    Arguments:
537      start_bits     the starting bitmap
538      cbit_type      the type of character wanted
539      table_limit    32 for non-UTF-8; 16 for UTF-8
540      cd             the block with char table pointers
541    
542    Returns:         nothing
543    */
544    
545    static void
546    set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
547      compile_data *cd)
548    {
549    register int c;
550    for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
551    if (table_limit == 32) return;
552    for (c = 128; c < 256; c++)
553      {
554      if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
555        {
556        uschar buff[8];
557        (void)_pcre_ord2utf8(c, buff);
558        SET_BIT(buff[0]);
559        }
560      }
561    }
562    
563    
564    /*************************************************
565    *     Set bits for a negative character type     *
566    *************************************************/
567    
568    /* This function sets starting bits for a negative character type such as \D.
569    In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
570    otherwise there can be confusion with bytes in the middle of UTF-8 characters.
571    Unlike in the positive case, where we can set appropriate starting bits for
572    specific high-valued UTF-8 characters, in this case we have to set the bits for
573    all high-valued characters. The lowest is 0xc2, but we overkill by starting at
574    0xc0 (192) for simplicity.
575    
576    Arguments:
577      start_bits     the starting bitmap
578      cbit_type      the type of character wanted
579      table_limit    32 for non-UTF-8; 16 for UTF-8
580      cd             the block with char table pointers
581    
582    Returns:         nothing
583    */
584    
585    static void
586    set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
587      compile_data *cd)
588    {
589    register int c;
590    for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
591    if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
592    }
593    
594    
595    
596    /*************************************************
597  *          Create bitmap of starting bytes       *  *          Create bitmap of starting bytes       *
598  *************************************************/  *************************************************/
599    
# Line 605  do Line 709  do
709        tcode += 1 + LINK_SIZE;        tcode += 1 + LINK_SIZE;
710        break;        break;
711    
       /* Skip over an option setting, changing the caseless flag */  
   
       case OP_OPT:  
       caseless = (tcode[1] & PCRE_CASELESS) != 0;  
       tcode += 2;  
       break;  
   
712        /* BRAZERO does the bracket, but carries on. */        /* BRAZERO does the bracket, but carries on. */
713    
714        case OP_BRAZERO:        case OP_BRAZERO:
# Line 646  do Line 743  do
743        tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);        tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
744        break;        break;
745    
746          case OP_STARI:
747          case OP_MINSTARI:
748          case OP_POSSTARI:
749          case OP_QUERYI:
750          case OP_MINQUERYI:
751          case OP_POSQUERYI:
752          tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
753          break;
754    
755        /* Single-char upto sets the bit and tries the next */        /* Single-char upto sets the bit and tries the next */
756    
757        case OP_UPTO:        case OP_UPTO:
# Line 654  do Line 760  do
760        tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);        tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
761        break;        break;
762    
763          case OP_UPTOI:
764          case OP_MINUPTOI:
765          case OP_POSUPTOI:
766          tcode = set_table_bit(start_bits, tcode + 3, TRUE, cd, utf8);
767          break;
768    
769        /* At least one single char sets the bit and stops */        /* At least one single char sets the bit and stops */
770    
771        case OP_EXACT:       /* Fall through */        case OP_EXACT:
772        tcode += 2;        tcode += 2;
773          /* Fall through */
774        case OP_CHAR:        case OP_CHAR:
       case OP_CHARNC:  
775        case OP_PLUS:        case OP_PLUS:
776        case OP_MINPLUS:        case OP_MINPLUS:
777        case OP_POSPLUS:        case OP_POSPLUS:
# Line 668  do Line 779  do
779        try_next = FALSE;        try_next = FALSE;
780        break;        break;
781    
782          case OP_CHARI:
783          case OP_PLUSI:
784          case OP_MINPLUSI:
785          case OP_POSPLUSI:
786          (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
787          try_next = FALSE;
788          break;
789    
790        /* Special spacing and line-terminating items. These recognize specific        /* Special spacing and line-terminating items. These recognize specific
791        lists of characters. The difference between VSPACE and ANYNL is that the        lists of characters. The difference between VSPACE and ANYNL is that the
792        latter can match the two-character CRLF sequence, but that is not        latter can match the two-character CRLF sequence, but that is not
# Line 679  do Line 798  do
798        SET_BIT(0x20);        SET_BIT(0x20);
799        if (utf8)        if (utf8)
800          {          {
801          SET_BIT(0xC2);  /* For U+00A0 */          SET_BIT(0xC2);  /* For U+00A0 */
802          SET_BIT(0xE1);  /* For U+1680, U+180E */          SET_BIT(0xE1);  /* For U+1680, U+180E */
803          SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */          SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
804          SET_BIT(0xE3);  /* For U+3000 */          SET_BIT(0xE3);  /* For U+3000 */
# Line 694  do Line 813  do
813        SET_BIT(0x0B);        SET_BIT(0x0B);
814        SET_BIT(0x0C);        SET_BIT(0x0C);
815        SET_BIT(0x0D);        SET_BIT(0x0D);
816        if (utf8)        if (utf8)
817          {          {
818          SET_BIT(0xC2);  /* For U+0085 */          SET_BIT(0xC2);  /* For U+0085 */
819          SET_BIT(0xE2);  /* For U+2028, U+2029 */          SET_BIT(0xE2);  /* For U+2028, U+2029 */
820          }          }
821        else SET_BIT(0x85);        else SET_BIT(0x85);
822        try_next = FALSE;        try_next = FALSE;
823        break;        break;
824    
825        /* Single character types set the bits and stop. Note that if PCRE_UCP        /* Single character types set the bits and stop. Note that if PCRE_UCP
826        is set, we do not see these op codes because \d etc are converted to        is set, we do not see these op codes because \d etc are converted to
827        properties. Therefore, these apply in the case when only ASCII characters        properties. Therefore, these apply in the case when only characters less
828        are recognized to match the types. In UTF-8 mode, we must restrict        than 256 are recognized to match the types. */
       ourselves to bytes less than 128, as otherwise there can be confusion  
       with bytes in the middle of UTF-8 characters. (In a "traditional"  
       environment, the tables will only recognize ASCII characters anyway, but  
       in at least one Windows environment, some higher bytes bits were set in  
       the tables.) */  
829    
830        case OP_NOT_DIGIT:        case OP_NOT_DIGIT:
831        for (c = 0; c < table_limit; c++)        set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
         start_bits[c] |= ~cd->cbits[c+cbit_digit];  
832        try_next = FALSE;        try_next = FALSE;
833        break;        break;
834    
835        case OP_DIGIT:        case OP_DIGIT:
836        for (c = 0; c < table_limit; c++)        set_type_bits(start_bits, cbit_digit, table_limit, cd);
         start_bits[c] |= cd->cbits[c+cbit_digit];  
837        try_next = FALSE;        try_next = FALSE;
838        break;        break;
839    
840        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to
841        discard it. */        ensure it is set as not whitespace. */
842    
843        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
844        for (c = 0; c < table_limit; c++)        set_nottype_bits(start_bits, cbit_space, table_limit, cd);
845          {        start_bits[1] |= 0x08;
         int d = cd->cbits[c+cbit_space];  
         if (c == 1) d &= ~0x08;  
         start_bits[c] |= ~d;  
         }  
846        try_next = FALSE;        try_next = FALSE;
847        break;        break;
848    
849        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to
850        discard it. */        not set it from the table. */
851    
852        case OP_WHITESPACE:        case OP_WHITESPACE:
853        for (c = 0; c < table_limit; c++)        c = start_bits[1];    /* Save in case it was already set */
854          {        set_type_bits(start_bits, cbit_space, table_limit, cd);
855          int d = cd->cbits[c+cbit_space];        start_bits[1] = (start_bits[1] & ~0x08) | c;
         if (c == 1) d &= ~0x08;  
         start_bits[c] |= d;  
         }  
856        try_next = FALSE;        try_next = FALSE;
857        break;        break;
858    
859        case OP_NOT_WORDCHAR:        case OP_NOT_WORDCHAR:
860        for (c = 0; c < table_limit; c++)        set_nottype_bits(start_bits, cbit_word, table_limit, cd);
         start_bits[c] |= ~cd->cbits[c+cbit_word];  
861        try_next = FALSE;        try_next = FALSE;
862        break;        break;
863    
864        case OP_WORDCHAR:        case OP_WORDCHAR:
865        for (c = 0; c < table_limit; c++)        set_type_bits(start_bits, cbit_word, table_limit, cd);
         start_bits[c] |= cd->cbits[c+cbit_word];  
866        try_next = FALSE;        try_next = FALSE;
867        break;        break;
868    
# Line 802  do Line 905  do
905          SET_BIT(0x20);          SET_BIT(0x20);
906          if (utf8)          if (utf8)
907            {            {
908            SET_BIT(0xC2);  /* For U+00A0 */            SET_BIT(0xC2);  /* For U+00A0 */
909            SET_BIT(0xE1);  /* For U+1680, U+180E */            SET_BIT(0xE1);  /* For U+1680, U+180E */
910            SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */            SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
911            SET_BIT(0xE3);  /* For U+3000 */            SET_BIT(0xE3);  /* For U+3000 */
# Line 816  do Line 919  do
919          SET_BIT(0x0B);          SET_BIT(0x0B);
920          SET_BIT(0x0C);          SET_BIT(0x0C);
921          SET_BIT(0x0D);          SET_BIT(0x0D);
922          if (utf8)          if (utf8)
923            {            {
924            SET_BIT(0xC2);  /* For U+0085 */            SET_BIT(0xC2);  /* For U+0085 */
925            SET_BIT(0xE2);  /* For U+2028, U+2029 */            SET_BIT(0xE2);  /* For U+2028, U+2029 */
926            }            }
927          else SET_BIT(0x85);          else SET_BIT(0x85);
928          break;          break;
929    
930          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
931          for (c = 0; c < table_limit; c++)          set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
           start_bits[c] |= ~cd->cbits[c+cbit_digit];  
932          break;          break;
933    
934          case OP_DIGIT:          case OP_DIGIT:
935          for (c = 0; c < table_limit; c++)          set_type_bits(start_bits, cbit_digit, table_limit, cd);
           start_bits[c] |= cd->cbits[c+cbit_digit];  
936          break;          break;
937    
938          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we have to
939          discard it. */          ensure it gets set as not whitespace. */
940    
941          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
942          for (c = 0; c < table_limit; c++)          set_nottype_bits(start_bits, cbit_space, table_limit, cd);
943            {          start_bits[1] |= 0x08;
           int d = cd->cbits[c+cbit_space];  
           if (c == 1) d &= ~0x08;  
           start_bits[c] |= ~d;  
           }  
944          break;          break;
945    
946          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we have to
947          discard it. */          avoid setting it. */
948    
949          case OP_WHITESPACE:          case OP_WHITESPACE:
950          for (c = 0; c < table_limit; c++)          c = start_bits[1];    /* Save in case it was already set */
951            {          set_type_bits(start_bits, cbit_space, table_limit, cd);
952            int d = cd->cbits[c+cbit_space];          start_bits[1] = (start_bits[1] & ~0x08) | c;
           if (c == 1) d &= ~0x08;  
           start_bits[c] |= d;  
           }  
953          break;          break;
954    
955          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
956          for (c = 0; c < table_limit; c++)          set_nottype_bits(start_bits, cbit_word, table_limit, cd);
           start_bits[c] |= ~cd->cbits[c+cbit_word];  
957          break;          break;
958    
959          case OP_WORDCHAR:          case OP_WORDCHAR:
960          for (c = 0; c < table_limit; c++)          set_type_bits(start_bits, cbit_word, table_limit, cd);
           start_bits[c] |= cd->cbits[c+cbit_word];  
961          break;          break;
962          }          }
963    

Legend:
Removed from v.538  
changed lines
  Added in v.602

  ViewVC Help
Powered by ViewVC 1.1.5