/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 447 by ph10, Tue Sep 15 18:17:54 2009 UTC revision 461 by ph10, Mon Oct 5 10:59:35 2009 UTC
# Line 839  for (;;) Line 839  for (;;)
839    
840      /* Now see what the actual condition is */      /* Now see what the actual condition is */
841    
842      if (condcode == OP_RREF)         /* Recursion test */      if (condcode == OP_RREF || condcode == OP_NRREF)    /* Recursion test */
843        {        {
844        offset = GET2(ecode, LINK_SIZE + 2);     /* Recursion group number*/        if (md->recursive == NULL)                /* Not recursing => FALSE */
845        condition = md->recursive != NULL &&          {
846          (offset == RREF_ANY || offset == md->recursive->group_num);          condition = FALSE;
847        ecode += condition? 3 : GET(ecode, 1);          ecode += GET(ecode, 1);
848            }
849          else
850            {
851            int recno = GET2(ecode, LINK_SIZE + 2);   /* Recursion group number*/
852            condition =  (recno == RREF_ANY || recno == md->recursive->group_num);
853    
854            /* If the test is for recursion into a specific subpattern, and it is
855            false, but the test was set up by name, scan the table to see if the
856            name refers to any other numbers, and test them. The condition is true
857            if any one is set. */
858    
859            if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
860              {
861              uschar *slotA = md->name_table;
862              for (i = 0; i < md->name_count; i++)
863                {
864                if (GET2(slotA, 0) == recno) break;
865                slotA += md->name_entry_size;
866                }
867    
868              /* Found a name for the number - there can be only one; duplicate
869              names for different numbers are allowed, but not vice versa. First
870              scan down for duplicates. */
871    
872              if (i < md->name_count)
873                {
874                uschar *slotB = slotA;
875                while (slotB > md->name_table)
876                  {
877                  slotB -= md->name_entry_size;
878                  if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
879                    {
880                    condition = GET2(slotB, 0) == md->recursive->group_num;
881                    if (condition) break;
882                    }
883                  else break;
884                  }
885    
886                /* Scan up for duplicates */
887    
888                if (!condition)
889                  {
890                  slotB = slotA;
891                  for (i++; i < md->name_count; i++)
892                    {
893                    slotB += md->name_entry_size;
894                    if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
895                      {
896                      condition = GET2(slotB, 0) == md->recursive->group_num;
897                      if (condition) break;
898                      }
899                    else break;
900                    }
901                  }
902                }
903              }
904    
905          /* Choose branch according to the condition */
906    
907            ecode += condition? 3 : GET(ecode, 1);
908            }
909        }        }
910    
911      else if (condcode == OP_CREF)    /* Group used test */      else if (condcode == OP_CREF || condcode == OP_NCREF)  /* Group used test */
912        {        {
913        offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */        offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */
914        condition = offset < offset_top && md->offset_vector[offset] >= 0;        condition = offset < offset_top && md->offset_vector[offset] >= 0;
915    
916          /* If the numbered capture is unset, but the reference was by name,
917          scan the table to see if the name refers to any other numbers, and test
918          them. The condition is true if any one is set. This is tediously similar
919          to the code above, but not close enough to try to amalgamate. */
920    
921          if (!condition && condcode == OP_NCREF)
922            {
923            int refno = offset >> 1;
924            uschar *slotA = md->name_table;
925    
926            for (i = 0; i < md->name_count; i++)
927              {
928              if (GET2(slotA, 0) == refno) break;
929              slotA += md->name_entry_size;
930              }
931    
932            /* Found a name for the number - there can be only one; duplicate names
933            for different numbers are allowed, but not vice versa. First scan down
934            for duplicates. */
935    
936            if (i < md->name_count)
937              {
938              uschar *slotB = slotA;
939              while (slotB > md->name_table)
940                {
941                slotB -= md->name_entry_size;
942                if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
943                  {
944                  offset = GET2(slotB, 0) << 1;
945                  condition = offset < offset_top &&
946                    md->offset_vector[offset] >= 0;
947                  if (condition) break;
948                  }
949                else break;
950                }
951    
952              /* Scan up for duplicates */
953    
954              if (!condition)
955                {
956                slotB = slotA;
957                for (i++; i < md->name_count; i++)
958                  {
959                  slotB += md->name_entry_size;
960                  if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
961                    {
962                    offset = GET2(slotB, 0) << 1;
963                    condition = offset < offset_top &&
964                      md->offset_vector[offset] >= 0;
965                    if (condition) break;
966                    }
967                  else break;
968                  }
969                }
970              }
971            }
972    
973          /* Choose branch according to the condition */
974    
975        ecode += condition? 3 : GET(ecode, 1);        ecode += condition? 3 : GET(ecode, 1);
976        }        }
977    
# Line 909  for (;;) Line 1030  for (;;)
1030        ecode += 1 + LINK_SIZE;        ecode += 1 + LINK_SIZE;
1031        }        }
1032      break;      break;
1033    
1034    
1035      /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,      /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1036      to close any currently open capturing brackets. */      to close any currently open capturing brackets. */
1037    
1038      case OP_CLOSE:      case OP_CLOSE:
1039      number = GET2(ecode, 1);      number = GET2(ecode, 1);
1040      offset = number << 1;      offset = number << 1;
1041    
1042  #ifdef DEBUG  #ifdef DEBUG
1043        printf("end bracket %d at *ACCEPT", number);        printf("end bracket %d at *ACCEPT", number);
1044        printf("\n");        printf("\n");
# Line 932  for (;;) Line 1053  for (;;)
1053        if (offset_top <= offset) offset_top = offset + 2;        if (offset_top <= offset) offset_top = offset + 2;
1054        }        }
1055      ecode += 3;      ecode += 3;
1056      break;      break;
1057    
1058    
1059      /* End of the pattern, either real or forced. If we are in a top-level      /* End of the pattern, either real or forced. If we are in a top-level
# Line 948  for (;;) Line 1069  for (;;)
1069        md->recursive = rec->prevrec;        md->recursive = rec->prevrec;
1070        memmove(md->offset_vector, rec->offset_save,        memmove(md->offset_vector, rec->offset_save,
1071          rec->saved_max * sizeof(int));          rec->saved_max * sizeof(int));
1072        offset_top = rec->offset_top;        offset_top = rec->save_offset_top;
1073        mstart = rec->save_start;        mstart = rec->save_start;
1074        ims = original_ims;        ims = original_ims;
1075        ecode = rec->after_call;        ecode = rec->after_call;
# Line 1140  for (;;) Line 1261  for (;;)
1261        memcpy(new_recursive.offset_save, md->offset_vector,        memcpy(new_recursive.offset_save, md->offset_vector,
1262              new_recursive.saved_max * sizeof(int));              new_recursive.saved_max * sizeof(int));
1263        new_recursive.save_start = mstart;        new_recursive.save_start = mstart;
1264        new_recursive.offset_top = offset_top;        new_recursive.save_offset_top = offset_top;
1265        mstart = eptr;        mstart = eptr;
1266    
1267        /* OK, now we can do the recursion. For each top-level alternative we        /* OK, now we can do the recursion. For each top-level alternative we
# Line 1339  for (;;) Line 1460  for (;;)
1460        {        {
1461        number = GET2(prev, 1+LINK_SIZE);        number = GET2(prev, 1+LINK_SIZE);
1462        offset = number << 1;        offset = number << 1;
1463    
1464  #ifdef DEBUG  #ifdef DEBUG
1465        printf("end bracket %d", number);        printf("end bracket %d", number);
1466        printf("\n");        printf("\n");
# Line 1365  for (;;) Line 1486  for (;;)
1486          mstart = rec->save_start;          mstart = rec->save_start;
1487          memcpy(md->offset_vector, rec->offset_save,          memcpy(md->offset_vector, rec->offset_save,
1488            rec->saved_max * sizeof(int));            rec->saved_max * sizeof(int));
1489          offset_top = rec->offset_top;          offset_top = rec->save_offset_top;
1490          ecode = rec->after_call;          ecode = rec->after_call;
1491          ims = original_ims;          ims = original_ims;
1492          break;          break;
# Line 4889  if (re == NULL || subject == NULL || Line 5010  if (re == NULL || subject == NULL ||
5010     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5011  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5012    
5013    /* This information is for finding all the numbers associated with a given
5014    name, for condition testing. */
5015    
5016    md->name_table = (uschar *)re + re->name_table_offset;
5017    md->name_count = re->name_count;
5018    md->name_entry_size = re->name_entry_size;
5019    
5020  /* Fish out the optional data from the extra_data structure, first setting  /* Fish out the optional data from the extra_data structure, first setting
5021  the default values. */  the default values. */
5022    
# Line 5121  if (!anchored) Line 5249  if (!anchored)
5249      }      }
5250    else    else
5251      if (!startline && study != NULL &&      if (!startline && study != NULL &&
5252        (study->options & PCRE_STUDY_MAPPED) != 0)        (study->flags & PCRE_STUDY_MAPPED) != 0)
5253          start_bits = study->start_bits;          start_bits = study->start_bits;
5254    }    }
5255    
# Line 5248  for(;;) Line 5376  for(;;)
5376    
5377    end_subject = save_end_subject;    end_subject = save_end_subject;
5378    
5379  #ifdef DEBUG  /* Sigh. Some compilers never learn. */    /* The following two optimizations are disabled for partial matching or if
   printf(">>>> Match against: ");  
   pchars(start_match, end_subject - start_match, TRUE, md);  
   printf("\n");  
 #endif  
   
   /* If req_byte is set, we know that that character must appear in the  
   subject for the match to succeed. If the first character is set, req_byte  
   must be later in the subject; otherwise the test starts at the match point.  
   This optimization can save a huge amount of backtracking in patterns with  
   nested unlimited repeats that aren't going to match. Writing separate code  
   for cased/caseless versions makes it go faster, as does using an  
   autoincrement and backing off on a match.  
   
   HOWEVER: when the subject string is very, very long, searching to its end  
   can take a long time, and give bad performance on quite ordinary patterns.  
   This showed up when somebody was matching something like /^\d+C/ on a  
   32-megabyte string... so we don't do this when the string is sufficiently  
   long.  
   
   ALSO: this processing is disabled when partial matching is requested, or if  
5380    disabling is explicitly requested. */    disabling is explicitly requested. */
5381    
5382    if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&    if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
       req_byte >= 0 &&  
       end_subject - start_match < REQ_BYTE_MAX &&  
       !md->partial)  
5383      {      {
5384      register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);      /* If the pattern was studied, a minimum subject length may be set. This is
5385        a lower bound; there may be no string of exactly that length that matches
5386        the pattern. Although the value is, strictly, in characters, we treat it as
5387        bytes to avoid spending too much time in this optimization. */
5388    
5389      /* We don't need to repeat the search if we haven't yet reached the      if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
5390      place we found it at last time. */          end_subject - start_match < study->minlength)
5391          {
5392          rc = MATCH_NOMATCH;
5393          break;
5394          }
5395    
5396        /* If req_byte is set, we know that that character must appear in the
5397        subject for the match to succeed. If the first character is set, req_byte
5398        must be later in the subject; otherwise the test starts at the match point.
5399        This optimization can save a huge amount of backtracking in patterns with
5400        nested unlimited repeats that aren't going to match. Writing separate code
5401        for cased/caseless versions makes it go faster, as does using an
5402        autoincrement and backing off on a match.
5403    
5404        HOWEVER: when the subject string is very, very long, searching to its end
5405        can take a long time, and give bad performance on quite ordinary patterns.
5406        This showed up when somebody was matching something like /^\d+C/ on a
5407        32-megabyte string... so we don't do this when the string is sufficiently
5408        long. */
5409    
5410      if (p > req_byte_ptr)      if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
5411        {        {
5412        if (req_byte_caseless)        register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
5413    
5414          /* We don't need to repeat the search if we haven't yet reached the
5415          place we found it at last time. */
5416    
5417          if (p > req_byte_ptr)
5418          {          {
5419          while (p < end_subject)          if (req_byte_caseless)
5420            {            {
5421            register int pp = *p++;            while (p < end_subject)
5422            if (pp == req_byte || pp == req_byte2) { p--; break; }              {
5423                register int pp = *p++;
5424                if (pp == req_byte || pp == req_byte2) { p--; break; }
5425                }
5426            }            }
5427          }          else
       else  
         {  
         while (p < end_subject)  
5428            {            {
5429            if (*p++ == req_byte) { p--; break; }            while (p < end_subject)
5430                {
5431                if (*p++ == req_byte) { p--; break; }
5432                }
5433            }            }
         }  
5434    
5435        /* If we can't find the required character, break the matching loop,          /* If we can't find the required character, break the matching loop,
5436        forcing a match failure. */          forcing a match failure. */
5437    
5438        if (p >= end_subject)          if (p >= end_subject)
5439          {            {
5440          rc = MATCH_NOMATCH;            rc = MATCH_NOMATCH;
5441          break;            break;
5442          }            }
5443    
5444        /* If we have found the required character, save the point where we          /* If we have found the required character, save the point where we
5445        found it, so that we don't search again next time round the loop if          found it, so that we don't search again next time round the loop if
5446        the start hasn't passed this character yet. */          the start hasn't passed this character yet. */
5447    
5448        req_byte_ptr = p;          req_byte_ptr = p;
5449            }
5450        }        }
5451      }      }
5452    
5453    #ifdef DEBUG  /* Sigh. Some compilers never learn. */
5454      printf(">>>> Match against: ");
5455      pchars(start_match, end_subject - start_match, TRUE, md);
5456      printf("\n");
5457    #endif
5458    
5459    /* OK, we can now run the match. If "hitend" is set afterwards, remember the    /* OK, we can now run the match. If "hitend" is set afterwards, remember the
5460    first starting point for which a partial match was found. */    first starting point for which a partial match was found. */
5461    
# Line 5435  if (rc == MATCH_MATCH) Line 5575  if (rc == MATCH_MATCH)
5575    too many to fit into the vector. */    too many to fit into the vector. */
5576    
5577    rc = md->offset_overflow? 0 : md->end_offset_top/2;    rc = md->offset_overflow? 0 : md->end_offset_top/2;
5578    
5579    /* If there is space, set up the whole thing as substring 0. The value of    /* If there is space, set up the whole thing as substring 0. The value of
5580    md->start_match_ptr might be modified if \K was encountered on the success    md->start_match_ptr might be modified if \K was encountered on the success
5581    matching path. */    matching path. */

Legend:
Removed from v.447  
changed lines
  Added in v.461

  ViewVC Help
Powered by ViewVC 1.1.5