/[pcre]/code/trunk/pcre.c
ViewVC logotype

Diff of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 36 by nigel, Sat Feb 24 21:39:05 2007 UTC revision 37 by nigel, Sat Feb 24 21:39:09 2007 UTC
# Line 111  static const short int escapes[] = { Line 111  static const short int escapes[] = {
111    
112  static BOOL  static BOOL
113    compile_regex(int, int, int *, uschar **, const uschar **, const char **,    compile_regex(int, int, int *, uschar **, const uschar **, const char **,
114      BOOL, int, compile_data *);      BOOL, int, int *, int *, compile_data *);
115    
116    
117    
# Line 162  return PCRE_VERSION; Line 162  return PCRE_VERSION;
162  *************************************************/  *************************************************/
163    
164  /* This function picks potentially useful data out of the private  /* This function picks potentially useful data out of the private
165  structure.  structure. The public options are passed back in an int - though the
166    re->options field has been expanded to a long int, all the public options
167    at the low end of it, and so even on 16-bit systems this will still be OK.
168    Therefore, I haven't changed the API for pcre_info().
169    
170  Arguments:  Arguments:
171    external_re   points to compiled code    external_re   points to compiled code
# Line 181  pcre_info(const pcre *external_re, int * Line 184  pcre_info(const pcre *external_re, int *
184  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
185  if (re == NULL) return PCRE_ERROR_NULL;  if (re == NULL) return PCRE_ERROR_NULL;
186  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
187  if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);  if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
188  if (first_char != NULL)  if (first_char != NULL)
189    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
190       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
# Line 532  for (;;) Line 535  for (;;)
535    
536      case OP_REVERSE:      case OP_REVERSE:
537      cc++;      cc++;
538        /* Fall through */
539    
540      case OP_CREF:      case OP_CREF:
541      case OP_OPT:      case OP_OPT:
# Line 627  Arguments: Line 631  Arguments:
631    ptrptr       points to the current pattern pointer    ptrptr       points to the current pattern pointer
632    errorptr     points to pointer to error message    errorptr     points to pointer to error message
633    optchanged   set to the value of the last OP_OPT item compiled    optchanged   set to the value of the last OP_OPT item compiled
634      reqchar      set to the last literal character required, else -1
635      countlits    set to count of mandatory literal characters
636    cd           contains pointers to tables    cd           contains pointers to tables
637    
638  Returns:       TRUE on success  Returns:       TRUE on success
# Line 636  Returns:       TRUE on success Line 642  Returns:       TRUE on success
642  static BOOL  static BOOL
643  compile_branch(int options, int *brackets, uschar **codeptr,  compile_branch(int options, int *brackets, uschar **codeptr,
644    const uschar **ptrptr, const char **errorptr, int *optchanged,    const uschar **ptrptr, const char **errorptr, int *optchanged,
645    compile_data *cd)    int *reqchar, int *countlits, compile_data *cd)
646  {  {
647  int repeat_type, op_type;  int repeat_type, op_type;
648  int repeat_min, repeat_max;  int repeat_min, repeat_max;
649  int bravalue, length;  int bravalue, length;
650  int greedy_default, greedy_non_default;  int greedy_default, greedy_non_default;
651    int prevreqchar;
652    int condcount = 0;
653    int subcountlits = 0;
654  register int c;  register int c;
655  register uschar *code = *codeptr;  register uschar *code = *codeptr;
656  uschar *tempcode;  uschar *tempcode;
# Line 655  uschar class[32]; Line 664  uschar class[32];
664  greedy_default = ((options & PCRE_UNGREEDY) != 0);  greedy_default = ((options & PCRE_UNGREEDY) != 0);
665  greedy_non_default = greedy_default ^ 1;  greedy_non_default = greedy_default ^ 1;
666    
667    /* Initialize no required char, and count of literals */
668    
669    *reqchar = prevreqchar = -1;
670    *countlits = 0;
671    
672  /* Switch on next character until the end of the branch */  /* Switch on next character until the end of the branch */
673    
674  for (;; ptr++)  for (;; ptr++)
# Line 664  for (;; ptr++) Line 678  for (;; ptr++)
678    int class_lastchar;    int class_lastchar;
679    int newoptions;    int newoptions;
680    int condref;    int condref;
681      int subreqchar;
682    
683    c = *ptr;    c = *ptr;
684    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
# Line 937  for (;; ptr++) Line 952  for (;; ptr++)
952        { repeat_type = greedy_non_default; ptr++; }        { repeat_type = greedy_non_default; ptr++; }
953      else repeat_type = greedy_default;      else repeat_type = greedy_default;
954    
     /* If the maximum is zero then the minimum must also be zero; Perl allows  
     this case, so we do too - by simply omitting the item altogether. */  
   
     if (repeat_max == 0) code = previous;  
   
955      /* If previous was a string of characters, chop off the last one and use it      /* If previous was a string of characters, chop off the last one and use it
956      as the subject of the repeat. If there was only one character, we can      as the subject of the repeat. If there was only one character, we can
957      abolish the previous item altogether. */      abolish the previous item altogether. A repeat with a zero minimum wipes
958        out any reqchar setting, backing up to the previous value. We must also
959        adjust the countlits value. */
960    
961      else if (*previous == OP_CHARS)      if (*previous == OP_CHARS)
962        {        {
963        int len = previous[1];        int len = previous[1];
964    
965          if (repeat_min == 0) *reqchar = prevreqchar;
966          *countlits += repeat_min - 1;
967    
968        if (len == 1)        if (len == 1)
969          {          {
970          c = previous[2];          c = previous[2];
# Line 987  for (;; ptr++) Line 1003  for (;; ptr++)
1003        code = previous;        code = previous;
1004    
1005        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
1006        repeat_type += op_type;      /* Combine both values for many cases */  
1007          /* If the maximum is zero then the minimum must also be zero; Perl allows
1008          this case, so we do too - by simply omitting the item altogether. */
1009    
1010          if (repeat_max == 0) goto END_REPEAT;
1011    
1012          /* Combine the op_type with the repeat_type */
1013    
1014          repeat_type += op_type;
1015    
1016        /* A minimum of zero is handled either as the special case * or ?, or as        /* A minimum of zero is handled either as the special case * or ?, or as
1017        an UPTO, with the maximum given. */        an UPTO, with the maximum given. */
# Line 1064  for (;; ptr++) Line 1088  for (;; ptr++)
1088        }        }
1089    
1090      /* If previous was a character class or a back reference, we put the repeat      /* If previous was a character class or a back reference, we put the repeat
1091      stuff after it. */      stuff after it, but just skip the item if the repeat was {0,0}. */
1092    
1093      else if (*previous == OP_CLASS || *previous == OP_REF)      else if (*previous == OP_CLASS || *previous == OP_REF)
1094        {        {
1095          if (repeat_max == 0)
1096            {
1097            code = previous;
1098            goto END_REPEAT;
1099            }
1100        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
1101          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
1102        else if (repeat_min == 1 && repeat_max == -1)        else if (repeat_min == 1 && repeat_max == -1)
# Line 1118  for (;; ptr++) Line 1147  for (;; ptr++)
1147    
1148        if (repeat_min == 0)        if (repeat_min == 0)
1149          {          {
1150            /* If we set up a required char from the bracket, we must back off
1151            to the previous value and reset the countlits value too. */
1152    
1153            if (subcountlits > 0)
1154              {
1155              *reqchar = prevreqchar;
1156              *countlits -= subcountlits;
1157              }
1158    
1159          /* If the maximum is also zero, we just omit the group from the output          /* If the maximum is also zero, we just omit the group from the output
1160          altogether. */          altogether. */
1161    
1162          if (repeat_max == 0)          if (repeat_max == 0)
1163            {            {
1164            code = previous;            code = previous;
1165            previous = NULL;            goto END_REPEAT;
           break;  
1166            }            }
1167    
1168          /* If the maximum is 1 or unlimited, we just have to stick in the          /* If the maximum is 1 or unlimited, we just have to stick in the
# Line 1230  for (;; ptr++) Line 1267  for (;; ptr++)
1267        correct offset was computed above. */        correct offset was computed above. */
1268    
1269        else code[-ketoffset] = OP_KETRMAX + repeat_type;        else code[-ketoffset] = OP_KETRMAX + repeat_type;
   
   
 #ifdef NEVER  
       /* If the minimum is greater than zero, and the maximum is unlimited or  
       equal to the minimum, the first copy remains where it is, and is  
       replicated up to the minimum number of times. This case includes the +  
       repeat, but of course no replication is needed in that case. */  
   
       if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))  
         {  
         for (i = 1; i < repeat_min; i++)  
           {  
           memcpy(code, previous, len);  
           code += len;  
           }  
         }  
   
       /* If the minimum is zero, stick BRAZERO in front of the first copy.  
       Then, if there is a fixed upper limit, replicated up to that many times,  
       sticking BRAZERO in front of all the optional ones. */  
   
       else  
         {  
         if (repeat_min == 0)  
           {  
           memmove(previous+1, previous, len);  
           code++;  
           *previous++ = OP_BRAZERO + repeat_type;  
           }  
   
         for (i = 1; i < repeat_min; i++)  
           {  
           memcpy(code, previous, len);  
           code += len;  
           }  
   
         for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)  
           {  
           *code++ = OP_BRAZERO + repeat_type;  
           memcpy(code, previous, len);  
           code += len;  
           }  
         }  
   
       /* If the maximum is unlimited, set a repeater in the final copy. We  
       can't just offset backwards from the current code point, because we  
       don't know if there's been an options resetting after the ket. The  
       correct offset was computed above. */  
   
       if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type;  
 #endif  
   
   
1270        }        }
1271    
1272      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 1295  for (;; ptr++) Line 1279  for (;; ptr++)
1279    
1280      /* In all case we no longer have a previous item. */      /* In all case we no longer have a previous item. */
1281    
1282        END_REPEAT:
1283      previous = NULL;      previous = NULL;
1284      break;      break;
1285    
# Line 1463  for (;; ptr++) Line 1448  for (;; ptr++)
1448           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
1449            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1450           condref,                      /* Condition reference number */           condref,                      /* Condition reference number */
1451             &subreqchar,                  /* For possible last char */
1452             &subcountlits,                /* For literal count */
1453           cd))                          /* Tables block */           cd))                          /* Tables block */
1454        goto FAILED;        goto FAILED;
1455    
# Line 1476  for (;; ptr++) Line 1463  for (;; ptr++)
1463    
1464      if (bravalue == OP_COND)      if (bravalue == OP_COND)
1465        {        {
       int branchcount = 0;  
1466        uschar *tc = code;        uschar *tc = code;
1467          condcount = 0;
1468    
1469        do {        do {
1470           branchcount++;           condcount++;
1471           tc += (tc[1] << 8) | tc[2];           tc += (tc[1] << 8) | tc[2];
1472           }           }
1473        while (*tc != OP_KET);        while (*tc != OP_KET);
1474    
1475        if (branchcount > 2)        if (condcount > 2)
1476          {          {
1477          *errorptr = ERR27;          *errorptr = ERR27;
1478          goto FAILED;          goto FAILED;
1479          }          }
1480        }        }
1481    
1482        /* Handle updating of the required character. If the subpattern didn't
1483        set one, leave it as it was. Otherwise, update it for normal brackets of
1484        all kinds, forward assertions, and conditions with two branches. Don't
1485        update the literal count for forward assertions, however. If the bracket
1486        is followed by a quantifier with zero repeat, we have to back off. Hence
1487        the definition of prevreqchar and subcountlits outside the main loop so
1488        that they can be accessed for the back off. */
1489    
1490        if (subreqchar > 0 &&
1491             (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1492             (bravalue == OP_COND && condcount == 2)))
1493          {
1494          prevreqchar = *reqchar;
1495          *reqchar = subreqchar;
1496          if (bravalue != OP_ASSERT) *countlits += subcountlits;
1497          }
1498    
1499      /* Now update the main code pointer to the end of the group. */      /* Now update the main code pointer to the end of the group. */
1500    
1501      code = tempcode;      code = tempcode;
# Line 1586  for (;; ptr++) Line 1590  for (;; ptr++)
1590    
1591      while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);      while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
1592    
1593        /* Update the last character and the count of literals */
1594    
1595        prevreqchar = (length > 1)? code[-2] : *reqchar;
1596        *reqchar = code[-1];
1597        *countlits += length;
1598    
1599      /* Compute the length and set it in the data vector, and advance to      /* Compute the length and set it in the data vector, and advance to
1600      the next state. */      the next state. */
1601    
# Line 1629  Argument: Line 1639  Argument:
1639    errorptr    -> pointer to error message    errorptr    -> pointer to error message
1640    lookbehind  TRUE if this is a lookbehind assertion    lookbehind  TRUE if this is a lookbehind assertion
1641    condref     > 0 for OPT_CREF setting at start of conditional group    condref     > 0 for OPT_CREF setting at start of conditional group
1642      reqchar     -> place to put the last required character, or a negative number
1643      countlits   -> place to put the shortest literal count of any branch
1644    cd          points to the data block with tables pointers    cd          points to the data block with tables pointers
1645    
1646  Returns:      TRUE on success  Returns:      TRUE on success
# Line 1637  Returns:      TRUE on success Line 1649  Returns:      TRUE on success
1649  static BOOL  static BOOL
1650  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
1651    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
1652    compile_data *cd)    int *reqchar, int *countlits, compile_data *cd)
1653  {  {
1654  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
1655  uschar *code = *codeptr;  uschar *code = *codeptr;
# Line 1645  uschar *last_branch = code; Line 1657  uschar *last_branch = code;
1657  uschar *start_bracket = code;  uschar *start_bracket = code;
1658  uschar *reverse_count = NULL;  uschar *reverse_count = NULL;
1659  int oldoptions = options & PCRE_IMS;  int oldoptions = options & PCRE_IMS;
1660    int branchreqchar, branchcountlits;
1661    
1662    *reqchar = -1;
1663    *countlits = INT_MAX;
1664  code += 3;  code += 3;
1665    
1666  /* At the start of a reference-based conditional group, insert the reference  /* At the start of a reference-based conditional group, insert the reference
# Line 1684  for (;;) Line 1699  for (;;)
1699    
1700    /* Now compile the branch */    /* Now compile the branch */
1701    
1702    if (!compile_branch(options,brackets,&code,&ptr,errorptr,&optchanged,cd))    if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
1703          &branchreqchar, &branchcountlits, cd))
1704      {      {
1705      *ptrptr = ptr;      *ptrptr = ptr;
1706      return FALSE;      return FALSE;
# Line 1696  for (;;) Line 1712  for (;;)
1712    last_branch[1] = length >> 8;    last_branch[1] = length >> 8;
1713    last_branch[2] = length & 255;    last_branch[2] = length & 255;
1714    
1715      /* Save the last required character if all branches have the same; a current
1716      value of -1 means unset, while -2 means "previous branch had no last required
1717      char".  */
1718    
1719      if (*reqchar != -2)
1720        {
1721        if (branchreqchar >= 0)
1722          {
1723          if (*reqchar == -1) *reqchar = branchreqchar;
1724          else if (*reqchar != branchreqchar) *reqchar = -2;
1725          }
1726        else *reqchar = -2;
1727        }
1728    
1729      /* Keep the shortest literal count */
1730    
1731      if (branchcountlits < *countlits) *countlits = branchcountlits;
1732      DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
1733    
1734    /* If lookbehind, check that this branch matches a fixed-length string,    /* If lookbehind, check that this branch matches a fixed-length string,
1735    and put the length into the OP_REVERSE item. Temporarily mark the end of    and put the length into the OP_REVERSE item. Temporarily mark the end of
1736    the branch with OP_END. */    the branch with OP_END. */
# Line 1977  pcre_compile(const char *pattern, int op Line 2012  pcre_compile(const char *pattern, int op
2012  real_pcre *re;  real_pcre *re;
2013  int length = 3;      /* For initial BRA plus length */  int length = 3;      /* For initial BRA plus length */
2014  int runlength;  int runlength;
2015  int c, size;  int c, size, reqchar, countlits;
2016  int bracount = 0;  int bracount = 0;
2017  int top_backref = 0;  int top_backref = 0;
2018  int branch_extra = 0;  int branch_extra = 0;
# Line 2317  while ((c = *(++ptr)) != 0) Line 2352  while ((c = *(++ptr)) != 0)
2352              will lead to an over-estimate on the length, but this shouldn't              will lead to an over-estimate on the length, but this shouldn't
2353              matter very much. We also have to allow for resetting options at              matter very much. We also have to allow for resetting options at
2354              the start of any alternations, which we do by setting              the start of any alternations, which we do by setting
2355              branch_newextra to 2. */              branch_newextra to 2. Finally, we record whether the case-dependent
2356                flag ever changes within the regex. This is used by the "required
2357                character" code. */
2358    
2359              case ':':              case ':':
2360              if (((set|unset) & PCRE_IMS) != 0)              if (((set|unset) & PCRE_IMS) != 0)
2361                {                {
2362                length += 4;                length += 4;
2363                branch_newextra = 2;                branch_newextra = 2;
2364                  if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2365                }                }
2366              goto END_OPTIONS;              goto END_OPTIONS;
2367    
# Line 2524  code = re->code; Line 2562  code = re->code;
2562  *code = OP_BRA;  *code = OP_BRA;
2563  bracount = 0;  bracount = 0;
2564  (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,  (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
2565    &compile_block);    &reqchar, &countlits, &compile_block);
2566  re->top_bracket = bracount;  re->top_bracket = bracount;
2567  re->top_backref = top_backref;  re->top_backref = top_backref;
2568    
# Line 2584  if ((options & PCRE_ANCHORED) == 0) Line 2622  if ((options & PCRE_ANCHORED) == 0)
2622      }      }
2623    }    }
2624    
2625    /* Save the last required character if there are at least two literal
2626    characters on all paths, or if there is no first character setting. */
2627    
2628    if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
2629      {
2630      re->req_char = reqchar;
2631      re->options |= PCRE_REQCHSET;
2632      }
2633    
2634  /* Print out the compiled data for debugging */  /* Print out the compiled data for debugging */
2635    
2636  #ifdef DEBUG  #ifdef DEBUG
# Line 2593  printf("Length = %d top_bracket = %d top Line 2640  printf("Length = %d top_bracket = %d top
2640    
2641  if (re->options != 0)  if (re->options != 0)
2642    {    {
2643    printf("%s%s%s%s%s%s%s%s\n",    printf("%s%s%s%s%s%s%s%s%s\n",
2644      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2645      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2646        ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
2647      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2648      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2649      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
# Line 2610  if ((re->options & PCRE_FIRSTSET) != 0) Line 2658  if ((re->options & PCRE_FIRSTSET) != 0)
2658      else printf("First char = \\x%02x\n", re->first_char);      else printf("First char = \\x%02x\n", re->first_char);
2659    }    }
2660    
2661    if ((re->options & PCRE_REQCHSET) != 0)
2662      {
2663      if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
2664        else printf("Req char = \\x%02x\n", re->req_char);
2665      }
2666    
2667  code_end = code;  code_end = code;
2668  code_base = code = re->code;  code_base = code = re->code;
2669    
# Line 2843  Returns:      TRUE if matched Line 2897  Returns:      TRUE if matched
2897    
2898  static BOOL  static BOOL
2899  match_ref(int offset, register const uschar *eptr, int length, match_data *md,  match_ref(int offset, register const uschar *eptr, int length, match_data *md,
2900    int ims)    unsigned long int ims)
2901  {  {
2902  const uschar *p = md->start_subject + md->offset_vector[offset];  const uschar *p = md->start_subject + md->offset_vector[offset];
2903    
# Line 2902  Returns:       TRUE if matched Line 2956  Returns:       TRUE if matched
2956    
2957  static BOOL  static BOOL
2958  match(register const uschar *eptr, register const uschar *ecode,  match(register const uschar *eptr, register const uschar *ecode,
2959    int offset_top, match_data *md, int ims, BOOL condassert, const uschar *eptrb)    int offset_top, match_data *md, unsigned long int ims, BOOL condassert,
2960      const uschar *eptrb)
2961  {  {
2962  int original_ims = ims;   /* Save for resetting on ')' */  unsigned long int original_ims = ims;   /* Save for resetting on ')' */
2963    
2964  for (;;)  for (;;)
2965    {    {
# Line 3019  for (;;) Line 3074  for (;;)
3074      ecode += 2;      ecode += 2;
3075      break;      break;
3076    
3077      /* End of the pattern */      /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3078        an empty string - recursion will then try other alternatives, if any. */
3079    
3080      case OP_END:      case OP_END:
3081        if (md->notempty && eptr == md->start_match) return FALSE;
3082      md->end_match_ptr = eptr;          /* Record where we ended */      md->end_match_ptr = eptr;          /* Record where we ended */
3083      md->end_offset_top = offset_top;   /* and how many extracts were taken */      md->end_offset_top = offset_top;   /* and how many extracts were taken */
3084      return TRUE;      return TRUE;
# Line 4136  pcre_exec(const pcre *external_re, const Line 4193  pcre_exec(const pcre *external_re, const
4193  {  {
4194  int resetcount, ocount;  int resetcount, ocount;
4195  int first_char = -1;  int first_char = -1;
4196  int ims = 0;  int req_char = -1;
4197    int req_char2 = -1;
4198    unsigned long int ims = 0;
4199  match_data match_block;  match_data match_block;
4200  const uschar *start_bits = NULL;  const uschar *start_bits = NULL;
4201  const uschar *start_match = (const uschar *)subject + start_offset;  const uschar *start_match = (const uschar *)subject + start_offset;
4202  const uschar *end_subject;  const uschar *end_subject;
4203    const uschar *req_char_ptr = start_match - 1;
4204  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
4205  const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;  const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4206  BOOL using_temporary_offsets = FALSE;  BOOL using_temporary_offsets = FALSE;
# Line 4161  match_block.endonly = (re->options & PCR Line 4221  match_block.endonly = (re->options & PCR
4221    
4222  match_block.notbol = (options & PCRE_NOTBOL) != 0;  match_block.notbol = (options & PCRE_NOTBOL) != 0;
4223  match_block.noteol = (options & PCRE_NOTEOL) != 0;  match_block.noteol = (options & PCRE_NOTEOL) != 0;
4224    match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
4225    
4226  match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */  match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */
4227    
# Line 4231  if (!anchored) Line 4292  if (!anchored)
4292          start_bits = extra->start_bits;          start_bits = extra->start_bits;
4293    }    }
4294    
4295  /* Loop for unanchored matches; for anchored regexs the loop runs just once. */  /* For anchored or unanchored matches, there may be a "last known required
4296    character" set. If the PCRE_CASELESS is set, implying that the match starts
4297    caselessly, or if there are any changes of this flag within the regex, set up
4298    both cases of the character. Otherwise set the two values the same, which will
4299    avoid duplicate testing (which takes significant time). This covers the vast
4300    majority of cases. It will be suboptimal when the case flag changes in a regex
4301    and the required character in fact is caseful. */
4302    
4303    if ((re->options & PCRE_REQCHSET) != 0)
4304      {
4305      req_char = re->req_char;
4306      req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
4307        (re->tables + fcc_offset)[req_char] : req_char;
4308      }
4309    
4310    /* Loop for handling unanchored repeated matching attempts; for anchored regexs
4311    the loop runs just once. */
4312    
4313  do  do
4314    {    {
# Line 4267  do Line 4344  do
4344        }        }
4345      }      }
4346    
4347    /* Or to a non-unique first char */    /* Or to a non-unique first char after study */
4348    
4349    else if (start_bits != NULL)    else if (start_bits != NULL)
4350      {      {
# Line 4284  do Line 4361  do
4361    printf("\n");    printf("\n");
4362  #endif  #endif
4363    
4364      /* If req_char is set, we know that that character must appear in the subject
4365      for the match to succeed. If the first character is set, req_char must be
4366      later in the subject; otherwise the test starts at the match point. This
4367      optimization can save a huge amount of backtracking in patterns with nested
4368      unlimited repeats that aren't going to match. We don't know what the state of
4369      case matching may be when this character is hit, so test for it in both its
4370      cases if necessary. However, the different cased versions will not be set up
4371      unless PCRE_CASELESS was given or the casing state changes within the regex.
4372      Writing separate code makes it go faster, as does using an autoincrement and
4373      backing off on a match. */
4374    
4375      if (req_char >= 0)
4376        {
4377        register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
4378    
4379        /* We don't need to repeat the search if we haven't yet reached the
4380        place we found it at last time. */
4381    
4382        if (p > req_char_ptr)
4383          {
4384          /* Do a single test if no case difference is set up */
4385    
4386          if (req_char == req_char2)
4387            {
4388            while (p < end_subject)
4389              {
4390              if (*p++ == req_char) { p--; break; }
4391              }
4392            }
4393    
4394          /* Otherwise test for either case */
4395    
4396          else
4397            {
4398            while (p < end_subject)
4399              {
4400              register int pp = *p++;
4401              if (pp == req_char || pp == req_char2) { p--; break; }
4402              }
4403            }
4404    
4405          /* If we can't find the required character, break the matching loop */
4406    
4407          if (p >= end_subject) break;
4408    
4409          /* If we have found the required character, save the point where we
4410          found it, so that we don't search again next time round the loop if
4411          the start hasn't passed this character yet. */
4412    
4413          req_char_ptr = p;
4414          }
4415        }
4416    
4417    /* When a match occurs, substrings will be set for all internal extractions;    /* When a match occurs, substrings will be set for all internal extractions;
4418    we just need to set up the whole thing as substring 0 before returning. If    we just need to set up the whole thing as substring 0 before returning. If
4419    there were too many extractions, set the return code to zero. In the case    there were too many extractions, set the return code to zero. In the case
# Line 4291  do Line 4421  do
4421    those back references that we can. In this case there need not be overflow    those back references that we can. In this case there need not be overflow
4422    if certain parts of the pattern were not used. */    if certain parts of the pattern were not used. */
4423    
4424      match_block.start_match = start_match;
4425    if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))    if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))
4426      continue;      continue;
4427    

Legend:
Removed from v.36  
changed lines
  Added in v.37

  ViewVC Help
Powered by ViewVC 1.1.5