/[pcre]/code/trunk/pcre.c
ViewVC logotype

Diff of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 27 by nigel, Sat Feb 24 21:38:49 2007 UTC revision 37 by nigel, Sat Feb 24 21:39:09 2007 UTC
# Line 25  restrictions: Line 25  restrictions:
25    
26  3. Altered versions must be plainly marked as such, and must not be  3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.     misrepresented as being the original software.
28    
29    4. If PCRE is embedded in any software that is released under the GNU
30       General Purpose Licence (GPL), then the terms of that licence shall
31       supersede any condition above with which it is incompatible.
32  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
33  */  */
34    
# Line 107  static const short int escapes[] = { Line 111  static const short int escapes[] = {
111    
112  static BOOL  static BOOL
113    compile_regex(int, int, int *, uschar **, const uschar **, const char **,    compile_regex(int, int, int *, uschar **, const uschar **, const char **,
114      BOOL, int, compile_data *);      BOOL, int, int *, int *, compile_data *);
115    
116    
117    
# Line 158  return PCRE_VERSION; Line 162  return PCRE_VERSION;
162  *************************************************/  *************************************************/
163    
164  /* This function picks potentially useful data out of the private  /* This function picks potentially useful data out of the private
165  structure.  structure. The public options are passed back in an int - though the
166    re->options field has been expanded to a long int, all the public options
167    at the low end of it, and so even on 16-bit systems this will still be OK.
168    Therefore, I haven't changed the API for pcre_info().
169    
170  Arguments:  Arguments:
171    external_re   points to compiled code    external_re   points to compiled code
# Line 177  pcre_info(const pcre *external_re, int * Line 184  pcre_info(const pcre *external_re, int *
184  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
185  if (re == NULL) return PCRE_ERROR_NULL;  if (re == NULL) return PCRE_ERROR_NULL;
186  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
187  if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);  if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
188  if (first_char != NULL)  if (first_char != NULL)
189    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
190       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
# Line 528  for (;;) Line 535  for (;;)
535    
536      case OP_REVERSE:      case OP_REVERSE:
537      cc++;      cc++;
538        /* Fall through */
539    
540      case OP_CREF:      case OP_CREF:
541      case OP_OPT:      case OP_OPT:
# Line 623  Arguments: Line 631  Arguments:
631    ptrptr       points to the current pattern pointer    ptrptr       points to the current pattern pointer
632    errorptr     points to pointer to error message    errorptr     points to pointer to error message
633    optchanged   set to the value of the last OP_OPT item compiled    optchanged   set to the value of the last OP_OPT item compiled
634      reqchar      set to the last literal character required, else -1
635      countlits    set to count of mandatory literal characters
636    cd           contains pointers to tables    cd           contains pointers to tables
637    
638  Returns:       TRUE on success  Returns:       TRUE on success
# Line 632  Returns:       TRUE on success Line 642  Returns:       TRUE on success
642  static BOOL  static BOOL
643  compile_branch(int options, int *brackets, uschar **codeptr,  compile_branch(int options, int *brackets, uschar **codeptr,
644    const uschar **ptrptr, const char **errorptr, int *optchanged,    const uschar **ptrptr, const char **errorptr, int *optchanged,
645    compile_data *cd)    int *reqchar, int *countlits, compile_data *cd)
646  {  {
647  int repeat_type, op_type;  int repeat_type, op_type;
648  int repeat_min, repeat_max;  int repeat_min, repeat_max;
649  int bravalue, length;  int bravalue, length;
650  int greedy_default, greedy_non_default;  int greedy_default, greedy_non_default;
651    int prevreqchar;
652    int condcount = 0;
653    int subcountlits = 0;
654  register int c;  register int c;
655  register uschar *code = *codeptr;  register uschar *code = *codeptr;
656  uschar *tempcode;  uschar *tempcode;
# Line 651  uschar class[32]; Line 664  uschar class[32];
664  greedy_default = ((options & PCRE_UNGREEDY) != 0);  greedy_default = ((options & PCRE_UNGREEDY) != 0);
665  greedy_non_default = greedy_default ^ 1;  greedy_non_default = greedy_default ^ 1;
666    
667    /* Initialize no required char, and count of literals */
668    
669    *reqchar = prevreqchar = -1;
670    *countlits = 0;
671    
672  /* Switch on next character until the end of the branch */  /* Switch on next character until the end of the branch */
673    
674  for (;; ptr++)  for (;; ptr++)
# Line 660  for (;; ptr++) Line 678  for (;; ptr++)
678    int class_lastchar;    int class_lastchar;
679    int newoptions;    int newoptions;
680    int condref;    int condref;
681      int subreqchar;
682    
683    c = *ptr;    c = *ptr;
684    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
# Line 933  for (;; ptr++) Line 952  for (;; ptr++)
952        { repeat_type = greedy_non_default; ptr++; }        { repeat_type = greedy_non_default; ptr++; }
953      else repeat_type = greedy_default;      else repeat_type = greedy_default;
954    
     /* If the maximum is zero then the minimum must also be zero; Perl allows  
     this case, so we do too - by simply omitting the item altogether. */  
   
     if (repeat_max == 0) code = previous;  
   
955      /* If previous was a string of characters, chop off the last one and use it      /* If previous was a string of characters, chop off the last one and use it
956      as the subject of the repeat. If there was only one character, we can      as the subject of the repeat. If there was only one character, we can
957      abolish the previous item altogether. */      abolish the previous item altogether. A repeat with a zero minimum wipes
958        out any reqchar setting, backing up to the previous value. We must also
959        adjust the countlits value. */
960    
961      else if (*previous == OP_CHARS)      if (*previous == OP_CHARS)
962        {        {
963        int len = previous[1];        int len = previous[1];
964    
965          if (repeat_min == 0) *reqchar = prevreqchar;
966          *countlits += repeat_min - 1;
967    
968        if (len == 1)        if (len == 1)
969          {          {
970          c = previous[2];          c = previous[2];
# Line 983  for (;; ptr++) Line 1003  for (;; ptr++)
1003        code = previous;        code = previous;
1004    
1005        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
1006        repeat_type += op_type;      /* Combine both values for many cases */  
1007          /* If the maximum is zero then the minimum must also be zero; Perl allows
1008          this case, so we do too - by simply omitting the item altogether. */
1009    
1010          if (repeat_max == 0) goto END_REPEAT;
1011    
1012          /* Combine the op_type with the repeat_type */
1013    
1014          repeat_type += op_type;
1015    
1016        /* A minimum of zero is handled either as the special case * or ?, or as        /* A minimum of zero is handled either as the special case * or ?, or as
1017        an UPTO, with the maximum given. */        an UPTO, with the maximum given. */
# Line 1060  for (;; ptr++) Line 1088  for (;; ptr++)
1088        }        }
1089    
1090      /* If previous was a character class or a back reference, we put the repeat      /* If previous was a character class or a back reference, we put the repeat
1091      stuff after it. */      stuff after it, but just skip the item if the repeat was {0,0}. */
1092    
1093      else if (*previous == OP_CLASS || *previous == OP_REF)      else if (*previous == OP_CLASS || *previous == OP_REF)
1094        {        {
1095          if (repeat_max == 0)
1096            {
1097            code = previous;
1098            goto END_REPEAT;
1099            }
1100        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
1101          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
1102        else if (repeat_min == 1 && repeat_max == -1)        else if (repeat_min == 1 && repeat_max == -1)
# Line 1087  for (;; ptr++) Line 1120  for (;; ptr++)
1120      else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||      else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1121               (int)*previous == OP_COND)               (int)*previous == OP_COND)
1122        {        {
1123        int i, ketoffset = 0;        register int i;
1124          int ketoffset = 0;
1125        int len = code - previous;        int len = code - previous;
1126          uschar *bralink = NULL;
1127    
1128        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
1129        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
# Line 1103  for (;; ptr++) Line 1138  for (;; ptr++)
1138          ketoffset = code - ket;          ketoffset = code - ket;
1139          }          }
1140    
1141        /* If the minimum is greater than zero, and the maximum is unlimited or        /* The case of a zero minimum is special because of the need to stick
1142        equal to the minimum, the first copy remains where it is, and is        OP_BRAZERO in front of it, and because the group appears once in the
1143        replicated up to the minimum number of times. This case includes the +        data, whereas in other cases it appears the minimum number of times. For
1144        repeat, but of course no replication is needed in that case. */        this reason, it is simplest to treat this case separately, as otherwise
1145          the code gets far too mess. There are several special subcases when the
1146          minimum is zero. */
1147    
1148        if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))        if (repeat_min == 0)
1149          {          {
1150          for (i = 1; i < repeat_min; i++)          /* If we set up a required char from the bracket, we must back off
1151            to the previous value and reset the countlits value too. */
1152    
1153            if (subcountlits > 0)
1154            {            {
1155            memcpy(code, previous, len);            *reqchar = prevreqchar;
1156            code += len;            *countlits -= subcountlits;
1157            }            }
         }  
1158    
1159        /* If the minimum is zero, stick BRAZERO in front of the first copy.          /* If the maximum is also zero, we just omit the group from the output
1160        Then, if there is a fixed upper limit, replicated up to that many times,          altogether. */
       sticking BRAZERO in front of all the optional ones. */  
1161    
1162        else          if (repeat_max == 0)
1163          {            {
1164          if (repeat_min == 0)            code = previous;
1165              goto END_REPEAT;
1166              }
1167    
1168            /* If the maximum is 1 or unlimited, we just have to stick in the
1169            BRAZERO and do no more at this point. */
1170    
1171            if (repeat_max <= 1)
1172            {            {
1173            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
1174            code++;            code++;
1175            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
1176            }            }
1177    
1178            /* If the maximum is greater than 1 and limited, we have to replicate
1179            in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1180            The first one has to be handled carefully because it's the original
1181            copy, which has to be moved up. The remainder can be handled by code
1182            that is common with the non-zero minimum case below. We just have to
1183            adjust the value or repeat_max, since one less copy is required. */
1184    
1185            else
1186              {
1187              int offset;
1188              memmove(previous+4, previous, len);
1189              code += 4;
1190              *previous++ = OP_BRAZERO + repeat_type;
1191              *previous++ = OP_BRA;
1192    
1193              /* We chain together the bracket offset fields that have to be
1194              filled in later when the ends of the brackets are reached. */
1195    
1196              offset = (bralink == NULL)? 0 : previous - bralink;
1197              bralink = previous;
1198              *previous++ = offset >> 8;
1199              *previous++ = offset & 255;
1200              }
1201    
1202            repeat_max--;
1203            }
1204    
1205          /* If the minimum is greater than zero, replicate the group as many
1206          times as necessary, and adjust the maximum to the number of subsequent
1207          copies that we need. */
1208    
1209          else
1210            {
1211          for (i = 1; i < repeat_min; i++)          for (i = 1; i < repeat_min; i++)
1212            {            {
1213            memcpy(code, previous, len);            memcpy(code, previous, len);
1214            code += len;            code += len;
1215            }            }
1216            if (repeat_max > 0) repeat_max -= repeat_min;
1217            }
1218    
1219          /* This code is common to both the zero and non-zero minimum cases. If
1220          the maximum is limited, it replicates the group in a nested fashion,
1221          remembering the bracket starts on a stack. In the case of a zero minimum,
1222          the first one was set up above. In all cases the repeat_max now specifies
1223          the number of additional copies needed. */
1224    
1225          for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)        if (repeat_max >= 0)
1226            {
1227            for (i = repeat_max - 1; i >= 0; i--)
1228            {            {
1229            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
1230    
1231              /* All but the final copy start a new nesting, maintaining the
1232              chain of brackets outstanding. */
1233    
1234              if (i != 0)
1235                {
1236                int offset;
1237                *code++ = OP_BRA;
1238                offset = (bralink == NULL)? 0 : code - bralink;
1239                bralink = code;
1240                *code++ = offset >> 8;
1241                *code++ = offset & 255;
1242                }
1243    
1244            memcpy(code, previous, len);            memcpy(code, previous, len);
1245            code += len;            code += len;
1246            }            }
1247    
1248            /* Now chain through the pending brackets, and fill in their length
1249            fields (which are holding the chain links pro tem). */
1250    
1251            while (bralink != NULL)
1252              {
1253              int oldlinkoffset;
1254              int offset = code - bralink + 1;
1255              uschar *bra = code - offset;
1256              oldlinkoffset = (bra[1] << 8) + bra[2];
1257              bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1258              *code++ = OP_KET;
1259              *code++ = bra[1] = offset >> 8;
1260              *code++ = bra[2] = (offset & 255);
1261              }
1262          }          }
1263    
1264        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
# Line 1149  for (;; ptr++) Line 1266  for (;; ptr++)
1266        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
1267        correct offset was computed above. */        correct offset was computed above. */
1268    
1269        if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type;        else code[-ketoffset] = OP_KETRMAX + repeat_type;
1270        }        }
1271    
1272      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 1162  for (;; ptr++) Line 1279  for (;; ptr++)
1279    
1280      /* In all case we no longer have a previous item. */      /* In all case we no longer have a previous item. */
1281    
1282        END_REPEAT:
1283      previous = NULL;      previous = NULL;
1284      break;      break;
1285    
# Line 1330  for (;; ptr++) Line 1448  for (;; ptr++)
1448           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
1449            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1450           condref,                      /* Condition reference number */           condref,                      /* Condition reference number */
1451             &subreqchar,                  /* For possible last char */
1452             &subcountlits,                /* For literal count */
1453           cd))                          /* Tables block */           cd))                          /* Tables block */
1454        goto FAILED;        goto FAILED;
1455    
# Line 1343  for (;; ptr++) Line 1463  for (;; ptr++)
1463    
1464      if (bravalue == OP_COND)      if (bravalue == OP_COND)
1465        {        {
       int branchcount = 0;  
1466        uschar *tc = code;        uschar *tc = code;
1467          condcount = 0;
1468    
1469        do {        do {
1470           branchcount++;           condcount++;
1471           tc += (tc[1] << 8) | tc[2];           tc += (tc[1] << 8) | tc[2];
1472           }           }
1473        while (*tc != OP_KET);        while (*tc != OP_KET);
1474    
1475        if (branchcount > 2)        if (condcount > 2)
1476          {          {
1477          *errorptr = ERR27;          *errorptr = ERR27;
1478          goto FAILED;          goto FAILED;
1479          }          }
1480        }        }
1481    
1482        /* Handle updating of the required character. If the subpattern didn't
1483        set one, leave it as it was. Otherwise, update it for normal brackets of
1484        all kinds, forward assertions, and conditions with two branches. Don't
1485        update the literal count for forward assertions, however. If the bracket
1486        is followed by a quantifier with zero repeat, we have to back off. Hence
1487        the definition of prevreqchar and subcountlits outside the main loop so
1488        that they can be accessed for the back off. */
1489    
1490        if (subreqchar > 0 &&
1491             (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1492             (bravalue == OP_COND && condcount == 2)))
1493          {
1494          prevreqchar = *reqchar;
1495          *reqchar = subreqchar;
1496          if (bravalue != OP_ASSERT) *countlits += subcountlits;
1497          }
1498    
1499      /* Now update the main code pointer to the end of the group. */      /* Now update the main code pointer to the end of the group. */
1500    
1501      code = tempcode;      code = tempcode;
# Line 1453  for (;; ptr++) Line 1590  for (;; ptr++)
1590    
1591      while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);      while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
1592    
1593        /* Update the last character and the count of literals */
1594    
1595        prevreqchar = (length > 1)? code[-2] : *reqchar;
1596        *reqchar = code[-1];
1597        *countlits += length;
1598    
1599      /* Compute the length and set it in the data vector, and advance to      /* Compute the length and set it in the data vector, and advance to
1600      the next state. */      the next state. */
1601    
# Line 1496  Argument: Line 1639  Argument:
1639    errorptr    -> pointer to error message    errorptr    -> pointer to error message
1640    lookbehind  TRUE if this is a lookbehind assertion    lookbehind  TRUE if this is a lookbehind assertion
1641    condref     > 0 for OPT_CREF setting at start of conditional group    condref     > 0 for OPT_CREF setting at start of conditional group
1642      reqchar     -> place to put the last required character, or a negative number
1643      countlits   -> place to put the shortest literal count of any branch
1644    cd          points to the data block with tables pointers    cd          points to the data block with tables pointers
1645    
1646  Returns:      TRUE on success  Returns:      TRUE on success
# Line 1504  Returns:      TRUE on success Line 1649  Returns:      TRUE on success
1649  static BOOL  static BOOL
1650  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
1651    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
1652    compile_data *cd)    int *reqchar, int *countlits, compile_data *cd)
1653  {  {
1654  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
1655  uschar *code = *codeptr;  uschar *code = *codeptr;
# Line 1512  uschar *last_branch = code; Line 1657  uschar *last_branch = code;
1657  uschar *start_bracket = code;  uschar *start_bracket = code;
1658  uschar *reverse_count = NULL;  uschar *reverse_count = NULL;
1659  int oldoptions = options & PCRE_IMS;  int oldoptions = options & PCRE_IMS;
1660    int branchreqchar, branchcountlits;
1661    
1662    *reqchar = -1;
1663    *countlits = INT_MAX;
1664  code += 3;  code += 3;
1665    
1666  /* At the start of a reference-based conditional group, insert the reference  /* At the start of a reference-based conditional group, insert the reference
# Line 1551  for (;;) Line 1699  for (;;)
1699    
1700    /* Now compile the branch */    /* Now compile the branch */
1701    
1702    if (!compile_branch(options,brackets,&code,&ptr,errorptr,&optchanged,cd))    if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
1703          &branchreqchar, &branchcountlits, cd))
1704      {      {
1705      *ptrptr = ptr;      *ptrptr = ptr;
1706      return FALSE;      return FALSE;
# Line 1563  for (;;) Line 1712  for (;;)
1712    last_branch[1] = length >> 8;    last_branch[1] = length >> 8;
1713    last_branch[2] = length & 255;    last_branch[2] = length & 255;
1714    
1715      /* Save the last required character if all branches have the same; a current
1716      value of -1 means unset, while -2 means "previous branch had no last required
1717      char".  */
1718    
1719      if (*reqchar != -2)
1720        {
1721        if (branchreqchar >= 0)
1722          {
1723          if (*reqchar == -1) *reqchar = branchreqchar;
1724          else if (*reqchar != branchreqchar) *reqchar = -2;
1725          }
1726        else *reqchar = -2;
1727        }
1728    
1729      /* Keep the shortest literal count */
1730    
1731      if (branchcountlits < *countlits) *countlits = branchcountlits;
1732      DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
1733    
1734    /* If lookbehind, check that this branch matches a fixed-length string,    /* If lookbehind, check that this branch matches a fixed-length string,
1735    and put the length into the OP_REVERSE item. Temporarily mark the end of    and put the length into the OP_REVERSE item. Temporarily mark the end of
1736    the branch with OP_END. */    the branch with OP_END. */
# Line 1657  for (;;) Line 1825  for (;;)
1825      code += 2;      code += 2;
1826      break;      break;
1827    
1828        case OP_WORD_BOUNDARY:
1829        case OP_NOT_WORD_BOUNDARY:
1830        code++;
1831        break;
1832    
1833      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
1834      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1835      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
# Line 1684  all of whose alternatives start with OP_ Line 1857  all of whose alternatives start with OP_
1857  it's anchored. However, if this is a multiline pattern, then only OP_SOD  it's anchored. However, if this is a multiline pattern, then only OP_SOD
1858  counts, since OP_CIRC can match in the middle.  counts, since OP_CIRC can match in the middle.
1859    
1860  A branch is also implicitly anchored if it starts with .* because that will try  A branch is also implicitly anchored if it starts with .* and DOTALL is set,
1861  the rest of the pattern at all possible matching points, so there is no point  because that will try the rest of the pattern at all possible matching points,
1862  trying them again.  so there is no point trying them again.
1863    
1864  Arguments:  Arguments:
1865    code       points to start of expression (the bracket)    code       points to start of expression (the bracket)
# Line 1704  do { Line 1877  do {
1877     register int op = *scode;     register int op = *scode;
1878     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1879       { if (!is_anchored(scode, options)) return FALSE; }       { if (!is_anchored(scode, options)) return FALSE; }
1880     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
1881                (*options & PCRE_DOTALL) != 0)
1882       { if (scode[1] != OP_ANY) return FALSE; }       { if (scode[1] != OP_ANY) return FALSE; }
1883     else if (op != OP_SOD &&     else if (op != OP_SOD &&
1884             ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))             ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
# Line 1718  return TRUE; Line 1892  return TRUE;
1892    
1893    
1894  /*************************************************  /*************************************************
1895  *     Check for start with \n line expression    *  *         Check for starting with ^ or .*        *
1896  *************************************************/  *************************************************/
1897    
1898  /* This is called for multiline expressions to try to find out if every branch  /* This is called to find out if every branch starts with ^ or .* so that
1899  starts with ^ so that "first char" processing can be done to speed things up.  "first char" processing can be done to speed things up in multiline
1900    matching and for non-DOTALL patterns that start with .* (which must start at
1901    the beginning or after \n).
1902    
1903  Argument:  points to start of expression (the bracket)  Argument:  points to start of expression (the bracket)
1904  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
# Line 1736  do { Line 1912  do {
1912     register int op = *scode;     register int op = *scode;
1913     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1914       { if (!is_startline(scode)) return FALSE; }       { if (!is_startline(scode)) return FALSE; }
1915       else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1916         { if (scode[1] != OP_ANY) return FALSE; }
1917     else if (op != OP_CIRC) return FALSE;     else if (op != OP_CIRC) return FALSE;
1918     code += (code[1] << 8) + code[2];     code += (code[1] << 8) + code[2];
1919     }     }
# Line 1834  pcre_compile(const char *pattern, int op Line 2012  pcre_compile(const char *pattern, int op
2012  real_pcre *re;  real_pcre *re;
2013  int length = 3;      /* For initial BRA plus length */  int length = 3;      /* For initial BRA plus length */
2014  int runlength;  int runlength;
2015  int c, size;  int c, size, reqchar, countlits;
2016  int bracount = 0;  int bracount = 0;
2017  int top_backref = 0;  int top_backref = 0;
2018  int branch_extra = 0;  int branch_extra = 0;
# Line 2174  while ((c = *(++ptr)) != 0) Line 2352  while ((c = *(++ptr)) != 0)
2352              will lead to an over-estimate on the length, but this shouldn't              will lead to an over-estimate on the length, but this shouldn't
2353              matter very much. We also have to allow for resetting options at              matter very much. We also have to allow for resetting options at
2354              the start of any alternations, which we do by setting              the start of any alternations, which we do by setting
2355              branch_newextra to 2. */              branch_newextra to 2. Finally, we record whether the case-dependent
2356                flag ever changes within the regex. This is used by the "required
2357                character" code. */
2358    
2359              case ':':              case ':':
2360              if (((set|unset) & PCRE_IMS) != 0)              if (((set|unset) & PCRE_IMS) != 0)
2361                {                {
2362                length += 4;                length += 4;
2363                branch_newextra = 2;                branch_newextra = 2;
2364                  if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2365                }                }
2366              goto END_OPTIONS;              goto END_OPTIONS;
2367    
# Line 2268  while ((c = *(++ptr)) != 0) Line 2449  while ((c = *(++ptr)) != 0)
2449        else if (c == '+') { maxval = -1; ptr++; }        else if (c == '+') { maxval = -1; ptr++; }
2450        else if (c == '?') { minval = 0; ptr++; }        else if (c == '?') { minval = 0; ptr++; }
2451    
2452        /* If there is a minimum > 1 we have to replicate up to minval-1 times;        /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2453        if there is a limited maximum we have to replicate up to maxval-1 times        group, and if the maximum is greater than zero, we have to replicate
2454        and allow for a BRAZERO item before each optional copy, as we also have        maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2455        to do before the first copy if the minimum is zero. */        bracket set - hence the 7. */
2456    
2457        if (minval == 0) length++;        if (minval == 0)
2458          else if (minval > 1) length += (minval - 1) * duplength;          {
2459        if (maxval > minval) length += (maxval - minval) * (duplength + 1);          length++;
2460            if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2461            }
2462    
2463          /* When the minimum is greater than zero, 1 we have to replicate up to
2464          minval-1 times, with no additions required in the copies. Then, if
2465          there is a limited maximum we have to replicate up to maxval-1 times
2466          allowing for a BRAZERO item before each optional copy and nesting
2467          brackets for all but one of the optional copies. */
2468    
2469          else
2470            {
2471            length += (minval - 1) * duplength;
2472            if (maxval > minval)   /* Need this test as maxval=-1 means no limit */
2473              length += (maxval - minval) * (duplength + 7) - 6;
2474            }
2475        }        }
2476      continue;      continue;
2477    
# Line 2366  code = re->code; Line 2562  code = re->code;
2562  *code = OP_BRA;  *code = OP_BRA;
2563  bracount = 0;  bracount = 0;
2564  (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,  (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
2565    &compile_block);    &reqchar, &countlits, &compile_block);
2566  re->top_bracket = bracount;  re->top_bracket = bracount;
2567  re->top_backref = top_backref;  re->top_backref = top_backref;
2568    
# Line 2398  if (*errorptr != NULL) Line 2594  if (*errorptr != NULL)
2594    return NULL;    return NULL;
2595    }    }
2596    
2597  /* If the anchored option was not passed, set flag if we can determine that it  /* If the anchored option was not passed, set flag if we can determine that the
2598  is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if  pattern is anchored by virtue of ^ characters or \A or anything else (such as
2599  we can determine what the first character has to be, because that speeds up  starting with .* when DOTALL is set).
2600  unanchored matches no end. In the case of multiline matches, an alternative is  
2601  to set the PCRE_STARTLINE flag if all branches start with ^. */  Otherwise, see if we can determine what the first character has to be, because
2602    that speeds up unanchored matches no end. If not, see if we can set the
2603    PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
2604    start with ^. and also when all branches start with .* for non-DOTALL matches.
2605    */
2606    
2607  if ((options & PCRE_ANCHORED) == 0)  if ((options & PCRE_ANCHORED) == 0)
2608    {    {
# Line 2422  if ((options & PCRE_ANCHORED) == 0) Line 2622  if ((options & PCRE_ANCHORED) == 0)
2622      }      }
2623    }    }
2624    
2625    /* Save the last required character if there are at least two literal
2626    characters on all paths, or if there is no first character setting. */
2627    
2628    if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
2629      {
2630      re->req_char = reqchar;
2631      re->options |= PCRE_REQCHSET;
2632      }
2633    
2634  /* Print out the compiled data for debugging */  /* Print out the compiled data for debugging */
2635    
2636  #ifdef DEBUG  #ifdef DEBUG
# Line 2431  printf("Length = %d top_bracket = %d top Line 2640  printf("Length = %d top_bracket = %d top
2640    
2641  if (re->options != 0)  if (re->options != 0)
2642    {    {
2643    printf("%s%s%s%s%s%s%s%s\n",    printf("%s%s%s%s%s%s%s%s%s\n",
2644      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2645      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2646        ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
2647      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2648      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2649      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
# Line 2448  if ((re->options & PCRE_FIRSTSET) != 0) Line 2658  if ((re->options & PCRE_FIRSTSET) != 0)
2658      else printf("First char = \\x%02x\n", re->first_char);      else printf("First char = \\x%02x\n", re->first_char);
2659    }    }
2660    
2661    if ((re->options & PCRE_REQCHSET) != 0)
2662      {
2663      if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
2664        else printf("Req char = \\x%02x\n", re->req_char);
2665      }
2666    
2667  code_end = code;  code_end = code;
2668  code_base = code = re->code;  code_base = code = re->code;
2669    
# Line 2681  Returns:      TRUE if matched Line 2897  Returns:      TRUE if matched
2897    
2898  static BOOL  static BOOL
2899  match_ref(int offset, register const uschar *eptr, int length, match_data *md,  match_ref(int offset, register const uschar *eptr, int length, match_data *md,
2900    int ims)    unsigned long int ims)
2901  {  {
2902  const uschar *p = md->start_subject + md->offset_vector[offset];  const uschar *p = md->start_subject + md->offset_vector[offset];
2903    
# Line 2740  Returns:       TRUE if matched Line 2956  Returns:       TRUE if matched
2956    
2957  static BOOL  static BOOL
2958  match(register const uschar *eptr, register const uschar *ecode,  match(register const uschar *eptr, register const uschar *ecode,
2959    int offset_top, match_data *md, int ims, BOOL condassert, const uschar *eptrb)    int offset_top, match_data *md, unsigned long int ims, BOOL condassert,
2960      const uschar *eptrb)
2961  {  {
2962  int original_ims = ims;   /* Save for resetting on ')' */  unsigned long int original_ims = ims;   /* Save for resetting on ')' */
2963    
2964  for (;;)  for (;;)
2965    {    {
# Line 2771  for (;;) Line 2988  for (;;)
2988      int number = op - OP_BRA;      int number = op - OP_BRA;
2989      int offset = number << 1;      int offset = number << 1;
2990    
2991      DPRINTF(("start bracket %d\n", number));  #ifdef DEBUG
2992        printf("start bracket %d subject=", number);
2993        pchars(eptr, 16, TRUE, md);
2994        printf("\n");
2995    #endif
2996    
2997      if (offset < md->offset_max)      if (offset < md->offset_max)
2998        {        {
# Line 2853  for (;;) Line 3074  for (;;)
3074      ecode += 2;      ecode += 2;
3075      break;      break;
3076    
3077      /* End of the pattern */      /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3078        an empty string - recursion will then try other alternatives, if any. */
3079    
3080      case OP_END:      case OP_END:
3081        if (md->notempty && eptr == md->start_match) return FALSE;
3082      md->end_match_ptr = eptr;          /* Record where we ended */      md->end_match_ptr = eptr;          /* Record where we ended */
3083      md->end_offset_top = offset_top;   /* and how many extracts were taken */      md->end_offset_top = offset_top;   /* and how many extracts were taken */
3084      return TRUE;      return TRUE;
# Line 3952  Arguments: Line 4175  Arguments:
4175    external_extra  points to "hints" from pcre_study() or is NULL    external_extra  points to "hints" from pcre_study() or is NULL
4176    subject         points to the subject string    subject         points to the subject string
4177    length          length of subject string (may contain binary zeros)    length          length of subject string (may contain binary zeros)
4178      start_offset    where to start in the subject string
4179    options         option bits    options         option bits
4180    offsets         points to a vector of ints to be filled in with offsets    offsets         points to a vector of ints to be filled in with offsets
4181    offsetcount     the number of elements in the vector    offsetcount     the number of elements in the vector
# Line 3964  Returns:          > 0 => success; value Line 4188  Returns:          > 0 => success; value
4188    
4189  int  int
4190  pcre_exec(const pcre *external_re, const pcre_extra *external_extra,  pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4191    const char *subject, int length, int options, int *offsets, int offsetcount)    const char *subject, int length, int start_offset, int options, int *offsets,
4192      int offsetcount)
4193  {  {
4194  int resetcount, ocount;  int resetcount, ocount;
4195  int first_char = -1;  int first_char = -1;
4196  int ims = 0;  int req_char = -1;
4197    int req_char2 = -1;
4198    unsigned long int ims = 0;
4199  match_data match_block;  match_data match_block;
4200  const uschar *start_bits = NULL;  const uschar *start_bits = NULL;
4201  const uschar *start_match = (const uschar *)subject;  const uschar *start_match = (const uschar *)subject + start_offset;
4202  const uschar *end_subject;  const uschar *end_subject;
4203    const uschar *req_char_ptr = start_match - 1;
4204  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
4205  const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;  const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4206  BOOL using_temporary_offsets = FALSE;  BOOL using_temporary_offsets = FALSE;
# Line 3993  match_block.endonly = (re->options & PCR Line 4221  match_block.endonly = (re->options & PCR
4221    
4222  match_block.notbol = (options & PCRE_NOTBOL) != 0;  match_block.notbol = (options & PCRE_NOTBOL) != 0;
4223  match_block.noteol = (options & PCRE_NOTEOL) != 0;  match_block.noteol = (options & PCRE_NOTEOL) != 0;
4224    match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
4225    
4226  match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */  match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */
4227    
# Line 4063  if (!anchored) Line 4292  if (!anchored)
4292          start_bits = extra->start_bits;          start_bits = extra->start_bits;
4293    }    }
4294    
4295  /* Loop for unanchored matches; for anchored regexps the loop runs just once. */  /* For anchored or unanchored matches, there may be a "last known required
4296    character" set. If the PCRE_CASELESS is set, implying that the match starts
4297    caselessly, or if there are any changes of this flag within the regex, set up
4298    both cases of the character. Otherwise set the two values the same, which will
4299    avoid duplicate testing (which takes significant time). This covers the vast
4300    majority of cases. It will be suboptimal when the case flag changes in a regex
4301    and the required character in fact is caseful. */
4302    
4303    if ((re->options & PCRE_REQCHSET) != 0)
4304      {
4305      req_char = re->req_char;
4306      req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
4307        (re->tables + fcc_offset)[req_char] : req_char;
4308      }
4309    
4310    /* Loop for handling unanchored repeated matching attempts; for anchored regexs
4311    the loop runs just once. */
4312    
4313  do  do
4314    {    {
# Line 4099  do Line 4344  do
4344        }        }
4345      }      }
4346    
4347    /* Or to a non-unique first char */    /* Or to a non-unique first char after study */
4348    
4349    else if (start_bits != NULL)    else if (start_bits != NULL)
4350      {      {
# Line 4116  do Line 4361  do
4361    printf("\n");    printf("\n");
4362  #endif  #endif
4363    
4364      /* If req_char is set, we know that that character must appear in the subject
4365      for the match to succeed. If the first character is set, req_char must be
4366      later in the subject; otherwise the test starts at the match point. This
4367      optimization can save a huge amount of backtracking in patterns with nested
4368      unlimited repeats that aren't going to match. We don't know what the state of
4369      case matching may be when this character is hit, so test for it in both its
4370      cases if necessary. However, the different cased versions will not be set up
4371      unless PCRE_CASELESS was given or the casing state changes within the regex.
4372      Writing separate code makes it go faster, as does using an autoincrement and
4373      backing off on a match. */
4374    
4375      if (req_char >= 0)
4376        {
4377        register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
4378    
4379        /* We don't need to repeat the search if we haven't yet reached the
4380        place we found it at last time. */
4381    
4382        if (p > req_char_ptr)
4383          {
4384          /* Do a single test if no case difference is set up */
4385    
4386          if (req_char == req_char2)
4387            {
4388            while (p < end_subject)
4389              {
4390              if (*p++ == req_char) { p--; break; }
4391              }
4392            }
4393    
4394          /* Otherwise test for either case */
4395    
4396          else
4397            {
4398            while (p < end_subject)
4399              {
4400              register int pp = *p++;
4401              if (pp == req_char || pp == req_char2) { p--; break; }
4402              }
4403            }
4404    
4405          /* If we can't find the required character, break the matching loop */
4406    
4407          if (p >= end_subject) break;
4408    
4409          /* If we have found the required character, save the point where we
4410          found it, so that we don't search again next time round the loop if
4411          the start hasn't passed this character yet. */
4412    
4413          req_char_ptr = p;
4414          }
4415        }
4416    
4417    /* When a match occurs, substrings will be set for all internal extractions;    /* When a match occurs, substrings will be set for all internal extractions;
4418    we just need to set up the whole thing as substring 0 before returning. If    we just need to set up the whole thing as substring 0 before returning. If
4419    there were too many extractions, set the return code to zero. In the case    there were too many extractions, set the return code to zero. In the case
# Line 4123  do Line 4421  do
4421    those back references that we can. In this case there need not be overflow    those back references that we can. In this case there need not be overflow
4422    if certain parts of the pattern were not used. */    if certain parts of the pattern were not used. */
4423    
4424      match_block.start_match = start_match;
4425    if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))    if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))
4426      continue;      continue;
4427    

Legend:
Removed from v.27  
changed lines
  Added in v.37

  ViewVC Help
Powered by ViewVC 1.1.5