# Diff of /code/trunk/pcre.c

revision 29 by nigel, Sat Feb 24 21:38:53 2007 UTC revision 35 by nigel, Sat Feb 24 21:39:05 2007 UTC
# Line 1091  for (;; ptr++) Line 1091  for (;; ptr++)
1091      else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||      else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1092               (int)*previous == OP_COND)               (int)*previous == OP_COND)
1093        {        {
1094        int i, ketoffset = 0;        register int i;
1095          int ketoffset = 0;
1096        int len = code - previous;        int len = code - previous;
1098
1099        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
1100        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
# Line 1107  for (;; ptr++) Line 1109  for (;; ptr++)
1109          ketoffset = code - ket;          ketoffset = code - ket;
1110          }          }
1111
1112          /* The case of a zero minimum is special because of the need to stick
1113          OP_BRAZERO in front of it, and because the group appears once in the
1114          data, whereas in other cases it appears the minimum number of times. For
1115          this reason, it is simplest to treat this case separately, as otherwise
1116          the code gets far too messy. There are several special subcases when the
1117          minimum is zero. */
1118
1119          if (repeat_min == 0)
1120            {
1121            /* If the maximum is also zero, we just omit the group from the output
1122            altogether. */
1123
1124            if (repeat_max == 0)
1125              {
1126              code = previous;
1127              previous = NULL;
1128              break;
1129              }
1130
1131            /* If the maximum is 1 or unlimited, we just have to stick in the
1132            BRAZERO and do no more at this point. */
1133
1134            if (repeat_max <= 1)
1135              {
1136              memmove(previous+1, previous, len);
1137              code++;
1138              *previous++ = OP_BRAZERO + repeat_type;
1139              }
1140
1141            /* If the maximum is greater than 1 and limited, we have to replicate
1142            in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1143            The first one has to be handled carefully because it's the original
1144            copy, which has to be moved up. The remainder can be handled by code
1145            that is common with the non-zero minimum case below. We just have to
1146            adjust the value of repeat_max, since one less copy is required. */
1147
1148            else
1149              {
1150              int offset;
1151              memmove(previous+4, previous, len);
1152              code += 4;
1153              *previous++ = OP_BRAZERO + repeat_type;
1154              *previous++ = OP_BRA;
1155
1156              /* We chain together the bracket offset fields that have to be
1157              filled in later when the ends of the brackets are reached. */
1158
1161              *previous++ = offset >> 8;
1162              *previous++ = offset & 255;
1163              }
1164
1165            repeat_max--;
1166            }
1167
1168          /* If the minimum is greater than zero, replicate the group as many
1169          times as necessary, and adjust the maximum to the number of subsequent
1170          copies that we need. */
1171
1172          else
1173            {
1174            for (i = 1; i < repeat_min; i++)
1175              {
1176              memcpy(code, previous, len);
1177              code += len;
1178              }
1179            if (repeat_max > 0) repeat_max -= repeat_min;
1180            }
1181
1182          /* This code is common to both the zero and non-zero minimum cases. If
1183          the maximum is limited, it replicates the group in a nested fashion,
1184          remembering the bracket starts on a stack. In the case of a zero minimum,
1185          the first one was set up above. In all cases the repeat_max now specifies
1186          the number of additional copies needed. */
1187
1188          if (repeat_max >= 0)
1189            {
1190            for (i = repeat_max - 1; i >= 0; i--)
1191              {
1192              *code++ = OP_BRAZERO + repeat_type;
1193
1194              /* All but the final copy start a new nesting, maintaining the
1195              chain of brackets outstanding. */
1196
1197              if (i != 0)
1198                {
1199                int offset;
1200                *code++ = OP_BRA;
1203                *code++ = offset >> 8;
1204                *code++ = offset & 255;
1205                }
1206
1207              memcpy(code, previous, len);
1208              code += len;
1209              }
1210
1211            /* Now chain through the pending brackets, and fill in their length
1212            fields (which are holding the chain links pro tem). */
1213
1215              {
1217              int offset = code - bralink + 1;
1218              uschar *bra = code - offset;
1219              oldlinkoffset = (bra[1] << 8) + bra[2];
1221              *code++ = OP_KET;
1222              *code++ = bra[1] = offset >> 8;
1223              *code++ = bra[2] = (offset & 255);
1224              }
1225            }
1226
1227          /* If the maximum is unlimited, set a repeater in the final copy. We
1228          can't just offset backwards from the current code point, because we
1229          don't know if there's been an options resetting after the ket. The
1230          correct offset was computed above. */
1231
1232          else code[-ketoffset] = OP_KETRMAX + repeat_type;
1233
1234
1235    #ifdef NEVER
1236        /* If the minimum is greater than zero, and the maximum is unlimited or        /* If the minimum is greater than zero, and the maximum is unlimited or
1237        equal to the minimum, the first copy remains where it is, and is        equal to the minimum, the first copy remains where it is, and is
1238        replicated up to the minimum number of times. This case includes the +        replicated up to the minimum number of times. This case includes the +
# Line 1154  for (;; ptr++) Line 1280  for (;; ptr++)
1280        correct offset was computed above. */        correct offset was computed above. */
1281
1282        if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type;        if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type;
1283    #endif
1284
1285
1286        }        }
1287
1288      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 1661  for (;;) Line 1790  for (;;)
1790      code += 2;      code += 2;
1791      break;      break;
1792
1793        case OP_WORD_BOUNDARY:
1794        case OP_NOT_WORD_BOUNDARY:
1795        code++;
1796        break;
1797
1798      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
1799      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1800      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
1822  it's anchored. However, if this is a multiline pattern, then only OP_SOD  it's anchored. However, if this is a multiline pattern, then only OP_SOD
1823  counts, since OP_CIRC can match in the middle.  counts, since OP_CIRC can match in the middle.
1824
1825  A branch is also implicitly anchored if it starts with .* because that will try  A branch is also implicitly anchored if it starts with .* and DOTALL is set,
1826  the rest of the pattern at all possible matching points, so there is no point  because that will try the rest of the pattern at all possible matching points,
1827  trying them again.  so there is no point trying them again.
1828
1829  Arguments:  Arguments:
1830    code       points to start of expression (the bracket)    code       points to start of expression (the bracket)
# Line 1708  do { Line 1842  do {
1842     register int op = *scode;     register int op = *scode;
1843     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1844       { if (!is_anchored(scode, options)) return FALSE; }       { if (!is_anchored(scode, options)) return FALSE; }
1845     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
1846                (*options & PCRE_DOTALL) != 0)
1847       { if (scode[1] != OP_ANY) return FALSE; }       { if (scode[1] != OP_ANY) return FALSE; }
1848     else if (op != OP_SOD &&     else if (op != OP_SOD &&
1849             ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))             ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
# Line 1722  return TRUE; Line 1857  return TRUE;
1857
1858
1859  /*************************************************  /*************************************************
1860  *     Check for start with \n line expression    *  *         Check for starting with ^ or .*        *
1861  *************************************************/  *************************************************/
1862
1863  /* This is called for multiline expressions to try to find out if every branch  /* This is called to find out if every branch starts with ^ or .* so that
1864  starts with ^ so that "first char" processing can be done to speed things up.  "first char" processing can be done to speed things up in multiline
1865    matching and for non-DOTALL patterns that start with .* (which must start at
1866    the beginning or after \n).
1867
1868  Argument:  points to start of expression (the bracket)  Argument:  points to start of expression (the bracket)
1869  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
# Line 1740  do { Line 1877  do {
1877     register int op = *scode;     register int op = *scode;
1878     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1879       { if (!is_startline(scode)) return FALSE; }       { if (!is_startline(scode)) return FALSE; }
1880       else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1881         { if (scode[1] != OP_ANY) return FALSE; }
1882     else if (op != OP_CIRC) return FALSE;     else if (op != OP_CIRC) return FALSE;
1883     code += (code[1] << 8) + code[2];     code += (code[1] << 8) + code[2];
1884     }     }
# Line 2272  while ((c = *(++ptr)) != 0) Line 2411  while ((c = *(++ptr)) != 0)
2411        else if (c == '+') { maxval = -1; ptr++; }        else if (c == '+') { maxval = -1; ptr++; }
2412        else if (c == '?') { minval = 0; ptr++; }        else if (c == '?') { minval = 0; ptr++; }
2413
2414        /* If there is a minimum > 1 we have to replicate up to minval-1 times;        /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2415        if there is a limited maximum we have to replicate up to maxval-1 times        group, and if the maximum is greater than zero, we have to replicate
2416        and allow for a BRAZERO item before each optional copy, as we also have        maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2417        to do before the first copy if the minimum is zero. */        bracket set - hence the 7. */
2418
2419        if (minval == 0) length++;        if (minval == 0)
2420          else if (minval > 1) length += (minval - 1) * duplength;          {
2421        if (maxval > minval) length += (maxval - minval) * (duplength + 1);          length++;
2422            if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2423            }
2424
2425          /* When the minimum is greater than zero, we have to replicate up to
2426          minval-1 times, with no additions required in the copies. Then, if
2427          there is a limited maximum we have to replicate up to maxval-1 times
2428          allowing for a BRAZERO item before each optional copy and nesting
2429          brackets for all but one of the optional copies. */
2430
2431          else
2432            {
2433            length += (minval - 1) * duplength;
2434            if (maxval > minval)   /* Need this test as maxval=-1 means no limit */
2435              length += (maxval - minval) * (duplength + 7) - 6;
2436            }
2437        }        }
2438      continue;      continue;
2439
# Line 2402  if (*errorptr != NULL) Line 2556  if (*errorptr != NULL)
2556    return NULL;    return NULL;
2557    }    }
2558
2559  /* If the anchored option was not passed, set flag if we can determine that it  /* If the anchored option was not passed, set flag if we can determine that the
2560  is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if  pattern is anchored by virtue of ^ characters or \A or anything else (such as
2561  we can determine what the first character has to be, because that speeds up  starting with .* when DOTALL is set).
2562  unanchored matches no end. In the case of multiline matches, an alternative is
2563  to set the PCRE_STARTLINE flag if all branches start with ^. */  Otherwise, see if we can determine what the first character has to be, because
2564    that speeds up unanchored matches no end. If not, see if we can set the
2565    PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
2566    start with ^, and also when all branches start with .* for non-DOTALL matches.
2567    */
2568
2569  if ((options & PCRE_ANCHORED) == 0)  if ((options & PCRE_ANCHORED) == 0)
2570    {    {
# Line 2775  for (;;) Line 2933  for (;;)
2933      int number = op - OP_BRA;      int number = op - OP_BRA;
2934      int offset = number << 1;      int offset = number << 1;
2935
2936      DPRINTF(("start bracket %d\n", number));  #ifdef DEBUG
2937        printf("start bracket %d subject=", number);
2938        pchars(eptr, 16, TRUE, md);
2939        printf("\n");
2940    #endif
2941
2942      if (offset < md->offset_max)      if (offset < md->offset_max)
2943        {        {
# Line 3956  Arguments: Line 4118  Arguments:
4118    external_extra  points to "hints" from pcre_study() or is NULL    external_extra  points to "hints" from pcre_study() or is NULL
4119    subject         points to the subject string    subject         points to the subject string
4120    length          length of subject string (may contain binary zeros)    length          length of subject string (may contain binary zeros)
4121      start_offset    where to start in the subject string
4122    options         option bits    options         option bits
4123    offsets         points to a vector of ints to be filled in with offsets    offsets         points to a vector of ints to be filled in with offsets
4124    offsetcount     the number of elements in the vector    offsetcount     the number of elements in the vector
# Line 3968  Returns:          > 0 => success; value Line 4131  Returns:          > 0 => success; value
4131
4132  int  int
4133  pcre_exec(const pcre *external_re, const pcre_extra *external_extra,  pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4134    const char *subject, int length, int options, int *offsets, int offsetcount)    const char *subject, int length, int start_offset, int options, int *offsets,
4135      int offsetcount)
4136  {  {
4137  int resetcount, ocount;  int resetcount, ocount;
4138  int first_char = -1;  int first_char = -1;
4139  int ims = 0;  int ims = 0;
4140  match_data match_block;  match_data match_block;
4141  const uschar *start_bits = NULL;  const uschar *start_bits = NULL;
4142  const uschar *start_match = (const uschar *)subject;  const uschar *start_match = (const uschar *)subject + start_offset;
4143  const uschar *end_subject;  const uschar *end_subject;
4144  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
4145  const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;  const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
# Line 4067  if (!anchored) Line 4231  if (!anchored)
4231          start_bits = extra->start_bits;          start_bits = extra->start_bits;
4232    }    }
4233
4234  /* Loop for unanchored matches; for anchored regexps the loop runs just once. */  /* Loop for unanchored matches; for anchored regexs the loop runs just once. */
4235
4236  do  do
4237    {    {

Legend:
 Removed from v.29 changed lines Added in v.35