/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 447 by ph10, Tue Sep 15 18:17:54 2009 UTC revision 454 by ph10, Tue Sep 22 09:42:11 2009 UTC
# Line 1331  for (;;) Line 1331  for (;;)
1331    
1332    
1333  /*************************************************  /*************************************************
1334  *        Find the fixed length of a pattern      *  *        Find the fixed length of a branch       *
1335  *************************************************/  *************************************************/
1336    
1337  /* Scan a pattern and compute the fixed length of subject that will match it,  /* Scan a branch and compute the fixed length of subject that will match it,
1338  if the length is fixed. This is needed for dealing with backward assertions.  if the length is fixed. This is needed for dealing with backward assertions.
1339  In UTF8 mode, the result is in characters rather than bytes.  In UTF8 mode, the result is in characters rather than bytes. The branch is
1340    temporarily terminated with OP_END when this function is called.
1341    
1342    This function is called when a backward assertion is encountered, so that if it
1343    fails, the error message can point to the correct place in the pattern.
1344    However, we cannot do this when the assertion contains subroutine calls,
1345    because they can be forward references. We solve this by remembering this case
1346    and doing the check at the end; a flag specifies which mode we are running in.
1347    
1348  Arguments:  Arguments:
1349    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
1350    options  the compiling options    options  the compiling options
1351      atend    TRUE if called when the pattern is complete
1352      cd       the "compile data" structure
1353    
1354  Returns:   the fixed length, or -1 if there is no fixed length,  Returns:   the fixed length,
1355                 or -1 if there is no fixed length,
1356               or -2 if \C was encountered               or -2 if \C was encountered
1357                 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1358  */  */
1359    
1360  static int  static int
1361  find_fixedlength(uschar *code, int options)  find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1362  {  {
1363  int length = -1;  int length = -1;
1364    
# Line 1360  branch, check the length against that of Line 1371  branch, check the length against that of
1371  for (;;)  for (;;)
1372    {    {
1373    int d;    int d;
1374      uschar *ce, *cs;
1375    register int op = *cc;    register int op = *cc;
1376    switch (op)    switch (op)
1377      {      {
# Line 1367  for (;;) Line 1379  for (;;)
1379      case OP_BRA:      case OP_BRA:
1380      case OP_ONCE:      case OP_ONCE:
1381      case OP_COND:      case OP_COND:
1382      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1383      if (d < 0) return d;      if (d < 0) return d;
1384      branchlength += d;      branchlength += d;
1385      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 1389  for (;;) Line 1401  for (;;)
1401      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
1402      branchlength = 0;      branchlength = 0;
1403      break;      break;
1404    
1405        /* A true recursion implies not fixed length, but a subroutine call may
1406        be OK. If the subroutine is a forward reference, we can't deal with
1407        it until the end of the pattern, so return -3. */
1408    
1409        case OP_RECURSE:
1410        if (!atend) return -3;
1411        cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1412        do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */
1413        if (cc > cs && cc < ce) return -1;                /* Recursion */
1414        d = find_fixedlength(cs + 2, options, atend, cd);
1415        if (d < 0) return d;
1416        branchlength += d;
1417        cc += 1 + LINK_SIZE;
1418        break;
1419    
1420      /* Skip over assertive subpatterns */      /* Skip over assertive subpatterns */
1421    
# Line 1518  for (;;) Line 1545  for (;;)
1545    
1546    
1547  /*************************************************  /*************************************************
1548  *    Scan compiled regex for numbered bracket    *  *    Scan compiled regex for specific bracket    *
1549  *************************************************/  *************************************************/
1550    
1551  /* This little function scans through a compiled pattern until it finds a  /* This little function scans through a compiled pattern until it finds a
1552  capturing bracket with the given number.  capturing bracket with the given number, or, if the number is negative, an
1553    instance of OP_REVERSE for a lookbehind.
1554    
1555  Arguments:  Arguments:
1556    code        points to start of expression    code        points to start of expression
1557    utf8        TRUE in UTF-8 mode    utf8        TRUE in UTF-8 mode
1558    number      the required bracket number    number      the required bracket number or negative to find a lookbehind
1559    
1560  Returns:      pointer to the opcode for the bracket, or NULL if not found  Returns:      pointer to the opcode for the bracket, or NULL if not found
1561  */  */
# Line 1545  for (;;) Line 1573  for (;;)
1573    the table is zero; the actual length is stored in the compiled code. */    the table is zero; the actual length is stored in the compiled code. */
1574    
1575    if (c == OP_XCLASS) code += GET(code, 1);    if (c == OP_XCLASS) code += GET(code, 1);
1576    
1577      /* Handle lookbehind (OP_REVERSE), which is what a negative number requests */
1578    
1579      else if (c == OP_REVERSE)
1580        {
1581        if (number < 0) return (uschar *)code;
1582        code += _pcre_OP_lengths[c];
1583        }
1584    
1585    /* Handle capturing bracket */    /* Handle capturing bracket */
1586    
# Line 5813  for (;;) Line 5849  for (;;)
5849    
5850      /* If lookbehind, check that this branch matches a fixed-length string, and      /* If lookbehind, check that this branch matches a fixed-length string, and
5851      put the length into the OP_REVERSE item. Temporarily mark the end of the      put the length into the OP_REVERSE item. Temporarily mark the end of the
5852      branch with OP_END. */      branch with OP_END. If the branch contains OP_RECURSE, the result is -3
5853        because there may be forward references that we can't check here. Set a
5854        flag to cause another lookbehind check at the end. Why not do it all at the
5855        end? Because common, erroneous checks are picked up here and the offset of
5856        the problem can be shown. */
5857    
5858      if (lookbehind)      if (lookbehind)
5859        {        {
5860        int fixed_length;        int fixed_length;
5861        *code = OP_END;        *code = OP_END;
5862        fixed_length = find_fixedlength(last_branch, options);        fixed_length = find_fixedlength(last_branch, options, FALSE, cd);
5863        DPRINTF(("fixed length = %d\n", fixed_length));        DPRINTF(("fixed length = %d\n", fixed_length));
5864        if (fixed_length < 0)        if (fixed_length == -3)
5865            {
5866            cd->check_lookbehind = TRUE;
5867            }
5868          else if (fixed_length < 0)
5869          {          {
5870          *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;          *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5871          *ptrptr = ptr;          *ptrptr = ptr;
5872          return FALSE;          return FALSE;
5873          }          }
5874        PUT(reverse_count, 0, fixed_length);        else { PUT(reverse_count, 0, fixed_length); }
5875        }        }
5876      }      }
5877    
# Line 6230  int length = 1;  /* For final END opcode Line 6274  int length = 1;  /* For final END opcode
6274  int firstbyte, reqbyte, newline;  int firstbyte, reqbyte, newline;
6275  int errorcode = 0;  int errorcode = 0;
6276  int skipatstart = 0;  int skipatstart = 0;
6277  #ifdef SUPPORT_UTF8  BOOL utf8 = (options & PCRE_UTF8) != 0;
 BOOL utf8;  
 #endif  
6278  size_t size;  size_t size;
6279  uschar *code;  uschar *code;
6280  const uschar *codestart;  const uschar *codestart;
# Line 6329  while (ptr[skipatstart] == CHAR_LEFT_PAR Line 6371  while (ptr[skipatstart] == CHAR_LEFT_PAR
6371  /* Can't support UTF8 unless PCRE has been compiled to include the code. */  /* Can't support UTF8 unless PCRE has been compiled to include the code. */
6372    
6373  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
 utf8 = (options & PCRE_UTF8) != 0;  
6374  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
6375       (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)       (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
6376    {    {
# Line 6337  if (utf8 && (options & PCRE_NO_UTF8_CHEC Line 6378  if (utf8 && (options & PCRE_NO_UTF8_CHEC
6378    goto PCRE_EARLY_ERROR_RETURN2;    goto PCRE_EARLY_ERROR_RETURN2;
6379    }    }
6380  #else  #else
6381  if ((options & PCRE_UTF8) != 0)  if (utf8)
6382    {    {
6383    errorcode = ERR32;    errorcode = ERR32;
6384    goto PCRE_EARLY_ERROR_RETURN;    goto PCRE_EARLY_ERROR_RETURN;
# Line 6501  cd->start_code = codestart; Line 6542  cd->start_code = codestart;
6542  cd->hwm = cworkspace;  cd->hwm = cworkspace;
6543  cd->req_varyopt = 0;  cd->req_varyopt = 0;
6544  cd->had_accept = FALSE;  cd->had_accept = FALSE;
6545    cd->check_lookbehind = FALSE;
6546  cd->open_caps = NULL;  cd->open_caps = NULL;
6547    
6548  /* Set up a starting, non-extracting bracket, then compile the expression. On  /* Set up a starting, non-extracting bracket, then compile the expression. On
# Line 6540  while (errorcode == 0 && cd->hwm > cwork Line 6582  while (errorcode == 0 && cd->hwm > cwork
6582    cd->hwm -= LINK_SIZE;    cd->hwm -= LINK_SIZE;
6583    offset = GET(cd->hwm, 0);    offset = GET(cd->hwm, 0);
6584    recno = GET(codestart, offset);    recno = GET(codestart, offset);
6585    groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);    groupptr = find_bracket(codestart, utf8, recno);
6586    if (groupptr == NULL) errorcode = ERR53;    if (groupptr == NULL) errorcode = ERR53;
6587      else PUT(((uschar *)codestart), offset, groupptr - codestart);      else PUT(((uschar *)codestart), offset, groupptr - codestart);
6588    }    }
# Line 6550  subpattern. */ Line 6592  subpattern. */
6592    
6593  if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;  if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6594    
6595    /* If there were any lookbehind assertions that contained OP_RECURSE
6596    (recursions or subroutine calls), a flag is set for them to be checked here,
6597    because they may contain forward references. Actual recursions can't be fixed
6598    length, but subroutine calls can. It is done like this so that those without
6599    OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
6600    exceptional ones forgo this. We scan the pattern to check that they are fixed
6601    length, and set their lengths. */
6602    
6603    if (cd->check_lookbehind)
6604      {
6605      uschar *cc = (uschar *)codestart;
6606    
6607      /* Loop, searching for OP_REVERSE items, and process those that do not have
6608      their length set. (Actually, it will also re-process any that have a length
6609      of zero, but that is a pathological case, and it does no harm.) When we find
6610      one, we temporarily terminate the branch it is in while we scan it. */
6611    
6612      for (cc = (uschar *)find_bracket(codestart, utf8, -1);
6613           cc != NULL;
6614           cc = (uschar *)find_bracket(cc, utf8, -1))
6615        {
6616        if (GET(cc, 1) == 0)
6617          {
6618          int fixed_length;
6619          uschar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
6620          int end_op = *be;
6621          *be = OP_END;
6622          fixed_length = find_fixedlength(cc, re->options, TRUE, cd);
6623          *be = end_op;
6624          DPRINTF(("fixed length = %d\n", fixed_length));
6625          if (fixed_length < 0)
6626            {
6627            errorcode = (fixed_length == -2)? ERR36 : ERR25;
6628            break;
6629            }
6630          PUT(cc, 1, fixed_length);
6631          }
6632        cc += 1 + LINK_SIZE;
6633        }
6634      }
6635    
6636  /* Failed to compile, or error while post-processing */  /* Failed to compile, or error while post-processing */
6637    
6638  if (errorcode != 0)  if (errorcode != 0)

Legend:
Removed from v.447  
changed lines
  Added in v.454

  ViewVC Help
Powered by ViewVC 1.1.5