/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

Old: revision 535 by ph10, Thu Jun 3 19:18:24 2010 UTC | New: revision 574 by ph10, Sat Nov 20 17:47:27 2010 UTC
# Line 408  static const char error_texts[] = Line 408  static const char error_texts[] =
408    "different names for subpatterns of the same number are not allowed\0"    "different names for subpatterns of the same number are not allowed\0"
409    "(*MARK) must have an argument\0"    "(*MARK) must have an argument\0"
410    "this version of PCRE is not compiled with PCRE_UCP support\0"    "this version of PCRE is not compiled with PCRE_UCP support\0"
411      "\\c must be followed by an ASCII character\0"
412    ;    ;
413    
414  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 841  else Line 842  else
842      break;      break;
843    
844      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
845      This coding is ASCII-specific, but then the whole concept of \cx is      An error is given if the byte following \c is not an ASCII character. This
846        coding is ASCII-specific, but then the whole concept of \cx is
847      ASCII-specific. (However, an EBCDIC equivalent has now been added.) */      ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
848    
849      case CHAR_c:      case CHAR_c:
# Line 851  else Line 853  else
853        *errorcodeptr = ERR2;        *errorcodeptr = ERR2;
854        break;        break;
855        }        }
856    #ifndef EBCDIC    /* ASCII/UTF-8 coding */
857  #ifndef EBCDIC  /* ASCII/UTF-8 coding */      if (c > 127)  /* Excludes all non-ASCII in either mode */
858          {
859          *errorcodeptr = ERR68;
860          break;
861          }
862      if (c >= CHAR_a && c <= CHAR_z) c -= 32;      if (c >= CHAR_a && c <= CHAR_z) c -= 32;
863      c ^= 0x40;      c ^= 0x40;
864  #else           /* EBCDIC coding */  #else             /* EBCDIC coding */
865      if (c >= CHAR_a && c <= CHAR_z) c += 64;      if (c >= CHAR_a && c <= CHAR_z) c += 64;
866      c ^= 0xC0;      c ^= 0xC0;
867  #endif  #endif
# Line 1110  Arguments: Line 1116  Arguments:
1116    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1117    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1118    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1119      utf8         TRUE if we are in UTF-8 mode
1120    count        pointer to the current capturing subpattern number (updated)    count        pointer to the current capturing subpattern number (updated)
1121    
1122  Returns:       the number of the named subpattern, or -1 if not found  Returns:       the number of the named subpattern, or -1 if not found
# Line 1117  Returns:       the number of the named s Line 1124  Returns:       the number of the named s
1124    
1125  static int  static int
1126  find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,  find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1127    BOOL xmode, int *count)    BOOL xmode, BOOL utf8, int *count)
1128  {  {
1129  uschar *ptr = *ptrptr;  uschar *ptr = *ptrptr;
1130  int start_count = *count;  int start_count = *count;
# Line 1129  dealing with. The very first call may no Line 1136  dealing with. The very first call may no
1136    
1137  if (ptr[0] == CHAR_LEFT_PARENTHESIS)  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1138    {    {
1139    if (ptr[1] == CHAR_QUESTION_MARK &&    /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1140        ptr[2] == CHAR_VERTICAL_LINE)  
1141      if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1142    
1143      /* Handle a normal, unnamed capturing parenthesis. */
1144    
1145      else if (ptr[1] != CHAR_QUESTION_MARK)
1146        {
1147        *count += 1;
1148        if (name == NULL && *count == lorn) return *count;
1149        ptr++;
1150        }
1151    
1152      /* All cases now have (? at the start. Remember when we are in a group
1153      where the parenthesis numbers are duplicated. */
1154    
1155      else if (ptr[2] == CHAR_VERTICAL_LINE)
1156      {      {
1157      ptr += 3;      ptr += 3;
1158      dup_parens = TRUE;      dup_parens = TRUE;
1159      }      }
1160    
1161    /* Handle a normal, unnamed capturing parenthesis */    /* Handle comments; all characters are allowed until a ket is reached. */
1162    
1163    else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)    else if (ptr[2] == CHAR_NUMBER_SIGN)
1164      {      {
1165      *count += 1;      for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1166      if (name == NULL && *count == lorn) return *count;      goto FAIL_EXIT;
     ptr++;  
1167      }      }
1168    
1169    /* Handle a condition. If it is an assertion, just carry on so that it    /* Handle a condition. If it is an assertion, just carry on so that it
1170    is processed as normal. If not, skip to the closing parenthesis of the    is processed as normal. If not, skip to the closing parenthesis of the
1171    condition (there can't be any nested parens. */    condition (there can't be any nested parens). */
1172    
1173    else if (ptr[2] == CHAR_LEFT_PARENTHESIS)    else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1174      {      {
# Line 1159  if (ptr[0] == CHAR_LEFT_PARENTHESIS) Line 1180  if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1180        }        }
1181      }      }
1182    
1183    /* We have either (? or (* and not a condition */    /* Start with (? but not a condition. */
1184    
1185    else    else
1186      {      {
# Line 1264  for (; *ptr != 0; ptr++) Line 1285  for (; *ptr != 0; ptr++)
1285    
1286    if (xmode && *ptr == CHAR_NUMBER_SIGN)    if (xmode && *ptr == CHAR_NUMBER_SIGN)
1287      {      {
1288      while (*(++ptr) != 0 && *ptr != CHAR_NL) {};      ptr++;
1289        while (*ptr != 0)
1290          {
1291          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1292          ptr++;
1293    #ifdef SUPPORT_UTF8
1294          if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
1295    #endif
1296          }
1297      if (*ptr == 0) goto FAIL_EXIT;      if (*ptr == 0) goto FAIL_EXIT;
1298      continue;      continue;
1299      }      }
# Line 1273  for (; *ptr != 0; ptr++) Line 1302  for (; *ptr != 0; ptr++)
1302    
1303    if (*ptr == CHAR_LEFT_PARENTHESIS)    if (*ptr == CHAR_LEFT_PARENTHESIS)
1304      {      {
1305      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
1306      if (rc > 0) return rc;      if (rc > 0) return rc;
1307      if (*ptr == 0) goto FAIL_EXIT;      if (*ptr == 0) goto FAIL_EXIT;
1308      }      }
# Line 1281  for (; *ptr != 0; ptr++) Line 1310  for (; *ptr != 0; ptr++)
1310    else if (*ptr == CHAR_RIGHT_PARENTHESIS)    else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1311      {      {
1312      if (dup_parens && *count < hwm_count) *count = hwm_count;      if (dup_parens && *count < hwm_count) *count = hwm_count;
1313      *ptrptr = ptr;      goto FAIL_EXIT;
     return -1;  
1314      }      }
1315    
1316    else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)    else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
# Line 1320  Arguments: Line 1348  Arguments:
1348    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1349    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1350    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1351      utf8         TRUE if we are in UTF-8 mode
1352    
1353  Returns:       the number of the found subpattern, or -1 if not found  Returns:       the number of the found subpattern, or -1 if not found
1354  */  */
1355    
1356  static int  static int
1357  find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)  find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,
1358      BOOL utf8)
1359  {  {
1360  uschar *ptr = (uschar *)cd->start_pattern;  uschar *ptr = (uschar *)cd->start_pattern;
1361  int count = 0;  int count = 0;
# Line 1338  matching closing parens. That is why we Line 1368  matching closing parens. That is why we
1368    
1369  for (;;)  for (;;)
1370    {    {
1371    rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);    rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
1372    if (rc > 0 || *ptr++ == 0) break;    if (rc > 0 || *ptr++ == 0) break;
1373    }    }
1374    
# Line 1711  for (;;) Line 1741  for (;;)
1741        case OP_MARK:        case OP_MARK:
1742        case OP_PRUNE_ARG:        case OP_PRUNE_ARG:
1743        case OP_SKIP_ARG:        case OP_SKIP_ARG:
       case OP_THEN_ARG:  
1744        code += code[1];        code += code[1];
1745        break;        break;
1746    
1747          case OP_THEN_ARG:
1748          code += code[1+LINK_SIZE];
1749          break;
1750        }        }
1751    
1752      /* Add in the fixed length from the table */      /* Add in the fixed length from the table */
# Line 1814  for (;;) Line 1847  for (;;)
1847        case OP_MARK:        case OP_MARK:
1848        case OP_PRUNE_ARG:        case OP_PRUNE_ARG:
1849        case OP_SKIP_ARG:        case OP_SKIP_ARG:
       case OP_THEN_ARG:  
1850        code += code[1];        code += code[1];
1851        break;        break;
1852    
1853          case OP_THEN_ARG:
1854          code += code[1+LINK_SIZE];
1855          break;
1856        }        }
1857    
1858      /* Add in the fixed length from the table */      /* Add in the fixed length from the table */
# Line 2092  for (code = first_significant_code(code Line 2128  for (code = first_significant_code(code
2128      case OP_MARK:      case OP_MARK:
2129      case OP_PRUNE_ARG:      case OP_PRUNE_ARG:
2130      case OP_SKIP_ARG:      case OP_SKIP_ARG:
     case OP_THEN_ARG:  
2131      code += code[1];      code += code[1];
2132      break;      break;
2133    
2134        case OP_THEN_ARG:
2135        code += code[1+LINK_SIZE];
2136        break;
2137    
2138      /* None of the remaining opcodes are required to match a character. */      /* None of the remaining opcodes are required to match a character. */
2139    
2140      default:      default:
# Line 2493  if ((options & PCRE_EXTENDED) != 0) Line 2532  if ((options & PCRE_EXTENDED) != 0)
2532      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2533      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
2534        {        {
2535        while (*(++ptr) != 0)        ptr++;
2536          while (*ptr != 0)
2537            {
2538          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2539            ptr++;
2540    #ifdef SUPPORT_UTF8
2541            if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2542    #endif
2543            }
2544        }        }
2545      else break;      else break;
2546      }      }
# Line 2530  if ((options & PCRE_EXTENDED) != 0) Line 2576  if ((options & PCRE_EXTENDED) != 0)
2576      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2577      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
2578        {        {
2579        while (*(++ptr) != 0)        ptr++;
2580          while (*ptr != 0)
2581            {
2582          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2583            ptr++;
2584    #ifdef SUPPORT_UTF8
2585            if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2586    #endif
2587            }
2588        }        }
2589      else break;      else break;
2590      }      }
# Line 3104  for (;; ptr++) Line 3157  for (;; ptr++)
3157      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
3158      if (c == CHAR_NUMBER_SIGN)      if (c == CHAR_NUMBER_SIGN)
3159        {        {
3160        while (*(++ptr) != 0)        ptr++;
3161          while (*ptr != 0)
3162          {          {
3163          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3164            ptr++;
3165    #ifdef SUPPORT_UTF8
3166            if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
3167    #endif
3168          }          }
3169        if (*ptr != 0) continue;        if (*ptr != 0) continue;
3170    
# Line 3481  for (;; ptr++) Line 3539  for (;; ptr++)
3539              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3540              continue;              continue;
3541    
3542                /* Perl 5.004 onwards omits VT from \s, but we must preserve it
3543                if it was previously set by something earlier in the character
3544                class. */
3545    
3546              case ESC_s:              case ESC_s:
3547              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];              classbits[0] |= cbits[cbit_space];
3548              classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= cbits[cbit_space+1] & ~0x08;
3549                for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3550              continue;              continue;
3551    
3552              case ESC_S:              case ESC_S:
# Line 4795  for (;; ptr++) Line 4858  for (;; ptr++)
4858                *errorcodeptr = ERR66;                *errorcodeptr = ERR66;
4859                goto FAILED;                goto FAILED;
4860                }                }
4861              *code++ = verbs[i].op;              *code = verbs[i].op;
4862                if (*code++ == OP_THEN)
4863                  {
4864                  PUT(code, 0, code - bcptr->current_branch - 1);
4865                  code += LINK_SIZE;
4866                  }
4867              }              }
4868    
4869            else            else
# Line 4805  for (;; ptr++) Line 4873  for (;; ptr++)
4873                *errorcodeptr = ERR59;                *errorcodeptr = ERR59;
4874                goto FAILED;                goto FAILED;
4875                }                }
4876              *code++ = verbs[i].op_arg;              *code = verbs[i].op_arg;
4877                if (*code++ == OP_THEN_ARG)
4878                  {
4879                  PUT(code, 0, code - bcptr->current_branch - 1);
4880                  code += LINK_SIZE;
4881                  }
4882              *code++ = arglen;              *code++ = arglen;
4883              memcpy(code, arg, arglen);              memcpy(code, arg, arglen);
4884              code += arglen;              code += arglen;
# Line 4999  for (;; ptr++) Line 5072  for (;; ptr++)
5072          /* Search the pattern for a forward reference */          /* Search the pattern for a forward reference */
5073    
5074          else if ((i = find_parens(cd, name, namelen,          else if ((i = find_parens(cd, name, namelen,
5075                          (options & PCRE_EXTENDED) != 0)) > 0)                          (options & PCRE_EXTENDED) != 0, utf8)) > 0)
5076            {            {
5077            PUT2(code, 2+LINK_SIZE, i);            PUT2(code, 2+LINK_SIZE, i);
5078            code[1+LINK_SIZE]++;            code[1+LINK_SIZE]++;
# Line 5345  for (;; ptr++) Line 5418  for (;; ptr++)
5418              }              }
5419            else if ((recno =                /* Forward back reference */            else if ((recno =                /* Forward back reference */
5420                      find_parens(cd, name, namelen,                      find_parens(cd, name, namelen,
5421                        (options & PCRE_EXTENDED) != 0)) <= 0)                        (options & PCRE_EXTENDED) != 0, utf8)) <= 0)
5422              {              {
5423              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
5424              goto FAILED;              goto FAILED;
# Line 5456  for (;; ptr++) Line 5529  for (;; ptr++)
5529              if (called == NULL)              if (called == NULL)
5530                {                {
5531                if (find_parens(cd, NULL, recno,                if (find_parens(cd, NULL, recno,
5532                      (options & PCRE_EXTENDED) != 0) < 0)                      (options & PCRE_EXTENDED) != 0, utf8) < 0)
5533                  {                  {
5534                  *errorcodeptr = ERR15;                  *errorcodeptr = ERR15;
5535                  goto FAILED;                  goto FAILED;

Legend:
Removed from v.535  
changed lines
  Added in v.574

  ViewVC Help
Powered by ViewVC 1.1.5