/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 406 by ph10, Mon Mar 23 12:05:43 2009 UTC revision 412 by ph10, Sat Apr 11 10:34:37 2009 UTC
# Line 1009  return p; Line 1009  return p;
1009    
1010    
1011  /*************************************************  /*************************************************
1012  *       Find forward referenced subpattern       *  *  Subroutine for finding forward reference      *
1013  *************************************************/  *************************************************/
1014    
1015  /* This function scans along a pattern's text looking for capturing  /* This recursive function is called only from find_parens() below. The
1016    top-level call starts at the beginning of the pattern. All other calls must
1017    start at a parenthesis. It scans along a pattern's text looking for capturing
1018  subpatterns, and counting them. If it finds a named pattern that matches the  subpatterns, and counting them. If it finds a named pattern that matches the
1019  name it is given, it returns its number. Alternatively, if the name is NULL, it  name it is given, it returns its number. Alternatively, if the name is NULL, it
1020  returns when it reaches a given numbered subpattern. This is used for forward  returns when it reaches a given numbered subpattern. We know that if (?P< is
1021  references to subpatterns. We know that if (?P< is encountered, the name will  encountered, the name will be terminated by '>' because that is checked in the
1022  be terminated by '>' because that is checked in the first pass.  first pass. Recursion is used to keep track of subpatterns that reset the
1023    capturing group numbers - the (?| feature.
1024    
1025  Arguments:  Arguments:
1026    ptr          current position in the pattern    ptrptr       address of the current character pointer (updated)
1027    cd           compile background data    cd           compile background data
1028    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1029    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1030    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1031      count        pointer to the current capturing subpattern number (updated)
1032    
1033  Returns:       the number of the named subpattern, or -1 if not found  Returns:       the number of the named subpattern, or -1 if not found
1034  */  */
1035    
1036  static int  static int
1037  find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn,  find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1038    BOOL xmode)    BOOL xmode, int *count)
1039  {  {
1040  const uschar *thisname;  uschar *ptr = *ptrptr;
1041  int count = cd->bracount;  int start_count = *count;
1042    int hwm_count = start_count;
1043    BOOL dup_parens = FALSE;
1044    
1045  for (; *ptr != 0; ptr++)  /* If the first character is a parenthesis, check on the type of group we are
1046    dealing with. The very first call may not start with a parenthesis. */
1047    
1048    if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1049    {    {
1050    int term;    if (ptr[1] == CHAR_QUESTION_MARK &&
1051          ptr[2] == CHAR_VERTICAL_LINE)
1052        {
1053        ptr += 3;
1054        dup_parens = TRUE;
1055        }
1056    
1057      /* Handle a normal, unnamed capturing parenthesis */
1058    
1059      else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1060        {
1061        *count += 1;
1062        if (name == NULL && *count == lorn) return *count;
1063        ptr++;
1064        }
1065    
1066      /* Handle a condition. If it is an assertion, just carry on so that it
1067      is processed as normal. If not, skip to the closing parenthesis of the
1068      condition (there can't be any nested parens). */
1069    
1070      else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1071        {
1072        ptr += 2;
1073        if (ptr[1] != CHAR_QUESTION_MARK)
1074          {
1075          while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1076          if (*ptr != 0) ptr++;
1077          }
1078        }
1079    
1080      /* We have either (? or (* and not a condition */
1081    
1082      else
1083        {
1084        ptr += 2;
1085        if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */
1086    
1087        /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1088    
1089        if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1090            ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1091          {
1092          int term;
1093          const uschar *thisname;
1094          *count += 1;
1095          if (name == NULL && *count == lorn) return *count;
1096          term = *ptr++;
1097          if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1098          thisname = ptr;
1099          while (*ptr != term) ptr++;
1100          if (name != NULL && lorn == ptr - thisname &&
1101              strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1102            return *count;
1103          }
1104        }
1105      }
1106    
1107    /* Past any initial parenthesis handling, scan for parentheses or vertical
1108    bars. */
1109    
1110    for (; *ptr != 0; ptr++)
1111      {
1112    /* Skip over backslashed characters and also entire \Q...\E */    /* Skip over backslashed characters and also entire \Q...\E */
1113    
1114    if (*ptr == CHAR_BACKSLASH)    if (*ptr == CHAR_BACKSLASH)
1115      {      {
1116      if (*(++ptr) == 0) return -1;      if (*(++ptr) == 0) goto FAIL_EXIT;
1117      if (*ptr == CHAR_Q) for (;;)      if (*ptr == CHAR_Q) for (;;)
1118        {        {
1119        while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};        while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1120        if (*ptr == 0) return -1;        if (*ptr == 0) goto FAIL_EXIT;
1121        if (*(++ptr) == CHAR_E) break;        if (*(++ptr) == CHAR_E) break;
1122        }        }
1123      continue;      continue;
# Line 1093  for (; *ptr != 0; ptr++) Line 1162  for (; *ptr != 0; ptr++)
1162        if (*ptr == 0) return -1;        if (*ptr == 0) return -1;
1163        if (*ptr == CHAR_BACKSLASH)        if (*ptr == CHAR_BACKSLASH)
1164          {          {
1165          if (*(++ptr) == 0) return -1;          if (*(++ptr) == 0) goto FAIL_EXIT;
1166          if (*ptr == CHAR_Q) for (;;)          if (*ptr == CHAR_Q) for (;;)
1167            {            {
1168            while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};            while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1169            if (*ptr == 0) return -1;            if (*ptr == 0) goto FAIL_EXIT;
1170            if (*(++ptr) == CHAR_E) break;            if (*(++ptr) == CHAR_E) break;
1171            }            }
1172          continue;          continue;
# Line 1111  for (; *ptr != 0; ptr++) Line 1180  for (; *ptr != 0; ptr++)
1180    if (xmode && *ptr == CHAR_NUMBER_SIGN)    if (xmode && *ptr == CHAR_NUMBER_SIGN)
1181      {      {
1182      while (*(++ptr) != 0 && *ptr != CHAR_NL) {};      while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1183      if (*ptr == 0) return -1;      if (*ptr == 0) goto FAIL_EXIT;
1184      continue;      continue;
1185      }      }
1186    
1187    /* An opening parens must now be a real metacharacter */    /* Check for the special metacharacters */
1188    
1189    if (*ptr != CHAR_LEFT_PARENTHESIS) continue;    if (*ptr == CHAR_LEFT_PARENTHESIS)
   if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)  
1190      {      {
1191      count++;      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1192      if (name == NULL && count == lorn) return count;      if (rc > 0) return rc;
1193      continue;      if (*ptr == 0) goto FAIL_EXIT;
1194      }      }
1195    
1196    ptr += 2;    else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1197    if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */      {
1198        if (dup_parens && *count < hwm_count) *count = hwm_count;
1199        *ptrptr = ptr;
1200        return -1;
1201        }
1202    
1203      else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1204        {
1205        if (*count > hwm_count) hwm_count = *count;
1206        *count = start_count;
1207        }
1208      }
1209    
1210    FAIL_EXIT:
1211    *ptrptr = ptr;
1212    return -1;
1213    }
1214    
   /* We have to disambiguate (?<! and (?<= from (?<name> */  
1215    
   if ((*ptr != CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_EXCLAMATION_MARK ||  
       ptr[1] == CHAR_EQUALS_SIGN) && *ptr != CHAR_APOSTROPHE)  
     continue;  
1216    
   count++;  
1217    
1218    if (name == NULL && count == lorn) return count;  /*************************************************
1219    term = *ptr++;  *       Find forward referenced subpattern       *
1220    if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;  *************************************************/
1221    thisname = ptr;  
1222    while (*ptr != term) ptr++;  /* This function scans along a pattern's text looking for capturing
1223    if (name != NULL && lorn == ptr - thisname &&  subpatterns, and counting them. If it finds a named pattern that matches the
1224        strncmp((const char *)name, (const char *)thisname, lorn) == 0)  name it is given, it returns its number. Alternatively, if the name is NULL, it
1225      return count;  returns when it reaches a given numbered subpattern. This is used for forward
1226    references to subpatterns. We used to be able to start this scan from the
1227    current compiling point, using the current count value from cd->bracount, and
1228    do it all in a single loop, but the addition of the possibility of duplicate
1229    subpattern numbers means that we have to scan from the very start, in order to
1230    take account of such duplicates, and to use a recursive function to keep track
1231    of the different types of group.
1232    
1233    Arguments:
1234      cd           compile background data
1235      name         name to seek, or NULL if seeking a numbered subpattern
1236      lorn         name length, or subpattern number if name is NULL
1237      xmode        TRUE if we are in /x mode
1238    
1239    Returns:       the number of the found subpattern, or -1 if not found
1240    */
1241    
1242    static int
1243    find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1244    {
1245    uschar *ptr = (uschar *)cd->start_pattern;
1246    int count = 0;
1247    int rc;
1248    
1249    /* If the pattern does not start with an opening parenthesis, the first call
1250    to find_parens_sub() will scan right to the end (if necessary). However, if it
1251    does start with a parenthesis, find_parens_sub() will return when it hits the
1252    matching closing parens. That is why we have to have a loop. */
1253    
1254    for (;;)
1255      {
1256      rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1257      if (rc > 0 || *ptr++ == 0) break;
1258    }    }
1259    
1260  return -1;  return rc;
1261  }  }
1262    
1263    
1264    
1265    
1266  /*************************************************  /*************************************************
1267  *      Find first significant op code            *  *      Find first significant op code            *
1268  *************************************************/  *************************************************/
# Line 4489  we set the flag only if there is a liter Line 4601  we set the flag only if there is a liter
4601    
4602          /* Search the pattern for a forward reference */          /* Search the pattern for a forward reference */
4603    
4604          else if ((i = find_parens(ptr, cd, name, namelen,          else if ((i = find_parens(cd, name, namelen,
4605                          (options & PCRE_EXTENDED) != 0)) > 0)                          (options & PCRE_EXTENDED) != 0)) > 0)
4606            {            {
4607            PUT2(code, 2+LINK_SIZE, i);            PUT2(code, 2+LINK_SIZE, i);
# Line 4788  we set the flag only if there is a liter Line 4900  we set the flag only if there is a liter
4900              recno = GET2(slot, 0);              recno = GET2(slot, 0);
4901              }              }
4902            else if ((recno =                /* Forward back reference */            else if ((recno =                /* Forward back reference */
4903                      find_parens(ptr, cd, name, namelen,                      find_parens(cd, name, namelen,
4904                        (options & PCRE_EXTENDED) != 0)) <= 0)                        (options & PCRE_EXTENDED) != 0)) <= 0)
4905              {              {
4906              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
# Line 4898  we set the flag only if there is a liter Line 5010  we set the flag only if there is a liter
5010    
5011              if (called == NULL)              if (called == NULL)
5012                {                {
5013                if (find_parens(ptr, cd, NULL, recno,                if (find_parens(cd, NULL, recno,
5014                      (options & PCRE_EXTENDED) != 0) < 0)                      (options & PCRE_EXTENDED) != 0) < 0)
5015                  {                  {
5016                  *errorcodeptr = ERR15;                  *errorcodeptr = ERR15;
# Line 6114  if (erroroffset == NULL) Line 6226  if (erroroffset == NULL)
6226    
6227  *erroroffset = 0;  *erroroffset = 0;
6228    
 /* Can't support UTF8 unless PCRE has been compiled to include the code. */  
   
 #ifdef SUPPORT_UTF8  
 utf8 = (options & PCRE_UTF8) != 0;  
 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&  
      (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)  
   {  
   errorcode = ERR44;  
   goto PCRE_EARLY_ERROR_RETURN2;  
   }  
 #else  
 if ((options & PCRE_UTF8) != 0)  
   {  
   errorcode = ERR32;  
   goto PCRE_EARLY_ERROR_RETURN;  
   }  
 #endif  
   
 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)  
   {  
   errorcode = ERR17;  
   goto PCRE_EARLY_ERROR_RETURN;  
   }  
   
6229  /* Set up pointers to the individual character tables */  /* Set up pointers to the individual character tables */
6230    
6231  if (tables == NULL) tables = _pcre_default_tables;  if (tables == NULL) tables = _pcre_default_tables;
# Line 6146  cd->fcc = tables + fcc_offset; Line 6234  cd->fcc = tables + fcc_offset;
6234  cd->cbits = tables + cbits_offset;  cd->cbits = tables + cbits_offset;
6235  cd->ctypes = tables + ctypes_offset;  cd->ctypes = tables + ctypes_offset;
6236    
6237    /* Check that all undefined public option bits are zero */
6238    
6239    if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
6240      {
6241      errorcode = ERR17;
6242      goto PCRE_EARLY_ERROR_RETURN;
6243      }
6244    
6245  /* Check for global one-time settings at the start of the pattern, and remember  /* Check for global one-time settings at the start of the pattern, and remember
6246  the offset for later. */  the offset for later. */
6247    
# Line 6155  while (ptr[skipatstart] == CHAR_LEFT_PAR Line 6251  while (ptr[skipatstart] == CHAR_LEFT_PAR
6251    int newnl = 0;    int newnl = 0;
6252    int newbsr = 0;    int newbsr = 0;
6253    
6254      if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
6255        { skipatstart += 7; options |= PCRE_UTF8; continue; }
6256    
6257    if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)    if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
6258      { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }      { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
6259    else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3)  == 0)    else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3)  == 0)
# Line 6178  while (ptr[skipatstart] == CHAR_LEFT_PAR Line 6277  while (ptr[skipatstart] == CHAR_LEFT_PAR
6277    else break;    else break;
6278    }    }
6279    
6280    /* Can't support UTF8 unless PCRE has been compiled to include the code. */
6281    
6282    #ifdef SUPPORT_UTF8
6283    utf8 = (options & PCRE_UTF8) != 0;
6284    if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
6285         (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
6286      {
6287      errorcode = ERR44;
6288      goto PCRE_EARLY_ERROR_RETURN2;
6289      }
6290    #else
6291    if ((options & PCRE_UTF8) != 0)
6292      {
6293      errorcode = ERR32;
6294      goto PCRE_EARLY_ERROR_RETURN;
6295      }
6296    #endif
6297    
6298  /* Check validity of \R options. */  /* Check validity of \R options. */
6299    
6300  switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))  switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))

Legend:
Removed from v.406  
changed lines
  Added in v.412

  ViewVC Help
Powered by ViewVC 1.1.5