/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1348 by ph10, Fri Jul 5 10:38:37 2013 UTC revision 1359 by ph10, Tue Sep 3 10:10:59 2013 UTC
# Line 115  kicks in at the same number of forward r Line 115  kicks in at the same number of forward r
115  #define COMPILE_WORK_SIZE (2048*LINK_SIZE)  #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
116  #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)  #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
117    
118    /* This value determines the size of the initial vector that is used for
119    remembering named groups during the pre-compile. It is allocated on the stack,
120    but if it is too small, it is expanded using malloc(), in a similar way to the
121    workspace. The value is the number of slots in the list. */
122    
123    #define NAMED_GROUP_LIST_SIZE  20
124    
125  /* The overrun tests check for a slightly smaller size so that they detect the  /* The overrun tests check for a slightly smaller size so that they detect the
126  overrun before it actually does run off the end of the data block. */  overrun before it actually does run off the end of the data block. */
127    
# Line 1358  return p; Line 1365  return p;
1365    
1366    
1367  /*************************************************  /*************************************************
 *  Subroutine for finding forward reference      *  
 *************************************************/  
   
 /* This recursive function is called only from find_parens() below. The  
 top-level call starts at the beginning of the pattern. All other calls must  
 start at a parenthesis. It scans along a pattern's text looking for capturing  
 subpatterns, and counting them. If it finds a named pattern that matches the  
 name it is given, it returns its number. Alternatively, if the name is NULL, it  
 returns when it reaches a given numbered subpattern. Recursion is used to keep  
 track of subpatterns that reset the capturing group numbers - the (?| feature.  
   
 This function was originally called only from the second pass, in which we know  
 that if (?< or (?' or (?P< is encountered, the name will be correctly  
 terminated because that is checked in the first pass. There is now one call to  
 this function in the first pass, to check for a recursive back reference by  
 name (so that we can make the whole group atomic). In this case, we need check  
 only up to the current position in the pattern, and that is still OK because  
 any previous occurrences will have been checked. To make this work, the test  
 for "end of pattern" is a check against cd->end_pattern in the main loop,  
 instead of looking for a binary zero. This means that the special first-pass  
 call can adjust cd->end_pattern temporarily. (Checks for binary zero while  
 processing items within the loop are OK, because afterwards the main loop will  
 terminate.)  
   
 Arguments:  
   ptrptr       address of the current character pointer (updated)  
   cd           compile background data  
   name         name to seek, or NULL if seeking a numbered subpattern  
   lorn         name length, or subpattern number if name is NULL  
   xmode        TRUE if we are in /x mode  
   utf          TRUE if we are in UTF-8 / UTF-16 / UTF-32 mode  
   count        pointer to the current capturing subpattern number (updated)  
   
 Returns:       the number of the named subpattern, or -1 if not found  
 */  
   
 static int  
 find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn,  
   BOOL xmode, BOOL utf, int *count)  
 {  
 pcre_uchar *ptr = *ptrptr;  
 int start_count = *count;  
 int hwm_count = start_count;  
 BOOL dup_parens = FALSE;  
   
 /* If the first character is a parenthesis, check on the type of group we are  
 dealing with. The very first call may not start with a parenthesis. */  
   
 if (ptr[0] == CHAR_LEFT_PARENTHESIS)  
   {  
   /* Handle specials such as (*SKIP) or (*UTF8) etc. */  
   
   if (ptr[1] == CHAR_ASTERISK)  
     {  
     ptr += 2;  
     while (ptr < cd->end_pattern && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;  
     }  
   
   /* Handle a normal, unnamed capturing parenthesis. */  
   
   else if (ptr[1] != CHAR_QUESTION_MARK)  
     {  
     *count += 1;  
     if (name == NULL && *count == lorn) return *count;  
     ptr++;  
     }  
   
   /* All cases now have (? at the start. Remember when we are in a group  
   where the parenthesis numbers are duplicated. */  
   
   else if (ptr[2] == CHAR_VERTICAL_LINE)  
     {  
     ptr += 3;  
     dup_parens = TRUE;  
     }  
   
   /* Handle comments; all characters are allowed until a ket is reached. */  
   
   else if (ptr[2] == CHAR_NUMBER_SIGN)  
     {  
     for (ptr += 3; *ptr != CHAR_NULL; ptr++)  
       if (*ptr == CHAR_RIGHT_PARENTHESIS) break;  
     goto FAIL_EXIT;  
     }  
   
   /* Handle a condition. If it is an assertion, just carry on so that it  
   is processed as normal. If not, skip to the closing parenthesis of the  
   condition (there can't be any nested parens). */  
   
   else if (ptr[2] == CHAR_LEFT_PARENTHESIS)  
     {  
     ptr += 2;  
     if (ptr[1] != CHAR_QUESTION_MARK)  
       {  
       while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;  
       if (*ptr != CHAR_NULL) ptr++;  
       }  
     }  
   
   /* Start with (? but not a condition. */  
   
   else  
     {  
     ptr += 2;  
     if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */  
   
     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */  
   
     if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&  
         ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)  
       {  
       pcre_uchar term;  
       const pcre_uchar *thisname;  
       *count += 1;  
       if (name == NULL && *count == lorn) return *count;  
       term = *ptr++;  
       if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;  
       thisname = ptr;  
       while (*ptr != term) ptr++;  
       if (name != NULL && lorn == (int)(ptr - thisname) &&  
           STRNCMP_UC_UC(name, thisname, (unsigned int)lorn) == 0)  
         return *count;  
       term++;  
       }  
     }  
   }  
   
 /* Past any initial parenthesis handling, scan for parentheses or vertical  
 bars. Stop if we get to cd->end_pattern. Note that this is important for the  
 first-pass call when this value is temporarily adjusted to stop at the current  
 position. So DO NOT change this to a test for binary zero. */  
   
 for (; ptr < cd->end_pattern; ptr++)  
   {  
   /* Skip over backslashed characters and also entire \Q...\E */  
   
   if (*ptr == CHAR_BACKSLASH)  
     {  
     if (*(++ptr) == CHAR_NULL) goto FAIL_EXIT;  
     if (*ptr == CHAR_Q) for (;;)  
       {  
       while (*(++ptr) != CHAR_NULL && *ptr != CHAR_BACKSLASH) {};  
       if (*ptr == CHAR_NULL) goto FAIL_EXIT;  
       if (*(++ptr) == CHAR_E) break;  
       }  
     continue;  
     }  
   
   /* Skip over character classes; this logic must be similar to the way they  
   are handled for real. If the first character is '^', skip it. Also, if the  
   first few characters (either before or after ^) are \Q\E or \E we skip them  
   too. This makes for compatibility with Perl. Note the use of STR macros to  
   encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */  
   
   if (*ptr == CHAR_LEFT_SQUARE_BRACKET)  
     {  
     BOOL negate_class = FALSE;  
     for (;;)  
       {  
       if (ptr[1] == CHAR_BACKSLASH)  
         {  
         if (ptr[2] == CHAR_E)  
           ptr+= 2;  
         else if (STRNCMP_UC_C8(ptr + 2,  
                  STR_Q STR_BACKSLASH STR_E, 3) == 0)  
           ptr += 4;  
         else  
           break;  
         }  
       else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)  
         {  
         negate_class = TRUE;  
         ptr++;  
         }  
       else break;  
       }  
   
     /* If the next character is ']', it is a data character that must be  
     skipped, except in JavaScript compatibility mode. */  
   
     if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&  
         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)  
       ptr++;  
   
     while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)  
       {  
       if (*ptr == CHAR_NULL) return -1;  
       if (*ptr == CHAR_BACKSLASH)  
         {  
         if (*(++ptr) == CHAR_NULL) goto FAIL_EXIT;  
         if (*ptr == CHAR_Q) for (;;)  
           {  
           while (*(++ptr) != CHAR_NULL && *ptr != CHAR_BACKSLASH) {};  
           if (*ptr == CHAR_NULL) goto FAIL_EXIT;  
           if (*(++ptr) == CHAR_E) break;  
           }  
         continue;  
         }  
       }  
     continue;  
     }  
   
   /* Skip comments in /x mode */  
   
   if (xmode && *ptr == CHAR_NUMBER_SIGN)  
     {  
     ptr++;  
     while (*ptr != CHAR_NULL)  
       {  
       if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }  
       ptr++;  
 #ifdef SUPPORT_UTF  
       if (utf) FORWARDCHAR(ptr);  
 #endif  
       }  
     if (*ptr == CHAR_NULL) goto FAIL_EXIT;  
     continue;  
     }  
   
   /* Check for the special metacharacters */  
   
   if (*ptr == CHAR_LEFT_PARENTHESIS)  
     {  
     int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, count);  
     if (rc > 0) return rc;  
     if (*ptr == CHAR_NULL) goto FAIL_EXIT;  
     }  
   
   else if (*ptr == CHAR_RIGHT_PARENTHESIS)  
     {  
     if (dup_parens && *count < hwm_count) *count = hwm_count;  
     goto FAIL_EXIT;  
     }  
   
   else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)  
     {  
     if (*count > hwm_count) hwm_count = *count;  
     *count = start_count;  
     }  
   }  
   
 FAIL_EXIT:  
 *ptrptr = ptr;  
 return -1;  
 }  
   
   
   
   
 /*************************************************  
 *       Find forward referenced subpattern       *  
 *************************************************/  
   
 /* This function scans along a pattern's text looking for capturing  
 subpatterns, and counting them. If it finds a named pattern that matches the  
 name it is given, it returns its number. Alternatively, if the name is NULL, it  
 returns when it reaches a given numbered subpattern. This is used for forward  
 references to subpatterns. We used to be able to start this scan from the  
 current compiling point, using the current count value from cd->bracount, and  
 do it all in a single loop, but the addition of the possibility of duplicate  
 subpattern numbers means that we have to scan from the very start, in order to  
 take account of such duplicates, and to use a recursive function to keep track  
 of the different types of group.  
   
 Arguments:  
   cd           compile background data  
   name         name to seek, or NULL if seeking a numbered subpattern  
   lorn         name length, or subpattern number if name is NULL  
   xmode        TRUE if we are in /x mode  
   utf          TRUE if we are in UTF-8 / UTF-16 / UTF-32 mode  
   
 Returns:       the number of the found subpattern, or -1 if not found  
 */  
   
 static int  
 find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode,  
   BOOL utf)  
 {  
 pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern;  
 int count = 0;  
 int rc;  
   
 /* If the pattern does not start with an opening parenthesis, the first call  
 to find_parens_sub() will scan right to the end (if necessary). However, if it  
 does start with a parenthesis, find_parens_sub() will return when it hits the  
 matching closing parens. That is why we have to have a loop. */  
   
 for (;;)  
   {  
   rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, &count);  
   if (rc > 0 || *ptr++ == CHAR_NULL) break;  
   }  
   
 return rc;  
 }  
   
   
   
   
 /*************************************************  
1368  *      Find first significant op code            *  *      Find first significant op code            *
1369  *************************************************/  *************************************************/
1370    
# Line 2361  Returns:      TRUE if what is matched co Line 2068  Returns:      TRUE if what is matched co
2068  typedef struct recurse_check {  typedef struct recurse_check {
2069    struct recurse_check *prev;    struct recurse_check *prev;
2070    const pcre_uchar *group;    const pcre_uchar *group;
2071  } recurse_check;  } recurse_check;
2072    
2073  static BOOL  static BOOL
2074  could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,  could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
# Line 2377  for (code = first_significant_code(code Line 2084  for (code = first_significant_code(code
2084    const pcre_uchar *ccode;    const pcre_uchar *ccode;
2085    
2086    c = *code;    c = *code;
2087    
2088    /* Skip over forward assertions; the other assertions are skipped by    /* Skip over forward assertions; the other assertions are skipped by
2089    first_significant_code() with a TRUE final argument. */    first_significant_code() with a TRUE final argument. */
2090    
# Line 2405  for (code = first_significant_code(code Line 2112  for (code = first_significant_code(code
2112      NULL. */      NULL. */
2113    
2114      if (cd->start_workspace != NULL)      if (cd->start_workspace != NULL)
2115        {        {
2116        const pcre_uchar *tcode;        const pcre_uchar *tcode;
2117        for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)        for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2118          if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;          if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2119        if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */        if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2120        }        }
2121    
2122      /* If we are scanning a completed pattern, there are no forward references      /* If we are scanning a completed pattern, there are no forward references
2123      and all groups are complete. We need to detect whether this is a recursive      and all groups are complete. We need to detect whether this is a recursive
2124      call, as otherwise there will be an infinite loop. If it is a recursion,      call, as otherwise there will be an infinite loop. If it is a recursion,
2125      just skip over it. Simple recursions are easily detected. For mutual      just skip over it. Simple recursions are easily detected. For mutual
2126      recursions we keep a chain on the stack. */      recursions we keep a chain on the stack. */
2127    
2128      else      else
2129        {        {
2130        recurse_check *r = recurses;        recurse_check *r = recurses;
2131        const pcre_uchar *endgroup = scode;        const pcre_uchar *endgroup = scode;
2132    
2133        do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);        do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2134        if (code >= scode && code <= endgroup) continue;  /* Simple recursion */        if (code >= scode && code <= endgroup) continue;  /* Simple recursion */
2135    
2136        for (r = recurses; r != NULL; r = r->prev)        for (r = recurses; r != NULL; r = r->prev)
2137          if (r->group == scode) break;          if (r->group == scode) break;
2138        if (r != NULL) continue;   /* Mutual recursion */        if (r != NULL) continue;   /* Mutual recursion */
# Line 2436  for (code = first_significant_code(code Line 2143  for (code = first_significant_code(code
2143    
2144      empty_branch = FALSE;      empty_branch = FALSE;
2145      this_recurse.prev = recurses;      this_recurse.prev = recurses;
2146      this_recurse.group = scode;      this_recurse.group = scode;
2147    
2148      do      do
2149        {        {
2150        if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))        if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
# Line 2557  for (code = first_significant_code(code Line 2264  for (code = first_significant_code(code
2264      case OP_ANY:      case OP_ANY:
2265      case OP_ALLANY:      case OP_ALLANY:
2266      case OP_ANYBYTE:      case OP_ANYBYTE:
2267    
2268      case OP_PROP:      case OP_PROP:
2269      case OP_NOTPROP:      case OP_NOTPROP:
2270      case OP_ANYNL:      case OP_ANYNL:
2271    
2272      case OP_NOT_HSPACE:      case OP_NOT_HSPACE:
2273      case OP_HSPACE:      case OP_HSPACE:
2274      case OP_NOT_VSPACE:      case OP_NOT_VSPACE:
2275      case OP_VSPACE:      case OP_VSPACE:
2276      case OP_EXTUNI:      case OP_EXTUNI:
2277    
2278      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
2279      case OP_DIGIT:      case OP_DIGIT:
2280      case OP_NOT_WHITESPACE:      case OP_NOT_WHITESPACE:
2281      case OP_WHITESPACE:      case OP_WHITESPACE:
2282      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
2283      case OP_WORDCHAR:      case OP_WORDCHAR:
2284    
2285      case OP_CHAR:      case OP_CHAR:
2286      case OP_CHARI:      case OP_CHARI:
2287      case OP_NOT:      case OP_NOT:
2288      case OP_NOTI:      case OP_NOTI:
2289    
2290      case OP_PLUS:      case OP_PLUS:
2291      case OP_PLUSI:      case OP_PLUSI:
2292      case OP_MINPLUS:      case OP_MINPLUS:
# Line 2589  for (code = first_significant_code(code Line 2296  for (code = first_significant_code(code
2296      case OP_NOTPLUSI:      case OP_NOTPLUSI:
2297      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
2298      case OP_NOTMINPLUSI:      case OP_NOTMINPLUSI:
2299    
2300      case OP_POSPLUS:      case OP_POSPLUS:
2301      case OP_POSPLUSI:      case OP_POSPLUSI:
2302      case OP_NOTPOSPLUS:      case OP_NOTPOSPLUS:
2303      case OP_NOTPOSPLUSI:      case OP_NOTPOSPLUSI:
2304    
2305      case OP_EXACT:      case OP_EXACT:
2306      case OP_EXACTI:      case OP_EXACTI:
2307      case OP_NOTEXACT:      case OP_NOTEXACT:
2308      case OP_NOTEXACTI:      case OP_NOTEXACTI:
2309    
2310      case OP_TYPEPLUS:      case OP_TYPEPLUS:
2311      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
2312      case OP_TYPEPOSPLUS:      case OP_TYPEPOSPLUS:
2313      case OP_TYPEEXACT:      case OP_TYPEEXACT:
2314    
2315      return FALSE;      return FALSE;
2316    
2317      /* These are going to continue, as they may be empty, but we have to      /* These are going to continue, as they may be empty, but we have to
# Line 2644  for (code = first_significant_code(code Line 2351  for (code = first_significant_code(code
2351  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2352      case OP_STAR:      case OP_STAR:
2353      case OP_STARI:      case OP_STARI:
2354      case OP_NOTSTAR:      case OP_NOTSTAR:
2355      case OP_NOTSTARI:      case OP_NOTSTARI:
2356    
2357      case OP_MINSTAR:      case OP_MINSTAR:
2358      case OP_MINSTARI:      case OP_MINSTARI:
2359      case OP_NOTMINSTAR:      case OP_NOTMINSTAR:
2360      case OP_NOTMINSTARI:      case OP_NOTMINSTARI:
2361    
2362      case OP_POSSTAR:      case OP_POSSTAR:
2363      case OP_POSSTARI:      case OP_POSSTARI:
2364      case OP_NOTPOSSTAR:      case OP_NOTPOSSTAR:
2365      case OP_NOTPOSSTARI:      case OP_NOTPOSSTARI:
2366    
2367      case OP_QUERY:      case OP_QUERY:
2368      case OP_QUERYI:      case OP_QUERYI:
2369      case OP_NOTQUERY:      case OP_NOTQUERY:
2370      case OP_NOTQUERYI:      case OP_NOTQUERYI:
2371    
2372      case OP_MINQUERY:      case OP_MINQUERY:
2373      case OP_MINQUERYI:      case OP_MINQUERYI:
2374      case OP_NOTMINQUERY:      case OP_NOTMINQUERY:
2375      case OP_NOTMINQUERYI:      case OP_NOTMINQUERYI:
2376    
2377      case OP_POSQUERY:      case OP_POSQUERY:
2378      case OP_POSQUERYI:      case OP_POSQUERYI:
2379      case OP_NOTPOSQUERY:      case OP_NOTPOSQUERY:
2380      case OP_NOTPOSQUERYI:      case OP_NOTPOSQUERYI:
2381    
2382      if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);      if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2383      break;      break;
2384    
2385      case OP_UPTO:      case OP_UPTO:
2386      case OP_UPTOI:      case OP_UPTOI:
2387      case OP_NOTUPTO:      case OP_NOTUPTO:
2388      case OP_NOTUPTOI:      case OP_NOTUPTOI:
2389    
2390      case OP_MINUPTO:      case OP_MINUPTO:
2391      case OP_MINUPTOI:      case OP_MINUPTOI:
2392      case OP_NOTMINUPTO:      case OP_NOTMINUPTO:
2393      case OP_NOTMINUPTOI:      case OP_NOTMINUPTOI:
2394    
2395      case OP_POSUPTO:      case OP_POSUPTO:
2396      case OP_POSUPTOI:      case OP_POSUPTOI:
2397      case OP_NOTPOSUPTO:      case OP_NOTPOSUPTO:
2398      case OP_NOTPOSUPTOI:      case OP_NOTPOSUPTOI:
2399    
2400      if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);      if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2401      break;      break;
2402  #endif  #endif
# Line 3754  to find out the amount of memory needed, Line 3461  to find out the amount of memory needed,
3461  phase. The value of lengthptr distinguishes the two phases.  phase. The value of lengthptr distinguishes the two phases.
3462    
3463  Arguments:  Arguments:
3464    optionsptr     pointer to the option bits    optionsptr        pointer to the option bits
3465    codeptr        points to the pointer to the current code point    codeptr           points to the pointer to the current code point
3466    ptrptr         points to the current pattern pointer    ptrptr            points to the current pattern pointer
3467    errorcodeptr   points to error code variable    errorcodeptr      points to error code variable
3468    firstcharptr    place to put the first required character    firstcharptr      place to put the first required character
3469    firstcharflagsptr place to put the first character flags, or a negative number    firstcharflagsptr place to put the first character flags, or a negative number
3470    reqcharptr     place to put the last required character    reqcharptr        place to put the last required character
3471    reqcharflagsptr place to put the last required character flags, or a negative number    reqcharflagsptr   place to put the last required character flags, or a negative number
3472    bcptr          points to current branch chain    bcptr             points to current branch chain
3473    cond_depth     conditional nesting depth    cond_depth        conditional nesting depth
3474    cd             contains pointers to tables etc.    cd                contains pointers to tables etc.
3475    lengthptr      NULL during the real compile phase    lengthptr         NULL during the real compile phase
3476                   points to length accumulator during pre-compile phase                      points to length accumulator during pre-compile phase
3477    
3478  Returns:         TRUE on success  Returns:            TRUE on success
3479                   FALSE, with *errorcodeptr set non-zero on error                      FALSE, with *errorcodeptr set non-zero on error
3480  */  */
3481    
3482  static BOOL  static BOOL
# Line 5949  for (;; ptr++) Line 5656  for (;; ptr++)
5656            slot += cd->name_entry_size;            slot += cd->name_entry_size;
5657            }            }
5658    
5659          /* Found a previous named subpattern */          /* Found the named subpattern */
5660    
5661          if (i < cd->names_found)          if (i < cd->names_found)
5662            {            {
# Line 5958  for (;; ptr++) Line 5665  for (;; ptr++)
5665            code[1+LINK_SIZE]++;            code[1+LINK_SIZE]++;
5666            }            }
5667    
         /* Search the pattern for a forward reference */  
   
         else if ((i = find_parens(cd, name, namelen,  
                         (options & PCRE_EXTENDED) != 0, utf)) > 0)  
           {  
           PUT2(code, 2+LINK_SIZE, i);  
           code[1+LINK_SIZE]++;  
           }  
   
5668          /* If terminator == CHAR_NULL it means that the name followed directly          /* If terminator == CHAR_NULL it means that the name followed directly
5669          after the opening parenthesis [e.g. (?(abc)...] and in this case there          after the opening parenthesis [e.g. (?(abc)...] and in this case there
5670          are some further alternatives to try. For the cases where terminator !=          are some further alternatives to try. For the cases where terminator !=
# Line 6130  for (;; ptr++) Line 5828  for (;; ptr++)
5828          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
5829          DEFINE_NAME:    /* Come here from (?< handling */          DEFINE_NAME:    /* Come here from (?< handling */
5830          case CHAR_APOSTROPHE:          case CHAR_APOSTROPHE:
5831            {          terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
5832            terminator = (*ptr == CHAR_LESS_THAN_SIGN)?            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5833              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;          name = ++ptr;
           name = ++ptr;  
5834    
5835            while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
5836            namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
5837    
5838            /* In the pre-compile phase, just do a syntax check. */          /* In the pre-compile phase, do a syntax check, remember the longest
5839            name, and then remember the group in a vector, expanding it if
5840            necessary. Duplicates for the same number are skipped; other duplicates
5841            are checked for validity. In the actual compile, there is nothing to
5842            do. */
5843    
5844            if (lengthptr != NULL)          if (lengthptr != NULL)
5845              {
5846              named_group *ng;
5847              pcre_uint32 number = cd->bracount + 1;
5848    
5849              if (*ptr != (pcre_uchar)terminator)
5850              {              {
5851              if (*ptr != (pcre_uchar)terminator)              *errorcodeptr = ERR42;
5852                {              goto FAILED;
5853                *errorcodeptr = ERR42;              }
5854                goto FAILED;  
5855                }            if (cd->names_found >= MAX_NAME_COUNT)
5856              if (cd->names_found >= MAX_NAME_COUNT)              {
5857                *errorcodeptr = ERR49;
5858                goto FAILED;
5859                }
5860    
5861              if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
5862                {
5863                cd->name_entry_size = namelen + IMM2_SIZE + 1;
5864                if (namelen > MAX_NAME_SIZE)
5865                {                {
5866                *errorcodeptr = ERR49;                *errorcodeptr = ERR48;
5867                goto FAILED;                goto FAILED;
5868                }                }
             if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)  
               {  
               cd->name_entry_size = namelen + IMM2_SIZE + 1;  
               if (namelen > MAX_NAME_SIZE)  
                 {  
                 *errorcodeptr = ERR48;  
                 goto FAILED;  
                 }  
               }  
5869              }              }
5870    
5871            /* In the real compile, create the entry in the table, maintaining            /* Scan the list to check for duplicates. For duplicate names, if the
5872            alphabetical order. Duplicate names for different numbers are            number is the same, break the loop, which causes the name to be
5873            permitted only if PCRE_DUPNAMES is set. Duplicate names for the same            discarded; otherwise, if DUPNAMES is not set, give an error.
5874            number are always OK. (An existing number can be re-used if (?|            If it is set, allow the name with a different number, but continue
5875            appears in the pattern.) In either event, a duplicate name results in            scanning in case this is a duplicate with the same number. For
5876            a duplicate entry in the table, even if the number is the same. This            non-duplicate names, give an error if the number is duplicated. */
5877            is because the number of names, and hence the table size, is computed  
5878            in the pre-compile, and it affects various numbers and pointers which            ng = cd->named_groups;
5879            would all have to be modified, and the compiled code moved down, if            for (i = 0; i < cd->names_found; i++, ng++)
5880            duplicates with the same number were omitted from the table. This              {
5881            doesn't seem worth the hassle. However, *different* names for the              if (namelen == ng->length &&
5882            same number are not permitted. */                  STRNCMP_UC_UC(name, ng->name, namelen) == 0)
5883                  {
5884            else                if (ng->number == number) break;
5885              {                if ((options & PCRE_DUPNAMES) == 0)
             BOOL dupname = FALSE;  
             slot = cd->name_table;  
   
             for (i = 0; i < cd->names_found; i++)  
               {  
               int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(namelen));  
               if (crc == 0)  
                 {  
                 if (slot[IMM2_SIZE+namelen] == 0)  
                   {  
                   if (GET2(slot, 0) != cd->bracount + 1 &&  
                       (options & PCRE_DUPNAMES) == 0)  
                     {  
                     *errorcodeptr = ERR43;  
                     goto FAILED;  
                     }  
                   else dupname = TRUE;  
                   }  
                 else crc = -1;      /* Current name is a substring */  
                 }  
   
               /* Make space in the table and break the loop for an earlier  
               name. For a duplicate or later name, carry on. We do this for  
               duplicates so that in the simple case (when (?| is not used) they  
               are in order of their numbers. */  
   
               if (crc < 0)  
5886                  {                  {
5887                  memmove(slot + cd->name_entry_size, slot,                  *errorcodeptr = ERR43;
5888                    IN_UCHARS((cd->names_found - i) * cd->name_entry_size));                  goto FAILED;
5889                  break;                  }
5890                  }                }
5891                else if (ng->number == number)
5892                /* Continue the loop for a later or duplicate name */                {
5893                  *errorcodeptr = ERR65;
5894                slot += cd->name_entry_size;                goto FAILED;
5895                }                }
5896                }
             /* For non-duplicate names, check for a duplicate number before  
             adding the new name. */  
5897    
5898              if (!dupname)            if (i >= cd->names_found)     /* Not a duplicate with same number */
5899                {
5900                /* Increase the list size if necessary */
5901    
5902                if (cd->names_found >= cd->named_group_list_size)
5903                {                {
5904                pcre_uchar *cslot = cd->name_table;                int newsize = cd->named_group_list_size * 2;
5905                for (i = 0; i < cd->names_found; i++)                named_group *newspace = (PUBL(malloc))
5906                    (newsize * sizeof(named_group));
5907    
5908                  if (newspace == NULL)
5909                  {                  {
5910                  if (cslot != slot)                  *errorcodeptr = ERR21;
5911                    {                  goto FAILED;
5912                    if (GET2(cslot, 0) == cd->bracount + 1)                  }
5913                      {  
5914                      *errorcodeptr = ERR65;                memcpy(newspace, cd->named_groups,
5915                      goto FAILED;                  cd->named_group_list_size * sizeof(named_group));
5916                      }                if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
5917                    }                  (PUBL(free))((void *)cd->named_groups);
5918                  else i--;                cd->named_groups = newspace;
5919                  cslot += cd->name_entry_size;                cd->named_group_list_size = newsize;
5920                  }                }
5921                }  
5922                cd->named_groups[cd->names_found].name = name;
5923              PUT2(slot, 0, cd->bracount + 1);              cd->named_groups[cd->names_found].length = namelen;
5924              memcpy(slot + IMM2_SIZE, name, IN_UCHARS(namelen));              cd->named_groups[cd->names_found].number = number;
5925              slot[IMM2_SIZE + namelen] = 0;              cd->names_found++;
5926              }              }
5927            }            }
5928    
5929          /* In both pre-compile and compile, count the number of names we've          ptr++;                    /* Move past > or ' in both passes. */
         encountered. */  
   
         cd->names_found++;  
         ptr++;                    /* Move past > or ' */  
5930          goto NUMBERED_GROUP;          goto NUMBERED_GROUP;
5931    
5932    
# Line 6277  for (;; ptr++) Line 5956  for (;; ptr++)
5956    
5957          if (lengthptr != NULL)          if (lengthptr != NULL)
5958            {            {
5959            const pcre_uchar *temp;            named_group *ng;
5960    
5961            if (namelen == 0)            if (namelen == 0)
5962              {              {
5963              *errorcodeptr = ERR62;              *errorcodeptr = ERR62;
# Line 6295  for (;; ptr++) Line 5974  for (;; ptr++)
5974              goto FAILED;              goto FAILED;
5975              }              }
5976    
5977            /* The name table does not exist in the first pass, so we cannot            /* The name table does not exist in the first pass; instead we must
5978            do a simple search as in the code below. Instead, we have to scan the            scan the list of names encountered so far in order to get the
5979            pattern to find the number. It is important that we scan it only as            number. The number may be negative if it is for a name that may be
5980            far as we have got because the syntax of named subpatterns has not            duplicated. If the name is not found, set the value to 0 for a
5981            been checked for the rest of the pattern, and find_parens() assumes            forward reference. */
5982            correct syntax. In any case, it's a waste of resources to scan  
5983            further. We stop the scan at the current point by temporarily            ng = cd->named_groups;
5984            adjusting the value of cd->endpattern. */            for (i = 0; i < cd->names_found; i++, ng++)
5985                {
5986            temp = cd->end_pattern;              if (namelen == ng->length &&
5987            cd->end_pattern = ptr;                  STRNCMP_UC_UC(name, ng->name, namelen) == 0)
5988            recno = find_parens(cd, name, namelen,                break;
5989              (options & PCRE_EXTENDED) != 0, utf);              }
5990            cd->end_pattern = temp;            recno = (i < cd->names_found)? ng->number : 0;
           if (recno < 0) recno = 0;    /* Forward ref; set dummy number */  
5991            }            }
5992    
5993          /* In the real compile, seek the name in the table. We check the name          /* In the real compile, search the name table. We check the name
5994          first, and then check that we have reached the end of the name in the          first, and then check that we have reached the end of the name in the
5995          table. That way, if the name that is longer than any in the table,          table. That way, if the name is longer than any in the table, the
5996          the comparison will fail without reading beyond the table entry. */          comparison will fail without reading beyond the table entry. */
5997    
5998          else          else
5999            {            {
# Line 6328  for (;; ptr++) Line 6006  for (;; ptr++)
6006              slot += cd->name_entry_size;              slot += cd->name_entry_size;
6007              }              }
6008    
6009            if (i < cd->names_found)         /* Back reference */            if (i < cd->names_found)
6010              {              {
6011              recno = GET2(slot, 0);              recno = GET2(slot, 0);
6012              }              }
6013            else if ((recno =                /* Forward back reference */            else
                     find_parens(cd, name, namelen,  
                       (options & PCRE_EXTENDED) != 0, utf)) <= 0)  
6014              {              {
6015              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
6016              goto FAILED;              goto FAILED;
# Line 6444  for (;; ptr++) Line 6120  for (;; ptr++)
6120    
6121              if (called == NULL)              if (called == NULL)
6122                {                {
6123                if (find_parens(cd, NULL, recno,                if (recno > cd->final_bracount)
                     (options & PCRE_EXTENDED) != 0, utf) < 0)  
6124                  {                  {
6125                  *errorcodeptr = ERR15;                  *errorcodeptr = ERR15;
6126                  goto FAILED;                  goto FAILED;
# Line 7058  for (;; ptr++) Line 6733  for (;; ptr++)
6733          *code++ = OP_PROP;          *code++ = OP_PROP;
6734          *code++ = PT_CLIST;          *code++ = PT_CLIST;
6735          *code++ = c;          *code++ = c;
6736          if (firstcharflags == REQ_UNSET) firstcharflags = zerofirstcharflags = REQ_NONE;          if (firstcharflags == REQ_UNSET)
6737              firstcharflags = zerofirstcharflags = REQ_NONE;
6738          break;          break;
6739          }          }
6740        }        }
# Line 7147  out the amount of memory needed, as well Line 6823  out the amount of memory needed, as well
6823  value of lengthptr distinguishes the two phases.  value of lengthptr distinguishes the two phases.
6824    
6825  Arguments:  Arguments:
6826    options        option bits, including any changes for this subpattern    options           option bits, including any changes for this subpattern
6827    codeptr        -> the address of the current code pointer    codeptr           -> the address of the current code pointer
6828    ptrptr         -> the address of the current pattern pointer    ptrptr            -> the address of the current pattern pointer
6829    errorcodeptr   -> pointer to error code variable    errorcodeptr      -> pointer to error code variable
6830    lookbehind     TRUE if this is a lookbehind assertion    lookbehind        TRUE if this is a lookbehind assertion
6831    reset_bracount TRUE to reset the count for each branch    reset_bracount    TRUE to reset the count for each branch
6832    skipbytes      skip this many bytes at start (for brackets and OP_COND)    skipbytes         skip this many bytes at start (for brackets and OP_COND)
6833    cond_depth     depth of nesting for conditional subpatterns    cond_depth        depth of nesting for conditional subpatterns
6834    firstcharptr    place to put the first required character    firstcharptr      place to put the first required character
6835    firstcharflagsptr place to put the first character flags, or a negative number    firstcharflagsptr place to put the first character flags, or a negative number
6836    reqcharptr     place to put the last required character    reqcharptr        place to put the last required character
6837    reqcharflagsptr place to put the last required character flags, or a negative number    reqcharflagsptr   place to put the last required character flags, or a negative number
6838    bcptr          pointer to the chain of currently open branches    bcptr             pointer to the chain of currently open branches
6839    cd             points to the data block with tables pointers etc.    cd                points to the data block with tables pointers etc.
6840    lengthptr      NULL during the real compile phase    lengthptr         NULL during the real compile phase
6841                   points to length accumulator during pre-compile phase                      points to length accumulator during pre-compile phase
6842    
6843  Returns:         TRUE on success  Returns:            TRUE on success
6844  */  */
6845    
6846  static BOOL  static BOOL
# Line 7701  return TRUE; Line 7377  return TRUE;
7377  discarded, because they can cause conflicts with actual literals that follow.  discarded, because they can cause conflicts with actual literals that follow.
7378  However, if we end up without a first char setting for an unanchored pattern,  However, if we end up without a first char setting for an unanchored pattern,
7379  it is worth scanning the regex to see if there is an initial asserted first  it is worth scanning the regex to see if there is an initial asserted first
7380  char. If all branches start with the same asserted char, or with a bracket all  char. If all branches start with the same asserted char, or with a
7381  of whose alternatives start with the same asserted char (recurse ad lib), then  non-conditional bracket all of whose alternatives start with the same asserted
7382  we return that char, otherwise -1.  char (recurse ad lib), then we return that char, with the flags set to zero or
7383    REQ_CASELESS; otherwise return zero with REQ_NONE in the flags.
7384    
7385  Arguments:  Arguments:
7386    code       points to start of expression (the bracket)    code       points to start of expression (the bracket)
7387    flags       points to the first char flags, or to REQ_NONE    flags      points to the first char flags, or to REQ_NONE
7388    inassert   TRUE if in an assertion    inassert   TRUE if in an assertion
7389    
7390  Returns:     the fixed first char, or 0 with REQ_NONE in flags  Returns:     the fixed first char, or 0 with REQ_NONE in flags
# Line 7744  do { Line 7421  do {
7421       case OP_ASSERT:       case OP_ASSERT:
7422       case OP_ONCE:       case OP_ONCE:
7423       case OP_ONCE_NC:       case OP_ONCE_NC:
      case OP_COND:  
7424       d = find_firstassertedchar(scode, &dflags, op == OP_ASSERT);       d = find_firstassertedchar(scode, &dflags, op == OP_ASSERT);
7425       if (dflags < 0)       if (dflags < 0)
7426         return 0;         return 0;
# Line 7789  return c; Line 7465  return c;
7465    
7466    
7467  /*************************************************  /*************************************************
7468    *     Add an entry to the name/number table      *
7469    *************************************************/
7470    
7471    /* This function is called between compiling passes to add an entry to the
7472    name/number table, maintaining alphabetical order. Checking for permitted
7473    and forbidden duplicates has already been done.
7474    
7475    Arguments:
7476      cd           the compile data block
7477      name         the name to add
7478      length       the length of the name
7479      groupno      the group number
7480    
7481    Returns:       nothing
7482    */
7483    
7484    static void
7485    add_name(compile_data *cd, const pcre_uchar *name, int length,
7486      unsigned int groupno)
7487    {
7488    int i;
7489    pcre_uchar *slot = cd->name_table;
7490    
7491    for (i = 0; i < cd->names_found; i++)
7492      {
7493      int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(length));
7494      if (crc == 0 && slot[IMM2_SIZE+length] != 0)
7495        crc = -1; /* Current name is a substring */
7496    
7497      /* Make space in the table and break the loop for an earlier name. For a
7498      duplicate or later name, carry on. We do this for duplicates so that in the
7499      simple case (when ?(| is not used) they are in order of their numbers. In all
7500      cases they are in the order in which they appear in the pattern. */
7501    
7502      if (crc < 0)
7503        {
7504        memmove(slot + cd->name_entry_size, slot,
7505          IN_UCHARS((cd->names_found - i) * cd->name_entry_size));
7506        break;
7507        }
7508    
7509      /* Continue the loop for a later or duplicate name */
7510    
7511      slot += cd->name_entry_size;
7512      }
7513    
7514    PUT2(slot, 0, groupno);
7515    memcpy(slot + IMM2_SIZE, name, IN_UCHARS(length));
7516    slot[IMM2_SIZE + length] = 0;
7517    cd->names_found++;
7518    }
7519    
7520    
7521    
7522    /*************************************************
7523  *        Compile a Regular Expression            *  *        Compile a Regular Expression            *
7524  *************************************************/  *************************************************/
7525    
# Line 7875  new memory is obtained from malloc(). */ Line 7606  new memory is obtained from malloc(). */
7606    
7607  pcre_uchar cworkspace[COMPILE_WORK_SIZE];  pcre_uchar cworkspace[COMPILE_WORK_SIZE];
7608    
7609    /* This vector is used for remembering named groups during the pre-compile. In a
7610    similar way to cworkspace, it can be expanded using malloc() if necessary. */
7611    
7612    named_group named_groups[NAMED_GROUP_LIST_SIZE];
7613    
7614  /* Set this early so that early errors get offset 0. */  /* Set this early so that early errors get offset 0. */
7615    
7616  ptr = (const pcre_uchar *)pattern;  ptr = (const pcre_uchar *)pattern;
# Line 8141  cd->start_code = cworkspace; Line 7877  cd->start_code = cworkspace;
7877  cd->hwm = cworkspace;  cd->hwm = cworkspace;
7878  cd->start_workspace = cworkspace;  cd->start_workspace = cworkspace;
7879  cd->workspace_size = COMPILE_WORK_SIZE;  cd->workspace_size = COMPILE_WORK_SIZE;
7880    cd->named_groups = named_groups;
7881    cd->named_group_list_size = NAMED_GROUP_LIST_SIZE;
7882  cd->start_pattern = (const pcre_uchar *)pattern;  cd->start_pattern = (const pcre_uchar *)pattern;
7883  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
7884  cd->req_varyopt = 0;  cd->req_varyopt = 0;
# Line 8223  cd->final_bracount = cd->bracount;  /* S Line 7961  cd->final_bracount = cd->bracount;  /* S
7961  cd->assert_depth = 0;  cd->assert_depth = 0;
7962  cd->bracount = 0;  cd->bracount = 0;
7963  cd->max_lookbehind = 0;  cd->max_lookbehind = 0;
 cd->names_found = 0;  
7964  cd->name_table = (pcre_uchar *)re + re->name_table_offset;  cd->name_table = (pcre_uchar *)re + re->name_table_offset;
7965  codestart = cd->name_table + re->name_entry_size * re->name_count;  codestart = cd->name_table + re->name_entry_size * re->name_count;
7966  cd->start_code = codestart;  cd->start_code = codestart;
# Line 8234  cd->had_pruneorskip = FALSE; Line 7971  cd->had_pruneorskip = FALSE;
7971  cd->check_lookbehind = FALSE;  cd->check_lookbehind = FALSE;
7972  cd->open_caps = NULL;  cd->open_caps = NULL;
7973    
7974    /* If any named groups were found, create the name/number table from the list
7975    created in the first pass. */
7976    
7977    if (cd->names_found > 0)
7978      {
7979      int i = cd->names_found;
7980      named_group *ng = cd->named_groups;
7981      cd->names_found = 0;
7982      for (; i > 0; i--, ng++)
7983        add_name(cd, ng->name, ng->length, ng->number);
7984      if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
7985        (PUBL(free))((void *)cd->named_groups);
7986      }
7987    
7988  /* Set up a starting, non-extracting bracket, then compile the expression. On  /* Set up a starting, non-extracting bracket, then compile the expression. On
7989  error, errorcode will be set non-zero, so we don't need to look at the result  error, errorcode will be set non-zero, so we don't need to look at the result
7990  of the function here. */  of the function here. */
# Line 8297  if (cd->hwm > cd->start_workspace) Line 8048  if (cd->hwm > cd->start_workspace)
8048      }      }
8049    }    }
8050    
8051  /* If the workspace had to be expanded, free the new memory. Set the pointer to  /* If the workspace had to be expanded, free the new memory. Set the pointer to
8052  NULL to indicate that forward references have been filled in. */  NULL to indicate that forward references have been filled in. */
8053    
8054  if (cd->workspace_size > COMPILE_WORK_SIZE)  if (cd->workspace_size > COMPILE_WORK_SIZE)
8055    (PUBL(free))((void *)cd->start_workspace);    (PUBL(free))((void *)cd->start_workspace);
8056  cd->start_workspace = NULL;  cd->start_workspace = NULL;
8057    
8058  /* Give an error if there's a back reference to a non-existent capturing  /* Give an error if there's a back reference to a non-existent capturing
8059  subpattern. */  subpattern. */
# Line 8506  if (code - codestart > length) Line 8257  if (code - codestart > length)
8257    }    }
8258  #endif   /* PCRE_DEBUG */  #endif   /* PCRE_DEBUG */
8259    
8260  /* Check for a pattern that can match an empty string, so that this information  /* Check for a pattern that can match an empty string, so that this information
8261  can be provided to applications. */  can be provided to applications. */
8262    
8263  do  do
# Line 8515  do Line 8266  do
8266      {      {
8267      re->flags |= PCRE_MATCH_EMPTY;      re->flags |= PCRE_MATCH_EMPTY;
8268      break;      break;
8269      }      }
8270    codestart += GET(codestart, 1);    codestart += GET(codestart, 1);
8271    }    }
8272  while (*codestart == OP_ALT);  while (*codestart == OP_ALT);

Legend:
Removed from v.1348  
changed lines
  Added in v.1359

  ViewVC Help
Powered by ViewVC 1.1.5