/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1348 by ph10, Fri Jul 5 10:38:37 2013 UTC revision 1360 by ph10, Tue Sep 3 10:25:39 2013 UTC
# Line 115  kicks in at the same number of forward r Line 115  kicks in at the same number of forward r
115  #define COMPILE_WORK_SIZE (2048*LINK_SIZE)  #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
116  #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)  #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
117    
118    /* This value determines the size of the initial vector that is used for
119    remembering named groups during the pre-compile. It is allocated on the stack,
120    but if it is too small, it is expanded using malloc(), in a similar way to the
121    workspace. The value is the number of slots in the list. */
122    
123    #define NAMED_GROUP_LIST_SIZE  20
124    
125  /* The overrun tests check for a slightly smaller size so that they detect the  /* The overrun tests check for a slightly smaller size so that they detect the
126  overrun before it actually does run off the end of the data block. */  overrun before it actually does run off the end of the data block. */
127    
# Line 1358  return p; Line 1365  return p;
1365    
1366    
1367  /*************************************************  /*************************************************
 *  Subroutine for finding forward reference      *  
 *************************************************/  
   
 /* This recursive function is called only from find_parens() below. The  
 top-level call starts at the beginning of the pattern. All other calls must  
 start at a parenthesis. It scans along a pattern's text looking for capturing  
 subpatterns, and counting them. If it finds a named pattern that matches the  
 name it is given, it returns its number. Alternatively, if the name is NULL, it  
 returns when it reaches a given numbered subpattern. Recursion is used to keep  
 track of subpatterns that reset the capturing group numbers - the (?| feature.  
   
 This function was originally called only from the second pass, in which we know  
 that if (?< or (?' or (?P< is encountered, the name will be correctly  
 terminated because that is checked in the first pass. There is now one call to  
 this function in the first pass, to check for a recursive back reference by  
 name (so that we can make the whole group atomic). In this case, we need check  
 only up to the current position in the pattern, and that is still OK because  
 any previous occurrences will have been checked. To make this work, the test  
 for "end of pattern" is a check against cd->end_pattern in the main loop,  
 instead of looking for a binary zero. This means that the special first-pass  
 call can adjust cd->end_pattern temporarily. (Checks for binary zero while  
 processing items within the loop are OK, because afterwards the main loop will  
 terminate.)  
   
 Arguments:  
   ptrptr       address of the current character pointer (updated)  
   cd           compile background data  
   name         name to seek, or NULL if seeking a numbered subpattern  
   lorn         name length, or subpattern number if name is NULL  
   xmode        TRUE if we are in /x mode  
   utf          TRUE if we are in UTF-8 / UTF-16 / UTF-32 mode  
   count        pointer to the current capturing subpattern number (updated)  
   
 Returns:       the number of the named subpattern, or -1 if not found  
 */  
   
 static int  
 find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn,  
   BOOL xmode, BOOL utf, int *count)  
 {  
 pcre_uchar *ptr = *ptrptr;  
 int start_count = *count;  
 int hwm_count = start_count;  
 BOOL dup_parens = FALSE;  
   
 /* If the first character is a parenthesis, check on the type of group we are  
 dealing with. The very first call may not start with a parenthesis. */  
   
 if (ptr[0] == CHAR_LEFT_PARENTHESIS)  
   {  
   /* Handle specials such as (*SKIP) or (*UTF8) etc. */  
   
   if (ptr[1] == CHAR_ASTERISK)  
     {  
     ptr += 2;  
     while (ptr < cd->end_pattern && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;  
     }  
   
   /* Handle a normal, unnamed capturing parenthesis. */  
   
   else if (ptr[1] != CHAR_QUESTION_MARK)  
     {  
     *count += 1;  
     if (name == NULL && *count == lorn) return *count;  
     ptr++;  
     }  
   
   /* All cases now have (? at the start. Remember when we are in a group  
   where the parenthesis numbers are duplicated. */  
   
   else if (ptr[2] == CHAR_VERTICAL_LINE)  
     {  
     ptr += 3;  
     dup_parens = TRUE;  
     }  
   
   /* Handle comments; all characters are allowed until a ket is reached. */  
   
   else if (ptr[2] == CHAR_NUMBER_SIGN)  
     {  
     for (ptr += 3; *ptr != CHAR_NULL; ptr++)  
       if (*ptr == CHAR_RIGHT_PARENTHESIS) break;  
     goto FAIL_EXIT;  
     }  
   
   /* Handle a condition. If it is an assertion, just carry on so that it  
   is processed as normal. If not, skip to the closing parenthesis of the  
   condition (there can't be any nested parens). */  
   
   else if (ptr[2] == CHAR_LEFT_PARENTHESIS)  
     {  
     ptr += 2;  
     if (ptr[1] != CHAR_QUESTION_MARK)  
       {  
       while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;  
       if (*ptr != CHAR_NULL) ptr++;  
       }  
     }  
   
   /* Start with (? but not a condition. */  
   
   else  
     {  
     ptr += 2;  
     if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */  
   
     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */  
   
     if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&  
         ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)  
       {  
       pcre_uchar term;  
       const pcre_uchar *thisname;  
       *count += 1;  
       if (name == NULL && *count == lorn) return *count;  
       term = *ptr++;  
       if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;  
       thisname = ptr;  
       while (*ptr != term) ptr++;  
       if (name != NULL && lorn == (int)(ptr - thisname) &&  
           STRNCMP_UC_UC(name, thisname, (unsigned int)lorn) == 0)  
         return *count;  
       term++;  
       }  
     }  
   }  
   
 /* Past any initial parenthesis handling, scan for parentheses or vertical  
 bars. Stop if we get to cd->end_pattern. Note that this is important for the  
 first-pass call when this value is temporarily adjusted to stop at the current  
 position. So DO NOT change this to a test for binary zero. */  
   
 for (; ptr < cd->end_pattern; ptr++)  
   {  
   /* Skip over backslashed characters and also entire \Q...\E */  
   
   if (*ptr == CHAR_BACKSLASH)  
     {  
     if (*(++ptr) == CHAR_NULL) goto FAIL_EXIT;  
     if (*ptr == CHAR_Q) for (;;)  
       {  
       while (*(++ptr) != CHAR_NULL && *ptr != CHAR_BACKSLASH) {};  
       if (*ptr == CHAR_NULL) goto FAIL_EXIT;  
       if (*(++ptr) == CHAR_E) break;  
       }  
     continue;  
     }  
   
   /* Skip over character classes; this logic must be similar to the way they  
   are handled for real. If the first character is '^', skip it. Also, if the  
   first few characters (either before or after ^) are \Q\E or \E we skip them  
   too. This makes for compatibility with Perl. Note the use of STR macros to  
   encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */  
   
   if (*ptr == CHAR_LEFT_SQUARE_BRACKET)  
     {  
     BOOL negate_class = FALSE;  
     for (;;)  
       {  
       if (ptr[1] == CHAR_BACKSLASH)  
         {  
         if (ptr[2] == CHAR_E)  
           ptr+= 2;  
         else if (STRNCMP_UC_C8(ptr + 2,  
                  STR_Q STR_BACKSLASH STR_E, 3) == 0)  
           ptr += 4;  
         else  
           break;  
         }  
       else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)  
         {  
         negate_class = TRUE;  
         ptr++;  
         }  
       else break;  
       }  
   
     /* If the next character is ']', it is a data character that must be  
     skipped, except in JavaScript compatibility mode. */  
   
     if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&  
         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)  
       ptr++;  
   
     while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)  
       {  
       if (*ptr == CHAR_NULL) return -1;  
       if (*ptr == CHAR_BACKSLASH)  
         {  
         if (*(++ptr) == CHAR_NULL) goto FAIL_EXIT;  
         if (*ptr == CHAR_Q) for (;;)  
           {  
           while (*(++ptr) != CHAR_NULL && *ptr != CHAR_BACKSLASH) {};  
           if (*ptr == CHAR_NULL) goto FAIL_EXIT;  
           if (*(++ptr) == CHAR_E) break;  
           }  
         continue;  
         }  
       }  
     continue;  
     }  
   
   /* Skip comments in /x mode */  
   
   if (xmode && *ptr == CHAR_NUMBER_SIGN)  
     {  
     ptr++;  
     while (*ptr != CHAR_NULL)  
       {  
       if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }  
       ptr++;  
 #ifdef SUPPORT_UTF  
       if (utf) FORWARDCHAR(ptr);  
 #endif  
       }  
     if (*ptr == CHAR_NULL) goto FAIL_EXIT;  
     continue;  
     }  
   
   /* Check for the special metacharacters */  
   
   if (*ptr == CHAR_LEFT_PARENTHESIS)  
     {  
     int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, count);  
     if (rc > 0) return rc;  
     if (*ptr == CHAR_NULL) goto FAIL_EXIT;  
     }  
   
   else if (*ptr == CHAR_RIGHT_PARENTHESIS)  
     {  
     if (dup_parens && *count < hwm_count) *count = hwm_count;  
     goto FAIL_EXIT;  
     }  
   
   else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)  
     {  
     if (*count > hwm_count) hwm_count = *count;  
     *count = start_count;  
     }  
   }  
   
 FAIL_EXIT:  
 *ptrptr = ptr;  
 return -1;  
 }  
   
   
   
   
 /*************************************************  
 *       Find forward referenced subpattern       *  
 *************************************************/  
   
 /* This function scans along a pattern's text looking for capturing  
 subpatterns, and counting them. If it finds a named pattern that matches the  
 name it is given, it returns its number. Alternatively, if the name is NULL, it  
 returns when it reaches a given numbered subpattern. This is used for forward  
 references to subpatterns. We used to be able to start this scan from the  
 current compiling point, using the current count value from cd->bracount, and  
 do it all in a single loop, but the addition of the possibility of duplicate  
 subpattern numbers means that we have to scan from the very start, in order to  
 take account of such duplicates, and to use a recursive function to keep track  
 of the different types of group.  
   
 Arguments:  
   cd           compile background data  
   name         name to seek, or NULL if seeking a numbered subpattern  
   lorn         name length, or subpattern number if name is NULL  
   xmode        TRUE if we are in /x mode  
   utf          TRUE if we are in UTF-8 / UTF-16 / UTF-32 mode  
   
 Returns:       the number of the found subpattern, or -1 if not found  
 */  
   
 static int  
 find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode,  
   BOOL utf)  
 {  
 pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern;  
 int count = 0;  
 int rc;  
   
 /* If the pattern does not start with an opening parenthesis, the first call  
 to find_parens_sub() will scan right to the end (if necessary). However, if it  
 does start with a parenthesis, find_parens_sub() will return when it hits the  
 matching closing parens. That is why we have to have a loop. */  
   
 for (;;)  
   {  
   rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, &count);  
   if (rc > 0 || *ptr++ == CHAR_NULL) break;  
   }  
   
 return rc;  
 }  
   
   
   
   
 /*************************************************  
1368  *      Find first significant op code            *  *      Find first significant op code            *
1369  *************************************************/  *************************************************/
1370    
# Line 2361  Returns:      TRUE if what is matched co Line 2068  Returns:      TRUE if what is matched co
2068  typedef struct recurse_check {  typedef struct recurse_check {
2069    struct recurse_check *prev;    struct recurse_check *prev;
2070    const pcre_uchar *group;    const pcre_uchar *group;
2071  } recurse_check;  } recurse_check;
2072    
2073  static BOOL  static BOOL
2074  could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,  could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
# Line 2377  for (code = first_significant_code(code Line 2084  for (code = first_significant_code(code
2084    const pcre_uchar *ccode;    const pcre_uchar *ccode;
2085    
2086    c = *code;    c = *code;
2087    
2088    /* Skip over forward assertions; the other assertions are skipped by    /* Skip over forward assertions; the other assertions are skipped by
2089    first_significant_code() with a TRUE final argument. */    first_significant_code() with a TRUE final argument. */
2090    
# Line 2405  for (code = first_significant_code(code Line 2112  for (code = first_significant_code(code
2112      NULL. */      NULL. */
2113    
2114      if (cd->start_workspace != NULL)      if (cd->start_workspace != NULL)
2115        {        {
2116        const pcre_uchar *tcode;        const pcre_uchar *tcode;
2117        for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)        for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2118          if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;          if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2119        if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */        if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2120        }        }
2121    
2122      /* If we are scanning a completed pattern, there are no forward references      /* If we are scanning a completed pattern, there are no forward references
2123      and all groups are complete. We need to detect whether this is a recursive      and all groups are complete. We need to detect whether this is a recursive
2124      call, as otherwise there will be an infinite loop. If it is a recursion,      call, as otherwise there will be an infinite loop. If it is a recursion,
2125      just skip over it. Simple recursions are easily detected. For mutual      just skip over it. Simple recursions are easily detected. For mutual
2126      recursions we keep a chain on the stack. */      recursions we keep a chain on the stack. */
2127    
2128      else      else
2129        {        {
2130        recurse_check *r = recurses;        recurse_check *r = recurses;
2131        const pcre_uchar *endgroup = scode;        const pcre_uchar *endgroup = scode;
2132    
2133        do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);        do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2134        if (code >= scode && code <= endgroup) continue;  /* Simple recursion */        if (code >= scode && code <= endgroup) continue;  /* Simple recursion */
2135    
2136        for (r = recurses; r != NULL; r = r->prev)        for (r = recurses; r != NULL; r = r->prev)
2137          if (r->group == scode) break;          if (r->group == scode) break;
2138        if (r != NULL) continue;   /* Mutual recursion */        if (r != NULL) continue;   /* Mutual recursion */
# Line 2436  for (code = first_significant_code(code Line 2143  for (code = first_significant_code(code
2143    
2144      empty_branch = FALSE;      empty_branch = FALSE;
2145      this_recurse.prev = recurses;      this_recurse.prev = recurses;
2146      this_recurse.group = scode;      this_recurse.group = scode;
2147    
2148      do      do
2149        {        {
2150        if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))        if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
# Line 2557  for (code = first_significant_code(code Line 2264  for (code = first_significant_code(code
2264      case OP_ANY:      case OP_ANY:
2265      case OP_ALLANY:      case OP_ALLANY:
2266      case OP_ANYBYTE:      case OP_ANYBYTE:
2267    
2268      case OP_PROP:      case OP_PROP:
2269      case OP_NOTPROP:      case OP_NOTPROP:
2270      case OP_ANYNL:      case OP_ANYNL:
2271    
2272      case OP_NOT_HSPACE:      case OP_NOT_HSPACE:
2273      case OP_HSPACE:      case OP_HSPACE:
2274      case OP_NOT_VSPACE:      case OP_NOT_VSPACE:
2275      case OP_VSPACE:      case OP_VSPACE:
2276      case OP_EXTUNI:      case OP_EXTUNI:
2277    
2278      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
2279      case OP_DIGIT:      case OP_DIGIT:
2280      case OP_NOT_WHITESPACE:      case OP_NOT_WHITESPACE:
2281      case OP_WHITESPACE:      case OP_WHITESPACE:
2282      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
2283      case OP_WORDCHAR:      case OP_WORDCHAR:
2284    
2285      case OP_CHAR:      case OP_CHAR:
2286      case OP_CHARI:      case OP_CHARI:
2287      case OP_NOT:      case OP_NOT:
2288      case OP_NOTI:      case OP_NOTI:
2289    
2290      case OP_PLUS:      case OP_PLUS:
2291      case OP_PLUSI:      case OP_PLUSI:
2292      case OP_MINPLUS:      case OP_MINPLUS:
# Line 2589  for (code = first_significant_code(code Line 2296  for (code = first_significant_code(code
2296      case OP_NOTPLUSI:      case OP_NOTPLUSI:
2297      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
2298      case OP_NOTMINPLUSI:      case OP_NOTMINPLUSI:
2299    
2300      case OP_POSPLUS:      case OP_POSPLUS:
2301      case OP_POSPLUSI:      case OP_POSPLUSI:
2302      case OP_NOTPOSPLUS:      case OP_NOTPOSPLUS:
2303      case OP_NOTPOSPLUSI:      case OP_NOTPOSPLUSI:
2304    
2305      case OP_EXACT:      case OP_EXACT:
2306      case OP_EXACTI:      case OP_EXACTI:
2307      case OP_NOTEXACT:      case OP_NOTEXACT:
2308      case OP_NOTEXACTI:      case OP_NOTEXACTI:
2309    
2310      case OP_TYPEPLUS:      case OP_TYPEPLUS:
2311      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
2312      case OP_TYPEPOSPLUS:      case OP_TYPEPOSPLUS:
2313      case OP_TYPEEXACT:      case OP_TYPEEXACT:
2314    
2315      return FALSE;      return FALSE;
2316    
2317      /* These are going to continue, as they may be empty, but we have to      /* These are going to continue, as they may be empty, but we have to
# Line 2644  for (code = first_significant_code(code Line 2351  for (code = first_significant_code(code
2351  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2352      case OP_STAR:      case OP_STAR:
2353      case OP_STARI:      case OP_STARI:
2354      case OP_NOTSTAR:      case OP_NOTSTAR:
2355      case OP_NOTSTARI:      case OP_NOTSTARI:
2356    
2357      case OP_MINSTAR:      case OP_MINSTAR:
2358      case OP_MINSTARI:      case OP_MINSTARI:
2359      case OP_NOTMINSTAR:      case OP_NOTMINSTAR:
2360      case OP_NOTMINSTARI:      case OP_NOTMINSTARI:
2361    
2362      case OP_POSSTAR:      case OP_POSSTAR:
2363      case OP_POSSTARI:      case OP_POSSTARI:
2364      case OP_NOTPOSSTAR:      case OP_NOTPOSSTAR:
2365      case OP_NOTPOSSTARI:      case OP_NOTPOSSTARI:
2366    
2367      case OP_QUERY:      case OP_QUERY:
2368      case OP_QUERYI:      case OP_QUERYI:
2369      case OP_NOTQUERY:      case OP_NOTQUERY:
2370      case OP_NOTQUERYI:      case OP_NOTQUERYI:
2371    
2372      case OP_MINQUERY:      case OP_MINQUERY:
2373      case OP_MINQUERYI:      case OP_MINQUERYI:
2374      case OP_NOTMINQUERY:      case OP_NOTMINQUERY:
2375      case OP_NOTMINQUERYI:      case OP_NOTMINQUERYI:
2376    
2377      case OP_POSQUERY:      case OP_POSQUERY:
2378      case OP_POSQUERYI:      case OP_POSQUERYI:
2379      case OP_NOTPOSQUERY:      case OP_NOTPOSQUERY:
2380      case OP_NOTPOSQUERYI:      case OP_NOTPOSQUERYI:
2381    
2382      if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);      if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2383      break;      break;
2384    
2385      case OP_UPTO:      case OP_UPTO:
2386      case OP_UPTOI:      case OP_UPTOI:
2387      case OP_NOTUPTO:      case OP_NOTUPTO:
2388      case OP_NOTUPTOI:      case OP_NOTUPTOI:
2389    
2390      case OP_MINUPTO:      case OP_MINUPTO:
2391      case OP_MINUPTOI:      case OP_MINUPTOI:
2392      case OP_NOTMINUPTO:      case OP_NOTMINUPTO:
2393      case OP_NOTMINUPTOI:      case OP_NOTMINUPTOI:
2394    
2395      case OP_POSUPTO:      case OP_POSUPTO:
2396      case OP_POSUPTOI:      case OP_POSUPTOI:
2397      case OP_NOTPOSUPTO:      case OP_NOTPOSUPTO:
2398      case OP_NOTPOSUPTOI:      case OP_NOTPOSUPTOI:
2399    
2400      if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);      if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2401      break;      break;
2402  #endif  #endif
# Line 3754  to find out the amount of memory needed, Line 3461  to find out the amount of memory needed,
3461  phase. The value of lengthptr distinguishes the two phases.  phase. The value of lengthptr distinguishes the two phases.
3462    
3463  Arguments:  Arguments:
3464    optionsptr     pointer to the option bits    optionsptr        pointer to the option bits
3465    codeptr        points to the pointer to the current code point    codeptr           points to the pointer to the current code point
3466    ptrptr         points to the current pattern pointer    ptrptr            points to the current pattern pointer
3467    errorcodeptr   points to error code variable    errorcodeptr      points to error code variable
3468    firstcharptr    place to put the first required character    firstcharptr      place to put the first required character
3469    firstcharflagsptr place to put the first character flags, or a negative number    firstcharflagsptr place to put the first character flags, or a negative number
3470    reqcharptr     place to put the last required character    reqcharptr        place to put the last required character
3471    reqcharflagsptr place to put the last required character flags, or a negative number    reqcharflagsptr   place to put the last required character flags, or a negative number
3472    bcptr          points to current branch chain    bcptr             points to current branch chain
3473    cond_depth     conditional nesting depth    cond_depth        conditional nesting depth
3474    cd             contains pointers to tables etc.    cd                contains pointers to tables etc.
3475    lengthptr      NULL during the real compile phase    lengthptr         NULL during the real compile phase
3476                   points to length accumulator during pre-compile phase                      points to length accumulator during pre-compile phase
3477    
3478  Returns:         TRUE on success  Returns:            TRUE on success
3479                   FALSE, with *errorcodeptr set non-zero on error                      FALSE, with *errorcodeptr set non-zero on error
3480  */  */
3481    
3482  static BOOL  static BOOL
# Line 5949  for (;; ptr++) Line 5656  for (;; ptr++)
5656            slot += cd->name_entry_size;            slot += cd->name_entry_size;
5657            }            }
5658    
5659          /* Found a previous named subpattern */          /* Found the named subpattern */
5660    
5661          if (i < cd->names_found)          if (i < cd->names_found)
5662            {            {
# Line 5958  for (;; ptr++) Line 5665  for (;; ptr++)
5665            code[1+LINK_SIZE]++;            code[1+LINK_SIZE]++;
5666            }            }
5667    
         /* Search the pattern for a forward reference */  
   
         else if ((i = find_parens(cd, name, namelen,  
                         (options & PCRE_EXTENDED) != 0, utf)) > 0)  
           {  
           PUT2(code, 2+LINK_SIZE, i);  
           code[1+LINK_SIZE]++;  
           }  
   
5668          /* If terminator == CHAR_NULL it means that the name followed directly          /* If terminator == CHAR_NULL it means that the name followed directly
5669          after the opening parenthesis [e.g. (?(abc)...] and in this case there          after the opening parenthesis [e.g. (?(abc)...] and in this case there
5670          are some further alternatives to try. For the cases where terminator !=          are some further alternatives to try. For the cases where terminator !=
# Line 6130  for (;; ptr++) Line 5828  for (;; ptr++)
5828          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
5829          DEFINE_NAME:    /* Come here from (?< handling */          DEFINE_NAME:    /* Come here from (?< handling */
5830          case CHAR_APOSTROPHE:          case CHAR_APOSTROPHE:
5831            {          terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
5832            terminator = (*ptr == CHAR_LESS_THAN_SIGN)?            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5833              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;          name = ++ptr;
           name = ++ptr;  
5834    
5835            while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
5836            namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
5837    
5838            /* In the pre-compile phase, just do a syntax check. */          /* In the pre-compile phase, do a syntax check, remember the longest
5839            name, and then remember the group in a vector, expanding it if
5840            necessary. Duplicates for the same number are skipped; other duplicates
5841            are checked for validity. In the actual compile, there is nothing to
5842            do. */
5843    
5844            if (lengthptr != NULL)          if (lengthptr != NULL)
5845              {
5846              named_group *ng;
5847              pcre_uint32 number = cd->bracount + 1;
5848    
5849              if (*ptr != (pcre_uchar)terminator)
5850              {              {
5851              if (*ptr != (pcre_uchar)terminator)              *errorcodeptr = ERR42;
5852                {              goto FAILED;
5853                *errorcodeptr = ERR42;              }
5854                goto FAILED;  
5855                }            if (cd->names_found >= MAX_NAME_COUNT)
5856              if (cd->names_found >= MAX_NAME_COUNT)              {
5857                *errorcodeptr = ERR49;
5858                goto FAILED;
5859                }
5860    
5861              if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
5862                {
5863                cd->name_entry_size = namelen + IMM2_SIZE + 1;
5864                if (namelen > MAX_NAME_SIZE)
5865                {                {
5866                *errorcodeptr = ERR49;                *errorcodeptr = ERR48;
5867                goto FAILED;                goto FAILED;
5868                }                }
             if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)  
               {  
               cd->name_entry_size = namelen + IMM2_SIZE + 1;  
               if (namelen > MAX_NAME_SIZE)  
                 {  
                 *errorcodeptr = ERR48;  
                 goto FAILED;  
                 }  
               }  
5869              }              }
5870    
5871            /* In the real compile, create the entry in the table, maintaining            /* Scan the list to check for duplicates. For duplicate names, if the
5872            alphabetical order. Duplicate names for different numbers are            number is the same, break the loop, which causes the name to be
5873            permitted only if PCRE_DUPNAMES is set. Duplicate names for the same            discarded; otherwise, if DUPNAMES is not set, give an error.
5874            number are always OK. (An existing number can be re-used if (?|            If it is set, allow the name with a different number, but continue
5875            appears in the pattern.) In either event, a duplicate name results in            scanning in case this is a duplicate with the same number. For
5876            a duplicate entry in the table, even if the number is the same. This            non-duplicate names, give an error if the number is duplicated. */
5877            is because the number of names, and hence the table size, is computed  
5878            in the pre-compile, and it affects various numbers and pointers which            ng = cd->named_groups;
5879            would all have to be modified, and the compiled code moved down, if            for (i = 0; i < cd->names_found; i++, ng++)
5880            duplicates with the same number were omitted from the table. This              {
5881            doesn't seem worth the hassle. However, *different* names for the              if (namelen == ng->length &&
5882            same number are not permitted. */                  STRNCMP_UC_UC(name, ng->name, namelen) == 0)
5883                  {
5884            else                if (ng->number == number) break;
5885              {                if ((options & PCRE_DUPNAMES) == 0)
             BOOL dupname = FALSE;  
             slot = cd->name_table;  
   
             for (i = 0; i < cd->names_found; i++)  
               {  
               int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(namelen));  
               if (crc == 0)  
                 {  
                 if (slot[IMM2_SIZE+namelen] == 0)  
                   {  
                   if (GET2(slot, 0) != cd->bracount + 1 &&  
                       (options & PCRE_DUPNAMES) == 0)  
                     {  
                     *errorcodeptr = ERR43;  
                     goto FAILED;  
                     }  
                   else dupname = TRUE;  
                   }  
                 else crc = -1;      /* Current name is a substring */  
                 }  
   
               /* Make space in the table and break the loop for an earlier  
               name. For a duplicate or later name, carry on. We do this for  
               duplicates so that in the simple case (when (?| is not used) they  
               are in order of their numbers. */  
   
               if (crc < 0)  
5886                  {                  {
5887                  memmove(slot + cd->name_entry_size, slot,                  *errorcodeptr = ERR43;
5888                    IN_UCHARS((cd->names_found - i) * cd->name_entry_size));                  goto FAILED;
5889                  break;                  }
5890                  }                }
5891                else if (ng->number == number)
5892                /* Continue the loop for a later or duplicate name */                {
5893                  *errorcodeptr = ERR65;
5894                slot += cd->name_entry_size;                goto FAILED;
5895                }                }
5896                }
             /* For non-duplicate names, check for a duplicate number before  
             adding the new name. */  
5897    
5898              if (!dupname)            if (i >= cd->names_found)     /* Not a duplicate with same number */
5899                {
5900                /* Increase the list size if necessary */
5901    
5902                if (cd->names_found >= cd->named_group_list_size)
5903                {                {
5904                pcre_uchar *cslot = cd->name_table;                int newsize = cd->named_group_list_size * 2;
5905                for (i = 0; i < cd->names_found; i++)                named_group *newspace = (PUBL(malloc))
5906                    (newsize * sizeof(named_group));
5907    
5908                  if (newspace == NULL)
5909                  {                  {
5910                  if (cslot != slot)                  *errorcodeptr = ERR21;
5911                    {                  goto FAILED;
5912                    if (GET2(cslot, 0) == cd->bracount + 1)                  }
5913                      {  
5914                      *errorcodeptr = ERR65;                memcpy(newspace, cd->named_groups,
5915                      goto FAILED;                  cd->named_group_list_size * sizeof(named_group));
5916                      }                if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
5917                    }                  (PUBL(free))((void *)cd->named_groups);
5918                  else i--;                cd->named_groups = newspace;
5919                  cslot += cd->name_entry_size;                cd->named_group_list_size = newsize;
5920                  }                }
5921                }  
5922                cd->named_groups[cd->names_found].name = name;
5923              PUT2(slot, 0, cd->bracount + 1);              cd->named_groups[cd->names_found].length = namelen;
5924              memcpy(slot + IMM2_SIZE, name, IN_UCHARS(namelen));              cd->named_groups[cd->names_found].number = number;
5925              slot[IMM2_SIZE + namelen] = 0;              cd->names_found++;
5926              }              }
5927            }            }
5928    
5929          /* In both pre-compile and compile, count the number of names we've          ptr++;                    /* Move past > or ' in both passes. */
         encountered. */  
   
         cd->names_found++;  
         ptr++;                    /* Move past > or ' */  
5930          goto NUMBERED_GROUP;          goto NUMBERED_GROUP;
5931    
5932    
# Line 6277  for (;; ptr++) Line 5956  for (;; ptr++)
5956    
5957          if (lengthptr != NULL)          if (lengthptr != NULL)
5958            {            {
5959            const pcre_uchar *temp;            named_group *ng;
5960    
5961            if (namelen == 0)            if (namelen == 0)
5962              {              {
5963              *errorcodeptr = ERR62;              *errorcodeptr = ERR62;
# Line 6295  for (;; ptr++) Line 5974  for (;; ptr++)
5974              goto FAILED;              goto FAILED;
5975              }              }
5976    
5977            /* The name table does not exist in the first pass, so we cannot            /* The name table does not exist in the first pass; instead we must
5978            do a simple search as in the code below. Instead, we have to scan the            scan the list of names encountered so far in order to get the
5979            pattern to find the number. It is important that we scan it only as            number. If the name is not found, set the value to 0 for a forward
5980            far as we have got because the syntax of named subpatterns has not            reference. */
5981            been checked for the rest of the pattern, and find_parens() assumes  
5982            correct syntax. In any case, it's a waste of resources to scan            ng = cd->named_groups;
5983            further. We stop the scan at the current point by temporarily            for (i = 0; i < cd->names_found; i++, ng++)
5984            adjusting the value of cd->endpattern. */              {
5985                if (namelen == ng->length &&
5986            temp = cd->end_pattern;                  STRNCMP_UC_UC(name, ng->name, namelen) == 0)
5987            cd->end_pattern = ptr;                break;
5988            recno = find_parens(cd, name, namelen,              }
5989              (options & PCRE_EXTENDED) != 0, utf);            recno = (i < cd->names_found)? ng->number : 0;
           cd->end_pattern = temp;  
           if (recno < 0) recno = 0;    /* Forward ref; set dummy number */  
5990            }            }
5991    
5992          /* In the real compile, seek the name in the table. We check the name          /* In the real compile, search the name table. We check the name
5993          first, and then check that we have reached the end of the name in the          first, and then check that we have reached the end of the name in the
5994          table. That way, if the name that is longer than any in the table,          table. That way, if the name is longer than any in the table, the
5995          the comparison will fail without reading beyond the table entry. */          comparison will fail without reading beyond the table entry. */
5996    
5997          else          else
5998            {            {
# Line 6328  for (;; ptr++) Line 6005  for (;; ptr++)
6005              slot += cd->name_entry_size;              slot += cd->name_entry_size;
6006              }              }
6007    
6008            if (i < cd->names_found)         /* Back reference */            if (i < cd->names_found)
6009              {              {
6010              recno = GET2(slot, 0);              recno = GET2(slot, 0);
6011              }              }
6012            else if ((recno =                /* Forward back reference */            else
                     find_parens(cd, name, namelen,  
                       (options & PCRE_EXTENDED) != 0, utf)) <= 0)  
6013              {              {
6014              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
6015              goto FAILED;              goto FAILED;
# Line 6444  for (;; ptr++) Line 6119  for (;; ptr++)
6119    
6120              if (called == NULL)              if (called == NULL)
6121                {                {
6122                if (find_parens(cd, NULL, recno,                if (recno > cd->final_bracount)
                     (options & PCRE_EXTENDED) != 0, utf) < 0)  
6123                  {                  {
6124                  *errorcodeptr = ERR15;                  *errorcodeptr = ERR15;
6125                  goto FAILED;                  goto FAILED;
# Line 7058  for (;; ptr++) Line 6732  for (;; ptr++)
6732          *code++ = OP_PROP;          *code++ = OP_PROP;
6733          *code++ = PT_CLIST;          *code++ = PT_CLIST;
6734          *code++ = c;          *code++ = c;
6735          if (firstcharflags == REQ_UNSET) firstcharflags = zerofirstcharflags = REQ_NONE;          if (firstcharflags == REQ_UNSET)
6736              firstcharflags = zerofirstcharflags = REQ_NONE;
6737          break;          break;
6738          }          }
6739        }        }
# Line 7147  out the amount of memory needed, as well Line 6822  out the amount of memory needed, as well
6822  value of lengthptr distinguishes the two phases.  value of lengthptr distinguishes the two phases.
6823    
6824  Arguments:  Arguments:
6825    options        option bits, including any changes for this subpattern    options           option bits, including any changes for this subpattern
6826    codeptr        -> the address of the current code pointer    codeptr           -> the address of the current code pointer
6827    ptrptr         -> the address of the current pattern pointer    ptrptr            -> the address of the current pattern pointer
6828    errorcodeptr   -> pointer to error code variable    errorcodeptr      -> pointer to error code variable
6829    lookbehind     TRUE if this is a lookbehind assertion    lookbehind        TRUE if this is a lookbehind assertion
6830    reset_bracount TRUE to reset the count for each branch    reset_bracount    TRUE to reset the count for each branch
6831    skipbytes      skip this many bytes at start (for brackets and OP_COND)    skipbytes         skip this many bytes at start (for brackets and OP_COND)
6832    cond_depth     depth of nesting for conditional subpatterns    cond_depth        depth of nesting for conditional subpatterns
6833    firstcharptr    place to put the first required character    firstcharptr      place to put the first required character
6834    firstcharflagsptr place to put the first character flags, or a negative number    firstcharflagsptr place to put the first character flags, or a negative number
6835    reqcharptr     place to put the last required character    reqcharptr        place to put the last required character
6836    reqcharflagsptr place to put the last required character flags, or a negative number    reqcharflagsptr   place to put the last required character flags, or a negative number
6837    bcptr          pointer to the chain of currently open branches    bcptr             pointer to the chain of currently open branches
6838    cd             points to the data block with tables pointers etc.    cd                points to the data block with tables pointers etc.
6839    lengthptr      NULL during the real compile phase    lengthptr         NULL during the real compile phase
6840                   points to length accumulator during pre-compile phase                      points to length accumulator during pre-compile phase
6841    
6842  Returns:         TRUE on success  Returns:            TRUE on success
6843  */  */
6844    
6845  static BOOL  static BOOL
# Line 7701  return TRUE; Line 7376  return TRUE;
7376  discarded, because they can cause conflicts with actual literals that follow.  discarded, because they can cause conflicts with actual literals that follow.
7377  However, if we end up without a first char setting for an unanchored pattern,  However, if we end up without a first char setting for an unanchored pattern,
7378  it is worth scanning the regex to see if there is an initial asserted first  it is worth scanning the regex to see if there is an initial asserted first
7379  char. If all branches start with the same asserted char, or with a bracket all  char. If all branches start with the same asserted char, or with a
7380  of whose alternatives start with the same asserted char (recurse ad lib), then  non-conditional bracket all of whose alternatives start with the same asserted
7381  we return that char, otherwise -1.  char (recurse ad lib), then we return that char, with the flags set to zero or
7382    REQ_CASELESS; otherwise return zero with REQ_NONE in the flags.
7383    
7384  Arguments:  Arguments:
7385    code       points to start of expression (the bracket)    code       points to start of expression (the bracket)
7386    flags       points to the first char flags, or to REQ_NONE    flags      points to the first char flags, or to REQ_NONE
7387    inassert   TRUE if in an assertion    inassert   TRUE if in an assertion
7388    
7389  Returns:     the fixed first char, or 0 with REQ_NONE in flags  Returns:     the fixed first char, or 0 with REQ_NONE in flags
# Line 7744  do { Line 7420  do {
7420       case OP_ASSERT:       case OP_ASSERT:
7421       case OP_ONCE:       case OP_ONCE:
7422       case OP_ONCE_NC:       case OP_ONCE_NC:
      case OP_COND:  
7423       d = find_firstassertedchar(scode, &dflags, op == OP_ASSERT);       d = find_firstassertedchar(scode, &dflags, op == OP_ASSERT);
7424       if (dflags < 0)       if (dflags < 0)
7425         return 0;         return 0;
# Line 7789  return c; Line 7464  return c;
7464    
7465    
7466  /*************************************************  /*************************************************
7467    *     Add an entry to the name/number table      *
7468    *************************************************/
7469    
7470    /* This function is called between compiling passes to add an entry to the
7471    name/number table, maintaining alphabetical order. Checking for permitted
7472    and forbidden duplicates has already been done.
7473    
7474    Arguments:
7475      cd           the compile data block
7476      name         the name to add
7477      length       the length of the name
7478      groupno      the group number
7479    
7480    Returns:       nothing
7481    */
7482    
7483    static void
7484    add_name(compile_data *cd, const pcre_uchar *name, int length,
7485      unsigned int groupno)
7486    {
7487    int i;
7488    pcre_uchar *slot = cd->name_table;
7489    
7490    for (i = 0; i < cd->names_found; i++)
7491      {
7492      int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(length));
7493      if (crc == 0 && slot[IMM2_SIZE+length] != 0)
7494        crc = -1; /* Current name is a substring */
7495    
7496      /* Make space in the table and break the loop for an earlier name. For a
7497      duplicate or later name, carry on. We do this for duplicates so that in the
7498    simple case (when (?| is not used) they are in order of their numbers. In all
7499      cases they are in the order in which they appear in the pattern. */
7500    
7501      if (crc < 0)
7502        {
7503        memmove(slot + cd->name_entry_size, slot,
7504          IN_UCHARS((cd->names_found - i) * cd->name_entry_size));
7505        break;
7506        }
7507    
7508      /* Continue the loop for a later or duplicate name */
7509    
7510      slot += cd->name_entry_size;
7511      }
7512    
7513    PUT2(slot, 0, groupno);
7514    memcpy(slot + IMM2_SIZE, name, IN_UCHARS(length));
7515    slot[IMM2_SIZE + length] = 0;
7516    cd->names_found++;
7517    }
7518    
7519    
7520    
7521    /*************************************************
7522  *        Compile a Regular Expression            *  *        Compile a Regular Expression            *
7523  *************************************************/  *************************************************/
7524    
# Line 7875  new memory is obtained from malloc(). */ Line 7605  new memory is obtained from malloc(). */
7605    
7606  pcre_uchar cworkspace[COMPILE_WORK_SIZE];  pcre_uchar cworkspace[COMPILE_WORK_SIZE];
7607    
7608    /* This vector is used for remembering named groups during the pre-compile. In a
7609    similar way to cworkspace, it can be expanded using malloc() if necessary. */
7610    
7611    named_group named_groups[NAMED_GROUP_LIST_SIZE];
7612    
7613  /* Set this early so that early errors get offset 0. */  /* Set this early so that early errors get offset 0. */
7614    
7615  ptr = (const pcre_uchar *)pattern;  ptr = (const pcre_uchar *)pattern;
# Line 8141  cd->start_code = cworkspace; Line 7876  cd->start_code = cworkspace;
7876  cd->hwm = cworkspace;  cd->hwm = cworkspace;
7877  cd->start_workspace = cworkspace;  cd->start_workspace = cworkspace;
7878  cd->workspace_size = COMPILE_WORK_SIZE;  cd->workspace_size = COMPILE_WORK_SIZE;
7879    cd->named_groups = named_groups;
7880    cd->named_group_list_size = NAMED_GROUP_LIST_SIZE;
7881  cd->start_pattern = (const pcre_uchar *)pattern;  cd->start_pattern = (const pcre_uchar *)pattern;
7882  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
7883  cd->req_varyopt = 0;  cd->req_varyopt = 0;
# Line 8223  cd->final_bracount = cd->bracount;  /* S Line 7960  cd->final_bracount = cd->bracount;  /* S
7960  cd->assert_depth = 0;  cd->assert_depth = 0;
7961  cd->bracount = 0;  cd->bracount = 0;
7962  cd->max_lookbehind = 0;  cd->max_lookbehind = 0;
 cd->names_found = 0;  
7963  cd->name_table = (pcre_uchar *)re + re->name_table_offset;  cd->name_table = (pcre_uchar *)re + re->name_table_offset;
7964  codestart = cd->name_table + re->name_entry_size * re->name_count;  codestart = cd->name_table + re->name_entry_size * re->name_count;
7965  cd->start_code = codestart;  cd->start_code = codestart;
# Line 8234  cd->had_pruneorskip = FALSE; Line 7970  cd->had_pruneorskip = FALSE;
7970  cd->check_lookbehind = FALSE;  cd->check_lookbehind = FALSE;
7971  cd->open_caps = NULL;  cd->open_caps = NULL;
7972    
7973    /* If any named groups were found, create the name/number table from the list
7974    created in the first pass. */
7975    
7976    if (cd->names_found > 0)
7977      {
7978      int i = cd->names_found;
7979      named_group *ng = cd->named_groups;
7980      cd->names_found = 0;
7981      for (; i > 0; i--, ng++)
7982        add_name(cd, ng->name, ng->length, ng->number);
7983      if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
7984        (PUBL(free))((void *)cd->named_groups);
7985      }
7986    
7987  /* Set up a starting, non-extracting bracket, then compile the expression. On  /* Set up a starting, non-extracting bracket, then compile the expression. On
7988  error, errorcode will be set non-zero, so we don't need to look at the result  error, errorcode will be set non-zero, so we don't need to look at the result
7989  of the function here. */  of the function here. */
# Line 8297  if (cd->hwm > cd->start_workspace) Line 8047  if (cd->hwm > cd->start_workspace)
8047      }      }
8048    }    }
8049    
8050  /* If the workspace had to be expanded, free the new memory. Set the pointer to  /* If the workspace had to be expanded, free the new memory. Set the pointer to
8051  NULL to indicate that forward references have been filled in. */  NULL to indicate that forward references have been filled in. */
8052    
8053  if (cd->workspace_size > COMPILE_WORK_SIZE)  if (cd->workspace_size > COMPILE_WORK_SIZE)
8054    (PUBL(free))((void *)cd->start_workspace);    (PUBL(free))((void *)cd->start_workspace);
8055  cd->start_workspace = NULL;  cd->start_workspace = NULL;
8056    
8057  /* Give an error if there's a back reference to a non-existent capturing  /* Give an error if there's a back reference to a non-existent capturing
8058  subpattern. */  subpattern. */
# Line 8506  if (code - codestart > length) Line 8256  if (code - codestart > length)
8256    }    }
8257  #endif   /* PCRE_DEBUG */  #endif   /* PCRE_DEBUG */
8258    
8259  /* Check for a pattern that can match an empty string, so that this information  /* Check for a pattern that can match an empty string, so that this information
8260  can be provided to applications. */  can be provided to applications. */
8261    
8262  do  do
# Line 8515  do Line 8265  do
8265      {      {
8266      re->flags |= PCRE_MATCH_EMPTY;      re->flags |= PCRE_MATCH_EMPTY;
8267      break;      break;
8268      }      }
8269    codestart += GET(codestart, 1);    codestart += GET(codestart, 1);
8270    }    }
8271  while (*codestart == OP_ALT);  while (*codestart == OP_ALT);

Legend:
Removed from v.1348  
changed lines
  Added in v.1360

  ViewVC Help
Powered by ViewVC 1.1.5