/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 1348 by ph10, Fri Jul 5 10:38:37 2013 UTC | revision 1369 by ph10, Tue Oct 8 15:06:46 2013 UTC
# Line 115  kicks in at the same number of forward r Line 115  kicks in at the same number of forward r
115  #define COMPILE_WORK_SIZE (2048*LINK_SIZE)  #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
116  #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)  #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
117    
118    /* This value determines the size of the initial vector that is used for
119    remembering named groups during the pre-compile. It is allocated on the stack,
120    but if it is too small, it is expanded using malloc(), in a similar way to the
121    workspace. The value is the number of slots in the list. */
122    
123    #define NAMED_GROUP_LIST_SIZE  20
124    
125  /* The overrun tests check for a slightly smaller size so that they detect the  /* The overrun tests check for a slightly smaller size so that they detect the
126  overrun before it actually does run off the end of the data block. */  overrun before it actually does run off the end of the data block. */
127    
# Line 648  static const pcre_uint8 ebcdic_chartab[] Line 655  static const pcre_uint8 ebcdic_chartab[]
655  #endif  #endif
656    
657    
658    /* This table is used to check whether auto-possessification is possible
659    between adjacent character-type opcodes. The left-hand (repeated) opcode is
660    used to select the row, and the right-hand opcode is used to select the column.
661    A value of 1 means that auto-possessification is OK. For example, the second
662    value in the first row means that \D+\d can be turned into \D++\d.
663    
664    The Unicode property types (\P and \p) have to be present to fill out the table
665    because of what their opcode values are, but the table values should always be
666    zero because property types are handled separately in the code. The last four
667    columns apply to items that cannot be repeated, so there is no need to have
668    rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
669    *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
670    
671    #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
672    #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
673    
674    static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
675    /* \D \d \S \s \W \w  . .+ \C \P \p \R \H \h \V \v \X \Z \z  $ $M */
676      { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \D */
677      { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \d */
678      { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \S */
679      { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \s */
680      { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \W */
681      { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \w */
682      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .  */
683      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .+ */
684      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \C */
685      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \P */
686      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \p */
687      { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \R */
688      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \H */
689      { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \h */
690      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \V */
691      { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 },  /* \v */
692      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }   /* \X */
693    };
694    
695    
696    /* This table is used to check whether auto-possessification is possible
697    between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
698    left-hand (repeated) opcode is used to select the row, and the right-hand
699    opcode is used to select the column. The values are as follows:
700    
701      0   Always return FALSE (never auto-possessify)
702      1   Character groups are distinct (possessify if both are OP_PROP)
703      2   Check character categories in the same group (general or particular)
704      3   TRUE if the two opcodes are not the same (PROP vs NOTPROP)
705    
706      4   Check left general category vs right particular category
707      5   Check right general category vs left particular category
708    
709      6   Left alphanum vs right general category
710      7   Left space vs right general category
711      8   Left word vs right general category
712    
713      9   Right alphanum vs left general category
714     10   Right space vs left general category
715     11   Right word vs left general category
716    
717     12   Left alphanum vs right particular category
718     13   Left space vs right particular category
719     14   Left word vs right particular category
720    
721     15   Right alphanum vs left particular category
722     16   Right space vs left particular category
723     17   Right word vs left particular category
724    */
725    
726    static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
727    /* ANY LAMP GC  PC  SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
728      { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_ANY */
729      { 0,  3,  0,  0,  0,    3,    1,      1,   0,    0,   0 },  /* PT_LAMP */
730      { 0,  0,  2,  4,  0,    9,   10,     10,  11,    0,   0 },  /* PT_GC */
731      { 0,  0,  5,  2,  0,   15,   16,     16,  17,    0,   0 },  /* PT_PC */
732      { 0,  0,  0,  0,  2,    0,    0,      0,   0,    0,   0 },  /* PT_SC */
733      { 0,  3,  6, 12,  0,    3,    1,      1,   0,    0,   0 },  /* PT_ALNUM */
734      { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_SPACE */
735      { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_PXSPACE */
736      { 0,  0,  8, 14,  0,    0,    1,      1,   3,    0,   0 },  /* PT_WORD */
737      { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_CLIST */
738      { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   3 }   /* PT_UCNC */
739    };
740    
741    /* This table is used to check whether auto-possessification is possible
742    between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
743    specifies a general category and the other specifies a particular category. The
744    row is selected by the general category and the column by the particular
745    category. The value is 1 if the particular category is not part of the general
746    category. */
747    
748    static const pcre_uint8 catposstab[7][30] = {
749    /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
750      { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* C */
751      { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* L */
752      { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* M */
753      { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* N */
754      { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },  /* P */
755      { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 },  /* S */
756      { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }   /* Z */
757    };
758    
759    /* This table is used when checking ALNUM, (PX)SPACE, and WORD against
760    a general or particular category. The properties in each row are those
761    that apply to the character set in question. Duplication means that a little
762    unnecessary work is done when checking, but this keeps things much simpler
763    because they can all use the same code. For more details see the comment where
764    this table is used.
765    
766    Note: SPACE and PXSPACE used to be different because Perl excluded VT from
767    "space", but from Perl 5.18 it's included, so both categories are treated the
768    same here. */
769    
770    static const pcre_uint8 posspropstab[3][4] = {
771      { ucp_L, ucp_N, ucp_N, ucp_Nl },  /* ALNUM, 3rd and 4th values redundant */
772      { ucp_Z, ucp_Z, ucp_C, ucp_Cc },  /* SPACE and PXSPACE, 2nd value redundant */
773      { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
774    };
775    
776    
777    
778  /*************************************************  /*************************************************
779  *            Find an error text                  *  *            Find an error text                  *
# Line 675  return s; Line 801  return s;
801  }  }
802    
803    
804    
805  /*************************************************  /*************************************************
806  *           Expand the workspace                 *  *           Expand the workspace                 *
807  *************************************************/  *************************************************/
# Line 752  return (*p == CHAR_RIGHT_CURLY_BRACKET); Line 879  return (*p == CHAR_RIGHT_CURLY_BRACKET);
879  *************************************************/  *************************************************/
880    
881  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
882  positive value for a simple escape such as \n, or 0 for a data character  positive value for a simple escape such as \n, or 0 for a data character which
883  which will be placed in chptr. A backreference to group n is returned as  will be placed in chptr. A backreference to group n is returned as negative n.
884  negative n. When UTF-8 is enabled, a positive value greater than 255 may  When UTF-8 is enabled, a positive value greater than 255 may be returned in
885  be returned in chptr.  chptr. On entry, ptr is pointing at the \. On exit, it is on the final
886  On entry,ptr is pointing at the \. On exit, it is on the final character of the  character of the escape sequence.
 escape sequence.  
887    
888  Arguments:  Arguments:
889    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
890    chptr          points to the data character    chptr          points to a returned data character
891    errorcodeptr   points to the errorcode variable    errorcodeptr   points to the errorcode variable
892    bracount       number of previous extracting brackets    bracount       number of previous extracting brackets
893    options        the options bits    options        the options bits
# Line 965  else Line 1091  else
1091      break;      break;
1092    
1093      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
1094      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. Perl has changed
1095      the way Perl works seems to be as follows:      over the years. Nowadays \g{} for backreferences and \o{} for octal are
1096        recommended to avoid the ambiguities in the old syntax.
1097    
1098      Outside a character class, the digits are read as a decimal number. If the      Outside a character class, the digits are read as a decimal number. If the
1099      number is less than 10, or if there are that many previous extracting      number is less than 8 (used to be 10), or if there are that many previous
1100      left brackets, then it is a back reference. Otherwise, up to three octal      extracting left brackets, then it is a back reference. Otherwise, up to
1101      digits are read to form an escaped byte. Thus \123 is likely to be octal      three octal digits are read to form an escaped byte. Thus \123 is likely to
1102      123 (cf \0123, which is octal 012 followed by the literal 3). If the octal      be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1103      value is greater than 377, the least significant 8 bits are taken. Inside a      the octal value is greater than 377, the least significant 8 bits are
1104      character class, \ followed by a digit is always an octal number. */      taken. \8 and \9 are treated as the literal characters 8 and 9.
1105    
1106        Inside a character class, \ followed by a digit is always either a literal
1107        8 or 9 or an octal number. */
1108    
1109      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1110      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
# Line 1001  else Line 1131  else
1131          *errorcodeptr = ERR61;          *errorcodeptr = ERR61;
1132          break;          break;
1133          }          }
1134        if (s < 10 || s <= bracount)        if (s < 8 || s <= bracount)  /* Check for back reference */
1135          {          {
1136          escape = -s;          escape = -s;
1137          break;          break;
# Line 1009  else Line 1139  else
1139        ptr = oldptr;      /* Put the pointer back and fall through */        ptr = oldptr;      /* Put the pointer back and fall through */
1140        }        }
1141    
1142      /* Handle an octal number following \. If the first digit is 8 or 9, Perl      /* Handle a digit following \ when the number is not a back reference. If
1143      generates a binary zero byte and treats the digit as a following literal.      the first digit is 8 or 9, Perl used to generate a binary zero byte and
1144      Thus we have to pull back the pointer by one. */      then treat the digit as a following literal. At least by Perl 5.18 this
1145        changed so as not to insert the binary zero. */
1146    
1147      if ((c = *ptr) >= CHAR_8)      if ((c = *ptr) >= CHAR_8) break;
1148        {  
1149        ptr--;      /* Fall through with a digit less than 8 */
       c = 0;  
       break;  
       }  
1150    
1151      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
1152      larger first octal digit. The original code used just to take the least      larger first octal digit. The original code used just to take the least
# Line 1192  if ((options & PCRE_UCP) != 0 && escape Line 1320  if ((options & PCRE_UCP) != 0 && escape
1320  return escape;  return escape;
1321  }  }
1322    
1323    
1324    
1325  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1326  /*************************************************  /*************************************************
1327  *               Handle \P and \p                 *  *               Handle \P and \p                 *
# Line 1289  return FALSE; Line 1419  return FALSE;
1419    
1420    
1421    
   
1422  /*************************************************  /*************************************************
1423  *         Read repeat counts                     *  *         Read repeat counts                     *
1424  *************************************************/  *************************************************/
# Line 1358  return p; Line 1487  return p;
1487    
1488    
1489  /*************************************************  /*************************************************
 *  Subroutine for finding forward reference      *  
 *************************************************/  
   
 /* This recursive function is called only from find_parens() below. The  
 top-level call starts at the beginning of the pattern. All other calls must  
 start at a parenthesis. It scans along a pattern's text looking for capturing  
 subpatterns, and counting them. If it finds a named pattern that matches the  
 name it is given, it returns its number. Alternatively, if the name is NULL, it  
 returns when it reaches a given numbered subpattern. Recursion is used to keep  
 track of subpatterns that reset the capturing group numbers - the (?| feature.  
   
 This function was originally called only from the second pass, in which we know  
 that if (?< or (?' or (?P< is encountered, the name will be correctly  
 terminated because that is checked in the first pass. There is now one call to  
 this function in the first pass, to check for a recursive back reference by  
 name (so that we can make the whole group atomic). In this case, we need check  
 only up to the current position in the pattern, and that is still OK because  
 any previous occurrences will have been checked. To make this work, the test  
 for "end of pattern" is a check against cd->end_pattern in the main loop,  
 instead of looking for a binary zero. This means that the special first-pass  
 call can adjust cd->end_pattern temporarily. (Checks for binary zero while  
 processing items within the loop are OK, because afterwards the main loop will  
 terminate.)  
   
 Arguments:  
   ptrptr       address of the current character pointer (updated)  
   cd           compile background data  
   name         name to seek, or NULL if seeking a numbered subpattern  
   lorn         name length, or subpattern number if name is NULL  
   xmode        TRUE if we are in /x mode  
   utf          TRUE if we are in UTF-8 / UTF-16 / UTF-32 mode  
   count        pointer to the current capturing subpattern number (updated)  
   
 Returns:       the number of the named subpattern, or -1 if not found  
 */  
   
 static int  
 find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn,  
   BOOL xmode, BOOL utf, int *count)  
 {  
 pcre_uchar *ptr = *ptrptr;  
 int start_count = *count;  
 int hwm_count = start_count;  
 BOOL dup_parens = FALSE;  
   
 /* If the first character is a parenthesis, check on the type of group we are  
 dealing with. The very first call may not start with a parenthesis. */  
   
 if (ptr[0] == CHAR_LEFT_PARENTHESIS)  
   {  
   /* Handle specials such as (*SKIP) or (*UTF8) etc. */  
   
   if (ptr[1] == CHAR_ASTERISK)  
     {  
     ptr += 2;  
     while (ptr < cd->end_pattern && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;  
     }  
   
   /* Handle a normal, unnamed capturing parenthesis. */  
   
   else if (ptr[1] != CHAR_QUESTION_MARK)  
     {  
     *count += 1;  
     if (name == NULL && *count == lorn) return *count;  
     ptr++;  
     }  
   
   /* All cases now have (? at the start. Remember when we are in a group  
   where the parenthesis numbers are duplicated. */  
   
   else if (ptr[2] == CHAR_VERTICAL_LINE)  
     {  
     ptr += 3;  
     dup_parens = TRUE;  
     }  
   
   /* Handle comments; all characters are allowed until a ket is reached. */  
   
   else if (ptr[2] == CHAR_NUMBER_SIGN)  
     {  
     for (ptr += 3; *ptr != CHAR_NULL; ptr++)  
       if (*ptr == CHAR_RIGHT_PARENTHESIS) break;  
     goto FAIL_EXIT;  
     }  
   
   /* Handle a condition. If it is an assertion, just carry on so that it  
   is processed as normal. If not, skip to the closing parenthesis of the  
   condition (there can't be any nested parens). */  
   
   else if (ptr[2] == CHAR_LEFT_PARENTHESIS)  
     {  
     ptr += 2;  
     if (ptr[1] != CHAR_QUESTION_MARK)  
       {  
       while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;  
       if (*ptr != CHAR_NULL) ptr++;  
       }  
     }  
   
   /* Start with (? but not a condition. */  
   
   else  
     {  
     ptr += 2;  
     if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */  
   
     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */  
   
     if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&  
         ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)  
       {  
       pcre_uchar term;  
       const pcre_uchar *thisname;  
       *count += 1;  
       if (name == NULL && *count == lorn) return *count;  
       term = *ptr++;  
       if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;  
       thisname = ptr;  
       while (*ptr != term) ptr++;  
       if (name != NULL && lorn == (int)(ptr - thisname) &&  
           STRNCMP_UC_UC(name, thisname, (unsigned int)lorn) == 0)  
         return *count;  
       term++;  
       }  
     }  
   }  
   
 /* Past any initial parenthesis handling, scan for parentheses or vertical  
 bars. Stop if we get to cd->end_pattern. Note that this is important for the  
 first-pass call when this value is temporarily adjusted to stop at the current  
 position. So DO NOT change this to a test for binary zero. */  
   
 for (; ptr < cd->end_pattern; ptr++)  
   {  
   /* Skip over backslashed characters and also entire \Q...\E */  
   
   if (*ptr == CHAR_BACKSLASH)  
     {  
     if (*(++ptr) == CHAR_NULL) goto FAIL_EXIT;  
     if (*ptr == CHAR_Q) for (;;)  
       {  
       while (*(++ptr) != CHAR_NULL && *ptr != CHAR_BACKSLASH) {};  
       if (*ptr == CHAR_NULL) goto FAIL_EXIT;  
       if (*(++ptr) == CHAR_E) break;  
       }  
     continue;  
     }  
   
   /* Skip over character classes; this logic must be similar to the way they  
   are handled for real. If the first character is '^', skip it. Also, if the  
   first few characters (either before or after ^) are \Q\E or \E we skip them  
   too. This makes for compatibility with Perl. Note the use of STR macros to  
   encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */  
   
   if (*ptr == CHAR_LEFT_SQUARE_BRACKET)  
     {  
     BOOL negate_class = FALSE;  
     for (;;)  
       {  
       if (ptr[1] == CHAR_BACKSLASH)  
         {  
         if (ptr[2] == CHAR_E)  
           ptr+= 2;  
         else if (STRNCMP_UC_C8(ptr + 2,  
                  STR_Q STR_BACKSLASH STR_E, 3) == 0)  
           ptr += 4;  
         else  
           break;  
         }  
       else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)  
         {  
         negate_class = TRUE;  
         ptr++;  
         }  
       else break;  
       }  
   
     /* If the next character is ']', it is a data character that must be  
     skipped, except in JavaScript compatibility mode. */  
   
     if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&  
         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)  
       ptr++;  
   
     while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)  
       {  
       if (*ptr == CHAR_NULL) return -1;  
       if (*ptr == CHAR_BACKSLASH)  
         {  
         if (*(++ptr) == CHAR_NULL) goto FAIL_EXIT;  
         if (*ptr == CHAR_Q) for (;;)  
           {  
           while (*(++ptr) != CHAR_NULL && *ptr != CHAR_BACKSLASH) {};  
           if (*ptr == CHAR_NULL) goto FAIL_EXIT;  
           if (*(++ptr) == CHAR_E) break;  
           }  
         continue;  
         }  
       }  
     continue;  
     }  
   
   /* Skip comments in /x mode */  
   
   if (xmode && *ptr == CHAR_NUMBER_SIGN)  
     {  
     ptr++;  
     while (*ptr != CHAR_NULL)  
       {  
       if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }  
       ptr++;  
 #ifdef SUPPORT_UTF  
       if (utf) FORWARDCHAR(ptr);  
 #endif  
       }  
     if (*ptr == CHAR_NULL) goto FAIL_EXIT;  
     continue;  
     }  
   
   /* Check for the special metacharacters */  
   
   if (*ptr == CHAR_LEFT_PARENTHESIS)  
     {  
     int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, count);  
     if (rc > 0) return rc;  
     if (*ptr == CHAR_NULL) goto FAIL_EXIT;  
     }  
   
   else if (*ptr == CHAR_RIGHT_PARENTHESIS)  
     {  
     if (dup_parens && *count < hwm_count) *count = hwm_count;  
     goto FAIL_EXIT;  
     }  
   
   else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)  
     {  
     if (*count > hwm_count) hwm_count = *count;  
     *count = start_count;  
     }  
   }  
   
 FAIL_EXIT:  
 *ptrptr = ptr;  
 return -1;  
 }  
   
   
   
   
 /*************************************************  
 *       Find forward referenced subpattern       *  
 *************************************************/  
   
 /* This function scans along a pattern's text looking for capturing  
 subpatterns, and counting them. If it finds a named pattern that matches the  
 name it is given, it returns its number. Alternatively, if the name is NULL, it  
 returns when it reaches a given numbered subpattern. This is used for forward  
 references to subpatterns. We used to be able to start this scan from the  
 current compiling point, using the current count value from cd->bracount, and  
 do it all in a single loop, but the addition of the possibility of duplicate  
 subpattern numbers means that we have to scan from the very start, in order to  
 take account of such duplicates, and to use a recursive function to keep track  
 of the different types of group.  
   
 Arguments:  
   cd           compile background data  
   name         name to seek, or NULL if seeking a numbered subpattern  
   lorn         name length, or subpattern number if name is NULL  
   xmode        TRUE if we are in /x mode  
   utf          TRUE if we are in UTF-8 / UTF-16 / UTF-32 mode  
   
 Returns:       the number of the found subpattern, or -1 if not found  
 */  
   
 static int  
 find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode,  
   BOOL utf)  
 {  
 pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern;  
 int count = 0;  
 int rc;  
   
 /* If the pattern does not start with an opening parenthesis, the first call  
 to find_parens_sub() will scan right to the end (if necessary). However, if it  
 does start with a parenthesis, find_parens_sub() will return when it hits the  
 matching closing parens. That is why we have to have a loop. */  
   
 for (;;)  
   {  
   rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, &count);  
   if (rc > 0 || *ptr++ == CHAR_NULL) break;  
   }  
   
 return rc;  
 }  
   
   
   
   
 /*************************************************  
1490  *      Find first significant op code            *  *      Find first significant op code            *
1491  *************************************************/  *************************************************/
1492    
# Line 1696  for (;;) Line 1525  for (;;)
1525    
1526      case OP_CALLOUT:      case OP_CALLOUT:
1527      case OP_CREF:      case OP_CREF:
1528      case OP_NCREF:      case OP_DNCREF:
1529      case OP_RREF:      case OP_RREF:
1530      case OP_NRREF:      case OP_DNRREF:
1531      case OP_DEF:      case OP_DEF:
1532      code += PRIV(OP_lengths)[*code];      code += PRIV(OP_lengths)[*code];
1533      break;      break;
# Line 1712  for (;;) Line 1541  for (;;)
1541    
1542    
1543    
   
1544  /*************************************************  /*************************************************
1545  *        Find the fixed length of a branch       *  *        Find the fixed length of a branch       *
1546  *************************************************/  *************************************************/
# Line 1836  for (;;) Line 1664  for (;;)
1664      case OP_COMMIT:      case OP_COMMIT:
1665      case OP_CREF:      case OP_CREF:
1666      case OP_DEF:      case OP_DEF:
1667        case OP_DNCREF:
1668        case OP_DNRREF:
1669      case OP_DOLL:      case OP_DOLL:
1670      case OP_DOLLM:      case OP_DOLLM:
1671      case OP_EOD:      case OP_EOD:
1672      case OP_EODN:      case OP_EODN:
1673      case OP_FAIL:      case OP_FAIL:
     case OP_NCREF:  
     case OP_NRREF:  
1674      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
1675      case OP_PRUNE:      case OP_PRUNE:
1676      case OP_REVERSE:      case OP_REVERSE:
# Line 2015  for (;;) Line 1843  for (;;)
1843      case OP_QUERYI:      case OP_QUERYI:
1844      case OP_REF:      case OP_REF:
1845      case OP_REFI:      case OP_REFI:
1846        case OP_DNREF:
1847        case OP_DNREFI:
1848      case OP_SBRA:      case OP_SBRA:
1849      case OP_SBRAPOS:      case OP_SBRAPOS:
1850      case OP_SCBRA:      case OP_SCBRA:
# Line 2051  for (;;) Line 1881  for (;;)
1881    
1882    
1883    
   
1884  /*************************************************  /*************************************************
1885  *    Scan compiled regex for specific bracket    *  *    Scan compiled regex for specific bracket    *
1886  *************************************************/  *************************************************/
# Line 2361  Returns:      TRUE if what is matched co Line 2190  Returns:      TRUE if what is matched co
2190  typedef struct recurse_check {  typedef struct recurse_check {
2191    struct recurse_check *prev;    struct recurse_check *prev;
2192    const pcre_uchar *group;    const pcre_uchar *group;
2193  } recurse_check;  } recurse_check;
2194    
2195  static BOOL  static BOOL
2196  could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,  could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
# Line 2377  for (code = first_significant_code(code Line 2206  for (code = first_significant_code(code
2206    const pcre_uchar *ccode;    const pcre_uchar *ccode;
2207    
2208    c = *code;    c = *code;
2209    
2210    /* Skip over forward assertions; the other assertions are skipped by    /* Skip over forward assertions; the other assertions are skipped by
2211    first_significant_code() with a TRUE final argument. */    first_significant_code() with a TRUE final argument. */
2212    
# Line 2405  for (code = first_significant_code(code Line 2234  for (code = first_significant_code(code
2234      NULL. */      NULL. */
2235    
2236      if (cd->start_workspace != NULL)      if (cd->start_workspace != NULL)
2237        {        {
2238        const pcre_uchar *tcode;        const pcre_uchar *tcode;
2239        for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)        for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2240          if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;          if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2241        if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */        if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2242        }        }
2243    
2244      /* If we are scanning a completed pattern, there are no forward references      /* If we are scanning a completed pattern, there are no forward references
2245      and all groups are complete. We need to detect whether this is a recursive      and all groups are complete. We need to detect whether this is a recursive
2246      call, as otherwise there will be an infinite loop. If it is a recursion,      call, as otherwise there will be an infinite loop. If it is a recursion,
2247      just skip over it. Simple recursions are easily detected. For mutual      just skip over it. Simple recursions are easily detected. For mutual
2248      recursions we keep a chain on the stack. */      recursions we keep a chain on the stack. */
2249    
2250      else      else
2251        {        {
2252        recurse_check *r = recurses;        recurse_check *r = recurses;
2253        const pcre_uchar *endgroup = scode;        const pcre_uchar *endgroup = scode;
2254    
2255        do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);        do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2256        if (code >= scode && code <= endgroup) continue;  /* Simple recursion */        if (code >= scode && code <= endgroup) continue;  /* Simple recursion */
2257    
2258        for (r = recurses; r != NULL; r = r->prev)        for (r = recurses; r != NULL; r = r->prev)
2259          if (r->group == scode) break;          if (r->group == scode) break;
2260        if (r != NULL) continue;   /* Mutual recursion */        if (r != NULL) continue;   /* Mutual recursion */
# Line 2436  for (code = first_significant_code(code Line 2265  for (code = first_significant_code(code
2265    
2266      empty_branch = FALSE;      empty_branch = FALSE;
2267      this_recurse.prev = recurses;      this_recurse.prev = recurses;
2268      this_recurse.group = scode;      this_recurse.group = scode;
2269    
2270      do      do
2271        {        {
2272        if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))        if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
# Line 2557  for (code = first_significant_code(code Line 2386  for (code = first_significant_code(code
2386      case OP_ANY:      case OP_ANY:
2387      case OP_ALLANY:      case OP_ALLANY:
2388      case OP_ANYBYTE:      case OP_ANYBYTE:
2389    
2390      case OP_PROP:      case OP_PROP:
2391      case OP_NOTPROP:      case OP_NOTPROP:
2392      case OP_ANYNL:      case OP_ANYNL:
2393    
2394      case OP_NOT_HSPACE:      case OP_NOT_HSPACE:
2395      case OP_HSPACE:      case OP_HSPACE:
2396      case OP_NOT_VSPACE:      case OP_NOT_VSPACE:
2397      case OP_VSPACE:      case OP_VSPACE:
2398      case OP_EXTUNI:      case OP_EXTUNI:
2399    
2400      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
2401      case OP_DIGIT:      case OP_DIGIT:
2402      case OP_NOT_WHITESPACE:      case OP_NOT_WHITESPACE:
2403      case OP_WHITESPACE:      case OP_WHITESPACE:
2404      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
2405      case OP_WORDCHAR:      case OP_WORDCHAR:
2406    
2407      case OP_CHAR:      case OP_CHAR:
2408      case OP_CHARI:      case OP_CHARI:
2409      case OP_NOT:      case OP_NOT:
2410      case OP_NOTI:      case OP_NOTI:
2411    
2412      case OP_PLUS:      case OP_PLUS:
2413      case OP_PLUSI:      case OP_PLUSI:
2414      case OP_MINPLUS:      case OP_MINPLUS:
# Line 2589  for (code = first_significant_code(code Line 2418  for (code = first_significant_code(code
2418      case OP_NOTPLUSI:      case OP_NOTPLUSI:
2419      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
2420      case OP_NOTMINPLUSI:      case OP_NOTMINPLUSI:
2421    
2422      case OP_POSPLUS:      case OP_POSPLUS:
2423      case OP_POSPLUSI:      case OP_POSPLUSI:
2424      case OP_NOTPOSPLUS:      case OP_NOTPOSPLUS:
2425      case OP_NOTPOSPLUSI:      case OP_NOTPOSPLUSI:
2426    
2427      case OP_EXACT:      case OP_EXACT:
2428      case OP_EXACTI:      case OP_EXACTI:
2429      case OP_NOTEXACT:      case OP_NOTEXACT:
2430      case OP_NOTEXACTI:      case OP_NOTEXACTI:
2431    
2432      case OP_TYPEPLUS:      case OP_TYPEPLUS:
2433      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
2434      case OP_TYPEPOSPLUS:      case OP_TYPEPOSPLUS:
2435      case OP_TYPEEXACT:      case OP_TYPEEXACT:
2436    
2437      return FALSE;      return FALSE;
2438    
2439      /* These are going to continue, as they may be empty, but we have to      /* These are going to continue, as they may be empty, but we have to
# Line 2644  for (code = first_significant_code(code Line 2473  for (code = first_significant_code(code
2473  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2474      case OP_STAR:      case OP_STAR:
2475      case OP_STARI:      case OP_STARI:
2476      case OP_NOTSTAR:      case OP_NOTSTAR:
2477      case OP_NOTSTARI:      case OP_NOTSTARI:
2478    
2479      case OP_MINSTAR:      case OP_MINSTAR:
2480      case OP_MINSTARI:      case OP_MINSTARI:
2481      case OP_NOTMINSTAR:      case OP_NOTMINSTAR:
2482      case OP_NOTMINSTARI:      case OP_NOTMINSTARI:
2483    
2484      case OP_POSSTAR:      case OP_POSSTAR:
2485      case OP_POSSTARI:      case OP_POSSTARI:
2486      case OP_NOTPOSSTAR:      case OP_NOTPOSSTAR:
2487      case OP_NOTPOSSTARI:      case OP_NOTPOSSTARI:
2488    
2489      case OP_QUERY:      case OP_QUERY:
2490      case OP_QUERYI:      case OP_QUERYI:
2491      case OP_NOTQUERY:      case OP_NOTQUERY:
2492      case OP_NOTQUERYI:      case OP_NOTQUERYI:
2493    
2494      case OP_MINQUERY:      case OP_MINQUERY:
2495      case OP_MINQUERYI:      case OP_MINQUERYI:
2496      case OP_NOTMINQUERY:      case OP_NOTMINQUERY:
2497      case OP_NOTMINQUERYI:      case OP_NOTMINQUERYI:
2498    
2499      case OP_POSQUERY:      case OP_POSQUERY:
2500      case OP_POSQUERYI:      case OP_POSQUERYI:
2501      case OP_NOTPOSQUERY:      case OP_NOTPOSQUERY:
2502      case OP_NOTPOSQUERYI:      case OP_NOTPOSQUERYI:
2503    
2504      if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);      if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2505      break;      break;
2506    
2507      case OP_UPTO:      case OP_UPTO:
2508      case OP_UPTOI:      case OP_UPTOI:
2509      case OP_NOTUPTO:      case OP_NOTUPTO:
2510      case OP_NOTUPTOI:      case OP_NOTUPTOI:
2511    
2512      case OP_MINUPTO:      case OP_MINUPTO:
2513      case OP_MINUPTOI:      case OP_MINUPTOI:
2514      case OP_NOTMINUPTO:      case OP_NOTMINUPTO:
2515      case OP_NOTMINUPTOI:      case OP_NOTMINUPTOI:
2516    
2517      case OP_POSUPTO:      case OP_POSUPTO:
2518      case OP_POSUPTOI:      case OP_POSUPTOI:
2519      case OP_NOTPOSUPTO:      case OP_NOTPOSUPTO:
2520      case OP_NOTPOSUPTOI:      case OP_NOTPOSUPTOI:
2521    
2522      if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);      if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2523      break;      break;
2524  #endif  #endif
# Line 2753  return TRUE; Line 2582  return TRUE;
2582    
2583    
2584  /*************************************************  /*************************************************
2585  *           Check for POSIX class syntax         *  *        Base opcode of repeated opcodes         *
2586  *************************************************/  *************************************************/
2587    
2588  /* This function is called when the sequence "[:" or "[." or "[=" is  /* Returns the base opcode for repeated single character type opcodes. If the
2589  encountered in a character class. It checks whether this is followed by a  opcode is not a repeated character type, it returns with the original value.
 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we  
 reach an unescaped ']' without the special preceding character, return FALSE.  
2590    
2591  Originally, this function only recognized a sequence of letters between the  Arguments:  c opcode
2592  terminators, but it seems that Perl recognizes any sequence of characters,  Returns:    base opcode for the type
2593  though of course unknown POSIX names are subsequently rejected. Perl gives an  */
 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE  
 didn't consider this to be a POSIX class. Likewise for [:1234:].  
2594    
2595  The problem in trying to be exactly like Perl is in the handling of escapes. We  static pcre_uchar
2596  have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX  get_repeat_base(pcre_uchar c)
2597  class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code  {
2598  below handles the special case of \], but does not try to do any other escape  return (c > OP_TYPEPOSUPTO)? c :
2599  processing. This makes it different from Perl for cases such as [:l\ower:]         (c >= OP_TYPESTAR)?   OP_TYPESTAR :
2600  where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize         (c >= OP_NOTSTARI)?   OP_NOTSTARI :
2601  "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,         (c >= OP_NOTSTAR)?    OP_NOTSTAR :
2602  I think.         (c >= OP_STARI)?      OP_STARI :
2603                                 OP_STAR;
2604    }
2605    
 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.  
 It seems that the appearance of a nested POSIX class supersedes an apparent  
 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or  
 a digit.  
2606    
 In Perl, unescaped square brackets may also appear as part of class names. For  
 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for  
 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not  
 seem right at all. PCRE does not allow closing square brackets in POSIX class  
 names.  
2607    
2608  Arguments:  #ifdef SUPPORT_UCP
2609    ptr      pointer to the initial [  /*************************************************
2610    endptr   where to return the end pointer  *        Check a character and a property        *
2611    *************************************************/
2612    
2613  Returns:   TRUE or FALSE  /* This function is called by check_auto_possessive() when a property item
2614  */  is adjacent to a fixed character.
2615    
2616    Arguments:
2617      c            the character
2618      ptype        the property type
2619      pdata        the data for the type
2620      negated      TRUE if it's a negated property (\P or \p{^)
2621    
2622    Returns:       TRUE if auto-possessifying is OK
2623    */
2624    
2625  static BOOL  static BOOL
2626  check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)  check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
2627      BOOL negated)
2628  {  {
2629  pcre_uchar terminator;          /* Don't combine these lines; the Solaris cc */  const pcre_uint32 *p;
2630  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  const ucd_record *prop = GET_UCD(c);
2631  for (++ptr; *ptr != CHAR_NULL; ptr++)  
2632    switch(ptype)
2633    {    {
2634    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)    case PT_LAMP:
2635      ptr++;    return (prop->chartype == ucp_Lu ||
2636    else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;            prop->chartype == ucp_Ll ||
2637    else            prop->chartype == ucp_Lt) == negated;
2638    
2639      case PT_GC:
2640      return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2641    
2642      case PT_PC:
2643      return (pdata == prop->chartype) == negated;
2644    
2645      case PT_SC:
2646      return (pdata == prop->script) == negated;
2647    
2648      /* These are specials */
2649    
2650      case PT_ALNUM:
2651      return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2652              PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2653    
2654      /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
2655      means that Perl space and POSIX space are now identical. PCRE was changed
2656      at release 8.34. */
2657    
2658      case PT_SPACE:    /* Perl space */
2659      case PT_PXSPACE:  /* POSIX space */
2660      return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2661              c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2662              c == CHAR_FF || c == CHAR_CR)
2663              == negated;
2664    
2665      case PT_WORD:
2666      return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2667              PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2668              c == CHAR_UNDERSCORE) == negated;
2669    
2670      case PT_CLIST:
2671      p = PRIV(ucd_caseless_sets) + prop->caseset;
2672      for (;;)
2673      {      {
2674      if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)      if (c < *p) return !negated;
2675        {      if (c == *p++) return negated;
       *endptr = ptr;  
       return TRUE;  
       }  
     if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&  
          (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||  
           ptr[1] == CHAR_EQUALS_SIGN) &&  
         check_posix_syntax(ptr, endptr))  
       return FALSE;  
2676      }      }
2677      break;  /* Control never reaches here */
2678    }    }
2679    
2680  return FALSE;  return FALSE;
2681  }  }
2682    #endif  /* SUPPORT_UCP */
2683    
2684    
2685    
2686  /*************************************************  /*************************************************
2687  *          Check POSIX class name                *  *        Fill the character property list        *
2688  *************************************************/  *************************************************/
2689    
2690  /* This function is called to check the name given in a POSIX-style class entry  /* Checks whether the code points to an opcode that can take part in auto-
2691  such as [:alnum:].  possessification, and if so, fills a list with its properties.
2692    
2693  Arguments:  Arguments:
2694    ptr        points to the first letter    code        points to start of expression
2695    len        the length of the name    utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2696      fcc         points to case-flipping table
2697      list        points to output list
2698                  list[0] will be filled with the opcode
2699                  list[1] will be non-zero if this opcode
2700                    can match an empty character string
2701                  list[2..7] depends on the opcode
2702    
2703  Returns:     a value representing the name, or -1 if unknown  Returns:      points to the start of the next opcode if *code is accepted
2704                  NULL if *code is not accepted
2705  */  */
2706    
2707  static int  static const pcre_uchar *
2708  check_posix_name(const pcre_uchar *ptr, int len)  get_chr_property_list(const pcre_uchar *code, BOOL utf,
2709      const pcre_uint8 *fcc, pcre_uint32 *list)
2710  {  {
2711  const char *pn = posix_names;  pcre_uchar c = *code;
2712  register int yield = 0;  const pcre_uchar *end;
2713  while (posix_name_lengths[yield] != 0)  const pcre_uint32 *clist_src;
2714    pcre_uint32 *clist_dest;
2715    pcre_uint32 chr;
2716    pcre_uchar base;
2717    
2718    list[0] = c;
2719    list[1] = FALSE;
2720    code++;
2721    
2722    if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
2723    {    {
2724    if (len == posix_name_lengths[yield] &&    base = get_repeat_base(c);
2725      STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;    c -= (base - OP_STAR);
   pn += posix_name_lengths[yield] + 1;  
   yield++;  
   }  
 return -1;  
 }  
2726    
2727      if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
2728        code += IMM2_SIZE;
2729    
2730  /*************************************************    list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
 *    Adjust OP_RECURSE items in repeated group   *  
 *************************************************/  
2731    
2732  /* OP_RECURSE items contain an offset from the start of the regex to the group    switch(base)
2733  that is referenced. This means that groups can be replicated for fixed      {
2734  repetition simply by copying (because the recursion is allowed to refer to      case OP_STAR:
2735  earlier groups that are outside the current group). However, when a group is      list[0] = OP_CHAR;
2736  optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is      break;
 inserted before it, after it has been compiled. This means that any OP_RECURSE  
 items within it that refer to the group itself or any contained groups have to  
 have their offsets adjusted. That one of the jobs of this function. Before it  
 is called, the partially compiled regex must be temporarily terminated with  
 OP_END.  
2737    
2738  This function has been extended with the possibility of forward references for      case OP_STARI:
2739  recursions and subroutine calls. It must also check the list of such references      list[0] = OP_CHARI;
2740  for the group we are dealing with. If it finds that one of the recursions in      break;
 the current group is on this list, it adjusts the offset in the list, not the  
 value in the reference (which is a group number).  
2741    
2742  Arguments:      case OP_NOTSTAR:
2743    group      points to the start of the group      list[0] = OP_NOT;
2744    adjust     the amount by which the group is to be moved      break;
   utf        TRUE in UTF-8 / UTF-16 / UTF-32 mode  
   cd         contains pointers to tables etc.  
   save_hwm   the hwm forward reference pointer at the start of the group  
2745    
2746  Returns:     nothing      case OP_NOTSTARI:
2747  */      list[0] = OP_NOTI;
2748        break;
2749    
2750  static void      case OP_TYPESTAR:
2751  adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,      list[0] = *code;
2752    pcre_uchar *save_hwm)      code++;
2753  {      break;
2754  pcre_uchar *ptr = group;      }
2755      c = list[0];
2756      }
2757    
2758  while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)  switch(c)
2759    {    {
2760    int offset;    case OP_NOT_DIGIT:
2761    pcre_uchar *hc;    case OP_DIGIT:
2762      case OP_NOT_WHITESPACE:
2763      case OP_WHITESPACE:
2764      case OP_NOT_WORDCHAR:
2765      case OP_WORDCHAR:
2766      case OP_ANY:
2767      case OP_ALLANY:
2768      case OP_ANYNL:
2769      case OP_NOT_HSPACE:
2770      case OP_HSPACE:
2771      case OP_NOT_VSPACE:
2772      case OP_VSPACE:
2773      case OP_EXTUNI:
2774      case OP_EODN:
2775      case OP_EOD:
2776      case OP_DOLL:
2777      case OP_DOLLM:
2778      return code;
2779    
2780    /* See if this recursion is on the forward reference list. If so, adjust the    case OP_CHAR:
2781    reference. */    case OP_NOT:
2782      GETCHARINCTEST(chr, code);
2783      list[2] = chr;
2784      list[3] = NOTACHAR;
2785      return code;
2786    
2787    for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)    case OP_CHARI:
2788      {    case OP_NOTI:
2789      offset = (int)GET(hc, 0);    list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
2790      if (cd->start_code + offset == ptr + 1)    GETCHARINCTEST(chr, code);
2791        {    list[2] = chr;
       PUT(hc, 0, offset + adjust);  
       break;  
       }  
     }  
2792    
2793    /* Otherwise, adjust the recursion offset if it's after the start of this  #ifdef SUPPORT_UCP
2794    group. */    if (chr < 128 || (chr < 256 && !utf))
2795        list[3] = fcc[chr];
2796      else
2797        list[3] = UCD_OTHERCASE(chr);
2798    #elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
2799      list[3] = (chr < 256) ? fcc[chr] : chr;
2800    #else
2801      list[3] = fcc[chr];
2802    #endif
2803    
2804    if (hc >= cd->hwm)    /* The othercase might be the same value. */
2805    
2806      if (chr == list[3])
2807        list[3] = NOTACHAR;
2808      else
2809        list[4] = NOTACHAR;
2810      return code;
2811    
2812    #ifdef SUPPORT_UCP
2813      case OP_PROP:
2814      case OP_NOTPROP:
2815      if (code[0] != PT_CLIST)
2816      {      {
2817      offset = (int)GET(ptr, 1);      list[2] = code[0];
2818      if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);      list[3] = code[1];
2819        return code + 2;
2820      }      }
2821    
2822    ptr += 1 + LINK_SIZE;    /* Convert only if we have enough space. */
   }  
 }  
2823    
2824      clist_src = PRIV(ucd_caseless_sets) + code[1];
2825      clist_dest = list + 2;
2826      code += 2;
2827    
2828      do {
2829         /* Early return if there is not enough space. */
2830         if (clist_dest >= list + 8)
2831           {
2832           list[2] = code[0];
2833           list[3] = code[1];
2834           return code;
2835           }
2836         *clist_dest++ = *clist_src;
2837         }
2838       while(*clist_src++ != NOTACHAR);
2839    
2840  /*************************************************    /* Enough space to store all characters. */
 *        Insert an automatic callout point       *  
 *************************************************/  
2841    
2842  /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert    list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
2843  callout points before each pattern item.    return code;
2844    #endif
2845    
2846  Arguments:    case OP_NCLASS:
2847    code           current code pointer    case OP_CLASS:
2848    ptr            current pattern pointer  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2849    cd             pointers to tables etc    case OP_XCLASS:
2850    
2851  Returns:         new code pointer    if (c == OP_XCLASS)
2852  */      end = code + GET(code, 0);
2853      else
2854    #endif
2855        end = code + 32 / sizeof(pcre_uchar);
2856    
2857  static pcre_uchar *    switch(*end)
2858  auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)      {
2859  {      case OP_CRSTAR:
2860  *code++ = OP_CALLOUT;      case OP_CRMINSTAR:
2861  *code++ = 255;      case OP_CRQUERY:
2862  PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */      case OP_CRMINQUERY:
2863  PUT(code, LINK_SIZE, 0);                       /* Default length */      list[1] = TRUE;
2864  return code + 2 * LINK_SIZE;      end++;
2865        break;
2866    
2867        case OP_CRRANGE:
2868        case OP_CRMINRANGE:
2869        list[1] = (GET2(end, 1) == 0);
2870        end += 1 + 2 * IMM2_SIZE;
2871        break;
2872        }
2873      list[2] = end - code;
2874      return end;
2875      }
2876    return NULL;    /* Opcode not accepted */
2877  }  }
2878    
2879    
2880    
2881  /*************************************************  /*************************************************
2882  *         Complete a callout item                *  *    Scan further character sets for match       *
2883  *************************************************/  *************************************************/
2884    
2885  /* A callout item contains the length of the next item in the pattern, which  /* Checks whether the base and the current opcode have a common character, in
2886  we can't fill in till after we have reached the relevant point. This is used  which case the base cannot be possessified.
 for both automatic and manual callouts.  
2887    
2888  Arguments:  Arguments:
2889    previous_callout   points to previous callout item    code        points to the byte code
2890    ptr                current pattern pointer    utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
2891    cd                 pointers to tables etc    cd          static compile data
2892      base_list   the data list of the base opcode
2893    
2894  Returns:             nothing  Returns:      TRUE if the auto-possessification is possible
2895  */  */
2896    
2897  static void  static BOOL
2898  complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)  compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
2899      const pcre_uint32* base_list)
2900  {  {
2901  int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));  pcre_uchar c;
2902  PUT(previous_callout, 2 + LINK_SIZE, length);  pcre_uint32 list[8];
2903  }  const pcre_uint32* chr_ptr;
2904    const pcre_uint32* ochr_ptr;
2905    const pcre_uint32* list_ptr;
2906    pcre_uint32 chr;
2907    
2908    for(;;)
2909      {
2910      c = *code;
2911    
2912      /* Skip over callouts */
2913    
2914  #ifdef SUPPORT_UCP    if (c == OP_CALLOUT)
2915  /*************************************************      {
2916  *           Get othercase range                  *      code += PRIV(OP_lengths)[c];
2917  *************************************************/      continue;
2918        }
2919    
2920  /* This function is passed the start and end of a class range, in UTF-8 mode    if (c == OP_ALT)
2921  with UCP support. It searches up the characters, looking for ranges of      {
2922  characters in the "other" case. Each call returns the next one, updating the      do code += GET(code, 1); while (*code == OP_ALT);
2923  start address. A character with multiple other cases is returned on its own      c = *code;
2924  with a special return value.      }
2925    
2926  Arguments:    switch(c)
2927    cptr        points to starting character value; updated      {
2928    d           end value      case OP_END:
2929    ocptr       where to put start of othercase range      /* TRUE only in greedy case. The non-greedy case could be replaced by an
2930    odptr       where to put end of othercase range      OP_EXACT, but it is probably not worth it. (And note that OP_EXACT uses
2931        more memory, which we cannot get at this stage.) */
2932    
2933  Yield:        -1 when no more      return base_list[1] != 0;
                0 when a range is returned  
               >0 the CASESET offset for char with multiple other cases  
                 in this case, ocptr contains the original  
 */  
2934    
2935  static int      case OP_KET:
2936  get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,      /* If the bracket is capturing, and referenced by an OP_RECURSE, the
2937    pcre_uint32 *odptr)      non-greedy case cannot be converted to a possessive form. We do not test
2938  {      the bracket type at the moment, but we might do it in the future to improve
2939  pcre_uint32 c, othercase, next;      this condition. (But note that recursive calls are always atomic.) */
 unsigned int co;  
2940    
2941  /* Find the first character that has an other case. If it has multiple other      if (base_list[1] == 0) return FALSE;
2942  cases, return its case offset value. */      code += PRIV(OP_lengths)[c];
2943        continue;
2944        }
2945    
2946  for (c = *cptr; c <= d; c++)    /* Check for a supported opcode, and load its properties. */
2947    {  
2948    if ((co = UCD_CASESET(c)) != 0)    code = get_chr_property_list(code, utf, cd->fcc, list);
2949      if (code == NULL) return FALSE;    /* Unsupported */
2950    
2951      /* If either opcode is a small character list, set pointers for comparing
2952      characters from that list with another list, or with a property. */
2953    
2954      if (base_list[0] == OP_CHAR)
2955      {      {
2956      *ocptr = c++;   /* Character that has the set */      chr_ptr = base_list + 2;
2957      *cptr = c;      /* Rest of input range */      list_ptr = list;
2958      return (int)co;      }
2959      else if (list[0] == OP_CHAR)
2960        {
2961        chr_ptr = list + 2;
2962        list_ptr = base_list;
2963      }      }
   if ((othercase = UCD_OTHERCASE(c)) != c) break;  
   }  
2964    
2965  if (c > d) return -1;  /* Reached end of range */    /* Some property combinations are also acceptable. Unicode property opcodes are
2966      processed specially; the rest can be handled with a lookup table. */
2967    
2968  *ocptr = othercase;    else
2969  next = othercase + 1;      {
2970        pcre_uint32 leftop, rightop;
2971    
2972  for (++c; c <= d; c++)      if (list[1] != 0) return FALSE;   /* Must match at least one character */
2973    {      leftop = base_list[0];
2974    if (UCD_OTHERCASE(c) != next) break;      rightop = list[0];
   next++;  
   }  
2975    
2976  *odptr = next - 1;     /* End of othercase range */  #ifdef SUPPORT_UCP
2977  *cptr = c;             /* Rest of input range */      if (leftop == OP_PROP || leftop == OP_NOTPROP)
2978  return 0;        {
2979  }        if (rightop == OP_EOD) return TRUE;
2980          if (rightop == OP_PROP || rightop == OP_NOTPROP)
2981            {
2982            int n;
2983            const pcre_uint8 *p;
2984            BOOL same = leftop == rightop;
2985            BOOL lisprop = leftop == OP_PROP;
2986            BOOL risprop = rightop == OP_PROP;
2987            BOOL bothprop = lisprop && risprop;
2988    
2989            /* There's a table that specifies how each combination is to be
2990            processed:
2991              0   Always return FALSE (never auto-possessify)
2992              1   Character groups are distinct (possessify if both are OP_PROP)
2993              2   Check character categories in the same group (general or particular)
2994              3   Return TRUE if the two opcodes are not the same
2995              ... see comments below
2996            */
2997    
2998            n = propposstab[base_list[2]][list[2]];
2999            switch(n)
3000              {
3001              case 0: return FALSE;
3002              case 1: return bothprop;
3003              case 2: return (base_list[3] == list[3]) != same;
3004              case 3: return !same;
3005    
3006              case 4:  /* Left general category, right particular category */
3007              return risprop && catposstab[base_list[3]][list[3]] == same;
3008    
3009              case 5:  /* Right general category, left particular category */
3010              return lisprop && catposstab[list[3]][base_list[3]] == same;
3011    
3012              /* This code is logically tricky. Think hard before fiddling with it.
3013              The posspropstab table has four entries per row. Each row relates to
3014              one of PCRE's special properties such as ALNUM or SPACE or WORD.
3015              Only WORD actually needs all four entries, but using repeats for the
3016              others means they can all use the same code below.
3017    
3018              The first two entries in each row are Unicode general categories, and
3019              apply always, because all the characters they include are part of the
3020              PCRE character set. The third and fourth entries are a general and a
3021              particular category, respectively, that include one or more relevant
3022              characters. One or the other is used, depending on whether the check
3023              is for a general or a particular category. However, in both cases the
3024              category contains more characters than the specials that are defined
3025              for the property being tested against. Therefore, it cannot be used
3026              in a NOTPROP case.
3027    
3028              Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3029              Underscore is covered by ucp_P or ucp_Po. */
3030    
3031              case 6:  /* Left alphanum vs right general category */
3032              case 7:  /* Left space vs right general category */
3033              case 8:  /* Left word vs right general category */
3034              p = posspropstab[n-6];
3035              return risprop && lisprop ==
3036                (list[3] != p[0] &&
3037                 list[3] != p[1] &&
3038                (list[3] != p[2] || !lisprop));
3039    
3040              case 9:   /* Right alphanum vs left general category */
3041              case 10:  /* Right space vs left general category */
3042              case 11:  /* Right word vs left general category */
3043              p = posspropstab[n-9];
3044              return lisprop && risprop ==
3045                (base_list[3] != p[0] &&
3046                 base_list[3] != p[1] &&
3047                (base_list[3] != p[2] || !risprop));
3048    
3049              case 12:  /* Left alphanum vs right particular category */
3050              case 13:  /* Left space vs right particular category */
3051              case 14:  /* Left word vs right particular category */
3052              p = posspropstab[n-12];
3053              return risprop && lisprop ==
3054                (catposstab[p[0]][list[3]] &&
3055                 catposstab[p[1]][list[3]] &&
3056                (list[3] != p[3] || !lisprop));
3057    
3058              case 15:  /* Right alphanum vs left particular category */
3059              case 16:  /* Right space vs left particular category */
3060              case 17:  /* Right word vs left particular category */
3061              p = posspropstab[n-15];
3062              return lisprop && risprop ==
3063                (catposstab[p[0]][base_list[3]] &&
3064                 catposstab[p[1]][base_list[3]] &&
3065                (base_list[3] != p[3] || !risprop));
3066              }
3067            }
3068          return FALSE;
3069          }
3070    
3071        else
3072    #endif  /* SUPPORT_UCP */
3073    
3074        return leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3075               rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3076               autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3077        }
3078    
3079  /*************************************************    /* Control reaches here only if one of the items is a small character list.
3080  *        Check a character and a property        *    All characters are checked against the other side. */
 *************************************************/  
3081    
3082  /* This function is called by check_auto_possessive() when a property item    do
3083  is adjacent to a fixed character.      {
3084        chr = *chr_ptr;
3085    
3086  Arguments:      switch(list_ptr[0])
3087    c            the character        {
3088    ptype        the property type        case OP_CHAR:
3089    pdata        the data for the type        ochr_ptr = list_ptr + 2;
3090    negated      TRUE if it's a negated property (\P or \p{^)        do
3091            {
3092            if (chr == *ochr_ptr) return FALSE;
3093            ochr_ptr++;
3094            }
3095          while(*ochr_ptr != NOTACHAR);
3096          break;
3097    
3098  Returns:       TRUE if auto-possessifying is OK        case OP_NOT:
3099  */        ochr_ptr = list_ptr + 2;
3100          do
3101            {
3102            if (chr == *ochr_ptr)
3103              break;
3104            ochr_ptr++;
3105            }
3106          while(*ochr_ptr != NOTACHAR);
3107          if (*ochr_ptr == NOTACHAR) return FALSE;   /* Not found */
3108          break;
3109    
3110  static BOOL        /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
3111  check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata, BOOL negated)        set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
 {  
 #ifdef SUPPORT_UCP  
 const pcre_uint32 *p;  
 #endif  
3112    
3113  const ucd_record *prop = GET_UCD(c);        case OP_DIGIT:
3114          if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
3115          break;
3116    
3117  switch(ptype)        case OP_NOT_DIGIT:
3118    {        if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
3119    case PT_LAMP:        break;
   return (prop->chartype == ucp_Lu ||  
           prop->chartype == ucp_Ll ||  
           prop->chartype == ucp_Lt) == negated;  
3120    
3121    case PT_GC:        case OP_WHITESPACE:
3122    return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;        if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
3123          break;
3124    
3125    case PT_PC:        case OP_NOT_WHITESPACE:
3126    return (pdata == prop->chartype) == negated;        if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
3127          break;
3128    
3129    case PT_SC:        case OP_WORDCHAR:
3130    return (pdata == prop->script) == negated;        if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
3131          break;
3132    
3133    /* These are specials */        case OP_NOT_WORDCHAR:
3134          if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
3135          break;
3136    
3137    case PT_ALNUM:        case OP_HSPACE:
3138    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||        switch(chr)
3139            PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;          {
3140            HSPACE_CASES: return FALSE;
3141            default: break;
3142            }
3143          break;
3144    
3145    case PT_SPACE:    /* Perl space */        case OP_NOT_HSPACE:
3146    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||        switch(chr)
3147            c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)          {
3148            == negated;          HSPACE_CASES: break;
3149            default: return FALSE;
3150            }
3151          break;
3152    
3153    case PT_PXSPACE:  /* POSIX space */        case OP_ANYNL:
3154    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||        case OP_VSPACE:
3155            c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||        switch(chr)
3156            c == CHAR_FF || c == CHAR_CR)          {
3157            == negated;          VSPACE_CASES: return FALSE;
3158            default: break;
3159            }
3160          break;
3161    
3162    case PT_WORD:        case OP_NOT_VSPACE:
3163    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||        switch(chr)
3164            PRIV(ucp_gentype)[prop->chartype] == ucp_N ||          {
3165            c == CHAR_UNDERSCORE) == negated;          VSPACE_CASES: break;
3166            default: return FALSE;
3167            }
3168          break;
3169    
3170  #ifdef SUPPORT_UCP        case OP_DOLL:
3171    case PT_CLIST:        case OP_EODN:
3172    p = PRIV(ucd_caseless_sets) + prop->caseset;        switch (chr)
3173    for (;;)          {
3174      {          case CHAR_CR:
3175      if (c < *p) return !negated;          case CHAR_LF:
3176      if (c == *p++) return negated;          case CHAR_VT:
3177      }          case CHAR_FF:
3178    break;  /* Control never reaches here */          case CHAR_NEL:
3179    #ifndef EBCDIC
3180            case 0x2028:
3181            case 0x2029:
3182    #endif  /* Not EBCDIC */
3183            return FALSE;
3184            }
3185          break;
3186    
3187          case OP_EOD:    /* Can always possessify before \z */
3188          break;
3189    
3190          case OP_PROP:
3191          case OP_NOTPROP:
3192          if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3193                list_ptr[0] == OP_NOTPROP))
3194            return FALSE;
3195          break;
3196    
3197          /* The class comparisons work only when the class is the second item
3198          of the pair, because there are at present no possessive forms of the
3199          class opcodes. Note also that the "code" variable that is used below
3200          points after the second item, and that the pointer for the first item
3201          is not available, so even if there were possessive forms of the class
3202          opcodes, the correct comparison could not be done. */
3203    
3204          case OP_NCLASS:
3205          if (chr > 255) return FALSE;
3206          /* Fall through */
3207    
3208          case OP_CLASS:
3209          if (list_ptr != list) return FALSE;   /* Class is first opcode */
3210          if (chr > 255) break;
3211          if ((((pcre_uint8 *)(code - list_ptr[2] + 1))[chr >> 3] & (1 << (chr & 7))) != 0)
3212            return FALSE;
3213          break;
3214    
3215    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3216          case OP_XCLASS:
3217          if (list_ptr != list) return FALSE;   /* Class is first opcode */
3218          if (PRIV(xclass)(chr, code - list_ptr[2] + 1 + LINK_SIZE, utf))
3219            return FALSE;
3220          break;
3221  #endif  #endif
3222    
3223          default:
3224          return FALSE;
3225          }
3226    
3227        chr_ptr++;
3228        }
3229      while(*chr_ptr != NOTACHAR);
3230    
3231      /* At least one character must be matched from this opcode. */
3232    
3233      if (list[1] == 0) return TRUE;
3234    }    }
3235    
3236  return FALSE;  return FALSE;
3237  }  }
 #endif  /* SUPPORT_UCP */  
3238    
3239    
3240    
3241  /*************************************************  /*************************************************
3242  *     Check if auto-possessifying is possible    *  *    Scan compiled regex for auto-possession     *
3243  *************************************************/  *************************************************/
3244    
3245  /* This function is called for unlimited repeats of certain items, to see  /* Replaces single character iterations with their possessive alternatives
3246  whether the next thing could possibly match the repeated item. If not, it makes  if appropriate. This function modifies the compiled opcode!
 sense to automatically possessify the repeated item.  
3247    
3248  Arguments:  Arguments:
3249    previous      pointer to the repeated opcode    code        points to start of the byte code
3250    utf           TRUE in UTF-8 / UTF-16 / UTF-32 mode    utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
3251    ptr           next character in pattern    cd          static compile data
   options       options bits  
   cd            contains pointers to tables etc.  
3252    
3253  Returns:        TRUE if possessifying is wanted  Returns:      nothing
3254  */  */
3255    
3256  static BOOL  static void
3257  check_auto_possessive(const pcre_uchar *previous, BOOL utf,  auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
   const pcre_uchar *ptr, int options, compile_data *cd)  
3258  {  {
3259  pcre_uint32 c = NOTACHAR;  register pcre_uchar c;
3260  pcre_uint32 next;  const pcre_uchar *end;
3261  int escape;  pcre_uint32 list[8];
 pcre_uchar op_code = *previous++;  
   
 /* Skip whitespace and comments in extended mode */  
3262    
3263  if ((options & PCRE_EXTENDED) != 0)  for (;;)
3264    {    {
3265    for (;;)    c = *code;
3266    
3267      if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
3268      {      {
3269      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      c -= get_repeat_base(c) - OP_STAR;
3270      if (*ptr == CHAR_NUMBER_SIGN)      end = (c <= OP_MINUPTO) ?
3271          get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3272        list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3273    
3274        if (end != NULL && compare_opcodes(end, utf, cd, list))
3275        {        {
3276        ptr++;        switch(c)
       while (*ptr != CHAR_NULL)  
3277          {          {
3278          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          case OP_STAR:
3279          ptr++;          *code += OP_POSSTAR - OP_STAR;
3280  #ifdef SUPPORT_UTF          break;
3281          if (utf) FORWARDCHAR(ptr);  
3282  #endif          case OP_MINSTAR:
3283            *code += OP_POSSTAR - OP_MINSTAR;
3284            break;
3285    
3286            case OP_PLUS:
3287            *code += OP_POSPLUS - OP_PLUS;
3288            break;
3289    
3290            case OP_MINPLUS:
3291            *code += OP_POSPLUS - OP_MINPLUS;
3292            break;
3293    
3294            case OP_QUERY:
3295            *code += OP_POSQUERY - OP_QUERY;
3296            break;
3297    
3298            case OP_MINQUERY:
3299            *code += OP_POSQUERY - OP_MINQUERY;
3300            break;
3301    
3302            case OP_UPTO:
3303            *code += OP_POSUPTO - OP_UPTO;
3304            break;
3305    
3306            case OP_MINUPTO:
3307            *code += OP_MINUPTO - OP_UPTO;
3308            break;
3309          }          }
3310        }        }
3311      else break;      c = *code;
3312      }      }
   }  
3313    
3314  /* If the next item is one that we can handle, get its value. A non-negative    switch(c)
3315  value is a character, a negative value is an escape value. */      {
3316        case OP_END:
3317        return;
3318    
3319  if (*ptr == CHAR_BACKSLASH)      case OP_TYPESTAR:
3320    {      case OP_TYPEMINSTAR:
3321    int temperrorcode = 0;      case OP_TYPEPLUS:
3322    escape = check_escape(&ptr, &next, &temperrorcode, cd->bracount, options,      case OP_TYPEMINPLUS:
3323      FALSE);      case OP_TYPEQUERY:
3324    if (temperrorcode != 0) return FALSE;      case OP_TYPEMINQUERY:
3325    ptr++;    /* Point after the escape sequence */      case OP_TYPEPOSSTAR:
3326    }      case OP_TYPEPOSPLUS:
3327  else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0)      case OP_TYPEPOSQUERY:
3328    {      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
3329    escape = 0;      break;
 #ifdef SUPPORT_UTF  
   if (utf) { GETCHARINC(next, ptr); } else  
 #endif  
   next = *ptr++;  
   }  
 else return FALSE;  
3330    
3331  /* Skip whitespace and comments in extended mode */      case OP_TYPEUPTO:
3332        case OP_TYPEMINUPTO:
3333        case OP_TYPEEXACT:
3334        case OP_TYPEPOSUPTO:
3335        if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
3336          code += 2;
3337        break;
3338    
3339  if ((options & PCRE_EXTENDED) != 0)      case OP_XCLASS:
3340    {      code += GET(code, 1);
3341    for (;;)      break;
3342      {  
3343      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      case OP_MARK:
3344      if (*ptr == CHAR_NUMBER_SIGN)      case OP_PRUNE_ARG:
3345        {      case OP_SKIP_ARG:
3346        ptr++;      case OP_THEN_ARG:
3347        while (*ptr != CHAR_NULL)      code += code[1];
3348          {      break;
         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }  
         ptr++;  
 #ifdef SUPPORT_UTF  
         if (utf) FORWARDCHAR(ptr);  
 #endif  
         }  
       }  
     else break;  
3349      }      }
   }  
3350    
3351  /* If the next thing is itself optional, we have to give up. */    /* Add in the fixed length from the table */
3352    
3353  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||    code += PRIV(OP_lengths)[c];
   STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)  
     return FALSE;  
3354    
3355  /* If the previous item is a character, get its value. */    /* In UTF-8 mode, opcodes that are followed by a character may be followed by
3356      a multi-byte character. The length in the table is a minimum, so we have to
3357      arrange to skip the extra bytes. */
3358    
3359  if (op_code == OP_CHAR || op_code == OP_CHARI ||  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3360      op_code == OP_NOT || op_code == OP_NOTI)    if (utf) switch(c)
3361    {      {
3362  #ifdef SUPPORT_UTF      case OP_CHAR:
3363    GETCHARTEST(c, previous);      case OP_CHARI:
3364        case OP_NOT:
3365        case OP_NOTI:
3366        case OP_STAR:
3367        case OP_MINSTAR:
3368        case OP_PLUS:
3369        case OP_MINPLUS:
3370        case OP_QUERY:
3371        case OP_MINQUERY:
3372        case OP_UPTO:
3373        case OP_MINUPTO:
3374        case OP_EXACT:
3375        case OP_POSSTAR:
3376        case OP_POSPLUS:
3377        case OP_POSQUERY:
3378        case OP_POSUPTO:
3379        case OP_STARI:
3380        case OP_MINSTARI:
3381        case OP_PLUSI:
3382        case OP_MINPLUSI:
3383        case OP_QUERYI:
3384        case OP_MINQUERYI:
3385        case OP_UPTOI:
3386        case OP_MINUPTOI:
3387        case OP_EXACTI:
3388        case OP_POSSTARI:
3389        case OP_POSPLUSI:
3390        case OP_POSQUERYI:
3391        case OP_POSUPTOI:
3392        case OP_NOTSTAR:
3393        case OP_NOTMINSTAR:
3394        case OP_NOTPLUS:
3395        case OP_NOTMINPLUS:
3396        case OP_NOTQUERY:
3397        case OP_NOTMINQUERY:
3398        case OP_NOTUPTO:
3399        case OP_NOTMINUPTO:
3400        case OP_NOTEXACT:
3401        case OP_NOTPOSSTAR:
3402        case OP_NOTPOSPLUS:
3403        case OP_NOTPOSQUERY:
3404        case OP_NOTPOSUPTO:
3405        case OP_NOTSTARI:
3406        case OP_NOTMINSTARI:
3407        case OP_NOTPLUSI:
3408        case OP_NOTMINPLUSI:
3409        case OP_NOTQUERYI:
3410        case OP_NOTMINQUERYI:
3411        case OP_NOTUPTOI:
3412        case OP_NOTMINUPTOI:
3413        case OP_NOTEXACTI:
3414        case OP_NOTPOSSTARI:
3415        case OP_NOTPOSPLUSI:
3416        case OP_NOTPOSQUERYI:
3417        case OP_NOTPOSUPTOI:
3418        if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
3419        break;
3420        }
3421  #else  #else
3422    c = *previous;    (void)(utf);  /* Keep compiler happy by referencing function argument */
3423  #endif  #endif
3424    }    }
3425    }
3426    
 /* Now compare the next item with the previous opcode. First, handle cases when  
 the next item is a character. */  
3427    
 if (escape == 0)  
   {  
   /* For a caseless UTF match, the next character may have more than one other  
   case, which maps to the special PT_CLIST property. Check this first. */  
3428    
3429  #ifdef SUPPORT_UCP  /*************************************************
3430    if (utf && c != NOTACHAR && (options & PCRE_CASELESS) != 0)  *           Check for POSIX class syntax         *
3431      {  *************************************************/
     unsigned int ocs = UCD_CASESET(next);  
     if (ocs > 0) return check_char_prop(c, PT_CLIST, ocs, op_code >= OP_NOT);  
     }  
 #endif  
3432    
3433    switch(op_code)  /* This function is called when the sequence "[:" or "[." or "[=" is
3434      {  encountered in a character class. It checks whether this is followed by a
3435      case OP_CHAR:  sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
3436      return c != next;  reach an unescaped ']' without the special preceding character, return FALSE.
3437    
3438      /* For CHARI (caseless character) we must check the other case. If we have  Originally, this function only recognized a sequence of letters between the
3439      Unicode property support, we can use it to test the other case of  terminators, but it seems that Perl recognizes any sequence of characters,
3440      high-valued characters. We know that next can have only one other case,  though of course unknown POSIX names are subsequently rejected. Perl gives an
3441      because multi-other-case characters are dealt with above. */  "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
3442    didn't consider this to be a POSIX class. Likewise for [:1234:].
3443    
3444      case OP_CHARI:  The problem in trying to be exactly like Perl is in the handling of escapes. We
3445      if (c == next) return FALSE;  have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
3446  #ifdef SUPPORT_UTF  class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
3447      if (utf)  below handles the special case of \], but does not try to do any other escape
3448        {  processing. This makes it different from Perl for cases such as [:l\ower:]
3449        pcre_uint32 othercase;  where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
3450        if (next < 128) othercase = cd->fcc[next]; else  "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
3451  #ifdef SUPPORT_UCP  I think.
       othercase = UCD_OTHERCASE(next);  
 #else  
       othercase = NOTACHAR;  
 #endif  
       return c != othercase;  
       }  
     else  
 #endif  /* SUPPORT_UTF */  
     return (c != TABLE_GET(next, cd->fcc, next));  /* Not UTF */  
3452    
3453      case OP_NOT:  A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
3454      return c == next;  It seems that the appearance of a nested POSIX class supersedes an apparent
3455    external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
3456    a digit.
3457    
3458      case OP_NOTI:  In Perl, unescaped square brackets may also appear as part of class names. For
3459      if (c == next) return TRUE;  example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
3460  #ifdef SUPPORT_UTF  [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
3461      if (utf)  seem right at all. PCRE does not allow closing square brackets in POSIX class
3462    names.
3463    
3464    Arguments:
3465      ptr      pointer to the initial [
3466      endptr   where to return the end pointer
3467    
3468    Returns:   TRUE or FALSE
3469    */
3470    
3471    static BOOL
3472    check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
3473    {
3474    pcre_uchar terminator;          /* Don't combine these lines; the Solaris cc */
3475    terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
3476    for (++ptr; *ptr != CHAR_NULL; ptr++)
3477      {
3478      if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3479        ptr++;
3480      else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
3481      else
3482        {
3483        if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3484        {        {
3485        pcre_uint32 othercase;        *endptr = ptr;
3486        if (next < 128) othercase = cd->fcc[next]; else        return TRUE;
 #ifdef SUPPORT_UCP  
       othercase = UCD_OTHERCASE(next);  
 #else  
       othercase = NOTACHAR;  
 #endif  
       return c == othercase;  
3487        }        }
3488      else      if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
3489  #endif  /* SUPPORT_UTF */           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3490      return (c == TABLE_GET(next, cd->fcc, next));  /* Not UTF */            ptr[1] == CHAR_EQUALS_SIGN) &&
3491            check_posix_syntax(ptr, endptr))
3492          return FALSE;
3493        }
3494      }
3495    return FALSE;
3496    }
3497    
     /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.  
     When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */  
3498    
     case OP_DIGIT:  
     return next > 255 || (cd->ctypes[next] & ctype_digit) == 0;  
3499    
     case OP_NOT_DIGIT:  
     return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0;  
3500    
3501      case OP_WHITESPACE:  /*************************************************
3502      return next > 255 || (cd->ctypes[next] & ctype_space) == 0;  *          Check POSIX class name                *
3503    *************************************************/
3504    
3505      case OP_NOT_WHITESPACE:  /* This function is called to check the name given in a POSIX-style class entry
3506      return next <= 255 && (cd->ctypes[next] & ctype_space) != 0;  such as [:alnum:].
3507    
3508      case OP_WORDCHAR:  Arguments:
3509      return next > 255 || (cd->ctypes[next] & ctype_word) == 0;    ptr        points to the first letter
3510      len        the length of the name
3511    
3512      case OP_NOT_WORDCHAR:  Returns:     a value representing the name, or -1 if unknown
3513      return next <= 255 && (cd->ctypes[next] & ctype_word) != 0;  */
3514    
3515      case OP_HSPACE:  static int
3516      case OP_NOT_HSPACE:  check_posix_name(const pcre_uchar *ptr, int len)
3517      switch(next)  {
3518        {  const char *pn = posix_names;
3519        HSPACE_CASES:  register int yield = 0;
3520        return op_code == OP_NOT_HSPACE;  while (posix_name_lengths[yield] != 0)
3521      {
3522      if (len == posix_name_lengths[yield] &&
3523        STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
3524      pn += posix_name_lengths[yield] + 1;
3525      yield++;
3526      }
3527    return -1;
3528    }
3529    
       default:  
       return op_code != OP_NOT_HSPACE;  
       }  
3530    
3531      case OP_ANYNL:  /*************************************************
3532      case OP_VSPACE:  *    Adjust OP_RECURSE items in repeated group   *
3533      case OP_NOT_VSPACE:  *************************************************/
     switch(next)  
       {  
       VSPACE_CASES:  
       return op_code == OP_NOT_VSPACE;  
3534    
3535        default:  /* OP_RECURSE items contain an offset from the start of the regex to the group
3536        return op_code != OP_NOT_VSPACE;  that is referenced. This means that groups can be replicated for fixed
3537        }  repetition simply by copying (because the recursion is allowed to refer to
3538    earlier groups that are outside the current group). However, when a group is
3539    optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
3540    inserted before it, after it has been compiled. This means that any OP_RECURSE
3541    items within it that refer to the group itself or any contained groups have to
3542    have their offsets adjusted. That is one of the jobs of this function. Before it
3543    is called, the partially compiled regex must be temporarily terminated with
3544    OP_END.
3545    
3546  #ifdef SUPPORT_UCP  This function has been extended with the possibility of forward references for
3547      case OP_PROP:  recursions and subroutine calls. It must also check the list of such references
3548      return check_char_prop(next, previous[0], previous[1], FALSE);  for the group we are dealing with. If it finds that one of the recursions in
3549    the current group is on this list, it adjusts the offset in the list, not the
3550    value in the reference (which is a group number).
3551    
3552      case OP_NOTPROP:  Arguments:
3553      return check_char_prop(next, previous[0], previous[1], TRUE);    group      points to the start of the group
3554  #endif    adjust     the amount by which the group is to be moved
3555      utf        TRUE in UTF-8 / UTF-16 / UTF-32 mode
3556      cd         contains pointers to tables etc.
3557      save_hwm   the hwm forward reference pointer at the start of the group
3558    
3559      default:  Returns:     nothing
3560      return FALSE;  */
     }  
   }  
3561    
3562  /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP  static void
3563  is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are  adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
3564  generated only when PCRE_UCP is *not* set, that is, when only ASCII    pcre_uchar *save_hwm)
3565  characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are  {
3566  replaced by OP_PROP codes when PCRE_UCP is set. */  pcre_uchar *ptr = group;
3567    
3568  switch(op_code)  while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
3569    {    {
3570    case OP_CHAR:    int offset;
3571    case OP_CHARI:    pcre_uchar *hc;
3572    switch(escape)  
3573      {    /* See if this recursion is on the forward reference list. If so, adjust the
3574      case ESC_d:    reference. */
     return c > 255 || (cd->ctypes[c] & ctype_digit) == 0;  
3575    
3576      case ESC_D:    for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
3577      return c <= 255 && (cd->ctypes[c] & ctype_digit) != 0;      {
3578        offset = (int)GET(hc, 0);
3579        if (cd->start_code + offset == ptr + 1)
3580          {
3581          PUT(hc, 0, offset + adjust);
3582          break;
3583          }
3584        }
3585    
3586      case ESC_s:    /* Otherwise, adjust the recursion offset if it's after the start of this
3587      return c > 255 || (cd->ctypes[c] & ctype_space) == 0;    group. */
3588    
3589      case ESC_S:    if (hc >= cd->hwm)
3590      return c <= 255 && (cd->ctypes[c] & ctype_space) != 0;      {
3591        offset = (int)GET(ptr, 1);
3592        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
3593        }
3594    
3595      case ESC_w:    ptr += 1 + LINK_SIZE;
3596      return c > 255 || (cd->ctypes[c] & ctype_word) == 0;    }
3597    }
3598    
     case ESC_W:  
     return c <= 255 && (cd->ctypes[c] & ctype_word) != 0;  
3599    
     case ESC_h:  
     case ESC_H:  
     switch(c)  
       {  
       HSPACE_CASES:  
       return escape != ESC_h;  
3600    
3601        default:  /*************************************************
3602        return escape == ESC_h;  *        Insert an automatic callout point       *
3603        }  *************************************************/
3604    
3605      case ESC_v:  /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
3606      case ESC_V:  callout points before each pattern item.
     switch(c)  
       {  
       VSPACE_CASES:  
       return escape != ESC_v;  
3607    
3608        default:  Arguments:
3609        return escape == ESC_v;    code           current code pointer
3610        }    ptr            current pattern pointer
3611      cd             pointers to tables etc
3612    
3613      /* When PCRE_UCP is set, these values get generated for \d etc. Find  Returns:         new code pointer
3614      their substitutions and process them. The result will always be either  */
     ESC_p or ESC_P. Then fall through to process those values. */  
3615    
3616  #ifdef SUPPORT_UCP  static pcre_uchar *
3617      case ESC_du:  auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
3618      case ESC_DU:  {
3619      case ESC_wu:  *code++ = OP_CALLOUT;
3620      case ESC_WU:  *code++ = 255;
3621      case ESC_su:  PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
3622      case ESC_SU:  PUT(code, LINK_SIZE, 0);                       /* Default length */
3623        {  return code + 2 * LINK_SIZE;
3624        int temperrorcode = 0;  }
       ptr = substitutes[escape - ESC_DU];  
       escape = check_escape(&ptr, &next, &temperrorcode, 0, options, FALSE);  
       if (temperrorcode != 0) return FALSE;  
       ptr++;    /* For compatibility */  
       }  
     /* Fall through */  
3625    
     case ESC_p:  
     case ESC_P:  
       {  
       unsigned int ptype = 0, pdata = 0;  
       int errorcodeptr;  
       BOOL negated;  
3626    
       ptr--;      /* Make ptr point at the p or P */  
       if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcodeptr))  
         return FALSE;  
       ptr++;      /* Point past the final curly ket */  
3627    
3628        /* If the property item is optional, we have to give up. (When generated  /*************************************************
3629        from \d etc by PCRE_UCP, this test will have been applied much earlier,  *         Complete a callout item                *
3630        to the original \d etc. At this point, ptr will point to a zero byte. */  *************************************************/
3631    
3632        if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||  /* A callout item contains the length of the next item in the pattern, which
3633          STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)  we can't fill in till after we have reached the relevant point. This is used
3634            return FALSE;  for both automatic and manual callouts.
3635    
3636        /* Do the property check. */  Arguments:
3637      previous_callout   points to previous callout item
3638      ptr                current pattern pointer
3639      cd                 pointers to tables etc
3640    
3641        return check_char_prop(c, ptype, pdata, (escape == ESC_P) != negated);  Returns:             nothing
3642        }  */
 #endif  
3643    
3644      default:  static void
3645      return FALSE;  complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
3646      }  {
3647    int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
3648    PUT(previous_callout, 2 + LINK_SIZE, length);
3649    }
3650    
   /* In principle, support for Unicode properties should be integrated here as  
   well. It means re-organizing the above code so as to get hold of the property  
   values before switching on the op-code. However, I wonder how many patterns  
   combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,  
   these op-codes are never generated.) */  
3651    
   case OP_DIGIT:  
   return escape == ESC_D || escape == ESC_s || escape == ESC_W ||  
          escape == ESC_h || escape == ESC_v || escape == ESC_R;  
3652    
3653    case OP_NOT_DIGIT:  #ifdef SUPPORT_UCP
3654    return escape == ESC_d;  /*************************************************
3655    *           Get othercase range                  *
3656    *************************************************/
3657    
3658    case OP_WHITESPACE:  /* This function is passed the start and end of a class range, in UTF-8 mode
3659    return escape == ESC_S || escape == ESC_d || escape == ESC_w;  with UCP support. It searches up the characters, looking for ranges of
3660    characters in the "other" case. Each call returns the next one, updating the
3661    start address. A character with multiple other cases is returned on its own
3662    with a special return value.
3663    
3664    case OP_NOT_WHITESPACE:  Arguments:
3665    return escape == ESC_s || escape == ESC_h || escape == ESC_v || escape == ESC_R;    cptr        points to starting character value; updated
3666      d           end value
3667      ocptr       where to put start of othercase range
3668      odptr       where to put end of othercase range
3669    
3670    case OP_HSPACE:  Yield:        -1 when no more
3671    return escape == ESC_S || escape == ESC_H || escape == ESC_d ||                 0 when a range is returned
3672           escape == ESC_w || escape == ESC_v || escape == ESC_R;                >0 the CASESET offset for char with multiple other cases
3673                    in this case, ocptr contains the original
3674    */
3675    
3676    case OP_NOT_HSPACE:  static int
3677    return escape == ESC_h;  get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
3678      pcre_uint32 *odptr)
3679    {
3680    pcre_uint32 c, othercase, next;
3681    unsigned int co;
3682    
3683    /* Can't have \S in here because VT matches \S (Perl anomaly) */  /* Find the first character that has an other case. If it has multiple other
3684    case OP_ANYNL:  cases, return its case offset value. */
   case OP_VSPACE:  
   return escape == ESC_V || escape == ESC_d || escape == ESC_w;  
3685    
3686    case OP_NOT_VSPACE:  for (c = *cptr; c <= d; c++)
3687    return escape == ESC_v || escape == ESC_R;    {
3688      if ((co = UCD_CASESET(c)) != 0)
3689        {
3690        *ocptr = c++;   /* Character that has the set */
3691        *cptr = c;      /* Rest of input range */
3692        return (int)co;
3693        }
3694      if ((othercase = UCD_OTHERCASE(c)) != c) break;
3695      }
3696    
3697    case OP_WORDCHAR:  if (c > d) return -1;  /* Reached end of range */
   return escape == ESC_W || escape == ESC_s || escape == ESC_h ||  
          escape == ESC_v || escape == ESC_R;  
3698    
3699    case OP_NOT_WORDCHAR:  *ocptr = othercase;
3700    return escape == ESC_w || escape == ESC_d;  next = othercase + 1;
3701    
3702    default:  for (++c; c <= d; c++)
3703    return FALSE;    {
3704      if (UCD_OTHERCASE(c) != next) break;
3705      next++;
3706    }    }
3707    
3708  /* Control does not reach here */  *odptr = next - 1;     /* End of othercase range */
3709    *cptr = c;             /* Rest of input range */
3710    return 0;
3711  }  }
3712    #endif  /* SUPPORT_UCP */
3713    
3714    
3715    
# Line 3754  to find out the amount of memory needed, Line 3962  to find out the amount of memory needed,
3962  phase. The value of lengthptr distinguishes the two phases.  phase. The value of lengthptr distinguishes the two phases.
3963    
3964  Arguments:  Arguments:
3965    optionsptr     pointer to the option bits    optionsptr        pointer to the option bits
3966    codeptr        points to the pointer to the current code point    codeptr           points to the pointer to the current code point
3967    ptrptr         points to the current pattern pointer    ptrptr            points to the current pattern pointer
3968    errorcodeptr   points to error code variable    errorcodeptr      points to error code variable
3969    firstcharptr    place to put the first required character    firstcharptr      place to put the first required character
3970    firstcharflagsptr place to put the first character flags, or a negative number    firstcharflagsptr place to put the first character flags, or a negative number
3971    reqcharptr     place to put the last required character    reqcharptr        place to put the last required character
3972    reqcharflagsptr place to put the last required character flags, or a negative number    reqcharflagsptr   place to put the last required character flags, or a negative number
3973    bcptr          points to current branch chain    bcptr             points to current branch chain
3974    cond_depth     conditional nesting depth    cond_depth        conditional nesting depth
3975    cd             contains pointers to tables etc.    cd                contains pointers to tables etc.
3976    lengthptr      NULL during the real compile phase    lengthptr         NULL during the real compile phase
3977                   points to length accumulator during pre-compile phase                      points to length accumulator during pre-compile phase
3978    
3979  Returns:         TRUE on success  Returns:            TRUE on success
3980                   FALSE, with *errorcodeptr set non-zero on error                      FALSE, with *errorcodeptr set non-zero on error
3981  */  */
3982    
3983  static BOOL  static BOOL
# Line 4420  for (;; ptr++) Line 4628  for (;; ptr++)
4628              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
4629              continue;              continue;
4630    
4631              /* Perl 5.004 onwards omits VT from \s, but we must preserve it              /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
4632              if it was previously set by something earlier in the character              5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
4633              class. Luckily, the value of CHAR_VT is 0x0b in both ASCII and              previously set by something earlier in the character class.
4634              EBCDIC, so we lazily just adjust the appropriate bit. */              Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
4635                we could just adjust the appropriate bit. From PCRE 8.34 we no
4636                longer treat \s and \S specially. */
4637    
4638              case ESC_s:              case ESC_s:
4639              classbits[0] |= cbits[cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
             classbits[1] |= cbits[cbit_space+1] & ~0x08;  
             for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];  
4640              continue;              continue;
4641    
4642              case ESC_S:              case ESC_S:
4643              should_flip_negation = TRUE;              should_flip_negation = TRUE;
4644              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */  
4645              continue;              continue;
4646    
4647              /* The rest apply in both UCP and non-UCP cases. */              /* The rest apply in both UCP and non-UCP cases. */
# Line 4933  for (;; ptr++) Line 5140  for (;; ptr++)
5140            }            }
5141          }          }
5142    
       /* If the repetition is unlimited, it pays to see if the next thing on  
       the line is something that cannot possibly match this character. If so,  
       automatically possessifying this item gains some performance in the case  
       where the match fails. */  
   
       if (!possessive_quantifier &&  
           repeat_max < 0 &&  
           check_auto_possessive(previous, utf, ptr + 1, options, cd))  
         {  
         repeat_type = 0;    /* Force greedy */  
         possessive_quantifier = TRUE;  
         }  
   
5143        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
5144        }        }
5145    
# Line 4963  for (;; ptr++) Line 5157  for (;; ptr++)
5157        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
5158        c = *previous;        c = *previous;
5159    
       if (!possessive_quantifier &&  
           repeat_max < 0 &&  
           check_auto_possessive(previous, utf, ptr + 1, options, cd))  
         {  
         repeat_type = 0;    /* Force greedy */  
         possessive_quantifier = TRUE;  
         }  
   
5160        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
5161        if (*previous == OP_PROP || *previous == OP_NOTPROP)        if (*previous == OP_PROP || *previous == OP_NOTPROP)
5162          {          {
# Line 5119  for (;; ptr++) Line 5305  for (;; ptr++)
5305      /* If previous was a character class or a back reference, we put the repeat      /* If previous was a character class or a back reference, we put the repeat
5306      stuff after it, but just skip the item if the repeat was {0,0}. */      stuff after it, but just skip the item if the repeat was {0,0}. */
5307    
5308      else if (*previous == OP_CLASS ||      else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
              *previous == OP_NCLASS ||  
5309  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5310               *previous == OP_XCLASS ||               *previous == OP_XCLASS ||
5311  #endif  #endif
5312               *previous == OP_REF ||               *previous == OP_REF   || *previous == OP_REFI ||
5313               *previous == OP_REFI)               *previous == OP_DNREF || *previous == OP_DNREFI)
5314        {        {
5315        if (repeat_max == 0)        if (repeat_max == 0)
5316          {          {
# Line 5846  for (;; ptr++) Line 6031  for (;; ptr++)
6031                 tempptr[2] == CHAR_LESS_THAN_SIGN))                 tempptr[2] == CHAR_LESS_THAN_SIGN))
6032            break;            break;
6033    
6034          /* Most other conditions use OP_CREF (a couple change to OP_RREF          /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6035          below), and all need to skip 1+IMM2_SIZE bytes at the start of the group. */          need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6036    
6037          code[1+LINK_SIZE] = OP_CREF;          code[1+LINK_SIZE] = OP_CREF;
6038          skipbytes = 1+IMM2_SIZE;          skipbytes = 1+IMM2_SIZE;
# Line 5863  for (;; ptr++) Line 6048  for (;; ptr++)
6048            }            }
6049    
6050          /* Check for a test for a named group's having been set, using the Perl          /* Check for a test for a named group's having been set, using the Perl
6051          syntax (?(<name>) or (?('name') */          syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6052            syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). As names may
6053            consist entirely of digits, there is scope for ambiguity. */
6054    
6055          else if (ptr[1] == CHAR_LESS_THAN_SIGN)          else if (ptr[1] == CHAR_LESS_THAN_SIGN)
6056            {            {
# Line 5880  for (;; ptr++) Line 6067  for (;; ptr++)
6067            terminator = CHAR_NULL;            terminator = CHAR_NULL;
6068            if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);            if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
6069            }            }
6070    
6071            /* When a name is one of a number of duplicates, a different opcode is
6072            used and it needs more memory. Unfortunately we cannot tell whether a
6073            name is a duplicate in the first pass, so we have to allow for more
6074            memory except when we know it is a relative numerical reference. */
6075    
6076            if (refsign < 0 && lengthptr != NULL) *lengthptr += IMM2_SIZE;
6077    
6078          /* We now expect to read a name; anything else is an error */          /* We now expect to read a name (possibly all digits); anything else
6079            is an error. In the case of all digits, also get it as a number. */
6080    
6081          if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)          if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
6082            {            {
# Line 5890  for (;; ptr++) Line 6085  for (;; ptr++)
6085            goto FAILED;            goto FAILED;
6086            }            }
6087    
         /* Read the name, but also get it as a number if it's all digits */  
   
6088          recno = 0;          recno = 0;
6089          name = ++ptr;          name = ++ptr;
6090          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
# Line 5902  for (;; ptr++) Line 6095  for (;; ptr++)
6095            }            }
6096          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
6097    
6098            /* Check the terminator */
6099    
6100          if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||          if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6101              *ptr++ != CHAR_RIGHT_PARENTHESIS)              *ptr++ != CHAR_RIGHT_PARENTHESIS)
6102            {            {
# Line 5937  for (;; ptr++) Line 6132  for (;; ptr++)
6132            }            }
6133    
6134          /* Otherwise (did not start with "+" or "-"), start by looking for the          /* Otherwise (did not start with "+" or "-"), start by looking for the
6135          name. If we find a name, add one to the opcode to change OP_CREF or          name. */
6136          OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,  
         except they record that the reference was originally to a name. The  
         information is used to check duplicate names. */  
   
6137          slot = cd->name_table;          slot = cd->name_table;
6138          for (i = 0; i < cd->names_found; i++)          for (i = 0; i < cd->names_found; i++)
6139            {            {
# Line 5949  for (;; ptr++) Line 6141  for (;; ptr++)
6141            slot += cd->name_entry_size;            slot += cd->name_entry_size;
6142            }            }
6143    
6144          /* Found a previous named subpattern */          /* Found the named subpattern. If the name is duplicated, add one to
6145            the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6146            appropriate data values. Otherwise, just insert the unique subpattern
6147            number. */
6148    
6149          if (i < cd->names_found)          if (i < cd->names_found)
6150            {            {
6151            recno = GET2(slot, 0);            int offset = i++;
6152            PUT2(code, 2+LINK_SIZE, recno);            int count = 1;
6153            code[1+LINK_SIZE]++;            recno = GET2(slot, 0);   /* Number from first found */
6154            }            for (; i < cd->names_found; i++)
6155                {
6156          /* Search the pattern for a forward reference */              slot += cd->name_entry_size;
6157                if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0) break;
6158          else if ((i = find_parens(cd, name, namelen,              count++;
6159                          (options & PCRE_EXTENDED) != 0, utf)) > 0)              }
6160            {            if (count > 1)
6161            PUT2(code, 2+LINK_SIZE, i);              {
6162            code[1+LINK_SIZE]++;              PUT2(code, 2+LINK_SIZE, offset);
6163                PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6164                skipbytes += IMM2_SIZE;
6165                code[1+LINK_SIZE]++;
6166                }
6167              else  /* Not a duplicated name */
6168                {
6169                PUT2(code, 2+LINK_SIZE, recno);
6170                }
6171            }            }
6172    
6173          /* If terminator == CHAR_NULL it means that the name followed directly          /* If terminator == CHAR_NULL it means that the name followed directly
# Line 6130  for (;; ptr++) Line 6333  for (;; ptr++)
6333          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
6334          DEFINE_NAME:    /* Come here from (?< handling */          DEFINE_NAME:    /* Come here from (?< handling */
6335          case CHAR_APOSTROPHE:          case CHAR_APOSTROPHE:
6336            terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
6337              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6338            name = ++ptr;
6339    
6340            while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6341            namelen = (int)(ptr - name);
6342    
6343            /* In the pre-compile phase, do a syntax check, remember the longest
6344            name, and then remember the group in a vector, expanding it if
6345            necessary. Duplicates for the same number are skipped; other duplicates
6346            are checked for validity. In the actual compile, there is nothing to
6347            do. */
6348    
6349            if (lengthptr != NULL)
6350            {            {
6351            terminator = (*ptr == CHAR_LESS_THAN_SIGN)?            named_group *ng;
6352              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;            pcre_uint32 number = cd->bracount + 1;
           name = ++ptr;  
6353    
6354            while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;            if (*ptr != (pcre_uchar)terminator)
6355            namelen = (int)(ptr - name);              {
6356                *errorcodeptr = ERR42;
6357                goto FAILED;
6358                }
6359    
6360            /* In the pre-compile phase, just do a syntax check. */            if (cd->names_found >= MAX_NAME_COUNT)
6361                {
6362                *errorcodeptr = ERR49;
6363                goto FAILED;
6364                }
6365    
6366            if (lengthptr != NULL)            if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
6367              {              {
6368              if (*ptr != (pcre_uchar)terminator)              cd->name_entry_size = namelen + IMM2_SIZE + 1;
6369                {              if (namelen > MAX_NAME_SIZE)
               *errorcodeptr = ERR42;  
               goto FAILED;  
               }  
             if (cd->names_found >= MAX_NAME_COUNT)  
6370                {                {
6371                *errorcodeptr = ERR49;                *errorcodeptr = ERR48;
6372                goto FAILED;                goto FAILED;
6373                }                }
6374              if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)              }
6375    
6376              /* Scan the list to check for duplicates. For duplicate names, if the
6377              number is the same, break the loop, which causes the name to be
6378              discarded; otherwise, if DUPNAMES is not set, give an error.
6379              If it is set, allow the name with a different number, but continue
6380              scanning in case this is a duplicate with the same number. For
6381              non-duplicate names, give an error if the number is duplicated. */
6382    
6383              ng = cd->named_groups;
6384              for (i = 0; i < cd->names_found; i++, ng++)
6385                {
6386                if (namelen == ng->length &&
6387                    STRNCMP_UC_UC(name, ng->name, namelen) == 0)
6388                {                {
6389                cd->name_entry_size = namelen + IMM2_SIZE + 1;                if (ng->number == number) break;
6390                if (namelen > MAX_NAME_SIZE)                if ((options & PCRE_DUPNAMES) == 0)
6391                  {                  {
6392                  *errorcodeptr = ERR48;                  *errorcodeptr = ERR43;
6393                  goto FAILED;                  goto FAILED;
6394                  }                  }
6395                  cd->dupnames = TRUE;  /* Duplicate names exist */
6396                  }
6397                else if (ng->number == number)
6398                  {
6399                  *errorcodeptr = ERR65;
6400                  goto FAILED;
6401                }                }
6402              }              }
6403    
6404            /* In the real compile, create the entry in the table, maintaining            if (i >= cd->names_found)     /* Not a duplicate with same number */
           alphabetical order. Duplicate names for different numbers are  
           permitted only if PCRE_DUPNAMES is set. Duplicate names for the same  
           number are always OK. (An existing number can be re-used if (?|  
           appears in the pattern.) In either event, a duplicate name results in  
           a duplicate entry in the table, even if the number is the same. This  
           is because the number of names, and hence the table size, is computed  
           in the pre-compile, and it affects various numbers and pointers which  
           would all have to be modified, and the compiled code moved down, if  
           duplicates with the same number were omitted from the table. This  
           doesn't seem worth the hassle. However, *different* names for the  
           same number are not permitted. */  
   
           else  
6405              {              {
6406              BOOL dupname = FALSE;              /* Increase the list size if necessary */
             slot = cd->name_table;  
6407    
6408              for (i = 0; i < cd->names_found; i++)              if (cd->names_found >= cd->named_group_list_size)
6409                {                {
6410                int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(namelen));                int newsize = cd->named_group_list_size * 2;
6411                if (crc == 0)                named_group *newspace = (PUBL(malloc))
6412                  {                  (newsize * sizeof(named_group));
                 if (slot[IMM2_SIZE+namelen] == 0)  
                   {  
                   if (GET2(slot, 0) != cd->bracount + 1 &&  
                       (options & PCRE_DUPNAMES) == 0)  
                     {  
                     *errorcodeptr = ERR43;  
                     goto FAILED;  
                     }  
                   else dupname = TRUE;  
                   }  
                 else crc = -1;      /* Current name is a substring */  
                 }  
   
               /* Make space in the table and break the loop for an earlier  
               name. For a duplicate or later name, carry on. We do this for  
               duplicates so that in the simple case (when (?| is not used) they  
               are in order of their numbers. */  
6413    
6414                if (crc < 0)                if (newspace == NULL)
6415                  {                  {
6416                  memmove(slot + cd->name_entry_size, slot,                  *errorcodeptr = ERR21;
6417                    IN_UCHARS((cd->names_found - i) * cd->name_entry_size));                  goto FAILED;
                 break;  
6418                  }                  }
6419    
6420                /* Continue the loop for a later or duplicate name */                memcpy(newspace, cd->named_groups,
6421                    cd->named_group_list_size * sizeof(named_group));
6422                slot += cd->name_entry_size;                if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
6423                }                  (PUBL(free))((void *)cd->named_groups);
6424                  cd->named_groups = newspace;
6425              /* For non-duplicate names, check for a duplicate number before                cd->named_group_list_size = newsize;
             adding the new name. */  
   
             if (!dupname)  
               {  
               pcre_uchar *cslot = cd->name_table;  
               for (i = 0; i < cd->names_found; i++)  
                 {  
                 if (cslot != slot)  
                   {  
                   if (GET2(cslot, 0) == cd->bracount + 1)  
                     {  
                     *errorcodeptr = ERR65;  
                     goto FAILED;  
                     }  
                   }  
                 else i--;  
                 cslot += cd->name_entry_size;  
                 }  
6426                }                }
6427    
6428              PUT2(slot, 0, cd->bracount + 1);              cd->named_groups[cd->names_found].name = name;
6429              memcpy(slot + IMM2_SIZE, name, IN_UCHARS(namelen));              cd->named_groups[cd->names_found].length = namelen;
6430              slot[IMM2_SIZE + namelen] = 0;              cd->named_groups[cd->names_found].number = number;
6431                cd->names_found++;
6432              }              }
6433            }            }
6434    
6435          /* In both pre-compile and compile, count the number of names we've          ptr++;                    /* Move past > or ' in both passes. */
         encountered. */  
   
         cd->names_found++;  
         ptr++;                    /* Move past > or ' */  
6436          goto NUMBERED_GROUP;          goto NUMBERED_GROUP;
6437    
6438    
# Line 6277  for (;; ptr++) Line 6462  for (;; ptr++)
6462    
6463          if (lengthptr != NULL)          if (lengthptr != NULL)
6464            {            {
6465            const pcre_uchar *temp;            named_group *ng;
6466    
6467            if (namelen == 0)            if (namelen == 0)
6468              {              {
# Line 6295  for (;; ptr++) Line 6480  for (;; ptr++)
6480              goto FAILED;              goto FAILED;
6481              }              }
6482    
6483            /* The name table does not exist in the first pass, so we cannot            /* The name table does not exist in the first pass; instead we must
6484            do a simple search as in the code below. Instead, we have to scan the            scan the list of names encountered so far in order to get the
6485            pattern to find the number. It is important that we scan it only as            number. If the name is not found, set the value to 0 for a forward
6486            far as we have got because the syntax of named subpatterns has not            reference. */
6487            been checked for the rest of the pattern, and find_parens() assumes  
6488            correct syntax. In any case, it's a waste of resources to scan            ng = cd->named_groups;
6489            further. We stop the scan at the current point by temporarily            for (i = 0; i < cd->names_found; i++, ng++)
6490            adjusting the value of cd->endpattern. */              {
6491                if (namelen == ng->length &&
6492            temp = cd->end_pattern;                  STRNCMP_UC_UC(name, ng->name, namelen) == 0)
6493            cd->end_pattern = ptr;                break;
6494            recno = find_parens(cd, name, namelen,              }
6495              (options & PCRE_EXTENDED) != 0, utf);            recno = (i < cd->names_found)? ng->number : 0;
6496            cd->end_pattern = temp;  
6497            if (recno < 0) recno = 0;    /* Forward ref; set dummy number */            /* Count named back references. */
6498    
6499              if (!is_recurse) cd->namedrefcount++;
6500            }            }
6501    
6502          /* In the real compile, seek the name in the table. We check the name          /* In the real compile, search the name table. We check the name
6503          first, and then check that we have reached the end of the name in the          first, and then check that we have reached the end of the name in the
6504          table. That way, if the name that is longer than any in the table,          table. That way, if the name is longer than any in the table, the
6505          the comparison will fail without reading beyond the table entry. */          comparison will fail without reading beyond the table entry. */
6506    
6507          else          else
6508            {            {
# Line 6328  for (;; ptr++) Line 6515  for (;; ptr++)
6515              slot += cd->name_entry_size;              slot += cd->name_entry_size;
6516              }              }
6517    
6518            if (i < cd->names_found)         /* Back reference */            if (i < cd->names_found)
6519              {              {
6520              recno = GET2(slot, 0);              recno = GET2(slot, 0);
6521              }              }
6522            else if ((recno =                /* Forward back reference */            else
                     find_parens(cd, name, namelen,  
                       (options & PCRE_EXTENDED) != 0, utf)) <= 0)  
6523              {              {
6524              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
6525              goto FAILED;              goto FAILED;
6526              }              }
6527            }            }
6528    
6529          /* In both phases, we can now go to the code that handles numerical          /* In both phases, for recursions, we can now go to the code that
6530          recursion or backreferences. */          handles numerical recursion. */
6531    
6532          if (is_recurse) goto HANDLE_RECURSION;          if (is_recurse) goto HANDLE_RECURSION;
6533            else goto HANDLE_REFERENCE;  
6534            /* In the second pass we must see if the name is duplicated. If so, we
6535            generate a different opcode. */
6536    
6537            if (lengthptr == NULL && cd->dupnames)
6538              {
6539              int count = 1;
6540              unsigned int index = i;
6541              pcre_uchar *cslot = slot + cd->name_entry_size;
6542    
6543              for (i++; i < cd->names_found; i++)
6544                {
6545                if (STRCMP_UC_UC(slot + IMM2_SIZE, cslot + IMM2_SIZE) != 0) break;
6546                count++;
6547                cslot += cd->name_entry_size;
6548                }
6549    
6550              if (count > 1)
6551                {
6552                if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6553                previous = code;
6554                *code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
6555                PUT2INC(code, 0, index);
6556                PUT2INC(code, 0, count);
6557    
6558                /* Process each potentially referenced group. */
6559    
6560                for (; slot < cslot; slot += cd->name_entry_size)
6561                  {
6562                  open_capitem *oc;
6563                  recno = GET2(slot, 0);
6564                  cd->backref_map |= (recno < 32)? (1 << recno) : 1;
6565                  if (recno > cd->top_backref) cd->top_backref = recno;
6566    
6567                  /* Check to see if this back reference is recursive, that is, it
6568                  is inside the group that it references. A flag is set so that the
6569                  group can be made atomic. */
6570    
6571                  for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6572                    {
6573                    if (oc->number == recno)
6574                      {
6575                      oc->flag = TRUE;
6576                      break;
6577                      }
6578                    }
6579                  }
6580    
6581                continue;  /* End of back ref handling */
6582                }
6583              }
6584    
6585            /* First pass, or a non-duplicated name. */
6586    
6587            goto HANDLE_REFERENCE;
6588    
6589    
6590          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
# Line 6444  for (;; ptr++) Line 6683  for (;; ptr++)
6683    
6684              if (called == NULL)              if (called == NULL)
6685                {                {
6686                if (find_parens(cd, NULL, recno,                if (recno > cd->final_bracount)
                     (options & PCRE_EXTENDED) != 0, utf) < 0)  
6687                  {                  {
6688                  *errorcodeptr = ERR15;                  *errorcodeptr = ERR15;
6689                  goto FAILED;                  goto FAILED;
# Line 6929  for (;; ptr++) Line 7167  for (;; ptr++)
7167          open_capitem *oc;          open_capitem *oc;
7168          recno = -escape;          recno = -escape;
7169    
7170          HANDLE_REFERENCE:    /* Come here from named backref handling */          /* Come here from named backref handling when the reference is to a
7171            single group (i.e. not to a duplicated name). */
7172    
7173            HANDLE_REFERENCE:
7174          if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;          if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7175          previous = code;          previous = code;
7176          *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;          *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
# Line 7058  for (;; ptr++) Line 7299  for (;; ptr++)
7299          *code++ = OP_PROP;          *code++ = OP_PROP;
7300          *code++ = PT_CLIST;          *code++ = PT_CLIST;
7301          *code++ = c;          *code++ = c;
7302          if (firstcharflags == REQ_UNSET) firstcharflags = zerofirstcharflags = REQ_NONE;          if (firstcharflags == REQ_UNSET)
7303              firstcharflags = zerofirstcharflags = REQ_NONE;
7304          break;          break;
7305          }          }
7306        }        }
# Line 7147  out the amount of memory needed, as well Line 7389  out the amount of memory needed, as well
7389  value of lengthptr distinguishes the two phases.  value of lengthptr distinguishes the two phases.
7390    
7391  Arguments:  Arguments:
7392    options        option bits, including any changes for this subpattern    options           option bits, including any changes for this subpattern
7393    codeptr        -> the address of the current code pointer    codeptr           -> the address of the current code pointer
7394    ptrptr         -> the address of the current pattern pointer    ptrptr            -> the address of the current pattern pointer
7395    errorcodeptr   -> pointer to error code variable    errorcodeptr      -> pointer to error code variable
7396    lookbehind     TRUE if this is a lookbehind assertion    lookbehind        TRUE if this is a lookbehind assertion
7397    reset_bracount TRUE to reset the count for each branch    reset_bracount    TRUE to reset the count for each branch
7398    skipbytes      skip this many bytes at start (for brackets and OP_COND)    skipbytes         skip this many bytes at start (for brackets and OP_COND)
7399    cond_depth     depth of nesting for conditional subpatterns    cond_depth        depth of nesting for conditional subpatterns
7400    firstcharptr    place to put the first required character    firstcharptr      place to put the first required character
7401    firstcharflagsptr place to put the first character flags, or a negative number    firstcharflagsptr place to put the first character flags, or a negative number
7402    reqcharptr     place to put the last required character    reqcharptr        place to put the last required character
7403    reqcharflagsptr place to put the last required character flags, or a negative number    reqcharflagsptr   place to put the last required character flags, or a negative number
7404    bcptr          pointer to the chain of currently open branches    bcptr             pointer to the chain of currently open branches
7405    cd             points to the data block with tables pointers etc.    cd                points to the data block with tables pointers etc.
7406    lengthptr      NULL during the real compile phase    lengthptr         NULL during the real compile phase
7407                   points to length accumulator during pre-compile phase                      points to length accumulator during pre-compile phase
7408    
7409  Returns:         TRUE on success  Returns:            TRUE on success
7410  */  */
7411    
7412  static BOOL  static BOOL
# Line 7615  do { Line 7857  do {
7857       switch (*scode)       switch (*scode)
7858         {         {
7859         case OP_CREF:         case OP_CREF:
7860         case OP_NCREF:         case OP_DNCREF:
7861         case OP_RREF:         case OP_RREF:
7862         case OP_NRREF:         case OP_DNRREF:
7863         case OP_DEF:         case OP_DEF:
7864         return FALSE;         return FALSE;
7865    
# Line 7701  return TRUE; Line 7943  return TRUE;
7943  discarded, because they can cause conflicts with actual literals that follow.  discarded, because they can cause conflicts with actual literals that follow.
7944  However, if we end up without a first char setting for an unanchored pattern,  However, if we end up without a first char setting for an unanchored pattern,
7945  it is worth scanning the regex to see if there is an initial asserted first  it is worth scanning the regex to see if there is an initial asserted first
7946  char. If all branches start with the same asserted char, or with a bracket all  char. If all branches start with the same asserted char, or with a
7947  of whose alternatives start with the same asserted char (recurse ad lib), then  non-conditional bracket all of whose alternatives start with the same asserted
7948  we return that char, otherwise -1.  char (recurse ad lib), then we return that char, with the flags set to zero or
7949    REQ_CASELESS; otherwise return zero with REQ_NONE in the flags.
7950    
7951  Arguments:  Arguments:
7952    code       points to start of expression (the bracket)    code       points to start of expression (the bracket)
7953    flags       points to the first char flags, or to REQ_NONE    flags      points to the first char flags, or to REQ_NONE
7954    inassert   TRUE if in an assertion    inassert   TRUE if in an assertion
7955    
7956  Returns:     the fixed first char, or 0 with REQ_NONE in flags  Returns:     the fixed first char, or 0 with REQ_NONE in flags
# Line 7744  do { Line 7987  do {
7987       case OP_ASSERT:       case OP_ASSERT:
7988       case OP_ONCE:       case OP_ONCE:
7989       case OP_ONCE_NC:       case OP_ONCE_NC:
      case OP_COND:  
7990       d = find_firstassertedchar(scode, &dflags, op == OP_ASSERT);       d = find_firstassertedchar(scode, &dflags, op == OP_ASSERT);
7991       if (dflags < 0)       if (dflags < 0)
7992         return 0;         return 0;
# Line 7789  return c; Line 8031  return c;
8031    
8032    
8033  /*************************************************  /*************************************************
8034    *     Add an entry to the name/number table      *
8035    *************************************************/
8036    
8037    /* This function is called between compiling passes to add an entry to the
8038    name/number table, maintaining alphabetical order. Checking for permitted
8039    and forbidden duplicates has already been done. No check for table space is made.
8040    
8041    Arguments:
8042      cd           the compile data block (name_table, name_entry_size, names_found)
8043      name         the name to add; only the first "length" units are read
8044      length       the length of the name, in pcre_uchar units
8045      groupno      the group number, stored at the start of the new entry
8046    
8047    Returns:       nothing; cd->names_found is incremented
8048    */
8049    
8050    static void
8051    add_name(compile_data *cd, const pcre_uchar *name, int length,
8052      unsigned int groupno)
8053    {
8054    int i;
8055    pcre_uchar *slot = cd->name_table;
8056    
8057    for (i = 0; i < cd->names_found; i++)
8058      {
8059      int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(length));
8060      if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8061        crc = -1; /* New name is a proper prefix of this entry, so it sorts first */
8062    
8063      /* Make space in the table and break the loop for an earlier name: all the
8064      entries from this slot onwards are moved up by one entry. For a duplicate or
8065      later name, carry on. We do this for duplicates so that in the simple case
8066      (when (?| is not used) they are in order of their numbers. */
8067    
8068      if (crc < 0)
8069        {
8070        memmove(slot + cd->name_entry_size, slot,
8071          IN_UCHARS((cd->names_found - i) * cd->name_entry_size));
8072        break;
8073        }
8074    
8075      /* Continue the loop for a later or duplicate name */
8076    
8077      slot += cd->name_entry_size;
8078      }
8079    /* slot is now the gap just opened, or the first unused entry after the last. */
8080    PUT2(slot, 0, groupno);
8081    memcpy(slot + IMM2_SIZE, name, IN_UCHARS(length));
8082    slot[IMM2_SIZE + length] = 0;
8083    cd->names_found++;
8084    }
8085    
8086    
8087    
8088    /*************************************************
8089  *        Compile a Regular Expression            *  *        Compile a Regular Expression            *
8090  *************************************************/  *************************************************/
8091    
# Line 7875  new memory is obtained from malloc(). */ Line 8172  new memory is obtained from malloc(). */
8172    
8173  pcre_uchar cworkspace[COMPILE_WORK_SIZE];  pcre_uchar cworkspace[COMPILE_WORK_SIZE];
8174    
8175    /* This vector is used for remembering named groups during the pre-compile. In a
8176    similar way to cworkspace, it can be expanded using malloc() if necessary. */
8177    
8178    named_group named_groups[NAMED_GROUP_LIST_SIZE];
8179    
8180  /* Set this early so that early errors get offset 0. */  /* Set this early so that early errors get offset 0. */
8181    
8182  ptr = (const pcre_uchar *)pattern;  ptr = (const pcre_uchar *)pattern;
# Line 8137  cd->bracount = cd->final_bracount = 0; Line 8439  cd->bracount = cd->final_bracount = 0;
8439  cd->names_found = 0;  cd->names_found = 0;
8440  cd->name_entry_size = 0;  cd->name_entry_size = 0;
8441  cd->name_table = NULL;  cd->name_table = NULL;
8442    cd->dupnames = FALSE;
8443    cd->namedrefcount = 0;
8444  cd->start_code = cworkspace;  cd->start_code = cworkspace;
8445  cd->hwm = cworkspace;  cd->hwm = cworkspace;
8446  cd->start_workspace = cworkspace;  cd->start_workspace = cworkspace;
8447  cd->workspace_size = COMPILE_WORK_SIZE;  cd->workspace_size = COMPILE_WORK_SIZE;
8448    cd->named_groups = named_groups;
8449    cd->named_group_list_size = NAMED_GROUP_LIST_SIZE;
8450  cd->start_pattern = (const pcre_uchar *)pattern;  cd->start_pattern = (const pcre_uchar *)pattern;
8451  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
8452  cd->req_varyopt = 0;  cd->req_varyopt = 0;
# Line 8172  if (length > MAX_PATTERN_SIZE) Line 8478  if (length > MAX_PATTERN_SIZE)
8478    goto PCRE_EARLY_ERROR_RETURN;    goto PCRE_EARLY_ERROR_RETURN;
8479    }    }
8480    
8481  /* Compute the size of data block needed and get it, either from malloc or  /* If there are groups with duplicate names and there are also references by
8482  externally provided function. Integer overflow should no longer be possible  name, we must allow for the possibility of named references to duplicated
8483  because nowadays we limit the maximum value of cd->names_found and  groups. These require an extra data item each. */
 cd->name_entry_size. */  
8484    
8485  size = sizeof(REAL_PCRE) + (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);  if (cd->dupnames && cd->namedrefcount > 0)
8486  re = (REAL_PCRE *)(PUBL(malloc))(size);    length += cd->namedrefcount * IMM2_SIZE * sizeof(pcre_uchar);
8487    
8488    /* Compute the size of the data block for storing the compiled pattern. Integer
8489    overflow should no longer be possible because nowadays we limit the maximum
8490    value of cd->names_found and cd->name_entry_size. */
8491    
8492    size = sizeof(REAL_PCRE) +
8493      (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
8494    
8495    /* Get the memory. */
8496    
8497    re = (REAL_PCRE *)(PUBL(malloc))(size);
8498  if (re == NULL)  if (re == NULL)
8499    {    {
8500    errorcode = ERR21;    errorcode = ERR21;
# Line 8223  cd->final_bracount = cd->bracount;  /* S Line 8538  cd->final_bracount = cd->bracount;  /* S
8538  cd->assert_depth = 0;  cd->assert_depth = 0;
8539  cd->bracount = 0;  cd->bracount = 0;
8540  cd->max_lookbehind = 0;  cd->max_lookbehind = 0;
 cd->names_found = 0;  
8541  cd->name_table = (pcre_uchar *)re + re->name_table_offset;  cd->name_table = (pcre_uchar *)re + re->name_table_offset;
8542  codestart = cd->name_table + re->name_entry_size * re->name_count;  codestart = cd->name_table + re->name_entry_size * re->name_count;
8543  cd->start_code = codestart;  cd->start_code = codestart;
# Line 8234  cd->had_pruneorskip = FALSE; Line 8548  cd->had_pruneorskip = FALSE;
8548  cd->check_lookbehind = FALSE;  cd->check_lookbehind = FALSE;
8549  cd->open_caps = NULL;  cd->open_caps = NULL;
8550    
8551    /* If any named groups were found, create the name/number table from the list
8552    created in the first pass. */
8553    
8554    if (cd->names_found > 0)
8555      {
8556      int i = cd->names_found;
8557      named_group *ng = cd->named_groups;
8558      cd->names_found = 0;
8559      for (; i > 0; i--, ng++)
8560        add_name(cd, ng->name, ng->length, ng->number);
8561      if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
8562        (PUBL(free))((void *)cd->named_groups);
8563      }
8564    
8565  /* Set up a starting, non-extracting bracket, then compile the expression. On  /* Set up a starting, non-extracting bracket, then compile the expression. On
8566  error, errorcode will be set non-zero, so we don't need to look at the result  error, errorcode will be set non-zero, so we don't need to look at the result
8567  of the function here. */  of the function here. */
# Line 8297  if (cd->hwm > cd->start_workspace) Line 8625  if (cd->hwm > cd->start_workspace)
8625      }      }
8626    }    }
8627    
8628  /* If the workspace had to be expanded, free the new memory. Set the pointer to  /* If the workspace had to be expanded, free the new memory. Set the pointer to
8629  NULL to indicate that forward references have been filled in. */  NULL to indicate that forward references have been filled in. */
8630    
8631  if (cd->workspace_size > COMPILE_WORK_SIZE)  if (cd->workspace_size > COMPILE_WORK_SIZE)
8632    (PUBL(free))((void *)cd->start_workspace);    (PUBL(free))((void *)cd->start_workspace);
8633  cd->start_workspace = NULL;  cd->start_workspace = NULL;
8634    
8635  /* Give an error if there's a back reference to a non-existent capturing  /* Give an error if there's a back reference to a non-existent capturing
8636  subpattern. */  subpattern. */
8637    
8638  if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;  if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
8639    
8640    /* Unless disabled, check whether single character iterators can be
8641    auto-possessified. The function overwrites the appropriate opcode values. */
8642    
8643    if ((options & PCRE_NO_AUTO_POSSESSIFY) == 0)
8644      auto_possessify((pcre_uchar *)codestart, utf, cd);
8645    
8646  /* If there were any lookbehind assertions that contained OP_RECURSE  /* If there were any lookbehind assertions that contained OP_RECURSE
8647  (recursions or subroutine calls), a flag is set for them to be checked here,  (recursions or subroutine calls), a flag is set for them to be checked here,
8648  because they may contain forward references. Actual recursions cannot be fixed  because they may contain forward references. Actual recursions cannot be fixed
# Line 8506  if (code - codestart > length) Line 8840  if (code - codestart > length)
8840    }    }
8841  #endif   /* PCRE_DEBUG */  #endif   /* PCRE_DEBUG */
8842    
8843  /* Check for a pattern that can match an empty string, so that this information  /* Check for a pattern that can match an empty string, so that this information
8844  can be provided to applications. */  can be provided to applications. */
8845    
8846  do  do
# Line 8515  do Line 8849  do
8849      {      {
8850      re->flags |= PCRE_MATCH_EMPTY;      re->flags |= PCRE_MATCH_EMPTY;
8851      break;      break;
8852      }      }
8853    codestart += GET(codestart, 1);    codestart += GET(codestart, 1);
8854    }    }
8855  while (*codestart == OP_ALT);  while (*codestart == OP_ALT);

Legend:
Removed from v.1348  
changed lines
  Added in v.1369

  ViewVC Help
Powered by ViewVC 1.1.5