/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 1072 by chpe, Tue Oct 16 15:54:40 2012 UTC revision 1369 by ph10, Tue Oct 8 15:06:46 2013 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2012 University of Cambridge             Copyright (c) 1997-2013 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 80  to check them every time. */ Line 80  to check them every time. */
80  /* Definitions to allow mutual recursion */  /* Definitions to allow mutual recursion */
81    
82  static int  static int
83    add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,    add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
84      const pcre_uint32 *, unsigned int);      const pcre_uint32 *, unsigned int);
85    
86  static BOOL  static BOOL
87    compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL,    compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
88      int, int, int *, int *, branch_chain *, compile_data *, int *);      pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
89        compile_data *, int *);
90    
91    
92    
# Line 114  kicks in at the same number of forward r Line 115  kicks in at the same number of forward r
115  #define COMPILE_WORK_SIZE (2048*LINK_SIZE)  #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
116  #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)  #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
117    
118    /* This value determines the size of the initial vector that is used for
119    remembering named groups during the pre-compile. It is allocated on the stack,
120    but if it is too small, it is expanded using malloc(), in a similar way to the
121    workspace. The value is the number of slots in the list. */
122    
123    #define NAMED_GROUP_LIST_SIZE  20
124    
125  /* The overrun tests check for a slightly smaller size so that they detect the  /* The overrun tests check for a slightly smaller size so that they detect the
126  overrun before it actually does run off the end of the data block. */  overrun before it actually does run off the end of the data block. */
127    
# Line 121  overrun before it actually does run off Line 129  overrun before it actually does run off
129    
130  /* Private flags added to firstchar and reqchar. */  /* Private flags added to firstchar and reqchar. */
131    
132  #define REQ_CASELESS   0x10000000l      /* Indicates caselessness */  #define REQ_CASELESS    (1 << 0)        /* Indicates caselessness */
133  #define REQ_VARY       0x20000000l      /* Reqchar followed non-literal item */  #define REQ_VARY        (1 << 1)        /* Reqchar followed non-literal item */
134  #define REQ_MASK       (REQ_CASELESS | REQ_VARY)  /* Negative values for the firstchar and reqchar flags */
135    #define REQ_UNSET       (-2)
136    #define REQ_NONE        (-1)
137    
138  /* Repeated character flags. */  /* Repeated character flags. */
139    
# Line 484  static const char error_texts[] = Line 494  static const char error_texts[] =
494    "a numbered reference must not be zero\0"    "a numbered reference must not be zero\0"
495    "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"    "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
496    /* 60 */    /* 60 */
497    "(*VERB) not recognized\0"    "(*VERB) not recognized or malformed\0"
498    "number is too big\0"    "number is too big\0"
499    "subpattern name expected\0"    "subpattern name expected\0"
500    "digit expected after (?+\0"    "digit expected after (?+\0"
# Line 505  static const char error_texts[] = Line 515  static const char error_texts[] =
515    "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"    "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
516    "character value in \\u.... sequence is too large\0"    "character value in \\u.... sequence is too large\0"
517    "invalid UTF-32 string\0"    "invalid UTF-32 string\0"
518      "setting UTF is disabled by the application\0"
519    ;    ;
520    
521  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 644  static const pcre_uint8 ebcdic_chartab[] Line 655  static const pcre_uint8 ebcdic_chartab[]
655  #endif  #endif
656    
657    
658    /* This table is used to check whether auto-possessification is possible
659    between adjacent character-type opcodes. The left-hand (repeated) opcode is
660    used to select the row, and the right-hand opcode is used to select the column.
661    A value of 1 means that auto-possessification is OK. For example, the second
662    value in the first row means that \D+\d can be turned into \D++\d.
663    
664    The Unicode property types (\P and \p) have to be present to fill out the table
665    because of what their opcode values are, but the table values should always be
666    zero because property types are handled separately in the code. The last four
667    columns apply to items that cannot be repeated, so there is no need to have
668    rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
669    *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
670    
671    #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
672    #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
673    
674    static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
675    /* \D \d \S \s \W \w  . .+ \C \P \p \R \H \h \V \v \X \Z \z  $ $M */
676      { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \D */
677      { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \d */
678      { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \S */
679      { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \s */
680      { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \W */
681      { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \w */
682      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .  */
683      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .+ */
684      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \C */
685      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \P */
686      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \p */
687      { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \R */
688      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \H */
689      { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \h */
690      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \V */
691      { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 },  /* \v */
692      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }   /* \X */
693    };
694    
695    
696    /* This table is used to check whether auto-possessification is possible
697    between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
698    left-hand (repeated) opcode is used to select the row, and the right-hand
699    opcode is used to select the column. The values are as follows:
700    
701      0   Always return FALSE (never auto-possessify)
702      1   Character groups are distinct (possessify if both are OP_PROP)
703      2   Check character categories in the same group (general or particular)
704      3   TRUE if the two opcodes are not the same (PROP vs NOTPROP)
705    
706      4   Check left general category vs right particular category
707      5   Check right general category vs left particular category
708    
709      6   Left alphanum vs right general category
710      7   Left space vs right general category
711      8   Left word vs right general category
712    
713      9   Right alphanum vs left general category
714     10   Right space vs left general category
715     11   Right word vs left general category
716    
717     12   Left alphanum vs right particular category
718     13   Left space vs right particular category
719     14   Left word vs right particular category
720    
721     15   Right alphanum vs left particular category
722     16   Right space vs left particular category
723     17   Right word vs left particular category
724    */
725    
726    static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
727    /* ANY LAMP GC  PC  SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
728      { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_ANY */
729      { 0,  3,  0,  0,  0,    3,    1,      1,   0,    0,   0 },  /* PT_LAMP */
730      { 0,  0,  2,  4,  0,    9,   10,     10,  11,    0,   0 },  /* PT_GC */
731      { 0,  0,  5,  2,  0,   15,   16,     16,  17,    0,   0 },  /* PT_PC */
732      { 0,  0,  0,  0,  2,    0,    0,      0,   0,    0,   0 },  /* PT_SC */
733      { 0,  3,  6, 12,  0,    3,    1,      1,   0,    0,   0 },  /* PT_ALNUM */
734      { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_SPACE */
735      { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_PXSPACE */
736      { 0,  0,  8, 14,  0,    0,    1,      1,   3,    0,   0 },  /* PT_WORD */
737      { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_CLIST */
738      { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   3 }   /* PT_UCNC */
739    };
740    
741    /* This table is used to check whether auto-possessification is possible
742    between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
743    specifies a general category and the other specifies a particular category. The
744    row is selected by the general category and the column by the particular
745    category. The value is 1 if the particular category is not part of the general
746    category. */
747    
748    static const pcre_uint8 catposstab[7][30] = {
749    /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
750      { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* C */
751      { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* L */
752      { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* M */
753      { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* N */
754      { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },  /* P */
755      { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 },  /* S */
756      { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }   /* Z */
757    };
758    
759    /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
760    a general or particular category. The properties in each row are those
761    that apply to the character set in question. Duplication means that a little
762    unnecessary work is done when checking, but this keeps things much simpler
763    because they can all use the same code. For more details see the comment where
764    this table is used.
765    
766    Note: SPACE and PXSPACE used to be different because Perl excluded VT from
767    "space", but from Perl 5.18 it's included, so both categories are treated the
768    same here. */
769    
770    static const pcre_uint8 posspropstab[3][4] = {
771      { ucp_L, ucp_N, ucp_N, ucp_Nl },  /* ALNUM, 3rd and 4th values redundant */
772      { ucp_Z, ucp_Z, ucp_C, ucp_Cc },  /* SPACE and PXSPACE, 2nd value redundant */
773      { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
774    };
775    
776    
777    
778  /*************************************************  /*************************************************
# Line 665  find_error_text(int n) Line 794  find_error_text(int n)
794  const char *s = error_texts;  const char *s = error_texts;
795  for (; n > 0; n--)  for (; n > 0; n--)
796    {    {
797    while (*s++ != 0) {};    while (*s++ != CHAR_NULL) {};
798    if (*s == 0) return "Error text not found (please report)";    if (*s == CHAR_NULL) return "Error text not found (please report)";
799    }    }
800  return s;  return s;
801  }  }
802    
803    
804    
805  /*************************************************  /*************************************************
806  *           Expand the workspace                 *  *           Expand the workspace                 *
807  *************************************************/  *************************************************/
# Line 749  return (*p == CHAR_RIGHT_CURLY_BRACKET); Line 879  return (*p == CHAR_RIGHT_CURLY_BRACKET);
879  *************************************************/  *************************************************/
880    
881  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
882  positive value for a simple escape such as \n, or 0 for a data character  positive value for a simple escape such as \n, or 0 for a data character which
883  which will be placed in chptr. A backreference to group n is returned as  will be placed in chptr. A backreference to group n is returned as negative n.
884  negative n. When UTF-8 is enabled, a positive value greater than 255 may  When UTF-8 is enabled, a positive value greater than 255 may be returned in
885  be returned in chptr.  chptr. On entry, ptr is pointing at the \. On exit, it is on the final
886  On entry,ptr is pointing at the \. On exit, it is on the final character of the  character of the escape sequence.
 escape sequence.  
887    
888  Arguments:  Arguments:
889    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
890    chptr          points to the data character    chptr          points to a returned data character
891    errorcodeptr   points to the errorcode variable    errorcodeptr   points to the errorcode variable
892    bracount       number of previous extracting brackets    bracount       number of previous extracting brackets
893    options        the options bits    options        the options bits
# Line 771  Returns:         zero => a data characte Line 900  Returns:         zero => a data characte
900  */  */
901    
902  static int  static int
903  check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,  check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
904    int bracount, int options, BOOL isclass)    int bracount, int options, BOOL isclass)
905  {  {
906  /* PCRE_UTF16 has the same value as PCRE_UTF8. */  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
# Line 786  ptr--;                            /* Set Line 915  ptr--;                            /* Set
915    
916  /* If backslash is at the end of the pattern, it's an error. */  /* If backslash is at the end of the pattern, it's an error. */
917    
918  if (c == 0) *errorcodeptr = ERR1;  if (c == CHAR_NULL) *errorcodeptr = ERR1;
919    
920  /* Non-alphanumerics are literals. For digits or letters, do an initial lookup  /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
921  in a table. A non-zero result is something that can be returned immediately.  in a table. A non-zero result is something that can be returned immediately.
# Line 795  Otherwise further processing may be requ Line 924  Otherwise further processing may be requ
924  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
925  /* Not alphanumeric */  /* Not alphanumeric */
926  else if (c < CHAR_0 || c > CHAR_z) {}  else if (c < CHAR_0 || c > CHAR_z) {}
927  else if ((i = escapes[c - CHAR_0]) != 0) { if (i > 0) c = (pcre_uint32)i; else escape = -i; }  else if ((i = escapes[c - CHAR_0]) != 0)
928      { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
929    
930  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
931  /* Not alphanumeric */  /* Not alphanumeric */
# Line 845  else Line 975  else
975            }            }
976    
977  #if defined COMPILE_PCRE8  #if defined COMPILE_PCRE8
978          if (c > (utf ? 0x10ffff : 0xff))          if (c > (utf ? 0x10ffffU : 0xffU))
979  #elif defined COMPILE_PCRE16  #elif defined COMPILE_PCRE16
980          if (c > (utf ? 0x10ffff : 0xffff))          if (c > (utf ? 0x10ffffU : 0xffffU))
981  #elif defined COMPILE_PCRE32  #elif defined COMPILE_PCRE32
982          if (utf && c > 0x10ffff)          if (utf && c > 0x10ffffU)
983  #endif  #endif
984            {            {
985            *errorcodeptr = ERR76;            *errorcodeptr = ERR76;
# Line 896  else Line 1026  else
1026      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1027        {        {
1028        const pcre_uchar *p;        const pcre_uchar *p;
1029        for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)        for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
1030          if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;          if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
1031        if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)        if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
1032          {          {
1033          escape = ESC_k;          escape = ESC_k;
1034          break;          break;
# Line 961  else Line 1091  else
1091      break;      break;
1092    
1093      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
1094      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. Perl has changed
1095      the way Perl works seems to be as follows:      over the years. Nowadays \g{} for backreferences and \o{} for octal are
1096        recommended to avoid the ambiguities in the old syntax.
1097    
1098      Outside a character class, the digits are read as a decimal number. If the      Outside a character class, the digits are read as a decimal number. If the
1099      number is less than 10, or if there are that many previous extracting      number is less than 8 (used to be 10), or if there are that many previous
1100      left brackets, then it is a back reference. Otherwise, up to three octal      extracting left brackets, then it is a back reference. Otherwise, up to
1101      digits are read to form an escaped byte. Thus \123 is likely to be octal      three octal digits are read to form an escaped byte. Thus \123 is likely to
1102      123 (cf \0123, which is octal 012 followed by the literal 3). If the octal      be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1103      value is greater than 377, the least significant 8 bits are taken. Inside a      the octal value is greater than 377, the least significant 8 bits are
1104      character class, \ followed by a digit is always an octal number. */      taken. \8 and \9 are treated as the literal characters 8 and 9.
1105    
1106        Inside a character class, \ followed by a digit is always either a literal
1107        8 or 9 or an octal number. */
1108    
1109      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1110      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
# Line 997  else Line 1131  else
1131          *errorcodeptr = ERR61;          *errorcodeptr = ERR61;
1132          break;          break;
1133          }          }
1134        if (s < 10 || s <= bracount)        if (s < 8 || s <= bracount)  /* Check for back reference */
1135          {          {
1136          escape = -s;          escape = -s;
1137          break;          break;
# Line 1005  else Line 1139  else
1139        ptr = oldptr;      /* Put the pointer back and fall through */        ptr = oldptr;      /* Put the pointer back and fall through */
1140        }        }
1141    
1142      /* Handle an octal number following \. If the first digit is 8 or 9, Perl      /* Handle a digit following \ when the number is not a back reference. If
1143      generates a binary zero byte and treats the digit as a following literal.      the first digit is 8 or 9, Perl used to generate a binary zero byte and
1144      Thus we have to pull back the pointer by one. */      then treat the digit as a following literal. At least by Perl 5.18 this
1145        changed so as not to insert the binary zero. */
1146    
1147      if ((c = *ptr) >= CHAR_8)      if ((c = *ptr) >= CHAR_8) break;
1148        {  
1149        ptr--;      /* Fall through with a digit less than 8 */
       c = 0;  
       break;  
       }  
1150    
1151      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
1152      larger first octal digit. The original code used just to take the least      larger first octal digit. The original code used just to take the least
# Line 1083  else Line 1215  else
1215  #endif  #endif
1216    
1217  #if defined COMPILE_PCRE8  #if defined COMPILE_PCRE8
1218          if (c > (utf ? 0x10ffff : 0xff)) { overflow = TRUE; break; }          if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1219  #elif defined COMPILE_PCRE16  #elif defined COMPILE_PCRE16
1220          if (c > (utf ? 0x10ffff : 0xffff)) { overflow = TRUE; break; }          if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1221  #elif defined COMPILE_PCRE32  #elif defined COMPILE_PCRE32
1222          if (utf && c > 0x10ffff) { overflow = TRUE; break; }          if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1223  #endif  #endif
1224          }          }
1225    
# Line 1132  else Line 1264  else
1264    
1265      case CHAR_c:      case CHAR_c:
1266      c = *(++ptr);      c = *(++ptr);
1267      if (c == 0)      if (c == CHAR_NULL)
1268        {        {
1269        *errorcodeptr = ERR2;        *errorcodeptr = ERR2;
1270        break;        break;
# Line 1188  if ((options & PCRE_UCP) != 0 && escape Line 1320  if ((options & PCRE_UCP) != 0 && escape
1320  return escape;  return escape;
1321  }  }
1322    
1323    
1324    
1325  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1326  /*************************************************  /*************************************************
1327  *               Handle \P and \p                 *  *               Handle \P and \p                 *
# Line 1201  escape sequence. Line 1335  escape sequence.
1335  Argument:  Argument:
1336    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
1337    negptr         points to a boolean that is set TRUE for negation else FALSE    negptr         points to a boolean that is set TRUE for negation else FALSE
1338    dptr           points to an int that is set to the detailed property value    ptypeptr       points to an unsigned int that is set to the type value
1339      pdataptr       points to an unsigned int that is set to the detailed property value
1340    errorcodeptr   points to the error code variable    errorcodeptr   points to the error code variable
1341    
1342  Returns:         type value from ucp_type_table, or -1 for an invalid type  Returns:         TRUE if the type value was found, or FALSE for an invalid type
1343  */  */
1344    
1345  static int  static BOOL
1346  get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)  get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
1347      unsigned int *pdataptr, int *errorcodeptr)
1348  {  {
1349  pcre_uchar c;  pcre_uchar c;
1350  int i, bot, top;  int i, bot, top;
# Line 1216  const pcre_uchar *ptr = *ptrptr; Line 1352  const pcre_uchar *ptr = *ptrptr;
1352  pcre_uchar name[32];  pcre_uchar name[32];
1353    
1354  c = *(++ptr);  c = *(++ptr);
1355  if (c == 0) goto ERROR_RETURN;  if (c == CHAR_NULL) goto ERROR_RETURN;
1356    
1357  *negptr = FALSE;  *negptr = FALSE;
1358    
# Line 1233  if (c == CHAR_LEFT_CURLY_BRACKET) Line 1369  if (c == CHAR_LEFT_CURLY_BRACKET)
1369    for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)    for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1370      {      {
1371      c = *(++ptr);      c = *(++ptr);
1372      if (c == 0) goto ERROR_RETURN;      if (c == CHAR_NULL) goto ERROR_RETURN;
1373      if (c == CHAR_RIGHT_CURLY_BRACKET) break;      if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1374      name[i] = c;      name[i] = c;
1375      }      }
# Line 1263  while (bot < top) Line 1399  while (bot < top)
1399    r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);    r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1400    if (r == 0)    if (r == 0)
1401      {      {
1402      *dptr = PRIV(utt)[i].value;      *ptypeptr = PRIV(utt)[i].type;
1403      return PRIV(utt)[i].type;      *pdataptr = PRIV(utt)[i].value;
1404        return TRUE;
1405      }      }
1406    if (r > 0) bot = i + 1; else top = i;    if (r > 0) bot = i + 1; else top = i;
1407    }    }
1408    
1409  *errorcodeptr = ERR47;  *errorcodeptr = ERR47;
1410  *ptrptr = ptr;  *ptrptr = ptr;
1411  return -1;  return FALSE;
1412    
1413  ERROR_RETURN:  ERROR_RETURN:
1414  *errorcodeptr = ERR46;  *errorcodeptr = ERR46;
1415  *ptrptr = ptr;  *ptrptr = ptr;
1416  return -1;  return FALSE;
1417  }  }
1418  #endif  #endif
1419    
1420    
1421    
   
1422  /*************************************************  /*************************************************
1423  *         Read repeat counts                     *  *         Read repeat counts                     *
1424  *************************************************/  *************************************************/
# Line 1351  return p; Line 1487  return p;
1487    
1488    
1489  /*************************************************  /*************************************************
 *  Subroutine for finding forward reference      *  
 *************************************************/  
   
 /* This recursive function is called only from find_parens() below. The  
 top-level call starts at the beginning of the pattern. All other calls must  
 start at a parenthesis. It scans along a pattern's text looking for capturing  
 subpatterns, and counting them. If it finds a named pattern that matches the  
 name it is given, it returns its number. Alternatively, if the name is NULL, it  
 returns when it reaches a given numbered subpattern. Recursion is used to keep  
 track of subpatterns that reset the capturing group numbers - the (?| feature.  
   
 This function was originally called only from the second pass, in which we know  
 that if (?< or (?' or (?P< is encountered, the name will be correctly  
 terminated because that is checked in the first pass. There is now one call to  
 this function in the first pass, to check for a recursive back reference by  
 name (so that we can make the whole group atomic). In this case, we need check  
 only up to the current position in the pattern, and that is still OK because  
 any previous occurrences will have been checked. To make this work, the test  
 for "end of pattern" is a check against cd->end_pattern in the main loop,  
 instead of looking for a binary zero. This means that the special first-pass  
 call can adjust cd->end_pattern temporarily. (Checks for binary zero while  
 processing items within the loop are OK, because afterwards the main loop will  
 terminate.)  
   
 Arguments:  
   ptrptr       address of the current character pointer (updated)  
   cd           compile background data  
   name         name to seek, or NULL if seeking a numbered subpattern  
   lorn         name length, or subpattern number if name is NULL  
   xmode        TRUE if we are in /x mode  
   utf          TRUE if we are in UTF-8 / UTF-16 / UTF-32 mode  
   count        pointer to the current capturing subpattern number (updated)  
   
 Returns:       the number of the named subpattern, or -1 if not found  
 */  
   
 static int  
 find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn,  
   BOOL xmode, BOOL utf, int *count)  
 {  
 pcre_uchar *ptr = *ptrptr;  
 int start_count = *count;  
 int hwm_count = start_count;  
 BOOL dup_parens = FALSE;  
   
 /* If the first character is a parenthesis, check on the type of group we are  
 dealing with. The very first call may not start with a parenthesis. */  
   
 if (ptr[0] == CHAR_LEFT_PARENTHESIS)  
   {  
   /* Handle specials such as (*SKIP) or (*UTF8) etc. */  
   
   if (ptr[1] == CHAR_ASTERISK) ptr += 2;  
   
   /* Handle a normal, unnamed capturing parenthesis. */  
   
   else if (ptr[1] != CHAR_QUESTION_MARK)  
     {  
     *count += 1;  
     if (name == NULL && *count == lorn) return *count;  
     ptr++;  
     }  
   
   /* All cases now have (? at the start. Remember when we are in a group  
   where the parenthesis numbers are duplicated. */  
   
   else if (ptr[2] == CHAR_VERTICAL_LINE)  
     {  
     ptr += 3;  
     dup_parens = TRUE;  
     }  
   
   /* Handle comments; all characters are allowed until a ket is reached. */  
   
   else if (ptr[2] == CHAR_NUMBER_SIGN)  
     {  
     for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;  
     goto FAIL_EXIT;  
     }  
   
   /* Handle a condition. If it is an assertion, just carry on so that it  
   is processed as normal. If not, skip to the closing parenthesis of the  
   condition (there can't be any nested parens). */  
   
   else if (ptr[2] == CHAR_LEFT_PARENTHESIS)  
     {  
     ptr += 2;  
     if (ptr[1] != CHAR_QUESTION_MARK)  
       {  
       while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;  
       if (*ptr != 0) ptr++;  
       }  
     }  
   
   /* Start with (? but not a condition. */  
   
   else  
     {  
     ptr += 2;  
     if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */  
   
     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */  
   
     if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&  
         ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)  
       {  
       pcre_uchar term;  
       const pcre_uchar *thisname;  
       *count += 1;  
       if (name == NULL && *count == lorn) return *count;  
       term = *ptr++;  
       if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;  
       thisname = ptr;  
       while (*ptr != term) ptr++;  
       if (name != NULL && lorn == (int)(ptr - thisname) &&  
           STRNCMP_UC_UC(name, thisname, (unsigned int)lorn) == 0)  
         return *count;  
       term++;  
       }  
     }  
   }  
   
 /* Past any initial parenthesis handling, scan for parentheses or vertical  
 bars. Stop if we get to cd->end_pattern. Note that this is important for the  
 first-pass call when this value is temporarily adjusted to stop at the current  
 position. So DO NOT change this to a test for binary zero. */  
   
 for (; ptr < cd->end_pattern; ptr++)  
   {  
   /* Skip over backslashed characters and also entire \Q...\E */  
   
   if (*ptr == CHAR_BACKSLASH)  
     {  
     if (*(++ptr) == 0) goto FAIL_EXIT;  
     if (*ptr == CHAR_Q) for (;;)  
       {  
       while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};  
       if (*ptr == 0) goto FAIL_EXIT;  
       if (*(++ptr) == CHAR_E) break;  
       }  
     continue;  
     }  
   
   /* Skip over character classes; this logic must be similar to the way they  
   are handled for real. If the first character is '^', skip it. Also, if the  
   first few characters (either before or after ^) are \Q\E or \E we skip them  
   too. This makes for compatibility with Perl. Note the use of STR macros to  
   encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */  
   
   if (*ptr == CHAR_LEFT_SQUARE_BRACKET)  
     {  
     BOOL negate_class = FALSE;  
     for (;;)  
       {  
       if (ptr[1] == CHAR_BACKSLASH)  
         {  
         if (ptr[2] == CHAR_E)  
           ptr+= 2;  
         else if (STRNCMP_UC_C8(ptr + 2,  
                  STR_Q STR_BACKSLASH STR_E, 3) == 0)  
           ptr += 4;  
         else  
           break;  
         }  
       else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)  
         {  
         negate_class = TRUE;  
         ptr++;  
         }  
       else break;  
       }  
   
     /* If the next character is ']', it is a data character that must be  
     skipped, except in JavaScript compatibility mode. */  
   
     if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&  
         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)  
       ptr++;  
   
     while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)  
       {  
       if (*ptr == 0) return -1;  
       if (*ptr == CHAR_BACKSLASH)  
         {  
         if (*(++ptr) == 0) goto FAIL_EXIT;  
         if (*ptr == CHAR_Q) for (;;)  
           {  
           while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};  
           if (*ptr == 0) goto FAIL_EXIT;  
           if (*(++ptr) == CHAR_E) break;  
           }  
         continue;  
         }  
       }  
     continue;  
     }  
   
   /* Skip comments in /x mode */  
   
   if (xmode && *ptr == CHAR_NUMBER_SIGN)  
     {  
     ptr++;  
     while (*ptr != 0)  
       {  
       if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }  
       ptr++;  
 #ifdef SUPPORT_UTF  
       if (utf) FORWARDCHAR(ptr);  
 #endif  
       }  
     if (*ptr == 0) goto FAIL_EXIT;  
     continue;  
     }  
   
   /* Check for the special metacharacters */  
   
   if (*ptr == CHAR_LEFT_PARENTHESIS)  
     {  
     int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, count);  
     if (rc > 0) return rc;  
     if (*ptr == 0) goto FAIL_EXIT;  
     }  
   
   else if (*ptr == CHAR_RIGHT_PARENTHESIS)  
     {  
     if (dup_parens && *count < hwm_count) *count = hwm_count;  
     goto FAIL_EXIT;  
     }  
   
   else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)  
     {  
     if (*count > hwm_count) hwm_count = *count;  
     *count = start_count;  
     }  
   }  
   
 FAIL_EXIT:  
 *ptrptr = ptr;  
 return -1;  
 }  
   
   
   
   
 /*************************************************  
 *       Find forward referenced subpattern       *  
 *************************************************/  
   
 /* This function scans along a pattern's text looking for capturing  
 subpatterns, and counting them. If it finds a named pattern that matches the  
 name it is given, it returns its number. Alternatively, if the name is NULL, it  
 returns when it reaches a given numbered subpattern. This is used for forward  
 references to subpatterns. We used to be able to start this scan from the  
 current compiling point, using the current count value from cd->bracount, and  
 do it all in a single loop, but the addition of the possibility of duplicate  
 subpattern numbers means that we have to scan from the very start, in order to  
 take account of such duplicates, and to use a recursive function to keep track  
 of the different types of group.  
   
 Arguments:  
   cd           compile background data  
   name         name to seek, or NULL if seeking a numbered subpattern  
   lorn         name length, or subpattern number if name is NULL  
   xmode        TRUE if we are in /x mode  
   utf          TRUE if we are in UTF-8 / UTF-16 / UTF-32 mode  
   
 Returns:       the number of the found subpattern, or -1 if not found  
 */  
   
 static int  
 find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode,  
   BOOL utf)  
 {  
 pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern;  
 int count = 0;  
 int rc;  
   
 /* If the pattern does not start with an opening parenthesis, the first call  
 to find_parens_sub() will scan right to the end (if necessary). However, if it  
 does start with a parenthesis, find_parens_sub() will return when it hits the  
 matching closing parens. That is why we have to have a loop. */  
   
 for (;;)  
   {  
   rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, &count);  
   if (rc > 0 || *ptr++ == 0) break;  
   }  
   
 return rc;  
 }  
   
   
   
   
 /*************************************************  
1490  *      Find first significant op code            *  *      Find first significant op code            *
1491  *************************************************/  *************************************************/
1492    
# Line 1684  for (;;) Line 1525  for (;;)
1525    
1526      case OP_CALLOUT:      case OP_CALLOUT:
1527      case OP_CREF:      case OP_CREF:
1528      case OP_NCREF:      case OP_DNCREF:
1529      case OP_RREF:      case OP_RREF:
1530      case OP_NRREF:      case OP_DNRREF:
1531      case OP_DEF:      case OP_DEF:
1532      code += PRIV(OP_lengths)[*code];      code += PRIV(OP_lengths)[*code];
1533      break;      break;
# Line 1700  for (;;) Line 1541  for (;;)
1541    
1542    
1543    
   
1544  /*************************************************  /*************************************************
1545  *        Find the fixed length of a branch       *  *        Find the fixed length of a branch       *
1546  *************************************************/  *************************************************/
# Line 1824  for (;;) Line 1664  for (;;)
1664      case OP_COMMIT:      case OP_COMMIT:
1665      case OP_CREF:      case OP_CREF:
1666      case OP_DEF:      case OP_DEF:
1667        case OP_DNCREF:
1668        case OP_DNRREF:
1669      case OP_DOLL:      case OP_DOLL:
1670      case OP_DOLLM:      case OP_DOLLM:
1671      case OP_EOD:      case OP_EOD:
1672      case OP_EODN:      case OP_EODN:
1673      case OP_FAIL:      case OP_FAIL:
     case OP_NCREF:  
     case OP_NRREF:  
1674      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
1675      case OP_PRUNE:      case OP_PRUNE:
1676      case OP_REVERSE:      case OP_REVERSE:
# Line 1852  for (;;) Line 1692  for (;;)
1692      case OP_NOTI:      case OP_NOTI:
1693      branchlength++;      branchlength++;
1694      cc += 2;      cc += 2;
1695  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32  #ifdef SUPPORT_UTF
1696      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1697  #endif  #endif
1698      break;      break;
# Line 1866  for (;;) Line 1706  for (;;)
1706      case OP_NOTEXACTI:      case OP_NOTEXACTI:
1707      branchlength += (int)GET2(cc,1);      branchlength += (int)GET2(cc,1);
1708      cc += 2 + IMM2_SIZE;      cc += 2 + IMM2_SIZE;
1709  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32  #ifdef SUPPORT_UTF
1710      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1711  #endif  #endif
1712      break;      break;
1713    
1714      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1715      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1716      if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)      if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1717        cc += 2;        cc += 2;
1718      cc += 1 + IMM2_SIZE + 1;      cc += 1 + IMM2_SIZE + 1;
1719      break;      break;
# Line 1909  for (;;) Line 1749  for (;;)
1749    
1750      /* Check a class for variable quantification */      /* Check a class for variable quantification */
1751    
 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32  
     case OP_XCLASS:  
     cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];  
     /* Fall through */  
 #endif  
   
1752      case OP_CLASS:      case OP_CLASS:
1753      case OP_NCLASS:      case OP_NCLASS:
1754    #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1755        case OP_XCLASS:
1756        /* The original code caused an unsigned overflow in 64 bit systems,
1757        so now we use a conditional statement. */
1758        if (op == OP_XCLASS)
1759          cc += GET(cc, 1);
1760        else
1761          cc += PRIV(OP_lengths)[OP_CLASS];
1762    #else
1763      cc += PRIV(OP_lengths)[OP_CLASS];      cc += PRIV(OP_lengths)[OP_CLASS];
1764    #endif
1765    
1766      switch (*cc)      switch (*cc)
1767        {        {
# Line 1999  for (;;) Line 1843  for (;;)
1843      case OP_QUERYI:      case OP_QUERYI:
1844      case OP_REF:      case OP_REF:
1845      case OP_REFI:      case OP_REFI:
1846        case OP_DNREF:
1847        case OP_DNREFI:
1848      case OP_SBRA:      case OP_SBRA:
1849      case OP_SBRAPOS:      case OP_SBRAPOS:
1850      case OP_SCBRA:      case OP_SCBRA:
# Line 2035  for (;;) Line 1881  for (;;)
1881    
1882    
1883    
   
1884  /*************************************************  /*************************************************
1885  *    Scan compiled regex for specific bracket    *  *    Scan compiled regex for specific bracket    *
1886  *************************************************/  *************************************************/
# Line 2112  for (;;) Line 1957  for (;;)
1957        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
1958        case OP_TYPEEXACT:        case OP_TYPEEXACT:
1959        case OP_TYPEPOSUPTO:        case OP_TYPEPOSUPTO:
1960        if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)        if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
1961          code += 2;          code += 2;
1962        break;        break;
1963    
1964        case OP_MARK:        case OP_MARK:
1965        case OP_PRUNE_ARG:        case OP_PRUNE_ARG:
1966        case OP_SKIP_ARG:        case OP_SKIP_ARG:
       code += code[1];  
       break;  
   
1967        case OP_THEN_ARG:        case OP_THEN_ARG:
1968        code += code[1];        code += code[1];
1969        break;        break;
# Line 2232  for (;;) Line 2074  for (;;)
2074        case OP_TYPEUPTO:        case OP_TYPEUPTO:
2075        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
2076        case OP_TYPEEXACT:        case OP_TYPEEXACT:
2077        if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)        if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2078          code += 2;          code += 2;
2079        break;        break;
2080    
2081        case OP_MARK:        case OP_MARK:
2082        case OP_PRUNE_ARG:        case OP_PRUNE_ARG:
2083        case OP_SKIP_ARG:        case OP_SKIP_ARG:
       code += code[1];  
       break;  
   
2084        case OP_THEN_ARG:        case OP_THEN_ARG:
2085        code += code[1];        code += code[1];
2086        break;        break;
# Line 2343  Arguments: Line 2182  Arguments:
2182    endcode     points to where to stop    endcode     points to where to stop
2183    utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode    utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2184    cd          contains pointers to tables etc.    cd          contains pointers to tables etc.
2185      recurses    chain of recurse_check to catch mutual recursion
2186    
2187  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2188  */  */
2189    
2190    typedef struct recurse_check {
2191      struct recurse_check *prev;
2192      const pcre_uchar *group;
2193    } recurse_check;
2194    
2195  static BOOL  static BOOL
2196  could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,  could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2197    BOOL utf, compile_data *cd)    BOOL utf, compile_data *cd, recurse_check *recurses)
2198  {  {
2199  register pcre_uchar c;  register pcre_uchar c;
2200    recurse_check this_recurse;
2201    
2202  for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);  for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2203       code < endcode;       code < endcode;
2204       code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))       code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
# Line 2379  for (code = first_significant_code(code Line 2226  for (code = first_significant_code(code
2226    
2227    if (c == OP_RECURSE)    if (c == OP_RECURSE)
2228      {      {
2229      const pcre_uchar *scode;      const pcre_uchar *scode = cd->start_code + GET(code, 1);
2230      BOOL empty_branch;      BOOL empty_branch;
2231    
2232      /* Test for forward reference */      /* Test for forward reference or uncompleted reference. This is disabled
2233        when called to scan a completed pattern by setting cd->start_workspace to
2234        NULL. */
2235    
2236        if (cd->start_workspace != NULL)
2237          {
2238          const pcre_uchar *tcode;
2239          for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2240            if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2241          if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2242          }
2243    
2244        /* If we are scanning a completed pattern, there are no forward references
2245        and all groups are complete. We need to detect whether this is a recursive
2246        call, as otherwise there will be an infinite loop. If it is a recursion,
2247        just skip over it. Simple recursions are easily detected. For mutual
2248        recursions we keep a chain on the stack. */
2249    
2250      for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)      else
2251        if ((int)GET(scode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;        {
2252          recurse_check *r = recurses;
2253          const pcre_uchar *endgroup = scode;
2254    
2255      /* Not a forward reference, test for completed backward reference */        do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2256          if (code >= scode && code <= endgroup) continue;  /* Simple recursion */
2257    
2258      empty_branch = FALSE;        for (r = recurses; r != NULL; r = r->prev)
2259      scode = cd->start_code + GET(code, 1);          if (r->group == scode) break;
2260      if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */        if (r != NULL) continue;   /* Mutual recursion */
2261          }
2262    
2263        /* Completed reference; scan the referenced group, remembering it on the
2264        stack chain to detect mutual recursions. */
2265    
2266      /* Completed backwards reference */      empty_branch = FALSE;
2267        this_recurse.prev = recurses;
2268        this_recurse.group = scode;
2269    
2270      do      do
2271        {        {
2272        if (could_be_empty_branch(scode, endcode, utf, cd))        if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
2273          {          {
2274          empty_branch = TRUE;          empty_branch = TRUE;
2275          break;          break;
# Line 2453  for (code = first_significant_code(code Line 2325  for (code = first_significant_code(code
2325        empty_branch = FALSE;        empty_branch = FALSE;
2326        do        do
2327          {          {
2328          if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd))          if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd, NULL))
2329            empty_branch = TRUE;            empty_branch = TRUE;
2330          code += GET(code, 1);          code += GET(code, 1);
2331          }          }
# Line 2511  for (code = first_significant_code(code Line 2383  for (code = first_significant_code(code
2383    
2384      /* Opcodes that must match a character */      /* Opcodes that must match a character */
2385    
2386        case OP_ANY:
2387        case OP_ALLANY:
2388        case OP_ANYBYTE:
2389    
2390      case OP_PROP:      case OP_PROP:
2391      case OP_NOTPROP:      case OP_NOTPROP:
2392        case OP_ANYNL:
2393    
2394        case OP_NOT_HSPACE:
2395        case OP_HSPACE:
2396        case OP_NOT_VSPACE:
2397        case OP_VSPACE:
2398      case OP_EXTUNI:      case OP_EXTUNI:
2399    
2400      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
2401      case OP_DIGIT:      case OP_DIGIT:
2402      case OP_NOT_WHITESPACE:      case OP_NOT_WHITESPACE:
2403      case OP_WHITESPACE:      case OP_WHITESPACE:
2404      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
2405      case OP_WORDCHAR:      case OP_WORDCHAR:
2406      case OP_ANY:  
     case OP_ALLANY:  
     case OP_ANYBYTE:  
2407      case OP_CHAR:      case OP_CHAR:
2408      case OP_CHARI:      case OP_CHARI:
2409      case OP_NOT:      case OP_NOT:
2410      case OP_NOTI:      case OP_NOTI:
2411    
2412      case OP_PLUS:      case OP_PLUS:
2413        case OP_PLUSI:
2414      case OP_MINPLUS:      case OP_MINPLUS:
2415      case OP_POSPLUS:      case OP_MINPLUSI:
2416      case OP_EXACT:  
2417      case OP_NOTPLUS:      case OP_NOTPLUS:
2418        case OP_NOTPLUSI:
2419      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
2420        case OP_NOTMINPLUSI:
2421    
2422        case OP_POSPLUS:
2423        case OP_POSPLUSI:
2424      case OP_NOTPOSPLUS:      case OP_NOTPOSPLUS:
2425        case OP_NOTPOSPLUSI:
2426    
2427        case OP_EXACT:
2428        case OP_EXACTI:
2429      case OP_NOTEXACT:      case OP_NOTEXACT:
2430        case OP_NOTEXACTI:
2431    
2432      case OP_TYPEPLUS:      case OP_TYPEPLUS:
2433      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
2434      case OP_TYPEPOSPLUS:      case OP_TYPEPOSPLUS:
2435      case OP_TYPEEXACT:      case OP_TYPEEXACT:
2436    
2437      return FALSE;      return FALSE;
2438    
2439      /* These are going to continue, as they may be empty, but we have to      /* These are going to continue, as they may be empty, but we have to
# Line 2558  for (code = first_significant_code(code Line 2453  for (code = first_significant_code(code
2453      case OP_TYPEUPTO:      case OP_TYPEUPTO:
2454      case OP_TYPEMINUPTO:      case OP_TYPEMINUPTO:
2455      case OP_TYPEPOSUPTO:      case OP_TYPEPOSUPTO:
2456      if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)      if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2457        code += 2;        code += 2;
2458      break;      break;
2459    
# Line 2572  for (code = first_significant_code(code Line 2467  for (code = first_significant_code(code
2467      return TRUE;      return TRUE;
2468    
2469      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2470      MINUPTO, and POSUPTO may be followed by a multibyte character */      MINUPTO, and POSUPTO and their caseless and negative versions may be
2471        followed by a multibyte character. */
2472    
2473  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2474      case OP_STAR:      case OP_STAR:
2475      case OP_STARI:      case OP_STARI:
2476        case OP_NOTSTAR:
2477        case OP_NOTSTARI:
2478    
2479      case OP_MINSTAR:      case OP_MINSTAR:
2480      case OP_MINSTARI:      case OP_MINSTARI:
2481        case OP_NOTMINSTAR:
2482        case OP_NOTMINSTARI:
2483    
2484      case OP_POSSTAR:      case OP_POSSTAR:
2485      case OP_POSSTARI:      case OP_POSSTARI:
2486        case OP_NOTPOSSTAR:
2487        case OP_NOTPOSSTARI:
2488    
2489      case OP_QUERY:      case OP_QUERY:
2490      case OP_QUERYI:      case OP_QUERYI:
2491        case OP_NOTQUERY:
2492        case OP_NOTQUERYI:
2493    
2494      case OP_MINQUERY:      case OP_MINQUERY:
2495      case OP_MINQUERYI:      case OP_MINQUERYI:
2496        case OP_NOTMINQUERY:
2497        case OP_NOTMINQUERYI:
2498    
2499      case OP_POSQUERY:      case OP_POSQUERY:
2500      case OP_POSQUERYI:      case OP_POSQUERYI:
2501        case OP_NOTPOSQUERY:
2502        case OP_NOTPOSQUERYI:
2503    
2504      if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);      if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2505      break;      break;
2506    
2507      case OP_UPTO:      case OP_UPTO:
2508      case OP_UPTOI:      case OP_UPTOI:
2509        case OP_NOTUPTO:
2510        case OP_NOTUPTOI:
2511    
2512      case OP_MINUPTO:      case OP_MINUPTO:
2513      case OP_MINUPTOI:      case OP_MINUPTOI:
2514        case OP_NOTMINUPTO:
2515        case OP_NOTMINUPTOI:
2516    
2517      case OP_POSUPTO:      case OP_POSUPTO:
2518      case OP_POSUPTOI:      case OP_POSUPTOI:
2519        case OP_NOTPOSUPTO:
2520        case OP_NOTPOSUPTOI:
2521    
2522      if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);      if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2523      break;      break;
2524  #endif  #endif
# Line 2606  for (code = first_significant_code(code Line 2529  for (code = first_significant_code(code
2529      case OP_MARK:      case OP_MARK:
2530      case OP_PRUNE_ARG:      case OP_PRUNE_ARG:
2531      case OP_SKIP_ARG:      case OP_SKIP_ARG:
     code += code[1];  
     break;  
   
2532      case OP_THEN_ARG:      case OP_THEN_ARG:
2533      code += code[1];      code += code[1];
2534      break;      break;
# Line 2652  could_be_empty(const pcre_uchar *code, c Line 2572  could_be_empty(const pcre_uchar *code, c
2572  {  {
2573  while (bcptr != NULL && bcptr->current_branch >= code)  while (bcptr != NULL && bcptr->current_branch >= code)
2574    {    {
2575    if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd))    if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
2576      return FALSE;      return FALSE;
2577    bcptr = bcptr->outer;    bcptr = bcptr->outer;
2578    }    }
# Line 2662  return TRUE; Line 2582  return TRUE;
2582    
2583    
2584  /*************************************************  /*************************************************
2585  *           Check for POSIX class syntax         *  *        Base opcode of repeated opcodes         *
2586  *************************************************/  *************************************************/
2587    
2588  /* This function is called when the sequence "[:" or "[." or "[=" is  /* Returns the base opcode for repeated single character type opcodes. If the
2589  encountered in a character class. It checks whether this is followed by a  opcode is not a repeated character type, it returns with the original value.
 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we  
 reach an unescaped ']' without the special preceding character, return FALSE.  
2590    
2591  Originally, this function only recognized a sequence of letters between the  Arguments:  c opcode
2592  terminators, but it seems that Perl recognizes any sequence of characters,  Returns:    base opcode for the type
2593  though of course unknown POSIX names are subsequently rejected. Perl gives an  */
 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE  
 didn't consider this to be a POSIX class. Likewise for [:1234:].  
2594    
2595  The problem in trying to be exactly like Perl is in the handling of escapes. We  static pcre_uchar
2596  have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX  get_repeat_base(pcre_uchar c)
2597  class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code  {
2598  below handles the special case of \], but does not try to do any other escape  return (c > OP_TYPEPOSUPTO)? c :
2599  processing. This makes it different from Perl for cases such as [:l\ower:]         (c >= OP_TYPESTAR)?   OP_TYPESTAR :
2600  where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize         (c >= OP_NOTSTARI)?   OP_NOTSTARI :
2601  "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,         (c >= OP_NOTSTAR)?    OP_NOTSTAR :
2602  I think.         (c >= OP_STARI)?      OP_STARI :
2603                                 OP_STAR;
2604    }
2605    
 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.  
 It seems that the appearance of a nested POSIX class supersedes an apparent  
 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or  
 a digit.  
2606    
2607  In Perl, unescaped square brackets may also appear as part of class names. For  
2608  example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for  #ifdef SUPPORT_UCP
2609  [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not  /*************************************************
2610  seem right at all. PCRE does not allow closing square brackets in POSIX class  *        Check a character and a property        *
2611  names.  *************************************************/
2612    
2613    /* This function is called by check_auto_possessive() when a property item
2614    is adjacent to a fixed character.
2615    
2616  Arguments:  Arguments:
2617    ptr      pointer to the initial [    c            the character
2618    endptr   where to return the end pointer    ptype        the property type
2619      pdata        the data for the type
2620      negated      TRUE if it's a negated property (\P or \p{^)
2621    
2622  Returns:   TRUE or FALSE  Returns:       TRUE if auto-possessifying is OK
2623  */  */
2624    
2625  static BOOL  static BOOL
2626  check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)  check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
2627      BOOL negated)
2628  {  {
2629  pcre_uchar terminator;          /* Don't combine these lines; the Solaris cc */  const pcre_uint32 *p;
2630  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  const ucd_record *prop = GET_UCD(c);
2631  for (++ptr; *ptr != 0; ptr++)  
2632    switch(ptype)
2633    {    {
2634    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)    case PT_LAMP:
2635      ptr++;    return (prop->chartype == ucp_Lu ||
2636    else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;            prop->chartype == ucp_Ll ||
2637    else            prop->chartype == ucp_Lt) == negated;
2638    
2639      case PT_GC:
2640      return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2641    
2642      case PT_PC:
2643      return (pdata == prop->chartype) == negated;
2644    
2645      case PT_SC:
2646      return (pdata == prop->script) == negated;
2647    
2648      /* These are specials */
2649    
2650      case PT_ALNUM:
2651      return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2652              PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2653    
2654      /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
2655      means that Perl space and POSIX space are now identical. PCRE was changed
2656      at release 8.34. */
2657    
2658      case PT_SPACE:    /* Perl space */
2659      case PT_PXSPACE:  /* POSIX space */
2660      return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2661              c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2662              c == CHAR_FF || c == CHAR_CR)
2663              == negated;
2664    
2665      case PT_WORD:
2666      return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2667              PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2668              c == CHAR_UNDERSCORE) == negated;
2669    
2670      case PT_CLIST:
2671      p = PRIV(ucd_caseless_sets) + prop->caseset;
2672      for (;;)
2673      {      {
2674      if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)      if (c < *p) return !negated;
2675        {      if (c == *p++) return negated;
       *endptr = ptr;  
       return TRUE;  
       }  
     if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&  
          (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||  
           ptr[1] == CHAR_EQUALS_SIGN) &&  
         check_posix_syntax(ptr, endptr))  
       return FALSE;  
2676      }      }
2677      break;  /* Control never reaches here */
2678    }    }
2679    
2680  return FALSE;  return FALSE;
2681  }  }
2682    #endif  /* SUPPORT_UCP */
2683    
2684    
2685    
2686  /*************************************************  /*************************************************
2687  *          Check POSIX class name                *  *        Fill the character property list        *
2688  *************************************************/  *************************************************/
2689    
2690  /* This function is called to check the name given in a POSIX-style class entry  /* Checks whether the code points to an opcode that can take part in auto-
2691  such as [:alnum:].  possessification, and if so, fills a list with its properties.
2692    
2693  Arguments:  Arguments:
2694    ptr        points to the first letter    code        points to start of expression
2695    len        the length of the name    utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2696      fcc         points to case-flipping table
2697      list        points to output list
2698                  list[0] will be filled with the opcode
2699                  list[1] will be non-zero if this opcode
2700                    can match an empty character string
2701                  list[2..7] depends on the opcode
2702    
2703  Returns:     a value representing the name, or -1 if unknown  Returns:      points to the start of the next opcode if *code is accepted
2704                  NULL if *code is not accepted
2705  */  */
2706    
2707  static int  static const pcre_uchar *
2708  check_posix_name(const pcre_uchar *ptr, int len)  get_chr_property_list(const pcre_uchar *code, BOOL utf,
2709      const pcre_uint8 *fcc, pcre_uint32 *list)
2710  {  {
2711  const char *pn = posix_names;  pcre_uchar c = *code;
2712  register int yield = 0;  const pcre_uchar *end;
2713  while (posix_name_lengths[yield] != 0)  const pcre_uint32 *clist_src;
2714    pcre_uint32 *clist_dest;
2715    pcre_uint32 chr;
2716    pcre_uchar base;
2717    
2718    list[0] = c;
2719    list[1] = FALSE;
2720    code++;
2721    
2722    if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
2723    {    {
2724    if (len == posix_name_lengths[yield] &&    base = get_repeat_base(c);
2725      STRNCMP_UC_C8(ptr, pn, len) == 0) return yield;    c -= (base - OP_STAR);
   pn += posix_name_lengths[yield] + 1;  
   yield++;  
   }  
 return -1;  
 }  
2726    
2727      if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
2728        code += IMM2_SIZE;
2729    
2730  /*************************************************    list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
 *    Adjust OP_RECURSE items in repeated group   *  
 *************************************************/  
2731    
2732  /* OP_RECURSE items contain an offset from the start of the regex to the group    switch(base)
2733  that is referenced. This means that groups can be replicated for fixed      {
2734  repetition simply by copying (because the recursion is allowed to refer to      case OP_STAR:
2735  earlier groups that are outside the current group). However, when a group is      list[0] = OP_CHAR;
2736  optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is      break;
 inserted before it, after it has been compiled. This means that any OP_RECURSE  
 items within it that refer to the group itself or any contained groups have to  
 have their offsets adjusted. That is one of the jobs of this function. Before it  
 is called, the partially compiled regex must be temporarily terminated with  
 OP_END.  
2737    
2738  This function has been extended with the possibility of forward references for      case OP_STARI:
2739  recursions and subroutine calls. It must also check the list of such references      list[0] = OP_CHARI;
2740  for the group we are dealing with. If it finds that one of the recursions in      break;
 the current group is on this list, it adjusts the offset in the list, not the  
 value in the reference (which is a group number).  
2741    
2742  Arguments:      case OP_NOTSTAR:
2743    group      points to the start of the group      list[0] = OP_NOT;
2744    adjust     the amount by which the group is to be moved      break;
   utf        TRUE in UTF-8 / UTF-16 / UTF-32 mode  
   cd         contains pointers to tables etc.  
   save_hwm   the hwm forward reference pointer at the start of the group  
2745    
2746  Returns:     nothing      case OP_NOTSTARI:
2747  */      list[0] = OP_NOTI;
2748        break;
2749    
2750  static void      case OP_TYPESTAR:
2751  adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,      list[0] = *code;
2752    pcre_uchar *save_hwm)      code++;
2753  {      break;
2754  pcre_uchar *ptr = group;      }
2755      c = list[0];
2756      }
2757    
2758  while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)  switch(c)
2759    {    {
2760    int offset;    case OP_NOT_DIGIT:
2761    pcre_uchar *hc;    case OP_DIGIT:
2762      case OP_NOT_WHITESPACE:
2763      case OP_WHITESPACE:
2764      case OP_NOT_WORDCHAR:
2765      case OP_WORDCHAR:
2766      case OP_ANY:
2767      case OP_ALLANY:
2768      case OP_ANYNL:
2769      case OP_NOT_HSPACE:
2770      case OP_HSPACE:
2771      case OP_NOT_VSPACE:
2772      case OP_VSPACE:
2773      case OP_EXTUNI:
2774      case OP_EODN:
2775      case OP_EOD:
2776      case OP_DOLL:
2777      case OP_DOLLM:
2778      return code;
2779    
2780    /* See if this recursion is on the forward reference list. If so, adjust the    case OP_CHAR:
2781    reference. */    case OP_NOT:
2782      GETCHARINCTEST(chr, code);
2783      list[2] = chr;
2784      list[3] = NOTACHAR;
2785      return code;
2786    
2787    for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)    case OP_CHARI:
2788      {    case OP_NOTI:
2789      offset = GET(hc, 0);    list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
2790      if (cd->start_code + offset == ptr + 1)    GETCHARINCTEST(chr, code);
2791        {    list[2] = chr;
       PUT(hc, 0, offset + adjust);  
       break;  
       }  
     }  
2792    
2793    /* Otherwise, adjust the recursion offset if it's after the start of this  #ifdef SUPPORT_UCP
2794    group. */    if (chr < 128 || (chr < 256 && !utf))
2795        list[3] = fcc[chr];
2796      else
2797        list[3] = UCD_OTHERCASE(chr);
2798    #elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
2799      list[3] = (chr < 256) ? fcc[chr] : chr;
2800    #else
2801      list[3] = fcc[chr];
2802    #endif
2803    
2804    if (hc >= cd->hwm)    /* The othercase might be the same value. */
2805    
2806      if (chr == list[3])
2807        list[3] = NOTACHAR;
2808      else
2809        list[4] = NOTACHAR;
2810      return code;
2811    
2812    #ifdef SUPPORT_UCP
2813      case OP_PROP:
2814      case OP_NOTPROP:
2815      if (code[0] != PT_CLIST)
2816      {      {
2817      offset = GET(ptr, 1);      list[2] = code[0];
2818      if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);      list[3] = code[1];
2819        return code + 2;
2820      }      }
2821    
2822    ptr += 1 + LINK_SIZE;    /* Convert only if we have enough space. */
   }  
 }  
2823    
2824      clist_src = PRIV(ucd_caseless_sets) + code[1];
2825      clist_dest = list + 2;
2826      code += 2;
2827    
2828      do {
2829         /* Early return if there is not enough space. */
2830         if (clist_dest >= list + 8)
2831           {
2832           list[2] = code[0];
2833           list[3] = code[1];
2834           return code;
2835           }
2836         *clist_dest++ = *clist_src;
2837         }
2838       while(*clist_src++ != NOTACHAR);
2839    
2840  /*************************************************    /* Enough space to store all characters. */
 *        Insert an automatic callout point       *  
 *************************************************/  
2841    
2842  /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert    list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
2843  callout points before each pattern item.    return code;
2844    #endif
2845    
2846  Arguments:    case OP_NCLASS:
2847    code           current code pointer    case OP_CLASS:
2848    ptr            current pattern pointer  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2849    cd             pointers to tables etc    case OP_XCLASS:
2850    
2851  Returns:         new code pointer    if (c == OP_XCLASS)
2852  */      end = code + GET(code, 0);
2853      else
2854    #endif
2855        end = code + 32 / sizeof(pcre_uchar);
2856    
2857  static pcre_uchar *    switch(*end)
2858  auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)      {
2859  {      case OP_CRSTAR:
2860  *code++ = OP_CALLOUT;      case OP_CRMINSTAR:
2861  *code++ = 255;      case OP_CRQUERY:
2862  PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */      case OP_CRMINQUERY:
2863  PUT(code, LINK_SIZE, 0);                       /* Default length */      list[1] = TRUE;
2864  return code + 2 * LINK_SIZE;      end++;
2865        break;
2866    
2867        case OP_CRRANGE:
2868        case OP_CRMINRANGE:
2869        list[1] = (GET2(end, 1) == 0);
2870        end += 1 + 2 * IMM2_SIZE;
2871        break;
2872        }
2873      list[2] = end - code;
2874      return end;
2875      }
2876    return NULL;    /* Opcode not accepted */
2877  }  }
2878    
2879    
2880    
2881  /*************************************************  /*************************************************
2882  *         Complete a callout item                *  *    Scan further character sets for match       *
2883  *************************************************/  *************************************************/
2884    
2885  /* A callout item contains the length of the next item in the pattern, which  /* Checks whether the base and the current opcode have a common character, in
2886  we can't fill in till after we have reached the relevant point. This is used  which case the base cannot be possessified.
 for both automatic and manual callouts.  
2887    
2888  Arguments:  Arguments:
2889    previous_callout   points to previous callout item    code        points to the byte code
2890    ptr                current pattern pointer    utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
2891    cd                 pointers to tables etc    cd          static compile data
2892      base_list   the data list of the base opcode
2893    
2894  Returns:             nothing  Returns:      TRUE if the auto-possessification is possible
2895  */  */
2896    
2897  static void  static BOOL
2898  complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)  compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
2899      const pcre_uint32* base_list)
2900  {  {
2901  int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));  pcre_uchar c;
2902  PUT(previous_callout, 2 + LINK_SIZE, length);  pcre_uint32 list[8];
2903  }  const pcre_uint32* chr_ptr;
2904    const pcre_uint32* ochr_ptr;
2905    const pcre_uint32* list_ptr;
2906    pcre_uint32 chr;
2907    
2908    for(;;)
2909      {
2910      c = *code;
2911    
2912      /* Skip over callouts */
2913    
2914  #ifdef SUPPORT_UCP    if (c == OP_CALLOUT)
2915  /*************************************************      {
2916  *           Get othercase range                  *      code += PRIV(OP_lengths)[c];
2917  *************************************************/      continue;
2918        }
2919    
2920  /* This function is passed the start and end of a class range, in UTF-8 mode    if (c == OP_ALT)
2921  with UCP support. It searches up the characters, looking for ranges of      {
2922  characters in the "other" case. Each call returns the next one, updating the      do code += GET(code, 1); while (*code == OP_ALT);
2923  start address. A character with multiple other cases is returned on its own      c = *code;
2924  with a special return value.      }
2925    
2926  Arguments:    switch(c)
2927    cptr        points to starting character value; updated      {
2928    d           end value      case OP_END:
2929    ocptr       where to put start of othercase range      /* TRUE only in greedy case. The non-greedy case could be replaced by an
2930    odptr       where to put end of othercase range      OP_EXACT, but it is probably not worth it. (And note that OP_EXACT uses
2931        more memory, which we cannot get at this stage.) */
2932    
2933  Yield:        -1 when no more      return base_list[1] != 0;
                0 when a range is returned  
               >0 the CASESET offset for char with multiple other cases  
                 in this case, ocptr contains the original  
 */  
2934    
2935  static int      case OP_KET:
2936  get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,      /* If the bracket is capturing, and referenced by an OP_RECURSE, the
2937    pcre_uint32 *odptr)      non-greedy case cannot be converted to a possessive form. We do not test
2938  {      the bracket type at the moment, but we might do it in the future to improve
2939  pcre_uint32 c, othercase, next;      this condition. (But note that recursive calls are always atomic.) */
 int co;  
2940    
2941  /* Find the first character that has an other case. If it has multiple other      if (base_list[1] == 0) return FALSE;
2942  cases, return its case offset value. */      code += PRIV(OP_lengths)[c];
2943        continue;
2944        }
2945    
2946  for (c = *cptr; c <= d; c++)    /* Check for a supported opcode, and load its properties. */
2947    {  
2948    if ((co = UCD_CASESET(c)) != 0)    code = get_chr_property_list(code, utf, cd->fcc, list);
2949      if (code == NULL) return FALSE;    /* Unsupported */
2950    
2951      /* If either opcode is a small character list, set pointers for comparing
2952      characters from that list with another list, or with a property. */
2953    
2954      if (base_list[0] == OP_CHAR)
2955      {      {
2956      *ocptr = c++;   /* Character that has the set */      chr_ptr = base_list + 2;
2957      *cptr = c;      /* Rest of input range */      list_ptr = list;
2958      return co;      }
2959      }    else if (list[0] == OP_CHAR)
2960    if ((othercase = UCD_OTHERCASE(c)) != c) break;      {
2961    }      chr_ptr = list + 2;
2962        list_ptr = base_list;
2963        }
2964    
2965  if (c > d) return -1;  /* Reached end of range */    /* Some property combinations are also acceptable. Unicode property opcodes are
2966      processed specially; the rest can be handled with a lookup table. */
2967    
2968  *ocptr = othercase;    else
2969  next = othercase + 1;      {
2970        pcre_uint32 leftop, rightop;
2971    
2972  for (++c; c <= d; c++)      if (list[1] != 0) return FALSE;   /* Must match at least one character */
2973    {      leftop = base_list[0];
2974    if (UCD_OTHERCASE(c) != next) break;      rightop = list[0];
   next++;  
   }  
2975    
2976  *odptr = next - 1;     /* End of othercase range */  #ifdef SUPPORT_UCP
2977  *cptr = c;             /* Rest of input range */      if (leftop == OP_PROP || leftop == OP_NOTPROP)
2978  return 0;        {
2979  }        if (rightop == OP_EOD) return TRUE;
2980          if (rightop == OP_PROP || rightop == OP_NOTPROP)
2981            {
2982            int n;
2983            const pcre_uint8 *p;
2984            BOOL same = leftop == rightop;
2985            BOOL lisprop = leftop == OP_PROP;
2986            BOOL risprop = rightop == OP_PROP;
2987            BOOL bothprop = lisprop && risprop;
2988    
2989            /* There's a table that specifies how each combination is to be
2990            processed:
2991              0   Always return FALSE (never auto-possessify)
2992              1   Character groups are distinct (possessify if both are OP_PROP)
2993              2   Check character categories in the same group (general or particular)
2994              3   Return TRUE if the two opcodes are not the same
2995              ... see comments below
2996            */
2997    
2998            n = propposstab[base_list[2]][list[2]];
2999            switch(n)
3000              {
3001              case 0: return FALSE;
3002              case 1: return bothprop;
3003              case 2: return (base_list[3] == list[3]) != same;
3004              case 3: return !same;
3005    
3006              case 4:  /* Left general category, right particular category */
3007              return risprop && catposstab[base_list[3]][list[3]] == same;
3008    
3009              case 5:  /* Right general category, left particular category */
3010              return lisprop && catposstab[list[3]][base_list[3]] == same;
3011    
3012              /* This code is logically tricky. Think hard before fiddling with it.
3013              The posspropstab table has four entries per row. Each row relates to
3014              one of PCRE's special properties such as ALNUM or SPACE or WORD.
3015              Only WORD actually needs all four entries, but using repeats for the
3016              others means they can all use the same code below.
3017    
3018              The first two entries in each row are Unicode general categories, and
3019              apply always, because all the characters they include are part of the
3020              PCRE character set. The third and fourth entries are a general and a
3021              particular category, respectively, that include one or more relevant
3022              characters. One or the other is used, depending on whether the check
3023              is for a general or a particular category. However, in both cases the
3024              category contains more characters than the specials that are defined
3025              for the property being tested against. Therefore, it cannot be used
3026              in a NOTPROP case.
3027    
3028              Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3029              Underscore is covered by ucp_P or ucp_Po. */
3030    
3031              case 6:  /* Left alphanum vs right general category */
3032              case 7:  /* Left space vs right general category */
3033              case 8:  /* Left word vs right general category */
3034              p = posspropstab[n-6];
3035              return risprop && lisprop ==
3036                (list[3] != p[0] &&
3037                 list[3] != p[1] &&
3038                (list[3] != p[2] || !lisprop));
3039    
3040              case 9:   /* Right alphanum vs left general category */
3041              case 10:  /* Right space vs left general category */
3042              case 11:  /* Right word vs left general category */
3043              p = posspropstab[n-9];
3044              return lisprop && risprop ==
3045                (base_list[3] != p[0] &&
3046                 base_list[3] != p[1] &&
3047                (base_list[3] != p[2] || !risprop));
3048    
3049              case 12:  /* Left alphanum vs right particular category */
3050              case 13:  /* Left space vs right particular category */
3051              case 14:  /* Left word vs right particular category */
3052              p = posspropstab[n-12];
3053              return risprop && lisprop ==
3054                (catposstab[p[0]][list[3]] &&
3055                 catposstab[p[1]][list[3]] &&
3056                (list[3] != p[3] || !lisprop));
3057    
3058              case 15:  /* Right alphanum vs left particular category */
3059              case 16:  /* Right space vs left particular category */
3060              case 17:  /* Right word vs left particular category */
3061              p = posspropstab[n-15];
3062              return lisprop && risprop ==
3063                (catposstab[p[0]][base_list[3]] &&
3064                 catposstab[p[1]][base_list[3]] &&
3065                (base_list[3] != p[3] || !risprop));
3066              }
3067            }
3068          return FALSE;
3069          }
3070    
3071        else
3072    #endif  /* SUPPORT_UCP */
3073    
3074  /*************************************************      return leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3075  *        Check a character and a property        *             rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3076  *************************************************/             autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3077        }
3078    
3079  /* This function is called by check_auto_possessive() when a property item    /* Control reaches here only if one of the items is a small character list.
3080  is adjacent to a fixed character.    All characters are checked against the other side. */
3081    
3082  Arguments:    do
3083    c            the character      {
3084    ptype        the property type      chr = *chr_ptr;
   pdata        the data for the type  
   negated      TRUE if it's a negated property (\P or \p{^)  
3085    
3086  Returns:       TRUE if auto-possessifying is OK      switch(list_ptr[0])
3087  */        {
3088          case OP_CHAR:
3089          ochr_ptr = list_ptr + 2;
3090          do
3091            {
3092            if (chr == *ochr_ptr) return FALSE;
3093            ochr_ptr++;
3094            }
3095          while(*ochr_ptr != NOTACHAR);
3096          break;
3097    
3098  static BOOL        case OP_NOT:
3099  check_char_prop(pcre_uint32 c, int ptype, int pdata, BOOL negated)        ochr_ptr = list_ptr + 2;
3100  {        do
3101  #ifdef SUPPORT_UCP          {
3102  const pcre_uint32 *p;          if (chr == *ochr_ptr)
3103  #endif            break;
3104            ochr_ptr++;
3105            }
3106          while(*ochr_ptr != NOTACHAR);
3107          if (*ochr_ptr == NOTACHAR) return FALSE;   /* Not found */
3108          break;
3109    
3110  const ucd_record *prop = GET_UCD(c);        /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
3111          set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3112    
3113  switch(ptype)        case OP_DIGIT:
3114    {        if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
3115    case PT_LAMP:        break;
   return (prop->chartype == ucp_Lu ||  
           prop->chartype == ucp_Ll ||  
           prop->chartype == ucp_Lt) == negated;  
3116    
3117    case PT_GC:        case OP_NOT_DIGIT:
3118    return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;        if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
3119          break;
3120    
3121    case PT_PC:        case OP_WHITESPACE:
3122    return (pdata == prop->chartype) == negated;        if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
3123          break;
3124    
3125    case PT_SC:        case OP_NOT_WHITESPACE:
3126    return (pdata == prop->script) == negated;        if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
3127          break;
3128    
3129    /* These are specials */        case OP_WORDCHAR:
3130          if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
3131          break;
3132    
3133    case PT_ALNUM:        case OP_NOT_WORDCHAR:
3134    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||        if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
3135            PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;        break;
3136    
3137    case PT_SPACE:    /* Perl space */        case OP_HSPACE:
3138    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||        switch(chr)
3139            c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)          {
3140            == negated;          HSPACE_CASES: return FALSE;
3141            default: break;
3142            }
3143          break;
3144    
3145    case PT_PXSPACE:  /* POSIX space */        case OP_NOT_HSPACE:
3146    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||        switch(chr)
3147            c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||          {
3148            c == CHAR_FF || c == CHAR_CR)          HSPACE_CASES: break;
3149            == negated;          default: return FALSE;
3150            }
3151          break;
3152    
3153    case PT_WORD:        case OP_ANYNL:
3154    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||        case OP_VSPACE:
3155            PRIV(ucp_gentype)[prop->chartype] == ucp_N ||        switch(chr)
3156            c == CHAR_UNDERSCORE) == negated;          {
3157            VSPACE_CASES: return FALSE;
3158  #ifdef SUPPORT_UCP          default: break;
3159    case PT_CLIST:          }
3160    p = PRIV(ucd_caseless_sets) + prop->caseset;        break;
3161    for (;;)  
3162      {        case OP_NOT_VSPACE:
3163      if ((unsigned int)c < *p) return !negated;        switch(chr)
3164      if ((unsigned int)c == *p++) return negated;          {
3165      }          VSPACE_CASES: break;
3166    break;  /* Control never reaches here */          default: return FALSE;
3167            }
3168          break;
3169    
3170          case OP_DOLL:
3171          case OP_EODN:
3172          switch (chr)
3173            {
3174            case CHAR_CR:
3175            case CHAR_LF:
3176            case CHAR_VT:
3177            case CHAR_FF:
3178            case CHAR_NEL:
3179    #ifndef EBCDIC
3180            case 0x2028:
3181            case 0x2029:
3182    #endif  /* Not EBCDIC */
3183            return FALSE;
3184            }
3185          break;
3186    
3187          case OP_EOD:    /* Can always possessify before \z */
3188          break;
3189    
3190          case OP_PROP:
3191          case OP_NOTPROP:
3192          if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3193                list_ptr[0] == OP_NOTPROP))
3194            return FALSE;
3195          break;
3196    
3197          /* The class comparisons work only when the class is the second item
3198          of the pair, because there are at present no possessive forms of the
3199          class opcodes. Note also that the "code" variable that is used below
3200          points after the second item, and that the pointer for the first item
3201          is not available, so even if there were possessive forms of the class
3202          opcodes, the correct comparison could not be done. */
3203    
3204          case OP_NCLASS:
3205          if (chr > 255) return FALSE;
3206          /* Fall through */
3207    
3208          case OP_CLASS:
3209          if (list_ptr != list) return FALSE;   /* Class is first opcode */
3210          if (chr > 255) break;
3211          if ((((pcre_uint8 *)(code - list_ptr[2] + 1))[chr >> 3] & (1 << (chr & 7))) != 0)
3212            return FALSE;
3213          break;
3214    
3215    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3216          case OP_XCLASS:
3217          if (list_ptr != list) return FALSE;   /* Class is first opcode */
3218          if (PRIV(xclass)(chr, code - list_ptr[2] + 1 + LINK_SIZE, utf))
3219            return FALSE;
3220          break;
3221  #endif  #endif
3222    
3223          default:
3224          return FALSE;
3225          }
3226    
3227        chr_ptr++;
3228        }
3229      while(*chr_ptr != NOTACHAR);
3230    
3231      /* At least one character must be matched from this opcode. */
3232    
3233      if (list[1] == 0) return TRUE;
3234    }    }
3235    
3236  return FALSE;  return FALSE;
3237  }  }
 #endif  /* SUPPORT_UCP */  
3238    
3239    
3240    
3241  /*************************************************  /*************************************************
3242  *     Check if auto-possessifying is possible    *  *    Scan compiled regex for auto-possession     *
3243  *************************************************/  *************************************************/
3244    
3245  /* This function is called for unlimited repeats of certain items, to see  /* Replaces single character iterations with their possessive alternatives
3246  whether the next thing could possibly match the repeated item. If not, it makes  if appropriate. This function modifies the compiled opcode!
 sense to automatically possessify the repeated item.  
3247    
3248  Arguments:  Arguments:
3249    previous      pointer to the repeated opcode    code        points to start of the byte code
3250    utf           TRUE in UTF-8 / UTF-16 / UTF-32 mode    utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
3251    ptr           next character in pattern    cd          static compile data
   options       options bits  
   cd            contains pointers to tables etc.  
3252    
3253  Returns:        TRUE if possessifying is wanted  Returns:      nothing
3254  */  */
3255    
3256  static BOOL  static void
3257  check_auto_possessive(const pcre_uchar *previous, BOOL utf,  auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
   const pcre_uchar *ptr, int options, compile_data *cd)  
3258  {  {
3259  pcre_uint32 c = NOTACHAR;  register pcre_uchar c;
3260  pcre_uint32 next;  const pcre_uchar *end;
3261  int escape;  pcre_uint32 list[8];
 int op_code = *previous++;  
   
 /* Skip whitespace and comments in extended mode */  
3262    
3263  if ((options & PCRE_EXTENDED) != 0)  for (;;)
3264    {    {
3265    for (;;)    c = *code;
3266    
3267      if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
3268      {      {
3269      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      c -= get_repeat_base(c) - OP_STAR;
3270      if (*ptr == CHAR_NUMBER_SIGN)      end = (c <= OP_MINUPTO) ?
3271          get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3272        list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3273    
3274        if (end != NULL && compare_opcodes(end, utf, cd, list))
3275        {        {
3276        ptr++;        switch(c)
       while (*ptr != 0)  
3277          {          {
3278          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          case OP_STAR:
3279          ptr++;          *code += OP_POSSTAR - OP_STAR;
3280  #ifdef SUPPORT_UTF          break;
3281          if (utf) FORWARDCHAR(ptr);  
3282  #endif          case OP_MINSTAR:
3283            *code += OP_POSSTAR - OP_MINSTAR;
3284            break;
3285    
3286            case OP_PLUS:
3287            *code += OP_POSPLUS - OP_PLUS;
3288            break;
3289    
3290            case OP_MINPLUS:
3291            *code += OP_POSPLUS - OP_MINPLUS;
3292            break;
3293    
3294            case OP_QUERY:
3295            *code += OP_POSQUERY - OP_QUERY;
3296            break;
3297    
3298            case OP_MINQUERY:
3299            *code += OP_POSQUERY - OP_MINQUERY;
3300            break;
3301    
3302            case OP_UPTO:
3303            *code += OP_POSUPTO - OP_UPTO;
3304            break;
3305    
3306            case OP_MINUPTO:
3307            *code += OP_MINUPTO - OP_UPTO;
3308            break;
3309          }          }
3310        }        }
3311      else break;      c = *code;
3312      }      }
   }  
3313    
3314  /* If the next item is one that we can handle, get its value. A non-negative    switch(c)
3315  value is a character, a negative value is an escape value. */      {
3316        case OP_END:
3317        return;
3318    
3319  if (*ptr == CHAR_BACKSLASH)      case OP_TYPESTAR:
3320    {      case OP_TYPEMINSTAR:
3321    int temperrorcode = 0;      case OP_TYPEPLUS:
3322    escape = check_escape(&ptr, &next, &temperrorcode, cd->bracount, options, FALSE);      case OP_TYPEMINPLUS:
3323    if (temperrorcode != 0) return FALSE;      case OP_TYPEQUERY:
3324    ptr++;    /* Point after the escape sequence */      case OP_TYPEMINQUERY:
3325    }      case OP_TYPEPOSSTAR:
3326  else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0)      case OP_TYPEPOSPLUS:
3327    {      case OP_TYPEPOSQUERY:
3328    escape = 0;      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
3329  #ifdef SUPPORT_UTF      break;
   if (utf) { GETCHARINC(next, ptr); } else  
 #endif  
   next = *ptr++;  
   }  
 else return FALSE;  
3330    
3331  /* Skip whitespace and comments in extended mode */      case OP_TYPEUPTO:
3332        case OP_TYPEMINUPTO:
3333        case OP_TYPEEXACT:
3334        case OP_TYPEPOSUPTO:
3335        if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
3336          code += 2;
3337        break;
3338    
3339  if ((options & PCRE_EXTENDED) != 0)      case OP_XCLASS:
3340    {      code += GET(code, 1);
3341    for (;;)      break;
3342      {  
3343      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      case OP_MARK:
3344      if (*ptr == CHAR_NUMBER_SIGN)      case OP_PRUNE_ARG:
3345        {      case OP_SKIP_ARG:
3346        ptr++;      case OP_THEN_ARG:
3347        while (*ptr != 0)      code += code[1];
3348          {      break;
         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }  
         ptr++;  
 #ifdef SUPPORT_UTF  
         if (utf) FORWARDCHAR(ptr);  
 #endif  
         }  
       }  
     else break;  
3349      }      }
   }  
3350    
3351  /* If the next thing is itself optional, we have to give up. */    /* Add in the fixed length from the table */
3352    
3353  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||    code += PRIV(OP_lengths)[c];
   STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)  
     return FALSE;  
3354    
3355  /* If the previous item is a character, get its value. */    /* In UTF-8 mode, opcodes that are followed by a character may be followed by
3356      a multi-byte character. The length in the table is a minimum, so we have to
3357      arrange to skip the extra bytes. */
3358    
3359  if (op_code == OP_CHAR || op_code == OP_CHARI ||  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3360      op_code == OP_NOT || op_code == OP_NOTI)    if (utf) switch(c)
3361    //if (escape == 0) switch(op_code)      {
3362    {      case OP_CHAR:
3363  #ifdef SUPPORT_UTF      case OP_CHARI:
3364    GETCHARTEST(c, previous);      case OP_NOT:
3365        case OP_NOTI:
3366        case OP_STAR:
3367        case OP_MINSTAR:
3368        case OP_PLUS:
3369        case OP_MINPLUS:
3370        case OP_QUERY:
3371        case OP_MINQUERY:
3372        case OP_UPTO:
3373        case OP_MINUPTO:
3374        case OP_EXACT:
3375        case OP_POSSTAR:
3376        case OP_POSPLUS:
3377        case OP_POSQUERY:
3378        case OP_POSUPTO:
3379        case OP_STARI:
3380        case OP_MINSTARI:
3381        case OP_PLUSI:
3382        case OP_MINPLUSI:
3383        case OP_QUERYI:
3384        case OP_MINQUERYI:
3385        case OP_UPTOI:
3386        case OP_MINUPTOI:
3387        case OP_EXACTI:
3388        case OP_POSSTARI:
3389        case OP_POSPLUSI:
3390        case OP_POSQUERYI:
3391        case OP_POSUPTOI:
3392        case OP_NOTSTAR:
3393        case OP_NOTMINSTAR:
3394        case OP_NOTPLUS:
3395        case OP_NOTMINPLUS:
3396        case OP_NOTQUERY:
3397        case OP_NOTMINQUERY:
3398        case OP_NOTUPTO:
3399        case OP_NOTMINUPTO:
3400        case OP_NOTEXACT:
3401        case OP_NOTPOSSTAR:
3402        case OP_NOTPOSPLUS:
3403        case OP_NOTPOSQUERY:
3404        case OP_NOTPOSUPTO:
3405        case OP_NOTSTARI:
3406        case OP_NOTMINSTARI:
3407        case OP_NOTPLUSI:
3408        case OP_NOTMINPLUSI:
3409        case OP_NOTQUERYI:
3410        case OP_NOTMINQUERYI:
3411        case OP_NOTUPTOI:
3412        case OP_NOTMINUPTOI:
3413        case OP_NOTEXACTI:
3414        case OP_NOTPOSSTARI:
3415        case OP_NOTPOSPLUSI:
3416        case OP_NOTPOSQUERYI:
3417        case OP_NOTPOSUPTOI:
3418        if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
3419        break;
3420        }
3421  #else  #else
3422    c = *previous;    (void)(utf);  /* Keep compiler happy by referencing function argument */
3423  #endif  #endif
3424    }    }
3425    }
3426    
3427    
3428    
3429    /*************************************************
3430    *           Check for POSIX class syntax         *
3431    *************************************************/
3432    
3433    /* This function is called when the sequence "[:" or "[." or "[=" is
3434    encountered in a character class. It checks whether this is followed by a
3435    sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
3436    reach an unescaped ']' without the special preceding character, return FALSE.
3437    
3438    Originally, this function only recognized a sequence of letters between the
3439    terminators, but it seems that Perl recognizes any sequence of characters,
3440    though of course unknown POSIX names are subsequently rejected. Perl gives an
3441    "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
3442    didn't consider this to be a POSIX class. Likewise for [:1234:].
3443    
3444    The problem in trying to be exactly like Perl is in the handling of escapes. We
3445    have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
3446    class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
3447    below handles the special case of \], but does not try to do any other escape
3448    processing. This makes it different from Perl for cases such as [:l\ower:]
3449    where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
3450    "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
3451    I think.
3452    
3453    A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
3454    It seems that the appearance of a nested POSIX class supersedes an apparent
3455    external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
3456    a digit.
3457    
3458    In Perl, unescaped square brackets may also appear as part of class names. For
3459    example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
3460    [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
3461    seem right at all. PCRE does not allow closing square brackets in POSIX class
3462    names.
3463    
3464    Arguments:
3465      ptr      pointer to the initial [
3466      endptr   where to return the end pointer
3467    
3468  /* Now compare the next item with the previous opcode. First, handle cases when  Returns:   TRUE or FALSE
3469  the next item is a character. */  */
3470    
3471  if (escape == 0)  static BOOL
3472    check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
3473    {
3474    pcre_uchar terminator;          /* Don't combine these lines; the Solaris cc */
3475    terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
3476    for (++ptr; *ptr != CHAR_NULL; ptr++)
3477    {    {
3478    /* For a caseless UTF match, the next character may have more than one other    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3479    case, which maps to the special PT_CLIST property. Check this first. */      ptr++;
3480      else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
3481  #ifdef SUPPORT_UCP    else
   if (utf && c != NOTACHAR && (options & PCRE_CASELESS) != 0)  
3482      {      {
3483      int ocs = UCD_CASESET(next);      if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3484      if (ocs > 0) return check_char_prop(c, PT_CLIST, ocs, op_code >= OP_NOT);        {
3485          *endptr = ptr;
3486          return TRUE;
3487          }
3488        if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
3489             (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3490              ptr[1] == CHAR_EQUALS_SIGN) &&
3491            check_posix_syntax(ptr, endptr))
3492          return FALSE;
3493      }      }
3494  #endif    }
3495    return FALSE;
3496    }
3497    
   switch(op_code)  
     {  
     case OP_CHAR:  
     return c != next;  
3498    
     /* For CHARI (caseless character) we must check the other case. If we have  
     Unicode property support, we can use it to test the other case of  
     high-valued characters. We know that next can have only one other case,  
     because multi-other-case characters are dealt with above. */  
3499    
     case OP_CHARI:  
     if (c == next) return FALSE;  
 #ifdef SUPPORT_UTF  
     if (utf)  
       {  
       pcre_uint32 othercase;  
       if (next < 128) othercase = cd->fcc[next]; else  
 #ifdef SUPPORT_UCP  
       othercase = UCD_OTHERCASE(next);  
 #else  
       othercase = NOTACHAR;  
 #endif  
       return c != othercase;  
       }  
     else  
 #endif  /* SUPPORT_UTF */  
     return (c != TABLE_GET(next, cd->fcc, next));  /* Not UTF */  
   
     case OP_NOT:  
     return c == next;  
   
     case OP_NOTI:  
     if (c == next) return TRUE;  
 #ifdef SUPPORT_UTF  
     if (utf)  
       {  
       pcre_uint32 othercase;  
       if (next < 128) othercase = cd->fcc[next]; else  
 #ifdef SUPPORT_UCP  
       othercase = UCD_OTHERCASE(next);  
 #else  
       othercase = NOTACHAR;  
 #endif  
       return c == othercase;  
       }  
     else  
 #endif  /* SUPPORT_UTF */  
     return (c == TABLE_GET(next, cd->fcc, next));  /* Not UTF */  
3500    
3501      /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.  /*************************************************
3502      When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */  *          Check POSIX class name                *
3503    *************************************************/
3504    
3505    /* This function is called to check the name given in a POSIX-style class entry
3506    such as [:alnum:].
3507    
3508      case OP_DIGIT:  Arguments:
3509      return next > 255 || (cd->ctypes[next] & ctype_digit) == 0;    ptr        points to the first letter
3510      len        the length of the name
3511    
3512      case OP_NOT_DIGIT:  Returns:     a value representing the name, or -1 if unknown
3513      return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0;  */
3514    
3515      case OP_WHITESPACE:  static int
3516      return next > 255 || (cd->ctypes[next] & ctype_space) == 0;  check_posix_name(const pcre_uchar *ptr, int len)
3517    {
3518    const char *pn = posix_names;
3519    register int yield = 0;
3520    while (posix_name_lengths[yield] != 0)
3521      {
3522      if (len == posix_name_lengths[yield] &&
3523        STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
3524      pn += posix_name_lengths[yield] + 1;
3525      yield++;
3526      }
3527    return -1;
3528    }
3529    
     case OP_NOT_WHITESPACE:  
     return next <= 255 && (cd->ctypes[next] & ctype_space) != 0;  
3530    
3531      case OP_WORDCHAR:  /*************************************************
3532      return next > 255 || (cd->ctypes[next] & ctype_word) == 0;  *    Adjust OP_RECURSE items in repeated group   *
3533    *************************************************/
3534    
3535      case OP_NOT_WORDCHAR:  /* OP_RECURSE items contain an offset from the start of the regex to the group
3536      return next <= 255 && (cd->ctypes[next] & ctype_word) != 0;  that is referenced. This means that groups can be replicated for fixed
3537    repetition simply by copying (because the recursion is allowed to refer to
3538    earlier groups that are outside the current group). However, when a group is
3539    optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
3540    inserted before it, after it has been compiled. This means that any OP_RECURSE
3541    items within it that refer to the group itself or any contained groups have to
3542    have their offsets adjusted. That is one of the jobs of this function. Before it
3543    is called, the partially compiled regex must be temporarily terminated with
3544    OP_END.
3545    
3546      case OP_HSPACE:  This function has been extended with the possibility of forward references for
3547      case OP_NOT_HSPACE:  recursions and subroutine calls. It must also check the list of such references
3548      switch(next)  for the group we are dealing with. If it finds that one of the recursions in
3549        {  the current group is on this list, it adjusts the offset in the list, not the
3550        HSPACE_CASES:  value in the reference (which is a group number).
       return op_code == OP_NOT_HSPACE;  
3551    
3552        default:  Arguments:
3553        return op_code != OP_NOT_HSPACE;    group      points to the start of the group
3554        }    adjust     the amount by which the group is to be moved
3555      utf        TRUE in UTF-8 / UTF-16 / UTF-32 mode
3556      cd         contains pointers to tables etc.
3557      save_hwm   the hwm forward reference pointer at the start of the group
3558    
3559      case OP_ANYNL:  Returns:     nothing
3560      case OP_VSPACE:  */
     case OP_NOT_VSPACE:  
     switch(next)  
       {  
       VSPACE_CASES:  
       return op_code == OP_NOT_VSPACE;  
3561    
3562        default:  static void
3563        return op_code != OP_NOT_VSPACE;  adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
3564        }    pcre_uchar *save_hwm)
3565    {
3566    pcre_uchar *ptr = group;
3567    
3568  #ifdef SUPPORT_UCP  while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
3569      case OP_PROP:    {
3570      return check_char_prop(next, previous[0], previous[1], FALSE);    int offset;
3571      pcre_uchar *hc;
3572    
3573      case OP_NOTPROP:    /* See if this recursion is on the forward reference list. If so, adjust the
3574      return check_char_prop(next, previous[0], previous[1], TRUE);    reference. */
 #endif  
3575    
3576      default:    for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
3577      return FALSE;      {
3578        offset = (int)GET(hc, 0);
3579        if (cd->start_code + offset == ptr + 1)
3580          {
3581          PUT(hc, 0, offset + adjust);
3582          break;
3583          }
3584      }      }
   }  
3585    
3586  /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP    /* Otherwise, adjust the recursion offset if it's after the start of this
3587  is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are    group. */
 generated only when PCRE_UCP is *not* set, that is, when only ASCII  
 characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are  
 replaced by OP_PROP codes when PCRE_UCP is set. */  
3588    
3589  switch(op_code)    if (hc >= cd->hwm)
   {  
   case OP_CHAR:  
   case OP_CHARI:  
   switch(escape)  
3590      {      {
3591      case ESC_d:      offset = (int)GET(ptr, 1);
3592      return c > 255 || (cd->ctypes[c] & ctype_digit) == 0;      if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
3593        }
     case ESC_D:  
     return c <= 255 && (cd->ctypes[c] & ctype_digit) != 0;  
   
     case ESC_s:  
     return c > 255 || (cd->ctypes[c] & ctype_space) == 0;  
3594    
3595      case ESC_S:    ptr += 1 + LINK_SIZE;
3596      return c <= 255 && (cd->ctypes[c] & ctype_space) != 0;    }
3597    }
3598    
     case ESC_w:  
     return c > 255 || (cd->ctypes[c] & ctype_word) == 0;  
3599    
     case ESC_W:  
     return c <= 255 && (cd->ctypes[c] & ctype_word) != 0;  
3600    
3601      case ESC_h:  /*************************************************
3602      case ESC_H:  *        Insert an automatic callout point       *
3603      switch(c)  *************************************************/
       {  
       HSPACE_CASES:  
       return escape != ESC_h;  
   
       default:  
       return escape == ESC_h;  
       }  
3604    
3605      case ESC_v:  /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
3606      case ESC_V:  callout points before each pattern item.
     switch(c)  
       {  
       VSPACE_CASES:  
       return escape != ESC_v;  
3607    
3608        default:  Arguments:
3609        return escape == ESC_v;    code           current code pointer
3610        }    ptr            current pattern pointer
3611      cd             pointers to tables etc
3612    
3613      /* When PCRE_UCP is set, these values get generated for \d etc. Find  Returns:         new code pointer
3614      their substitutions and process them. The result will always be either  */
     ESC_p or ESC_P. Then fall through to process those values. */  
3615    
3616  #ifdef SUPPORT_UCP  static pcre_uchar *
3617      case ESC_du:  auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
3618      case ESC_DU:  {
3619      case ESC_wu:  *code++ = OP_CALLOUT;
3620      case ESC_WU:  *code++ = 255;
3621      case ESC_su:  PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
3622      case ESC_SU:  PUT(code, LINK_SIZE, 0);                       /* Default length */
3623        {  return code + 2 * LINK_SIZE;
3624        int temperrorcode = 0;  }
       ptr = substitutes[escape - ESC_DU];  
       escape = check_escape(&ptr, &next, &temperrorcode, 0, options, FALSE);  
       if (temperrorcode != 0) return FALSE;  
       ptr++;    /* For compatibility */  
       }  
     /* Fall through */  
3625    
     case ESC_p:  
     case ESC_P:  
       {  
       int ptype, pdata, errorcodeptr;  
       BOOL negated;  
3626    
       ptr--;      /* Make ptr point at the p or P */  
       ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);  
       if (ptype < 0) return FALSE;  
       ptr++;      /* Point past the final curly ket */  
3627    
3628        /* If the property item is optional, we have to give up. (When generated  /*************************************************
3629        from \d etc by PCRE_UCP, this test will have been applied much earlier,  *         Complete a callout item                *
3630        to the original \d etc. At this point, ptr will point to a zero byte. */  *************************************************/
3631    
3632        if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||  /* A callout item contains the length of the next item in the pattern, which
3633          STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)  we can't fill in till after we have reached the relevant point. This is used
3634            return FALSE;  for both automatic and manual callouts.
3635    
3636        /* Do the property check. */  Arguments:
3637      previous_callout   points to previous callout item
3638      ptr                current pattern pointer
3639      cd                 pointers to tables etc
3640    
3641        return check_char_prop(c, ptype, pdata, (escape == ESC_P) != negated);  Returns:             nothing
3642        }  */
 #endif  
3643    
3644      default:  static void
3645      return FALSE;  complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
3646      }  {
3647    int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
3648    PUT(previous_callout, 2 + LINK_SIZE, length);
3649    }
3650    
   /* In principle, support for Unicode properties should be integrated here as  
   well. It means re-organizing the above code so as to get hold of the property  
   values before switching on the op-code. However, I wonder how many patterns  
   combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,  
   these op-codes are never generated.) */  
3651    
   case OP_DIGIT:  
   return escape == ESC_D || escape == ESC_s || escape == ESC_W ||  
          escape == ESC_h || escape == ESC_v || escape == ESC_R;  
3652    
3653    case OP_NOT_DIGIT:  #ifdef SUPPORT_UCP
3654    return escape == ESC_d;  /*************************************************
3655    *           Get othercase range                  *
3656    *************************************************/
3657    
3658    case OP_WHITESPACE:  /* This function is passed the start and end of a class range, in UTF-8 mode
3659    return escape == ESC_S || escape == ESC_d || escape == ESC_w;  with UCP support. It searches up the characters, looking for ranges of
3660    characters in the "other" case. Each call returns the next one, updating the
3661    start address. A character with multiple other cases is returned on its own
3662    with a special return value.
3663    
3664    case OP_NOT_WHITESPACE:  Arguments:
3665    return escape == ESC_s || escape == ESC_h || escape == ESC_v || escape == ESC_R;    cptr        points to starting character value; updated
3666      d           end value
3667      ocptr       where to put start of othercase range
3668      odptr       where to put end of othercase range
3669    
3670    case OP_HSPACE:  Yield:        -1 when no more
3671    return escape == ESC_S || escape == ESC_H || escape == ESC_d ||                 0 when a range is returned
3672           escape == ESC_w || escape == ESC_v || escape == ESC_R;                >0 the CASESET offset for char with multiple other cases
3673                    in this case, ocptr contains the original
3674    */
3675    
3676    case OP_NOT_HSPACE:  static int
3677    return escape == ESC_h;  get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
3678      pcre_uint32 *odptr)
3679    {
3680    pcre_uint32 c, othercase, next;
3681    unsigned int co;
3682    
3683    /* Can't have \S in here because VT matches \S (Perl anomaly) */  /* Find the first character that has an other case. If it has multiple other
3684    case OP_ANYNL:  cases, return its case offset value. */
   case OP_VSPACE:  
   return escape == ESC_V || escape == ESC_d || escape == ESC_w;  
3685    
3686    case OP_NOT_VSPACE:  for (c = *cptr; c <= d; c++)
3687    return escape == ESC_v || escape == ESC_R;    {
3688      if ((co = UCD_CASESET(c)) != 0)
3689        {
3690        *ocptr = c++;   /* Character that has the set */
3691        *cptr = c;      /* Rest of input range */
3692        return (int)co;
3693        }
3694      if ((othercase = UCD_OTHERCASE(c)) != c) break;
3695      }
3696    
3697    case OP_WORDCHAR:  if (c > d) return -1;  /* Reached end of range */
   return escape == ESC_W || escape == ESC_s || escape == ESC_h ||  
          escape == ESC_v || escape == ESC_R;  
3698    
3699    case OP_NOT_WORDCHAR:  *ocptr = othercase;
3700    return escape == ESC_w || escape == ESC_d;  next = othercase + 1;
3701    
3702    default:  for (++c; c <= d; c++)
3703    return FALSE;    {
3704      if (UCD_OTHERCASE(c) != next) break;
3705      next++;
3706    }    }
3707    
3708  /* Control does not reach here */  *odptr = next - 1;     /* End of othercase range */
3709    *cptr = c;             /* Rest of input range */
3710    return 0;
3711  }  }
3712    #endif  /* SUPPORT_UCP */
3713    
3714    
3715    
# Line 3418  switch(op_code) Line 3718  switch(op_code)
3718  *************************************************/  *************************************************/
3719    
3720  /* This function packages up the logic of adding a character or range of  /* This function packages up the logic of adding a character or range of
3721  characters to a class. The character values in the arguments will be within the  characters to a class. The character values in the arguments will be within the
3722  valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is  valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
3723  mutually recursive with the function immediately below.  mutually recursive with the function immediately below.
3724    
3725  Arguments:  Arguments:
3726    classbits     the bit map for characters < 256    classbits     the bit map for characters < 256
3727    uchardptr     points to the pointer for extra data    uchardptr     points to the pointer for extra data
3728    options       the options word    options       the options word
3729    cd            contains pointers to tables etc.    cd            contains pointers to tables etc.
3730    start         start of range character    start         start of range character
3731    end           end of range character    end           end of range character
3732    
3733  Returns:        the number of < 256 characters added  Returns:        the number of < 256 characters added
3734                  the pointer to extra data is updated                  the pointer to extra data is updated
3735  */  */
# Line 3441  add_to_class(pcre_uint8 *classbits, pcre Line 3741  add_to_class(pcre_uint8 *classbits, pcre
3741  pcre_uint32 c;  pcre_uint32 c;
3742  int n8 = 0;  int n8 = 0;
3743    
3744  /* If caseless matching is required, scan the range and process alternate  /* If caseless matching is required, scan the range and process alternate
3745  cases. In Unicode, there are 8-bit characters that have alternate cases that  cases. In Unicode, there are 8-bit characters that have alternate cases that
3746  are greater than 255 and vice-versa. Sometimes we can just extend the original  are greater than 255 and vice-versa. Sometimes we can just extend the original
3747  range. */  range. */
3748    
3749  if ((options & PCRE_CASELESS) != 0)  if ((options & PCRE_CASELESS) != 0)
3750    {    {
3751  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3752    if ((options & PCRE_UTF8) != 0)    if ((options & PCRE_UTF8) != 0)
3753      {      {
3754      int rc;      int rc;
3755      pcre_uint32 oc, od;      pcre_uint32 oc, od;
3756    
3757      options &= ~PCRE_CASELESS;   /* Remove for recursive calls */      options &= ~PCRE_CASELESS;   /* Remove for recursive calls */
3758      c = start;      c = start;
3759    
3760      while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)      while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
3761        {        {
3762        /* Handle a single character that has more than one other case. */        /* Handle a single character that has more than one other case. */
3763    
3764        if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,        if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
3765          PRIV(ucd_caseless_sets) + rc, oc);          PRIV(ucd_caseless_sets) + rc, oc);
3766    
3767        /* Do nothing if the other case range is within the original range. */        /* Do nothing if the other case range is within the original range. */
3768    
3769        else if (oc >= start && od <= end) continue;        else if (oc >= start && od <= end) continue;
3770    
3771        /* Extend the original range if there is overlap, noting that if oc < c, we        /* Extend the original range if there is overlap, noting that if oc < c, we
3772        can't have od > end because a subrange is always shorter than the basic        can't have od > end because a subrange is always shorter than the basic
3773        range. Otherwise, use a recursive call to add the additional range. */        range. Otherwise, use a recursive call to add the additional range. */
3774    
3775        else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */        else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
3776        else if (od > end && oc <= end + 1) end = od;       /* Extend upwards */        else if (od > end && oc <= end + 1) end = od;       /* Extend upwards */
3777        else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);        else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
# Line 3481  if ((options & PCRE_CASELESS) != 0) Line 3781  if ((options & PCRE_CASELESS) != 0)
3781  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
3782    
3783    /* Not UTF-mode, or no UCP */    /* Not UTF-mode, or no UCP */
3784    
3785    for (c = start; c <= end && c < 256; c++)    for (c = start; c <= end && c < 256; c++)
3786      {      {
3787      SETBIT(classbits, cd->fcc[c]);      SETBIT(classbits, cd->fcc[c]);
3788      n8++;      n8++;
3789      }      }
3790    }    }
3791    
3792  /* Now handle the original range. Adjust the final value according to the bit  /* Now handle the original range. Adjust the final value according to the bit
3793  length - this means that the same lists of (e.g.) horizontal spaces can be used  length - this means that the same lists of (e.g.) horizontal spaces can be used
3794  in all cases. */  in all cases. */
# Line 3514  if (end < 0x100) Line 3814  if (end < 0x100)
3814    {    {
3815    for (c = start; c <= end; c++)    for (c = start; c <= end; c++)
3816      {      {
3817      n8++;      n8++;
3818      SETBIT(classbits, c);      SETBIT(classbits, c);
3819      }      }
3820    }    }
3821    
3822  else  else
3823    {    {
3824    pcre_uchar *uchardata = *uchardptr;    pcre_uchar *uchardata = *uchardptr;
3825    
3826  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
3827    if ((options & PCRE_UTF8) != 0)  /* All UTFs use the same flag bit */    if ((options & PCRE_UTF8) != 0)  /* All UTFs use the same flag bit */
3828      {      {
3829      if (start < end)      if (start < end)
3830        {        {
3831        *uchardata++ = XCL_RANGE;        *uchardata++ = XCL_RANGE;
3832        uchardata += PRIV(ord2utf)(start, uchardata);        uchardata += PRIV(ord2utf)(start, uchardata);
3833        uchardata += PRIV(ord2utf)(end, uchardata);        uchardata += PRIV(ord2utf)(end, uchardata);
3834        }        }
3835      else if (start == end)      else if (start == end)
3836        {        {
3837        *uchardata++ = XCL_SINGLE;        *uchardata++ = XCL_SINGLE;
3838        uchardata += PRIV(ord2utf)(start, uchardata);        uchardata += PRIV(ord2utf)(start, uchardata);
3839        }        }
3840      }      }
3841    else    else
3842  #endif  /* SUPPORT_UTF */  #endif  /* SUPPORT_UTF */
3843    
3844    /* Without UTF support, character values are constrained by the bit length,    /* Without UTF support, character values are constrained by the bit length,
3845    and can only be > 256 for 16-bit and 32-bit libraries. */    and can only be > 256 for 16-bit and 32-bit libraries. */
3846    
3847  #ifdef COMPILE_PCRE8  #ifdef COMPILE_PCRE8
3848      {}      {}
3849  #else  #else
3850    if (start < end)    if (start < end)
3851      {      {
3852      *uchardata++ = XCL_RANGE;      *uchardata++ = XCL_RANGE;
# Line 3557  else Line 3857  else
3857      {      {
3858      *uchardata++ = XCL_SINGLE;      *uchardata++ = XCL_SINGLE;
3859      *uchardata++ = start;      *uchardata++ = start;
3860      }      }
3861  #endif  #endif
3862    
3863    *uchardptr = uchardata;   /* Updata extra data pointer */    *uchardptr = uchardata;   /* Updata extra data pointer */
3864    }    }
3865    
3866  return n8;    /* Number of 8-bit characters */  return n8;    /* Number of 8-bit characters */
3867  }  }
3868    
3869    
3870    
3871    
3872  /*************************************************  /*************************************************
3873  *        Add a list of characters to a class     *  *        Add a list of characters to a class     *
3874  *************************************************/  *************************************************/
3875    
3876  /* This function is used for adding a list of case-equivalent characters to a  /* This function is used for adding a list of case-equivalent characters to a
3877  class, and also for adding a list of horizontal or vertical whitespace. If the  class, and also for adding a list of horizontal or vertical whitespace. If the
3878  list is in order (which it should be), ranges of characters are detected and  list is in order (which it should be), ranges of characters are detected and
3879  handled appropriately. This function is mutually recursive with the function  handled appropriately. This function is mutually recursive with the function
# Line 3583  Arguments: Line 3883  Arguments:
3883    classbits     the bit map for characters < 256    classbits     the bit map for characters < 256
3884    uchardptr     points to the pointer for extra data    uchardptr     points to the pointer for extra data
3885    options       the options word    options       the options word
3886    cd            contains pointers to tables etc.    cd            contains pointers to tables etc.
3887    p             points to row of 32-bit values, terminated by NOTACHAR    p             points to row of 32-bit values, terminated by NOTACHAR
3888    except        character to omit; this is used when adding lists of    except        character to omit; this is used when adding lists of
3889                    case-equivalent characters to avoid including the one we                    case-equivalent characters to avoid including the one we
3890                    already know about                    already know about
3891    
3892  Returns:        the number of < 256 characters added  Returns:        the number of < 256 characters added
3893                  the pointer to extra data is updated                  the pointer to extra data is updated
3894  */  */
# Line 3602  while (p[0] < NOTACHAR) Line 3902  while (p[0] < NOTACHAR)
3902    {    {
3903    int n = 0;    int n = 0;
3904    if (p[0] != except)    if (p[0] != except)
3905      {      {
3906      while(p[n+1] == p[0] + n + 1) n++;      while(p[n+1] == p[0] + n + 1) n++;
3907      n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);      n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
3908      }      }
3909    p += n + 1;    p += n + 1;
3910    }    }
3911  return n8;  return n8;
3912  }  }
3913    
3914    
3915    
# Line 3624  Arguments: Line 3924  Arguments:
3924    classbits     the bit map for characters < 256    classbits     the bit map for characters < 256
3925    uchardptr     points to the pointer for extra data    uchardptr     points to the pointer for extra data
3926    options       the options word    options       the options word
3927    cd            contains pointers to tables etc.    cd            contains pointers to tables etc.
3928    p             points to row of 32-bit values, terminated by NOTACHAR    p             points to row of 32-bit values, terminated by NOTACHAR
3929    
3930  Returns:        the number of < 256 characters added  Returns:        the number of < 256 characters added
3931                  the pointer to extra data is updated                  the pointer to extra data is updated
3932  */  */
3933    
3934  static int  static int
3935  add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,  add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
3936    int options, compile_data *cd, const pcre_uint32 *p)    int options, compile_data *cd, const pcre_uint32 *p)
3937  {  {
3938  BOOL utf = (options & PCRE_UTF8) != 0;  BOOL utf = (options & PCRE_UTF8) != 0;
# Line 3644  while (p[0] < NOTACHAR) Line 3944  while (p[0] < NOTACHAR)
3944    while (p[1] == p[0] + 1) p++;    while (p[1] == p[0] + 1) p++;
3945    n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,    n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
3946      (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);      (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
3947    p++;    p++;
3948    }    }
3949  return n8;  return n8;
3950  }  }
3951    
3952    
3953    
# Line 3662  to find out the amount of memory needed, Line 3962  to find out the amount of memory needed,
3962  phase. The value of lengthptr distinguishes the two phases.  phase. The value of lengthptr distinguishes the two phases.
3963    
3964  Arguments:  Arguments:
3965    optionsptr     pointer to the option bits    optionsptr        pointer to the option bits
3966    codeptr        points to the pointer to the current code point    codeptr           points to the pointer to the current code point
3967    ptrptr         points to the current pattern pointer    ptrptr            points to the current pattern pointer
3968    errorcodeptr   points to error code variable    errorcodeptr      points to error code variable
3969    firstcharptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    firstcharptr      place to put the first required character
3970    reqcharptr     set to the last literal character required, else < 0    firstcharflagsptr place to put the first character flags, or a negative number
3971    bcptr          points to current branch chain    reqcharptr        place to put the last required character
3972    cond_depth     conditional nesting depth    reqcharflagsptr   place to put the last required character flags, or a negative number
3973    cd             contains pointers to tables etc.    bcptr             points to current branch chain
3974    lengthptr      NULL during the real compile phase    cond_depth        conditional nesting depth
3975                   points to length accumulator during pre-compile phase    cd                contains pointers to tables etc.
3976      lengthptr         NULL during the real compile phase
3977                        points to length accumulator during pre-compile phase
3978    
3979  Returns:         TRUE on success  Returns:            TRUE on success
3980                   FALSE, with *errorcodeptr set non-zero on error                      FALSE, with *errorcodeptr set non-zero on error
3981  */  */
3982    
3983  static BOOL  static BOOL
3984  compile_branch(int *optionsptr, pcre_uchar **codeptr,  compile_branch(int *optionsptr, pcre_uchar **codeptr,
3985    const pcre_uchar **ptrptr, int *errorcodeptr, pcre_int32 *firstcharptr,    const pcre_uchar **ptrptr, int *errorcodeptr,
3986    pcre_int32 *reqcharptr, branch_chain *bcptr, int cond_depth,    pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
3987      pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
3988      branch_chain *bcptr, int cond_depth,
3989    compile_data *cd, int *lengthptr)    compile_data *cd, int *lengthptr)
3990  {  {
3991  int repeat_type, op_type;  int repeat_type, op_type;
3992  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
3993  int bravalue = 0;  int bravalue = 0;
3994  int greedy_default, greedy_non_default;  int greedy_default, greedy_non_default;
3995  pcre_int32 firstchar, reqchar;  pcre_uint32 firstchar, reqchar;
3996  pcre_int32 zeroreqchar, zerofirstchar;  pcre_int32 firstcharflags, reqcharflags;
3997    pcre_uint32 zeroreqchar, zerofirstchar;
3998    pcre_int32 zeroreqcharflags, zerofirstcharflags;
3999  pcre_int32 req_caseopt, reqvary, tempreqvary;  pcre_int32 req_caseopt, reqvary, tempreqvary;
4000  int options = *optionsptr;               /* May change dynamically */  int options = *optionsptr;               /* May change dynamically */
4001  int after_manual_callout = 0;  int after_manual_callout = 0;
# Line 3717  dynamically as we process the pattern. * Line 4023  dynamically as we process the pattern. *
4023  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
4024  /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */  /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
4025  BOOL utf = (options & PCRE_UTF8) != 0;  BOOL utf = (options & PCRE_UTF8) != 0;
4026    #ifndef COMPILE_PCRE32
4027  pcre_uchar utf_chars[6];  pcre_uchar utf_chars[6];
4028    #endif
4029  #else  #else
4030  BOOL utf = FALSE;  BOOL utf = FALSE;
4031  #endif  #endif
4032    
4033  /* Helper variables for OP_XCLASS opcode (for characters > 255). We define  /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
4034  class_uchardata always so that it can be passed to add_to_class() always,  class_uchardata always so that it can be passed to add_to_class() always,
4035  though it will not be used in non-UTF 8-bit cases. This avoids having to supply  though it will not be used in non-UTF 8-bit cases. This avoids having to supply
4036  alternative calls for the different cases. */  alternative calls for the different cases. */
4037    
4038  pcre_uchar *class_uchardata;  pcre_uchar *class_uchardata;
# Line 3752  to take the zero repeat into account. Th Line 4060  to take the zero repeat into account. Th
4060  zerofirstchar and zeroreqchar when such a repeat is encountered. The individual  zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
4061  item types that can be repeated set these backoff variables appropriately. */  item types that can be repeated set these backoff variables appropriately. */
4062    
4063  firstchar = reqchar = zerofirstchar = zeroreqchar = REQ_UNSET;  firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
4064    firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
4065    
4066  /* The variable req_caseopt contains either the REQ_CASELESS value  /* The variable req_caseopt contains either the REQ_CASELESS value
4067  or zero, according to the current setting of the caseless flag. The  or zero, according to the current setting of the caseless flag. The
# Line 3778  for (;; ptr++) Line 4087  for (;; ptr++)
4087    int recno;    int recno;
4088    int refsign;    int refsign;
4089    int skipbytes;    int skipbytes;
4090    int subreqchar;    pcre_uint32 subreqchar, subfirstchar;
4091    int subfirstchar;    pcre_int32 subreqcharflags, subfirstcharflags;
4092    int terminator;    int terminator;
4093    int mclength;    unsigned int mclength;
4094    int tempbracount;    unsigned int tempbracount;
4095    int ec; // FIXMEchpe pcre_uint32    pcre_uint32 ec;
4096    pcre_uchar mcbuffer[8];    pcre_uchar mcbuffer[8];
4097    
4098    /* Get next character in the pattern */    /* Get next character in the pattern */
# Line 3793  for (;; ptr++) Line 4102  for (;; ptr++)
4102    /* If we are at the end of a nested substitution, revert to the outer level    /* If we are at the end of a nested substitution, revert to the outer level
4103    string. Nesting only happens one level deep. */    string. Nesting only happens one level deep. */
4104    
4105    if (c == 0 && nestptr != NULL)    if (c == CHAR_NULL && nestptr != NULL)
4106      {      {
4107      ptr = nestptr;      ptr = nestptr;
4108      nestptr = NULL;      nestptr = NULL;
# Line 3868  for (;; ptr++) Line 4177  for (;; ptr++)
4177    
4178    /* If in \Q...\E, check for the end; if not, we have a literal */    /* If in \Q...\E, check for the end; if not, we have a literal */
4179    
4180    if (inescq && c != 0)    if (inescq && c != CHAR_NULL)
4181      {      {
4182      if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)      if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4183        {        {
# Line 3916  for (;; ptr++) Line 4225  for (;; ptr++)
4225      if (c == CHAR_NUMBER_SIGN)      if (c == CHAR_NUMBER_SIGN)
4226        {        {
4227        ptr++;        ptr++;
4228        while (*ptr != 0)        while (*ptr != CHAR_NULL)
4229          {          {
4230          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
4231          ptr++;          ptr++;
# Line 3924  for (;; ptr++) Line 4233  for (;; ptr++)
4233          if (utf) FORWARDCHAR(ptr);          if (utf) FORWARDCHAR(ptr);
4234  #endif  #endif
4235          }          }
4236        if (*ptr != 0) continue;        if (*ptr != CHAR_NULL) continue;
4237    
4238        /* Else fall through to handle end of string */        /* Else fall through to handle end of string */
4239        c = 0;        c = 0;
# Line 3946  for (;; ptr++) Line 4255  for (;; ptr++)
4255      case CHAR_VERTICAL_LINE:       /* or | or ) */      case CHAR_VERTICAL_LINE:       /* or | or ) */
4256      case CHAR_RIGHT_PARENTHESIS:      case CHAR_RIGHT_PARENTHESIS:
4257      *firstcharptr = firstchar;      *firstcharptr = firstchar;
4258        *firstcharflagsptr = firstcharflags;
4259      *reqcharptr = reqchar;      *reqcharptr = reqchar;
4260        *reqcharflagsptr = reqcharflags;
4261      *codeptr = code;      *codeptr = code;
4262      *ptrptr = ptr;      *ptrptr = ptr;
4263      if (lengthptr != NULL)      if (lengthptr != NULL)
# Line 3970  for (;; ptr++) Line 4281  for (;; ptr++)
4281      previous = NULL;      previous = NULL;
4282      if ((options & PCRE_MULTILINE) != 0)      if ((options & PCRE_MULTILINE) != 0)
4283        {        {
4284        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;        if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4285        *code++ = OP_CIRCM;        *code++ = OP_CIRCM;
4286        }        }
4287      else *code++ = OP_CIRC;      else *code++ = OP_CIRC;
# Line 3985  for (;; ptr++) Line 4296  for (;; ptr++)
4296      repeats. The value of reqchar doesn't change either. */      repeats. The value of reqchar doesn't change either. */
4297    
4298      case CHAR_DOT:      case CHAR_DOT:
4299      if (firstchar == REQ_UNSET) firstchar = REQ_NONE;      if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4300      zerofirstchar = firstchar;      zerofirstchar = firstchar;
4301        zerofirstcharflags = firstcharflags;
4302      zeroreqchar = reqchar;      zeroreqchar = reqchar;
4303        zeroreqcharflags = reqcharflags;
4304      previous = code;      previous = code;
4305      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4306      break;      break;
# Line 4061  for (;; ptr++) Line 4374  for (;; ptr++)
4374          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4375        {        {
4376        *code++ = negate_class? OP_ALLANY : OP_FAIL;        *code++ = negate_class? OP_ALLANY : OP_FAIL;
4377        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;        if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4378        zerofirstchar = firstchar;        zerofirstchar = firstchar;
4379          zerofirstcharflags = firstcharflags;
4380        break;        break;
4381        }        }
4382    
# Line 4097  for (;; ptr++) Line 4411  for (;; ptr++)
4411      means that an initial ] is taken as a data character. At the start of the      means that an initial ] is taken as a data character. At the start of the
4412      loop, c contains the first byte of the character. */      loop, c contains the first byte of the character. */
4413    
4414      if (c != 0) do      if (c != CHAR_NULL) do
4415        {        {
4416        const pcre_uchar *oldptr;        const pcre_uchar *oldptr;
4417    
4418  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32  #ifdef SUPPORT_UTF
4419        if (utf && HAS_EXTRALEN(c))        if (utf && HAS_EXTRALEN(c))
4420          {                           /* Braces are required because the */          {                           /* Braces are required because the */
4421          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
# Line 4112  for (;; ptr++) Line 4426  for (;; ptr++)
4426        /* In the pre-compile phase, accumulate the length of any extra        /* In the pre-compile phase, accumulate the length of any extra
4427        data and reset the pointer. This is so that very large classes that        data and reset the pointer. This is so that very large classes that
4428        contain a zillion > 255 characters no longer overwrite the work space        contain a zillion > 255 characters no longer overwrite the work space
4429        (which is on the stack). We have to remember that there was XCLASS data,        (which is on the stack). We have to remember that there was XCLASS data,
4430        however. */        however. */
4431    
4432        if (lengthptr != NULL && class_uchardata > class_uchardata_base)        if (lengthptr != NULL && class_uchardata > class_uchardata_base)
# Line 4176  for (;; ptr++) Line 4490  for (;; ptr++)
4490          alpha. This relies on the fact that the class table starts with          alpha. This relies on the fact that the class table starts with
4491          alpha, lower, upper as the first 3 entries. */          alpha, lower, upper as the first 3 entries. */
4492    
4493          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
4494            posix_class = 0;            posix_class = 0;
4495    
4496          /* When PCRE_UCP is set, some of the POSIX classes are converted to          /* When PCRE_UCP is set, some of the POSIX classes are converted to
# Line 4253  for (;; ptr++) Line 4567  for (;; ptr++)
4567    
4568        if (c == CHAR_BACKSLASH)        if (c == CHAR_BACKSLASH)
4569          {          {
4570          escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options, TRUE);          escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
4571              TRUE);
4572          if (*errorcodeptr != 0) goto FAILED;          if (*errorcodeptr != 0) goto FAILED;
4573            if (escape == 0) c = ec;
         if (escape == 0)  
           c = ec;  
4574          else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */          else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
4575          else if (escape == ESC_N)            /* \N is not supported in a class */          else if (escape == ESC_N)          /* \N is not supported in a class */
4576            {            {
4577            *errorcodeptr = ERR71;            *errorcodeptr = ERR71;
4578            goto FAILED;            goto FAILED;
# Line 4316  for (;; ptr++) Line 4628  for (;; ptr++)
4628              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
4629              continue;              continue;
4630    
4631              /* Perl 5.004 onwards omits VT from \s, but we must preserve it              /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
4632              if it was previously set by something earlier in the character              5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
4633              class. Luckily, the value of CHAR_VT is 0x0b in both ASCII and              previously set by something earlier in the character class.
4634              EBCDIC, so we lazily just adjust the appropriate bit. */              Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
4635                we could just adjust the appropriate bit. From PCRE 8.34 we no
4636                longer treat \s and \S specially. */
4637    
4638              case ESC_s:              case ESC_s:
4639              classbits[0] |= cbits[cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
             classbits[1] |= cbits[cbit_space+1] & ~0x08;  
             for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];  
4640              continue;              continue;
4641    
4642              case ESC_S:              case ESC_S:
4643              should_flip_negation = TRUE;              should_flip_negation = TRUE;
4644              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */  
4645              continue;              continue;
4646    
4647              /* The rest apply in both UCP and non-UCP cases. */              /* The rest apply in both UCP and non-UCP cases. */
4648    
4649              case ESC_h:              case ESC_h:
4650              (void)add_list_to_class(classbits, &class_uchardata, options, cd,              (void)add_list_to_class(classbits, &class_uchardata, options, cd,
4651                PRIV(hspace_list), NOTACHAR);                PRIV(hspace_list), NOTACHAR);
4652              continue;              continue;
4653    
4654              case ESC_H:              case ESC_H:
4655              (void)add_not_list_to_class(classbits, &class_uchardata, options,              (void)add_not_list_to_class(classbits, &class_uchardata, options,
4656                cd, PRIV(hspace_list));                cd, PRIV(hspace_list));
4657              continue;              continue;
4658    
4659              case ESC_v:              case ESC_v:
4660              (void)add_list_to_class(classbits, &class_uchardata, options, cd,              (void)add_list_to_class(classbits, &class_uchardata, options, cd,
4661                PRIV(vspace_list), NOTACHAR);                PRIV(vspace_list), NOTACHAR);
4662              continue;              continue;
4663    
4664              case ESC_V:              case ESC_V:
4665              (void)add_not_list_to_class(classbits, &class_uchardata, options,              (void)add_not_list_to_class(classbits, &class_uchardata, options,
4666                cd, PRIV(vspace_list));                cd, PRIV(vspace_list));
4667              continue;              continue;
4668    
4669  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
# Line 4360  for (;; ptr++) Line 4671  for (;; ptr++)
4671              case ESC_P:              case ESC_P:
4672                {                {
4673                BOOL negated;                BOOL negated;
4674                int pdata;                unsigned int ptype = 0, pdata = 0;
4675                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);                if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
4676                if (ptype < 0) goto FAILED;                  goto FAILED;
4677                *class_uchardata++ = ((escape == ESC_p) != negated)?                *class_uchardata++ = ((escape == ESC_p) != negated)?
4678                  XCL_PROP : XCL_NOTPROP;                  XCL_PROP : XCL_NOTPROP;
4679                *class_uchardata++ = ptype;                *class_uchardata++ = ptype;
# Line 4390  for (;; ptr++) Line 4701  for (;; ptr++)
4701    
4702          /* Fall through if the escape just defined a single character (c >= 0).          /* Fall through if the escape just defined a single character (c >= 0).
4703          This may be greater than 256. */          This may be greater than 256. */
4704    
4705          escape = 0;          escape = 0;
4706    
4707          }   /* End of backslash handling */          }   /* End of backslash handling */
# Line 4416  for (;; ptr++) Line 4727  for (;; ptr++)
4727    
4728        if (!inescq && ptr[1] == CHAR_MINUS)        if (!inescq && ptr[1] == CHAR_MINUS)
4729          {          {
4730          int d;          pcre_uint32 d;
4731          ptr += 2;          ptr += 2;
4732          while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;          while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
4733    
# Line 4431  for (;; ptr++) Line 4742  for (;; ptr++)
4742            inescq = TRUE;            inescq = TRUE;
4743            break;            break;
4744            }            }
4745    
4746          /* Minus (hyphen) at the end of a class is treated as a literal, so put          /* Minus (hyphen) at the end of a class is treated as a literal, so put
4747          back the pointer and jump to handle the character that preceded it. */          back the pointer and jump to handle the character that preceded it. */
4748    
4749          if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))          if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
4750            {            {
4751            ptr = oldptr;            ptr = oldptr;
4752            goto CLASS_SINGLE_CHARACTER;            goto CLASS_SINGLE_CHARACTER;
4753            }            }
4754    
4755          /* Otherwise, we have a potential range; pick up the next character */          /* Otherwise, we have a potential range; pick up the next character */
4756    
4757  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
# Line 4487  for (;; ptr++) Line 4798  for (;; ptr++)
4798          /* We have found a character range, so single character optimizations          /* We have found a character range, so single character optimizations
4799          cannot be done anymore. Any value greater than 1 indicates that there          cannot be done anymore. Any value greater than 1 indicates that there
4800          is more than one character. */          is more than one character. */
4801    
4802          class_one_char = 2;          class_one_char = 2;
4803    
4804          /* Remember an explicit \r or \n, and add the range to the class. */          /* Remember an explicit \r or \n, and add the range to the class. */
4805    
4806          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4807    
4808          class_has_8bitchar +=          class_has_8bitchar +=
4809            add_to_class(classbits, &class_uchardata, options, cd, c, d);            add_to_class(classbits, &class_uchardata, options, cd, c, d);
4810    
4811          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
4812          }          }
4813    
4814        /* Handle a single character - we can get here for a normal non-escape        /* Handle a single character - we can get here for a normal non-escape
4815        char, or after \ that introduces a single character or for an apparent        char, or after \ that introduces a single character or for an apparent
4816        range that isn't. Only the value 1 matters for class_one_char, so don't        range that isn't. Only the value 1 matters for class_one_char, so don't
4817        increase it if it is already 2 or more ... just in case there's a class        increase it if it is already 2 or more ... just in case there's a class
4818        with a zillion characters in it. */        with a zillion characters in it. */
4819    
4820        CLASS_SINGLE_CHARACTER:        CLASS_SINGLE_CHARACTER:
# Line 4522  for (;; ptr++) Line 4833  for (;; ptr++)
4833          {          {
4834          ptr++;          ptr++;
4835          zeroreqchar = reqchar;          zeroreqchar = reqchar;
4836            zeroreqcharflags = reqcharflags;
4837    
4838          if (negate_class)          if (negate_class)
4839            {            {
4840  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
             // FIXMEchpe pcreuint32?  
4841            int d;            int d;
4842  #endif  #endif
4843            if (firstchar == REQ_UNSET) firstchar = REQ_NONE;            if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4844            zerofirstchar = firstchar;            zerofirstchar = firstchar;
4845              zerofirstcharflags = firstcharflags;
4846    
4847            /* For caseless UTF-8 mode when UCP support is available, check            /* For caseless UTF-8 mode when UCP support is available, check
4848            whether this character has more than one other case. If so, generate            whether this character has more than one other case. If so, generate
4849            a special OP_NOTPROP item instead of OP_NOTI. */            a special OP_NOTPROP item instead of OP_NOTI. */
4850    
4851  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4852            if (utf && (options & PCRE_CASELESS) != 0 &&            if (utf && (options & PCRE_CASELESS) != 0 &&
4853                (d = UCD_CASESET(c)) != 0)                (d = UCD_CASESET(c)) != 0)
4854              {              {
4855              *code++ = OP_NOTPROP;              *code++ = OP_NOTPROP;
4856              *code++ = PT_CLIST;              *code++ = PT_CLIST;
4857              *code++ = d;              *code++ = d;
4858              }              }
4859            else            else
4860  #endif  #endif
4861            /* Char has only one other case, or UCP not available */            /* Char has only one other case, or UCP not available */
4862    
# Line 4557  for (;; ptr++) Line 4869  for (;; ptr++)
4869  #endif  #endif
4870                *code++ = c;                *code++ = c;
4871              }              }
4872    
4873            /* We are finished with this character class */            /* We are finished with this character class */
4874    
4875            goto END_CLASS;            goto END_CLASS;
4876            }            }
4877    
# Line 4577  for (;; ptr++) Line 4889  for (;; ptr++)
4889            }            }
4890          goto ONE_CHAR;          goto ONE_CHAR;
4891          }       /* End of 1-char optimization */          }       /* End of 1-char optimization */
4892    
4893        /* There is more than one character in the class, or an XCLASS item        /* There is more than one character in the class, or an XCLASS item
4894        has been generated. Add this character to the class. */        has been generated. Add this character to the class. */
4895    
4896        class_has_8bitchar +=        class_has_8bitchar +=
4897          add_to_class(classbits, &class_uchardata, options, cd, c, c);          add_to_class(classbits, &class_uchardata, options, cd, c, c);
4898        }        }
4899    
# Line 4589  for (;; ptr++) Line 4901  for (;; ptr++)
4901      If we are at the end of an internal nested string, revert to the outer      If we are at the end of an internal nested string, revert to the outer
4902      string. */      string. */
4903    
4904      while (((c = *(++ptr)) != 0 ||      while (((c = *(++ptr)) != CHAR_NULL ||
4905             (nestptr != NULL &&             (nestptr != NULL &&
4906               (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != 0)) &&               (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
4907             (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));             (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
4908    
4909      /* Check for missing terminating ']' */      /* Check for missing terminating ']' */
4910    
4911      if (c == 0)      if (c == CHAR_NULL)
4912        {        {
4913        *errorcodeptr = ERR6;        *errorcodeptr = ERR6;
4914        goto FAILED;        goto FAILED;
4915        }        }
4916    
4917      /* We will need an XCLASS if data has been placed in class_uchardata. In      /* We will need an XCLASS if data has been placed in class_uchardata. In
4918      the second phase this is a sufficient test. However, in the pre-compile      the second phase this is a sufficient test. However, in the pre-compile
4919      phase, class_uchardata gets emptied to prevent workspace overflow, so      phase, class_uchardata gets emptied to prevent workspace overflow, so
4920      only if the very last character in the class needs XCLASS will it contain      only if the very last character in the class needs XCLASS will it contain
4921      anything at this point. For this reason, xclass gets set TRUE above when      anything at this point. For this reason, xclass gets set TRUE above when
4922      class_uchardata is emptied, and that's why this code is the way it is here      class_uchardata is emptied, and that's why this code is the way it is here
4923      instead of just doing a test on class_uchardata below. */      instead of just doing a test on class_uchardata below. */
4924    
4925  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4926      if (class_uchardata > class_uchardata_base) xclass = TRUE;      if (class_uchardata > class_uchardata_base) xclass = TRUE;
4927  #endif  #endif
# Line 4618  for (;; ptr++) Line 4930  for (;; ptr++)
4930      setting, whatever the repeat count. Any reqchar setting must remain      setting, whatever the repeat count. Any reqchar setting must remain
4931      unchanged after any kind of repeat. */      unchanged after any kind of repeat. */
4932    
4933      if (firstchar == REQ_UNSET) firstchar = REQ_NONE;      if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4934      zerofirstchar = firstchar;      zerofirstchar = firstchar;
4935        zerofirstcharflags = firstcharflags;
4936      zeroreqchar = reqchar;      zeroreqchar = reqchar;
4937        zeroreqcharflags = reqcharflags;
4938    
4939      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
4940      extended class, with its own opcode, unless there was a negated special      extended class, with its own opcode, unless there was a negated special
# Line 4676  for (;; ptr++) Line 4990  for (;; ptr++)
4990        memcpy(code, classbits, 32);        memcpy(code, classbits, 32);
4991        }        }
4992      code += 32 / sizeof(pcre_uchar);      code += 32 / sizeof(pcre_uchar);
4993    
4994      END_CLASS:      END_CLASS:
4995      break;      break;
4996    
# Line 4715  for (;; ptr++) Line 5029  for (;; ptr++)
5029      if (repeat_min == 0)      if (repeat_min == 0)
5030        {        {
5031        firstchar = zerofirstchar;    /* Adjust for zero repeat */        firstchar = zerofirstchar;    /* Adjust for zero repeat */
5032          firstcharflags = zerofirstcharflags;
5033        reqchar = zeroreqchar;        /* Ditto */        reqchar = zeroreqchar;        /* Ditto */
5034          reqcharflags = zeroreqcharflags;
5035        }        }
5036    
5037      /* Remember whether this is a variable length repeat */      /* Remember whether this is a variable length repeat */
# Line 4818  for (;; ptr++) Line 5134  for (;; ptr++)
5134          {          {
5135          c = code[-1];          c = code[-1];
5136          if (*previous <= OP_CHARI && repeat_min > 1)          if (*previous <= OP_CHARI && repeat_min > 1)
5137            reqchar = c | req_caseopt | cd->req_varyopt;            {
5138          }            reqchar = c;
5139              reqcharflags = req_caseopt | cd->req_varyopt;
5140        /* If the repetition is unlimited, it pays to see if the next thing on            }
       the line is something that cannot possibly match this character. If so,  
       automatically possessifying this item gains some performance in the case  
       where the match fails. */  
   
       if (!possessive_quantifier &&  
           repeat_max < 0 &&  
           check_auto_possessive(previous, utf, ptr + 1, options, cd))  
         {  
         repeat_type = 0;    /* Force greedy */  
         possessive_quantifier = TRUE;  
5141          }          }
5142    
5143        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
# Line 4851  for (;; ptr++) Line 5157  for (;; ptr++)
5157        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
5158        c = *previous;        c = *previous;
5159    
       if (!possessive_quantifier &&  
           repeat_max < 0 &&  
           check_auto_possessive(previous, utf, ptr + 1, options, cd))  
         {  
         repeat_type = 0;    /* Force greedy */  
         possessive_quantifier = TRUE;  
         }  
   
5160        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
5161        if (*previous == OP_PROP || *previous == OP_NOTPROP)        if (*previous == OP_PROP || *previous == OP_NOTPROP)
5162          {          {
# Line 4875  for (;; ptr++) Line 5173  for (;; ptr++)
5173    
5174        if (repeat_max == 0) goto END_REPEAT;        if (repeat_max == 0) goto END_REPEAT;
5175    
       /*--------------------------------------------------------------------*/  
       /* This code is obsolete from release 8.00; the restriction was finally  
       removed: */  
   
       /* All real repeats make it impossible to handle partial matching (maybe  
       one day we will be able to remove this restriction). */  
   
       /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */  
       /*--------------------------------------------------------------------*/  
   
5176        /* Combine the op_type with the repeat_type */        /* Combine the op_type with the repeat_type */
5177    
5178        repeat_type += op_type;        repeat_type += op_type;
# Line 5017  for (;; ptr++) Line 5305  for (;; ptr++)
5305      /* If previous was a character class or a back reference, we put the repeat      /* If previous was a character class or a back reference, we put the repeat
5306      stuff after it, but just skip the item if the repeat was {0,0}. */      stuff after it, but just skip the item if the repeat was {0,0}. */
5307    
5308      else if (*previous == OP_CLASS ||      else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
              *previous == OP_NCLASS ||  
5309  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5310               *previous == OP_XCLASS ||               *previous == OP_XCLASS ||
5311  #endif  #endif
5312               *previous == OP_REF ||               *previous == OP_REF   || *previous == OP_REFI ||
5313               *previous == OP_REFI)               *previous == OP_DNREF || *previous == OP_DNREFI)
5314        {        {
5315        if (repeat_max == 0)        if (repeat_max == 0)
5316          {          {
# Line 5031  for (;; ptr++) Line 5318  for (;; ptr++)
5318          goto END_REPEAT;          goto END_REPEAT;
5319          }          }
5320    
       /*--------------------------------------------------------------------*/  
       /* This code is obsolete from release 8.00; the restriction was finally  
       removed: */  
   
       /* All real repeats make it impossible to handle partial matching (maybe  
       one day we will be able to remove this restriction). */  
   
       /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */  
       /*--------------------------------------------------------------------*/  
   
5321        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
5322          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
5323        else if (repeat_min == 1 && repeat_max == -1)        else if (repeat_min == 1 && repeat_max == -1)
# Line 5200  for (;; ptr++) Line 5477  for (;; ptr++)
5477    
5478            else            else
5479              {              {
5480              if (groupsetfirstchar && reqchar < 0) reqchar = firstchar;              if (groupsetfirstchar && reqcharflags < 0)
5481                  {
5482                  reqchar = firstchar;
5483                  reqcharflags = firstcharflags;
5484                  }
5485    
5486              for (i = 1; i < repeat_min; i++)              for (i = 1; i < repeat_min; i++)
5487                {                {
# Line 5379  for (;; ptr++) Line 5660  for (;; ptr++)
5660              pcre_uchar *scode = bracode;              pcre_uchar *scode = bracode;
5661              do              do
5662                {                {
5663                if (could_be_empty_branch(scode, ketcode, utf, cd))                if (could_be_empty_branch(scode, ketcode, utf, cd, NULL))
5664                  {                  {
5665                  *bracode += OP_SBRA - OP_BRA;                  *bracode += OP_SBRA - OP_BRA;
5666                  break;                  break;
# Line 5479  for (;; ptr++) Line 5760  for (;; ptr++)
5760        else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)        else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
5761          {          {
5762          tempcode += PRIV(OP_lengths)[*tempcode];          tempcode += PRIV(OP_lengths)[*tempcode];
5763  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32  #ifdef SUPPORT_UTF
5764          if (utf && HAS_EXTRALEN(tempcode[-1]))          if (utf && HAS_EXTRALEN(tempcode[-1]))
5765            tempcode += GET_EXTRALEN(tempcode[-1]);            tempcode += GET_EXTRALEN(tempcode[-1]);
5766  #endif  #endif
# Line 5575  for (;; ptr++) Line 5856  for (;; ptr++)
5856        if (*ptr == CHAR_COLON)        if (*ptr == CHAR_COLON)
5857          {          {
5858          arg = ++ptr;          arg = ++ptr;
5859          while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;          while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
5860          arglen = (int)(ptr - arg);          arglen = (int)(ptr - arg);
5861          if ((unsigned int)arglen > MAX_MARK)          if ((unsigned int)arglen > MAX_MARK)
5862            {            {
# Line 5620  for (;; ptr++) Line 5901  for (;; ptr++)
5901                (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;                (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
5902    
5903              /* Do not set firstchar after *ACCEPT */              /* Do not set firstchar after *ACCEPT */
5904              if (firstchar == REQ_UNSET) firstchar = REQ_NONE;              if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5905              }              }
5906    
5907            /* Handle other cases with/without an argument */            /* Handle other cases with/without an argument */
# Line 5689  for (;; ptr++) Line 5970  for (;; ptr++)
5970          {          {
5971          case CHAR_NUMBER_SIGN:                 /* Comment; skip to ket */          case CHAR_NUMBER_SIGN:                 /* Comment; skip to ket */
5972          ptr++;          ptr++;
5973          while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;          while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
5974          if (*ptr == 0)          if (*ptr == CHAR_NULL)
5975            {            {
5976            *errorcodeptr = ERR18;            *errorcodeptr = ERR18;
5977            goto FAILED;            goto FAILED;
# Line 5713  for (;; ptr++) Line 5994  for (;; ptr++)
5994          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
5995          case CHAR_LEFT_PARENTHESIS:          case CHAR_LEFT_PARENTHESIS:
5996          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
5997            tempptr = ptr;
5998    
5999          /* A condition can be an assertion, a number (referring to a numbered          /* A condition can be an assertion, a number (referring to a numbered
6000          group), a name (referring to a named group), or 'R', referring to          group), a name (referring to a named group), or 'R', referring to
# Line 5725  for (;; ptr++) Line 6007  for (;; ptr++)
6007          be the recursive thing or the name 'R' (and similarly for 'R' followed          be the recursive thing or the name 'R' (and similarly for 'R' followed
6008          by digits), and (b) a number could be a name that consists of digits.          by digits), and (b) a number could be a name that consists of digits.
6009          In both cases, we look for a name first; if not found, we try the other          In both cases, we look for a name first; if not found, we try the other
6010          cases. */          cases.
6011    
6012            For compatibility with auto-callouts, we allow a callout to be
6013            specified before a condition that is an assertion. First, check for the
6014            syntax of a callout; if found, adjust the temporary pointer that is
6015            used to check for an assertion condition. That's all that is needed! */
6016    
6017            if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
6018              {
6019              for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
6020              if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
6021                tempptr += i + 1;
6022              }
6023    
6024          /* For conditions that are assertions, check the syntax, and then exit          /* For conditions that are assertions, check the syntax, and then exit
6025          the switch. This will take control down to where bracketed groups,          the switch. This will take control down to where bracketed groups,
6026          including assertions, are processed. */          including assertions, are processed. */
6027    
6028          if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||          if (tempptr[1] == CHAR_QUESTION_MARK &&
6029              ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))                (tempptr[2] == CHAR_EQUALS_SIGN ||
6030                   tempptr[2] == CHAR_EXCLAMATION_MARK ||
6031                   tempptr[2] == CHAR_LESS_THAN_SIGN))
6032            break;            break;
6033    
6034          /* Most other conditions use OP_CREF (a couple change to OP_RREF          /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6035          below), and all need to skip 1+IMM2_SIZE bytes at the start of the group. */          need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6036    
6037          code[1+LINK_SIZE] = OP_CREF;          code[1+LINK_SIZE] = OP_CREF;
6038          skipbytes = 1+IMM2_SIZE;          skipbytes = 1+IMM2_SIZE;
# Line 5752  for (;; ptr++) Line 6048  for (;; ptr++)
6048            }            }
6049    
6050          /* Check for a test for a named group's having been set, using the Perl          /* Check for a test for a named group's having been set, using the Perl
6051          syntax (?(<name>) or (?('name') */          syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6052            syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). As names may
6053            consist entirely of digits, there is scope for ambiguity. */
6054    
6055          else if (ptr[1] == CHAR_LESS_THAN_SIGN)          else if (ptr[1] == CHAR_LESS_THAN_SIGN)
6056            {            {
# Line 5766  for (;; ptr++) Line 6064  for (;; ptr++)
6064            }            }
6065          else          else
6066            {            {
6067            terminator = 0;            terminator = CHAR_NULL;
6068            if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);            if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
6069            }            }
6070    
6071            /* When a name is one of a number of duplicates, a different opcode is
6072            used and it needs more memory. Unfortunately we cannot tell whether a
6073            name is a duplicate in the first pass, so we have to allow for more
6074            memory except when we know it is a relative numerical reference. */
6075    
6076            if (refsign < 0 && lengthptr != NULL) *lengthptr += IMM2_SIZE;
6077    
6078          /* We now expect to read a name; any thing else is an error */          /* We now expect to read a name (possibly all digits); any thing else
6079            is an error. In the case of all digits, also get it as a number. */
6080    
6081          if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)          if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
6082            {            {
# Line 5779  for (;; ptr++) Line 6085  for (;; ptr++)
6085            goto FAILED;            goto FAILED;
6086            }            }
6087    
         /* Read the name, but also get it as a number if it's all digits */  
   
6088          recno = 0;          recno = 0;
6089          name = ++ptr;          name = ++ptr;
6090          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
6091            {            {
6092            if (recno >= 0)            if (recno >= 0)
6093              recno = (IS_DIGIT(*ptr))? recno * 10 + *ptr - CHAR_0 : -1;              recno = (IS_DIGIT(*ptr))? recno * 10 + (int)(*ptr - CHAR_0) : -1;
6094            ptr++;            ptr++;
6095            }            }
6096          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
6097    
6098          if ((terminator > 0 && *ptr++ != terminator) ||          /* Check the terminator */
6099    
6100            if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6101              *ptr++ != CHAR_RIGHT_PARENTHESIS)              *ptr++ != CHAR_RIGHT_PARENTHESIS)
6102            {            {
6103            ptr--;      /* Error offset */            ptr--;      /* Error offset */
# Line 5826  for (;; ptr++) Line 6132  for (;; ptr++)
6132            }            }
6133    
6134          /* Otherwise (did not start with "+" or "-"), start by looking for the          /* Otherwise (did not start with "+" or "-"), start by looking for the
6135          name. If we find a name, add one to the opcode to change OP_CREF or          name. */
6136          OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,  
         except they record that the reference was originally to a name. The  
         information is used to check duplicate names. */  
   
6137          slot = cd->name_table;          slot = cd->name_table;
6138          for (i = 0; i < cd->names_found; i++)          for (i = 0; i < cd->names_found; i++)
6139            {            {
# Line 5838  for (;; ptr++) Line 6141  for (;; ptr++)
6141            slot += cd->name_entry_size;            slot += cd->name_entry_size;
6142            }            }
6143    
6144          /* Found a previous named subpattern */          /* Found the named subpattern. If the name is duplicated, add one to
6145            the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6146            appropriate data values. Otherwise, just insert the unique subpattern
6147            number. */
6148    
6149          if (i < cd->names_found)          if (i < cd->names_found)
6150            {            {
6151            recno = GET2(slot, 0);            int offset = i++;
6152            PUT2(code, 2+LINK_SIZE, recno);            int count = 1;
6153            code[1+LINK_SIZE]++;            recno = GET2(slot, 0);   /* Number from first found */
6154            }            for (; i < cd->names_found; i++)
6155                {
6156          /* Search the pattern for a forward reference */              slot += cd->name_entry_size;
6157                if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0) break;
6158          else if ((i = find_parens(cd, name, namelen,              count++;
6159                          (options & PCRE_EXTENDED) != 0, utf)) > 0)              }
6160            {            if (count > 1)
6161            PUT2(code, 2+LINK_SIZE, i);              {
6162            code[1+LINK_SIZE]++;              PUT2(code, 2+LINK_SIZE, offset);
6163            }              PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6164                skipbytes += IMM2_SIZE;
6165          /* If terminator == 0 it means that the name followed directly after              code[1+LINK_SIZE]++;
6166          the opening parenthesis [e.g. (?(abc)...] and in this case there are              }
6167          some further alternatives to try. For the cases where terminator != 0            else  /* Not a duplicated name */
6168          [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have              {
6169                PUT2(code, 2+LINK_SIZE, recno);
6170                }
6171              }
6172    
6173            /* If terminator == CHAR_NULL it means that the name followed directly
6174            after the opening parenthesis [e.g. (?(abc)...] and in this case there
6175            are some further alternatives to try. For the cases where terminator !=
6176            0 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
6177          now checked all the possibilities, so give an error. */          now checked all the possibilities, so give an error. */
6178    
6179          else if (terminator != 0)          else if (terminator != CHAR_NULL)
6180            {            {
6181            *errorcodeptr = ERR15;            *errorcodeptr = ERR15;
6182            goto FAILED;            goto FAILED;
# Line 6019  for (;; ptr++) Line 6333  for (;; ptr++)
6333          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
6334          DEFINE_NAME:    /* Come here from (?< handling */          DEFINE_NAME:    /* Come here from (?< handling */
6335          case CHAR_APOSTROPHE:          case CHAR_APOSTROPHE:
6336            terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
6337              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6338            name = ++ptr;
6339    
6340            while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6341            namelen = (int)(ptr - name);
6342    
6343            /* In the pre-compile phase, do a syntax check, remember the longest
6344            name, and then remember the group in a vector, expanding it if
6345            necessary. Duplicates for the same number are skipped; other duplicates
6346            are checked for validity. In the actual compile, there is nothing to
6347            do. */
6348    
6349            if (lengthptr != NULL)
6350            {            {
6351            terminator = (*ptr == CHAR_LESS_THAN_SIGN)?            named_group *ng;
6352              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;            pcre_uint32 number = cd->bracount + 1;
           name = ++ptr;  
6353    
6354            while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;            if (*ptr != (pcre_uchar)terminator)
6355            namelen = (int)(ptr - name);              {
6356                *errorcodeptr = ERR42;
6357                goto FAILED;
6358                }
6359    
6360            /* In the pre-compile phase, just do a syntax check. */            if (cd->names_found >= MAX_NAME_COUNT)
6361                {
6362                *errorcodeptr = ERR49;
6363                goto FAILED;
6364                }
6365    
6366            if (lengthptr != NULL)            if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
6367              {              {
6368              if (*ptr != terminator)              cd->name_entry_size = namelen + IMM2_SIZE + 1;
6369                {              if (namelen > MAX_NAME_SIZE)
               *errorcodeptr = ERR42;  
               goto FAILED;  
               }  
             if (cd->names_found >= MAX_NAME_COUNT)  
6370                {                {
6371                *errorcodeptr = ERR49;                *errorcodeptr = ERR48;
6372                goto FAILED;                goto FAILED;
6373                }                }
6374              if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)              }
6375    
6376              /* Scan the list to check for duplicates. For duplicate names, if the
6377              number is the same, break the loop, which causes the name to be
6378              discarded; otherwise, if DUPNAMES is not set, give an error.
6379              If it is set, allow the name with a different number, but continue
6380              scanning in case this is a duplicate with the same number. For
6381              non-duplicate names, give an error if the number is duplicated. */
6382    
6383              ng = cd->named_groups;
6384              for (i = 0; i < cd->names_found; i++, ng++)
6385                {
6386                if (namelen == ng->length &&
6387                    STRNCMP_UC_UC(name, ng->name, namelen) == 0)
6388                {                {
6389                cd->name_entry_size = namelen + IMM2_SIZE + 1;                if (ng->number == number) break;
6390                if (namelen > MAX_NAME_SIZE)                if ((options & PCRE_DUPNAMES) == 0)
6391                  {                  {
6392                  *errorcodeptr = ERR48;                  *errorcodeptr = ERR43;
6393                  goto FAILED;                  goto FAILED;
6394                  }                  }
6395                  cd->dupnames = TRUE;  /* Duplicate names exist */
6396                  }
6397                else if (ng->number == number)
6398                  {
6399                  *errorcodeptr = ERR65;
6400                  goto FAILED;