/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1348 by ph10, Fri Jul 5 10:38:37 2013 UTC revision 1384 by zherczeg, Fri Oct 25 17:37:50 2013 UTC
# Line 115  kicks in at the same number of forward r Line 115  kicks in at the same number of forward r
115  #define COMPILE_WORK_SIZE (2048*LINK_SIZE)  #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
116  #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)  #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
117    
118    /* This value determines the size of the initial vector that is used for
119    remembering named groups during the pre-compile. It is allocated on the stack,
120    but if it is too small, it is expanded using malloc(), in a similar way to the
121    workspace. The value is the number of slots in the list. */
122    
123    #define NAMED_GROUP_LIST_SIZE  20
124    
125  /* The overrun tests check for a slightly smaller size so that they detect the  /* The overrun tests check for a slightly smaller size so that they detect the
126  overrun before it actually does run off the end of the data block. */  overrun before it actually does run off the end of the data block. */
127    
# Line 455  static const char error_texts[] = Line 462  static const char error_texts[] =
462    "POSIX collating elements are not supported\0"    "POSIX collating elements are not supported\0"
463    "this version of PCRE is compiled without UTF support\0"    "this version of PCRE is compiled without UTF support\0"
464    "spare error\0"  /** DEAD **/    "spare error\0"  /** DEAD **/
465    "character value in \\x{...} sequence is too large\0"    "character value in \\x{} or \\o{} is too large\0"
466    /* 35 */    /* 35 */
467    "invalid condition (?(0)\0"    "invalid condition (?(0)\0"
468    "\\C not allowed in lookbehind assertion\0"    "\\C not allowed in lookbehind assertion\0"
# Line 509  static const char error_texts[] = Line 516  static const char error_texts[] =
516    "character value in \\u.... sequence is too large\0"    "character value in \\u.... sequence is too large\0"
517    "invalid UTF-32 string\0"    "invalid UTF-32 string\0"
518    "setting UTF is disabled by the application\0"    "setting UTF is disabled by the application\0"
519      "non-hex character in \\x{} (closing brace missing?)\0"
520      /* 80 */
521      "non-octal character in \\o{} (closing brace missing?)\0"
522      "missing opening brace after \\o\0"
523    ;    ;
524    
525  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 648  static const pcre_uint8 ebcdic_chartab[] Line 659  static const pcre_uint8 ebcdic_chartab[]
659  #endif  #endif
660    
661    
662    /* This table is used to check whether auto-possessification is possible
663    between adjacent character-type opcodes. The left-hand (repeated) opcode is
664    used to select the row, and the right-hand opcode is used to select the column.
665    A value of 1 means that auto-possessification is OK. For example, the second
666    value in the first row means that \D+\d can be turned into \D++\d.
667    
668    The Unicode property types (\P and \p) have to be present to fill out the table
669    because of what their opcode values are, but the table values should always be
670    zero because property types are handled separately in the code. The last four
671    columns apply to items that cannot be repeated, so there is no need to have
672    rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
673    *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
674    
675    #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
676    #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
677    
678    static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
679    /* \D \d \S \s \W \w  . .+ \C \P \p \R \H \h \V \v \X \Z \z  $ $M */
680      { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \D */
681      { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \d */
682      { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \S */
683      { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \s */
684      { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \W */
685      { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \w */
686      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .  */
687      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .+ */
688      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \C */
689      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \P */
690      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \p */
691      { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \R */
692      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \H */
693      { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \h */
694      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \V */
695      { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 },  /* \v */
696      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }   /* \X */
697    };
698    
699    
700    /* This table is used to check whether auto-possessification is possible
701    between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
702    left-hand (repeated) opcode is used to select the row, and the right-hand
703    opcode is used to select the column. The values are as follows:
704    
705      0   Always return FALSE (never auto-possessify)
706      1   Character groups are distinct (possessify if both are OP_PROP)
707      2   Check character categories in the same group (general or particular)
708      3   TRUE if the two opcodes are not the same (PROP vs NOTPROP)
709    
710      4   Check left general category vs right particular category
711      5   Check right general category vs left particular category
712    
713      6   Left alphanum vs right general category
714      7   Left space vs right general category
715      8   Left word vs right general category
716    
717      9   Right alphanum vs left general category
718     10   Right space vs left general category
719     11   Right word vs left general category
720    
721     12   Left alphanum vs right particular category
722     13   Left space vs right particular category
723     14   Left word vs right particular category
724    
725     15   Right alphanum vs left particular category
726     16   Right space vs left particular category
727     17   Right word vs left particular category
728    */
729    
730    static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
731    /* ANY LAMP GC  PC  SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
732      { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_ANY */
733      { 0,  3,  0,  0,  0,    3,    1,      1,   0,    0,   0 },  /* PT_LAMP */
734      { 0,  0,  2,  4,  0,    9,   10,     10,  11,    0,   0 },  /* PT_GC */
735      { 0,  0,  5,  2,  0,   15,   16,     16,  17,    0,   0 },  /* PT_PC */
736      { 0,  0,  0,  0,  2,    0,    0,      0,   0,    0,   0 },  /* PT_SC */
737      { 0,  3,  6, 12,  0,    3,    1,      1,   0,    0,   0 },  /* PT_ALNUM */
738      { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_SPACE */
739      { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_PXSPACE */
740      { 0,  0,  8, 14,  0,    0,    1,      1,   3,    0,   0 },  /* PT_WORD */
741      { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_CLIST */
742      { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   3 }   /* PT_UCNC */
743    };
744    
745    /* This table is used to check whether auto-possessification is possible
746    between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
747    specifies a general category and the other specifies a particular category. The
748    row is selected by the general category and the column by the particular
749    category. The value is 1 if the particular category is not part of the general
750    category. */
751    
752    static const pcre_uint8 catposstab[7][30] = {
753    /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
754      { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* C */
755      { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* L */
756      { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* M */
757      { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* N */
758      { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },  /* P */
759      { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 },  /* S */
760      { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }   /* Z */
761    };
762    
763    /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
764    a general or particular category. The properties in each row are those
765    that apply to the character set in question. Duplication means that a little
766    unnecessary work is done when checking, but this keeps things much simpler
767    because they can all use the same code. For more details see the comment where
768    this table is used.
769    
770    Note: SPACE and PXSPACE used to be different because Perl excluded VT from
771    "space", but from Perl 5.18 it's included, so both categories are treated the
772    same here. */
773    
774    static const pcre_uint8 posspropstab[3][4] = {
775      { ucp_L, ucp_N, ucp_N, ucp_Nl },  /* ALNUM, 3rd and 4th values redundant */
776      { ucp_Z, ucp_Z, ucp_C, ucp_Cc },  /* SPACE and PXSPACE, 2nd value redundant */
777      { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
778    };
779    
780    /* This table is used when converting repeating opcodes into possessified
781    versions as a result of an explicit possessive quantifier such as ++. A zero
782    value means there is no possessified version - in those cases the item in
783    question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
784    because all relevant opcodes are less than that. */
785    
786    static const pcre_uint8 opcode_possessify[] = {
787      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
788      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
789    
790      0,                       /* NOTI */
791      OP_POSSTAR, 0,           /* STAR, MINSTAR */
792      OP_POSPLUS, 0,           /* PLUS, MINPLUS */
793      OP_POSQUERY, 0,          /* QUERY, MINQUERY */
794      OP_POSUPTO, 0,           /* UPTO, MINUPTO */
795      0,                       /* EXACT */
796      0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
797    
798      OP_POSSTARI, 0,          /* STARI, MINSTARI */
799      OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
800      OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
801      OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
802      0,                       /* EXACTI */
803      0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
804    
805      OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
806      OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
807      OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
808      OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
809      0,                       /* NOTEXACT */
810      0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
811    
812      OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
813      OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
814      OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
815      OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
816      0,                       /* NOTEXACTI */
817      0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
818    
819      OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
820      OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
821      OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
822      OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
823      0,                       /* TYPEEXACT */
824      0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
825    
826      OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
827      OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
828      OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
829      OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
830      0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
831    
832      0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
833      0, 0,                    /* REF, REFI */
834      0, 0,                    /* DNREF, DNREFI */
835      0, 0                     /* RECURSE, CALLOUT */
836    };
837    
838    
839    
840  /*************************************************  /*************************************************
841  *            Find an error text                  *  *            Find an error text                  *
# Line 675  return s; Line 863  return s;
863  }  }
864    
865    
866    
867  /*************************************************  /*************************************************
868  *           Expand the workspace                 *  *           Expand the workspace                 *
869  *************************************************/  *************************************************/
# Line 752  return (*p == CHAR_RIGHT_CURLY_BRACKET); Line 941  return (*p == CHAR_RIGHT_CURLY_BRACKET);
941  *************************************************/  *************************************************/
942    
943  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
944  positive value for a simple escape such as \n, or 0 for a data character  positive value for a simple escape such as \n, or 0 for a data character which
945  which will be placed in chptr. A backreference to group n is returned as  will be placed in chptr. A backreference to group n is returned as negative n.
946  negative n. When UTF-8 is enabled, a positive value greater than 255 may  When UTF-8 is enabled, a positive value greater than 255 may be returned in
947  be returned in chptr.  chptr. On entry, ptr is pointing at the \. On exit, it is on the final
948  On entry,ptr is pointing at the \. On exit, it is on the final character of the  character of the escape sequence.
 escape sequence.  
949    
950  Arguments:  Arguments:
951    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
952    chptr          points to the data character    chptr          points to a returned data character
953    errorcodeptr   points to the errorcode variable    errorcodeptr   points to the errorcode variable
954    bracount       number of previous extracting brackets    bracount       number of previous extracting brackets
955    options        the options bits    options        the options bits
# Line 965  else Line 1153  else
1153      break;      break;
1154    
1155      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
1156      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. Perl has changed
1157      the way Perl works seems to be as follows:      over the years. Nowadays \g{} for backreferences and \o{} for octal are
1158        recommended to avoid the ambiguities in the old syntax.
1159    
1160      Outside a character class, the digits are read as a decimal number. If the      Outside a character class, the digits are read as a decimal number. If the
1161      number is less than 10, or if there are that many previous extracting      number is less than 8 (used to be 10), or if there are that many previous
1162      left brackets, then it is a back reference. Otherwise, up to three octal      extracting left brackets, then it is a back reference. Otherwise, up to
1163      digits are read to form an escaped byte. Thus \123 is likely to be octal      three octal digits are read to form an escaped byte. Thus \123 is likely to
1164      123 (cf \0123, which is octal 012 followed by the literal 3). If the octal      be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1165      value is greater than 377, the least significant 8 bits are taken. Inside a      the octal value is greater than 377, the least significant 8 bits are
1166      character class, \ followed by a digit is always an octal number. */      taken. \8 and \9 are treated as the literal characters 8 and 9.
1167    
1168        Inside a character class, \ followed by a digit is always either a literal
1169        8 or 9 or an octal number. */
1170    
1171      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1172      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
# Line 1001  else Line 1193  else
1193          *errorcodeptr = ERR61;          *errorcodeptr = ERR61;
1194          break;          break;
1195          }          }
1196        if (s < 10 || s <= bracount)        if (s < 8 || s <= bracount)  /* Check for back reference */
1197          {          {
1198          escape = -s;          escape = -s;
1199          break;          break;
# Line 1009  else Line 1201  else
1201        ptr = oldptr;      /* Put the pointer back and fall through */        ptr = oldptr;      /* Put the pointer back and fall through */
1202        }        }
1203    
1204      /* Handle an octal number following \. If the first digit is 8 or 9, Perl      /* Handle a digit following \ when the number is not a back reference. If
1205      generates a binary zero byte and treats the digit as a following literal.      the first digit is 8 or 9, Perl used to generate a binary zero byte and
1206      Thus we have to pull back the pointer by one. */      then treat the digit as a following literal. At least by Perl 5.18 this
1207        changed so as not to insert the binary zero. */
1208    
1209      if ((c = *ptr) >= CHAR_8)      if ((c = *ptr) >= CHAR_8) break;
1210        {  
1211        ptr--;      /* Fall through with a digit less than 8 */
       c = 0;  
       break;  
       }  
1212    
1213      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
1214      larger first octal digit. The original code used just to take the least      larger first octal digit. The original code used just to take the least
# Line 1035  else Line 1225  else
1225  #endif  #endif
1226      break;      break;
1227    
1228      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \o is a relatively new Perl feature, supporting a more general way of
1229      than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.      specifying character codes in octal. The only supported form is \o{ddd}. */
1230      If not, { is treated as a data character. */  
1231        case CHAR_o:
1232        if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1233          {
1234          ptr += 2;
1235          c = 0;
1236          overflow = FALSE;
1237          while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
1238            {
1239            register pcre_uint32 cc = *ptr++;
1240            if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1241    #ifdef COMPILE_PCRE32
1242            if (c >= 0x20000000l) { overflow = TRUE; break; }
1243    #endif
1244            c = (c << 3) + cc - CHAR_0 ;
1245    #if defined COMPILE_PCRE8
1246            if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1247    #elif defined COMPILE_PCRE16
1248            if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1249    #elif defined COMPILE_PCRE32
1250            if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1251    #endif
1252            }
1253          if (overflow)
1254            {
1255            while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1256            *errorcodeptr = ERR34;
1257            }
1258          else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1259            {
1260            if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1261            }
1262          else *errorcodeptr = ERR80;
1263          }
1264        break;
1265    
1266        /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1267        digits. Otherwise it is a lowercase x letter. */
1268    
1269      case CHAR_x:      case CHAR_x:
1270      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1271        {        {
       /* In JavaScript, \x must be followed by two hexadecimal numbers.  
       Otherwise it is a lowercase x letter. */  
1272        if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0        if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1273          && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)          && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1274          {          {
# Line 1060  else Line 1285  else
1285  #endif  #endif
1286            }            }
1287          }          }
1288        break;        }    /* End JavaScript handling */
       }  
1289    
1290      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1291        {      greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1292        const pcre_uchar *pt = ptr + 2;      digits. If not, { used to be treated as a data character. However, Perl
1293        seems to read hex digits up to the first non-such, and ignore the rest, so
1294        that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1295        now gives an error. */
1296    
1297        c = 0;      else
1298        overflow = FALSE;        {
1299        while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)        if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1300          {          {
1301          register pcre_uint32 cc = *pt++;          ptr += 2;
1302          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */          c = 0;
1303            overflow = FALSE;
1304            while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1305              {
1306              register pcre_uint32 cc = *ptr++;
1307              if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1308    
1309  #ifdef COMPILE_PCRE32  #ifdef COMPILE_PCRE32
1310          if (c >= 0x10000000l) { overflow = TRUE; break; }            if (c >= 0x10000000l) { overflow = TRUE; break; }
1311  #endif  #endif
1312    
1313  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1314          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */            if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1315          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));            c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1316  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
1317          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */            if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1318          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));            c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1319  #endif  #endif
1320    
1321  #if defined COMPILE_PCRE8  #if defined COMPILE_PCRE8
1322          if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }            if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1323  #elif defined COMPILE_PCRE16  #elif defined COMPILE_PCRE16
1324          if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }            if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1325  #elif defined COMPILE_PCRE32  #elif defined COMPILE_PCRE32
1326          if (utf && c > 0x10ffffU) { overflow = TRUE; break; }            if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1327  #endif  #endif
1328          }            }
1329    
1330        if (overflow)          if (overflow)
1331          {            {
1332          while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++;            while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1333          *errorcodeptr = ERR34;            *errorcodeptr = ERR34;
1334          }            }
1335    
1336        if (*pt == CHAR_RIGHT_CURLY_BRACKET)          else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1337          {            {
1338          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;            if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1339          ptr = pt;            }
         break;  
         }  
1340    
1341        /* If the sequence of hex digits does not end with '}', then we don't          /* If the sequence of hex digits does not end with '}', give an error.
1342        recognize this construct; fall through to the normal \x handling. */          We used not to recognize this construct, and just fell through to the normal
1343        }          \x handling, but nowadays Perl gives an error, which seems much more
1344            sensible, so we do too. */
1345    
1346      /* Read just a single-byte hex-defined char */          else *errorcodeptr = ERR79;
1347            }   /* End of \x{} processing */
1348    
1349      c = 0;        /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1350      while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)  
1351        {        else
1352        pcre_uint32 cc;                          /* Some compilers don't like */          {
1353        cc = *(++ptr);                           /* ++ in initializers */          c = 0;
1354            while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1355              {
1356              pcre_uint32 cc;                          /* Some compilers don't like */
1357              cc = *(++ptr);                           /* ++ in initializers */
1358  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1359        if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */            if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
1360        c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));            c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1361  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
1362        if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */            if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
1363        c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));            c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1364  #endif  #endif
1365        }            }
1366            }     /* End of \xdd handling */
1367          }       /* End of Perl-style \x handling */
1368      break;      break;
1369    
1370      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
# Line 1192  if ((options & PCRE_UCP) != 0 && escape Line 1430  if ((options & PCRE_UCP) != 0 && escape
1430  return escape;  return escape;
1431  }  }
1432    
1433    
1434    
1435  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1436  /*************************************************  /*************************************************
1437  *               Handle \P and \p                 *  *               Handle \P and \p                 *
# Line 1289  return FALSE; Line 1529  return FALSE;
1529    
1530    
1531    
   
1532  /*************************************************  /*************************************************
1533  *         Read repeat counts                     *  *         Read repeat counts                     *
1534  *************************************************/  *************************************************/
# Line 1358  return p; Line 1597  return p;
1597    
1598    
1599  /*************************************************  /*************************************************
 *  Subroutine for finding forward reference      *  
 *************************************************/  
   
 /* This recursive function is called only from find_parens() below. The  
 top-level call starts at the beginning of the pattern. All other calls must  
 start at a parenthesis. It scans along a pattern's text looking for capturing  
 subpatterns, and counting them. If it finds a named pattern that matches the  
 name it is given, it returns its number. Alternatively, if the name is NULL, it  
 returns when it reaches a given numbered subpattern. Recursion is used to keep  
 track of subpatterns that reset the capturing group numbers - the (?| feature.  
   
 This function was originally called only from the second pass, in which we know  
 that if (?< or (?' or (?P< is encountered, the name will be correctly  
 terminated because that is checked in the first pass. There is now one call to  
 this function in the first pass, to check for a recursive back reference by  
 name (so that we can make the whole group atomic). In this case, we need check  
 only up to the current position in the pattern, and that is still OK because  
 and previous occurrences will have been checked. To make this work, the test  
 for "end of pattern" is a check against cd->end_pattern in the main loop,  
 instead of looking for a binary zero. This means that the special first-pass  
 call can adjust cd->end_pattern temporarily. (Checks for binary zero while  
 processing items within the loop are OK, because afterwards the main loop will  
 terminate.)  
   
 Arguments:  
   ptrptr       address of the current character pointer (updated)  
   cd           compile background data  
   name         name to seek, or NULL if seeking a numbered subpattern  
   lorn         name length, or subpattern number if name is NULL  
   xmode        TRUE if we are in /x mode  
   utf          TRUE if we are in UTF-8 / UTF-16 / UTF-32 mode  
   count        pointer to the current capturing subpattern number (updated)  
   
 Returns:       the number of the named subpattern, or -1 if not found  
 */  
   
 static int  
 find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn,  
   BOOL xmode, BOOL utf, int *count)  
 {  
 pcre_uchar *ptr = *ptrptr;  
 int start_count = *count;  
 int hwm_count = start_count;  
 BOOL dup_parens = FALSE;  
   
 /* If the first character is a parenthesis, check on the type of group we are  
 dealing with. The very first call may not start with a parenthesis. */  
   
 if (ptr[0] == CHAR_LEFT_PARENTHESIS)  
   {  
   /* Handle specials such as (*SKIP) or (*UTF8) etc. */  
   
   if (ptr[1] == CHAR_ASTERISK)  
     {  
     ptr += 2;  
     while (ptr < cd->end_pattern && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;  
     }  
   
   /* Handle a normal, unnamed capturing parenthesis. */  
   
   else if (ptr[1] != CHAR_QUESTION_MARK)  
     {  
     *count += 1;  
     if (name == NULL && *count == lorn) return *count;  
     ptr++;  
     }  
   
   /* All cases now have (? at the start. Remember when we are in a group  
   where the parenthesis numbers are duplicated. */  
   
   else if (ptr[2] == CHAR_VERTICAL_LINE)  
     {  
     ptr += 3;  
     dup_parens = TRUE;  
     }  
   
   /* Handle comments; all characters are allowed until a ket is reached. */  
   
   else if (ptr[2] == CHAR_NUMBER_SIGN)  
     {  
     for (ptr += 3; *ptr != CHAR_NULL; ptr++)  
       if (*ptr == CHAR_RIGHT_PARENTHESIS) break;  
     goto FAIL_EXIT;  
     }  
   
   /* Handle a condition. If it is an assertion, just carry on so that it  
   is processed as normal. If not, skip to the closing parenthesis of the  
   condition (there can't be any nested parens). */  
   
   else if (ptr[2] == CHAR_LEFT_PARENTHESIS)  
     {  
     ptr += 2;  
     if (ptr[1] != CHAR_QUESTION_MARK)  
       {  
       while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;  
       if (*ptr != CHAR_NULL) ptr++;  
       }  
     }  
   
   /* Start with (? but not a condition. */  
   
   else  
     {  
     ptr += 2;  
     if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */  
   
     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */  
   
     if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&  
         ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)  
       {  
       pcre_uchar term;  
       const pcre_uchar *thisname;  
       *count += 1;  
       if (name == NULL && *count == lorn) return *count;  
       term = *ptr++;  
       if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;  
       thisname = ptr;  
       while (*ptr != term) ptr++;  
       if (name != NULL && lorn == (int)(ptr - thisname) &&  
           STRNCMP_UC_UC(name, thisname, (unsigned int)lorn) == 0)  
         return *count;  
       term++;  
       }  
     }  
   }  
   
 /* Past any initial parenthesis handling, scan for parentheses or vertical  
 bars. Stop if we get to cd->end_pattern. Note that this is important for the  
 first-pass call when this value is temporarily adjusted to stop at the current  
 position. So DO NOT change this to a test for binary zero. */  
   
 for (; ptr < cd->end_pattern; ptr++)  
   {  
   /* Skip over backslashed characters and also entire \Q...\E */  
   
   if (*ptr == CHAR_BACKSLASH)  
     {  
     if (*(++ptr) == CHAR_NULL) goto FAIL_EXIT;  
     if (*ptr == CHAR_Q) for (;;)  
       {  
       while (*(++ptr) != CHAR_NULL && *ptr != CHAR_BACKSLASH) {};  
       if (*ptr == CHAR_NULL) goto FAIL_EXIT;  
       if (*(++ptr) == CHAR_E) break;  
       }  
     continue;  
     }  
   
   /* Skip over character classes; this logic must be similar to the way they  
   are handled for real. If the first character is '^', skip it. Also, if the  
   first few characters (either before or after ^) are \Q\E or \E we skip them  
   too. This makes for compatibility with Perl. Note the use of STR macros to  
   encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */  
   
   if (*ptr == CHAR_LEFT_SQUARE_BRACKET)  
     {  
     BOOL negate_class = FALSE;  
     for (;;)  
       {  
       if (ptr[1] == CHAR_BACKSLASH)  
         {  
         if (ptr[2] == CHAR_E)  
           ptr+= 2;  
         else if (STRNCMP_UC_C8(ptr + 2,  
                  STR_Q STR_BACKSLASH STR_E, 3) == 0)  
           ptr += 4;  
         else  
           break;  
         }  
       else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)  
         {  
         negate_class = TRUE;  
         ptr++;  
         }  
       else break;  
       }  
   
     /* If the next character is ']', it is a data character that must be  
     skipped, except in JavaScript compatibility mode. */  
   
     if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&  
         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)  
       ptr++;  
   
     while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)  
       {  
       if (*ptr == CHAR_NULL) return -1;  
       if (*ptr == CHAR_BACKSLASH)  
         {  
         if (*(++ptr) == CHAR_NULL) goto FAIL_EXIT;  
         if (*ptr == CHAR_Q) for (;;)  
           {  
           while (*(++ptr) != CHAR_NULL && *ptr != CHAR_BACKSLASH) {};  
           if (*ptr == CHAR_NULL) goto FAIL_EXIT;  
           if (*(++ptr) == CHAR_E) break;  
           }  
         continue;  
         }  
       }  
     continue;  
     }  
   
   /* Skip comments in /x mode */  
   
   if (xmode && *ptr == CHAR_NUMBER_SIGN)  
     {  
     ptr++;  
     while (*ptr != CHAR_NULL)  
       {  
       if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }  
       ptr++;  
 #ifdef SUPPORT_UTF  
       if (utf) FORWARDCHAR(ptr);  
 #endif  
       }  
     if (*ptr == CHAR_NULL) goto FAIL_EXIT;  
     continue;  
     }  
   
   /* Check for the special metacharacters */  
   
   if (*ptr == CHAR_LEFT_PARENTHESIS)  
     {  
     int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, count);  
     if (rc > 0) return rc;  
     if (*ptr == CHAR_NULL) goto FAIL_EXIT;  
     }  
   
   else if (*ptr == CHAR_RIGHT_PARENTHESIS)  
     {  
     if (dup_parens && *count < hwm_count) *count = hwm_count;  
     goto FAIL_EXIT;  
     }  
   
   else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)  
     {  
     if (*count > hwm_count) hwm_count = *count;  
     *count = start_count;  
     }  
   }  
   
 FAIL_EXIT:  
 *ptrptr = ptr;  
 return -1;  
 }  
   
   
   
   
 /*************************************************  
 *       Find forward referenced subpattern       *  
 *************************************************/  
   
 /* This function scans along a pattern's text looking for capturing  
 subpatterns, and counting them. If it finds a named pattern that matches the  
 name it is given, it returns its number. Alternatively, if the name is NULL, it  
 returns when it reaches a given numbered subpattern. This is used for forward  
 references to subpatterns. We used to be able to start this scan from the  
 current compiling point, using the current count value from cd->bracount, and  
 do it all in a single loop, but the addition of the possibility of duplicate  
 subpattern numbers means that we have to scan from the very start, in order to  
 take account of such duplicates, and to use a recursive function to keep track  
 of the different types of group.  
   
 Arguments:  
   cd           compile background data  
   name         name to seek, or NULL if seeking a numbered subpattern  
   lorn         name length, or subpattern number if name is NULL  
   xmode        TRUE if we are in /x mode  
   utf          TRUE if we are in UTF-8 / UTF-16 / UTF-32 mode  
   
 Returns:       the number of the found subpattern, or -1 if not found  
 */  
   
 static int  
 find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode,  
   BOOL utf)  
 {  
 pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern;  
 int count = 0;  
 int rc;  
   
 /* If the pattern does not start with an opening parenthesis, the first call  
 to find_parens_sub() will scan right to the end (if necessary). However, if it  
 does start with a parenthesis, find_parens_sub() will return when it hits the  
 matching closing parens. That is why we have to have a loop. */  
   
 for (;;)  
   {  
   rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, &count);  
   if (rc > 0 || *ptr++ == CHAR_NULL) break;  
   }  
   
 return rc;  
 }  
   
   
   
   
 /*************************************************  
1600  *      Find first significant op code            *  *      Find first significant op code            *
1601  *************************************************/  *************************************************/
1602    
# Line 1696  for (;;) Line 1635  for (;;)
1635    
1636      case OP_CALLOUT:      case OP_CALLOUT:
1637      case OP_CREF:      case OP_CREF:
1638      case OP_NCREF:      case OP_DNCREF:
1639      case OP_RREF:      case OP_RREF:
1640      case OP_NRREF:      case OP_DNRREF:
1641      case OP_DEF:      case OP_DEF:
1642      code += PRIV(OP_lengths)[*code];      code += PRIV(OP_lengths)[*code];
1643      break;      break;
# Line 1712  for (;;) Line 1651  for (;;)
1651    
1652    
1653    
   
1654  /*************************************************  /*************************************************
1655  *        Find the fixed length of a branch       *  *        Find the fixed length of a branch       *
1656  *************************************************/  *************************************************/
# Line 1836  for (;;) Line 1774  for (;;)
1774      case OP_COMMIT:      case OP_COMMIT:
1775      case OP_CREF:      case OP_CREF:
1776      case OP_DEF:      case OP_DEF:
1777        case OP_DNCREF:
1778        case OP_DNRREF:
1779      case OP_DOLL:      case OP_DOLL:
1780      case OP_DOLLM:      case OP_DOLLM:
1781      case OP_EOD:      case OP_EOD:
1782      case OP_EODN:      case OP_EODN:
1783      case OP_FAIL:      case OP_FAIL:
     case OP_NCREF:  
     case OP_NRREF:  
1784      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
1785      case OP_PRUNE:      case OP_PRUNE:
1786      case OP_REVERSE:      case OP_REVERSE:
# Line 1937  for (;;) Line 1875  for (;;)
1875    
1876      switch (*cc)      switch (*cc)
1877        {        {
       case OP_CRPLUS:  
       case OP_CRMINPLUS:  
1878        case OP_CRSTAR:        case OP_CRSTAR:
1879        case OP_CRMINSTAR:        case OP_CRMINSTAR:
1880          case OP_CRPLUS:
1881          case OP_CRMINPLUS:
1882        case OP_CRQUERY:        case OP_CRQUERY:
1883        case OP_CRMINQUERY:        case OP_CRMINQUERY:
1884          case OP_CRPOSSTAR:
1885          case OP_CRPOSPLUS:
1886          case OP_CRPOSQUERY:
1887        return -1;        return -1;
1888    
1889        case OP_CRRANGE:        case OP_CRRANGE:
1890        case OP_CRMINRANGE:        case OP_CRMINRANGE:
1891          case OP_CRPOSRANGE:
1892        if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;        if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1893        branchlength += (int)GET2(cc,1);        branchlength += (int)GET2(cc,1);
1894        cc += 1 + 2 * IMM2_SIZE;        cc += 1 + 2 * IMM2_SIZE;
# Line 2015  for (;;) Line 1957  for (;;)
1957      case OP_QUERYI:      case OP_QUERYI:
1958      case OP_REF:      case OP_REF:
1959      case OP_REFI:      case OP_REFI:
1960        case OP_DNREF:
1961        case OP_DNREFI:
1962      case OP_SBRA:      case OP_SBRA:
1963      case OP_SBRAPOS:      case OP_SBRAPOS:
1964      case OP_SCBRA:      case OP_SCBRA:
# Line 2051  for (;;) Line 1995  for (;;)
1995    
1996    
1997    
   
1998  /*************************************************  /*************************************************
1999  *    Scan compiled regex for specific bracket    *  *    Scan compiled regex for specific bracket    *
2000  *************************************************/  *************************************************/
# Line 2361  Returns:      TRUE if what is matched co Line 2304  Returns:      TRUE if what is matched co
2304  typedef struct recurse_check {  typedef struct recurse_check {
2305    struct recurse_check *prev;    struct recurse_check *prev;
2306    const pcre_uchar *group;    const pcre_uchar *group;
2307  } recurse_check;  } recurse_check;
2308    
2309  static BOOL  static BOOL
2310  could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,  could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
# Line 2377  for (code = first_significant_code(code Line 2320  for (code = first_significant_code(code
2320    const pcre_uchar *ccode;    const pcre_uchar *ccode;
2321    
2322    c = *code;    c = *code;
2323    
2324    /* Skip over forward assertions; the other assertions are skipped by    /* Skip over forward assertions; the other assertions are skipped by
2325    first_significant_code() with a TRUE final argument. */    first_significant_code() with a TRUE final argument. */
2326    
# Line 2405  for (code = first_significant_code(code Line 2348  for (code = first_significant_code(code
2348      NULL. */      NULL. */
2349    
2350      if (cd->start_workspace != NULL)      if (cd->start_workspace != NULL)
2351        {        {
2352        const pcre_uchar *tcode;        const pcre_uchar *tcode;
2353        for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)        for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2354          if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;          if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2355        if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */        if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2356        }        }
2357    
2358      /* If we are scanning a completed pattern, there are no forward references      /* If we are scanning a completed pattern, there are no forward references
2359      and all groups are complete. We need to detect whether this is a recursive      and all groups are complete. We need to detect whether this is a recursive
2360      call, as otherwise there will be an infinite loop. If it is a recursion,      call, as otherwise there will be an infinite loop. If it is a recursion,
2361      just skip over it. Simple recursions are easily detected. For mutual      just skip over it. Simple recursions are easily detected. For mutual
2362      recursions we keep a chain on the stack. */      recursions we keep a chain on the stack. */
2363    
2364      else      else
2365        {        {
2366        recurse_check *r = recurses;        recurse_check *r = recurses;
2367        const pcre_uchar *endgroup = scode;        const pcre_uchar *endgroup = scode;
2368    
2369        do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);        do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2370        if (code >= scode && code <= endgroup) continue;  /* Simple recursion */        if (code >= scode && code <= endgroup) continue;  /* Simple recursion */
2371    
2372        for (r = recurses; r != NULL; r = r->prev)        for (r = recurses; r != NULL; r = r->prev)
2373          if (r->group == scode) break;          if (r->group == scode) break;
2374        if (r != NULL) continue;   /* Mutual recursion */        if (r != NULL) continue;   /* Mutual recursion */
# Line 2436  for (code = first_significant_code(code Line 2379  for (code = first_significant_code(code
2379    
2380      empty_branch = FALSE;      empty_branch = FALSE;
2381      this_recurse.prev = recurses;      this_recurse.prev = recurses;
2382      this_recurse.group = scode;      this_recurse.group = scode;
2383    
2384      do      do
2385        {        {
2386        if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))        if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
# Line 2538  for (code = first_significant_code(code Line 2481  for (code = first_significant_code(code
2481        case OP_CRMINSTAR:        case OP_CRMINSTAR:
2482        case OP_CRQUERY:        case OP_CRQUERY:
2483        case OP_CRMINQUERY:        case OP_CRMINQUERY:
2484          case OP_CRPOSSTAR:
2485          case OP_CRPOSQUERY:
2486        break;        break;
2487    
2488        default:                   /* Non-repeat => class must match */        default:                   /* Non-repeat => class must match */
2489        case OP_CRPLUS:            /* These repeats aren't empty */        case OP_CRPLUS:            /* These repeats aren't empty */
2490        case OP_CRMINPLUS:        case OP_CRMINPLUS:
2491          case OP_CRPOSPLUS:
2492        return FALSE;        return FALSE;
2493    
2494        case OP_CRRANGE:        case OP_CRRANGE:
2495        case OP_CRMINRANGE:        case OP_CRMINRANGE:
2496          case OP_CRPOSRANGE:
2497        if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */        if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
2498        break;        break;
2499        }        }
# Line 2557  for (code = first_significant_code(code Line 2504  for (code = first_significant_code(code
2504      case OP_ANY:      case OP_ANY:
2505      case OP_ALLANY:      case OP_ALLANY:
2506      case OP_ANYBYTE:      case OP_ANYBYTE:
2507    
2508      case OP_PROP:      case OP_PROP:
2509      case OP_NOTPROP:      case OP_NOTPROP:
2510      case OP_ANYNL:      case OP_ANYNL:
2511    
2512      case OP_NOT_HSPACE:      case OP_NOT_HSPACE:
2513      case OP_HSPACE:      case OP_HSPACE:
2514      case OP_NOT_VSPACE:      case OP_NOT_VSPACE:
2515      case OP_VSPACE:      case OP_VSPACE:
2516      case OP_EXTUNI:      case OP_EXTUNI:
2517    
2518      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
2519      case OP_DIGIT:      case OP_DIGIT:
2520      case OP_NOT_WHITESPACE:      case OP_NOT_WHITESPACE:
2521      case OP_WHITESPACE:      case OP_WHITESPACE:
2522      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
2523      case OP_WORDCHAR:      case OP_WORDCHAR:
2524    
2525      case OP_CHAR:      case OP_CHAR:
2526      case OP_CHARI:      case OP_CHARI:
2527      case OP_NOT:      case OP_NOT:
2528      case OP_NOTI:      case OP_NOTI:
2529    
2530      case OP_PLUS:      case OP_PLUS:
2531      case OP_PLUSI:      case OP_PLUSI:
2532      case OP_MINPLUS:      case OP_MINPLUS:
# Line 2589  for (code = first_significant_code(code Line 2536  for (code = first_significant_code(code
2536      case OP_NOTPLUSI:      case OP_NOTPLUSI:
2537      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
2538      case OP_NOTMINPLUSI:      case OP_NOTMINPLUSI:
2539    
2540      case OP_POSPLUS:      case OP_POSPLUS:
2541      case OP_POSPLUSI:      case OP_POSPLUSI:
2542      case OP_NOTPOSPLUS:      case OP_NOTPOSPLUS:
2543      case OP_NOTPOSPLUSI:      case OP_NOTPOSPLUSI:
2544    
2545      case OP_EXACT:      case OP_EXACT:
2546      case OP_EXACTI:      case OP_EXACTI:
2547      case OP_NOTEXACT:      case OP_NOTEXACT:
2548      case OP_NOTEXACTI:      case OP_NOTEXACTI:
2549    
2550      case OP_TYPEPLUS:      case OP_TYPEPLUS:
2551      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
2552      case OP_TYPEPOSPLUS:      case OP_TYPEPOSPLUS:
2553      case OP_TYPEEXACT:      case OP_TYPEEXACT:
2554    
2555      return FALSE;      return FALSE;
2556    
2557      /* These are going to continue, as they may be empty, but we have to      /* These are going to continue, as they may be empty, but we have to
# Line 2644  for (code = first_significant_code(code Line 2591  for (code = first_significant_code(code
2591  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2592      case OP_STAR:      case OP_STAR:
2593      case OP_STARI:      case OP_STARI:
2594      case OP_NOTSTAR:      case OP_NOTSTAR:
2595      case OP_NOTSTARI:      case OP_NOTSTARI:
2596    
2597      case OP_MINSTAR:      case OP_MINSTAR:
2598      case OP_MINSTARI:      case OP_MINSTARI:
2599      case OP_NOTMINSTAR:      case OP_NOTMINSTAR:
2600      case OP_NOTMINSTARI:      case OP_NOTMINSTARI:
2601    
2602      case OP_POSSTAR:      case OP_POSSTAR:
2603      case OP_POSSTARI:      case OP_POSSTARI:
2604      case OP_NOTPOSSTAR:      case OP_NOTPOSSTAR:
2605      case OP_NOTPOSSTARI:      case OP_NOTPOSSTARI:
2606    
2607      case OP_QUERY:      case OP_QUERY:
2608      case OP_QUERYI:      case OP_QUERYI:
2609      case OP_NOTQUERY:      case OP_NOTQUERY:
2610      case OP_NOTQUERYI:      case OP_NOTQUERYI:
2611    
2612      case OP_MINQUERY:      case OP_MINQUERY:
2613      case OP_MINQUERYI:      case OP_MINQUERYI:
2614      case OP_NOTMINQUERY:      case OP_NOTMINQUERY:
2615      case OP_NOTMINQUERYI:      case OP_NOTMINQUERYI:
2616    
2617      case OP_POSQUERY:      case OP_POSQUERY:
2618      case OP_POSQUERYI:      case OP_POSQUERYI:
2619      case OP_NOTPOSQUERY:      case OP_NOTPOSQUERY:
2620      case OP_NOTPOSQUERYI:      case OP_NOTPOSQUERYI:
2621    
2622      if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);      if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2623      break;      break;
2624    
2625      case OP_UPTO:      case OP_UPTO:
2626      case OP_UPTOI:      case OP_UPTOI:
2627      case OP_NOTUPTO:      case OP_NOTUPTO:
2628      case OP_NOTUPTOI:      case OP_NOTUPTOI:
2629    
2630      case OP_MINUPTO:      case OP_MINUPTO:
2631      case OP_MINUPTOI:      case OP_MINUPTOI:
2632      case OP_NOTMINUPTO:      case OP_NOTMINUPTO:
2633      case OP_NOTMINUPTOI:      case OP_NOTMINUPTOI:
2634    
2635      case OP_POSUPTO:      case OP_POSUPTO:
2636      case OP_POSUPTOI:      case OP_POSUPTOI:
2637      case OP_NOTPOSUPTO:      case OP_NOTPOSUPTO:
2638      case OP_NOTPOSUPTOI:      case OP_NOTPOSUPTOI:
2639    
2640      if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);      if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2641      break;      break;
2642  #endif  #endif
# Line 2753  return TRUE; Line 2700  return TRUE;
2700    
2701    
2702  /*************************************************  /*************************************************
2703  *           Check for POSIX class syntax         *  *        Base opcode of repeated opcodes         *
2704  *************************************************/  *************************************************/
2705    
2706  /* This function is called when the sequence "[:" or "[." or "[=" is  /* Returns the base opcode for repeated single character type opcodes. If the
2707  encountered in a character class. It checks whether this is followed by a  opcode is not a repeated character type, it returns with the original value.
 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we  
 reach an unescaped ']' without the special preceding character, return FALSE.  
   
 Originally, this function only recognized a sequence of letters between the  
 terminators, but it seems that Perl recognizes any sequence of characters,  
 though of course unknown POSIX names are subsequently rejected. Perl gives an  
 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE  
 didn't consider this to be a POSIX class. Likewise for [:1234:].  
   
 The problem in trying to be exactly like Perl is in the handling of escapes. We  
 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX  
 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code  
 below handles the special case of \], but does not try to do any other escape  
 processing. This makes it different from Perl for cases such as [:l\ower:]  
 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
 I think.
   
 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.  
 It seems that the appearance of a nested POSIX class supersedes an apparent  
 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or  
 a digit.  
   
 In Perl, unescaped square brackets may also appear as part of class names. For  
 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for  
 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not  
 seem right at all. PCRE does not allow closing square brackets in POSIX class  
 names.  
   
 Arguments:  
   ptr      pointer to the initial [  
   endptr   where to return the end pointer  
2708    
2709  Returns:   TRUE or FALSE  Arguments:  c opcode
2710    Returns:    base opcode for the type
2711  */  */
2712    
2713  static BOOL  static pcre_uchar
2714  check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)  get_repeat_base(pcre_uchar c)
2715  {  {
2716  pcre_uchar terminator;          /* Don't combine these lines; the Solaris cc */  return (c > OP_TYPEPOSUPTO)? c :
2717  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */         (c >= OP_TYPESTAR)?   OP_TYPESTAR :
2718  for (++ptr; *ptr != CHAR_NULL; ptr++)         (c >= OP_NOTSTARI)?   OP_NOTSTARI :
2719    {         (c >= OP_NOTSTAR)?    OP_NOTSTAR :
2720    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)         (c >= OP_STARI)?      OP_STARI :
2721      ptr++;                               OP_STAR;
   else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;  
   else  
     {  
     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)  
       {  
       *endptr = ptr;  
       return TRUE;  
       }  
     if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&  
          (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||  
           ptr[1] == CHAR_EQUALS_SIGN) &&  
         check_posix_syntax(ptr, endptr))  
       return FALSE;  
     }  
   }  
 return FALSE;  
2722  }  }
2723    
2724    
2725    
2726    #ifdef SUPPORT_UCP
2727  /*************************************************  /*************************************************
2728  *          Check POSIX class name                *  *        Check a character and a property        *
2729  *************************************************/  *************************************************/
2730    
2731  /* This function is called to check the name given in a POSIX-style class entry  /* This function is called by check_auto_possessive() when a property item
2732  such as [:alnum:].  is adjacent to a fixed character.
2733    
2734  Arguments:  Arguments:
2735    ptr        points to the first letter    c            the character
2736    len        the length of the name    ptype        the property type
2737      pdata        the data for the type
2738      negated      TRUE if it's a negated property (\P or \p{^)
2739    
2740  Returns:     a value representing the name, or -1 if unknown  Returns:       TRUE if auto-possessifying is OK
2741  */  */
2742    
2743  static int  static BOOL
2744  check_posix_name(const pcre_uchar *ptr, int len)  check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
2745      BOOL negated)
2746  {  {
2747  const char *pn = posix_names;  const pcre_uint32 *p;
2748  register int yield = 0;  const ucd_record *prop = GET_UCD(c);
 while (posix_name_lengths[yield] != 0)  
   {  
   if (len == posix_name_lengths[yield] &&  
     STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;  
   pn += posix_name_lengths[yield] + 1;  
   yield++;  
   }  
 return -1;  
 }  
   
   
 /*************************************************  
 *    Adjust OP_RECURSE items in repeated group   *  
 *************************************************/  
2749    
2750  /* OP_RECURSE items contain an offset from the start of the regex to the group  switch(ptype)
2751  that is referenced. This means that groups can be replicated for fixed    {
2752  repetition simply by copying (because the recursion is allowed to refer to    case PT_LAMP:
2753  earlier groups that are outside the current group). However, when a group is    return (prop->chartype == ucp_Lu ||
2754  optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is            prop->chartype == ucp_Ll ||
2755  inserted before it, after it has been compiled. This means that any OP_RECURSE            prop->chartype == ucp_Lt) == negated;
 items within it that refer to the group itself or any contained groups have to  
 have their offsets adjusted. That is one of the jobs of this function. Before it
 is called, the partially compiled regex must be temporarily terminated with  
 OP_END.  
2756    
2757  This function has been extended with the possibility of forward references for    case PT_GC:
2758  recursions and subroutine calls. It must also check the list of such references    return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
 for the group we are dealing with. If it finds that one of the recursions in  
 the current group is on this list, it adjusts the offset in the list, not the  
 value in the reference (which is a group number).  
2759    
2760  Arguments:    case PT_PC:
2761    group      points to the start of the group    return (pdata == prop->chartype) == negated;
   adjust     the amount by which the group is to be moved  
   utf        TRUE in UTF-8 / UTF-16 / UTF-32 mode  
   cd         contains pointers to tables etc.  
   save_hwm   the hwm forward reference pointer at the start of the group  
2762    
2763  Returns:     nothing    case PT_SC:
2764  */    return (pdata == prop->script) == negated;
2765    
2766  static void    /* These are specials */
 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,  
   pcre_uchar *save_hwm)  
 {  
 pcre_uchar *ptr = group;  
2767    
2768  while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)    case PT_ALNUM:
2769    {    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2770    int offset;            PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
   pcre_uchar *hc;  
2771    
2772    /* See if this recursion is on the forward reference list. If so, adjust the    /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
2773    reference. */    means that Perl space and POSIX space are now identical. PCRE was changed
2774      at release 8.34. */
2775    
2776    for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)    case PT_SPACE:    /* Perl space */
2777      case PT_PXSPACE:  /* POSIX space */
2778      switch(c)
2779      {      {
2780      offset = (int)GET(hc, 0);      HSPACE_CASES:
2781      if (cd->start_code + offset == ptr + 1)      VSPACE_CASES:
2782        {      return negated;
2783        PUT(hc, 0, offset + adjust);  
2784        break;      default:
2785        }      return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
2786      }      }
2787      break;  /* Control never reaches here */
2788    
2789    /* Otherwise, adjust the recursion offset if it's after the start of this    case PT_WORD:
2790    group. */    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2791              PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2792              c == CHAR_UNDERSCORE) == negated;
2793    
2794    if (hc >= cd->hwm)    case PT_CLIST:
2795      p = PRIV(ucd_caseless_sets) + prop->caseset;
2796      for (;;)
2797      {      {
2798      offset = (int)GET(ptr, 1);      if (c < *p) return !negated;
2799      if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);      if (c == *p++) return negated;
2800      }      }
2801      break;  /* Control never reaches here */
   ptr += 1 + LINK_SIZE;  
2802    }    }
2803    
2804    return FALSE;
2805  }  }
2806    #endif  /* SUPPORT_UCP */
2807    
2808    
2809    
2810  /*************************************************  /*************************************************
2811  *        Insert an automatic callout point       *  *        Fill the character property list        *
2812  *************************************************/  *************************************************/
2813    
2814  /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert  /* Checks whether the code points to an opcode that can take part in auto-
2815  callout points before each pattern item.  possessification, and if so, fills a list with its properties.
2816    
2817  Arguments:  Arguments:
2818    code           current code pointer    code        points to start of expression
2819    ptr            current pattern pointer    utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2820    cd             pointers to tables etc    fcc         points to case-flipping table
2821      list        points to output list
2822                  list[0] will be filled with the opcode
2823                  list[1] will be non-zero if this opcode
2824                    can match an empty character string
2825                  list[2..7] depends on the opcode
2826    
2827  Returns:         new code pointer  Returns:      points to the start of the next opcode if *code is accepted
2828                  NULL if *code is not accepted
2829  */  */
2830    
2831  static pcre_uchar *  static const pcre_uchar *
2832  auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)  get_chr_property_list(const pcre_uchar *code, BOOL utf,
2833      const pcre_uint8 *fcc, pcre_uint32 *list)
2834  {  {
2835  *code++ = OP_CALLOUT;  pcre_uchar c = *code;
2836  *code++ = 255;  const pcre_uchar *end;
2837  PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */  const pcre_uint32 *clist_src;
2838  PUT(code, LINK_SIZE, 0);                       /* Default length */  pcre_uint32 *clist_dest;
2839  return code + 2 * LINK_SIZE;  pcre_uint32 chr;
2840  }  pcre_uchar base;
   
2841    
2842    list[0] = c;
2843    list[1] = FALSE;
2844    code++;
2845    
2846  /*************************************************  if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
2847  *         Complete a callout item                *    {
2848  *************************************************/    base = get_repeat_base(c);
2849      c -= (base - OP_STAR);
 /* A callout item contains the length of the next item in the pattern, which  
 we can't fill in till after we have reached the relevant point. This is used  
 for both automatic and manual callouts.  
2850    
2851  Arguments:    if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
2852    previous_callout   points to previous callout item      code += IMM2_SIZE;
   ptr                current pattern pointer  
   cd                 pointers to tables etc  
2853    
2854  Returns:             nothing    list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
 */  
2855    
2856  static void    switch(base)
2857  complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)      {
2858  {      case OP_STAR:
2859  int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));      list[0] = OP_CHAR;
2860  PUT(previous_callout, 2 + LINK_SIZE, length);      break;
 }  
2861    
2862        case OP_STARI:
2863        list[0] = OP_CHARI;
2864        break;
2865    
2866        case OP_NOTSTAR:
2867        list[0] = OP_NOT;
2868        break;
2869    
2870  #ifdef SUPPORT_UCP      case OP_NOTSTARI:
2871  /*************************************************      list[0] = OP_NOTI;
2872  *           Get othercase range                  *      break;
 *************************************************/  
2873    
2874  /* This function is passed the start and end of a class range, in UTF-8 mode      case OP_TYPESTAR:
2875  with UCP support. It searches up the characters, looking for ranges of      list[0] = *code;
2876  characters in the "other" case. Each call returns the next one, updating the      code++;
2877  start address. A character with multiple other cases is returned on its own      break;
2878  with a special return value.      }
2879      c = list[0];
2880      }
2881    
2882  Arguments:  switch(c)
   cptr        points to starting character value; updated  
   d           end value  
   ocptr       where to put start of othercase range  
   odptr       where to put end of othercase range  
   
 Yield:        -1 when no more  
                0 when a range is returned  
               >0 the CASESET offset for char with multiple other cases  
                 in this case, ocptr contains the original  
 */  
   
 static int  
 get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,  
   pcre_uint32 *odptr)  
 {  
 pcre_uint32 c, othercase, next;  
 unsigned int co;  
   
 /* Find the first character that has an other case. If it has multiple other  
 cases, return its case offset value. */  
   
 for (c = *cptr; c <= d; c++)  
   {  
   if ((co = UCD_CASESET(c)) != 0)  
     {  
     *ocptr = c++;   /* Character that has the set */  
     *cptr = c;      /* Rest of input range */  
     return (int)co;  
     }  
   if ((othercase = UCD_OTHERCASE(c)) != c) break;  
   }  
   
 if (c > d) return -1;  /* Reached end of range */  
   
 *ocptr = othercase;  
 next = othercase + 1;  
   
 for (++c; c <= d; c++)  
2883    {    {
2884    if (UCD_OTHERCASE(c) != next) break;    case OP_NOT_DIGIT:
2885    next++;    case OP_DIGIT:
2886    }    case OP_NOT_WHITESPACE:
2887      case OP_WHITESPACE:
2888  *odptr = next - 1;     /* End of othercase range */    case OP_NOT_WORDCHAR:
2889  *cptr = c;             /* Rest of input range */    case OP_WORDCHAR:
2890  return 0;    case OP_ANY:
2891  }    case OP_ALLANY:
2892      case OP_ANYNL:
2893      case OP_NOT_HSPACE:
2894      case OP_HSPACE:
2895  /*************************************************    case OP_NOT_VSPACE:
2896  *        Check a character and a property        *    case OP_VSPACE:
2897  *************************************************/    case OP_EXTUNI:
2898      case OP_EODN:
2899  /* This function is called by check_auto_possessive() when a property item    case OP_EOD:
2900  is adjacent to a fixed character.    case OP_DOLL:
2901      case OP_DOLLM:
2902      return code;
2903    
2904  Arguments:    case OP_CHAR:
2905    c            the character    case OP_NOT:
2906    ptype        the property type    GETCHARINCTEST(chr, code);
2907    pdata        the data for the type    list[2] = chr;
2908    negated      TRUE if it's a negated property (\P or \p{^)    list[3] = NOTACHAR;
2909      return code;
2910    
2911  Returns:       TRUE if auto-possessifying is OK    case OP_CHARI:
2912  */    case OP_NOTI:
2913      list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
2914      GETCHARINCTEST(chr, code);
2915      list[2] = chr;
2916    
 static BOOL  
 check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata, BOOL negated)  
 {  
2917  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2918  const pcre_uint32 *p;    if (chr < 128 || (chr < 256 && !utf))
2919        list[3] = fcc[chr];
2920      else
2921        list[3] = UCD_OTHERCASE(chr);
2922    #elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
2923      list[3] = (chr < 256) ? fcc[chr] : chr;
2924    #else
2925      list[3] = fcc[chr];
2926  #endif  #endif
2927    
2928  const ucd_record *prop = GET_UCD(c);    /* The othercase might be the same value. */
2929    
2930  switch(ptype)    if (chr == list[3])
2931    {      list[3] = NOTACHAR;
2932    case PT_LAMP:    else
2933    return (prop->chartype == ucp_Lu ||      list[4] = NOTACHAR;
2934            prop->chartype == ucp_Ll ||    return code;
           prop->chartype == ucp_Lt) == negated;  
2935    
2936    case PT_GC:  #ifdef SUPPORT_UCP
2937    return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;    case OP_PROP:
2938      case OP_NOTPROP:
2939      if (code[0] != PT_CLIST)
2940        {
2941        list[2] = code[0];
2942        list[3] = code[1];
2943        return code + 2;
2944        }
2945    
2946    case PT_PC:    /* Convert only if we have enough space. */
   return (pdata == prop->chartype) == negated;  
2947    
2948    case PT_SC:    clist_src = PRIV(ucd_caseless_sets) + code[1];
2949    return (pdata == prop->script) == negated;    clist_dest = list + 2;
2950      code += 2;
2951    
2952    /* These are specials */    do {
2953         if (clist_dest >= list + 8)
2954           {
2955           /* Early return if there is not enough space. This should never
2956           happen, since all clists are shorter than 5 characters now. */
2957           list[2] = code[0];
2958           list[3] = code[1];
2959           return code;
2960           }
2961         *clist_dest++ = *clist_src;
2962         }
2963      while(*clist_src++ != NOTACHAR);
2964    
2965    case PT_ALNUM:    /* All characters are stored. The terminating NOTACHAR
2966    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||    is copied form the clist itself. */
           PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;  
2967    
2968    case PT_SPACE:    /* Perl space */    list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
2969    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||    return code;
2970            c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)  #endif
           == negated;  
2971    
2972    case PT_PXSPACE:  /* POSIX space */    case OP_NCLASS:
2973    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||    case OP_CLASS:
2974            c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2975            c == CHAR_FF || c == CHAR_CR)    case OP_XCLASS:
           == negated;  
2976    
2977    case PT_WORD:    if (c == OP_XCLASS)
2978    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||      end = code + GET(code, 0) - 1;
2979            PRIV(ucp_gentype)[prop->chartype] == ucp_N ||    else
2980            c == CHAR_UNDERSCORE) == negated;  #endif
2981        end = code + 32 / sizeof(pcre_uchar);
2982    
2983  #ifdef SUPPORT_UCP    switch(*end)
   case PT_CLIST:  
   p = PRIV(ucd_caseless_sets) + prop->caseset;  
   for (;;)  
2984      {      {
2985      if (c < *p) return !negated;      case OP_CRSTAR:
2986      if (c == *p++) return negated;      case OP_CRMINSTAR:
2987        case OP_CRQUERY:
2988        case OP_CRMINQUERY:
2989        case OP_CRPOSSTAR:
2990        case OP_CRPOSQUERY:
2991        list[1] = TRUE;
2992        end++;
2993        break;
2994    
2995        case OP_CRPLUS:
2996        case OP_CRMINPLUS:
2997        case OP_CRPOSPLUS:
2998        end++;
2999        break;
3000    
3001        case OP_CRRANGE:
3002        case OP_CRMINRANGE:
3003        case OP_CRPOSRANGE:
3004        list[1] = (GET2(end, 1) == 0);
3005        end += 1 + 2 * IMM2_SIZE;
3006        break;
3007      }      }
3008    break;  /* Control never reaches here */    list[2] = end - code;
3009  #endif    return end;
3010    }    }
3011    return NULL;    /* Opcode not accepted */
 return FALSE;  
3012  }  }
 #endif  /* SUPPORT_UCP */  
3013    
3014    
3015    
3016  /*************************************************  /*************************************************
3017  *     Check if auto-possessifying is possible    *  *    Scan further character sets for match       *
3018  *************************************************/  *************************************************/
3019    
3020  /* This function is called for unlimited repeats of certain items, to see  /* Checks whether the base and the current opcode have a common character, in
3021  whether the next thing could possibly match the repeated item. If not, it makes  which case the base cannot be possessified.
 sense to automatically possessify the repeated item.  
3022    
3023  Arguments:  Arguments:
3024    previous      pointer to the repeated opcode    code        points to the byte code
3025    utf           TRUE in UTF-8 / UTF-16 / UTF-32 mode    utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
3026    ptr           next character in pattern    cd          static compile data
3027    options       options bits    base_list   the data list of the base opcode
   cd            contains pointers to tables etc.  
3028    
3029  Returns:        TRUE if possessifying is wanted  Returns:      TRUE if the auto-possessification is possible
3030  */  */
3031    
3032  static BOOL  static BOOL
3033  check_auto_possessive(const pcre_uchar *previous, BOOL utf,  compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
3034    const pcre_uchar *ptr, int options, compile_data *cd)    const pcre_uint32 *base_list, const pcre_uchar *base_end)
3035  {  {
3036  pcre_uint32 c = NOTACHAR;  pcre_uchar c;
3037  pcre_uint32 next;  pcre_uint32 list[8];
3038  int escape;  const pcre_uint32 *chr_ptr;
3039  pcre_uchar op_code = *previous++;  const pcre_uint32 *ochr_ptr;
3040    const pcre_uint32 *list_ptr;
3041  /* Skip whitespace and comments in extended mode */  const pcre_uchar *next_code;
3042    const pcre_uint8 *class_bitset;
3043    const pcre_uint32 *set1, *set2, *set_end;
3044    pcre_uint32 chr;
3045    BOOL accepted, invert_bits;
3046    
3047    /* Note: base_list[1] records whether the current opcode has a greedy
3048    quantifier (represented by a non-zero value). This is different from
3049    other character type lists, where this slot records whether the character
3050    iterator can match an empty string (also represented by a non-zero value). */
3051    
3052  if ((options & PCRE_EXTENDED) != 0)  for(;;)
3053    {    {
3054    for (;;)    /* All operations move the code pointer forward.
3055      Therefore infinite recursions are not possible. */
3056    
3057      c = *code;
3058    
3059      /* Skip over callouts */
3060    
3061      if (c == OP_CALLOUT)
3062      {      {
3063      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      code += PRIV(OP_lengths)[c];
3064      if (*ptr == CHAR_NUMBER_SIGN)      continue;
       {  
       ptr++;  
       while (*ptr != CHAR_NULL)  
         {  
         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }  
         ptr++;  
 #ifdef SUPPORT_UTF  
         if (utf) FORWARDCHAR(ptr);  
 #endif  
         }  
       }  
     else break;  
3065      }      }
   }  
3066    
3067  /* If the next item is one that we can handle, get its value. A non-negative    if (c == OP_ALT)
3068  value is a character, a negative value is an escape value. */      {
3069        do code += GET(code, 1); while (*code == OP_ALT);
3070        c = *code;
3071        }
3072    
3073  if (*ptr == CHAR_BACKSLASH)    switch(c)
3074    {      {
3075    int temperrorcode = 0;      case OP_END:
3076    escape = check_escape(&ptr, &next, &temperrorcode, cd->bracount, options,      case OP_KETRPOS:
3077      FALSE);      /* TRUE only in greedy case. The non-greedy case could be replaced by
3078    if (temperrorcode != 0) return FALSE;      an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3079    ptr++;    /* Point after the escape sequence */      uses more memory, which we cannot get at this stage.) */
   }  
 else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0)  
   {  
   escape = 0;  
 #ifdef SUPPORT_UTF  
   if (utf) { GETCHARINC(next, ptr); } else  
 #endif  
   next = *ptr++;  
   }  
 else return FALSE;  
3080    
3081  /* Skip whitespace and comments in extended mode */      return base_list[1] != 0;
3082    
3083  if ((options & PCRE_EXTENDED) != 0)      case OP_KET:
3084    {      /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3085    for (;;)      it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3086      {      cannot be converted to a possessive form. */
3087      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;  
3088      if (*ptr == CHAR_NUMBER_SIGN)      if (base_list[1] == 0) return FALSE;
3089    
3090        switch(*(code - GET(code, 1)))
3091          {
3092          case OP_ASSERT:
3093          case OP_ASSERT_NOT:
3094          case OP_ASSERTBACK:
3095          case OP_ASSERTBACK_NOT:
3096          case OP_ONCE:
3097          case OP_ONCE_NC:
3098          /* Atomic sub-patterns and assertions can always auto-possessify their
3099          last iterator. */
3100          return TRUE;
3101          }
3102    
3103        code += PRIV(OP_lengths)[c];
3104        continue;
3105    
3106        case OP_ONCE:
3107        case OP_ONCE_NC:
3108        case OP_BRA:
3109        case OP_CBRA:
3110        next_code = code + GET(code, 1);
3111        code += PRIV(OP_lengths)[c];
3112    
3113        while (*next_code == OP_ALT)
3114        {        {
3115        ptr++;        if (!compare_opcodes(code, utf, cd, base_list, base_end)) return FALSE;
3116        while (*ptr != CHAR_NULL)        code = next_code + 1 + LINK_SIZE;
3117          {        next_code += GET(next_code, 1);
         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }  
         ptr++;  
 #ifdef SUPPORT_UTF  
         if (utf) FORWARDCHAR(ptr);  
 #endif  
         }  
3118        }        }
3119      else break;      continue;
     }  
   }  
3120    
3121  /* If the next thing is itself optional, we have to give up. */      case OP_BRAZERO:
3122        case OP_BRAMINZERO:
3123    
3124  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||      next_code = code + 1;
3125    STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)      if (*next_code != OP_BRA && *next_code != OP_CBRA
3126      return FALSE;          && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
3127    
3128        do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3129    
3130        /* The bracket content will be checked by the
3131        OP_BRA/OP_CBRA case above. */
3132        next_code += 1 + LINK_SIZE;
3133        if (!compare_opcodes(next_code, utf, cd, base_list, base_end))
3134          return FALSE;
3135    
3136  /* If the previous item is a character, get its value. */      code += PRIV(OP_lengths)[c];
3137        continue;
3138        }
3139    
3140  if (op_code == OP_CHAR || op_code == OP_CHARI ||    /* Check for a supported opcode, and load its properties. */
     op_code == OP_NOT || op_code == OP_NOTI)  
   {  
 #ifdef SUPPORT_UTF  
   GETCHARTEST(c, previous);  
 #else  
   c = *previous;  
 #endif  
   }  
3141    
3142  /* Now compare the next item with the previous opcode. First, handle cases when    code = get_chr_property_list(code, utf, cd->fcc, list);
3143  the next item is a character. */    if (code == NULL) return FALSE;    /* Unsupported */
3144    
3145  if (escape == 0)    /* If either opcode is a small character list, set pointers for comparing
3146    {    characters from that list with another list, or with a property. */
   /* For a caseless UTF match, the next character may have more than one other  
   case, which maps to the special PT_CLIST property. Check this first. */  
3147    
3148  #ifdef SUPPORT_UCP    if (base_list[0] == OP_CHAR)
   if (utf && c != NOTACHAR && (options & PCRE_CASELESS) != 0)  
3149      {      {
3150      unsigned int ocs = UCD_CASESET(next);      chr_ptr = base_list + 2;
3151      if (ocs > 0) return check_char_prop(c, PT_CLIST, ocs, op_code >= OP_NOT);      list_ptr = list;
3152      }      }
3153  #endif    else if (list[0] == OP_CHAR)
   
   switch(op_code)  
3154      {      {
3155      case OP_CHAR:      chr_ptr = list + 2;
3156      return c != next;      list_ptr = base_list;
3157        }
3158    
3159      /* For CHARI (caseless character) we must check the other case. If we have    /* Character bitsets can also be compared to certain opcodes. */
     Unicode property support, we can use it to test the other case of  
     high-valued characters. We know that next can have only one other case,  
     because multi-other-case characters are dealt with above. */  
3160    
3161      case OP_CHARI:    else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
3162      if (c == next) return FALSE;  #ifdef COMPILE_PCRE8
3163  #ifdef SUPPORT_UTF        /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
3164      if (utf)        || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
3165        {  #endif
3166        pcre_uint32 othercase;        )
3167        if (next < 128) othercase = cd->fcc[next]; else      {
3168  #ifdef SUPPORT_UCP  #ifdef COMPILE_PCRE8
3169        othercase = UCD_OTHERCASE(next);      if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
3170  #else  #else
3171        othercase = NOTACHAR;      if (base_list[0] == OP_CLASS)
3172  #endif  #endif
3173        return c != othercase;        {
3174          set1 = (pcre_uint32 *)(base_end - base_list[2]);
3175          list_ptr = list;
3176        }        }
3177      else      else
 #endif  /* SUPPORT_UTF */  
     return (c != TABLE_GET(next, cd->fcc, next));  /* Not UTF */  
   
     case OP_NOT:  
     return c == next;  
   
     case OP_NOTI:  
     if (c == next) return TRUE;  
 #ifdef SUPPORT_UTF  
     if (utf)  
3178        {        {
3179        pcre_uint32 othercase;        set1 = (pcre_uint32 *)(code - list[2]);
3180        if (next < 128) othercase = cd->fcc[next]; else        list_ptr = base_list;
 #ifdef SUPPORT_UCP  
       othercase = UCD_OTHERCASE(next);  
 #else  
       othercase = NOTACHAR;  
 #endif  
       return c == othercase;  
3181        }        }
     else  
 #endif  /* SUPPORT_UTF */  
     return (c == TABLE_GET(next, cd->fcc, next));  /* Not UTF */  
   
     /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.  
     When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */  
   
     case OP_DIGIT:  
     return next > 255 || (cd->ctypes[next] & ctype_digit) == 0;  
   
     case OP_NOT_DIGIT:  
     return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0;  
3182    
3183      case OP_WHITESPACE:      invert_bits = FALSE;
3184      return next > 255 || (cd->ctypes[next] & ctype_space) == 0;      switch(list_ptr[0])
3185          {
3186          case OP_CLASS:
3187          case OP_NCLASS:
3188          set2 = (pcre_uint32 *)
3189            ((list_ptr == list ? code : base_end) - list_ptr[2]);
3190          break;
3191    
3192      case OP_NOT_WHITESPACE:        /* OP_XCLASS cannot be supported here, because its bitset
3193      return next <= 255 && (cd->ctypes[next] & ctype_space) != 0;        is not necessarily complete. E.g: [a-\0x{200}] is stored
3194          as a character range, and the appropriate bits are not set. */
3195    
3196      case OP_WORDCHAR:        case OP_NOT_DIGIT:
3197      return next > 255 || (cd->ctypes[next] & ctype_word) == 0;          invert_bits = TRUE;
3198            /* Fall through */
3199          case OP_DIGIT:
3200            set2 = (pcre_uint32 *)(cd->cbits + cbit_digit);
3201            break;
3202    
3203      case OP_NOT_WORDCHAR:        case OP_NOT_WHITESPACE:
3204      return next <= 255 && (cd->ctypes[next] & ctype_word) != 0;          invert_bits = TRUE;
3205            /* Fall through */
3206          case OP_WHITESPACE:
3207            set2 = (pcre_uint32 *)(cd->cbits + cbit_space);
3208            break;
3209    
3210      case OP_HSPACE:        case OP_NOT_WORDCHAR:
3211      case OP_NOT_HSPACE:          invert_bits = TRUE;
3212      switch(next)          /* Fall through */
3213        {        case OP_WORDCHAR:
3214        HSPACE_CASES:          set2 = (pcre_uint32 *)(cd->cbits + cbit_word);
3215        return op_code == OP_NOT_HSPACE;          break;
3216    
3217        default:        default:
3218        return op_code != OP_NOT_HSPACE;        return FALSE;
3219        }        }
3220    
3221      case OP_ANYNL:      /* Compare 4 bytes to improve speed. */
3222      case OP_VSPACE:      set_end = set1 + (32 / 4);
3223      case OP_NOT_VSPACE:      if (invert_bits)
     switch(next)  
3224        {        {
3225        VSPACE_CASES:        do
3226        return op_code == OP_NOT_VSPACE;          {
3227            if ((*set1++ & ~(*set2++)) != 0) return FALSE;
3228            }
3229          while (set1 < set_end);
3230          }
3231        else
3232          {
3233          do
3234            {
3235            if ((*set1++ & *set2++) != 0) return FALSE;
3236            }
3237          while (set1 < set_end);
3238          }
3239    
3240        default:      if (list[1] == 0) return TRUE;
3241        return op_code != OP_NOT_VSPACE;      /* Might be an empty repeat. */
3242        continue;
3243        }
3244    
3245      /* Some property combinations are also acceptable. Unicode property opcodes are
3246      processed specially; the rest can be handled with a lookup table. */
3247    
3248      else
3249        {
3250        pcre_uint32 leftop, rightop;
3251    
3252        leftop = base_list[0];
3253        rightop = list[0];
3254    
3255    #ifdef SUPPORT_UCP
3256        accepted = FALSE; /* Always set in non-unicode case. */
3257        if (leftop == OP_PROP || leftop == OP_NOTPROP)
3258          {
3259          if (rightop == OP_EOD)
3260            accepted = TRUE;
3261          else if (rightop == OP_PROP || rightop == OP_NOTPROP)
3262            {
3263            int n;
3264            const pcre_uint8 *p;
3265            BOOL same = leftop == rightop;
3266            BOOL lisprop = leftop == OP_PROP;
3267            BOOL risprop = rightop == OP_PROP;
3268            BOOL bothprop = lisprop && risprop;
3269    
3270            /* There's a table that specifies how each combination is to be
3271            processed:
3272              0   Always return FALSE (never auto-possessify)
3273              1   Character groups are distinct (possessify if both are OP_PROP)
3274              2   Check character categories in the same group (general or particular)
3275              3   Return TRUE if the two opcodes are not the same
3276              ... see comments below
3277            */
3278    
3279            n = propposstab[base_list[2]][list[2]];
3280            switch(n)
3281              {
3282              case 0: break;
3283              case 1: accepted = bothprop; break;
3284              case 2: accepted = (base_list[3] == list[3]) != same; break;
3285              case 3: accepted = !same; break;
3286    
3287              case 4:  /* Left general category, right particular category */
3288              accepted = risprop && catposstab[base_list[3]][list[3]] == same;
3289              break;
3290    
3291              case 5:  /* Right general category, left particular category */
3292              accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
3293              break;
3294    
3295              /* This code is logically tricky. Think hard before fiddling with it.
3296              The posspropstab table has four entries per row. Each row relates to
3297              one of PCRE's special properties such as ALNUM or SPACE or WORD.
3298              Only WORD actually needs all four entries, but using repeats for the
3299              others means they can all use the same code below.
3300    
3301              The first two entries in each row are Unicode general categories, and
3302              apply always, because all the characters they include are part of the
3303              PCRE character set. The third and fourth entries are a general and a
3304              particular category, respectively, that include one or more relevant
3305              characters. One or the other is used, depending on whether the check
3306              is for a general or a particular category. However, in both cases the
3307              category contains more characters than the specials that are defined
3308              for the property being tested against. Therefore, it cannot be used
3309              in a NOTPROP case.
3310    
3311              Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3312              Underscore is covered by ucp_P or ucp_Po. */
3313    
3314              case 6:  /* Left alphanum vs right general category */
3315              case 7:  /* Left space vs right general category */
3316              case 8:  /* Left word vs right general category */
3317              p = posspropstab[n-6];
3318              accepted = risprop && lisprop ==
3319                (list[3] != p[0] &&
3320                 list[3] != p[1] &&
3321                (list[3] != p[2] || !lisprop));
3322              break;
3323    
3324              case 9:   /* Right alphanum vs left general category */
3325              case 10:  /* Right space vs left general category */
3326              case 11:  /* Right word vs left general category */
3327              p = posspropstab[n-9];
3328              accepted = lisprop && risprop ==
3329                (base_list[3] != p[0] &&
3330                 base_list[3] != p[1] &&
3331                (base_list[3] != p[2] || !risprop));
3332              break;
3333    
3334              case 12:  /* Left alphanum vs right particular category */
3335              case 13:  /* Left space vs right particular category */
3336              case 14:  /* Left word vs right particular category */
3337              p = posspropstab[n-12];
3338              accepted = risprop && lisprop ==
3339                (catposstab[p[0]][list[3]] &&
3340                 catposstab[p[1]][list[3]] &&
3341                (list[3] != p[3] || !lisprop));
3342              break;
3343    
3344              case 15:  /* Right alphanum vs left particular category */
3345              case 16:  /* Right space vs left particular category */
3346              case 17:  /* Right word vs left particular category */
3347              p = posspropstab[n-15];
3348              accepted = lisprop && risprop ==
3349                (catposstab[p[0]][base_list[3]] &&
3350                 catposstab[p[1]][base_list[3]] &&
3351                (base_list[3] != p[3] || !risprop));
3352              break;
3353              }
3354            }
3355        }        }
3356    
3357        else
3358    #endif  /* SUPPORT_UCP */
3359    
3360        accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3361               rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3362               autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3363    
3364        if (!accepted)
3365          return FALSE;
3366    
3367        if (list[1] == 0) return TRUE;
3368        /* Might be an empty repeat. */
3369        continue;
3370        }
3371    
3372      /* Control reaches here only if one of the items is a small character list.
3373      All characters are checked against the other side. */
3374    
3375      do
3376        {
3377        chr = *chr_ptr;
3378    
3379        switch(list_ptr[0])
3380          {
3381          case OP_CHAR:
3382          ochr_ptr = list_ptr + 2;
3383          do
3384            {
3385            if (chr == *ochr_ptr) return FALSE;
3386            ochr_ptr++;
3387            }
3388          while(*ochr_ptr != NOTACHAR);
3389          break;
3390    
3391          case OP_NOT:
3392          ochr_ptr = list_ptr + 2;
3393          do
3394            {
3395            if (chr == *ochr_ptr)
3396              break;
3397            ochr_ptr++;
3398            }
3399          while(*ochr_ptr != NOTACHAR);
3400          if (*ochr_ptr == NOTACHAR) return FALSE;   /* Not found */
3401          break;
3402    
3403          /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
3404          set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3405    
3406          case OP_DIGIT:
3407          if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
3408          break;
3409    
3410          case OP_NOT_DIGIT:
3411          if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
3412          break;
3413    
3414          case OP_WHITESPACE:
3415          if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
3416          break;
3417    
3418          case OP_NOT_WHITESPACE:
3419          if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
3420          break;
3421    
3422          case OP_WORDCHAR:
3423          if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
3424          break;
3425    
3426          case OP_NOT_WORDCHAR:
3427          if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
3428          break;
3429    
3430          case OP_HSPACE:
3431          switch(chr)
3432            {
3433            HSPACE_CASES: return FALSE;
3434            default: break;
3435            }
3436          break;
3437    
3438          case OP_NOT_HSPACE:
3439          switch(chr)
3440            {
3441            HSPACE_CASES: break;
3442            default: return FALSE;
3443            }
3444          break;
3445    
3446          case OP_ANYNL:
3447          case OP_VSPACE:
3448          switch(chr)
3449            {
3450            VSPACE_CASES: return FALSE;
3451            default: break;
3452            }
3453          break;
3454    
3455          case OP_NOT_VSPACE:
3456          switch(chr)
3457            {
3458            VSPACE_CASES: break;
3459            default: return FALSE;
3460            }
3461          break;
3462    
3463          case OP_DOLL:
3464          case OP_EODN:
3465          switch (chr)
3466            {
3467            case CHAR_CR:
3468            case CHAR_LF:
3469            case CHAR_VT:
3470            case CHAR_FF:
3471            case CHAR_NEL:
3472    #ifndef EBCDIC
3473            case 0x2028:
3474            case 0x2029:
3475    #endif  /* Not EBCDIC */
3476            return FALSE;
3477            }
3478          break;
3479    
3480          case OP_EOD:    /* Can always possessify before \z */
3481          break;
3482    
3483  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3484      case OP_PROP:        case OP_PROP:
3485      return check_char_prop(next, previous[0], previous[1], FALSE);        case OP_NOTPROP:
3486          if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3487                list_ptr[0] == OP_NOTPROP))
3488            return FALSE;
3489          break;
3490    #endif
3491    
3492      case OP_NOTPROP:        case OP_NCLASS:
3493      return check_char_prop(next, previous[0], previous[1], TRUE);        if (chr > 255) return FALSE;
3494          /* Fall through */
3495    
3496          case OP_CLASS:
3497          if (chr > 255) break;
3498          class_bitset = (pcre_uint8 *)
3499            ((list_ptr == list ? code : base_end) - list_ptr[2]);
3500          if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
3501          break;
3502    
3503    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3504          case OP_XCLASS:
3505          if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
3506              list_ptr[2] + LINK_SIZE, utf)) return FALSE;
3507          break;
3508  #endif  #endif
3509    
3510      default:        default:
3511      return FALSE;        return FALSE;
3512          }
3513    
3514        chr_ptr++;
3515      }      }
3516      while(*chr_ptr != NOTACHAR);
3517    
3518      /* At least one character must be matched from this opcode. */
3519    
3520      if (list[1] == 0) return TRUE;
3521    }    }
3522    
3523  /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP  return FALSE;
3524  is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are  }
3525  generated only when PCRE_UCP is *not* set, that is, when only ASCII  
 characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are  
 replaced by OP_PROP codes when PCRE_UCP is set. */  
3526    
3527  switch(op_code)  
3528    /*************************************************
3529    *    Scan compiled regex for auto-possession     *
3530    *************************************************/
3531    
3532    /* Replaces single character iterations with their possessive alternatives
3533    if appropriate. This function modifies the compiled opcode!
3534    
3535    Arguments:
3536      code        points to start of the byte code
3537      utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
3538      cd          static compile data
3539    
3540    Returns:      nothing
3541    */
3542    
3543    static void
3544    auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
3545    {
3546    register pcre_uchar c;
3547    const pcre_uchar *end;
3548    pcre_uchar *repeat_opcode;
3549    pcre_uint32 list[8];
3550    
3551    for (;;)
3552    {    {
3553    case OP_CHAR:    c = *code;
3554    case OP_CHARI:  
3555    switch(escape)    if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
3556      {      {
3557      case ESC_d:      c -= get_repeat_base(c) - OP_STAR;
3558      return c > 255 || (cd->ctypes[c] & ctype_digit) == 0;      end = (c <= OP_MINUPTO) ?
3559          get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3560        list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3561    
3562      case ESC_D:      if (end != NULL && compare_opcodes(end, utf, cd, list, end))
3563      return c <= 255 && (cd->ctypes[c] & ctype_digit) != 0;        {
3564          switch(c)
3565            {
3566            case OP_STAR:
3567            *code += OP_POSSTAR - OP_STAR;
3568            break;
3569    
3570      case ESC_s:          case OP_MINSTAR:
3571      return c > 255 || (cd->ctypes[c] & ctype_space) == 0;          *code += OP_POSSTAR - OP_MINSTAR;
3572            break;
3573    
3574      case ESC_S:          case OP_PLUS:
3575      return c <= 255 && (cd->ctypes[c] & ctype_space) != 0;          *code += OP_POSPLUS - OP_PLUS;
3576            break;
3577    
3578      case ESC_w:          case OP_MINPLUS:
3579      return c > 255 || (cd->ctypes[c] & ctype_word) == 0;          *code += OP_POSPLUS - OP_MINPLUS;
3580            break;
3581    
3582      case ESC_W:          case OP_QUERY:
3583      return c <= 255 && (cd->ctypes[c] & ctype_word) != 0;          *code += OP_POSQUERY - OP_QUERY;
3584            break;
3585    
3586      case ESC_h:          case OP_MINQUERY:
3587      case ESC_H:          *code += OP_POSQUERY - OP_MINQUERY;
3588      switch(c)          break;
       {  
       HSPACE_CASES:  
       return escape != ESC_h;  
3589    
3590        default:          case OP_UPTO:
3591        return escape == ESC_h;          *code += OP_POSUPTO - OP_UPTO;
3592            break;
3593    
3594            case OP_MINUPTO:
3595            *code += OP_MINUPTO - OP_UPTO;
3596            break;
3597            }
3598        }        }
3599        c = *code;
3600        }
3601      else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
3602        {
3603    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3604        if (c == OP_XCLASS)
3605          repeat_opcode = code + GET(code, 1);
3606        else
3607    #endif
3608          repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));
3609    
3610      case ESC_v:      c = *repeat_opcode;
3611      case ESC_V:      if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
     switch(c)  
3612        {        {
3613        VSPACE_CASES:        /* end must not be NULL. */
3614        return escape != ESC_v;        end = get_chr_property_list(code, utf, cd->fcc, list);
3615    
3616        default:        list[1] = (c & 1) == 0;
3617        return escape == ESC_v;  
3618          if (compare_opcodes(end, utf, cd, list, end))
3619            {
3620            switch (c)
3621              {
3622              case OP_CRSTAR:
3623              case OP_CRMINSTAR:
3624              *repeat_opcode = OP_CRPOSSTAR;
3625              break;
3626    
3627              case OP_CRPLUS:
3628              case OP_CRMINPLUS:
3629              *repeat_opcode = OP_CRPOSPLUS;
3630              break;
3631    
3632              case OP_CRQUERY:
3633              case OP_CRMINQUERY:
3634              *repeat_opcode = OP_CRPOSQUERY;
3635              break;
3636    
3637              case OP_CRRANGE:
3638              case OP_CRMINRANGE:
3639              *repeat_opcode = OP_CRPOSRANGE;
3640              break;
3641              }
3642            }
3643        }        }
3644        c = *code;
3645        }
3646    
3647      /* When PCRE_UCP is set, these values get generated for \d etc. Find    switch(c)
3648      their substitutions and process them. The result will always be either      {
3649      ESC_p or ESC_P. Then fall through to process those values. */      case OP_END:
3650        return;
3651    
3652  #ifdef SUPPORT_UCP      case OP_TYPESTAR:
3653      case ESC_du:      case OP_TYPEMINSTAR:
3654      case ESC_DU:      case OP_TYPEPLUS:
3655      case ESC_wu:      case OP_TYPEMINPLUS:
3656      case ESC_WU:      case OP_TYPEQUERY:
3657      case ESC_su:      case OP_TYPEMINQUERY:
3658      case ESC_SU:      case OP_TYPEPOSSTAR:
3659        {      case OP_TYPEPOSPLUS:
3660        int temperrorcode = 0;      case OP_TYPEPOSQUERY:
3661        ptr = substitutes[escape - ESC_DU];      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
3662        escape = check_escape(&ptr, &next, &temperrorcode, 0, options, FALSE);      break;
3663        if (temperrorcode != 0) return FALSE;  
3664        ptr++;    /* For compatibility */      case OP_TYPEUPTO:
3665        case OP_TYPEMINUPTO:
3666        case OP_TYPEEXACT:
3667        case OP_TYPEPOSUPTO:
3668        if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
3669          code += 2;
3670        break;
3671    
3672    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3673        case OP_XCLASS:
3674        code += GET(code, 1);
3675        break;
3676    #endif
3677    
3678        case OP_MARK:
3679        case OP_PRUNE_ARG:
3680        case OP_SKIP_ARG:
3681        case OP_THEN_ARG:
3682        code += code[1];
3683        break;
3684        }
3685    
3686      /* Add in the fixed length from the table */
3687    
3688      code += PRIV(OP_lengths)[c];
3689    
3690      /* In UTF-8 mode, opcodes that are followed by a character may be followed by
3691      a multi-byte character. The length in the table is a minimum, so we have to
3692      arrange to skip the extra bytes. */
3693    
3694    #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3695      if (utf) switch(c)
3696        {
3697        case OP_CHAR:
3698        case OP_CHARI:
3699        case OP_NOT:
3700        case OP_NOTI:
3701        case OP_STAR:
3702        case OP_MINSTAR:
3703        case OP_PLUS:
3704        case OP_MINPLUS:
3705        case OP_QUERY:
3706        case OP_MINQUERY:
3707        case OP_UPTO:
3708        case OP_MINUPTO:
3709        case OP_EXACT:
3710        case OP_POSSTAR:
3711        case OP_POSPLUS:
3712        case OP_POSQUERY:
3713        case OP_POSUPTO:
3714        case OP_STARI:
3715        case OP_MINSTARI:
3716        case OP_PLUSI:
3717        case OP_MINPLUSI:
3718        case OP_QUERYI:
3719        case OP_MINQUERYI:
3720        case OP_UPTOI:
3721        case OP_MINUPTOI:
3722        case OP_EXACTI:
3723        case OP_POSSTARI:
3724        case OP_POSPLUSI:
3725        case OP_POSQUERYI:
3726        case OP_POSUPTOI:
3727        case OP_NOTSTAR:
3728        case OP_NOTMINSTAR:
3729        case OP_NOTPLUS:
3730        case OP_NOTMINPLUS:
3731        case OP_NOTQUERY:
3732        case OP_NOTMINQUERY:
3733        case OP_NOTUPTO:
3734        case OP_NOTMINUPTO:
3735        case OP_NOTEXACT:
3736        case OP_NOTPOSSTAR:
3737        case OP_NOTPOSPLUS:
3738        case OP_NOTPOSQUERY:
3739        case OP_NOTPOSUPTO:
3740        case OP_NOTSTARI:
3741        case OP_NOTMINSTARI:
3742        case OP_NOTPLUSI:
3743        case OP_NOTMINPLUSI:
3744        case OP_NOTQUERYI:
3745        case OP_NOTMINQUERYI:
3746        case OP_NOTUPTOI:
3747        case OP_NOTMINUPTOI:
3748        case OP_NOTEXACTI:
3749        case OP_NOTPOSSTARI:
3750        case OP_NOTPOSPLUSI:
3751        case OP_NOTPOSQUERYI:
3752        case OP_NOTPOSUPTOI:
3753        if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
3754        break;
3755        }
3756    #else
3757      (void)(utf);  /* Keep compiler happy by referencing function argument */
3758    #endif
3759      }
3760    }
3761    
3762    
3763    
3764    /*************************************************
3765    *           Check for POSIX class syntax         *
3766    *************************************************/
3767    
3768    /* This function is called when the sequence "[:" or "[." or "[=" is
3769    encountered in a character class. It checks whether this is followed by a
3770    sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
3771    reach an unescaped ']' without the special preceding character, return FALSE.
3772    
3773    Originally, this function only recognized a sequence of letters between the
3774    terminators, but it seems that Perl recognizes any sequence of characters,
3775    though of course unknown POSIX names are subsequently rejected. Perl gives an
3776    "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
3777    didn't consider this to be a POSIX class. Likewise for [:1234:].
3778    
3779    The problem in trying to be exactly like Perl is in the handling of escapes. We
3780    have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
3781    class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
3782    below handles the special case of \], but does not try to do any other escape
3783    processing. This makes it different from Perl for cases such as [:l\ower:]
3784    where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
3785    "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
3786    I think.
3787    
3788    A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
3789    It seems that the appearance of a nested POSIX class supersedes an apparent
3790    external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
3791    a digit.
3792    
3793    In Perl, unescaped square brackets may also appear as part of class names. For
3794    example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
3795    [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
3796    seem right at all. PCRE does not allow closing square brackets in POSIX class
3797    names.
3798    
3799    Arguments:
3800      ptr      pointer to the initial [
3801      endptr   where to return the end pointer
3802    
3803    Returns:   TRUE or FALSE
3804    */
3805    
3806    static BOOL
3807    check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
3808    {
3809    pcre_uchar terminator;          /* Don't combine these lines; the Solaris cc */
3810    terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
3811    for (++ptr; *ptr != CHAR_NULL; ptr++)
3812      {
3813      if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3814        ptr++;
3815      else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
3816      else
3817        {
3818        if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3819          {
3820          *endptr = ptr;
3821          return TRUE;
3822        }        }
3823      /* Fall through */      if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
3824             (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3825              ptr[1] == CHAR_EQUALS_SIGN) &&
3826            check_posix_syntax(ptr, endptr))
3827          return FALSE;
3828        }
3829      }
3830    return FALSE;
3831    }
3832    
3833    
3834    
3835    
3836    /*************************************************
3837    *          Check POSIX class name                *
3838    *************************************************/
3839    
3840    /* This function is called to check the name given in a POSIX-style class entry
3841    such as [:alnum:].
3842    
3843    Arguments:
3844      ptr        points to the first letter
3845      len        the length of the name
3846    
3847    Returns:     a value representing the name, or -1 if unknown
3848    */
3849    
3850    static int
3851    check_posix_name(const pcre_uchar *ptr, int len)
3852    {
3853    const char *pn = posix_names;
3854    register int yield = 0;
3855    while (posix_name_lengths[yield] != 0)
3856      {
3857      if (len == posix_name_lengths[yield] &&
3858        STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
3859      pn += posix_name_lengths[yield] + 1;
3860      yield++;
3861      }
3862    return -1;
3863    }
3864    
3865    
3866    /*************************************************
3867    *    Adjust OP_RECURSE items in repeated group   *
3868    *************************************************/
3869    
3870    /* OP_RECURSE items contain an offset from the start of the regex to the group
3871    that is referenced. This means that groups can be replicated for fixed
3872    repetition simply by copying (because the recursion is allowed to refer to
3873    earlier groups that are outside the current group). However, when a group is
3874    optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
3875    inserted before it, after it has been compiled. This means that any OP_RECURSE
3876    items within it that refer to the group itself or any contained groups have to
3877    have their offsets adjusted. That is one of the jobs of this function. Before it
3878    is called, the partially compiled regex must be temporarily terminated with
3879    OP_END.
3880    
3881    This function has been extended with the possibility of forward references for
3882    recursions and subroutine calls. It must also check the list of such references
3883    for the group we are dealing with. If it finds that one of the recursions in
3884    the current group is on this list, it adjusts the offset in the list, not the
3885    value in the reference (which is a group number).
3886    
3887    Arguments:
3888      group      points to the start of the group
3889      adjust     the amount by which the group is to be moved
3890      utf        TRUE in UTF-8 / UTF-16 / UTF-32 mode
3891      cd         contains pointers to tables etc.
3892      save_hwm   the hwm forward reference pointer at the start of the group
3893    
3894    Returns:     nothing
3895    */
3896    
3897    static void
3898    adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
3899      pcre_uchar *save_hwm)
3900    {
3901    pcre_uchar *ptr = group;
3902    
3903    while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
3904      {
3905      int offset;
3906      pcre_uchar *hc;
3907    
3908      /* See if this recursion is on the forward reference list. If so, adjust the
3909      reference. */
3910    
3911      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
3912        {
3913        offset = (int)GET(hc, 0);
3914        if (cd->start_code + offset == ptr + 1)
3915          {
3916          PUT(hc, 0, offset + adjust);
3917          break;
3918          }
3919        }
3920    
3921      /* Otherwise, adjust the recursion offset if it's after the start of this
3922      group. */
3923    
3924      if (hc >= cd->hwm)
3925        {
3926        offset = (int)GET(ptr, 1);
3927        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
3928        }
3929    
3930      ptr += 1 + LINK_SIZE;
3931      }
3932    }
3933    
3934    
3935    
3936    /*************************************************
3937    *        Insert an automatic callout point       *
3938    *************************************************/
3939    
3940    /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
3941    callout points before each pattern item.
3942    
3943    Arguments:
3944      code           current code pointer
3945      ptr            current pattern pointer
3946      cd             pointers to tables etc
3947    
3948    Returns:         new code pointer
3949    */
3950    
3951    static pcre_uchar *
3952    auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
3953    {
3954    *code++ = OP_CALLOUT;
3955    *code++ = 255;
3956    PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
3957    PUT(code, LINK_SIZE, 0);                       /* Default length */
3958    return code + 2 * LINK_SIZE;
3959    }
3960    
     case ESC_p:  
     case ESC_P:  
       {  
       unsigned int ptype = 0, pdata = 0;  
       int errorcodeptr;  
       BOOL negated;  
3961    
       ptr--;      /* Make ptr point at the p or P */  
       if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcodeptr))  
         return FALSE;  
       ptr++;      /* Point past the final curly ket */  
3962    
3963        /* If the property item is optional, we have to give up. (When generated  /*************************************************
3964        from \d etc by PCRE_UCP, this test will have been applied much earlier,  *         Complete a callout item                *
3965        to the original \d etc. At this point, ptr will point to a zero byte. */  *************************************************/
3966    
3967        if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||  /* A callout item contains the length of the next item in the pattern, which
3968          STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)  we can't fill in till after we have reached the relevant point. This is used
3969            return FALSE;  for both automatic and manual callouts.
3970    
3971        /* Do the property check. */  Arguments:
3972      previous_callout   points to previous callout item
3973      ptr                current pattern pointer
3974      cd                 pointers to tables etc
3975    
3976        return check_char_prop(c, ptype, pdata, (escape == ESC_P) != negated);  Returns:             nothing
3977        }  */
 #endif  
3978    
/* Fill in the length field of an earlier callout item: the distance from the
pattern offset recorded in that item to the current pattern position. */

static void
complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
{
/* Pattern position that was recorded when the callout was emitted. */
const pcre_uchar *item_start = cd->start_pattern + GET(previous_callout, 2);

PUT(previous_callout, 2 + LINK_SIZE, (int)(ptr - item_start));
}
3985    
   /* In principle, support for Unicode properties should be integrated here as  
   well. It means re-organizing the above code so as to get hold of the property  
   values before switching on the op-code. However, I wonder how many patterns  
   combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,  
   these op-codes are never generated.) */  
3986    
   case OP_DIGIT:  
   return escape == ESC_D || escape == ESC_s || escape == ESC_W ||  
          escape == ESC_h || escape == ESC_v || escape == ESC_R;  
3987    
3988    case OP_NOT_DIGIT:  #ifdef SUPPORT_UCP
3989    return escape == ESC_d;  /*************************************************
3990    *           Get othercase range                  *
3991    *************************************************/
3992    
3993    case OP_WHITESPACE:  /* This function is passed the start and end of a class range, in UTF-8 mode
3994    return escape == ESC_S || escape == ESC_d || escape == ESC_w;  with UCP support. It searches up the characters, looking for ranges of
3995    characters in the "other" case. Each call returns the next one, updating the
3996    start address. A character with multiple other cases is returned on its own
3997    with a special return value.
3998    
3999    case OP_NOT_WHITESPACE:  Arguments:
4000    return escape == ESC_s || escape == ESC_h || escape == ESC_v || escape == ESC_R;    cptr        points to starting character value; updated
4001      d           end value
4002      ocptr       where to put start of othercase range
4003      odptr       where to put end of othercase range
4004    
4005    case OP_HSPACE:  Yield:        -1 when no more
4006    return escape == ESC_S || escape == ESC_H || escape == ESC_d ||                 0 when a range is returned
4007           escape == ESC_w || escape == ESC_v || escape == ESC_R;                >0 the CASESET offset for char with multiple other cases
4008                    in this case, ocptr contains the original
4009    */
4010    
/* Return the next run of other-case characters for the range *cptr..d.

  cptr    in: first character still to be examined; out: first character
          not yet consumed
  d       last character of the range
  ocptr   receives the start of the other-case range, or the character
          itself when it has a case set
  odptr   receives the end of the other-case range

Yields -1 when the range is exhausted, 0 when *ocptr..*odptr has been set,
and the (non-zero) UCD_CASESET value when a character with a case set was
found. */

static int
get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
  pcre_uint32 *odptr)
{
pcre_uint32 c = *cptr;
pcre_uint32 othercase = 0;
pcre_uint32 next;
unsigned int co;

/* Skip characters that are their own other case, stopping early if one
belongs to a case set. */

while (c <= d)
  {
  co = UCD_CASESET(c);
  if (co != 0)
    {
    *ocptr = c;       /* Character that has the set */
    *cptr = c + 1;    /* Rest of input range */
    return (int)co;
    }
  othercase = UCD_OTHERCASE(c);
  if (othercase != c) break;
  c++;
  }

if (c > d) return -1;  /* Reached end of range */

/* Extend the run for as long as the other cases stay consecutive. */

*ocptr = othercase;
next = othercase + 1;

for (c++; c <= d && UCD_OTHERCASE(c) == next; c++) next++;

*odptr = next - 1;     /* End of othercase range */
*cptr = c;             /* Rest of input range */
return 0;
}
4047    #endif  /* SUPPORT_UCP */
4048    
4049    
4050    
# Line 3754  to find out the amount of memory needed, Line 4297  to find out the amount of memory needed,
4297  phase. The value of lengthptr distinguishes the two phases.  phase. The value of lengthptr distinguishes the two phases.
4298    
4299  Arguments:  Arguments:
4300    optionsptr     pointer to the option bits    optionsptr        pointer to the option bits
4301    codeptr        points to the pointer to the current code point    codeptr           points to the pointer to the current code point
4302    ptrptr         points to the current pattern pointer    ptrptr            points to the current pattern pointer
4303    errorcodeptr   points to error code variable    errorcodeptr      points to error code variable
4304    firstcharptr    place to put the first required character    firstcharptr      place to put the first required character
4305    firstcharflagsptr place to put the first character flags, or a negative number    firstcharflagsptr place to put the first character flags, or a negative number
4306    reqcharptr     place to put the last required character    reqcharptr        place to put the last required character
4307    reqcharflagsptr place to put the last required character flags, or a negative number    reqcharflagsptr   place to put the last required character flags, or a negative number
4308    bcptr          points to current branch chain    bcptr             points to current branch chain
4309    cond_depth     conditional nesting depth    cond_depth        conditional nesting depth
4310    cd             contains pointers to tables etc.    cd                contains pointers to tables etc.
4311    lengthptr      NULL during the real compile phase    lengthptr         NULL during the real compile phase
4312                   points to length accumulator during pre-compile phase                      points to length accumulator during pre-compile phase
4313    
4314  Returns:         TRUE on success  Returns:            TRUE on success
4315                   FALSE, with *errorcodeptr set non-zero on error                      FALSE, with *errorcodeptr set non-zero on error
4316  */  */
4317    
4318  static BOOL  static BOOL
# Line 3994  for (;; ptr++) Line 4537  for (;; ptr++)
4537        }        }
4538      }      }
4539    
   /* Fill in length of a previous callout, except when the next thing is  
   a quantifier. */  
   
4540    is_quantifier =    is_quantifier =
4541      c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||      c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4542      (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));      (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4543    
4544    if (!is_quantifier && previous_callout != NULL &&    /* Fill in length of a previous callout, except when the next thing is a
4545      quantifier or when processing a property substitution string in UCP mode. */
4546    
4547      if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4548         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
4549      {      {
4550      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
# Line 4032  for (;; ptr++) Line 4575  for (;; ptr++)
4575        }        }
4576      }      }
4577    
4578    /* No auto callout for quantifiers. */    /* No auto callout for quantifiers, or while processing property strings that
4579      are substituted for \w etc in UCP mode. */
4580    
4581    if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)    if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4582      {      {
4583      previous_callout = code;      previous_callout = code;
4584      code = auto_callout(code, ptr, cd);      code = auto_callout(code, ptr, cd);
# Line 4420  for (;; ptr++) Line 4964  for (;; ptr++)
4964              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
4965              continue;              continue;
4966    
4967              /* Perl 5.004 onwards omits VT from \s, but we must preserve it              /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
4968              if it was previously set by something earlier in the character              5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
4969              class. Luckily, the value of CHAR_VT is 0x0b in both ASCII and              previously set by something earlier in the character class.
4970              EBCDIC, so we lazily just adjust the appropriate bit. */              Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
4971                we could just adjust the appropriate bit. From PCRE 8.34 we no
4972                longer treat \s and \S specially. */
4973    
4974              case ESC_s:              case ESC_s:
4975              classbits[0] |= cbits[cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
             classbits[1] |= cbits[cbit_space+1] & ~0x08;  
             for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];  
4976              continue;              continue;
4977    
4978              case ESC_S:              case ESC_S:
4979              should_flip_negation = TRUE;              should_flip_negation = TRUE;
4980              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */  
4981              continue;              continue;
4982    
4983              /* The rest apply in both UCP and non-UCP cases. */              /* The rest apply in both UCP and non-UCP cases. */
# Line 4933  for (;; ptr++) Line 5476  for (;; ptr++)
5476            }            }
5477          }          }
5478    
       /* If the repetition is unlimited, it pays to see if the next thing on  
       the line is something that cannot possibly match this character. If so,  
       automatically possessifying this item gains some performance in the case  
       where the match fails. */  
   
       if (!possessive_quantifier &&  
           repeat_max < 0 &&  
           check_auto_possessive(previous, utf, ptr + 1, options, cd))  
         {  
         repeat_type = 0;    /* Force greedy */  
         possessive_quantifier = TRUE;  
         }  
   
5479        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
5480        }        }
5481    
# Line 4963  for (;; ptr++) Line 5493  for (;; ptr++)
5493        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
5494        c = *previous;        c = *previous;
5495    
       if (!possessive_quantifier &&  
           repeat_max < 0 &&  
           check_auto_possessive(previous, utf, ptr + 1, options, cd))  
         {  
         repeat_type = 0;    /* Force greedy */  
         possessive_quantifier = TRUE;  
         }  
   
5496        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
5497        if (*previous == OP_PROP || *previous == OP_NOTPROP)        if (*previous == OP_PROP || *previous == OP_NOTPROP)
5498          {          {
# Line 5119  for (;; ptr++) Line 5641  for (;; ptr++)
5641      /* If previous was a character class or a back reference, we put the repeat      /* If previous was a character class or a back reference, we put the repeat
5642      stuff after it, but just skip the item if the repeat was {0,0}. */      stuff after it, but just skip the item if the repeat was {0,0}. */
5643    
5644      else if (*previous == OP_CLASS ||      else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
              *previous == OP_NCLASS ||  
5645  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5646               *previous == OP_XCLASS ||               *previous == OP_XCLASS ||
5647  #endif  #endif
5648               *previous == OP_REF ||               *previous == OP_REF   || *previous == OP_REFI ||
5649               *previous == OP_REFI)               *previous == OP_DNREF || *previous == OP_DNREFI)
5650        {        {
5651        if (repeat_max == 0)        if (repeat_max == 0)
5652          {          {
# Line 5545  for (;; ptr++) Line 6066  for (;; ptr++)
6066        goto FAILED;        goto FAILED;
6067        }        }
6068    
6069      /* If the character following a repeat is '+', or if certain optimization      /* If the character following a repeat is '+', possessive_quantifier is
6070      tests above succeeded, possessive_quantifier is TRUE. For some opcodes,      TRUE. For some opcodes, there are special alternative opcodes for this
6071      there are special alternative opcodes for this case. For anything else, we      case. For anything else, we wrap the entire repeated item inside OP_ONCE
6072      wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'      brackets. Logically, the '+' notation is just syntactic sugar, taken from
6073      notation is just syntactic sugar, taken from Sun's Java package, but the      Sun's Java package, but the special opcodes can optimize it.
     special opcodes can optimize it.  
6074    
6075      Some (but not all) possessively repeated subpatterns have already been      Some (but not all) possessively repeated subpatterns have already been
6076      completely handled in the code just above. For them, possessive_quantifier      completely handled in the code just above. For them, possessive_quantifier
6077      is always FALSE at this stage.      is always FALSE at this stage. Note that the repeated item starts at
6078        tempcode, not at previous, which might be the first part of a string whose
6079      Note that the repeated item starts at tempcode, not at previous, which      (former) last char we repeated. */
     might be the first part of a string whose (former) last char we repeated.  
   
     Possessifying an 'exact' quantifier has no effect, so we can ignore it. But  
     an 'upto' may follow. We skip over an 'exact' item, and then test the  
     length of what remains before proceeding. */  
6080    
6081      if (possessive_quantifier)      if (possessive_quantifier)
6082        {        {
6083        int len;        int len;
6084    
6085        if (*tempcode == OP_TYPEEXACT)        /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
6086          However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
6087          {5,}, or {5,10}). We skip over an EXACT item; if the length of what
6088          remains is greater than zero, there's a further opcode that can be
6089          handled. If not, do nothing, leaving the EXACT alone. */
6090    
6091          switch(*tempcode)
6092            {
6093            case OP_TYPEEXACT:
6094          tempcode += PRIV(OP_lengths)[*tempcode] +          tempcode += PRIV(OP_lengths)[*tempcode] +
6095            ((tempcode[1 + IMM2_SIZE] == OP_PROP            ((tempcode[1 + IMM2_SIZE] == OP_PROP
6096            || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);            || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
6097            break;
6098    
6099        else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)          /* CHAR opcodes are used for exacts whose count is 1. */
6100          {  
6101            case OP_CHAR:
6102            case OP_CHARI:
6103            case OP_NOT:
6104            case OP_NOTI:
6105            case OP_EXACT:
6106            case OP_EXACTI:
6107            case OP_NOTEXACT:
6108            case OP_NOTEXACTI:
6109          tempcode += PRIV(OP_lengths)[*tempcode];          tempcode += PRIV(OP_lengths)[*tempcode];
6110  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
6111          if (utf && HAS_EXTRALEN(tempcode[-1]))          if (utf && HAS_EXTRALEN(tempcode[-1]))
6112            tempcode += GET_EXTRALEN(tempcode[-1]);            tempcode += GET_EXTRALEN(tempcode[-1]);
6113  #endif  #endif
6114            break;
6115    
6116            /* For the class opcodes, the repeat operator appears at the end;
6117            adjust tempcode to point to it. */
6118    
6119            case OP_CLASS:
6120            case OP_NCLASS:
6121            tempcode += 1 + 32/sizeof(pcre_uchar);
6122            break;
6123    
6124    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6125            case OP_XCLASS:
6126            tempcode += GET(tempcode, 1);
6127            break;
6128    #endif
6129          }          }
6130    
6131          /* If tempcode is equal to code (which points to the end of the repeated
6132          item), it means we have skipped an EXACT item but there is no following
6133          QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
6134          all other cases, tempcode will be pointing to the repeat opcode, and will
6135          be less than code, so the value of len will be greater than 0. */
6136    
6137        len = (int)(code - tempcode);        len = (int)(code - tempcode);
6138          if (len > 0)
6139            {
6140            unsigned int repcode = *tempcode;
6141    
6142            /* There is a table for possessifying opcodes, all of which are less
6143            than OP_CALLOUT. A zero entry means there is no possessified version.
6144            */
6145    
6146            if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
6147              *tempcode = opcode_possessify[repcode];
6148    
6149            /* For opcodes without a special possessified version, wrap the item in
6150            ONCE brackets. Because we are moving code along, we must ensure that any
6151            pending recursive references are updated. */
6152    
6153            else
6154              {
6155              *code = OP_END;
6156              adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
6157              memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6158              code += 1 + LINK_SIZE;
6159              len += 1 + LINK_SIZE;
6160              tempcode[0] = OP_ONCE;
6161              *code++ = OP_KET;
6162              PUTINC(code, 0, len);
6163              PUT(tempcode, 1, len);
6164              }
6165            }
6166    
6167    #ifdef NEVER
6168        if (len > 0) switch (*tempcode)        if (len > 0) switch (*tempcode)
6169          {          {
6170          case OP_STAR:  *tempcode = OP_POSSTAR; break;          case OP_STAR:  *tempcode = OP_POSSTAR; break;
# Line 5609  for (;; ptr++) Line 6192  for (;; ptr++)
6192          case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;          case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
6193          case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;          case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
6194    
6195            case OP_CRSTAR:   *tempcode = OP_CRPOSSTAR; break;
6196            case OP_CRPLUS:   *tempcode = OP_CRPOSPLUS; break;
6197            case OP_CRQUERY:  *tempcode = OP_CRPOSQUERY; break;
6198            case OP_CRRANGE:  *tempcode = OP_CRPOSRANGE; break;
6199    
6200          /* Because we are moving code along, we must ensure that any          /* Because we are moving code along, we must ensure that any
6201          pending recursive references are updated. */          pending recursive references are updated. */
6202    
# Line 5624  for (;; ptr++) Line 6212  for (;; ptr++)
6212          PUT(tempcode, 1, len);          PUT(tempcode, 1, len);
6213          break;          break;
6214          }          }
6215    #endif
6216        }        }
6217    
6218      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 5846  for (;; ptr++) Line 6435  for (;; ptr++)
6435                 tempptr[2] == CHAR_LESS_THAN_SIGN))                 tempptr[2] == CHAR_LESS_THAN_SIGN))
6436            break;            break;
6437    
6438          /* Most other conditions use OP_CREF (a couple change to OP_RREF          /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6439          below), and all need to skip 1+IMM2_SIZE bytes at the start of the group. */          need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6440    
6441          code[1+LINK_SIZE] = OP_CREF;          code[1+LINK_SIZE] = OP_CREF;
6442          skipbytes = 1+IMM2_SIZE;          skipbytes = 1+IMM2_SIZE;
# Line 5863  for (;; ptr++) Line 6452  for (;; ptr++)
6452            }            }
6453    
6454          /* Check for a test for a named group's having been set, using the Perl          /* Check for a test for a named group's having been set, using the Perl
6455          syntax (?(<name>) or (?('name') */          syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6456            syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). As names may
6457            consist entirely of digits, there is scope for ambiguity. */
6458    
6459          else if (ptr[1] == CHAR_LESS_THAN_SIGN)          else if (ptr[1] == CHAR_LESS_THAN_SIGN)
6460            {            {
# Line 5881  for (;; ptr++) Line 6472  for (;; ptr++)
6472            if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);            if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
6473            }            }
6474    
6475          /* We now expect to read a name; any thing else is an error */          /* When a name is one of a number of duplicates, a different opcode is
6476            used and it needs more memory. Unfortunately we cannot tell whether a
6477            name is a duplicate in the first pass, so we have to allow for more
6478            memory except when we know it is a relative numerical reference. */
6479    
6480            if (refsign < 0 && lengthptr != NULL) *lengthptr += IMM2_SIZE;
6481    
6482            /* We now expect to read a name (possibly all digits); anything else
6483            is an error. In the case of all digits, also get it as a number. */
6484    
6485          if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)          if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
6486            {            {
# Line 5890  for (;; ptr++) Line 6489  for (;; ptr++)
6489            goto FAILED;            goto FAILED;
6490            }            }
6491    
         /* Read the name, but also get it as a number if it's all digits */  
   
6492          recno = 0;          recno = 0;
6493          name = ++ptr;          name = ++ptr;
6494          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
# Line 5902  for (;; ptr++) Line 6499  for (;; ptr++)
6499            }            }
6500          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
6501    
6502            /* Check the terminator */
6503    
6504          if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||          if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6505              *ptr++ != CHAR_RIGHT_PARENTHESIS)              *ptr++ != CHAR_RIGHT_PARENTHESIS)
6506            {            {
# Line 5937  for (;; ptr++) Line 6536  for (;; ptr++)
6536            }            }
6537    
6538          /* Otherwise (did not start with "+" or "-"), start by looking for the          /* Otherwise (did not start with "+" or "-"), start by looking for the
6539          name. If we find a name, add one to the opcode to change OP_CREF or          name. */
         OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,  
         except they record that the reference was originally to a name. The  
         information is used to check duplicate names. */  
6540    
6541          slot = cd->name_table;          slot = cd->name_table;
6542          for (i = 0; i < cd->names_found; i++)          for (i = 0; i < cd->names_found; i++)
# Line 5949  for (;; ptr++) Line 6545  for (;; ptr++)
6545            slot += cd->name_entry_size;            slot += cd->name_entry_size;
6546            }            }
6547    
6548          /* Found a previous named subpattern */          /* Found the named subpattern. If the name is duplicated, add one to
6549            the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6550            appropriate data values. Otherwise, just insert the unique subpattern
6551            number. */
6552    
6553          if (i < cd->names_found)          if (i < cd->names_found)
6554            {            {
6555            recno = GET2(slot, 0);            int offset = i++;
6556            PUT2(code, 2+LINK_SIZE, recno);            int count = 1;
6557            code[1+LINK_SIZE]++;            recno = GET2(slot, 0);   /* Number from first found */
6558            }            for (; i < cd->names_found; i++)
6559                {
6560          /* Search the pattern for a forward reference */              slot += cd->name_entry_size;
6561                if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0) break;
6562          else if ((i = find_parens(cd, name, namelen,              count++;
6563                          (options & PCRE_EXTENDED) != 0, utf)) > 0)              }
6564            {            if (count > 1)
6565            PUT2(code, 2+LINK_SIZE, i);              {
6566            code[1+LINK_SIZE]++;              PUT2(code, 2+LINK_SIZE, offset);
6567                PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6568                skipbytes += IMM2_SIZE;
6569                code[1+LINK_SIZE]++;
6570                }
6571              else  /* Not a duplicated name */
6572                {
6573                PUT2(code, 2+LINK_SIZE, recno);
6574                }
6575            }            }
6576    
6577          /* If terminator == CHAR_NULL it means that the name followed directly          /* If terminator == CHAR_NULL it means that the name followed directly
# Line 6130  for (;; ptr++) Line 6737  for (;; ptr++)
6737          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
6738          DEFINE_NAME:    /* Come here from (?< handling */          DEFINE_NAME:    /* Come here from (?< handling */
6739          case CHAR_APOSTROPHE:          case CHAR_APOSTROPHE:
6740            terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
6741              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6742            name = ++ptr;
6743    
6744            while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6745            namelen = (int)(ptr - name);
6746    
6747            /* In the pre-compile phase, do a syntax check, remember the longest
6748            name, and then remember the group in a vector, expanding it if
6749            necessary. Duplicates for the same number are skipped; other duplicates
6750            are checked for validity. In the actual compile, there is nothing to
6751            do. */
6752    
6753            if (lengthptr != NULL)
6754            {            {
6755            terminator = (*ptr == CHAR_LESS_THAN_SIGN)?            named_group *ng;
6756              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;            pcre_uint32 number = cd->bracount + 1;
           name = ++ptr;  
6757    
6758            while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;            if (*ptr != (pcre_uchar)terminator)
6759            namelen = (int)(ptr - name);              {
6760                *errorcodeptr = ERR42;
6761                goto FAILED;
6762                }
6763    
6764            /* In the pre-compile phase, just do a syntax check. */            if (cd->names_found >= MAX_NAME_COUNT)
6765                {
6766                *errorcodeptr = ERR49;
6767                goto FAILED;
6768                }
6769    
6770            if (lengthptr != NULL)            if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
6771              {              {
6772              if (*ptr != (pcre_uchar)terminator)              cd->name_entry_size = namelen + IMM2_SIZE + 1;
6773                {              if (namelen > MAX_NAME_SIZE)
               *errorcodeptr = ERR42;  
               goto FAILED;  
               }  
             if (cd->names_found >= MAX_NAME_COUNT)  
6774                {                {
6775                *errorcodeptr = ERR49;                *errorcodeptr = ERR48;
6776                goto FAILED;                goto FAILED;
6777                }                }
6778              if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)              }
6779    
6780              /* Scan the list to check for duplicates. For duplicate names, if the
6781              number is the same, break the loop, which causes the name to be
6782              discarded; otherwise, if DUPNAMES is not set, give an error.
6783              If it is set, allow the name with a different number, but continue
6784              scanning in case this is a duplicate with the same number. For
6785              non-duplicate names, give an error if the number is duplicated. */
6786    
6787              ng = cd->named_groups;
6788              for (i = 0; i < cd->names_found; i++, ng++)
6789                {
6790                if (namelen == ng->length &&
6791                    STRNCMP_UC_UC(name, ng->name, namelen) == 0)
6792                {                {
6793                cd->name_entry_size = namelen + IMM2_SIZE + 1;                if (ng->number == number) break;
6794                if (namelen > MAX_NAME_SIZE)                if ((options & PCRE_DUPNAMES) == 0)
6795                  {                  {
6796                  *errorcodeptr = ERR48;                  *errorcodeptr = ERR43;
6797                  goto FAILED;                  goto FAILED;
6798                  }                  }
6799                  cd->dupnames = TRUE;  /* Duplicate names exist */
6800                  }
6801                else if (ng->number == number)
6802                  {
6803                  *errorcodeptr = ERR65;
6804                  goto FAILED;
6805                }                }
6806              }              }
6807    
6808            /* In the real compile, create the entry in the table, maintaining            if (i >= cd->names_found)     /* Not a duplicate with same number */
           alphabetical order. Duplicate names for different numbers are  
           permitted only if PCRE_DUPNAMES is set. Duplicate names for the same  
           number are always OK. (An existing number can be re-used if (?|  
           appears in the pattern.) In either event, a duplicate name results in  
           a duplicate entry in the table, even if the number is the same. This  
           is because the number of names, and hence the table size, is computed  
           in the pre-compile, and it affects various numbers and pointers which  
           would all have to be modified, and the compiled code moved down, if  
           duplicates with the same number were omitted from the table. This  
           doesn't seem worth the hassle. However, *different* names for the  
           same number are not permitted. */  
   
           else  
6809              {              {
6810              BOOL dupname = FALSE;              /* Increase the list size if necessary */
             slot = cd->name_table;  
6811    
6812              for (i = 0; i < cd->names_found; i++)              if (cd->names_found >= cd->named_group_list_size)
6813                {                {
6814                int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(namelen));                int newsize = cd->named_group_list_size * 2;
6815                if (crc == 0)                named_group *newspace = (PUBL(malloc))
6816                  {                  (newsize * sizeof(named_group));
                 if (slot[IMM2_SIZE+namelen] == 0)  
                   {  
                   if (GET2(slot, 0) != cd->bracount + 1 &&  
                       (options & PCRE_DUPNAMES) == 0)  
                     {  
                     *errorcodeptr = ERR43;  
                     goto FAILED;  
                     }  
                   else dupname = TRUE;  
                   }  
                 else crc = -1;      /* Current name is a substring */  
                 }  
6817    
6818                /* Make space in the table and break the loop for an earlier                if (newspace == NULL)
               name. For a duplicate or later name, carry on. We do this for  
               duplicates so that in the simple case (when ?(| is not used) they  
               are in order of their numbers. */  
   
               if (crc < 0)  
6819                  {                  {
6820                  memmove(slot + cd->name_entry_size, slot,                  *errorcodeptr = ERR21;
6821                    IN_UCHARS((cd->names_found - i) * cd->name_entry_size));                  goto FAILED;
                 break;  
6822                  }                  }
6823    
6824                /* Continue the loop for a later or duplicate name */                memcpy(newspace, cd->named_groups,
6825                    cd->named_group_list_size * sizeof(named_group));
6826                slot += cd->name_entry_size;                if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
6827                }                  (PUBL(free))((void *)cd->named_groups);
6828                  cd->named_groups = newspace;
6829              /* For non-duplicate names, check for a duplicate number before                cd->named_group_list_size = newsize;
             adding the new name. */  
   
             if (!dupname)  
               {  
               pcre_uchar *cslot = cd->name_table;  
               for (i = 0; i < cd->names_found; i++)  
                 {  
                 if (cslot != slot)  
                   {  
                   if (GET2(cslot, 0) == cd->bracount + 1)  
                     {  
                     *errorcodeptr = ERR65;  
                     goto FAILED;  
                     }  
                   }  
                 else i--;  
                 cslot += cd->name_entry_size;  
                 }  
6830                }                }
6831    
6832              PUT2(slot, 0, cd->bracount + 1);              cd->named_groups[cd->names_found].name = name;
6833              memcpy(slot + IMM2_SIZE, name, IN_UCHARS(namelen));              cd->named_groups[cd->names_found].length = namelen;
6834              slot[IMM2_SIZE + namelen] = 0;              cd->named_groups[cd->names_found].number = number;
6835                cd->names_found++;
6836              }              }
6837            }            }
6838    
6839          /* In both pre-compile and compile, count the number of names we've          ptr++;                    /* Move past > or ' in both passes. */
         encountered. */  
   
         cd->names_found++;  
         ptr++;                    /* Move past > or ' */  
6840          goto NUMBERED_GROUP;          goto NUMBERED_GROUP;
6841    
6842    
# Line 6277  for (;; ptr++) Line 6866  for (;; ptr++)
6866    
6867          if (lengthptr != NULL)          if (lengthptr != NULL)
6868            {            {
6869            const pcre_uchar *temp;            named_group *ng;
6870    
6871            if (namelen == 0)            if (namelen == 0)
6872              {              {
# Line 6295  for (;; ptr++) Line 6884  for (;; ptr++)
6884              goto FAILED;              goto FAILED;
6885              }              }
6886    
6887            /* The name table does not exist in the first pass, so we cannot            /* The name table does not exist in the first pass; instead we must
6888            do a simple search as in the code below. Instead, we have to scan the            scan the list of names encountered so far in order to get the
6889            pattern to find the number. It is important that we scan it only as            number. If the name is not found, set the value to 0 for a forward
6890            far as we have got because the syntax of named subpatterns has not            reference. */
6891            been checked for the rest of the pattern, and find_parens() assumes  
6892            correct syntax. In any case, it's a waste of resources to scan            ng = cd->named_groups;
6893            further. We stop the scan at the current point by temporarily            for (i = 0; i < cd->names_found; i++, ng++)
6894            adjusting the value of cd->endpattern. */              {
6895                if (namelen == ng->length &&
6896            temp = cd->end_pattern;                  STRNCMP_UC_UC(name, ng->name, namelen) == 0)
6897            cd->end_pattern = ptr;                break;
6898            recno = find_parens(cd, name, namelen,              }
6899              (options & PCRE_EXTENDED) != 0, utf);            recno = (i < cd->names_found)? ng->number : 0;
6900            cd->end_pattern = temp;  
6901            if (recno < 0) recno = 0;    /* Forward ref; set dummy number */            /* Count named back references. */
6902    
6903              if (!is_recurse) cd->namedrefcount++;
6904            }            }
6905    
6906          /* In the real compile, seek the name in the table. We check the name          /* In the real compile, search the name table. We check the name
6907          first, and then check that we have reached the end of the name in the          first, and then check that we have reached the end of the name in the
6908          table. That way, if the name that is longer than any in the table,          table. That way, if the name is longer than any in the table, the
6909          the comparison will fail without reading beyond the table entry. */          comparison will fail without reading beyond the table entry. */
6910    
6911          else          else
6912            {            {
# Line 6328  for (;; ptr++) Line 6919  for (;; ptr++)
6919              slot += cd->name_entry_size;              slot += cd->name_entry_size;
6920              }              }
6921    
6922            if (i < cd->names_found)         /* Back reference */            if (i < cd->names_found)
6923              {              {
6924              recno = GET2(slot, 0);              recno = GET2(slot, 0);
6925              }              }
6926            else if ((recno =                /* Forward back reference */            else
                     find_parens(cd, name, namelen,  
                       (options & PCRE_EXTENDED) != 0, utf)) <= 0)  
6927              {              {
6928              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
6929              goto FAILED;              goto FAILED;
6930              }              }
6931            }            }
6932    
6933          /* In both phases, for recursions, we can now go to the code that          /* In both phases, we can now go to the code that handles numerical
6934          recursion or backreferences. */          handles numerical recursion. */
6935    
6936          if (is_recurse) goto HANDLE_RECURSION;          if (is_recurse) goto HANDLE_RECURSION;
6937            else goto HANDLE_REFERENCE;  
6938            /* In the second pass we must see if the name is duplicated. If so, we
6939            generate a different opcode. */
6940    
6941            if (lengthptr == NULL && cd->dupnames)
6942              {
6943              int count = 1;
6944              unsigned int index = i;
6945              pcre_uchar *cslot = slot + cd->name_entry_size;
6946    
6947              for (i++; i < cd->names_found; i++)
6948                {
6949                if (STRCMP_UC_UC(slot + IMM2_SIZE, cslot + IMM2_SIZE) != 0) break;
6950                count++;
6951                cslot += cd->name_entry_size;
6952                }
6953    
6954              if (count > 1)
6955                {
6956                if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6957                previous = code;
6958                *code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
6959                PUT2INC(code, 0, index);
6960                PUT2INC(code, 0, count);
6961    
6962                /* Process each potentially referenced group. */
6963    
6964                for (; slot < cslot; slot += cd->name_entry_size)
6965                  {
6966                  open_capitem *oc;
6967                  recno = GET2(slot, 0);
6968                  cd->backref_map |= (recno < 32)? (1 << recno) : 1;
6969                  if (recno > cd->top_backref) cd->top_backref = recno;
6970    
6971                  /* Check to see if this back reference is recursive, that is, it
6972                  is inside the group that it references. A flag is set so that the
6973                  group can be made atomic. */
6974    
6975                  for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6976                    {
6977                    if (oc->number == recno)
6978                      {
6979                      oc->flag = TRUE;
6980                      break;
6981                      }
6982                    }
6983                  }
6984    
6985                continue;  /* End of back ref handling */
6986                }
6987              }
6988    
6989            /* First pass, or a non-duplicated name. */
6990    
6991            goto HANDLE_REFERENCE;
6992    
6993    
6994          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
# Line 6444  for (;; ptr++) Line 7087  for (;; ptr++)
7087    
7088              if (called == NULL)              if (called == NULL)
7089                {                {
7090                if (find_parens(cd, NULL, recno,                if (recno > cd->final_bracount)
                     (options & PCRE_EXTENDED) != 0, utf) < 0)  
7091                  {                  {
7092                  *errorcodeptr = ERR15;                  *errorcodeptr = ERR15;
7093                  goto FAILED;                  goto FAILED;
# Line 6929  for (;; ptr++) Line 7571  for (;; ptr++)
7571          open_capitem *oc;          open_capitem *oc;
7572          recno = -escape;          recno = -escape;
7573    
7574          HANDLE_REFERENCE:    /* Come here from named backref handling */          /* Come here from named backref handling when the reference is to a
7575            single group (i.e. not to a duplicated name). */
7576    
7577            HANDLE_REFERENCE:
7578          if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;          if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7579          previous = code;          previous = code;
7580          *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;          *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
# Line 7058  for (;; ptr++) Line 7703  for (;; ptr++)
7703          *code++ = OP_PROP;          *code++ = OP_PROP;
7704          *code++ = PT_CLIST;          *code++ = PT_CLIST;
7705          *code++ = c;          *code++ = c;
7706          if (firstcharflags == REQ_UNSET) firstcharflags = zerofirstcharflags = REQ_NONE;          if (firstcharflags == REQ_UNSET)
7707              firstcharflags = zerofirstcharflags = REQ_NONE;
7708          break;          break;
7709          }          }
7710        }        }
# Line 7147  out the amount of memory needed, as well Line 7793  out the amount of memory needed, as well
7793  value of lengthptr distinguishes the two phases.  value of lengthptr distinguishes the two phases.
7794    
7795  Arguments:  Arguments:
7796    options        option bits, including any changes for this subpattern    options           option bits, including any changes for this subpattern
7797    codeptr        -> the address of the current code pointer    codeptr           -> the address of the current code pointer
7798    ptrptr         -> the address of the current pattern pointer    ptrptr            -> the address of the current pattern pointer
7799    errorcodeptr   -> pointer to error code variable    errorcodeptr      -> pointer to error code variable
7800    lookbehind     TRUE if this is a lookbehind assertion    lookbehind        TRUE if this is a lookbehind assertion
7801    reset_bracount TRUE to reset the count for each branch    reset_bracount    TRUE to reset the count for each branch
7802    skipbytes      skip this many bytes at start (for brackets and OP_COND)    skipbytes         skip this many bytes at start (for brackets and OP_COND)
7803    cond_depth     depth of nesting for conditional subpatterns    cond_depth        depth of nesting for conditional subpatterns
7804    firstcharptr    place to put the first required character    firstcharptr      place to put the first required character
7805    firstcharflagsptr place to put the first character flags, or a negative number    firstcharflagsptr place to put the first character flags, or a negative number
7806    reqcharptr     place to put the last required character    reqcharptr        place to put the last required character
7807    reqcharflagsptr place to put the last required character flags, or a negative number    reqcharflagsptr   place to put the last required character flags, or a negative number
7808    bcptr          pointer to the chain of currently open branches    bcptr             pointer to the chain of currently open branches
7809    cd             points to the data block with tables pointers etc.    cd                points to the data block with tables pointers etc.
7810    lengthptr      NULL during the real compile phase    lengthptr         NULL during the real compile phase
7811                   points to length accumulator during pre-compile phase                      points to length accumulator during pre-compile phase
7812    
7813  Returns:         TRUE on success  Returns:            TRUE on success
7814  */  */
7815    
7816  static BOOL  static BOOL
# Line 7615  do { Line 8261  do {
8261       switch (*scode)       switch (*scode)
8262         {         {
8263         case OP_CREF:         case OP_CREF:
8264         case OP_NCREF:         case OP_DNCREF:
8265         case OP_RREF:         case OP_RREF:
8266         case OP_NRREF:         case OP_DNRREF:
8267         case OP_DEF:         case OP_DEF:
8268         return FALSE;         return FALSE;
8269    
# Line 7701  return TRUE; Line 8347  return TRUE;
8347  discarded, because they can cause conflicts with actual literals that follow.  discarded, because they can cause conflicts with actual literals that follow.
8348  However, if we end up without a first char setting for an unanchored pattern,  However, if we end up without a first char setting for an unanchored pattern,
8349  it is worth scanning the regex to see if there is an initial asserted first  it is worth scanning the regex to see if there is an initial asserted first
8350  char. If all branches start with the same asserted char, or with a bracket all  char. If all branches start with the same asserted char, or with a
8351  of whose alternatives start with the same asserted char (recurse ad lib), then  non-conditional bracket all of whose alternatives start with the same asserted
8352  we return that char, otherwise -1.  char (recurse ad lib), then we return that char, with the flags set to zero or
8353    REQ_CASELESS; otherwise return zero with REQ_NONE in the flags.
8354    
8355  Arguments:  Arguments:
8356    code       points to start of expression (the bracket)    code       points to start of expression (the bracket)
8357    flags       points to the first char flags, or to REQ_NONE    flags      points to the first char flags, or to REQ_NONE
8358    inassert   TRUE if in an assertion    inassert   TRUE if in an assertion
8359    
8360  Returns:     the fixed first char, or 0 with REQ_NONE in flags  Returns:     the fixed first char, or 0 with REQ_NONE in flags
# Line 7744  do { Line 8391  do {
8391       case OP_ASSERT:       case OP_ASSERT:
8392       case OP_ONCE:       case OP_ONCE:
8393       case OP_ONCE_NC:       case OP_ONCE_NC:
      case OP_COND:  
8394       d = find_firstassertedchar(scode, &dflags, op == OP_ASSERT);       d = find_firstassertedchar(scode, &dflags, op == OP_ASSERT);
8395       if (dflags < 0)       if (dflags < 0)
8396         return 0;         return 0;
# Line 7789  return c; Line 8435  return c;
8435    
8436    
8437  /*************************************************  /*************************************************
8438    *     Add an entry to the name/number table      *
8439    *************************************************/
8440    
8441    /* This function is called between compiling passes to add an entry to the
8442    name/number table, maintaining alphabetical order. Checking for permitted
8443    and forbidden duplicates has already been done.
8444    
8445    Arguments:
8446      cd           the compile data block
8447      name         the name to add
8448      length       the length of the name
8449      groupno      the group number
8450    
8451    Returns:       nothing
8452    */
8453    
8454    static void
8455    add_name(compile_data *cd, const pcre_uchar *name, int length,
8456      unsigned int groupno)
8457    {
8458    int i;
8459    pcre_uchar *slot = cd->name_table;  /* Start of the first table entry */
8460    
8461    for (i = 0; i < cd->names_found; i++)
8462      {
8463      int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(length));  /* Stored name starts IMM2_SIZE units into the entry */
8464      if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8465        crc = -1; /* New name is a proper prefix of the longer stored name, so it sorts earlier */
8466    
8467      /* Make space in the table and break the loop for an earlier name. For a
8468      duplicate or later name, carry on. We do this for duplicates so that in the
8469      simple case (when ?(| is not used) they are in order of their numbers. In all
8470      cases they are in the order in which they appear in the pattern. */
8471    
8472      if (crc < 0)
8473        {
8474        memmove(slot + cd->name_entry_size, slot,
8475          IN_UCHARS((cd->names_found - i) * cd->name_entry_size));  /* Shift entry i and all later entries up by one entry */
8476        break;
8477        }
8478    
8479      /* Continue the loop for a later or duplicate name */
8480    
8481      slot += cd->name_entry_size;
8482      }
8483    
8484    PUT2(slot, 0, groupno);  /* Group number goes at the start of the entry; slot is either the gap just opened or the end of the table */
8485    memcpy(slot + IMM2_SIZE, name, IN_UCHARS(length));
8486    slot[IMM2_SIZE + length] = 0;  /* Zero-terminate the stored name */
8487    cd->names_found++;
8488    }
8489    
8490    
8491    
8492    /*************************************************
8493  *        Compile a Regular Expression            *  *        Compile a Regular Expression            *
8494  *************************************************/  *************************************************/
8495    
# Line 7875  new memory is obtained from malloc(). */ Line 8576  new memory is obtained from malloc(). */
8576    
8577  pcre_uchar cworkspace[COMPILE_WORK_SIZE];  pcre_uchar cworkspace[COMPILE_WORK_SIZE];
8578    
8579    /* This vector is used for remembering named groups during the pre-compile. In a
8580    similar way to cworkspace, it can be expanded using malloc() if necessary. */
8581    
8582    named_group named_groups[NAMED_GROUP_LIST_SIZE];
8583    
8584  /* Set this early so that early errors get offset 0. */  /* Set this early so that early errors get offset 0. */
8585    
8586  ptr = (const pcre_uchar *)pattern;  ptr = (const pcre_uchar *)pattern;
# Line 8137  cd->bracount = cd->final_bracount = 0; Line 8843  cd->bracount = cd->final_bracount = 0;
8843  cd->names_found = 0;  cd->names_found = 0;
8844  cd->name_entry_size = 0;  cd->name_entry_size = 0;
8845  cd->name_table = NULL;  cd->name_table = NULL;
8846    cd->dupnames = FALSE;
8847    cd->namedrefcount = 0;
8848  cd->start_code = cworkspace;  cd->start_code = cworkspace;
8849  cd->hwm = cworkspace;  cd->hwm = cworkspace;
8850  cd->start_workspace = cworkspace;  cd->start_workspace = cworkspace;
8851  cd->workspace_size = COMPILE_WORK_SIZE;  cd->workspace_size = COMPILE_WORK_SIZE;
8852    cd->named_groups = named_groups;
8853    cd->named_group_list_size = NAMED_GROUP_LIST_SIZE;
8854  cd->start_pattern = (const pcre_uchar *)pattern;  cd->start_pattern = (const pcre_uchar *)pattern;
8855  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
8856  cd->req_varyopt = 0;  cd->req_varyopt = 0;
# Line 8172  if (length > MAX_PATTERN_SIZE) Line 8882  if (length > MAX_PATTERN_SIZE)
8882    goto PCRE_EARLY_ERROR_RETURN;    goto PCRE_EARLY_ERROR_RETURN;
8883    }    }
8884    
8885  /* Compute the size of data block needed and get it, either from malloc or  /* If there are groups with duplicate names and there are also references by
8886  externally provided function. Integer overflow should no longer be possible  name, we must allow for the possibility of named references to duplicated
8887  because nowadays we limit the maximum value of cd->names_found and  groups. These require an extra data item each. */
 cd->name_entry_size. */  
8888    
8889  size = sizeof(REAL_PCRE) + (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);  if (cd->dupnames && cd->namedrefcount > 0)
8890  re = (REAL_PCRE *)(PUBL(malloc))(size);    length += cd->namedrefcount * IMM2_SIZE * sizeof(pcre_uchar);
8891    
8892    /* Compute the size of the data block for storing the compiled pattern. Integer
8893    overflow should no longer be possible because nowadays we limit the maximum
8894    value of cd->names_found and cd->name_entry_size. */
8895    
8896    size = sizeof(REAL_PCRE) +
8897      (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
8898    
8899    /* Get the memory. */
8900    
8901    re = (REAL_PCRE *)(PUBL(malloc))(size);
8902  if (re == NULL)  if (re == NULL)
8903    {    {
8904    errorcode = ERR21;    errorcode = ERR21;
# Line 8223  cd->final_bracount = cd->bracount;  /* S Line 8942  cd->final_bracount = cd->bracount;  /* S
8942  cd->assert_depth = 0;  cd->assert_depth = 0;
8943  cd->bracount = 0;  cd->bracount = 0;
8944  cd->max_lookbehind = 0;  cd->max_lookbehind = 0;
 cd->names_found = 0;  
8945  cd->name_table = (pcre_uchar *)re + re->name_table_offset;  cd->name_table = (pcre_uchar *)re + re->name_table_offset;
8946  codestart = cd->name_table + re->name_entry_size * re->name_count;  codestart = cd->name_table + re->name_entry_size * re->name_count;
8947  cd->start_code = codestart;  cd->start_code = codestart;
# Line 8234  cd->had_pruneorskip = FALSE; Line 8952  cd->had_pruneorskip = FALSE;
8952  cd->check_lookbehind = FALSE;  cd->check_lookbehind = FALSE;
8953  cd->open_caps = NULL;  cd->open_caps = NULL;
8954    
8955    /* If any named groups were found, create the name/number table from the list
8956    created in the first pass. */
8957    
8958    if (cd->names_found > 0)
8959      {
8960      int i = cd->names_found;
8961      named_group *ng = cd->named_groups;
8962      cd->names_found = 0;
8963      for (; i > 0; i--, ng++)
8964        add_name(cd, ng->name, ng->length, ng->number);
8965      if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
8966        (PUBL(free))((void *)cd->named_groups);
8967      }
8968    
8969  /* Set up a starting, non-extracting bracket, then compile the expression. On  /* Set up a starting, non-extracting bracket, then compile the expression. On
8970  error, errorcode will be set non-zero, so we don't need to look at the result  error, errorcode will be set non-zero, so we don't need to look at the result
8971  of the function here. */  of the function here. */
# Line 8297  if (cd->hwm > cd->start_workspace) Line 9029  if (cd->hwm > cd->start_workspace)
9029      }      }
9030    }    }
9031    
9032  /* If the workspace had to be expanded, free the new memory. Set the pointer to  /* If the workspace had to be expanded, free the new memory. Set the pointer to
9033  NULL to indicate that forward references have been filled in. */  NULL to indicate that forward references have been filled in. */
9034    
9035  if (cd->workspace_size > COMPILE_WORK_SIZE)  if (cd->workspace_size > COMPILE_WORK_SIZE)
9036    (PUBL(free))((void *)cd->start_workspace);    (PUBL(free))((void *)cd->start_workspace);
9037  cd->start_workspace = NULL;  cd->start_workspace = NULL;
9038    
9039  /* Give an error if there's back reference to a non-existent capturing  /* Give an error if there's back reference to a non-existent capturing
9040  subpattern. */  subpattern. */
9041    
9042  if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;  if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
9043    
9044    /* Unless disabled, check whether single character iterators can be
9045    auto-possessified. The function overwrites the appropriate opcode values. */
9046    
9047    if ((options & PCRE_NO_AUTO_POSSESSIFY) == 0)
9048      auto_possessify((pcre_uchar *)codestart, utf, cd);
9049    
9050  /* If there were any lookbehind assertions that contained OP_RECURSE  /* If there were any lookbehind assertions that contained OP_RECURSE
9051  (recursions or subroutine calls), a flag is set for them to be checked here,  (recursions or subroutine calls), a flag is set for them to be checked here,
9052  because they may contain forward references. Actual recursions cannot be fixed  because they may contain forward references. Actual recursions cannot be fixed
# Line 8506  if (code - codestart > length) Line 9244  if (code - codestart > length)
9244    }    }
9245  #endif   /* PCRE_DEBUG */  #endif   /* PCRE_DEBUG */
9246    
9247  /* Check for a pattern than can match an empty string, so that this information  /* Check for a pattern than can match an empty string, so that this information
9248  can be provided to applications. */  can be provided to applications. */
9249    
9250  do  do
# Line 8515  do Line 9253  do
9253      {      {
9254      re->flags |= PCRE_MATCH_EMPTY;      re->flags |= PCRE_MATCH_EMPTY;
9255      break;      break;
9256      }      }
9257    codestart += GET(codestart, 1);    codestart += GET(codestart, 1);
9258    }    }
9259  while (*codestart == OP_ALT);  while (*codestart == OP_ALT);
# Line 8530  return (pcre32 *)re; Line 9268  return (pcre32 *)re;
9268  }  }
9269    
9270  /* End of pcre_compile.c */  /* End of pcre_compile.c */
9271    

Legend:
Removed from v.1348  
changed lines
  Added in v.1384

  ViewVC Help
Powered by ViewVC 1.1.5