/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1363 by ph10, Tue Oct 1 16:54:40 2013 UTC revision 1396 by ph10, Sun Nov 10 19:04:34 2013 UTC
# Line 264  static const int verbcount = sizeof(verb Line 264  static const int verbcount = sizeof(verb
264  now all in a single string, to reduce the number of relocations when a shared  now all in a single string, to reduce the number of relocations when a shared
265  library is dynamically loaded. The list of lengths is terminated by a zero  library is dynamically loaded. The list of lengths is terminated by a zero
266  length entry. The first three must be alpha, lower, upper, as this is assumed  length entry. The first three must be alpha, lower, upper, as this is assumed
267  for handling case independence. */  for handling case independence. The indices for graph, print, and punct are
268    needed, so identify them. */
269    
270  static const char posix_names[] =  static const char posix_names[] =
271    STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0    STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
# Line 275  static const char posix_names[] = Line 276  static const char posix_names[] =
276  static const pcre_uint8 posix_name_lengths[] = {  static const pcre_uint8 posix_name_lengths[] = {
277    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
278    
279    #define PC_GRAPH  8
280    #define PC_PRINT  9
281    #define PC_PUNCT 10
282    
283    
284  /* Table of class bit maps for each POSIX class. Each class is formed from a  /* Table of class bit maps for each POSIX class. Each class is formed from a
285  base map, with an optional addition or removal of another map. Then, for some  base map, with an optional addition or removal of another map. Then, for some
286  classes, there is some additional tweaking: for [:blank:] the vertical space  classes, there is some additional tweaking: for [:blank:] the vertical space
# Line 302  static const int posix_class_maps[] = { Line 308  static const int posix_class_maps[] = {
308    cbit_xdigit,-1,          0              /* xdigit */    cbit_xdigit,-1,          0              /* xdigit */
309  };  };
310    
311  /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class  /* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
312  substitutes must be in the order of the names, defined above, and there are  Unicode property escapes. */
 both positive and negative cases. NULL means no substitute. */  
313    
314  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
315  static const pcre_uchar string_PNd[]  = {  static const pcre_uchar string_PNd[]  = {
# Line 329  static const pcre_uchar string_pXwd[] = Line 334  static const pcre_uchar string_pXwd[] =
334  static const pcre_uchar *substitutes[] = {  static const pcre_uchar *substitutes[] = {
335    string_PNd,           /* \D */    string_PNd,           /* \D */
336    string_pNd,           /* \d */    string_pNd,           /* \d */
337    string_PXsp,          /* \S */       /* NOTE: Xsp is Perl space */    string_PXsp,          /* \S */   /* Xsp is Perl space, but from 8.34, Perl */
338    string_pXsp,          /* \s */    string_pXsp,          /* \s */   /* space and POSIX space are the same. */
339    string_PXwd,          /* \W */    string_PXwd,          /* \W */
340    string_pXwd           /* \w */    string_pXwd           /* \w */
341  };  };
342    
343    /* The POSIX class substitutes must be in the order of the POSIX class names,
344    defined above, and there are both positive and negative cases. NULL means no
345    general substitute of a Unicode property escape (\p or \P). However, for some
346    POSIX classes (e.g. graph, print, punct) a special property code is compiled
347    directly. */
348    
349  static const pcre_uchar string_pL[] =   {  static const pcre_uchar string_pL[] =   {
350    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,    CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
351    CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };    CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
# Line 382  static const pcre_uchar *posix_substitut Line 393  static const pcre_uchar *posix_substitut
393    NULL,                 /* graph */    NULL,                 /* graph */
394    NULL,                 /* print */    NULL,                 /* print */
395    NULL,                 /* punct */    NULL,                 /* punct */
396    string_pXps,          /* space */    /* NOTE: Xps is POSIX space */    string_pXps,          /* space */   /* Xps is POSIX space, but from 8.34 */
397    string_pXwd,          /* word */    string_pXwd,          /* word  */   /* Perl and POSIX space are the same */
398    NULL,                 /* xdigit */    NULL,                 /* xdigit */
399    /* Negated cases */    /* Negated cases */
400    string_PL,            /* ^alpha */    string_PL,            /* ^alpha */
# Line 397  static const pcre_uchar *posix_substitut Line 408  static const pcre_uchar *posix_substitut
408    NULL,                 /* ^graph */    NULL,                 /* ^graph */
409    NULL,                 /* ^print */    NULL,                 /* ^print */
410    NULL,                 /* ^punct */    NULL,                 /* ^punct */
411    string_PXps,          /* ^space */   /* NOTE: Xps is POSIX space */    string_PXps,          /* ^space */  /* Xps is POSIX space, but from 8.34 */
412    string_PXwd,          /* ^word */    string_PXwd,          /* ^word */   /* Perl and POSIX space are the same */
413    NULL                  /* ^xdigit */    NULL                  /* ^xdigit */
414  };  };
415  #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))  #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
# Line 462  static const char error_texts[] = Line 473  static const char error_texts[] =
473    "POSIX collating elements are not supported\0"    "POSIX collating elements are not supported\0"
474    "this version of PCRE is compiled without UTF support\0"    "this version of PCRE is compiled without UTF support\0"
475    "spare error\0"  /** DEAD **/    "spare error\0"  /** DEAD **/
476    "character value in \\x{...} sequence is too large\0"    "character value in \\x{} or \\o{} is too large\0"
477    /* 35 */    /* 35 */
478    "invalid condition (?(0)\0"    "invalid condition (?(0)\0"
479    "\\C not allowed in lookbehind assertion\0"    "\\C not allowed in lookbehind assertion\0"
# Line 516  static const char error_texts[] = Line 527  static const char error_texts[] =
527    "character value in \\u.... sequence is too large\0"    "character value in \\u.... sequence is too large\0"
528    "invalid UTF-32 string\0"    "invalid UTF-32 string\0"
529    "setting UTF is disabled by the application\0"    "setting UTF is disabled by the application\0"
530      "non-hex character in \\x{} (closing brace missing?)\0"
531      /* 80 */
532      "non-octal character in \\o{} (closing brace missing?)\0"
533      "missing opening brace after \\o\0"
534      "parentheses are too deeply nested\0"
535      "invalid range in character class\0"
536      "group name must start with a non-digit\0"
537    ;    ;
538    
539  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 773  static const pcre_uint8 posspropstab[3][ Line 791  static const pcre_uint8 posspropstab[3][
791    { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */    { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
792  };  };
793    
794    /* This table is used when converting repeating opcodes into possessified
795    versions as a result of an explicit possessive quantifier such as ++. A zero
796    value means there is no possessified version - in those cases the item in
797    question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
798    because all relevant opcodes are less than that. */
799    
800    static const pcre_uint8 opcode_possessify[] = {
801      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
802      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
803    
804      0,                       /* NOTI */
805      OP_POSSTAR, 0,           /* STAR, MINSTAR */
806      OP_POSPLUS, 0,           /* PLUS, MINPLUS */
807      OP_POSQUERY, 0,          /* QUERY, MINQUERY */
808      OP_POSUPTO, 0,           /* UPTO, MINUPTO */
809      0,                       /* EXACT */
810      0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
811    
812      OP_POSSTARI, 0,          /* STARI, MINSTARI */
813      OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
814      OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
815      OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
816      0,                       /* EXACTI */
817      0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
818    
819      OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
820      OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
821      OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
822      OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
823      0,                       /* NOTEXACT */
824      0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
825    
826      OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
827      OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
828      OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
829      OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
830      0,                       /* NOTEXACTI */
831      0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
832    
833      OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
834      OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
835      OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
836      OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
837      0,                       /* TYPEEXACT */
838      0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
839    
840      OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
841      OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
842      OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
843      OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
844      0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
845    
846      0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
847      0, 0,                    /* REF, REFI */
848      0, 0,                    /* DNREF, DNREFI */
849      0, 0                     /* RECURSE, CALLOUT */
850    };
851    
852    
853    
854  /*************************************************  /*************************************************
# Line 879  return (*p == CHAR_RIGHT_CURLY_BRACKET); Line 955  return (*p == CHAR_RIGHT_CURLY_BRACKET);
955  *************************************************/  *************************************************/
956    
957  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
958  positive value for a simple escape such as \n, or 0 for a data character  positive value for a simple escape such as \n, or 0 for a data character which
959  which will be placed in chptr. A backreference to group n is returned as  will be placed in chptr. A backreference to group n is returned as negative n.
960  negative n. When UTF-8 is enabled, a positive value greater than 255 may  When UTF-8 is enabled, a positive value greater than 255 may be returned in
961  be returned in chptr.  chptr. On entry, ptr is pointing at the \. On exit, it is on the final
962  On entry,ptr is pointing at the \. On exit, it is on the final character of the  character of the escape sequence.
 escape sequence.  
963    
964  Arguments:  Arguments:
965    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
966    chptr          points to the data character    chptr          points to a returned data character
967    errorcodeptr   points to the errorcode variable    errorcodeptr   points to the errorcode variable
968    bracount       number of previous extracting brackets    bracount       number of previous extracting brackets
969    options        the options bits    options        the options bits
# Line 1092  else Line 1167  else
1167      break;      break;
1168    
1169      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
1170      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. Perl has changed
1171      the way Perl works seems to be as follows:      over the years. Nowadays \g{} for backreferences and \o{} for octal are
1172        recommended to avoid the ambiguities in the old syntax.
1173    
1174      Outside a character class, the digits are read as a decimal number. If the      Outside a character class, the digits are read as a decimal number. If the
1175      number is less than 10, or if there are that many previous extracting      number is less than 8 (used to be 10), or if there are that many previous
1176      left brackets, then it is a back reference. Otherwise, up to three octal      extracting left brackets, then it is a back reference. Otherwise, up to
1177      digits are read to form an escaped byte. Thus \123 is likely to be octal      three octal digits are read to form an escaped byte. Thus \123 is likely to
1178      123 (cf \0123, which is octal 012 followed by the literal 3). If the octal      be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1179      value is greater than 377, the least significant 8 bits are taken. Inside a      the octal value is greater than 377, the least significant 8 bits are
1180      character class, \ followed by a digit is always an octal number. */      taken. \8 and \9 are treated as the literal characters 8 and 9.
1181    
1182        Inside a character class, \ followed by a digit is always either a literal
1183        8 or 9 or an octal number. */
1184    
1185      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1186      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
# Line 1128  else Line 1207  else
1207          *errorcodeptr = ERR61;          *errorcodeptr = ERR61;
1208          break;          break;
1209          }          }
1210        if (s < 10 || s <= bracount)        if (s < 8 || s <= bracount)  /* Check for back reference */
1211          {          {
1212          escape = -s;          escape = -s;
1213          break;          break;
# Line 1136  else Line 1215  else
1215        ptr = oldptr;      /* Put the pointer back and fall through */        ptr = oldptr;      /* Put the pointer back and fall through */
1216        }        }
1217    
1218      /* Handle an octal number following \. If the first digit is 8 or 9, Perl      /* Handle a digit following \ when the number is not a back reference. If
1219      generates a binary zero byte and treats the digit as a following literal.      the first digit is 8 or 9, Perl used to generate a binary zero byte and
1220      Thus we have to pull back the pointer by one. */      then treat the digit as a following literal. At least by Perl 5.18 this
1221        changed so as not to insert the binary zero. */
1222    
1223      if ((c = *ptr) >= CHAR_8)      if ((c = *ptr) >= CHAR_8) break;
1224        {  
1225        ptr--;      /* Fall through with a digit less than 8 */
       c = 0;  
       break;  
       }  
1226    
1227      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
1228      larger first octal digit. The original code used just to take the least      larger first octal digit. The original code used just to take the least
# Line 1162  else Line 1239  else
1239  #endif  #endif
1240      break;      break;
1241    
1242      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \o is a relatively new Perl feature, supporting a more general way of
1243      than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.      specifying character codes in octal. The only supported form is \o{ddd}. */
1244      If not, { is treated as a data character. */  
1245        case CHAR_o:
1246        if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1247          {
1248          ptr += 2;
1249          c = 0;
1250          overflow = FALSE;
1251          while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
1252            {
1253            register pcre_uint32 cc = *ptr++;
1254            if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1255    #ifdef COMPILE_PCRE32
1256            if (c >= 0x20000000l) { overflow = TRUE; break; }
1257    #endif
1258            c = (c << 3) + cc - CHAR_0 ;
1259    #if defined COMPILE_PCRE8
1260            if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1261    #elif defined COMPILE_PCRE16
1262            if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1263    #elif defined COMPILE_PCRE32
1264            if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1265    #endif
1266            }
1267          if (overflow)
1268            {
1269            while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1270            *errorcodeptr = ERR34;
1271            }
1272          else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1273            {
1274            if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1275            }
1276          else *errorcodeptr = ERR80;
1277          }
1278        break;
1279    
1280        /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1281        numbers. Otherwise it is a lowercase x letter. */
1282    
1283      case CHAR_x:      case CHAR_x:
1284      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1285        {        {
       /* In JavaScript, \x must be followed by two hexadecimal numbers.  
       Otherwise it is a lowercase x letter. */  
1286        if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0        if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1287          && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)          && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1288          {          {
# Line 1187  else Line 1299  else
1299  #endif  #endif
1300            }            }
1301          }          }
1302        break;        }    /* End JavaScript handling */
       }  
1303    
1304      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)      /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1305        {      greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1306        const pcre_uchar *pt = ptr + 2;      digits. If not, { used to be treated as a data character. However, Perl
1307        seems to read hex digits up to the first non-such, and ignore the rest, so
1308        that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1309        now gives an error. */
1310    
1311        c = 0;      else
1312        overflow = FALSE;        {
1313        while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)        if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1314          {          {
1315          register pcre_uint32 cc = *pt++;          ptr += 2;
1316          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */          c = 0;
1317            overflow = FALSE;
1318            while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1319              {
1320              register pcre_uint32 cc = *ptr++;
1321              if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1322    
1323  #ifdef COMPILE_PCRE32  #ifdef COMPILE_PCRE32
1324          if (c >= 0x10000000l) { overflow = TRUE; break; }            if (c >= 0x10000000l) { overflow = TRUE; break; }
1325  #endif  #endif
1326    
1327  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1328          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */            if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1329          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));            c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1330  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
1331          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */            if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1332          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));            c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1333  #endif  #endif
1334    
1335  #if defined COMPILE_PCRE8  #if defined COMPILE_PCRE8
1336          if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }            if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1337  #elif defined COMPILE_PCRE16  #elif defined COMPILE_PCRE16
1338          if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }            if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1339  #elif defined COMPILE_PCRE32  #elif defined COMPILE_PCRE32
1340          if (utf && c > 0x10ffffU) { overflow = TRUE; break; }            if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1341  #endif  #endif
1342          }            }
1343    
1344        if (overflow)          if (overflow)
1345          {            {
1346          while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++;            while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1347          *errorcodeptr = ERR34;            *errorcodeptr = ERR34;
1348          }            }
1349    
1350        if (*pt == CHAR_RIGHT_CURLY_BRACKET)          else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1351          {            {
1352          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;            if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1353          ptr = pt;            }
         break;  
         }  
1354    
1355        /* If the sequence of hex digits does not end with '}', then we don't          /* If the sequence of hex digits does not end with '}', give an error.
1356        recognize this construct; fall through to the normal \x handling. */          We used just to recognize this construct and fall through to the normal
1357        }          \x handling, but nowadays Perl gives an error, which seems much more
1358            sensible, so we do too. */
1359    
1360      /* Read just a single-byte hex-defined char */          else *errorcodeptr = ERR79;
1361            }   /* End of \x{} processing */
1362    
1363      c = 0;        /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1364      while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)  
1365        {        else
1366        pcre_uint32 cc;                          /* Some compilers don't like */          {
1367        cc = *(++ptr);                           /* ++ in initializers */          c = 0;
1368            while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1369              {
1370              pcre_uint32 cc;                          /* Some compilers don't like */
1371              cc = *(++ptr);                           /* ++ in initializers */
1372  #ifndef EBCDIC  /* ASCII/UTF-8 coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1373        if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */            if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
1374        c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));            c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1375  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
1376        if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */            if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
1377        c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));            c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1378  #endif  #endif
1379        }            }
1380            }     /* End of \xdd handling */
1381          }       /* End of Perl-style \x handling */
1382      break;      break;
1383    
1384      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
# Line 1524  for (;;) Line 1649  for (;;)
1649    
1650      case OP_CALLOUT:      case OP_CALLOUT:
1651      case OP_CREF:      case OP_CREF:
1652      case OP_NCREF:      case OP_DNCREF:
1653      case OP_RREF:      case OP_RREF:
1654      case OP_NRREF:      case OP_DNRREF:
1655      case OP_DEF:      case OP_DEF:
1656      code += PRIV(OP_lengths)[*code];      code += PRIV(OP_lengths)[*code];
1657      break;      break;
# Line 1663  for (;;) Line 1788  for (;;)
1788      case OP_COMMIT:      case OP_COMMIT:
1789      case OP_CREF:      case OP_CREF:
1790      case OP_DEF:      case OP_DEF:
1791        case OP_DNCREF:
1792        case OP_DNRREF:
1793      case OP_DOLL:      case OP_DOLL:
1794      case OP_DOLLM:      case OP_DOLLM:
1795      case OP_EOD:      case OP_EOD:
1796      case OP_EODN:      case OP_EODN:
1797      case OP_FAIL:      case OP_FAIL:
     case OP_NCREF:  
     case OP_NRREF:  
1798      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
1799      case OP_PRUNE:      case OP_PRUNE:
1800      case OP_REVERSE:      case OP_REVERSE:
# Line 1764  for (;;) Line 1889  for (;;)
1889    
1890      switch (*cc)      switch (*cc)
1891        {        {
       case OP_CRPLUS:  
       case OP_CRMINPLUS:  
1892        case OP_CRSTAR:        case OP_CRSTAR:
1893        case OP_CRMINSTAR:        case OP_CRMINSTAR:
1894          case OP_CRPLUS:
1895          case OP_CRMINPLUS:
1896        case OP_CRQUERY:        case OP_CRQUERY:
1897        case OP_CRMINQUERY:        case OP_CRMINQUERY:
1898          case OP_CRPOSSTAR:
1899          case OP_CRPOSPLUS:
1900          case OP_CRPOSQUERY:
1901        return -1;        return -1;
1902    
1903        case OP_CRRANGE:        case OP_CRRANGE:
1904        case OP_CRMINRANGE:        case OP_CRMINRANGE:
1905          case OP_CRPOSRANGE:
1906        if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;        if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1907        branchlength += (int)GET2(cc,1);        branchlength += (int)GET2(cc,1);
1908        cc += 1 + 2 * IMM2_SIZE;        cc += 1 + 2 * IMM2_SIZE;
# Line 2366  for (code = first_significant_code(code Line 2495  for (code = first_significant_code(code
2495        case OP_CRMINSTAR:        case OP_CRMINSTAR:
2496        case OP_CRQUERY:        case OP_CRQUERY:
2497        case OP_CRMINQUERY:        case OP_CRMINQUERY:
2498          case OP_CRPOSSTAR:
2499          case OP_CRPOSQUERY:
2500        break;        break;
2501    
2502        default:                   /* Non-repeat => class must match */        default:                   /* Non-repeat => class must match */
2503        case OP_CRPLUS:            /* These repeats aren't empty */        case OP_CRPLUS:            /* These repeats aren't empty */
2504        case OP_CRMINPLUS:        case OP_CRMINPLUS:
2505          case OP_CRPOSPLUS:
2506        return FALSE;        return FALSE;
2507    
2508        case OP_CRRANGE:        case OP_CRRANGE:
2509        case OP_CRMINRANGE:        case OP_CRMINRANGE:
2510          case OP_CRPOSRANGE:
2511        if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */        if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
2512        break;        break;
2513        }        }
# Line 2650  switch(ptype) Line 2783  switch(ptype)
2783    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2784            PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;            PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2785    
2786    case PT_SPACE:    /* Perl space */    /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
2787    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||    means that Perl space and POSIX space are now identical. PCRE was changed
2788            c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)    at release 8.34. */
           == negated;  
2789    
2790      case PT_SPACE:    /* Perl space */
2791    case PT_PXSPACE:  /* POSIX space */    case PT_PXSPACE:  /* POSIX space */
2792    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||    switch(c)
2793            c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||      {
2794            c == CHAR_FF || c == CHAR_CR)      HSPACE_CASES:
2795            == negated;      VSPACE_CASES:
2796        return negated;
2797    
2798        default:
2799        return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
2800        }
2801      break;  /* Control never reaches here */
2802    
2803    case PT_WORD:    case PT_WORD:
2804    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||    return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
# Line 2818  switch(c) Line 2957  switch(c)
2957      return code + 2;      return code + 2;
2958      }      }
2959    
2960    /* Convert only if we have anough space. */    /* Convert only if we have enough space. */
2961    
2962    clist_src = PRIV(ucd_caseless_sets) + code[1];    clist_src = PRIV(ucd_caseless_sets) + code[1];
2963    clist_dest = list + 2;    clist_dest = list + 2;
2964    code += 2;    code += 2;
2965    
2966    do {    do {
      /* Early return if there is not enough space. */  
2967       if (clist_dest >= list + 8)       if (clist_dest >= list + 8)
2968         {         {
2969           /* Early return if there is not enough space. This should never
2970           happen, since all clists are shorter than 5 characters now. */
2971         list[2] = code[0];         list[2] = code[0];
2972         list[3] = code[1];         list[3] = code[1];
2973         return code;         return code;
2974         }         }
2975       *clist_dest++ = *clist_src;       *clist_dest++ = *clist_src;
2976       }       }
2977     while(*clist_src++ != NOTACHAR);    while(*clist_src++ != NOTACHAR);
2978    
2979    /* Enough space to store all characters. */    /* All characters are stored. The terminating NOTACHAR
2980      is copied from the clist itself. */
2981    
2982    list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;    list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
2983    return code;    return code;
# Line 2846  switch(c) Line 2987  switch(c)
2987    case OP_CLASS:    case OP_CLASS:
2988  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2989    case OP_XCLASS:    case OP_XCLASS:
   
2990    if (c == OP_XCLASS)    if (c == OP_XCLASS)
2991      end = code + GET(code, 0);      end = code + GET(code, 0) - 1;
2992    else    else
2993  #endif  #endif
2994      end = code + 32 / sizeof(pcre_uchar);      end = code + 32 / sizeof(pcre_uchar);
# Line 2859  switch(c) Line 2999  switch(c)
2999      case OP_CRMINSTAR:      case OP_CRMINSTAR:
3000      case OP_CRQUERY:      case OP_CRQUERY:
3001      case OP_CRMINQUERY:      case OP_CRMINQUERY:
3002        case OP_CRPOSSTAR:
3003        case OP_CRPOSQUERY:
3004      list[1] = TRUE;      list[1] = TRUE;
3005      end++;      end++;
3006      break;      break;
3007    
3008        case OP_CRPLUS:
3009        case OP_CRMINPLUS:
3010        case OP_CRPOSPLUS:
3011        end++;
3012        break;
3013    
3014      case OP_CRRANGE:      case OP_CRRANGE:
3015      case OP_CRMINRANGE:      case OP_CRMINRANGE:
3016        case OP_CRPOSRANGE:
3017      list[1] = (GET2(end, 1) == 0);      list[1] = (GET2(end, 1) == 0);
3018      end += 1 + 2 * IMM2_SIZE;      end += 1 + 2 * IMM2_SIZE;
3019      break;      break;
# Line 2895  Returns:      TRUE if the auto-possessif Line 3044  Returns:      TRUE if the auto-possessif
3044    
3045  static BOOL  static BOOL
3046  compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,  compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
3047    const pcre_uint32* base_list)    const pcre_uint32 *base_list, const pcre_uchar *base_end)
3048  {  {
3049  pcre_uchar c;  pcre_uchar c;
3050  pcre_uint32 list[8];  pcre_uint32 list[8];
3051  const pcre_uint32* chr_ptr;  const pcre_uint32 *chr_ptr;
3052  const pcre_uint32* ochr_ptr;  const pcre_uint32 *ochr_ptr;
3053  const pcre_uint32* list_ptr;  const pcre_uint32 *list_ptr;
3054    const pcre_uchar *next_code;
3055    const pcre_uint8 *class_bitset;
3056    const pcre_uint32 *set1, *set2, *set_end;
3057  pcre_uint32 chr;  pcre_uint32 chr;
3058    BOOL accepted, invert_bits;
3059    
3060    /* Note: the base_list[1] contains whether the current opcode has greedy
3061    (represented by a non-zero value) quantifier. This is different from
3062    other character type lists, which store here that the character iterator
3063    can match an empty string (also represented by a non-zero value). */
3064    
3065  for(;;)  for(;;)
3066    {    {
3067      /* All operations move the code pointer forward.
3068      Therefore infinite recursions are not possible. */
3069    
3070    c = *code;    c = *code;
3071    
3072    /* Skip over callouts */    /* Skip over callouts */
# Line 2925  for(;;) Line 3086  for(;;)
3086    switch(c)    switch(c)
3087      {      {
3088      case OP_END:      case OP_END:
3089      /* TRUE only in greedy case. The non-greedy case could be replaced by an      case OP_KETRPOS:
3090      OP_EXACT, but it is probably not worth it. (And note that OP_EXACT uses      /* TRUE only in greedy case. The non-greedy case could be replaced by
3091      more memory, which we cannot get at this stage.) */      an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3092        uses more memory, which we cannot get at this stage.) */
3093    
3094      return base_list[1] != 0;      return base_list[1] != 0;
3095    
3096      case OP_KET:      case OP_KET:
3097      /* If the bracket is capturing, and referenced by an OP_RECURSE, the      /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3098      non-greedy case cannot be converted to a possessive form. We do not test      it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3099      the bracket type at the moment, but we might do it in the future to improve      cannot be converted to a possessive form. */
     this condition. (But note that recursive calls are always atomic.) */  
3100    
3101      if (base_list[1] == 0) return FALSE;      if (base_list[1] == 0) return FALSE;
3102    
3103        switch(*(code - GET(code, 1)))
3104          {
3105          case OP_ASSERT:
3106          case OP_ASSERT_NOT:
3107          case OP_ASSERTBACK:
3108          case OP_ASSERTBACK_NOT:
3109          case OP_ONCE:
3110          case OP_ONCE_NC:
3111          /* Atomic sub-patterns and assertions can always auto-possessify their
3112          last iterator. */
3113          return TRUE;
3114          }
3115    
3116        code += PRIV(OP_lengths)[c];
3117        continue;
3118    
3119        case OP_ONCE:
3120        case OP_ONCE_NC:
3121        case OP_BRA:
3122        case OP_CBRA:
3123        next_code = code + GET(code, 1);
3124        code += PRIV(OP_lengths)[c];
3125    
3126        while (*next_code == OP_ALT)
3127          {
3128          if (!compare_opcodes(code, utf, cd, base_list, base_end)) return FALSE;
3129          code = next_code + 1 + LINK_SIZE;
3130          next_code += GET(next_code, 1);
3131          }
3132        continue;
3133    
3134        case OP_BRAZERO:
3135        case OP_BRAMINZERO:
3136    
3137        next_code = code + 1;
3138        if (*next_code != OP_BRA && *next_code != OP_CBRA
3139            && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
3140    
3141        do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3142    
3143        /* The bracket content will be checked by the
3144        OP_BRA/OP_CBRA case above. */
3145        next_code += 1 + LINK_SIZE;
3146        if (!compare_opcodes(next_code, utf, cd, base_list, base_end))
3147          return FALSE;
3148    
3149      code += PRIV(OP_lengths)[c];      code += PRIV(OP_lengths)[c];
3150      continue;      continue;
3151      }      }
# Line 2961  for(;;) Line 3169  for(;;)
3169      list_ptr = base_list;      list_ptr = base_list;
3170      }      }
3171    
3172      /* Character bitsets can also be compared to certain opcodes. */
3173    
3174      else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
3175    #ifdef COMPILE_PCRE8
3176          /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
3177          || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
3178    #endif
3179          )
3180        {
3181    #ifdef COMPILE_PCRE8
3182        if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
3183    #else
3184        if (base_list[0] == OP_CLASS)
3185    #endif
3186          {
3187          set1 = (pcre_uint32 *)(base_end - base_list[2]);
3188          list_ptr = list;
3189          }
3190        else
3191          {
3192          set1 = (pcre_uint32 *)(code - list[2]);
3193          list_ptr = base_list;
3194          }
3195    
3196        invert_bits = FALSE;
3197        switch(list_ptr[0])
3198          {
3199          case OP_CLASS:
3200          case OP_NCLASS:
3201          set2 = (pcre_uint32 *)
3202            ((list_ptr == list ? code : base_end) - list_ptr[2]);
3203          break;
3204    
3205          /* OP_XCLASS cannot be supported here, because its bitset
3206          is not necessarily complete. E.g: [a-\0x{200}] is stored
3207          as a character range, and the appropriate bits are not set. */
3208    
3209          case OP_NOT_DIGIT:
3210            invert_bits = TRUE;
3211            /* Fall through */
3212          case OP_DIGIT:
3213            set2 = (pcre_uint32 *)(cd->cbits + cbit_digit);
3214            break;
3215    
3216          case OP_NOT_WHITESPACE:
3217            invert_bits = TRUE;
3218            /* Fall through */
3219          case OP_WHITESPACE:
3220            set2 = (pcre_uint32 *)(cd->cbits + cbit_space);
3221            break;
3222    
3223          case OP_NOT_WORDCHAR:
3224            invert_bits = TRUE;
3225            /* Fall through */
3226          case OP_WORDCHAR:
3227            set2 = (pcre_uint32 *)(cd->cbits + cbit_word);
3228            break;
3229    
3230          default:
3231          return FALSE;
3232          }
3233    
3234        /* Compare 4 bytes to improve speed. */
3235        set_end = set1 + (32 / 4);
3236        if (invert_bits)
3237          {
3238          do
3239            {
3240            if ((*set1++ & ~(*set2++)) != 0) return FALSE;
3241            }
3242          while (set1 < set_end);
3243          }
3244        else
3245          {
3246          do
3247            {
3248            if ((*set1++ & *set2++) != 0) return FALSE;
3249            }
3250          while (set1 < set_end);
3251          }
3252    
3253        if (list[1] == 0) return TRUE;
3254        /* Might be an empty repeat. */
3255        continue;
3256        }
3257    
3258    /* Some property combinations also acceptable. Unicode property opcodes are    /* Some property combinations also acceptable. Unicode property opcodes are
3259    processed specially; the rest can be handled with a lookup table. */    processed specially; the rest can be handled with a lookup table. */
3260    
# Line 2968  for(;;) Line 3262  for(;;)
3262      {      {
3263      pcre_uint32 leftop, rightop;      pcre_uint32 leftop, rightop;
3264    
     if (list[1] != 0) return FALSE;   /* Must match at least one character */  
3265      leftop = base_list[0];      leftop = base_list[0];
3266      rightop = list[0];      rightop = list[0];
3267    
3268  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3269        accepted = FALSE; /* Always set in non-unicode case. */
3270      if (leftop == OP_PROP || leftop == OP_NOTPROP)      if (leftop == OP_PROP || leftop == OP_NOTPROP)
3271        {        {
3272        if (rightop == OP_EOD) return TRUE;        if (rightop == OP_EOD)
3273        if (rightop == OP_PROP || rightop == OP_NOTPROP)          accepted = TRUE;
3274          else if (rightop == OP_PROP || rightop == OP_NOTPROP)
3275          {          {
3276          int n;          int n;
3277          const pcre_uint8 *p;          const pcre_uint8 *p;
# Line 2997  for(;;) Line 3292  for(;;)
3292          n = propposstab[base_list[2]][list[2]];          n = propposstab[base_list[2]][list[2]];
3293          switch(n)          switch(n)
3294            {            {
3295            case 0: return FALSE;            case 0: break;
3296            case 1: return bothprop;            case 1: accepted = bothprop; break;
3297            case 2: return (base_list[3] == list[3]) != same;            case 2: accepted = (base_list[3] == list[3]) != same; break;
3298            case 3: return !same;            case 3: accepted = !same; break;
3299    
3300            case 4:  /* Left general category, right particular category */            case 4:  /* Left general category, right particular category */
3301            return risprop && catposstab[base_list[3]][list[3]] == same;            accepted = risprop && catposstab[base_list[3]][list[3]] == same;
3302              break;
3303    
3304            case 5:  /* Right general category, left particular category */            case 5:  /* Right general category, left particular category */
3305            return lisprop && catposstab[list[3]][base_list[3]] == same;            accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
3306              break;
3307    
3308            /* This code is logically tricky. Think hard before fiddling with it.            /* This code is logically tricky. Think hard before fiddling with it.
3309            The posspropstab table has four entries per row. Each row relates to            The posspropstab table has four entries per row. Each row relates to
3310            one of PCRE's special properties such as ALNUM or SPACE or WORD.            one of PCRE's special properties such as ALNUM or SPACE or WORD.
3311            Only WORD actually needs all four entries, but using repeats for the            Only WORD actually needs all four entries, but using repeats for the
3312            others means they can all use the same code below.            others means they can all use the same code below.
3313    
3314            The first two entries in each row are Unicode general categories, and            The first two entries in each row are Unicode general categories, and
3315            apply always, because all the characters they include are part of the            apply always, because all the characters they include are part of the
3316            PCRE character set. The third and fourth entries are a general and a            PCRE character set. The third and fourth entries are a general and a
# Line 3023  for(;;) Line 3320  for(;;)
3320            category contains more characters than the specials that are defined            category contains more characters than the specials that are defined
3321            for the property being tested against. Therefore, it cannot be used            for the property being tested against. Therefore, it cannot be used
3322            in a NOTPROP case.            in a NOTPROP case.
3323    
3324            Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.            Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3325            Underscore is covered by ucp_P or ucp_Po. */            Underscore is covered by ucp_P or ucp_Po. */
3326    
# Line 3031  for(;;) Line 3328  for(;;)
3328            case 7:  /* Left space vs right general category */            case 7:  /* Left space vs right general category */
3329            case 8:  /* Left word vs right general category */            case 8:  /* Left word vs right general category */
3330            p = posspropstab[n-6];            p = posspropstab[n-6];
3331            return risprop && lisprop ==            accepted = risprop && lisprop ==
3332              (list[3] != p[0] &&              (list[3] != p[0] &&
3333               list[3] != p[1] &&               list[3] != p[1] &&
3334              (list[3] != p[2] || !lisprop));              (list[3] != p[2] || !lisprop));
3335              break;
3336    
3337            case 9:   /* Right alphanum vs left general category */            case 9:   /* Right alphanum vs left general category */
3338            case 10:  /* Right space vs left general category */            case 10:  /* Right space vs left general category */
3339            case 11:  /* Right word vs left general category */            case 11:  /* Right word vs left general category */
3340            p = posspropstab[n-9];            p = posspropstab[n-9];
3341            return lisprop && risprop ==            accepted = lisprop && risprop ==
3342              (base_list[3] != p[0] &&              (base_list[3] != p[0] &&
3343               base_list[3] != p[1] &&               base_list[3] != p[1] &&
3344              (base_list[3] != p[2] || !risprop));              (base_list[3] != p[2] || !risprop));
3345              break;
3346    
3347            case 12:  /* Left alphanum vs right particular category */            case 12:  /* Left alphanum vs right particular category */
3348            case 13:  /* Left space vs right particular category */            case 13:  /* Left space vs right particular category */
3349            case 14:  /* Left word vs right particular category */            case 14:  /* Left word vs right particular category */
3350            p = posspropstab[n-12];            p = posspropstab[n-12];
3351            return risprop && lisprop ==            accepted = risprop && lisprop ==
3352              (catposstab[p[0]][list[3]] &&              (catposstab[p[0]][list[3]] &&
3353               catposstab[p[1]][list[3]] &&               catposstab[p[1]][list[3]] &&
3354              (list[3] != p[3] || !lisprop));              (list[3] != p[3] || !lisprop));
3355              break;
3356    
3357            case 15:  /* Right alphanum vs left particular category */            case 15:  /* Right alphanum vs left particular category */
3358            case 16:  /* Right space vs left particular category */            case 16:  /* Right space vs left particular category */
3359            case 17:  /* Right word vs left particular category */            case 17:  /* Right word vs left particular category */
3360            p = posspropstab[n-15];            p = posspropstab[n-15];
3361            return lisprop && risprop ==            accepted = lisprop && risprop ==
3362              (catposstab[p[0]][base_list[3]] &&              (catposstab[p[0]][base_list[3]] &&
3363               catposstab[p[1]][base_list[3]] &&               catposstab[p[1]][base_list[3]] &&
3364              (base_list[3] != p[3] || !risprop));              (base_list[3] != p[3] || !risprop));
3365              break;
3366            }            }
3367          }          }
       return FALSE;  
3368        }        }
3369    
3370      else      else
3371  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
3372    
3373      return leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&      accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3374             rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&             rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3375             autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];             autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3376    
3377        if (!accepted)
3378          return FALSE;
3379    
3380        if (list[1] == 0) return TRUE;
3381        /* Might be an empty repeat. */
3382        continue;
3383      }      }
3384    
3385    /* Control reaches here only if one of the items is a small character list.    /* Control reaches here only if one of the items is a small character list.
# Line 3186  for(;;) Line 3493  for(;;)
3493        case OP_EOD:    /* Can always possessify before \z */        case OP_EOD:    /* Can always possessify before \z */
3494        break;        break;
3495    
3496    #ifdef SUPPORT_UCP
3497        case OP_PROP:        case OP_PROP:
3498        case OP_NOTPROP:        case OP_NOTPROP:
3499        if (!check_char_prop(chr, list_ptr[2], list_ptr[3],        if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3500              list_ptr[0] == OP_NOTPROP))              list_ptr[0] == OP_NOTPROP))
3501          return FALSE;          return FALSE;
3502        break;        break;
3503    #endif
       /* The class comparisons work only when the class is the second item  
       of the pair, because there are at present no possessive forms of the  
       class opcodes. Note also that the "code" variable that is used below  
       points after the second item, and that the pointer for the first item  
       is not available, so even if there were possessive forms of the class  
       opcodes, the correct comparison could not be done. */  
3504    
3505        case OP_NCLASS:        case OP_NCLASS:
3506        if (chr > 255) return FALSE;        if (chr > 255) return FALSE;
3507        /* Fall through */        /* Fall through */
3508    
3509        case OP_CLASS:        case OP_CLASS:
       if (list_ptr != list) return FALSE;   /* Class is first opcode */  
3510        if (chr > 255) break;        if (chr > 255) break;
3511        if ((((pcre_uint8 *)(code - list_ptr[2] + 1))[chr >> 3] & (1 << (chr & 7))) != 0)        class_bitset = (pcre_uint8 *)
3512          return FALSE;          ((list_ptr == list ? code : base_end) - list_ptr[2]);
3513          if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
3514        break;        break;
3515    
3516  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3517        case OP_XCLASS:        case OP_XCLASS:
3518        if (list_ptr != list) return FALSE;   /* Class is first opcode */        if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
3519        if (PRIV(xclass)(chr, code - list_ptr[2] + 1 + LINK_SIZE, utf))            list_ptr[2] + LINK_SIZE, utf)) return FALSE;
         return FALSE;  
3520        break;        break;
3521  #endif  #endif
3522    
# Line 3257  auto_possessify(pcre_uchar *code, BOOL u Line 3558  auto_possessify(pcre_uchar *code, BOOL u
3558  {  {
3559  register pcre_uchar c;  register pcre_uchar c;
3560  const pcre_uchar *end;  const pcre_uchar *end;
3561    pcre_uchar *repeat_opcode;
3562  pcre_uint32 list[8];  pcre_uint32 list[8];
3563    
3564  for (;;)  for (;;)
# Line 3270  for (;;) Line 3572  for (;;)
3572        get_chr_property_list(code, utf, cd->fcc, list) : NULL;        get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3573      list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;      list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3574    
3575      if (end != NULL && compare_opcodes(end, utf, cd, list))      if (end != NULL && compare_opcodes(end, utf, cd, list, end))
3576        {        {
3577        switch(c)        switch(c)
3578          {          {
# Line 3309  for (;;) Line 3611  for (;;)
3611        }        }
3612      c = *code;      c = *code;
3613      }      }
3614      else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
3615        {
3616    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3617        if (c == OP_XCLASS)
3618          repeat_opcode = code + GET(code, 1);
3619        else
3620    #endif
3621          repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));
3622    
3623        c = *repeat_opcode;
3624        if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
3625          {
3626          /* end must not be NULL. */
3627          end = get_chr_property_list(code, utf, cd->fcc, list);
3628    
3629          list[1] = (c & 1) == 0;
3630    
3631          if (compare_opcodes(end, utf, cd, list, end))
3632            {
3633            switch (c)
3634              {
3635              case OP_CRSTAR:
3636              case OP_CRMINSTAR:
3637              *repeat_opcode = OP_CRPOSSTAR;
3638              break;
3639    
3640              case OP_CRPLUS:
3641              case OP_CRMINPLUS:
3642              *repeat_opcode = OP_CRPOSPLUS;
3643              break;
3644    
3645              case OP_CRQUERY:
3646              case OP_CRMINQUERY:
3647              *repeat_opcode = OP_CRPOSQUERY;
3648              break;
3649    
3650              case OP_CRRANGE:
3651              case OP_CRMINRANGE:
3652              *repeat_opcode = OP_CRPOSRANGE;
3653              break;
3654              }
3655            }
3656          }
3657        c = *code;
3658        }
3659    
3660    switch(c)    switch(c)
3661      {      {
# Line 3335  for (;;) Line 3682  for (;;)
3682        code += 2;        code += 2;
3683      break;      break;
3684    
3685    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3686      case OP_XCLASS:      case OP_XCLASS:
3687      code += GET(code, 1);      code += GET(code, 1);
3688      break;      break;
3689    #endif
3690    
3691      case OP_MARK:      case OP_MARK:
3692      case OP_PRUNE_ARG:      case OP_PRUNE_ARG:
# Line 3446  class, but [abc[:x\]pqr:]] is (so that a Line 3795  class, but [abc[:x\]pqr:]] is (so that a
3795  below handles the special case of \], but does not try to do any other escape  below handles the special case of \], but does not try to do any other escape
3796  processing. This makes it different from Perl for cases such as [:l\ower:]  processing. This makes it different from Perl for cases such as [:l\ower:]
3797  where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize  where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
3798  "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,  "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
3799  I think.  I think.
3800    
3801  A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.  A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
# Line 4097  for (;; ptr++) Line 4446  for (;; ptr++)
4446    /* Get next character in the pattern */    /* Get next character in the pattern */
4447    
4448    c = *ptr;    c = *ptr;
4449    
4450    /* If we are at the end of a nested substitution, revert to the outer level    /* If we are at the end of a nested substitution, revert to the outer level
4451    string. Nesting only happens one level deep. */    string. Nesting only happens one level deep. */
4452    
# Line 4199  for (;; ptr++) Line 4548  for (;; ptr++)
4548          }          }
4549        goto NORMAL_CHAR;        goto NORMAL_CHAR;
4550        }        }
4551        /* Control does not reach here. */
4552      }      }
4553    
4554    /* Fill in length of a previous callout, except when the next thing is    /* In extended mode, skip white space and comments. We need a loop in order
4555    a quantifier. */    to check for more white space and more comments after a comment. */
4556    
   is_quantifier =  
     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||  
     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));  
   
   if (!is_quantifier && previous_callout != NULL &&  
        after_manual_callout-- <= 0)  
     {  
     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */  
       complete_callout(previous_callout, ptr, cd);  
     previous_callout = NULL;  
     }  
   
   /* In extended mode, skip white space and comments. */  
   
4557    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
4558      {      {
4559      if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;      for (;;)
     if (c == CHAR_NUMBER_SIGN)  
4560        {        {
4561          while (MAX_255(c) && (cd->ctypes[c] & ctype_space) != 0) c = *(++ptr);
4562          if (c != CHAR_NUMBER_SIGN) break;
4563        ptr++;        ptr++;
4564        while (*ptr != CHAR_NULL)        while (*ptr != CHAR_NULL)
4565          {          {
4566          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }          if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
4567              {                          /* IS_NEWLINE sets cd->nllen. */
4568              ptr += cd->nllen;
4569              break;
4570              }
4571          ptr++;          ptr++;
4572  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
4573          if (utf) FORWARDCHAR(ptr);          if (utf) FORWARDCHAR(ptr);
4574  #endif  #endif
4575          }          }
4576        if (*ptr != CHAR_NULL) continue;        c = *ptr;     /* Either NULL or the char after a newline */
4577          }
4578        }
4579    
4580        /* Else fall through to handle end of string */    /* See if the next thing is a quantifier. */
4581        c = 0;  
4582        }    is_quantifier =
4583        c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4584        (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4585    
4586      /* Fill in length of a previous callout, except when the next thing is a
4587      quantifier or when processing a property substitution string in UCP mode. */
4588    
4589      if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4590           after_manual_callout-- <= 0)
4591        {
4592        if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
4593          complete_callout(previous_callout, ptr, cd);
4594        previous_callout = NULL;
4595      }      }
4596    
4597    /* No auto callout for quantifiers. */    /* Create auto callout, except for quantifiers, or while processing property
4598      strings that are substituted for \w etc in UCP mode. */
4599    
4600    if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)    if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4601      {      {
4602      previous_callout = code;      previous_callout = code;
4603      code = auto_callout(code, ptr, cd);      code = auto_callout(code, ptr, cd);
4604      }      }
4605    
4606      /* Process the next pattern item. */
4607    
4608    switch(c)    switch(c)
4609      {      {
4610      /* ===================================================================*/      /* ===================================================================*/
4611      case 0:                        /* The branch terminates at string end */      case CHAR_NULL:                /* The branch terminates at string end */
4612      case CHAR_VERTICAL_LINE:       /* or | or ) */      case CHAR_VERTICAL_LINE:       /* or | or ) */
4613      case CHAR_RIGHT_PARENTHESIS:      case CHAR_RIGHT_PARENTHESIS:
4614      *firstcharptr = firstchar;      *firstcharptr = firstchar;
# Line 4493  for (;; ptr++) Line 4851  for (;; ptr++)
4851            posix_class = 0;            posix_class = 0;
4852    
4853          /* When PCRE_UCP is set, some of the POSIX classes are converted to          /* When PCRE_UCP is set, some of the POSIX classes are converted to
4854          different escape sequences that use Unicode properties. */          different escape sequences that use Unicode properties \p or \P. Others
4855            that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
4856            directly. */
4857    
4858  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4859          if ((options & PCRE_UCP) != 0)          if ((options & PCRE_UCP) != 0)
4860            {            {
4861              unsigned int ptype = 0;
4862            int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);            int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
4863    
4864              /* The posix_substitutes table specifies which POSIX classes can be
4865              converted to \p or \P items. */
4866    
4867            if (posix_substitutes[pc] != NULL)            if (posix_substitutes[pc] != NULL)
4868              {              {
4869              nestptr = tempptr + 1;              nestptr = tempptr + 1;
4870              ptr = posix_substitutes[pc] - 1;              ptr = posix_substitutes[pc] - 1;
4871              continue;              continue;
4872              }              }
4873    
4874              /* There are three other classes that generate special property calls
4875              that are recognized only in an XCLASS. */
4876    
4877              else switch(posix_class)
4878                {
4879                case PC_GRAPH:
4880                ptype = PT_PXGRAPH;
4881                /* Fall through */
4882                case PC_PRINT:
4883                if (ptype == 0) ptype = PT_PXPRINT;
4884                /* Fall through */
4885                case PC_PUNCT:
4886                if (ptype == 0) ptype = PT_PXPUNCT;
4887                *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
4888                *class_uchardata++ = ptype;
4889                *class_uchardata++ = 0;
4890                ptr = tempptr + 1;
4891                continue;
4892    
4893                /* For all other POSIX classes, no special action is taken in UCP
4894                mode. Fall through to the non-UCP case. */
4895    
4896                default:
4897                break;
4898                }
4899            }            }
4900  #endif  #endif
4901          /* In the non-UCP case, we build the bit map for the POSIX class in a          /* In the non-UCP case, or when UCP makes no difference, we build the
4902          chunk of local store because we may be adding and subtracting from it,          bit map for the POSIX class in a chunk of local store because we may be
4903          and we don't want to subtract bits that may be in the main map already.          adding and subtracting from it, and we don't want to subtract bits that
4904          At the end we or the result into the bit map that is being built. */          may be in the main map already. At the end we or the result into the
4905            bit map that is being built. */
4906    
4907          posix_class *= 3;          posix_class *= 3;
4908    
# Line 4627  for (;; ptr++) Line 5019  for (;; ptr++)
5019              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
5020              continue;              continue;
5021    
5022              /* Perl 5.004 onwards omits VT from \s, but we must preserve it              /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5023              if it was previously set by something earlier in the character              5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5024              class. Luckily, the value of CHAR_VT is 0x0b in both ASCII and              previously set by something earlier in the character class.
5025              EBCDIC, so we lazily just adjust the appropriate bit. */              Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5026                we could just adjust the appropriate bit. From PCRE 8.34 we no
5027                longer treat \s and \S specially. */
5028    
5029              case ESC_s:              case ESC_s:
5030              classbits[0] |= cbits[cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
             classbits[1] |= cbits[cbit_space+1] & ~0x08;  
             for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];  
5031              continue;              continue;
5032    
5033              case ESC_S:              case ESC_S:
5034              should_flip_negation = TRUE;              should_flip_negation = TRUE;
5035              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */  
5036              continue;              continue;
5037    
5038              /* The rest apply in both UCP and non-UCP cases. */              /* The rest apply in both UCP and non-UCP cases. */
# Line 4762  for (;; ptr++) Line 5153  for (;; ptr++)
5153          else          else
5154  #endif  #endif
5155          d = *ptr;  /* Not UTF-8 mode */          d = *ptr;  /* Not UTF-8 mode */
5156    
5157          /* The second part of a range can be a single-character escape, but          /* The second part of a range can be a single-character escape
5158          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          sequence, but not any of the other escapes. Perl treats a hyphen as a
5159          in such circumstances. */          literal in such circumstances. However, in Perl's warning mode, a
5160            warning is given, so PCRE now faults it as it is almost certainly a
5161          if (!inescq && d == CHAR_BACKSLASH)          mistake on the user's part. */
5162            {  
5163            int descape;          if (!inescq)
5164            descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);            {
5165            if (*errorcodeptr != 0) goto FAILED;            if (d == CHAR_BACKSLASH)
   
           /* \b is backspace; any other special means the '-' was literal. */  
   
           if (descape != 0)  
5166              {              {
5167              if (descape == ESC_b) d = CHAR_BS; else              int descape;
5168                descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
5169                if (*errorcodeptr != 0) goto FAILED;
5170    
5171                /* 0 means a character was put into d; \b is backspace; any other
5172                special causes an error. */
5173    
5174                if (descape != 0)
5175                {                {
5176                ptr = oldptr;                if (descape == ESC_b) d = CHAR_BS; else
5177                goto CLASS_SINGLE_CHARACTER;  /* A few lines below */                  {
5178                    *errorcodeptr = ERR83;
5179                    goto FAILED;
5180                    }
5181                }                }
5182              }              }
5183            }  
5184              /* A hyphen followed by a POSIX class is treated in the same way. */
5185    
5186              else if (d == CHAR_LEFT_SQUARE_BRACKET &&
5187                       (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
5188                        ptr[1] == CHAR_EQUALS_SIGN) &&
5189                       check_posix_syntax(ptr, &tempptr))
5190                {
5191                *errorcodeptr = ERR83;
5192                goto FAILED;
5193                }
5194              }
5195    
5196          /* Check that the two values are in the correct order. Optimize          /* Check that the two values are in the correct order. Optimize
5197          one-character ranges. */          one-character ranges. */
# Line 5045  for (;; ptr++) Line 5453  for (;; ptr++)
5453      insert something before it. */      insert something before it. */
5454    
5455      tempcode = previous;      tempcode = previous;
5456    
5457        /* Before checking for a possessive quantifier, we must skip over
5458        whitespace and comments in extended mode because Perl allows white space at
5459        this point. */
5460    
5461        if ((options & PCRE_EXTENDED) != 0)
5462          {
5463          const pcre_uchar *p = ptr + 1;
5464          for (;;)
5465            {
5466            while (MAX_255(*p) && (cd->ctypes[*p] & ctype_space) != 0) p++;
5467            if (*p != CHAR_NUMBER_SIGN) break;
5468            p++;
5469            while (*p != CHAR_NULL)
5470              {
5471              if (IS_NEWLINE(p))         /* For non-fixed-length newline cases, */
5472                {                        /* IS_NEWLINE sets cd->nllen. */
5473                p += cd->nllen;
5474                break;
5475                }
5476              p++;
5477    #ifdef SUPPORT_UTF
5478              if (utf) FORWARDCHAR(p);
5479    #endif
5480              }           /* Loop for comment characters */
5481            }             /* Loop for multiple comments */
5482          ptr = p - 1;    /* Character before the next significant one. */
5483          }
5484    
5485      /* If the next character is '+', we have a possessive quantifier. This      /* If the next character is '+', we have a possessive quantifier. This
5486      implies greediness, whatever the setting of the PCRE_UNGREEDY option.      implies greediness, whatever the setting of the PCRE_UNGREEDY option.
# Line 5338  for (;; ptr++) Line 5774  for (;; ptr++)
5774      opcodes such as BRA and CBRA, as this is the place where they get converted      opcodes such as BRA and CBRA, as this is the place where they get converted
5775      into the more special varieties such as BRAPOS and SBRA. A test for >=      into the more special varieties such as BRAPOS and SBRA. A test for >=
5776      OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,      OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5777      ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow      ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND.
5778      repetition of assertions, but now it does, for Perl compatibility. */      Originally, PCRE did not allow repetition of assertions, but now it does,
5779        for Perl compatibility. */
5780    
5781      else if (*previous >= OP_ASSERT && *previous <= OP_COND)      else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5782        {        {
# Line 5357  for (;; ptr++) Line 5794  for (;; ptr++)
5794        /* There is no sense in actually repeating assertions. The only potential        /* There is no sense in actually repeating assertions. The only potential
5795        use of repetition is in cases when the assertion is optional. Therefore,        use of repetition is in cases when the assertion is optional. Therefore,
5796        if the minimum is greater than zero, just ignore the repeat. If the        if the minimum is greater than zero, just ignore the repeat. If the
5797        maximum is not not zero or one, set it to 1. */        maximum is not zero or one, set it to 1. */
5798    
5799        if (*previous < OP_ONCE)    /* Assertion */        if (*previous < OP_ONCE)    /* Assertion */
5800          {          {
# Line 5730  for (;; ptr++) Line 6167  for (;; ptr++)
6167        goto FAILED;        goto FAILED;
6168        }        }
6169    
6170      /* If the character following a repeat is '+', or if certain optimization      /* If the character following a repeat is '+', possessive_quantifier is
6171      tests above succeeded, possessive_quantifier is TRUE. For some opcodes,      TRUE. For some opcodes, there are special alternative opcodes for this
6172      there are special alternative opcodes for this case. For anything else, we      case. For anything else, we wrap the entire repeated item inside OP_ONCE
6173      wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'      brackets. Logically, the '+' notation is just syntactic sugar, taken from
6174      notation is just syntactic sugar, taken from Sun's Java package, but the      Sun's Java package, but the special opcodes can optimize it.
     special opcodes can optimize it.  
6175    
6176      Some (but not all) possessively repeated subpatterns have already been      Some (but not all) possessively repeated subpatterns have already been
6177      completely handled in the code just above. For them, possessive_quantifier      completely handled in the code just above. For them, possessive_quantifier
6178      is always FALSE at this stage.      is always FALSE at this stage. Note that the repeated item starts at
6179        tempcode, not at previous, which might be the first part of a string whose
6180      Note that the repeated item starts at tempcode, not at previous, which      (former) last char we repeated. */
     might be the first part of a string whose (former) last char we repeated.  
   
     Possessifying an 'exact' quantifier has no effect, so we can ignore it. But  
     an 'upto' may follow. We skip over an 'exact' item, and then test the  
     length of what remains before proceeding. */  
6181    
6182      if (possessive_quantifier)      if (possessive_quantifier)
6183        {        {
6184        int len;        int len;
6185    
6186        if (*tempcode == OP_TYPEEXACT)        /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
6187          However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
6188          {5,}, or {5,10}). We skip over an EXACT item; if the length of what
6189          remains is greater than zero, there's a further opcode that can be
6190          handled. If not, do nothing, leaving the EXACT alone. */
6191    
6192          switch(*tempcode)
6193            {
6194            case OP_TYPEEXACT:
6195          tempcode += PRIV(OP_lengths)[*tempcode] +          tempcode += PRIV(OP_lengths)[*tempcode] +
6196            ((tempcode[1 + IMM2_SIZE] == OP_PROP            ((tempcode[1 + IMM2_SIZE] == OP_PROP
6197            || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);            || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
6198            break;
6199    
6200        else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)          /* CHAR opcodes are used for exacts whose count is 1. */
6201          {  
6202            case OP_CHAR:
6203            case OP_CHARI:
6204            case OP_NOT:
6205            case OP_NOTI:
6206            case OP_EXACT:
6207            case OP_EXACTI:
6208            case OP_NOTEXACT:
6209            case OP_NOTEXACTI:
6210          tempcode += PRIV(OP_lengths)[*tempcode];          tempcode += PRIV(OP_lengths)[*tempcode];
6211  #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
6212          if (utf && HAS_EXTRALEN(tempcode[-1]))          if (utf && HAS_EXTRALEN(tempcode[-1]))
6213            tempcode += GET_EXTRALEN(tempcode[-1]);            tempcode += GET_EXTRALEN(tempcode[-1]);
6214  #endif  #endif
6215            break;
6216    
6217            /* For the class opcodes, the repeat operator appears at the end;
6218            adjust tempcode to point to it. */
6219    
6220            case OP_CLASS:
6221            case OP_NCLASS:
6222            tempcode += 1 + 32/sizeof(pcre_uchar);
6223            break;
6224    
6225    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6226            case OP_XCLASS:
6227            tempcode += GET(tempcode, 1);
6228            break;
6229    #endif
6230          }          }
6231    
6232          /* If tempcode is equal to code (which points to the end of the repeated
6233          item), it means we have skipped an EXACT item but there is no following
6234          QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
6235          all other cases, tempcode will be pointing to the repeat opcode, and will
6236          be less than code, so the value of len will be greater than 0. */
6237    
6238        len = (int)(code - tempcode);        len = (int)(code - tempcode);
6239          if (len > 0)
6240            {
6241            unsigned int repcode = *tempcode;
6242    
6243            /* There is a table for possessifying opcodes, all of which are less
6244            than OP_CALLOUT. A zero entry means there is no possessified version.
6245            */
6246    
6247            if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
6248              *tempcode = opcode_possessify[repcode];
6249    
6250            /* For opcodes without a special possessified version, wrap the item in
6251            ONCE brackets. Because we are moving code along, we must ensure that any
6252            pending recursive references are updated. */
6253    
6254            else
6255              {
6256              *code = OP_END;
6257              adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
6258              memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6259              code += 1 + LINK_SIZE;
6260              len += 1 + LINK_SIZE;
6261              tempcode[0] = OP_ONCE;
6262              *code++ = OP_KET;
6263              PUTINC(code, 0, len);
6264              PUT(tempcode, 1, len);
6265              }
6266            }
6267    
6268    #ifdef NEVER
6269        if (len > 0) switch (*tempcode)        if (len > 0) switch (*tempcode)
6270          {          {
6271          case OP_STAR:  *tempcode = OP_POSSTAR; break;          case OP_STAR:  *tempcode = OP_POSSTAR; break;
# Line 5794  for (;; ptr++) Line 6293  for (;; ptr++)
6293          case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;          case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
6294          case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;          case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
6295    
6296            case OP_CRSTAR:   *tempcode = OP_CRPOSSTAR; break;
6297            case OP_CRPLUS:   *tempcode = OP_CRPOSPLUS; break;
6298            case OP_CRQUERY:  *tempcode = OP_CRPOSQUERY; break;
6299            case OP_CRRANGE:  *tempcode = OP_CRPOSRANGE; break;
6300    
6301          /* Because we are moving code along, we must ensure that any          /* Because we are moving code along, we must ensure that any
6302          pending recursive references are updated. */          pending recursive references are updated. */
6303    
# Line 5809  for (;; ptr++) Line 6313  for (;; ptr++)
6313          PUT(tempcode, 1, len);          PUT(tempcode, 1, len);
6314          break;          break;
6315          }          }
6316    #endif
6317        }        }
6318    
6319      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 5997  for (;; ptr++) Line 6502  for (;; ptr++)
6502          tempptr = ptr;          tempptr = ptr;
6503    
6504          /* A condition can be an assertion, a number (referring to a numbered          /* A condition can be an assertion, a number (referring to a numbered
6505          group), a name (referring to a named group), or 'R', referring to          group's having been set), a name (referring to a named group), or 'R',
6506          recursion. R<digits> and R&name are also permitted for recursion tests.          referring to recursion. R<digits> and R&name are also permitted for
6507            recursion tests.
6508          There are several syntaxes for testing a named group: (?(name)) is used  
6509          by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).          There are ways of testing a named group: (?(name)) is used by Python;
6510            Perl 5.10 onwards uses (?(<name>) or (?('name')).
6511          There are two unfortunate ambiguities, caused by history. (a) 'R' can  
6512          be the recursive thing or the name 'R' (and similarly for 'R' followed          There is one unfortunate ambiguity, caused by history. 'R' can be the
6513          by digits), and (b) a number could be a name that consists of digits.          recursive thing or the name 'R' (and similarly for 'R' followed by
6514          In both cases, we look for a name first; if not found, we try the other          digits). We look for a name first; if not found, we try the other case.
         cases.  
6515    
6516          For compatibility with auto-callouts, we allow a callout to be          For compatibility with auto-callouts, we allow a callout to be
6517          specified before a condition that is an assertion. First, check for the          specified before a condition that is an assertion. First, check for the
# Line 6031  for (;; ptr++) Line 6535  for (;; ptr++)
6535                 tempptr[2] == CHAR_LESS_THAN_SIGN))                 tempptr[2] == CHAR_LESS_THAN_SIGN))
6536            break;            break;
6537    
6538          /* Most other conditions use OP_CREF (a couple change to OP_RREF          /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6539          below), and all need to skip 1+IMM2_SIZE bytes at the start of the group. */          need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6540    
6541          code[1+LINK_SIZE] = OP_CREF;          code[1+LINK_SIZE] = OP_CREF;
6542          skipbytes = 1+IMM2_SIZE;          skipbytes = 1+IMM2_SIZE;
# Line 6040  for (;; ptr++) Line 6544  for (;; ptr++)
6544    
6545          /* Check for a test for recursion in a named group. */          /* Check for a test for recursion in a named group. */
6546    
6547          if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)          ptr++;
6548            if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND)
6549            {            {
6550            terminator = -1;            terminator = -1;
6551            ptr += 2;            ptr += 2;
# Line 6048  for (;; ptr++) Line 6553  for (;; ptr++)
6553            }            }
6554    
6555          /* Check for a test for a named group's having been set, using the Perl          /* Check for a test for a named group's having been set, using the Perl
6556          syntax (?(<name>) or (?('name') */          syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6557            syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). */
6558    
6559          else if (ptr[1] == CHAR_LESS_THAN_SIGN)          else if (*ptr == CHAR_LESS_THAN_SIGN)
6560            {            {
6561            terminator = CHAR_GREATER_THAN_SIGN;            terminator = CHAR_GREATER_THAN_SIGN;
6562            ptr++;            ptr++;
6563            }            }
6564          else if (ptr[1] == CHAR_APOSTROPHE)          else if (*ptr == CHAR_APOSTROPHE)
6565            {            {
6566            terminator = CHAR_APOSTROPHE;            terminator = CHAR_APOSTROPHE;
6567            ptr++;            ptr++;
# Line 6063  for (;; ptr++) Line 6569  for (;; ptr++)
6569          else          else
6570            {            {
6571            terminator = CHAR_NULL;            terminator = CHAR_NULL;
6572            if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);            if (*ptr == CHAR_MINUS || *ptr == CHAR_PLUS) refsign = *ptr++;
6573                else if (IS_DIGIT(*ptr)) refsign = 0;
6574            }            }
6575    
6576          /* We now expect to read a name; any thing else is an error */          /* Handle a number */
6577    
6578          if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)          if (refsign >= 0)
6579            {            {
6580            ptr += 1;  /* To get the right offset */            recno = 0;
6581            *errorcodeptr = ERR28;            while (IS_DIGIT(*ptr))
6582            goto FAILED;              {
6583            }              recno = recno * 10 + (int)(*ptr - CHAR_0);
6584                ptr++;
6585          /* Read the name, but also get it as a number if it's all digits */              }
6586              }
6587    
6588            /* Otherwise we expect to read a name; anything else is an error. When
6589            a name is one of a number of duplicates, a different opcode is used and
6590            it needs more memory. Unfortunately we cannot tell whether a name is a
6591            duplicate in the first pass, so we have to allow for more memory. */
6592    
6593          recno = 0;          else
         name = ++ptr;  
         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)  
6594            {            {
6595            if (recno >= 0)            if (IS_DIGIT(*ptr))
6596              recno = (IS_DIGIT(*ptr))? recno * 10 + (int)(*ptr - CHAR_0) : -1;              {
6597            ptr++;              *errorcodeptr = ERR84;
6598                goto FAILED;
6599                }
6600              if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_word) == 0)
6601                {
6602                *errorcodeptr = ERR28;   /* Assertion expected */
6603                goto FAILED;
6604                }
6605              name = ptr++;
6606              while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
6607                {
6608                ptr++;
6609                }
6610              namelen = (int)(ptr - name);
6611              if (lengthptr != NULL) *lengthptr += IMM2_SIZE;
6612            }            }
6613          namelen = (int)(ptr - name);  
6614            /* Check the terminator */
6615    
6616          if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||          if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6617              *ptr++ != CHAR_RIGHT_PARENTHESIS)              *ptr++ != CHAR_RIGHT_PARENTHESIS)
6618            {            {
6619            ptr--;      /* Error offset */            ptr--;                  /* Error offset */
6620            *errorcodeptr = ERR26;            *errorcodeptr = ERR26;  /* Malformed number or name */
6621            goto FAILED;            goto FAILED;
6622            }            }
6623    
# Line 6100  for (;; ptr++) Line 6626  for (;; ptr++)
6626          if (lengthptr != NULL) break;          if (lengthptr != NULL) break;
6627    
6628          /* In the real compile we do the work of looking for the actual          /* In the real compile we do the work of looking for the actual
6629          reference. If the string started with "+" or "-" we require the rest to          reference. If refsign is not negative, it means we have a number in
6630          be digits, in which case recno will be set. */          recno. */
6631    
6632          if (refsign > 0)          if (refsign >= 0)
6633            {            {
6634            if (recno <= 0)            if (recno <= 0)
6635              {              {
6636              *errorcodeptr = ERR58;              *errorcodeptr = ERR35;
6637              goto FAILED;              goto FAILED;
6638              }              }
6639            recno = (refsign == CHAR_MINUS)?            if (refsign != 0) recno = (refsign == CHAR_MINUS)?
6640              cd->bracount - recno + 1 : recno +cd->bracount;              cd->bracount - recno + 1 : recno + cd->bracount;
6641            if (recno <= 0 || recno > cd->final_bracount)            if (recno <= 0 || recno > cd->final_bracount)
6642              {              {
6643              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
# Line 6121  for (;; ptr++) Line 6647  for (;; ptr++)
6647            break;            break;
6648            }            }
6649    
6650          /* Otherwise (did not start with "+" or "-"), start by looking for the          /* Otherwise look for the name. */
         name. If we find a name, add one to the opcode to change OP_CREF or  
         OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,  
         except they record that the reference was originally to a name. The  
         information is used to check duplicate names. */  
6651    
6652          slot = cd->name_table;          slot = cd->name_table;
6653          for (i = 0; i < cd->names_found; i++)          for (i = 0; i < cd->names_found; i++)
# Line 6134  for (;; ptr++) Line 6656  for (;; ptr++)
6656            slot += cd->name_entry_size;            slot += cd->name_entry_size;
6657            }            }
6658    
6659          /* Found the named subpattern */          /* Found the named subpattern. If the name is duplicated, add one to
6660            the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6661            appropriate data values. Otherwise, just insert the unique subpattern
6662            number. */
6663    
6664          if (i < cd->names_found)          if (i < cd->names_found)
6665            {            {
6666            recno = GET2(slot, 0);            int offset = i++;
6667            PUT2(code, 2+LINK_SIZE, recno);            int count = 1;
6668            code[1+LINK_SIZE]++;            recno = GET2(slot, 0);   /* Number from first found */
6669              for (; i < cd->names_found; i++)
6670                {
6671                slot += cd->name_entry_size;
6672                if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0) break;
6673                count++;
6674                }
6675              if (count > 1)
6676                {
6677                PUT2(code, 2+LINK_SIZE, offset);
6678                PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6679                skipbytes += IMM2_SIZE;
6680                code[1+LINK_SIZE]++;
6681                }
6682              else  /* Not a duplicated name */
6683                {
6684                PUT2(code, 2+LINK_SIZE, recno);
6685                }
6686            }            }
6687    
6688          /* If terminator == CHAR_NULL it means that the name followed directly          /* If terminator == CHAR_NULL it means that the name followed directly
6689          after the opening parenthesis [e.g. (?(abc)...] and in this case there          after the opening parenthesis [e.g. (?(abc)...] and in this case there
6690          are some further alternatives to try. For the cases where terminator !=          are some further alternatives to try. For the cases where terminator !=
6691          0 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have          CHAR_NULL [things like (?(<name>... or (?('name')... or (?(R&name)... ]
6692          now checked all the possibilities, so give an error. */          we have now checked all the possibilities, so give an error. */
6693    
6694          else if (terminator != CHAR_NULL)          else if (terminator != CHAR_NULL)
6695            {            {
# Line 6184  for (;; ptr++) Line 6726  for (;; ptr++)
6726            skipbytes = 1;            skipbytes = 1;
6727            }            }
6728    
6729          /* Check for the "name" actually being a subpattern number. We are          /* Reference to an unidentified subpattern. */
         in the second pass here, so final_bracount is set. */  
   
         else if (recno > 0 && recno <= cd->final_bracount)  
           {  
           PUT2(code, 2+LINK_SIZE, recno);  
           }  
   
         /* Either an unidentified subpattern, or a reference to (?(0) */  
6730    
6731          else          else
6732            {            {
6733            *errorcodeptr = (recno == 0)? ERR35: ERR15;            *errorcodeptr = ERR15;
6734            goto FAILED;            goto FAILED;
6735            }            }
6736          break;          break;
# Line 6209  for (;; ptr++) Line 6743  for (;; ptr++)
6743          ptr++;          ptr++;
6744          break;          break;
6745    
6746            /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6747            thing to do, but Perl allows all assertions to be quantified, and when
6748            they contain capturing parentheses there may be a potential use for
6749            this feature. Not that that applies to a quantified (?!) but we allow
6750            it for uniformity. */
6751    
6752          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
6753          case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */          case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
6754          ptr++;          ptr++;
6755          if (*ptr == CHAR_RIGHT_PARENTHESIS)    /* Optimize (?!) */          if (*ptr == CHAR_RIGHT_PARENTHESIS && ptr[1] != CHAR_ASTERISK &&
6756                 ptr[1] != CHAR_PLUS && ptr[1] != CHAR_QUESTION_MARK &&
6757                (ptr[1] != CHAR_LEFT_CURLY_BRACKET || !is_counted_repeat(ptr+2)))
6758            {            {
6759            *code++ = OP_FAIL;            *code++ = OP_FAIL;
6760            previous = NULL;            previous = NULL;
# Line 6309  for (;; ptr++) Line 6850  for (;; ptr++)
6850          terminator = (*ptr == CHAR_LESS_THAN_SIGN)?          terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
6851            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6852          name = ++ptr;          name = ++ptr;
6853            if (IS_DIGIT(*ptr))
6854              {
6855              *errorcodeptr = ERR84;   /* Group name must start with non-digit */
6856              goto FAILED;
6857              }
6858          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6859          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
6860    
# Line 6423  for (;; ptr++) Line 6968  for (;; ptr++)
6968    
6969          NAMED_REF_OR_RECURSE:          NAMED_REF_OR_RECURSE:
6970          name = ++ptr;          name = ++ptr;
6971            if (IS_DIGIT(*ptr))
6972              {
6973              *errorcodeptr = ERR84;   /* Group name must start with non-digit */
6974              goto FAILED;
6975              }
6976          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6977          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
6978    
# Line 6815  for (;; ptr++) Line 7365  for (;; ptr++)
7365        skipbytes = IMM2_SIZE;        skipbytes = IMM2_SIZE;
7366        }        }
7367    
7368      /* Process nested bracketed regex. Assertions used not to be repeatable,      /* Process nested bracketed regex. First check for parentheses nested too
7369      but this was changed for Perl compatibility, so all kinds can now be      deeply. */
7370      repeated. We copy code into a non-register variable (tempcode) in order to  
7371      be able to pass its address because some compilers complain otherwise. */      if ((cd->parens_depth += 1) > PARENS_NEST_LIMIT)
7372          {
7373          *errorcodeptr = ERR82;
7374          goto FAILED;
7375          }
7376    
7377        /* Assertions used not to be repeatable, but this was changed for Perl
7378        compatibility, so all kinds can now be repeated. We copy code into a
7379        non-register variable (tempcode) in order to be able to pass its address
7380        because some compilers complain otherwise. */
7381    
7382      previous = code;                      /* For handling repetition */      previous = code;                      /* For handling repetition */
7383      *code = bravalue;      *code = bravalue;
# Line 6848  for (;; ptr++) Line 7407  for (;; ptr++)
7407             &length_prevgroup              /* Pre-compile phase */             &length_prevgroup              /* Pre-compile phase */
7408           ))           ))
7409        goto FAILED;        goto FAILED;
7410    
7411        cd->parens_depth -= 1;
7412    
7413      /* If this was an atomic group and there are no capturing groups within it,      /* If this was an atomic group and there are no capturing groups within it,
7414      generate OP_ONCE_NC instead of OP_ONCE. */      generate OP_ONCE_NC instead of OP_ONCE. */
# Line 7063  for (;; ptr++) Line 7624  for (;; ptr++)
7624        if (escape == ESC_g)        if (escape == ESC_g)
7625          {          {
7626          const pcre_uchar *p;          const pcre_uchar *p;
7627            pcre_uint32 cf;
7628    
7629          save_hwm = cd->hwm;   /* Normally this is set when '(' is read */          save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
7630          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
7631            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7632    
7633          /* These two statements stop the compiler from warning about possibly          /* These two statements stop the compiler from warning about possibly
7634          unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In          unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
7635          fact, because we actually check for a number below, the paths that          fact, because we do the check for a number below, the paths that
7636          would actually be in error are never taken. */          would actually be in error are never taken. */
7637    
7638          skipbytes = 0;          skipbytes = 0;
7639          reset_bracount = FALSE;          reset_bracount = FALSE;
7640    
7641          /* Test for a name */          /* If it's not a signed or unsigned number, treat it as a name. */
7642    
7643          if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)          cf = ptr[1];
7644            if (cf != CHAR_PLUS && cf != CHAR_MINUS && !IS_DIGIT(cf))
7645            {            {
           BOOL is_a_number = TRUE;  
           for (p = ptr + 1; *p != CHAR_NULL && *p != (pcre_uchar)terminator; p++)  
             {  
             if (!MAX_255(*p)) { is_a_number = FALSE; break; }  
             if ((cd->ctypes[*p] & ctype_digit) == 0) is_a_number = FALSE;  
             if ((cd->ctypes[*p] & ctype_word) == 0) break;  
             }  
           if (*p != (pcre_uchar)terminator)  
             {  
             *errorcodeptr = ERR57;  
             break;  
             }  
           if (is_a_number)  
             {  
             ptr++;  
             goto HANDLE_NUMERICAL_RECURSION;  
             }  
7646            is_recurse = TRUE;            is_recurse = TRUE;
7647            goto NAMED_REF_OR_RECURSE;            goto NAMED_REF_OR_RECURSE;
7648            }            }
7649    
7650          /* Test a signed number in angle brackets or quotes. */          /* Signed or unsigned number (cf = ptr[1]) is known to be plus or minus
7651            or a digit. */
7652    
7653          p = ptr + 2;          p = ptr + 2;
7654          while (IS_DIGIT(*p)) p++;          while (IS_DIGIT(*p)) p++;
# Line 7240  for (;; ptr++) Line 7788  for (;; ptr++)
7788    
7789      /* ===================================================================*/      /* ===================================================================*/
7790      /* Handle a literal character. It is guaranteed not to be whitespace or #      /* Handle a literal character. It is guaranteed not to be whitespace or #
7791      when the extended flag is set. If we are in UTF-8 mode, it may be a      when the extended flag is set. If we are in a UTF mode, it may be a
7792      multi-byte literal character. */      multi-unit literal character. */
7793    
7794      default:      default:
7795      NORMAL_CHAR:      NORMAL_CHAR:
# Line 7830  do { Line 8378  do {
8378       switch (*scode)       switch (*scode)
8379         {         {
8380         case OP_CREF:         case OP_CREF:
8381         case OP_NCREF:         case OP_DNCREF:
8382         case OP_RREF:         case OP_RREF:
8383         case OP_NRREF:         case OP_DNRREF:
8384         case OP_DEF:         case OP_DEF:
8385         return FALSE;         return FALSE;
8386    
# Line 8229  PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. * Line 8777  PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. *
8777      { skipatstart += 6; options |= PCRE_UTF8; continue; }      { skipatstart += 6; options |= PCRE_UTF8; continue; }
8778    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
8779      { skipatstart += 6; options |= PCRE_UCP; continue; }      { skipatstart += 6; options |= PCRE_UCP; continue; }
8780      else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_AUTO_POSSESS_RIGHTPAR, 16) == 0)
8781        { skipatstart += 18; options |= PCRE_NO_AUTO_POSSESSIFY; continue; }
8782    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)    else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
8783      { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }      { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
8784    
# Line 8385  else Line 8935  else
8935      cd->nl[0] = newline;      cd->nl[0] = newline;
8936      }      }
8937    }    }
8938    
8939  /* Maximum back reference and backref bitmap. The bitmap records up to 31 back  /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
8940  references to help in deciding whether (.*) can be treated as anchored or not.  references to help in deciding whether (.*) can be treated as anchored or not.
8941  */  */
# Line 8423  cd->named_group_list_size = NAMED_GROUP_ Line 8973  cd->named_group_list_size = NAMED_GROUP_
8973  cd->start_pattern = (const pcre_uchar *)pattern;  cd->start_pattern = (const pcre_uchar *)pattern;
8974  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
8975  cd->req_varyopt = 0;  cd->req_varyopt = 0;
8976    cd->parens_depth = 0;
8977  cd->assert_depth = 0;  cd->assert_depth = 0;
8978  cd->max_lookbehind = 0;  cd->max_lookbehind = 0;
8979  cd->external_options = options;  cd->external_options = options;
# Line 8437  outside can help speed up starting point Line 8988  outside can help speed up starting point
8988  ptr += skipatstart;  ptr += skipatstart;
8989  code = cworkspace;  code = cworkspace;
8990  *code = OP_BRA;  *code = OP_BRA;
8991    
8992  (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,  (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
8993    FALSE, 0, 0, &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL,    FALSE, 0, 0, &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL,
8994    cd, &length);    cd, &length);
# Line 8508  field; this time it's used for rememberi Line 9060  field; this time it's used for rememberi
9060  */  */
9061    
9062  cd->final_bracount = cd->bracount;  /* Save for checking forward references */  cd->final_bracount = cd->bracount;  /* Save for checking forward references */
9063    cd->parens_depth = 0;
9064  cd->assert_depth = 0;  cd->assert_depth = 0;
9065  cd->bracount = 0;  cd->bracount = 0;
9066  cd->max_lookbehind = 0;  cd->max_lookbehind = 0;
# Line 8837  return (pcre32 *)re; Line 9390  return (pcre32 *)re;
9390  }  }
9391    
9392  /* End of pcre_compile.c */  /* End of pcre_compile.c */
9393    

Legend:
Removed from v.1363  
changed lines
  Added in v.1396

  ViewVC Help
Powered by ViewVC 1.1.5