/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 91 by nigel, Sat Feb 24 21:41:34 2007 UTC revision 428 by ph10, Mon Aug 31 17:10:26 2009 UTC
# Line 3  Line 3 
3  *************************************************/  *************************************************/
4    
5  /* PCRE is a library of functions to support regular expressions whose syntax  /* PCRE is a library of functions to support regular expressions whose syntax
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language (but see
7    below for why this module is different).
8    
9                         Written by Philip Hazel                         Written by Philip Hazel
10             Copyright (c) 1997-2006 University of Cambridge             Copyright (c) 1997-2009 University of Cambridge
11    
12  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
13  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 39  POSSIBILITY OF SUCH DAMAGE. Line 40  POSSIBILITY OF SUCH DAMAGE.
40    
41    
42  /* This module contains the external function pcre_dfa_exec(), which is an  /* This module contains the external function pcre_dfa_exec(), which is an
43  alternative matching function that uses a DFA algorithm. This is NOT Perl-  alternative matching function that uses a sort of DFA algorithm (not a true
44  compatible, but it has advantages in certain applications. */  FSM). This is NOT Perl-compatible, but it has advantages in certain
45    applications. */
46    
47    
48  #define NLBLOCK md           /* The block containing newline information */  #ifdef HAVE_CONFIG_H
49    #include "config.h"
50    #endif
51    
52    #define NLBLOCK md             /* Block containing newline information */
53    #define PSSTART start_subject  /* Field containing processed string start */
54    #define PSEND   end_subject    /* Field containing processed string end */
55    
56  #include "pcre_internal.h"  #include "pcre_internal.h"
57    
58    
# Line 52  compatible, but it has advantages in cer Line 61  compatible, but it has advantages in cer
61  #define SP "                   "  #define SP "                   "
62    
63    
   
64  /*************************************************  /*************************************************
65  *      Code parameters and static tables         *  *      Code parameters and static tables         *
66  *************************************************/  *************************************************/
67    
68  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
69  into others, under special conditions. A gap of 10 between the blocks should be  into others, under special conditions. A gap of 20 between the blocks should be
70  enough. */  enough. The resulting opcodes don't have to be less than 256 because they are
71    never stored, so we push them well clear of the normal opcodes. */
72  #define OP_PROP_EXTRA    (EXTRACT_BASIC_MAX+1)  
73  #define OP_EXTUNI_EXTRA  (EXTRACT_BASIC_MAX+11)  #define OP_PROP_EXTRA       300
74    #define OP_EXTUNI_EXTRA     320
75    #define OP_ANYNL_EXTRA      340
76    #define OP_HSPACE_EXTRA     360
77    #define OP_VSPACE_EXTRA     380
78    
79    
80  /* This table identifies those opcodes that are followed immediately by a  /* This table identifies those opcodes that are followed immediately by a
81  character that is to be tested in some way. This makes it possible to  character that is to be tested in some way. This makes it possible to
82  centralize the loading of these characters. In the case of Type * etc, the  centralize the loading of these characters. In the case of Type * etc, the
83  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
84  small value. */  small value. ***NOTE*** If the start of this table is modified, the two tables
85    that follow must also be modified. */
86    
87  static uschar coptable[] = {  static const uschar coptable[] = {
88    0,                             /* End                                    */    0,                             /* End                                    */
89    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
90    0, 0,                          /* Any, Anybyte                           */    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
91      0, 0, 0,                       /* Any, AllAny, Anybyte                   */
92    0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */    0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */
93      0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
94    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
95    1,                             /* Char                                   */    1,                             /* Char                                   */
96    1,                             /* Charnc                                 */    1,                             /* Charnc                                 */
# Line 83  static uschar coptable[] = { Line 98  static uschar coptable[] = {
98    /* Positive single-char repeats                                          */    /* Positive single-char repeats                                          */
99    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
100    3, 3, 3,                       /* upto, minupto, exact                   */    3, 3, 3,                       /* upto, minupto, exact                   */
101      1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */
102    /* Negative single-char repeats - only for chars < 256                   */    /* Negative single-char repeats - only for chars < 256                   */
103    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
104    3, 3, 3,                       /* NOT upto, minupto, exact               */    3, 3, 3,                       /* NOT upto, minupto, exact               */
105      1, 1, 1, 3,                    /* NOT *+, ++, ?+, upto+                  */
106    /* Positive type repeats                                                 */    /* Positive type repeats                                                 */
107    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
108    3, 3, 3,                       /* Type upto, minupto, exact              */    3, 3, 3,                       /* Type upto, minupto, exact              */
109      1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */
110    /* Character class & ref repeats                                         */    /* Character class & ref repeats                                         */
111    0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */    0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
112    0, 0,                          /* CRRANGE, CRMINRANGE                    */    0, 0,                          /* CRRANGE, CRMINRANGE                    */
# Line 107  static uschar coptable[] = { Line 125  static uschar coptable[] = {
125    0,                             /* Assert behind                          */    0,                             /* Assert behind                          */
126    0,                             /* Assert behind not                      */    0,                             /* Assert behind not                      */
127    0,                             /* Reverse                                */    0,                             /* Reverse                                */
128    0,                             /* Once                                   */    0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */
129    0,                             /* COND                                   */    0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */
130    0,                             /* CREF                                   */    0,                             /* CREF                                   */
131      0,                             /* RREF                                   */
132      0,                             /* DEF                                    */
133    0, 0,                          /* BRAZERO, BRAMINZERO                    */    0, 0,                          /* BRAZERO, BRAMINZERO                    */
134    0,                             /* BRANUMBER                              */    0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */
135    0                              /* BRA                                    */    0, 0, 0                        /* FAIL, ACCEPT, SKIPZERO                 */
136  };  };
137    
138  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
139  and \w */  and \w */
140    
141  static uschar toptable1[] = {  static const uschar toptable1[] = {
142    0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
143    ctype_digit, ctype_digit,    ctype_digit, ctype_digit,
144    ctype_space, ctype_space,    ctype_space, ctype_space,
145    ctype_word,  ctype_word,    ctype_word,  ctype_word,
146    0                               /* OP_ANY */    0, 0                            /* OP_ANY, OP_ALLANY */
147  };  };
148    
149  static uschar toptable2[] = {  static const uschar toptable2[] = {
150    0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
151    ctype_digit, 0,    ctype_digit, 0,
152    ctype_space, 0,    ctype_space, 0,
153    ctype_word,  0,    ctype_word,  0,
154    1                               /* OP_ANY */    1, 1                            /* OP_ANY, OP_ALLANY */
155  };  };
156    
157    
# Line 203  Arguments: Line 223  Arguments:
223    rlevel            function call recursion level    rlevel            function call recursion level
224    recursing         regex recursive call level    recursing         regex recursive call level
225    
226  Returns:            > 0 =>  Returns:            > 0 => number of match offset pairs placed in offsets
227                      = 0 =>                      = 0 => offsets overflowed; longest matches are present
228                       -1 => failed to match                       -1 => failed to match
229                     < -1 => some kind of unexpected problem                     < -1 => some kind of unexpected problem
230    
# Line 278  stateblock *next_active_state, *next_new Line 298  stateblock *next_active_state, *next_new
298    
299  const uschar *ctypes, *lcc, *fcc;  const uschar *ctypes, *lcc, *fcc;
300  const uschar *ptr;  const uschar *ptr;
301  const uschar *end_code;  const uschar *end_code, *first_op;
302    
303  int active_count, new_count, match_count;  int active_count, new_count, match_count;
304    
# Line 291  const uschar *start_code = md->start_cod Line 311  const uschar *start_code = md->start_cod
311    
312  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
313  BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;  BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
314    #else
315    BOOL utf8 = FALSE;
316  #endif  #endif
317    
318  rlevel++;  rlevel++;
# Line 314  active_states = (stateblock *)(workspace Line 336  active_states = (stateblock *)(workspace
336  next_new_state = new_states = active_states + wscount;  next_new_state = new_states = active_states + wscount;
337  new_count = 0;  new_count = 0;
338    
339    first_op = this_start_code + 1 + LINK_SIZE +
340      ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
341    
342  /* The first thing in any (sub) pattern is a bracket of some sort. Push all  /* The first thing in any (sub) pattern is a bracket of some sort. Push all
343  the alternative states onto the list, and find out where the end is. This  the alternative states onto the list, and find out where the end is. This
344  makes it possible to use this function recursively, when we want to stop at a  makes it possible to use this function recursively, when we want to stop at a
# Line 323  If the first opcode in the first alterna Line 348  If the first opcode in the first alterna
348  a backward assertion. In that case, we have to find out the maximum amount to  a backward assertion. In that case, we have to find out the maximum amount to
349  move back, and set up each alternative appropriately. */  move back, and set up each alternative appropriately. */
350    
351  if (this_start_code[1+LINK_SIZE] == OP_REVERSE)  if (*first_op == OP_REVERSE)
352    {    {
353    int max_back = 0;    int max_back = 0;
354    int gone_back;    int gone_back;
# Line 405  else Line 430  else
430    
431    else    else
432      {      {
433        int length = 1 + LINK_SIZE +
434          ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
435      do      do
436        {        {
437        ADD_NEW(end_code - start_code + 1 + LINK_SIZE, 0);        ADD_NEW(end_code - start_code + length, 0);
438        end_code += GET(end_code, 1);        end_code += GET(end_code, 1);
439          length = 1 + LINK_SIZE;
440        }        }
441      while (*end_code == OP_ALT);      while (*end_code == OP_ALT);
442      }      }
# Line 426  for (;;) Line 454  for (;;)
454    int i, j;    int i, j;
455    int clen, dlen;    int clen, dlen;
456    unsigned int c, d;    unsigned int c, d;
457      int forced_fail = 0;
458      int reached_end = 0;
459    
460    /* Make the new state list into the active state list and empty the    /* Make the new state list into the active state list and empty the
461    new state list. */    new state list. */
# Line 461  for (;;) Line 491  for (;;)
491    
492    if (ptr < end_subject)    if (ptr < end_subject)
493      {      {
494      clen = 1;      clen = 1;        /* Number of bytes in the character */
495  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
496      if (utf8) { GETCHARLEN(c, ptr, clen); } else      if (utf8) { GETCHARLEN(c, ptr, clen); } else
497  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF8 */
# Line 469  for (;;) Line 499  for (;;)
499      }      }
500    else    else
501      {      {
502      clen = 0;    /* At end subject */      clen = 0;        /* This indicates the end of the subject */
503      c = -1;      c = NOTACHAR;    /* This value should never actually be used */
504      }      }
505    
506    /* Scan up the active states and act on each one. The result of an action    /* Scan up the active states and act on each one. The result of an action
# Line 483  for (;;) Line 513  for (;;)
513      stateblock *current_state = active_states + i;      stateblock *current_state = active_states + i;
514      const uschar *code;      const uschar *code;
515      int state_offset = current_state->offset;      int state_offset = current_state->offset;
516      int count, codevalue;      int count, codevalue, rrc;
     int chartype, script;  
517    
518  #ifdef DEBUG  #ifdef DEBUG
519      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
520      if (c < 0) printf("-1\n");      if (clen == 0) printf("EOL\n");
521        else if (c > 32 && c < 127) printf("'%c'\n", c);        else if (c > 32 && c < 127) printf("'%c'\n", c);
522          else printf("0x%02x\n", c);          else printf("0x%02x\n", c);
523  #endif  #endif
# Line 532  for (;;) Line 561  for (;;)
561    
562      code = start_code + state_offset;      code = start_code + state_offset;
563      codevalue = *code;      codevalue = *code;
     if (codevalue >= OP_BRA) codevalue = OP_BRA; /* All brackets are equal */  
564    
565      /* If this opcode is followed by an inline character, load it. It is      /* If this opcode is followed by an inline character, load it. It is
566      tempting to test for the presence of a subject character here, but that      tempting to test for the presence of a subject character here, but that
# Line 540  for (;;) Line 568  for (;;)
568      permitted.      permitted.
569    
570      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
571      argument that is not a data character - but is always one byte long.      argument that is not a data character - but is always one byte long. We
572      Unfortunately, we have to take special action to deal with  \P, \p, and      have to take special action to deal with  \P, \p, \H, \h, \V, \v and \X in
573      \X in this case. To keep the other cases fast, convert these ones to new      this case. To keep the other cases fast, convert these ones to new opcodes.
574      opcodes. */      */
575    
576      if (coptable[codevalue] > 0)      if (coptable[codevalue] > 0)
577        {        {
# Line 554  for (;;) Line 582  for (;;)
582        d = code[coptable[codevalue]];        d = code[coptable[codevalue]];
583        if (codevalue >= OP_TYPESTAR)        if (codevalue >= OP_TYPESTAR)
584          {          {
585          if (d == OP_ANYBYTE) return PCRE_ERROR_DFA_UITEM;          switch(d)
586          if (d >= OP_NOTPROP)            {
587            codevalue += (d == OP_EXTUNI)? OP_EXTUNI_EXTRA : OP_PROP_EXTRA;            case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
588              case OP_NOTPROP:
589              case OP_PROP: codevalue += OP_PROP_EXTRA; break;
590              case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
591              case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
592              case OP_NOT_HSPACE:
593              case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
594              case OP_NOT_VSPACE:
595              case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
596              default: break;
597              }
598          }          }
599        }        }
600      else      else
601        {        {
602        dlen = 0;         /* Not strictly necessary, but compilers moan */        dlen = 0;         /* Not strictly necessary, but compilers moan */
603        d = -1;           /* if these variables are not set. */        d = NOTACHAR;     /* if these variables are not set. */
604        }        }
605    
606    
# Line 588  for (;;) Line 626  for (;;)
626            ADD_ACTIVE(state_offset - GET(code, 1), 0);            ADD_ACTIVE(state_offset - GET(code, 1), 0);
627            }            }
628          }          }
629        else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)        else
630          {          {
631          if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;          reached_end++;    /* Count branches that reach the end */
632            else if (match_count > 0 && ++match_count * 2 >= offsetcount)          if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
633              match_count = 0;            {
634          count = ((match_count == 0)? offsetcount : match_count * 2) - 2;            if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
635          if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));              else if (match_count > 0 && ++match_count * 2 >= offsetcount)
636          if (offsetcount >= 2)                match_count = 0;
637            {            count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
638            offsets[0] = current_subject - start_subject;            if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
639            offsets[1] = ptr - start_subject;            if (offsetcount >= 2)
640            DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,              {
641              offsets[1] - offsets[0], current_subject));              offsets[0] = current_subject - start_subject;
642            }              offsets[1] = ptr - start_subject;
643          if ((md->moptions & PCRE_DFA_SHORTEST) != 0)              DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
644            {                offsets[1] - offsets[0], current_subject));
645            DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"              }
646              "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,            if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
647              match_count, rlevel*2-2, SP));              {
648            return match_count;              DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
649            }                "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
650                  match_count, rlevel*2-2, SP));
651                return match_count;
652                }
653              }
654          }          }
655        break;        break;
656    
# Line 624  for (;;) Line 666  for (;;)
666    
667        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
668        case OP_BRA:        case OP_BRA:
669          case OP_SBRA:
670        do        do
671          {          {
672          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
# Line 633  for (;;) Line 676  for (;;)
676        break;        break;
677    
678        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
679          case OP_CBRA:
680          case OP_SCBRA:
681          ADD_ACTIVE(code - start_code + 3 + LINK_SIZE,  0);
682          code += GET(code, 1);
683          while (*code == OP_ALT)
684            {
685            ADD_ACTIVE(code - start_code + 1 + LINK_SIZE,  0);
686            code += GET(code, 1);
687            }
688          break;
689    
690          /*-----------------------------------------------------------------*/
691        case OP_BRAZERO:        case OP_BRAZERO:
692        case OP_BRAMINZERO:        case OP_BRAMINZERO:
693        ADD_ACTIVE(state_offset + 1, 0);        ADD_ACTIVE(state_offset + 1, 0);
# Line 642  for (;;) Line 697  for (;;)
697        break;        break;
698    
699        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
700        case OP_BRANUMBER:        case OP_SKIPZERO:
701        ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);        code += 1 + GET(code, 2);
702          while (*code == OP_ALT) code += GET(code, 1);
703          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
704        break;        break;
705    
706        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
707        case OP_CIRC:        case OP_CIRC:
708        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
709            ((ims & PCRE_MULTILINE) != 0 &&            ((ims & PCRE_MULTILINE) != 0 &&
             ptr >= start_subject + md->nllen &&  
710              ptr != end_subject &&              ptr != end_subject &&
711              IS_NEWLINE(ptr - md->nllen)))              WAS_NEWLINE(ptr)))
712          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
713        break;        break;
714    
# Line 686  for (;;) Line 742  for (;;)
742    
743        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
744        case OP_ANY:        case OP_ANY:
745        if (clen > 0 && ((ims & PCRE_DOTALL) != 0 ||        if (clen > 0 && !IS_NEWLINE(ptr))
746                         ptr > end_subject - md->nllen ||          { ADD_NEW(state_offset + 1, 0); }
747                         !IS_NEWLINE(ptr)))        break;
748    
749          /*-----------------------------------------------------------------*/
750          case OP_ALLANY:
751          if (clen > 0)
752          { ADD_NEW(state_offset + 1, 0); }          { ADD_NEW(state_offset + 1, 0); }
753        break;        break;
754    
755        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
756        case OP_EODN:        case OP_EODN:
757        if (clen == 0 ||        if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
            (ptr == end_subject - md->nllen && IS_NEWLINE(ptr)))  
758          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
759        break;        break;
760    
# Line 704  for (;;) Line 763  for (;;)
763        if ((md->moptions & PCRE_NOTEOL) == 0)        if ((md->moptions & PCRE_NOTEOL) == 0)
764          {          {
765          if (clen == 0 ||          if (clen == 0 ||
766              (ptr <= end_subject - md->nllen && IS_NEWLINE(ptr) &&              ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
767                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
768              ))              ))
769            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
770          }          }
771        else if ((ims & PCRE_MULTILINE) != 0 &&        else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
                ptr <= end_subject - md->nllen && IS_NEWLINE(ptr))  
772          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
773        break;        break;
774    
# Line 750  for (;;) Line 808  for (;;)
808            }            }
809          else left_word = 0;          else left_word = 0;
810    
811          if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;          if (clen > 0)
812            else right_word = 0;            right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
813            else              /* This is a fudge to ensure that if this is the */
814              {               /* last item in the pattern, we don't count it as */
815              reached_end--;  /* reached, thus disabling a partial match. */
816              right_word = 0;
817              }
818    
819          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
820            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 759  for (;;) Line 822  for (;;)
822        break;        break;
823    
824    
 #ifdef SUPPORT_UCP  
   
825        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
826        /* Check the next character by Unicode property. We will get here only        /* Check the next character by Unicode property. We will get here only
827        if the support is in the binary; otherwise a compile-time error occurs.        if the support is in the binary; otherwise a compile-time error occurs.
828        */        */
829    
830    #ifdef SUPPORT_UCP
831        case OP_PROP:        case OP_PROP:
832        case OP_NOTPROP:        case OP_NOTPROP:
833        if (clen > 0)        if (clen > 0)
834          {          {
835          BOOL OK;          BOOL OK;
836          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
837          switch(code[1])          switch(code[1])
838            {            {
839            case PT_ANY:            case PT_ANY:
# Line 779  for (;;) Line 841  for (;;)
841            break;            break;
842    
843            case PT_LAMP:            case PT_LAMP:
844            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
845            break;            break;
846    
847            case PT_GC:            case PT_GC:
848            OK = category == code[2];            OK = _pcre_ucp_gentype[prop->chartype] == code[2];
849            break;            break;
850    
851            case PT_PC:            case PT_PC:
852            OK = chartype == code[2];            OK = prop->chartype == code[2];
853            break;            break;
854    
855            case PT_SC:            case PT_SC:
856            OK = script == code[2];            OK = prop->script == code[2];
857            break;            break;
858    
859            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 811  for (;;) Line 873  for (;;)
873  /* ========================================================================== */  /* ========================================================================== */
874        /* These opcodes likewise inspect the subject character, but have an        /* These opcodes likewise inspect the subject character, but have an
875        argument that is not a data character. It is one of these opcodes:        argument that is not a data character. It is one of these opcodes:
876        OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,        OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
877        OP_NOT_WORDCHAR. The value is loaded into d. */        OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
878    
879        case OP_TYPEPLUS:        case OP_TYPEPLUS:
880        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
881          case OP_TYPEPOSPLUS:
882        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
883        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
884        if (clen > 0)        if (clen > 0)
885          {          {
886          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
887              (c < 256 &&              (c < 256 &&
888                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                ptr > end_subject - md->nllen ||  
                !IS_NEWLINE(ptr)  
               ) &&  
889                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
890            {            {
891              if (count > 0 && codevalue == OP_TYPEPOSPLUS)
892                {
893                active_count--;            /* Remove non-match possibility */
894                next_active_state--;
895                }
896            count++;            count++;
897            ADD_NEW(state_offset, count);            ADD_NEW(state_offset, count);
898            }            }
# Line 838  for (;;) Line 902  for (;;)
902        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
903        case OP_TYPEQUERY:        case OP_TYPEQUERY:
904        case OP_TYPEMINQUERY:        case OP_TYPEMINQUERY:
905          case OP_TYPEPOSQUERY:
906        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
907        if (clen > 0)        if (clen > 0)
908          {          {
909          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
910              (c < 256 &&              (c < 256 &&
911                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                ptr > end_subject - md->nllen ||  
                !IS_NEWLINE(ptr)  
               ) &&  
912                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
913            {            {
914              if (codevalue == OP_TYPEPOSQUERY)
915                {
916                active_count--;            /* Remove non-match possibility */
917                next_active_state--;
918                }
919            ADD_NEW(state_offset + 2, 0);            ADD_NEW(state_offset + 2, 0);
920            }            }
921          }          }
# Line 858  for (;;) Line 924  for (;;)
924        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
925        case OP_TYPESTAR:        case OP_TYPESTAR:
926        case OP_TYPEMINSTAR:        case OP_TYPEMINSTAR:
927          case OP_TYPEPOSSTAR:
928        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
929        if (clen > 0)        if (clen > 0)
930          {          {
931          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
932              (c < 256 &&              (c < 256 &&
933                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                ptr > end_subject - md->nllen ||  
                !IS_NEWLINE(ptr)  
               ) &&  
934                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
935            {            {
936              if (codevalue == OP_TYPEPOSSTAR)
937                {
938                active_count--;            /* Remove non-match possibility */
939                next_active_state--;
940                }
941            ADD_NEW(state_offset, 0);            ADD_NEW(state_offset, 0);
942            }            }
943          }          }
# Line 877  for (;;) Line 945  for (;;)
945    
946        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
947        case OP_TYPEEXACT:        case OP_TYPEEXACT:
948          count = current_state->count;  /* Number already matched */
949          if (clen > 0)
950            {
951            if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
952                (c < 256 &&
953                  (d != OP_ANY || !IS_NEWLINE(ptr)) &&
954                  ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
955              {
956              if (++count >= GET2(code, 1))
957                { ADD_NEW(state_offset + 4, 0); }
958              else
959                { ADD_NEW(state_offset, count); }
960              }
961            }
962          break;
963    
964          /*-----------------------------------------------------------------*/
965        case OP_TYPEUPTO:        case OP_TYPEUPTO:
966        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
967        if (codevalue != OP_TYPEEXACT)        case OP_TYPEPOSUPTO:
968          { ADD_ACTIVE(state_offset + 4, 0); }        ADD_ACTIVE(state_offset + 4, 0);
969        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
970        if (clen > 0)        if (clen > 0)
971          {          {
972          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
973              (c < 256 &&              (c < 256 &&
974                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                ptr > end_subject - md->nllen ||  
                !IS_NEWLINE(ptr)  
               ) &&  
975                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
976            {            {
977              if (codevalue == OP_TYPEPOSUPTO)
978                {
979                active_count--;           /* Remove non-match possibility */
980                next_active_state--;
981                }
982            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
983              { ADD_NEW(state_offset + 4, 0); }              { ADD_NEW(state_offset + 4, 0); }
984            else            else
# Line 903  for (;;) Line 989  for (;;)
989    
990  /* ========================================================================== */  /* ========================================================================== */
991        /* These are virtual opcodes that are used when something like        /* These are virtual opcodes that are used when something like
992        OP_TYPEPLUS has OP_PROP, OP_NOTPROP, or OP_EXTUNI as its argument. It        OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
993        keeps the code above fast for the other cases. The argument is in the        argument. It keeps the code above fast for the other cases. The argument
994        d variable. */        is in the d variable. */
995    
996    #ifdef SUPPORT_UCP
997        case OP_PROP_EXTRA + OP_TYPEPLUS:        case OP_PROP_EXTRA + OP_TYPEPLUS:
998        case OP_PROP_EXTRA + OP_TYPEMINPLUS:        case OP_PROP_EXTRA + OP_TYPEMINPLUS:
999          case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1000        count = current_state->count;           /* Already matched */        count = current_state->count;           /* Already matched */
1001        if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1002        if (clen > 0)        if (clen > 0)
1003          {          {
1004          BOOL OK;          BOOL OK;
1005          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1006          switch(code[2])          switch(code[2])
1007            {            {
1008            case PT_ANY:            case PT_ANY:
# Line 922  for (;;) Line 1010  for (;;)
1010            break;            break;
1011    
1012            case PT_LAMP:            case PT_LAMP:
1013            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1014            break;            break;
1015    
1016            case PT_GC:            case PT_GC:
1017            OK = category == code[3];            OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1018            break;            break;
1019    
1020            case PT_PC:            case PT_PC:
1021            OK = chartype == code[3];            OK = prop->chartype == code[3];
1022            break;            break;
1023    
1024            case PT_SC:            case PT_SC:
1025            OK = script == code[3];            OK = prop->script == code[3];
1026            break;            break;
1027    
1028            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 944  for (;;) Line 1032  for (;;)
1032            break;            break;
1033            }            }
1034    
1035          if (OK == (d == OP_PROP)) { count++; ADD_NEW(state_offset, count); }          if (OK == (d == OP_PROP))
1036              {
1037              if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1038                {
1039                active_count--;           /* Remove non-match possibility */
1040                next_active_state--;
1041                }
1042              count++;
1043              ADD_NEW(state_offset, count);
1044              }
1045          }          }
1046        break;        break;
1047    
1048        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1049        case OP_EXTUNI_EXTRA + OP_TYPEPLUS:        case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1050        case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:        case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1051          case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1052        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
1053        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1054        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1055          {          {
1056          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1057          int ncount = 0;          int ncount = 0;
1058            if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1059              {
1060              active_count--;           /* Remove non-match possibility */
1061              next_active_state--;
1062              }
1063          while (nptr < end_subject)          while (nptr < end_subject)
1064            {            {
1065            int nd;            int nd;
1066            int ndlen = 1;            int ndlen = 1;
1067            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1068            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1069            ncount++;            ncount++;
1070            nptr += ndlen;            nptr += ndlen;
1071            }            }
# Line 970  for (;;) Line 1073  for (;;)
1073          ADD_NEW_DATA(-state_offset, count, ncount);          ADD_NEW_DATA(-state_offset, count, ncount);
1074          }          }
1075        break;        break;
1076    #endif
1077    
1078          /*-----------------------------------------------------------------*/
1079          case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1080          case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1081          case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1082          count = current_state->count;  /* Already matched */
1083          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1084          if (clen > 0)
1085            {
1086            int ncount = 0;
1087            switch (c)
1088              {
1089              case 0x000b:
1090              case 0x000c:
1091              case 0x0085:
1092              case 0x2028:
1093              case 0x2029:
1094              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1095              goto ANYNL01;
1096    
1097              case 0x000d:
1098              if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1099              /* Fall through */
1100    
1101              ANYNL01:
1102              case 0x000a:
1103              if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1104                {
1105                active_count--;           /* Remove non-match possibility */
1106                next_active_state--;
1107                }
1108              count++;
1109              ADD_NEW_DATA(-state_offset, count, ncount);
1110              break;
1111    
1112              default:
1113              break;
1114              }
1115            }
1116          break;
1117    
1118          /*-----------------------------------------------------------------*/
1119          case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1120          case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1121          case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1122          count = current_state->count;  /* Already matched */
1123          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1124          if (clen > 0)
1125            {
1126            BOOL OK;
1127            switch (c)
1128              {
1129              case 0x000a:
1130              case 0x000b:
1131              case 0x000c:
1132              case 0x000d:
1133              case 0x0085:
1134              case 0x2028:
1135              case 0x2029:
1136              OK = TRUE;
1137              break;
1138    
1139              default:
1140              OK = FALSE;
1141              break;
1142              }
1143    
1144            if (OK == (d == OP_VSPACE))
1145              {
1146              if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1147                {
1148                active_count--;           /* Remove non-match possibility */
1149                next_active_state--;
1150                }
1151              count++;
1152              ADD_NEW_DATA(-state_offset, count, 0);
1153              }
1154            }
1155          break;
1156    
1157          /*-----------------------------------------------------------------*/
1158          case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1159          case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1160          case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1161          count = current_state->count;  /* Already matched */
1162          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1163          if (clen > 0)
1164            {
1165            BOOL OK;
1166            switch (c)
1167              {
1168              case 0x09:      /* HT */
1169              case 0x20:      /* SPACE */
1170              case 0xa0:      /* NBSP */
1171              case 0x1680:    /* OGHAM SPACE MARK */
1172              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1173              case 0x2000:    /* EN QUAD */
1174              case 0x2001:    /* EM QUAD */
1175              case 0x2002:    /* EN SPACE */
1176              case 0x2003:    /* EM SPACE */
1177              case 0x2004:    /* THREE-PER-EM SPACE */
1178              case 0x2005:    /* FOUR-PER-EM SPACE */
1179              case 0x2006:    /* SIX-PER-EM SPACE */
1180              case 0x2007:    /* FIGURE SPACE */
1181              case 0x2008:    /* PUNCTUATION SPACE */
1182              case 0x2009:    /* THIN SPACE */
1183              case 0x200A:    /* HAIR SPACE */
1184              case 0x202f:    /* NARROW NO-BREAK SPACE */
1185              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1186              case 0x3000:    /* IDEOGRAPHIC SPACE */
1187              OK = TRUE;
1188              break;
1189    
1190              default:
1191              OK = FALSE;
1192              break;
1193              }
1194    
1195            if (OK == (d == OP_HSPACE))
1196              {
1197              if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1198                {
1199                active_count--;           /* Remove non-match possibility */
1200                next_active_state--;
1201                }
1202              count++;
1203              ADD_NEW_DATA(-state_offset, count, 0);
1204              }
1205            }
1206          break;
1207    
1208        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1209    #ifdef SUPPORT_UCP
1210        case OP_PROP_EXTRA + OP_TYPEQUERY:        case OP_PROP_EXTRA + OP_TYPEQUERY:
1211        case OP_PROP_EXTRA + OP_TYPEMINQUERY:        case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1212          case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1213        count = 4;        count = 4;
1214        goto QS1;        goto QS1;
1215    
1216        case OP_PROP_EXTRA + OP_TYPESTAR:        case OP_PROP_EXTRA + OP_TYPESTAR:
1217        case OP_PROP_EXTRA + OP_TYPEMINSTAR:        case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1218          case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1219        count = 0;        count = 0;
1220    
1221        QS1:        QS1:
# Line 987  for (;;) Line 1224  for (;;)
1224        if (clen > 0)        if (clen > 0)
1225          {          {
1226          BOOL OK;          BOOL OK;
1227          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1228          switch(code[2])          switch(code[2])
1229            {            {
1230            case PT_ANY:            case PT_ANY:
# Line 995  for (;;) Line 1232  for (;;)
1232            break;            break;
1233    
1234            case PT_LAMP:            case PT_LAMP:
1235            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1236            break;            break;
1237    
1238            case PT_GC:            case PT_GC:
1239            OK = category == code[3];            OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1240            break;            break;
1241    
1242            case PT_PC:            case PT_PC:
1243            OK = chartype == code[3];            OK = prop->chartype == code[3];
1244            break;            break;
1245    
1246            case PT_SC:            case PT_SC:
1247            OK = script == code[3];            OK = prop->script == code[3];
1248            break;            break;
1249    
1250            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1017  for (;;) Line 1254  for (;;)
1254            break;            break;
1255            }            }
1256    
1257          if (OK == (d == OP_PROP)) { ADD_NEW(state_offset + count, 0); }          if (OK == (d == OP_PROP))
1258              {
1259              if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1260                  codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1261                {
1262                active_count--;           /* Remove non-match possibility */
1263                next_active_state--;
1264                }
1265              ADD_NEW(state_offset + count, 0);
1266              }
1267          }          }
1268        break;        break;
1269    
1270        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1271        case OP_EXTUNI_EXTRA + OP_TYPEQUERY:        case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1272        case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:        case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1273          case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1274        count = 2;        count = 2;
1275        goto QS2;        goto QS2;
1276    
1277        case OP_EXTUNI_EXTRA + OP_TYPESTAR:        case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1278        case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:        case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1279          case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1280        count = 0;        count = 0;
1281    
1282        QS2:        QS2:
1283    
1284        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1285        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1286          {          {
1287          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1288          int ncount = 0;          int ncount = 0;
1289            if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1290                codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1291              {
1292              active_count--;           /* Remove non-match possibility */
1293              next_active_state--;
1294              }
1295          while (nptr < end_subject)          while (nptr < end_subject)
1296            {            {
1297            int nd;            int nd;
1298            int ndlen = 1;            int ndlen = 1;
1299            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1300            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1301            ncount++;            ncount++;
1302            nptr += ndlen;            nptr += ndlen;
1303            }            }
1304          ADD_NEW_DATA(-(state_offset + count), 0, ncount);          ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1305          }          }
1306        break;        break;
1307    #endif
1308    
1309          /*-----------------------------------------------------------------*/
1310          case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1311          case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1312          case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1313          count = 2;
1314          goto QS3;
1315    
1316          case OP_ANYNL_EXTRA + OP_TYPESTAR:
1317          case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1318          case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1319          count = 0;
1320    
1321          QS3:
1322          ADD_ACTIVE(state_offset + 2, 0);
1323          if (clen > 0)
1324            {
1325            int ncount = 0;
1326            switch (c)
1327              {
1328              case 0x000b:
1329              case 0x000c:
1330              case 0x0085:
1331              case 0x2028:
1332              case 0x2029:
1333              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1334              goto ANYNL02;
1335    
1336              case 0x000d:
1337              if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1338              /* Fall through */
1339    
1340              ANYNL02:
1341              case 0x000a:
1342              if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1343                  codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1344                {
1345                active_count--;           /* Remove non-match possibility */
1346                next_active_state--;
1347                }
1348              ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1349              break;
1350    
1351              default:
1352              break;
1353              }
1354            }
1355          break;
1356    
1357          /*-----------------------------------------------------------------*/
1358          case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1359          case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1360          case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1361          count = 2;
1362          goto QS4;
1363    
1364          case OP_VSPACE_EXTRA + OP_TYPESTAR:
1365          case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1366          case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1367          count = 0;
1368    
1369          QS4:
1370          ADD_ACTIVE(state_offset + 2, 0);
1371          if (clen > 0)
1372            {
1373            BOOL OK;
1374            switch (c)
1375              {
1376              case 0x000a:
1377              case 0x000b:
1378              case 0x000c:
1379              case 0x000d:
1380              case 0x0085:
1381              case 0x2028:
1382              case 0x2029:
1383              OK = TRUE;
1384              break;
1385    
1386              default:
1387              OK = FALSE;
1388              break;
1389              }
1390            if (OK == (d == OP_VSPACE))
1391              {
1392              if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1393                  codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1394                {
1395                active_count--;           /* Remove non-match possibility */
1396                next_active_state--;
1397                }
1398              ADD_NEW_DATA(-(state_offset + count), 0, 0);
1399              }
1400            }
1401          break;
1402    
1403          /*-----------------------------------------------------------------*/
1404          case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1405          case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1406          case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1407          count = 2;
1408          goto QS5;
1409    
1410          case OP_HSPACE_EXTRA + OP_TYPESTAR:
1411          case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1412          case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1413          count = 0;
1414    
1415          QS5:
1416          ADD_ACTIVE(state_offset + 2, 0);
1417          if (clen > 0)
1418            {
1419            BOOL OK;
1420            switch (c)
1421              {
1422              case 0x09:      /* HT */
1423              case 0x20:      /* SPACE */
1424              case 0xa0:      /* NBSP */
1425              case 0x1680:    /* OGHAM SPACE MARK */
1426              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1427              case 0x2000:    /* EN QUAD */
1428              case 0x2001:    /* EM QUAD */
1429              case 0x2002:    /* EN SPACE */
1430              case 0x2003:    /* EM SPACE */
1431              case 0x2004:    /* THREE-PER-EM SPACE */
1432              case 0x2005:    /* FOUR-PER-EM SPACE */
1433              case 0x2006:    /* SIX-PER-EM SPACE */
1434              case 0x2007:    /* FIGURE SPACE */
1435              case 0x2008:    /* PUNCTUATION SPACE */
1436              case 0x2009:    /* THIN SPACE */
1437              case 0x200A:    /* HAIR SPACE */
1438              case 0x202f:    /* NARROW NO-BREAK SPACE */
1439              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1440              case 0x3000:    /* IDEOGRAPHIC SPACE */
1441              OK = TRUE;
1442              break;
1443    
1444              default:
1445              OK = FALSE;
1446              break;
1447              }
1448    
1449            if (OK == (d == OP_HSPACE))
1450              {
1451              if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1452                  codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1453                {
1454                active_count--;           /* Remove non-match possibility */
1455                next_active_state--;
1456                }
1457              ADD_NEW_DATA(-(state_offset + count), 0, 0);
1458              }
1459            }
1460          break;
1461    
1462        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1463    #ifdef SUPPORT_UCP
1464        case OP_PROP_EXTRA + OP_TYPEEXACT:        case OP_PROP_EXTRA + OP_TYPEEXACT:
1465        case OP_PROP_EXTRA + OP_TYPEUPTO:        case OP_PROP_EXTRA + OP_TYPEUPTO:
1466        case OP_PROP_EXTRA + OP_TYPEMINUPTO:        case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1467          case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1468        if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1469          { ADD_ACTIVE(state_offset + 6, 0); }          { ADD_ACTIVE(state_offset + 6, 0); }
1470        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1471        if (clen > 0)        if (clen > 0)
1472          {          {
1473          BOOL OK;          BOOL OK;
1474          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1475          switch(code[4])          switch(code[4])
1476            {            {
1477            case PT_ANY:            case PT_ANY:
# Line 1069  for (;;) Line 1479  for (;;)
1479            break;            break;
1480    
1481            case PT_LAMP:            case PT_LAMP:
1482            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1483            break;            break;
1484    
1485            case PT_GC:            case PT_GC:
1486            OK = category == code[5];            OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1487            break;            break;
1488    
1489            case PT_PC:            case PT_PC:
1490            OK = chartype == code[5];            OK = prop->chartype == code[5];
1491            break;            break;
1492    
1493            case PT_SC:            case PT_SC:
1494            OK = script == code[5];            OK = prop->script == code[5];
1495            break;            break;
1496    
1497            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1093  for (;;) Line 1503  for (;;)
1503    
1504          if (OK == (d == OP_PROP))          if (OK == (d == OP_PROP))
1505            {            {
1506              if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1507                {
1508                active_count--;           /* Remove non-match possibility */
1509                next_active_state--;
1510                }
1511            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1512              { ADD_NEW(state_offset + 6, 0); }              { ADD_NEW(state_offset + 6, 0); }
1513            else            else
# Line 1105  for (;;) Line 1520  for (;;)
1520        case OP_EXTUNI_EXTRA + OP_TYPEEXACT:        case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1521        case OP_EXTUNI_EXTRA + OP_TYPEUPTO:        case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1522        case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:        case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1523          case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1524        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1525          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 4, 0); }
1526        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1527        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1528          {          {
1529          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1530          int ncount = 0;          int ncount = 0;
1531            if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1532              {
1533              active_count--;           /* Remove non-match possibility */
1534              next_active_state--;
1535              }
1536          while (nptr < end_subject)          while (nptr < end_subject)
1537            {            {
1538            int nd;            int nd;
1539            int ndlen = 1;            int ndlen = 1;
1540            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1541            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1542            ncount++;            ncount++;
1543            nptr += ndlen;            nptr += ndlen;
1544            }            }
# Line 1127  for (;;) Line 1548  for (;;)
1548            { ADD_NEW_DATA(-state_offset, count, ncount); }            { ADD_NEW_DATA(-state_offset, count, ncount); }
1549          }          }
1550        break;        break;
1551    #endif
1552    
1553          /*-----------------------------------------------------------------*/
1554          case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1555          case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1556          case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1557          case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1558          if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1559            { ADD_ACTIVE(state_offset + 4, 0); }
1560          count = current_state->count;  /* Number already matched */
1561          if (clen > 0)
1562            {
1563            int ncount = 0;
1564            switch (c)
1565              {
1566              case 0x000b:
1567              case 0x000c:
1568              case 0x0085:
1569              case 0x2028:
1570              case 0x2029:
1571              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1572              goto ANYNL03;
1573    
1574              case 0x000d:
1575              if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1576              /* Fall through */
1577    
1578              ANYNL03:
1579              case 0x000a:
1580              if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1581                {
1582                active_count--;           /* Remove non-match possibility */
1583                next_active_state--;
1584                }
1585              if (++count >= GET2(code, 1))
1586                { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1587              else
1588                { ADD_NEW_DATA(-state_offset, count, ncount); }
1589              break;
1590    
1591              default:
1592              break;
1593              }
1594            }
1595          break;
1596    
1597          /*-----------------------------------------------------------------*/
1598          case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1599          case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1600          case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1601          case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1602          if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1603            { ADD_ACTIVE(state_offset + 4, 0); }
1604          count = current_state->count;  /* Number already matched */
1605          if (clen > 0)
1606            {
1607            BOOL OK;
1608            switch (c)
1609              {
1610              case 0x000a:
1611              case 0x000b:
1612              case 0x000c:
1613              case 0x000d:
1614              case 0x0085:
1615              case 0x2028:
1616              case 0x2029:
1617              OK = TRUE;
1618              break;
1619    
1620              default:
1621              OK = FALSE;
1622              }
1623    
1624            if (OK == (d == OP_VSPACE))
1625              {
1626              if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1627                {
1628                active_count--;           /* Remove non-match possibility */
1629                next_active_state--;
1630                }
1631              if (++count >= GET2(code, 1))
1632                { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1633              else
1634                { ADD_NEW_DATA(-state_offset, count, 0); }
1635              }
1636            }
1637          break;
1638    
1639          /*-----------------------------------------------------------------*/
1640          case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1641          case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1642          case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1643          case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1644          if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1645            { ADD_ACTIVE(state_offset + 4, 0); }
1646          count = current_state->count;  /* Number already matched */
1647          if (clen > 0)
1648            {
1649            BOOL OK;
1650            switch (c)
1651              {
1652              case 0x09:      /* HT */
1653              case 0x20:      /* SPACE */
1654              case 0xa0:      /* NBSP */
1655              case 0x1680:    /* OGHAM SPACE MARK */
1656              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1657              case 0x2000:    /* EN QUAD */
1658              case 0x2001:    /* EM QUAD */
1659              case 0x2002:    /* EN SPACE */
1660              case 0x2003:    /* EM SPACE */
1661              case 0x2004:    /* THREE-PER-EM SPACE */
1662              case 0x2005:    /* FOUR-PER-EM SPACE */
1663              case 0x2006:    /* SIX-PER-EM SPACE */
1664              case 0x2007:    /* FIGURE SPACE */
1665              case 0x2008:    /* PUNCTUATION SPACE */
1666              case 0x2009:    /* THIN SPACE */
1667              case 0x200A:    /* HAIR SPACE */
1668              case 0x202f:    /* NARROW NO-BREAK SPACE */
1669              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1670              case 0x3000:    /* IDEOGRAPHIC SPACE */
1671              OK = TRUE;
1672              break;
1673    
1674              default:
1675              OK = FALSE;
1676              break;
1677              }
1678    
1679            if (OK == (d == OP_HSPACE))
1680              {
1681              if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1682                {
1683                active_count--;           /* Remove non-match possibility */
1684                next_active_state--;
1685                }
1686              if (++count >= GET2(code, 1))
1687                { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1688              else
1689                { ADD_NEW_DATA(-state_offset, count, 0); }
1690              }
1691            }
1692          break;
1693    
1694  /* ========================================================================== */  /* ========================================================================== */
1695        /* These opcodes are followed by a character that is usually compared        /* These opcodes are followed by a character that is usually compared
# Line 1148  for (;;) Line 1711  for (;;)
1711          {          {
1712          if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else          if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1713            {            {
1714            int othercase;            unsigned int othercase;
1715            if (c < 128) othercase = fcc[c]; else            if (c < 128) othercase = fcc[c]; else
1716    
1717            /* If we have Unicode property support, we can use it to test the            /* If we have Unicode property support, we can use it to test the
1718            other case of the character. */            other case of the character. */
1719    
1720  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1721            othercase = _pcre_ucp_othercase(c);            othercase = UCD_OTHERCASE(c);
1722  #else  #else
1723            othercase = -1;            othercase = NOTACHAR;
1724  #endif  #endif
1725    
1726            if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }            if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
# Line 1180  for (;;) Line 1743  for (;;)
1743        to wait for them to pass before continuing. */        to wait for them to pass before continuing. */
1744    
1745        case OP_EXTUNI:        case OP_EXTUNI:
1746        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1747          {          {
1748          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1749          int ncount = 0;          int ncount = 0;
# Line 1188  for (;;) Line 1751  for (;;)
1751            {            {
1752            int nclen = 1;            int nclen = 1;
1753            GETCHARLEN(c, nptr, nclen);            GETCHARLEN(c, nptr, nclen);
1754            if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(c) != ucp_M) break;
1755            ncount++;            ncount++;
1756            nptr += nclen;            nptr += nclen;
1757            }            }
# Line 1198  for (;;) Line 1761  for (;;)
1761  #endif  #endif
1762    
1763        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1764          /* This is tricky like EXTUNI because it too can match more than one
1765          character (when CR is followed by LF). In this case, set up a negative
1766          state to wait for one character to pass before continuing. */
1767    
1768          case OP_ANYNL:
1769          if (clen > 0) switch(c)
1770            {
1771            case 0x000b:
1772            case 0x000c:
1773            case 0x0085:
1774            case 0x2028:
1775            case 0x2029:
1776            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1777    
1778            case 0x000a:
1779            ADD_NEW(state_offset + 1, 0);
1780            break;
1781    
1782            case 0x000d:
1783            if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1784              {
1785              ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1786              }
1787            else
1788              {
1789              ADD_NEW(state_offset + 1, 0);
1790              }
1791            break;
1792            }
1793          break;
1794    
1795          /*-----------------------------------------------------------------*/
1796          case OP_NOT_VSPACE:
1797          if (clen > 0) switch(c)
1798            {
1799            case 0x000a:
1800            case 0x000b:
1801            case 0x000c:
1802            case 0x000d:
1803            case 0x0085:
1804            case 0x2028:
1805            case 0x2029:
1806            break;
1807    
1808            default:
1809            ADD_NEW(state_offset + 1, 0);
1810            break;
1811            }
1812          break;
1813    
1814          /*-----------------------------------------------------------------*/
1815          case OP_VSPACE:
1816          if (clen > 0) switch(c)
1817            {
1818            case 0x000a:
1819            case 0x000b:
1820            case 0x000c:
1821            case 0x000d:
1822            case 0x0085:
1823            case 0x2028:
1824            case 0x2029:
1825            ADD_NEW(state_offset + 1, 0);
1826            break;
1827    
1828            default: break;
1829            }
1830          break;
1831    
1832          /*-----------------------------------------------------------------*/
1833          case OP_NOT_HSPACE:
1834          if (clen > 0) switch(c)
1835            {
1836            case 0x09:      /* HT */
1837            case 0x20:      /* SPACE */
1838            case 0xa0:      /* NBSP */
1839            case 0x1680:    /* OGHAM SPACE MARK */
1840            case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1841            case 0x2000:    /* EN QUAD */
1842            case 0x2001:    /* EM QUAD */
1843            case 0x2002:    /* EN SPACE */
1844            case 0x2003:    /* EM SPACE */
1845            case 0x2004:    /* THREE-PER-EM SPACE */
1846            case 0x2005:    /* FOUR-PER-EM SPACE */
1847            case 0x2006:    /* SIX-PER-EM SPACE */
1848            case 0x2007:    /* FIGURE SPACE */
1849            case 0x2008:    /* PUNCTUATION SPACE */
1850            case 0x2009:    /* THIN SPACE */
1851            case 0x200A:    /* HAIR SPACE */
1852            case 0x202f:    /* NARROW NO-BREAK SPACE */
1853            case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1854            case 0x3000:    /* IDEOGRAPHIC SPACE */
1855            break;
1856    
1857            default:
1858            ADD_NEW(state_offset + 1, 0);
1859            break;
1860            }
1861          break;
1862    
1863          /*-----------------------------------------------------------------*/
1864          case OP_HSPACE:
1865          if (clen > 0) switch(c)
1866            {
1867            case 0x09:      /* HT */
1868            case 0x20:      /* SPACE */
1869            case 0xa0:      /* NBSP */
1870            case 0x1680:    /* OGHAM SPACE MARK */
1871            case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1872            case 0x2000:    /* EN QUAD */
1873            case 0x2001:    /* EM QUAD */
1874            case 0x2002:    /* EN SPACE */
1875            case 0x2003:    /* EM SPACE */
1876            case 0x2004:    /* THREE-PER-EM SPACE */
1877            case 0x2005:    /* FOUR-PER-EM SPACE */
1878            case 0x2006:    /* SIX-PER-EM SPACE */
1879            case 0x2007:    /* FIGURE SPACE */
1880            case 0x2008:    /* PUNCTUATION SPACE */
1881            case 0x2009:    /* THIN SPACE */
1882            case 0x200A:    /* HAIR SPACE */
1883            case 0x202f:    /* NARROW NO-BREAK SPACE */
1884            case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1885            case 0x3000:    /* IDEOGRAPHIC SPACE */
1886            ADD_NEW(state_offset + 1, 0);
1887            break;
1888            }
1889          break;
1890    
1891          /*-----------------------------------------------------------------*/
1892        /* Match a negated single character. This is only used for one-byte        /* Match a negated single character. This is only used for one-byte
1893        characters, that is, we know that d < 256. The character we are        characters, that is, we know that d < 256. The character we are
1894        checking (c) can be multibyte. */        checking (c) can be multibyte. */
# Line 1205  for (;;) Line 1896  for (;;)
1896        case OP_NOT:        case OP_NOT:
1897        if (clen > 0)        if (clen > 0)
1898          {          {
1899          int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;          unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1900          if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }          if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1901          }          }
1902        break;        break;
# Line 1213  for (;;) Line 1904  for (;;)
1904        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1905        case OP_PLUS:        case OP_PLUS:
1906        case OP_MINPLUS:        case OP_MINPLUS:
1907          case OP_POSPLUS:
1908        case OP_NOTPLUS:        case OP_NOTPLUS:
1909        case OP_NOTMINPLUS:        case OP_NOTMINPLUS:
1910          case OP_NOTPOSPLUS:
1911        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
1912        if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1913        if (clen > 0)        if (clen > 0)
1914          {          {
1915          int otherd = -1;          unsigned int otherd = NOTACHAR;
1916          if ((ims & PCRE_CASELESS) != 0)          if ((ims & PCRE_CASELESS) != 0)
1917            {            {
1918  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1919            if (utf8 && d >= 128)            if (utf8 && d >= 128)
1920              {              {
1921  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1922              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
1923  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1924              }              }
1925            else            else
# Line 1234  for (;;) Line 1927  for (;;)
1927            otherd = fcc[d];            otherd = fcc[d];
1928            }            }
1929          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1930            { count++; ADD_NEW(state_offset, count); }            {
1931              if (count > 0 &&
1932                  (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1933                {
1934                active_count--;             /* Remove non-match possibility */
1935                next_active_state--;
1936                }
1937              count++;
1938              ADD_NEW(state_offset, count);
1939              }
1940          }          }
1941        break;        break;
1942    
1943        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1944        case OP_QUERY:        case OP_QUERY:
1945        case OP_MINQUERY:        case OP_MINQUERY:
1946          case OP_POSQUERY:
1947        case OP_NOTQUERY:        case OP_NOTQUERY:
1948        case OP_NOTMINQUERY:        case OP_NOTMINQUERY:
1949          case OP_NOTPOSQUERY:
1950        ADD_ACTIVE(state_offset + dlen + 1, 0);        ADD_ACTIVE(state_offset + dlen + 1, 0);
1951        if (clen > 0)        if (clen > 0)
1952          {          {
1953          int otherd = -1;          unsigned int otherd = NOTACHAR;
1954          if ((ims & PCRE_CASELESS) != 0)          if ((ims & PCRE_CASELESS) != 0)
1955            {            {
1956  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1957            if (utf8 && d >= 128)            if (utf8 && d >= 128)
1958              {              {
1959  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1960              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
1961  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1962              }              }
1963            else            else
# Line 1261  for (;;) Line 1965  for (;;)
1965            otherd = fcc[d];            otherd = fcc[d];
1966            }            }
1967          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1968            { ADD_NEW(state_offset + dlen + 1, 0); }            {
1969              if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1970                {
1971                active_count--;            /* Remove non-match possibility */
1972                next_active_state--;
1973                }
1974              ADD_NEW(state_offset + dlen + 1, 0);
1975              }
1976          }          }
1977        break;        break;
1978    
1979        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1980        case OP_STAR:        case OP_STAR:
1981        case OP_MINSTAR:        case OP_MINSTAR:
1982          case OP_POSSTAR:
1983        case OP_NOTSTAR:        case OP_NOTSTAR:
1984        case OP_NOTMINSTAR:        case OP_NOTMINSTAR:
1985          case OP_NOTPOSSTAR:
1986        ADD_ACTIVE(state_offset + dlen + 1, 0);        ADD_ACTIVE(state_offset + dlen + 1, 0);
1987        if (clen > 0)        if (clen > 0)
1988          {          {
1989          int otherd = -1;          unsigned int otherd = NOTACHAR;
1990          if ((ims & PCRE_CASELESS) != 0)          if ((ims & PCRE_CASELESS) != 0)
1991            {            {
1992  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1993            if (utf8 && d >= 128)            if (utf8 && d >= 128)
1994              {              {
1995  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1996              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
1997  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1998              }              }
1999            else            else
# Line 1288  for (;;) Line 2001  for (;;)
2001            otherd = fcc[d];            otherd = fcc[d];
2002            }            }
2003          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2004            { ADD_NEW(state_offset, 0); }            {
2005              if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2006                {
2007                active_count--;            /* Remove non-match possibility */
2008                next_active_state--;
2009                }
2010              ADD_NEW(state_offset, 0);
2011              }
2012          }          }
2013        break;        break;
2014    
2015        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2016        case OP_EXACT:        case OP_EXACT:
2017          case OP_NOTEXACT:
2018          count = current_state->count;  /* Number already matched */
2019          if (clen > 0)
2020            {
2021            unsigned int otherd = NOTACHAR;
2022            if ((ims & PCRE_CASELESS) != 0)
2023              {
2024    #ifdef SUPPORT_UTF8
2025              if (utf8 && d >= 128)
2026                {
2027    #ifdef SUPPORT_UCP
2028                otherd = UCD_OTHERCASE(d);
2029    #endif  /* SUPPORT_UCP */
2030                }
2031              else
2032    #endif  /* SUPPORT_UTF8 */
2033              otherd = fcc[d];
2034              }
2035            if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2036              {
2037              if (++count >= GET2(code, 1))
2038                { ADD_NEW(state_offset + dlen + 3, 0); }
2039              else
2040                { ADD_NEW(state_offset, count); }
2041              }
2042            }
2043          break;
2044    
2045          /*-----------------------------------------------------------------*/
2046        case OP_UPTO:        case OP_UPTO:
2047        case OP_MINUPTO:        case OP_MINUPTO:
2048        case OP_NOTEXACT:        case OP_POSUPTO:
2049        case OP_NOTUPTO:        case OP_NOTUPTO:
2050        case OP_NOTMINUPTO:        case OP_NOTMINUPTO:
2051        if (codevalue != OP_EXACT && codevalue != OP_NOTEXACT)        case OP_NOTPOSUPTO:
2052          { ADD_ACTIVE(state_offset + dlen + 3, 0); }        ADD_ACTIVE(state_offset + dlen + 3, 0);
2053        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2054        if (clen > 0)        if (clen > 0)
2055          {          {
2056          int otherd = -1;          unsigned int otherd = NOTACHAR;
2057          if ((ims & PCRE_CASELESS) != 0)          if ((ims & PCRE_CASELESS) != 0)
2058            {            {
2059  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2060            if (utf8 && d >= 128)            if (utf8 && d >= 128)
2061              {              {
2062  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2063              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2064  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2065              }              }
2066            else            else
# Line 1320  for (;;) Line 2069  for (;;)
2069            }            }
2070          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2071            {            {
2072              if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2073                {
2074                active_count--;             /* Remove non-match possibility */
2075                next_active_state--;
2076                }
2077            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
2078              { ADD_NEW(state_offset + dlen + 3, 0); }              { ADD_NEW(state_offset + dlen + 3, 0); }
2079            else            else
# Line 1414  for (;;) Line 2168  for (;;)
2168    
2169  /* ========================================================================== */  /* ========================================================================== */
2170        /* These are the opcodes for fancy brackets of various kinds. We have        /* These are the opcodes for fancy brackets of various kinds. We have
2171        to use recursion in order to handle them. */        to use recursion in order to handle them. The "always failing" assertion
2172          (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2173          though the other "backtracking verbs" are not supported. */
2174    
2175          case OP_FAIL:
2176          forced_fail++;    /* Count FAILs for multiple states */
2177          break;
2178    
2179        case OP_ASSERT:        case OP_ASSERT:
2180        case OP_ASSERT_NOT:        case OP_ASSERT_NOT:
# Line 1448  for (;;) Line 2208  for (;;)
2208    
2209        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2210        case OP_COND:        case OP_COND:
2211          case OP_SCOND:
2212          {          {
2213          int local_offsets[1000];          int local_offsets[1000];
2214          int local_workspace[1000];          int local_workspace[1000];
2215          int condcode = code[LINK_SIZE+1];          int codelink = GET(code, 1);
2216            int condcode;
2217    
2218          /* The only supported version of OP_CREF is for the value 0xffff, which          /* Because of the way auto-callout works during compile, a callout item
2219          means "test if in a recursion". */          is inserted between OP_COND and an assertion condition. This does not
2220            happen for the other conditions. */
2221    
2222          if (condcode == OP_CREF)          if (code[LINK_SIZE+1] == OP_CALLOUT)
2223              {
2224              rrc = 0;
2225              if (pcre_callout != NULL)
2226                {
2227                pcre_callout_block cb;
2228                cb.version          = 1;   /* Version 1 of the callout block */
2229                cb.callout_number   = code[LINK_SIZE+2];
2230                cb.offset_vector    = offsets;
2231                cb.subject          = (PCRE_SPTR)start_subject;
2232                cb.subject_length   = end_subject - start_subject;
2233                cb.start_match      = current_subject - start_subject;
2234                cb.current_position = ptr - start_subject;
2235                cb.pattern_position = GET(code, LINK_SIZE + 3);
2236                cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2237                cb.capture_top      = 1;
2238                cb.capture_last     = -1;
2239                cb.callout_data     = md->callout_data;
2240                if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
2241                }
2242              if (rrc > 0) break;                      /* Fail this thread */
2243              code += _pcre_OP_lengths[OP_CALLOUT];    /* Skip callout data */
2244              }
2245    
2246            condcode = code[LINK_SIZE+1];
2247    
2248            /* Back reference conditions are not supported */
2249    
2250            if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2251    
2252            /* The DEFINE condition is always false */
2253    
2254            if (condcode == OP_DEF)
2255              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2256    
2257            /* The only supported version of OP_RREF is for the value RREF_ANY,
2258            which means "test if in any recursion". We can't test for specifically
2259            recursed groups. */
2260    
2261            else if (condcode == OP_RREF)
2262            {            {
2263            int value = GET2(code, LINK_SIZE+2);            int value = GET2(code, LINK_SIZE+2);
2264            if (value != 0xffff) return PCRE_ERROR_DFA_UCOND;            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2265            if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }            if (recursing > 0)
2266              else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2267              else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2268            }            }
2269    
2270          /* Otherwise, the condition is an assertion */          /* Otherwise, the condition is an assertion */
# Line 1491  for (;;) Line 2294  for (;;)
2294                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2295              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2296            else            else
2297              { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2298            }            }
2299          }          }
2300        break;        break;
# Line 1643  for (;;) Line 2446  for (;;)
2446        /* Handle callouts */        /* Handle callouts */
2447    
2448        case OP_CALLOUT:        case OP_CALLOUT:
2449          rrc = 0;
2450        if (pcre_callout != NULL)        if (pcre_callout != NULL)
2451          {          {
         int rrc;  
2452          pcre_callout_block cb;          pcre_callout_block cb;
2453          cb.version          = 1;   /* Version 1 of the callout block */          cb.version          = 1;   /* Version 1 of the callout block */
2454          cb.callout_number   = code[1];          cb.callout_number   = code[1];
# Line 1660  for (;;) Line 2463  for (;;)
2463          cb.capture_last     = -1;          cb.capture_last     = -1;
2464          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
2465          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
         if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }  
2466          }          }
2467          if (rrc == 0)
2468            { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2469        break;        break;
2470    
2471    
# Line 1677  for (;;) Line 2481  for (;;)
2481    /* We have finished the processing at the current subject character. If no    /* We have finished the processing at the current subject character. If no
2482    new states have been set for the next character, we have found all the    new states have been set for the next character, we have found all the
2483    matches that we are going to find. If we are at the top level and partial    matches that we are going to find. If we are at the top level and partial
2484    matching has been requested, check for appropriate conditions. */    matching has been requested, check for appropriate conditions. The "forced_
2485      fail" variable counts the number of (*F) encountered for the character. If it
2486      is equal to the original active_count (saved in workspace[1]) it means that
2487      (*F) was found on every active state. In this case we don't want to give a
2488      partial match. */
2489    
2490    if (new_count <= 0)    if (new_count <= 0)
2491      {      {
2492      if (match_count < 0 &&                     /* No matches found */      if (rlevel == 1 &&                               /* Top level, and */
2493          rlevel == 1 &&                         /* Top level match function */          reached_end != workspace[1] &&               /* Not all reached end */
2494          (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */          forced_fail != workspace[1] &&               /* Not all forced fail & */
2495          ptr >= end_subject &&                  /* Reached end of subject */          (                                            /* either... */
2496          ptr > current_subject)                 /* Matched non-empty string */          (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
2497            ||                                           /* or... */
2498            ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
2499             match_count < 0)                            /* no matches */
2500            ) &&                                         /* And... */
2501            ptr >= end_subject &&                     /* Reached end of subject */
2502            ptr > current_subject)                    /* Matched non-empty string */
2503        {        {
2504        if (offsetcount >= 2)        if (offsetcount >= 2)
2505          {          {
# Line 1728  is not anchored. Line 2542  is not anchored.
2542    
2543  Arguments:  Arguments:
2544    argument_re     points to the compiled expression    argument_re     points to the compiled expression
2545    extra_data      points to extra data or is NULL (not currently used)    extra_data      points to extra data or is NULL
2546    subject         points to the subject string    subject         points to the subject string
2547    length          length of subject string (may contain binary zeros)    length          length of subject string (may contain binary zeros)
2548    start_offset    where to start in the subject string    start_offset    where to start in the subject string
# Line 1744  Returns:          > 0 => number of match Line 2558  Returns:          > 0 => number of match
2558                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
2559  */  */
2560    
2561  PCRE_DATA_SCOPE int  PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2562  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2563    const char *subject, int length, int start_offset, int options, int *offsets,    const char *subject, int length, int start_offset, int options, int *offsets,
2564    int offsetcount, int *workspace, int wscount)    int offsetcount, int *workspace, int wscount)
# Line 1834  md->end_subject = end_subject; Line 2648  md->end_subject = end_subject;
2648  md->moptions = options;  md->moptions = options;
2649  md->poptions = re->options;  md->poptions = re->options;
2650    
2651  /* Handle different types of newline. The two bits give four cases. If nothing  /* If the BSR option is not set at match time, copy what was set
2652  is set at run time, whatever was used at compile time applies. */  at compile time. */
2653    
2654    if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2655      {
2656      if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2657        md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2658    #ifdef BSR_ANYCRLF
2659      else md->moptions |= PCRE_BSR_ANYCRLF;
2660    #endif
2661      }
2662    
2663    /* Handle different types of newline. The three bits give eight cases. If
2664    nothing is set at run time, whatever was used at compile time applies. */
2665    
2666  switch ((((options & PCRE_NEWLINE_CRLF) == 0)? re->options : options) &  switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2667           PCRE_NEWLINE_CRLF)           PCRE_NEWLINE_BITS)
2668    {    {
2669    default:              newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
2670    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2671    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2672    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
2673         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2674      case PCRE_NEWLINE_ANY: newline = -1; break;
2675      case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2676      default: return PCRE_ERROR_BADNEWLINE;
2677    }    }
2678    
2679  if (newline > 255)  if (newline == -2)
2680    {    {
2681    md->nllen = 2;    md->nltype = NLTYPE_ANYCRLF;
2682    md->nl[0] = (newline >> 8) & 255;    }
2683    md->nl[1] = newline & 255;  else if (newline < 0)
2684      {
2685      md->nltype = NLTYPE_ANY;
2686    }    }
2687  else  else
2688    {    {
2689    md->nllen = 1;    md->nltype = NLTYPE_FIXED;
2690    md->nl[0] = newline;    if (newline > 255)
2691        {
2692        md->nllen = 2;
2693        md->nl[0] = (newline >> 8) & 255;
2694        md->nl[1] = newline & 255;
2695        }
2696      else
2697        {
2698        md->nllen = 1;
2699        md->nl[0] = newline;
2700        }
2701    }    }
2702    
2703  /* Check a UTF-8 string if required. Unfortunately there's no way of passing  /* Check a UTF-8 string if required. Unfortunately there's no way of passing
# Line 1889  if (md->tables == NULL) md->tables = _pc Line 2730  if (md->tables == NULL) md->tables = _pc
2730  used in a loop when finding where to start. */  used in a loop when finding where to start. */
2731    
2732  lcc = md->tables + lcc_offset;  lcc = md->tables + lcc_offset;
2733  startline = (re->options & PCRE_STARTLINE) != 0;  startline = (re->flags & PCRE_STARTLINE) != 0;
2734  firstline = (re->options & PCRE_FIRSTLINE) != 0;  firstline = (re->options & PCRE_FIRSTLINE) != 0;
2735    
2736  /* Set up the first character to match, if available. The first_byte value is  /* Set up the first character to match, if available. The first_byte value is
# Line 1900  studied, there may be a bitmap of possib Line 2741  studied, there may be a bitmap of possib
2741    
2742  if (!anchored)  if (!anchored)
2743    {    {
2744    if ((re->options & PCRE_FIRSTSET) != 0)    if ((re->flags & PCRE_FIRSTSET) != 0)
2745      {      {
2746      first_byte = re->first_byte & 255;      first_byte = re->first_byte & 255;
2747      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
# Line 1917  if (!anchored) Line 2758  if (!anchored)
2758  /* For anchored or unanchored matches, there may be a "last known required  /* For anchored or unanchored matches, there may be a "last known required
2759  character" set. */  character" set. */
2760    
2761  if ((re->options & PCRE_REQCHSET) != 0)  if ((re->flags & PCRE_REQCHSET) != 0)
2762    {    {
2763    req_byte = re->req_byte & 255;    req_byte = re->req_byte & 255;
2764    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
# Line 1925  if ((re->options & PCRE_REQCHSET) != 0) Line 2766  if ((re->options & PCRE_REQCHSET) != 0)
2766    }    }
2767    
2768  /* Call the main matching function, looping for a non-anchored regex after a  /* Call the main matching function, looping for a non-anchored regex after a
2769  failed match. Unless restarting, optimize by moving to the first match  failed match. If not restarting, perform certain optimizations at the start of
2770  character if possible, when not anchored. Then unless wanting a partial match,  a match. */
 check for a required later character. */  
2771    
2772  for (;;)  for (;;)
2773    {    {
# Line 1937  for (;;) Line 2777  for (;;)
2777      {      {
2778      const uschar *save_end_subject = end_subject;      const uschar *save_end_subject = end_subject;
2779    
2780      /* Advance to a unique first char if possible. If firstline is TRUE, the      /* If firstline is TRUE, the start of the match is constrained to the first
2781      start of the match is constrained to the first line of a multiline string.      line of a multiline string. Implement this by temporarily adjusting
2782      Implement this by temporarily adjusting end_subject so that we stop      end_subject so that we stop scanning at a newline. If the match fails at
2783      scanning at a newline. If the match fails at the newline, later code breaks      the newline, later code breaks this loop. */
     this loop. */  
2784    
2785      if (firstline)      if (firstline)
2786        {        {
2787        const uschar *t = current_subject;        USPTR t = current_subject;
2788        while (t <= save_end_subject - md->nllen && !IS_NEWLINE(t)) t++;  #ifdef SUPPORT_UTF8
2789          if (utf8)
2790            {
2791            while (t < md->end_subject && !IS_NEWLINE(t))
2792              {
2793              t++;
2794              while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2795              }
2796            }
2797          else
2798    #endif
2799          while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2800        end_subject = t;        end_subject = t;
2801        }        }
2802    
2803      if (first_byte >= 0)      /* There are some optimizations that avoid running the match if a known
2804        starting point is not found, or if a known later character is not present.
2805        However, there is an option that disables these, for testing and for
2806        ensuring that all callouts do actually occur. */
2807    
2808        if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2809        {        {
       if (first_byte_caseless)  
         while (current_subject < end_subject &&  
                lcc[*current_subject] != first_byte)  
           current_subject++;  
       else  
         while (current_subject < end_subject && *current_subject != first_byte)  
           current_subject++;  
       }  
2810    
2811      /* Or to just after a linebreak for a multiline match if possible */        /* Advance to a known first byte. */
2812    
2813      else if (startline)        if (first_byte >= 0)
       {  
       if (current_subject > md->start_subject + md->nllen +  
           start_offset)  
2814          {          {
2815          while (current_subject <= end_subject &&          if (first_byte_caseless)
2816                 !IS_NEWLINE(current_subject - md->nllen))            while (current_subject < end_subject &&
2817            current_subject++;                   lcc[*current_subject] != first_byte)
2818                current_subject++;
2819            else
2820              while (current_subject < end_subject &&
2821                     *current_subject != first_byte)
2822                current_subject++;
2823          }          }
       }  
2824    
2825      /* Or to a non-unique first char after study */        /* Or to just after a linebreak for a multiline match if possible */
2826    
2827      else if (start_bits != NULL)        else if (startline)
2828        {          {
2829        while (current_subject < end_subject)          if (current_subject > md->start_subject + start_offset)
2830              {
2831    #ifdef SUPPORT_UTF8
2832              if (utf8)
2833                {
2834                while (current_subject < end_subject &&
2835                       !WAS_NEWLINE(current_subject))
2836                  {
2837                  current_subject++;
2838                  while(current_subject < end_subject &&
2839                        (*current_subject & 0xc0) == 0x80)
2840                    current_subject++;
2841                  }
2842                }
2843              else
2844    #endif
2845              while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2846                current_subject++;
2847    
2848              /* If we have just passed a CR and the newline option is ANY or
2849              ANYCRLF, and we are now at a LF, advance the match position by one
2850              more character. */
2851    
2852              if (current_subject[-1] == CHAR_CR &&
2853                   (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2854                   current_subject < end_subject &&
2855                   *current_subject == CHAR_NL)
2856                current_subject++;
2857              }
2858            }
2859    
2860          /* Or to a non-unique first char after study */
2861    
2862          else if (start_bits != NULL)
2863          {          {
2864          register unsigned int c = *current_subject;          while (current_subject < end_subject)
2865          if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;            {
2866            else break;            register unsigned int c = *current_subject;
2867              if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2868                else break;
2869              }
2870          }          }
2871        }        }
2872    
# Line 2004  for (;;) Line 2888  for (;;)
2888    showed up when somebody was matching /^C/ on a 32-megabyte string... so we    showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2889    don't do this when the string is sufficiently long.    don't do this when the string is sufficiently long.
2890    
2891    ALSO: this processing is disabled when partial matching is requested.    ALSO: this processing is disabled when partial matching is requested, and can
2892    */    also be explicitly deactivated. Furthermore, we have to disable it when
2893      restarting after a partial match, because the required character may have
2894      already been matched. */
2895    
2896    if (req_byte >= 0 &&    if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
2897          req_byte >= 0 &&
2898        end_subject - current_subject < REQ_BYTE_MAX &&        end_subject - current_subject < REQ_BYTE_MAX &&
2899        (options & PCRE_PARTIAL) == 0)        (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT|PCRE_DFA_RESTART)) == 0)
2900      {      {
2901      register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);      register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2902    
# Line 2070  for (;;) Line 2957  for (;;)
2957    /* Advance to the next subject character unless we are at the end of a line    /* Advance to the next subject character unless we are at the end of a line
2958    and firstline is set. */    and firstline is set. */
2959    
2960    if (firstline &&    if (firstline && IS_NEWLINE(current_subject)) break;
       current_subject <= end_subject - md->nllen &&  
       IS_NEWLINE(current_subject)) break;  
2961    current_subject++;    current_subject++;
2962    if (utf8)    if (utf8)
2963      {      {
# Line 2080  for (;;) Line 2965  for (;;)
2965        current_subject++;        current_subject++;
2966      }      }
2967    if (current_subject > end_subject) break;    if (current_subject > end_subject) break;
2968    }  
2969      /* If we have just passed a CR and we are now at a LF, and the pattern does
2970      not contain any explicit matches for \r or \n, and the newline option is CRLF
2971      or ANY or ANYCRLF, advance the match position by one more character. */
2972    
2973      if (current_subject[-1] == CHAR_CR &&
2974          current_subject < end_subject &&
2975          *current_subject == CHAR_NL &&
2976          (re->flags & PCRE_HASCRORLF) == 0 &&
2977            (md->nltype == NLTYPE_ANY ||
2978             md->nltype == NLTYPE_ANYCRLF ||
2979             md->nllen == 2))
2980        current_subject++;
2981    
2982      }   /* "Bumpalong" loop */
2983    
2984  return PCRE_ERROR_NOMATCH;  return PCRE_ERROR_NOMATCH;
2985  }  }

Legend:
Removed from v.91  
changed lines
  Added in v.428

  ViewVC Help
Powered by ViewVC 1.1.5