/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory | Revision Log | View Patch

revision 657 by ph10, Mon Aug 15 17:39:09 2011 UTC revision 1486 by ph10, Wed Jun 18 16:48:57 2014 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2010 University of Cambridge             Copyright (c) 1997-2012 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 66  string of that length that matches. In U Line 66  string of that length that matches. In U
66  rather than bytes.  rather than bytes.
67    
68  Arguments:  Arguments:
69    code        pointer to start of group (the bracket)    re              compiled pattern block
70    startcode   pointer to start of the whole pattern    code            pointer to start of group (the bracket)
71    options     the compiling options    startcode       pointer to start of the whole pattern's code
72    had_accept  pointer to flag for (*ACCEPT) encountered    options         the compiling options
73    int         RECURSE depth    recurse_depth   RECURSE depth
74    
75  Returns:   the minimum length  Returns:   the minimum length
76             -1 if \C was encountered             -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
77             -2 internal error (missing capturing bracket)             -2 internal error (missing capturing bracket)
78             -3 internal error (opcode not listed)             -3 internal error (opcode not listed)
79  */  */
80    
81  static int  static int
82  find_minlength(const uschar *code, const uschar *startcode, int options,  find_minlength(const REAL_PCRE *re, const pcre_uchar *code,
83    BOOL *had_accept_ptr, int recurse_depth)    const pcre_uchar *startcode, int options, int recurse_depth)
84  {  {
85  int length = -1;  int length = -1;
86  BOOL utf8 = (options & PCRE_UTF8) != 0;  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
87    BOOL utf = (options & PCRE_UTF8) != 0;
88  BOOL had_recurse = FALSE;  BOOL had_recurse = FALSE;
89  register int branchlength = 0;  register int branchlength = 0;
90  register uschar *cc = (uschar *)code + 1 + LINK_SIZE;  register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
91    
92  if (*code == OP_CBRA || *code == OP_SCBRA ||  if (*code == OP_CBRA || *code == OP_SCBRA ||
93      *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += 2;      *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
94    
95  /* Scan along the opcodes for this branch. If we get to the end of the  /* Scan along the opcodes for this branch. If we get to the end of the
96  branch, check the length against that of the other branches. */  branch, check the length against that of the other branches. */
# Line 97  branch, check the length against that of Line 98  branch, check the length against that of
98  for (;;)  for (;;)
99    {    {
100    int d, min;    int d, min;
101    uschar *cs, *ce;    pcre_uchar *cs, *ce;
102    register int op = *cc;    register pcre_uchar op = *cc;
103    
104    switch (op)    switch (op)
105      {      {
# Line 128  for (;;) Line 129  for (;;)
129      case OP_BRAPOS:      case OP_BRAPOS:
130      case OP_SBRAPOS:      case OP_SBRAPOS:
131      case OP_ONCE:      case OP_ONCE:
132      d = find_minlength(cc, startcode, options, had_accept_ptr, recurse_depth);      case OP_ONCE_NC:
133        d = find_minlength(re, cc, startcode, options, recurse_depth);
134      if (d < 0) return d;      if (d < 0) return d;
135      branchlength += d;      branchlength += d;
     if (*had_accept_ptr) return branchlength;  
136      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
137      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
138      break;      break;
139    
140      /* Reached end of a branch; if it's a ket it is the end of a nested      /* ACCEPT makes things far too complicated; we have to give up. */
     call. If it's ALT it is an alternation in a nested call. If it is END it's  
     the end of the outer call. All can be handled by the same code. If it is  
     ACCEPT, it is essentially the same as END, but we set a flag so that  
     counting stops. */  
141    
142      case OP_ACCEPT:      case OP_ACCEPT:
143      case OP_ASSERT_ACCEPT:      case OP_ASSERT_ACCEPT:
144      *had_accept_ptr = TRUE;      return -1;
145      /* Fall through */  
146        /* Reached end of a branch; if it's a ket it is the end of a nested
147        call. If it's ALT it is an alternation in a nested call. If it is END it's
148        the end of the outer call. All can be handled by the same code. An
149        ACCEPT never reaches this point, because the OP_ACCEPT cases above give
150        up and return -1. */
151    
152      case OP_ALT:      case OP_ALT:
153      case OP_KET:      case OP_KET:
154      case OP_KETRMAX:      case OP_KETRMAX:
# Line 173  for (;;) Line 176  for (;;)
176    
177      case OP_REVERSE:      case OP_REVERSE:
178      case OP_CREF:      case OP_CREF:
179      case OP_NCREF:      case OP_DNCREF:
180      case OP_RREF:      case OP_RREF:
181      case OP_NRREF:      case OP_DNRREF:
182      case OP_DEF:      case OP_DEF:
183      case OP_CALLOUT:      case OP_CALLOUT:
184      case OP_SOD:      case OP_SOD:
# Line 188  for (;;) Line 191  for (;;)
191      case OP_DOLLM:      case OP_DOLLM:
192      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
193      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
194      cc += _pcre_OP_lengths[*cc];      cc += PRIV(OP_lengths)[*cc];
195      break;      break;
196    
197      /* Skip over a subpattern that has a {0} or {0,x} quantifier */      /* Skip over a subpattern that has a {0} or {0,x} quantifier */
# Line 197  for (;;) Line 200  for (;;)
200      case OP_BRAMINZERO:      case OP_BRAMINZERO:
201      case OP_BRAPOSZERO:      case OP_BRAPOSZERO:
202      case OP_SKIPZERO:      case OP_SKIPZERO:
203      cc += _pcre_OP_lengths[*cc];      cc += PRIV(OP_lengths)[*cc];
204      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
205      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
206      break;      break;
# Line 222  for (;;) Line 225  for (;;)
225      case OP_NOTPOSPLUSI:      case OP_NOTPOSPLUSI:
226      branchlength++;      branchlength++;
227      cc += 2;      cc += 2;
228  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
229      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
230  #endif  #endif
231      break;      break;
232    
# Line 242  for (;;) Line 245  for (;;)
245      case OP_NOTEXACT:      case OP_NOTEXACT:
246      case OP_NOTEXACTI:      case OP_NOTEXACTI:
247      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
248      cc += 4;      cc += 2 + IMM2_SIZE;
249  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
250      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
251  #endif  #endif
252      break;      break;
253    
254      case OP_TYPEEXACT:      case OP_TYPEEXACT:
255      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
256      cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;      cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
257          || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
258      break;      break;
259    
260      /* Handle single-char non-literal matchers */      /* Handle single-char non-literal matchers */
# Line 285  for (;;) Line 289  for (;;)
289      cc++;      cc++;
290      break;      break;
291    
292      /* The single-byte matcher means we can't proceed in UTF-8 mode */      /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
293        non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
294        appear, but leave the code, just in case.) */
295    
296      case OP_ANYBYTE:      case OP_ANYBYTE:
297  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
298      if (utf8) return -1;      if (utf) return -1;
299  #endif  #endif
300      branchlength++;      branchlength++;
301      cc++;      cc++;
# Line 305  for (;;) Line 311  for (;;)
311      case OP_TYPEPOSSTAR:      case OP_TYPEPOSSTAR:
312      case OP_TYPEPOSQUERY:      case OP_TYPEPOSQUERY:
313      if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;      if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
314      cc += _pcre_OP_lengths[op];      cc += PRIV(OP_lengths)[op];
315      break;      break;
316    
317      case OP_TYPEUPTO:      case OP_TYPEUPTO:
318      case OP_TYPEMINUPTO:      case OP_TYPEMINUPTO:
319      case OP_TYPEPOSUPTO:      case OP_TYPEPOSUPTO:
320      if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;      if (cc[1 + IMM2_SIZE] == OP_PROP
321      cc += _pcre_OP_lengths[op];        || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
322        cc += PRIV(OP_lengths)[op];
323      break;      break;
324    
325      /* Check a class for variable quantification */      /* Check a class for variable quantification */
326    
 #ifdef SUPPORT_UTF8  
     case OP_XCLASS:  
     cc += GET(cc, 1) - 33;  
     /* Fall through */  
 #endif  
   
327      case OP_CLASS:      case OP_CLASS:
328      case OP_NCLASS:      case OP_NCLASS:
329      cc += 33;  #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
330        case OP_XCLASS:
331        /* The original code caused an unsigned overflow in 64 bit systems,
332        so now we use a conditional statement. */
333        if (op == OP_XCLASS)
334          cc += GET(cc, 1);
335        else
336          cc += PRIV(OP_lengths)[OP_CLASS];
337    #else
338        cc += PRIV(OP_lengths)[OP_CLASS];
339    #endif
340    
341      switch (*cc)      switch (*cc)
342        {        {
343        case OP_CRPLUS:        case OP_CRPLUS:
344        case OP_CRMINPLUS:        case OP_CRMINPLUS:
345          case OP_CRPOSPLUS:
346        branchlength++;        branchlength++;
347        /* Fall through */        /* Fall through */
348    
# Line 338  for (;;) Line 350  for (;;)
350        case OP_CRMINSTAR:        case OP_CRMINSTAR:
351        case OP_CRQUERY:        case OP_CRQUERY:
352        case OP_CRMINQUERY:        case OP_CRMINQUERY:
353          case OP_CRPOSSTAR:
354          case OP_CRPOSQUERY:
355        cc++;        cc++;
356        break;        break;
357    
358        case OP_CRRANGE:        case OP_CRRANGE:
359        case OP_CRMINRANGE:        case OP_CRMINRANGE:
360          case OP_CRPOSRANGE:
361        branchlength += GET2(cc,1);        branchlength += GET2(cc,1);
362        cc += 5;        cc += 1 + 2 * IMM2_SIZE;
363        break;        break;
364    
365        default:        default:
# Line 365  for (;;) Line 380  for (;;)
380      matches an empty string (by default it causes a matching failure), so in      matches an empty string (by default it causes a matching failure), so in
381      that case we must set the minimum length to zero. */      that case we must set the minimum length to zero. */
382    
383      case OP_REF:      case OP_DNREF:     /* Duplicate named pattern back reference */
384        case OP_DNREFI:
385        if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
386          {
387          int count = GET2(cc, 1+IMM2_SIZE);
388          pcre_uchar *slot = (pcre_uchar *)re +
389            re->name_table_offset + GET2(cc, 1) * re->name_entry_size;
390          d = INT_MAX;
391          while (count-- > 0)
392            {
393            ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
394            if (cs == NULL) return -2;
395            do ce += GET(ce, 1); while (*ce == OP_ALT);
396            if (cc > cs && cc < ce)
397              {
398              d = 0;
399              had_recurse = TRUE;
400              break;
401              }
402            else
403              {
404              int dd = find_minlength(re, cs, startcode, options, recurse_depth);
405              if (dd < d) d = dd;
406              }
407            slot += re->name_entry_size;
408            }
409          }
410        else d = 0;
411        cc += 1 + 2*IMM2_SIZE;
412        goto REPEAT_BACK_REFERENCE;
413    
414        case OP_REF:      /* Single back reference */
415      case OP_REFI:      case OP_REFI:
416      if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)      if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
417        {        {
418        ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));        ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
419        if (cs == NULL) return -2;        if (cs == NULL) return -2;
420        do ce += GET(ce, 1); while (*ce == OP_ALT);        do ce += GET(ce, 1); while (*ce == OP_ALT);
421        if (cc > cs && cc < ce)        if (cc > cs && cc < ce)
# Line 379  for (;;) Line 425  for (;;)
425          }          }
426        else        else
427          {          {
428          d = find_minlength(cs, startcode, options, had_accept_ptr,          d = find_minlength(re, cs, startcode, options, recurse_depth);
           recurse_depth);  
         *had_accept_ptr = FALSE;  
429          }          }
430        }        }
431      else d = 0;      else d = 0;
432      cc += 3;      cc += 1 + IMM2_SIZE;
433    
434      /* Handle repeated back references */      /* Handle repeated back references */
435    
436        REPEAT_BACK_REFERENCE:
437      switch (*cc)      switch (*cc)
438        {        {
439        case OP_CRSTAR:        case OP_CRSTAR:
440        case OP_CRMINSTAR:        case OP_CRMINSTAR:
441        case OP_CRQUERY:        case OP_CRQUERY:
442        case OP_CRMINQUERY:        case OP_CRMINQUERY:
443          case OP_CRPOSSTAR:
444          case OP_CRPOSQUERY:
445        min = 0;        min = 0;
446        cc++;        cc++;
447        break;        break;
448    
449        case OP_CRPLUS:        case OP_CRPLUS:
450        case OP_CRMINPLUS:        case OP_CRMINPLUS:
451          case OP_CRPOSPLUS:
452        min = 1;        min = 1;
453        cc++;        cc++;
454        break;        break;
455    
456        case OP_CRRANGE:        case OP_CRRANGE:
457        case OP_CRMINRANGE:        case OP_CRMINRANGE:
458          case OP_CRPOSRANGE:
459        min = GET2(cc, 1);        min = GET2(cc, 1);
460        cc += 5;        cc += 1 + 2 * IMM2_SIZE;
461        break;        break;
462    
463        default:        default:
# Line 423  for (;;) Line 472  for (;;)
472      caught by a recursion depth count. */      caught by a recursion depth count. */
473    
474      case OP_RECURSE:      case OP_RECURSE:
475      cs = ce = (uschar *)startcode + GET(cc, 1);      cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
     if (cs == NULL) return -2;  
476      do ce += GET(ce, 1); while (*ce == OP_ALT);      do ce += GET(ce, 1); while (*ce == OP_ALT);
477      if ((cc > cs && cc < ce) || recurse_depth > 10)      if ((cc > cs && cc < ce) || recurse_depth > 10)
478        had_recurse = TRUE;        had_recurse = TRUE;
479      else      else
480        {        {
481        branchlength += find_minlength(cs, startcode, options, had_accept_ptr,        branchlength += find_minlength(re, cs, startcode, options,
482          recurse_depth + 1);          recurse_depth + 1);
       *had_accept_ptr = FALSE;  
483        }        }
484      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
485      break;      break;
# Line 484  for (;;) Line 531  for (;;)
531      case OP_NOTPOSQUERY:      case OP_NOTPOSQUERY:
532      case OP_NOTPOSQUERYI:      case OP_NOTPOSQUERYI:
533    
534      cc += _pcre_OP_lengths[op];      cc += PRIV(OP_lengths)[op];
535  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
536      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
537  #endif  #endif
538      break;      break;
539    
# Line 495  for (;;) Line 542  for (;;)
542      case OP_MARK:      case OP_MARK:
543      case OP_PRUNE_ARG:      case OP_PRUNE_ARG:
544      case OP_SKIP_ARG:      case OP_SKIP_ARG:
     cc += _pcre_OP_lengths[op] + cc[1];  
     break;  
   
545      case OP_THEN_ARG:      case OP_THEN_ARG:
546      cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];      cc += PRIV(OP_lengths)[op] + cc[1];
547      break;      break;
548    
549      /* The remaining opcodes are just skipped over. */      /* The remaining opcodes are just skipped over. */
# Line 511  for (;;) Line 555  for (;;)
555      case OP_SET_SOM:      case OP_SET_SOM:
556      case OP_SKIP:      case OP_SKIP:
557      case OP_THEN:      case OP_THEN:
558      cc += _pcre_OP_lengths[op];      cc += PRIV(OP_lengths)[op];
559      break;      break;
560    
561      /* This should not occur: we list all opcodes explicitly so that when      /* This should not occur: we list all opcodes explicitly so that when
# Line 540  Arguments: Line 584  Arguments:
584    p             points to the character    p             points to the character
585    caseless      the caseless flag    caseless      the caseless flag
586    cd            the block with char table pointers    cd            the block with char table pointers
587    utf8          TRUE for UTF-8 mode    utf           TRUE for UTF-8 / UTF-16 / UTF-32 mode
588    
589  Returns:        pointer after the character  Returns:        pointer after the character
590  */  */
591    
592  static const uschar *  static const pcre_uchar *
593  set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,  set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
594    compile_data *cd, BOOL utf8)    compile_data *cd, BOOL utf)
595  {  {
596  unsigned int c = *p;  pcre_uint32 c = *p;
597    
598    #ifdef COMPILE_PCRE8
599  SET_BIT(c);  SET_BIT(c);
600    
601  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
602  if (utf8 && c > 127)  if (utf && c > 127)
603    {    {
604    GETCHARINC(c, p);    GETCHARINC(c, p);
605  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
606    if (caseless)    if (caseless)
607      {      {
608      uschar buff[8];      pcre_uchar buff[6];
609      c = UCD_OTHERCASE(c);      c = UCD_OTHERCASE(c);
610      (void)_pcre_ord2utf8(c, buff);      (void)PRIV(ord2utf)(c, buff);
611      SET_BIT(buff[0]);      SET_BIT(buff[0]);
612      }      }
613  #endif  #endif  /* SUPPORT_UCP */
614    return p;    return p;
615    }    }
616  #endif  #else   /* Not SUPPORT_UTF */
617    (void)(utf);   /* Stops warning for unused parameter */
618    #endif  /* SUPPORT_UTF */
619    
620  /* Not UTF-8 mode, or character is less than 128. */  /* Not UTF-8 mode, or character is less than 128. */
621    
622  if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);  if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
623  return p + 1;  return p + 1;
624    #endif  /* COMPILE_PCRE8 */
625    
626    #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
627    if (c > 0xff)
628      {
629      c = 0xff;
630      caseless = FALSE;
631      }
632    SET_BIT(c);
633    
634    #ifdef SUPPORT_UTF
635    if (utf && c > 127)
636      {
637      GETCHARINC(c, p);
638    #ifdef SUPPORT_UCP
639      if (caseless)
640        {
641        c = UCD_OTHERCASE(c);
642        if (c > 0xff)
643          c = 0xff;
644        SET_BIT(c);
645        }
646    #endif  /* SUPPORT_UCP */
647      return p;
648      }
649    #else   /* Not SUPPORT_UTF */
650    (void)(utf);   /* Stops warning for unused parameter */
651    #endif  /* SUPPORT_UTF */
652    
653    if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
654    return p + 1;
655    #endif
656  }  }
657    
658    
# Line 599  Returns:         nothing Line 678  Returns:         nothing
678  */  */
679    
680  static void  static void
681  set_type_bits(uschar *start_bits, int cbit_type, int table_limit,  set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
682    compile_data *cd)    compile_data *cd)
683  {  {
684  register int c;  register pcre_uint32 c;
685  for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];  for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
686    #if defined SUPPORT_UTF && defined COMPILE_PCRE8
687  if (table_limit == 32) return;  if (table_limit == 32) return;
688  for (c = 128; c < 256; c++)  for (c = 128; c < 256; c++)
689    {    {
690    if ((cd->cbits[c/8] & (1 << (c&7))) != 0)    if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
691      {      {
692      uschar buff[8];      pcre_uchar buff[6];
693      (void)_pcre_ord2utf8(c, buff);      (void)PRIV(ord2utf)(c, buff);
694      SET_BIT(buff[0]);      SET_BIT(buff[0]);
695      }      }
696    }    }
697    #endif
698  }  }
699    
700    
# Line 639  Returns:         nothing Line 720  Returns:         nothing
720  */  */
721    
722  static void  static void
723  set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,  set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
724    compile_data *cd)    compile_data *cd)
725  {  {
726  register int c;  register pcre_uint32 c;
727  for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];  for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
728    #if defined SUPPORT_UTF && defined COMPILE_PCRE8
729  if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;  if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
730    #endif
731  }  }
732    
733    
# Line 664  function fails unless the result is SSB_ Line 747  function fails unless the result is SSB_
747  Arguments:  Arguments:
748    code         points to an expression    code         points to an expression
749    start_bits   points to a 32-byte table, initialized to 0    start_bits   points to a 32-byte table, initialized to 0
750    utf8         TRUE if in UTF-8 mode    utf          TRUE if in UTF-8 / UTF-16 / UTF-32 mode
751    cd           the block with char table pointers    cd           the block with char table pointers
752    
753  Returns:       SSB_FAIL     => Failed to find any starting bytes  Returns:       SSB_FAIL     => Failed to find any starting bytes
# Line 674  Returns:       SSB_FAIL     => Failed to Line 757  Returns:       SSB_FAIL     => Failed to
757  */  */
758    
759  static int  static int
760  set_start_bits(const uschar *code, uschar *start_bits, BOOL utf8,  set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
761    compile_data *cd)    compile_data *cd)
762  {  {
763  register int c;  register pcre_uint32 c;
764  int yield = SSB_DONE;  int yield = SSB_DONE;
765  int table_limit = utf8? 16:32;  #if defined SUPPORT_UTF && defined COMPILE_PCRE8
766    int table_limit = utf? 16:32;
767    #else
768    int table_limit = 32;
769    #endif
770    
771  #if 0  #if 0
772  /* ========================================================================= */  /* ========================================================================= */
# Line 701  volatile int dummy; Line 788  volatile int dummy;
788  do  do
789    {    {
790    BOOL try_next = TRUE;    BOOL try_next = TRUE;
791    const uschar *tcode = code + 1 + LINK_SIZE;    const pcre_uchar *tcode = code + 1 + LINK_SIZE;
792    
793    if (*code == OP_CBRA || *code == OP_SCBRA ||    if (*code == OP_CBRA || *code == OP_SCBRA ||
794        *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += 2;        *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
795    
796    while (try_next)    /* Loop for items in this branch */    while (try_next)    /* Loop for items in this branch */
797      {      {
# Line 733  do Line 820  do
820        case OP_COND:        case OP_COND:
821        case OP_CREF:        case OP_CREF:
822        case OP_DEF:        case OP_DEF:
823          case OP_DNCREF:
824          case OP_DNREF:
825          case OP_DNREFI:
826          case OP_DNRREF:
827        case OP_DOLL:        case OP_DOLL:
828        case OP_DOLLM:        case OP_DOLLM:
829        case OP_END:        case OP_END:
# Line 741  do Line 832  do
832        case OP_EXTUNI:        case OP_EXTUNI:
833        case OP_FAIL:        case OP_FAIL:
834        case OP_MARK:        case OP_MARK:
       case OP_NCREF:  
835        case OP_NOT:        case OP_NOT:
836        case OP_NOTEXACT:        case OP_NOTEXACT:
837        case OP_NOTEXACTI:        case OP_NOTEXACTI:
# Line 773  do Line 863  do
863        case OP_NOTUPTOI:        case OP_NOTUPTOI:
864        case OP_NOT_HSPACE:        case OP_NOT_HSPACE:
865        case OP_NOT_VSPACE:        case OP_NOT_VSPACE:
       case OP_NRREF:  
866        case OP_PROP:        case OP_PROP:
867        case OP_PRUNE:        case OP_PRUNE:
868        case OP_PRUNE_ARG:        case OP_PRUNE_ARG:
# Line 790  do Line 879  do
879        case OP_SOM:        case OP_SOM:
880        case OP_THEN:        case OP_THEN:
881        case OP_THEN_ARG:        case OP_THEN_ARG:
       case OP_XCLASS:  
882        return SSB_FAIL;        return SSB_FAIL;
883    
884        /* We can ignore word boundary tests. */        /* We can ignore word boundary tests. */
885    
886        case OP_WORD_BOUNDARY:        case OP_WORD_BOUNDARY:
887        case OP_NOT_WORD_BOUNDARY:        case OP_NOT_WORD_BOUNDARY:
888        tcode++;        tcode++;
889        break;        break;
890    
891        /* If we hit a bracket or a positive lookahead assertion, recurse to set        /* If we hit a bracket or a positive lookahead assertion, recurse to set
892        bits from within the subpattern. If it can't find anything, we have to        bits from within the subpattern. If it can't find anything, we have to
# Line 814  do Line 902  do
902        case OP_CBRAPOS:        case OP_CBRAPOS:
903        case OP_SCBRAPOS:        case OP_SCBRAPOS:
904        case OP_ONCE:        case OP_ONCE:
905          case OP_ONCE_NC:
906        case OP_ASSERT:        case OP_ASSERT:
907        rc = set_start_bits(tcode, start_bits, utf8, cd);        rc = set_start_bits(tcode, start_bits, utf, cd);
908        if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;        if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
909        if (rc == SSB_DONE) try_next = FALSE; else        if (rc == SSB_DONE) try_next = FALSE; else
910          {          {
# Line 862  do Line 951  do
951        case OP_BRAZERO:        case OP_BRAZERO:
952        case OP_BRAMINZERO:        case OP_BRAMINZERO:
953        case OP_BRAPOSZERO:        case OP_BRAPOSZERO:
954        rc = set_start_bits(++tcode, start_bits, utf8, cd);        rc = set_start_bits(++tcode, start_bits, utf, cd);
955        if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;        if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
956  /* =========================================================================  /* =========================================================================
957        See the comment at the head of this function concerning the next line,        See the comment at the head of this function concerning the next line,
# Line 889  do Line 978  do
978        case OP_QUERY:        case OP_QUERY:
979        case OP_MINQUERY:        case OP_MINQUERY:
980        case OP_POSQUERY:        case OP_POSQUERY:
981        tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);        tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
982        break;        break;
983    
984        case OP_STARI:        case OP_STARI:
# Line 898  do Line 987  do
987        case OP_QUERYI:        case OP_QUERYI:
988        case OP_MINQUERYI:        case OP_MINQUERYI:
989        case OP_POSQUERYI:        case OP_POSQUERYI:
990        tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);        tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
991        break;        break;
992    
993        /* Single-char upto sets the bit and tries the next */        /* Single-char upto sets the bit and tries the next */
# Line 906  do Line 995  do
995        case OP_UPTO:        case OP_UPTO:
996        case OP_MINUPTO:        case OP_MINUPTO:
997        case OP_POSUPTO:        case OP_POSUPTO:
998        tcode = set_table_bit(start_bits, tcode + 3, FALSE, cd, utf8);        tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
999        break;        break;
1000    
1001        case OP_UPTOI:        case OP_UPTOI:
1002        case OP_MINUPTOI:        case OP_MINUPTOI:
1003        case OP_POSUPTOI:        case OP_POSUPTOI:
1004        tcode = set_table_bit(start_bits, tcode + 3, TRUE, cd, utf8);        tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
1005        break;        break;
1006    
1007        /* At least one single char sets the bit and stops */        /* At least one single char sets the bit and stops */
1008    
1009        case OP_EXACT:        case OP_EXACT:
1010        tcode += 2;        tcode += IMM2_SIZE;
1011        /* Fall through */        /* Fall through */
1012        case OP_CHAR:        case OP_CHAR:
1013        case OP_PLUS:        case OP_PLUS:
1014        case OP_MINPLUS:        case OP_MINPLUS:
1015        case OP_POSPLUS:        case OP_POSPLUS:
1016        (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);        (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1017        try_next = FALSE;        try_next = FALSE;
1018        break;        break;
1019    
1020        case OP_EXACTI:        case OP_EXACTI:
1021        tcode += 2;        tcode += IMM2_SIZE;
1022        /* Fall through */        /* Fall through */
1023        case OP_CHARI:        case OP_CHARI:
1024        case OP_PLUSI:        case OP_PLUSI:
1025        case OP_MINPLUSI:        case OP_MINPLUSI:
1026        case OP_POSPLUSI:        case OP_POSPLUSI:
1027        (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);        (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1028        try_next = FALSE;        try_next = FALSE;
1029        break;        break;
1030    
# Line 946  do Line 1035  do
1035        identical. */        identical. */
1036    
1037        case OP_HSPACE:        case OP_HSPACE:
1038        SET_BIT(0x09);        SET_BIT(CHAR_HT);
1039        SET_BIT(0x20);        SET_BIT(CHAR_SPACE);
1040        if (utf8)  #ifdef SUPPORT_UTF
1041          if (utf)
1042          {          {
1043    #ifdef COMPILE_PCRE8
1044          SET_BIT(0xC2);  /* For U+00A0 */          SET_BIT(0xC2);  /* For U+00A0 */
1045          SET_BIT(0xE1);  /* For U+1680, U+180E */          SET_BIT(0xE1);  /* For U+1680, U+180E */
1046          SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */          SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1047          SET_BIT(0xE3);  /* For U+3000 */          SET_BIT(0xE3);  /* For U+3000 */
1048    #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1049            SET_BIT(0xA0);
1050            SET_BIT(0xFF);  /* For characters > 255 */
1051    #endif  /* COMPILE_PCRE[8|16|32] */
1052            }
1053          else
1054    #endif /* SUPPORT_UTF */
1055            {
1056    #ifndef EBCDIC
1057            SET_BIT(0xA0);
1058    #endif  /* Not EBCDIC */
1059    #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1060            SET_BIT(0xFF);  /* For characters > 255 */
1061    #endif  /* COMPILE_PCRE[16|32] */
1062          }          }
       else SET_BIT(0xA0);  
1063        try_next = FALSE;        try_next = FALSE;
1064        break;        break;
1065    
1066        case OP_ANYNL:        case OP_ANYNL:
1067        case OP_VSPACE:        case OP_VSPACE:
1068        SET_BIT(0x0A);        SET_BIT(CHAR_LF);
1069        SET_BIT(0x0B);        SET_BIT(CHAR_VT);
1070        SET_BIT(0x0C);        SET_BIT(CHAR_FF);
1071        SET_BIT(0x0D);        SET_BIT(CHAR_CR);
1072        if (utf8)  #ifdef SUPPORT_UTF
1073          if (utf)
1074          {          {
1075    #ifdef COMPILE_PCRE8
1076          SET_BIT(0xC2);  /* For U+0085 */          SET_BIT(0xC2);  /* For U+0085 */
1077          SET_BIT(0xE2);  /* For U+2028, U+2029 */          SET_BIT(0xE2);  /* For U+2028, U+2029 */
1078    #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1079            SET_BIT(CHAR_NEL);
1080            SET_BIT(0xFF);  /* For characters > 255 */
1081    #endif  /* COMPILE_PCRE[8|16|32] */
1082            }
1083          else
1084    #endif /* SUPPORT_UTF */
1085            {
1086            SET_BIT(CHAR_NEL);
1087    #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1088            SET_BIT(0xFF);  /* For characters > 255 */
1089    #endif
1090          }          }
       else SET_BIT(0x85);  
1091        try_next = FALSE;        try_next = FALSE;
1092        break;        break;
1093    
# Line 989  do Line 1106  do
1106        try_next = FALSE;        try_next = FALSE;
1107        break;        break;
1108    
1109        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we no longer
1110        ensure it is set as not whitespace. */        have to play fancy tricks because Perl added VT to its whitespace at
1111          release 5.18. PCRE added it at release 8.34. */
1112    
1113        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
1114        set_nottype_bits(start_bits, cbit_space, table_limit, cd);        set_nottype_bits(start_bits, cbit_space, table_limit, cd);
       start_bits[1] |= 0x08;  
1115        try_next = FALSE;        try_next = FALSE;
1116        break;        break;
1117    
       /* The cbit_space table has vertical tab as whitespace; we have to  
       not set it from the table. */  
   
1118        case OP_WHITESPACE:        case OP_WHITESPACE:
       c = start_bits[1];    /* Save in case it was already set */  
1119        set_type_bits(start_bits, cbit_space, table_limit, cd);        set_type_bits(start_bits, cbit_space, table_limit, cd);
       start_bits[1] = (start_bits[1] & ~0x08) | c;  
1120        try_next = FALSE;        try_next = FALSE;
1121        break;        break;
1122    
# Line 1028  do Line 1140  do
1140        break;        break;
1141    
1142        case OP_TYPEEXACT:        case OP_TYPEEXACT:
1143        tcode += 3;        tcode += 1 + IMM2_SIZE;
1144        break;        break;
1145    
1146        /* Zero or more repeats of character types set the bits and then        /* Zero or more repeats of character types set the bits and then
# Line 1037  do Line 1149  do
1149        case OP_TYPEUPTO:        case OP_TYPEUPTO:
1150        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
1151        case OP_TYPEPOSUPTO:        case OP_TYPEPOSUPTO:
1152        tcode += 2;               /* Fall through */        tcode += IMM2_SIZE;  /* Fall through */
1153    
1154        case OP_TYPESTAR:        case OP_TYPESTAR:
1155        case OP_TYPEMINSTAR:        case OP_TYPEMINSTAR:
# Line 1053  do Line 1165  do
1165          return SSB_FAIL;          return SSB_FAIL;
1166    
1167          case OP_HSPACE:          case OP_HSPACE:
1168          SET_BIT(0x09);          SET_BIT(CHAR_HT);
1169          SET_BIT(0x20);          SET_BIT(CHAR_SPACE);
1170          if (utf8)  #ifdef SUPPORT_UTF
1171            if (utf)
1172            {            {
1173    #ifdef COMPILE_PCRE8
1174            SET_BIT(0xC2);  /* For U+00A0 */            SET_BIT(0xC2);  /* For U+00A0 */
1175            SET_BIT(0xE1);  /* For U+1680, U+180E */            SET_BIT(0xE1);  /* For U+1680, U+180E */
1176            SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */            SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1177            SET_BIT(0xE3);  /* For U+3000 */            SET_BIT(0xE3);  /* For U+3000 */
1178    #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1179              SET_BIT(0xA0);
1180              SET_BIT(0xFF);  /* For characters > 255 */
1181    #endif  /* COMPILE_PCRE[8|16|32] */
1182            }            }
1183          else SET_BIT(0xA0);          else
1184    #endif /* SUPPORT_UTF */
1185    #ifndef EBCDIC
1186              SET_BIT(0xA0);
1187    #endif  /* Not EBCDIC */
1188          break;          break;
1189    
1190          case OP_ANYNL:          case OP_ANYNL:
1191          case OP_VSPACE:          case OP_VSPACE:
1192          SET_BIT(0x0A);          SET_BIT(CHAR_LF);
1193          SET_BIT(0x0B);          SET_BIT(CHAR_VT);
1194          SET_BIT(0x0C);          SET_BIT(CHAR_FF);
1195          SET_BIT(0x0D);          SET_BIT(CHAR_CR);
1196          if (utf8)  #ifdef SUPPORT_UTF
1197            if (utf)
1198            {            {
1199    #ifdef COMPILE_PCRE8
1200            SET_BIT(0xC2);  /* For U+0085 */            SET_BIT(0xC2);  /* For U+0085 */
1201            SET_BIT(0xE2);  /* For U+2028, U+2029 */            SET_BIT(0xE2);  /* For U+2028, U+2029 */
1202    #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1203              SET_BIT(CHAR_NEL);
1204              SET_BIT(0xFF);  /* For characters > 255 */
1205    #endif  /* COMPILE_PCRE16 */
1206            }            }
1207          else SET_BIT(0x85);          else
1208    #endif /* SUPPORT_UTF */
1209              SET_BIT(CHAR_NEL);
1210          break;          break;
1211    
1212          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
# Line 1087  do Line 1217  do
1217          set_type_bits(start_bits, cbit_digit, table_limit, cd);          set_type_bits(start_bits, cbit_digit, table_limit, cd);
1218          break;          break;
1219    
1220          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we no longer
1221          ensure it gets set as not whitespace. */          have to play fancy tricks because Perl added VT to its whitespace at
1222            release 5.18. PCRE added it at release 8.34. */
1223    
1224          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
1225          set_nottype_bits(start_bits, cbit_space, table_limit, cd);          set_nottype_bits(start_bits, cbit_space, table_limit, cd);
         start_bits[1] |= 0x08;  
1226          break;          break;
1227    
         /* The cbit_space table has vertical tab as whitespace; we have to  
         avoid setting it. */  
   
1228          case OP_WHITESPACE:          case OP_WHITESPACE:
         c = start_bits[1];    /* Save in case it was already set */  
1229          set_type_bits(start_bits, cbit_space, table_limit, cd);          set_type_bits(start_bits, cbit_space, table_limit, cd);
         start_bits[1] = (start_bits[1] & ~0x08) | c;  
1230          break;          break;
1231    
1232          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
# Line 1122  do Line 1247  do
1247        with a value >= 0xc4 is a potentially valid starter because it starts a        with a value >= 0xc4 is a potentially valid starter because it starts a
1248        character with a value > 255. */        character with a value > 255. */
1249    
1250    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1251          case OP_XCLASS:
1252          if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0)
1253            return SSB_FAIL;
1254          /* All bits are set. */
1255          if ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0 && (tcode[1 + LINK_SIZE] & XCL_NOT) != 0)
1256            return SSB_FAIL;
1257    #endif
1258          /* Fall through */
1259    
1260        case OP_NCLASS:        case OP_NCLASS:
1261  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1262        if (utf8)        if (utf)
1263          {          {
1264          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
1265          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
1266          }          }
1267  #endif  #endif
1268    #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1269          SET_BIT(0xFF);                         /* For characters > 255 */
1270    #endif
1271        /* Fall through */        /* Fall through */
1272    
1273        case OP_CLASS:        case OP_CLASS:
1274          {          {
1275          tcode++;          pcre_uint8 *map;
1276    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1277            map = NULL;
1278            if (*tcode == OP_XCLASS)
1279              {
1280              if ((tcode[1 + LINK_SIZE] & XCL_MAP) != 0)
1281                map = (pcre_uint8 *)(tcode + 1 + LINK_SIZE + 1);
1282              tcode += GET(tcode, 1);
1283              }
1284            else
1285    #endif
1286              {
1287              tcode++;
1288              map = (pcre_uint8 *)tcode;
1289              tcode += 32 / sizeof(pcre_uchar);
1290              }
1291    
1292          /* In UTF-8 mode, the bits in a bit map correspond to character          /* In UTF-8 mode, the bits in a bit map correspond to character
1293          values, not to byte values. However, the bit map we are constructing is          values, not to byte values. However, the bit map we are constructing is
# Line 1142  do Line 1295  do
1295          value is > 127. In fact, there are only two possible starting bytes for          value is > 127. In fact, there are only two possible starting bytes for
1296          characters in the range 128 - 255. */          characters in the range 128 - 255. */
1297    
1298  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1299          if (utf8)          if (map != NULL)
1300    #endif
1301            {            {
1302            for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];  #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1303            for (c = 128; c < 256; c++)            if (utf)
1304              {              {
1305              if ((tcode[c/8] && (1 << (c&7))) != 0)              for (c = 0; c < 16; c++) start_bits[c] |= map[c];
1306                for (c = 128; c < 256; c++)
1307                {                {
1308                int d = (c >> 6) | 0xc0;            /* Set bit for this starter */                if ((map[c/8] && (1 << (c&7))) != 0)
1309                start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */                  {
1310                c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */                  int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
1311                    start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
1312                    c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
1313                    }
1314                }                }
1315              }              }
1316            }            else
   
         /* In non-UTF-8 mode, the two bit maps are completely compatible. */  
   
         else  
1317  #endif  #endif
1318            {              {
1319            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];              /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1320                for (c = 0; c < 32; c++) start_bits[c] |= map[c];
1321                }
1322            }            }
1323    
1324          /* Advance past the bit map, and act on what follows. For a zero          /* Advance past the bit map, and act on what follows. For a zero
1325          minimum repeat, continue; otherwise stop processing. */          minimum repeat, continue; otherwise stop processing. */
1326    
         tcode += 32;  
1327          switch (*tcode)          switch (*tcode)
1328            {            {
1329            case OP_CRSTAR:            case OP_CRSTAR:
1330            case OP_CRMINSTAR:            case OP_CRMINSTAR:
1331            case OP_CRQUERY:            case OP_CRQUERY:
1332            case OP_CRMINQUERY:            case OP_CRMINQUERY:
1333              case OP_CRPOSSTAR:
1334              case OP_CRPOSQUERY:
1335            tcode++;            tcode++;
1336            break;            break;
1337    
1338            case OP_CRRANGE:            case OP_CRRANGE:
1339            case OP_CRMINRANGE:            case OP_CRMINRANGE:
1340            if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;            case OP_CRPOSRANGE:
1341              if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1342              else try_next = FALSE;              else try_next = FALSE;
1343            break;            break;
1344    
# Line 1209  return yield; Line 1367  return yield;
1367  *************************************************/  *************************************************/
1368    
1369  /* This function is handed a compiled expression that it must study to produce  /* This function is handed a compiled expression that it must study to produce
1370  information that will speed up the matching. It returns a pcre_extra block  information that will speed up the matching. It returns a pcre[16]_extra block
1371  which then gets handed back to pcre_exec().  which then gets handed back to pcre_exec().
1372    
1373  Arguments:  Arguments:
# Line 1218  Arguments: Line 1376  Arguments:
1376    errorptr  points to where to place error messages;    errorptr  points to where to place error messages;
1377              set NULL unless error              set NULL unless error
1378    
1379  Returns:    pointer to a pcre_extra block, with study_data filled in and the  Returns:    pointer to a pcre[16]_extra block, with study_data filled in and
1380                appropriate flags set;                the appropriate flags set;
1381              NULL on error or if no optimization possible              NULL on error or if no optimization possible
1382  */  */
1383    
1384    #if defined COMPILE_PCRE8
1385  PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION  PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1386  pcre_study(const pcre *external_re, int options, const char **errorptr)  pcre_study(const pcre *external_re, int options, const char **errorptr)
1387    #elif defined COMPILE_PCRE16
1388    PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
1389    pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
1390    #elif defined COMPILE_PCRE32
1391    PCRE_EXP_DEFN pcre32_extra * PCRE_CALL_CONVENTION
1392    pcre32_study(const pcre32 *external_re, int options, const char **errorptr)
1393    #endif
1394  {  {
1395  int min;  int min;
1396  BOOL bits_set = FALSE;  BOOL bits_set = FALSE;
1397  BOOL had_accept = FALSE;  pcre_uint8 start_bits[32];
1398  uschar start_bits[32];  PUBL(extra) *extra = NULL;
 pcre_extra *extra;  
1399  pcre_study_data *study;  pcre_study_data *study;
1400  const uschar *tables;  const pcre_uint8 *tables;
1401  uschar *code;  pcre_uchar *code;
1402  compile_data compile_block;  compile_data compile_block;
1403  const real_pcre *re = (const real_pcre *)external_re;  const REAL_PCRE *re = (const REAL_PCRE *)external_re;
1404    
1405    
1406  *errorptr = NULL;  *errorptr = NULL;
1407    
# Line 1245  if (re == NULL || re->magic_number != MA Line 1411  if (re == NULL || re->magic_number != MA
1411    return NULL;    return NULL;
1412    }    }
1413    
1414    if ((re->flags & PCRE_MODE) == 0)
1415      {
1416    #if defined COMPILE_PCRE8
1417      *errorptr = "argument not compiled in 8 bit mode";
1418    #elif defined COMPILE_PCRE16
1419      *errorptr = "argument not compiled in 16 bit mode";
1420    #elif defined COMPILE_PCRE32
1421      *errorptr = "argument not compiled in 32 bit mode";
1422    #endif
1423      return NULL;
1424      }
1425    
1426  if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)  if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1427    {    {
1428    *errorptr = "unknown or incorrect option bit(s) set";    *errorptr = "unknown or incorrect option bit(s) set";
1429    return NULL;    return NULL;
1430    }    }
1431    
1432  code = (uschar *)re + re->name_table_offset +  code = (pcre_uchar *)re + re->name_table_offset +
1433    (re->name_count * re->name_entry_size);    (re->name_count * re->name_entry_size);
1434    
1435  /* For an anchored pattern, or an unanchored pattern that has a first char, or  /* For an anchored pattern, or an unanchored pattern that has a first char, or
# Line 1266  if ((re->options & PCRE_ANCHORED) == 0 & Line 1444  if ((re->options & PCRE_ANCHORED) == 0 &
1444    /* Set the character tables in the block that is passed around */    /* Set the character tables in the block that is passed around */
1445    
1446    tables = re->tables;    tables = re->tables;
1447    
1448    #if defined COMPILE_PCRE8
1449    if (tables == NULL)    if (tables == NULL)
1450      (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,      (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1451      (void *)(&tables));      (void *)(&tables));
1452    #elif defined COMPILE_PCRE16
1453      if (tables == NULL)
1454        (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1455        (void *)(&tables));
1456    #elif defined COMPILE_PCRE32
1457      if (tables == NULL)
1458        (void)pcre32_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1459        (void *)(&tables));
1460    #endif
1461    
1462    compile_block.lcc = tables + lcc_offset;    compile_block.lcc = tables + lcc_offset;
1463    compile_block.fcc = tables + fcc_offset;    compile_block.fcc = tables + fcc_offset;
# Line 1277  if ((re->options & PCRE_ANCHORED) == 0 & Line 1466  if ((re->options & PCRE_ANCHORED) == 0 &
1466    
1467    /* See if we can find a fixed set of initial characters for the pattern. */    /* See if we can find a fixed set of initial characters for the pattern. */
1468    
1469    memset(start_bits, 0, 32 * sizeof(uschar));    memset(start_bits, 0, 32 * sizeof(pcre_uint8));
1470    rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,    rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1471      &compile_block);      &compile_block);
1472    bits_set = rc == SSB_DONE;    bits_set = rc == SSB_DONE;
1473    if (rc == SSB_UNKNOWN) *errorptr = "internal error: opcode not recognized";    if (rc == SSB_UNKNOWN)
1474        {
1475        *errorptr = "internal error: opcode not recognized";
1476        return NULL;
1477        }
1478    }    }
1479    
1480  /* Find the minimum length of subject string. */  /* Find the minimum length of subject string. */
1481    
1482  switch(min = find_minlength(code, code, re->options, &had_accept, 0))  switch(min = find_minlength(re, code, code, re->options, 0))
1483    {    {
1484    case -2: *errorptr = "internal error: missing capturing bracket"; break;    case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
1485    case -3: *errorptr = "internal error: opcode not recognized"; break;    case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
1486    default: break;    default: break;
1487    }    }
1488    
1489  /* Return NULL if there's been an error or if no optimization is possible. */  /* If a set of starting bytes has been identified, or if the minimum length is
1490    greater than zero, or if JIT optimization has been requested, or if
1491    PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a
1492    pcre_study_data block. The study data is put in the latter, which is pointed to
1493    by the former, which may also get additional data set later by the calling
1494    program. At the moment, the size of pcre_study_data is fixed. We nevertheless
1495    save it in a field for returning via the pcre_fullinfo() function so that if it
1496    becomes variable in the future, we don't have to change that code. */
1497    
1498    if (bits_set || min > 0 || (options & (
1499    #ifdef SUPPORT_JIT
1500        PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE |
1501        PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE |
1502    #endif
1503        PCRE_STUDY_EXTRA_NEEDED)) != 0)
1504      {
1505      extra = (PUBL(extra) *)(PUBL(malloc))
1506        (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
1507      if (extra == NULL)
1508        {
1509        *errorptr = "failed to get memory";
1510        return NULL;
1511        }
1512    
1513  if (*errorptr != NULL || (!bits_set && min < 0)) return NULL;    study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
1514      extra->flags = PCRE_EXTRA_STUDY_DATA;
1515      extra->study_data = study;
1516    
1517      study->size = sizeof(pcre_study_data);
1518      study->flags = 0;
1519    
1520      /* Set the start bits always, to avoid unset memory errors if the
1521      study data is written to a file, but set the flag only if any of the bits
1522      are set, to save time looking when none are. */
1523    
1524  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in    if (bits_set)
1525  the latter, which is pointed to by the former, which may also get additional      {
1526  data set later by the calling program. At the moment, the size of      study->flags |= PCRE_STUDY_MAPPED;
1527  pcre_study_data is fixed. We nevertheless save it in a field for returning via      memcpy(study->start_bits, start_bits, sizeof(start_bits));
1528  the pcre_fullinfo() function so that if it becomes variable in the future, we      }
1529  don't have to change that code. */    else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
1530    
1531  extra = (pcre_extra *)(pcre_malloc)  #ifdef PCRE_DEBUG
1532    (sizeof(pcre_extra) + sizeof(pcre_study_data));    if (bits_set)
1533        {
1534        pcre_uint8 *ptr = start_bits;
1535        int i;
1536    
1537  if (extra == NULL)      printf("Start bits:\n");
1538    {      for (i = 0; i < 32; i++)
1539    *errorptr = "failed to get memory";        printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
1540    return NULL;      }
1541    }  #endif
1542    
1543  study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));    /* Always set the minlength value in the block, because the JIT compiler
1544  extra->flags = PCRE_EXTRA_STUDY_DATA;    makes use of it. However, don't set the bit unless the length is greater than
1545  extra->study_data = study;    zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
1546      checking the zero case. */
1547    
1548  study->size = sizeof(pcre_study_data);    if (min > 0)
1549  study->flags = 0;      {
1550        study->flags |= PCRE_STUDY_MINLEN;
1551        study->minlength = min;
1552        }
1553      else study->minlength = 0;
1554    
1555  if (bits_set)    /* If JIT support was compiled and requested, attempt the JIT compilation.
1556    {    If no starting bytes were found, and the minimum length is zero, and JIT
1557    study->flags |= PCRE_STUDY_MAPPED;    compilation fails, abandon the extra block and return NULL, unless
1558    memcpy(study->start_bits, start_bits, sizeof(start_bits));    PCRE_STUDY_EXTRA_NEEDED is set. */
1559    }  
1560    #ifdef SUPPORT_JIT
1561      extra->executable_jit = NULL;
1562      if ((options & PCRE_STUDY_JIT_COMPILE) != 0)
1563        PRIV(jit_compile)(re, extra, JIT_COMPILE);
1564      if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0)
1565        PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE);
1566      if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0)
1567        PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE);
1568    
1569  if (min >= 0)    if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0 &&
1570    {        (options & PCRE_STUDY_EXTRA_NEEDED) == 0)
1571    study->flags |= PCRE_STUDY_MINLEN;      {
1572    study->minlength = min;  #if defined COMPILE_PCRE8
1573        pcre_free_study(extra);
1574    #elif defined COMPILE_PCRE16
1575        pcre16_free_study(extra);
1576    #elif defined COMPILE_PCRE32
1577        pcre32_free_study(extra);
1578    #endif
1579        extra = NULL;
1580        }
1581    #endif
1582    }    }
1583    
1584  return extra;  return extra;
1585  }  }
1586    
1587    
1588    /*************************************************
1589    *          Free the study data                   *
1590    *************************************************/
1591    
1592    /* This function frees the memory that was obtained by pcre_study().
1593    
1594    Argument:   a pointer to the pcre[16]_extra block
1595    Returns:    nothing
1596    */
1597    
1598    #if defined COMPILE_PCRE8
1599    PCRE_EXP_DEFN void
1600    pcre_free_study(pcre_extra *extra)
1601    #elif defined COMPILE_PCRE16
1602    PCRE_EXP_DEFN void
1603    pcre16_free_study(pcre16_extra *extra)
1604    #elif defined COMPILE_PCRE32
1605    PCRE_EXP_DEFN void
1606    pcre32_free_study(pcre32_extra *extra)
1607    #endif
1608    {
1609    if (extra == NULL)
1610      return;
1611    #ifdef SUPPORT_JIT
1612    if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
1613         extra->executable_jit != NULL)
1614      PRIV(jit_free)(extra->executable_jit);
1615    #endif
1616    PUBL(free)(extra);
1617    }
1618    
1619  /* End of pcre_study.c */  /* End of pcre_study.c */

Legend:
Removed from v.657  
changed lines
  Added in v.1486

  ViewVC Help
Powered by ViewVC 1.1.5