/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 657 by ph10, Mon Aug 15 17:39:09 2011 UTC revision 1414 by zherczeg, Sun Dec 22 16:27:35 2013 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2010 University of Cambridge             Copyright (c) 1997-2012 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 66  string of that length that matches. In U Line 66  string of that length that matches. In U
66  rather than bytes.  rather than bytes.
67    
68  Arguments:  Arguments:
69    code        pointer to start of group (the bracket)    re              compiled pattern block
70    startcode   pointer to start of the whole pattern    code            pointer to start of group (the bracket)
71    options     the compiling options    startcode       pointer to start of the whole pattern's code
72    had_accept  pointer to flag for (*ACCEPT) encountered    options         the compiling options
73    int         RECURSE depth    int             RECURSE depth
74    
75  Returns:   the minimum length  Returns:   the minimum length
76             -1 if \C was encountered             -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
77             -2 internal error (missing capturing bracket)             -2 internal error (missing capturing bracket)
78             -3 internal error (opcode not listed)             -3 internal error (opcode not listed)
79  */  */
80    
81  static int  static int
82  find_minlength(const uschar *code, const uschar *startcode, int options,  find_minlength(const REAL_PCRE *re, const pcre_uchar *code,
83    BOOL *had_accept_ptr, int recurse_depth)    const pcre_uchar *startcode, int options, int recurse_depth)
84  {  {
85  int length = -1;  int length = -1;
86  BOOL utf8 = (options & PCRE_UTF8) != 0;  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
87    BOOL utf = (options & PCRE_UTF8) != 0;
88  BOOL had_recurse = FALSE;  BOOL had_recurse = FALSE;
89  register int branchlength = 0;  register int branchlength = 0;
90  register uschar *cc = (uschar *)code + 1 + LINK_SIZE;  register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
91    
92  if (*code == OP_CBRA || *code == OP_SCBRA ||  if (*code == OP_CBRA || *code == OP_SCBRA ||
93      *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += 2;      *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
94    
95  /* Scan along the opcodes for this branch. If we get to the end of the  /* Scan along the opcodes for this branch. If we get to the end of the
96  branch, check the length against that of the other branches. */  branch, check the length against that of the other branches. */
# Line 97  branch, check the length against that of Line 98  branch, check the length against that of
98  for (;;)  for (;;)
99    {    {
100    int d, min;    int d, min;
101    uschar *cs, *ce;    pcre_uchar *cs, *ce;
102    register int op = *cc;    register pcre_uchar op = *cc;
103    
104    switch (op)    switch (op)
105      {      {
# Line 128  for (;;) Line 129  for (;;)
129      case OP_BRAPOS:      case OP_BRAPOS:
130      case OP_SBRAPOS:      case OP_SBRAPOS:
131      case OP_ONCE:      case OP_ONCE:
132      d = find_minlength(cc, startcode, options, had_accept_ptr, recurse_depth);      case OP_ONCE_NC:
133        d = find_minlength(re, cc, startcode, options, recurse_depth);
134      if (d < 0) return d;      if (d < 0) return d;
135      branchlength += d;      branchlength += d;
     if (*had_accept_ptr) return branchlength;  
136      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
137      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
138      break;      break;
139    
140      /* Reached end of a branch; if it's a ket it is the end of a nested      /* ACCEPT makes things far too complicated; we have to give up. */
     call. If it's ALT it is an alternation in a nested call. If it is END it's  
     the end of the outer call. All can be handled by the same code. If it is  
     ACCEPT, it is essentially the same as END, but we set a flag so that  
     counting stops. */  
141    
142      case OP_ACCEPT:      case OP_ACCEPT:
143      case OP_ASSERT_ACCEPT:      case OP_ASSERT_ACCEPT:
144      *had_accept_ptr = TRUE;      return -1;
145      /* Fall through */  
146        /* Reached end of a branch; if it's a ket it is the end of a nested
147        call. If it's ALT it is an alternation in a nested call. If it is END it's
148        the end of the outer call. All can be handled by the same code. If an
149        ACCEPT was previously encountered, use the length that was in force at that
150        time, and pass back the shortest ACCEPT length. */
151    
152      case OP_ALT:      case OP_ALT:
153      case OP_KET:      case OP_KET:
154      case OP_KETRMAX:      case OP_KETRMAX:
# Line 173  for (;;) Line 176  for (;;)
176    
177      case OP_REVERSE:      case OP_REVERSE:
178      case OP_CREF:      case OP_CREF:
179      case OP_NCREF:      case OP_DNCREF:
180      case OP_RREF:      case OP_RREF:
181      case OP_NRREF:      case OP_DNRREF:
182      case OP_DEF:      case OP_DEF:
183      case OP_CALLOUT:      case OP_CALLOUT:
184      case OP_SOD:      case OP_SOD:
# Line 188  for (;;) Line 191  for (;;)
191      case OP_DOLLM:      case OP_DOLLM:
192      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
193      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
194      cc += _pcre_OP_lengths[*cc];      cc += PRIV(OP_lengths)[*cc];
195      break;      break;
196    
197      /* Skip over a subpattern that has a {0} or {0,x} quantifier */      /* Skip over a subpattern that has a {0} or {0,x} quantifier */
# Line 197  for (;;) Line 200  for (;;)
200      case OP_BRAMINZERO:      case OP_BRAMINZERO:
201      case OP_BRAPOSZERO:      case OP_BRAPOSZERO:
202      case OP_SKIPZERO:      case OP_SKIPZERO:
203      cc += _pcre_OP_lengths[*cc];      cc += PRIV(OP_lengths)[*cc];
204      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
205      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
206      break;      break;
# Line 222  for (;;) Line 225  for (;;)
225      case OP_NOTPOSPLUSI:      case OP_NOTPOSPLUSI:
226      branchlength++;      branchlength++;
227      cc += 2;      cc += 2;
228  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
229      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
230  #endif  #endif
231      break;      break;
232    
# Line 242  for (;;) Line 245  for (;;)
245      case OP_NOTEXACT:      case OP_NOTEXACT:
246      case OP_NOTEXACTI:      case OP_NOTEXACTI:
247      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
248      cc += 4;      cc += 2 + IMM2_SIZE;
249  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
250      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
251  #endif  #endif
252      break;      break;
253    
254      case OP_TYPEEXACT:      case OP_TYPEEXACT:
255      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
256      cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;      cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
257          || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
258      break;      break;
259    
260      /* Handle single-char non-literal matchers */      /* Handle single-char non-literal matchers */
# Line 285  for (;;) Line 289  for (;;)
289      cc++;      cc++;
290      break;      break;
291    
292      /* The single-byte matcher means we can't proceed in UTF-8 mode */      /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
293        non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
294        appear, but leave the code, just in case.) */
295    
296      case OP_ANYBYTE:      case OP_ANYBYTE:
297  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
298      if (utf8) return -1;      if (utf) return -1;
299  #endif  #endif
300      branchlength++;      branchlength++;
301      cc++;      cc++;
# Line 305  for (;;) Line 311  for (;;)
311      case OP_TYPEPOSSTAR:      case OP_TYPEPOSSTAR:
312      case OP_TYPEPOSQUERY:      case OP_TYPEPOSQUERY:
313      if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;      if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
314      cc += _pcre_OP_lengths[op];      cc += PRIV(OP_lengths)[op];
315      break;      break;
316    
317      case OP_TYPEUPTO:      case OP_TYPEUPTO:
318      case OP_TYPEMINUPTO:      case OP_TYPEMINUPTO:
319      case OP_TYPEPOSUPTO:      case OP_TYPEPOSUPTO:
320      if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;      if (cc[1 + IMM2_SIZE] == OP_PROP
321      cc += _pcre_OP_lengths[op];        || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
322        cc += PRIV(OP_lengths)[op];
323      break;      break;
324    
325      /* Check a class for variable quantification */      /* Check a class for variable quantification */
326    
 #ifdef SUPPORT_UTF8  
     case OP_XCLASS:  
     cc += GET(cc, 1) - 33;  
     /* Fall through */  
 #endif  
   
327      case OP_CLASS:      case OP_CLASS:
328      case OP_NCLASS:      case OP_NCLASS:
329      cc += 33;  #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
330        case OP_XCLASS:
331        /* The original code caused an unsigned overflow in 64 bit systems,
332        so now we use a conditional statement. */
333        if (op == OP_XCLASS)
334          cc += GET(cc, 1);
335        else
336          cc += PRIV(OP_lengths)[OP_CLASS];
337    #else
338        cc += PRIV(OP_lengths)[OP_CLASS];
339    #endif
340    
341      switch (*cc)      switch (*cc)
342        {        {
343        case OP_CRPLUS:        case OP_CRPLUS:
344        case OP_CRMINPLUS:        case OP_CRMINPLUS:
345          case OP_CRPOSPLUS:
346        branchlength++;        branchlength++;
347        /* Fall through */        /* Fall through */
348    
# Line 338  for (;;) Line 350  for (;;)
350        case OP_CRMINSTAR:        case OP_CRMINSTAR:
351        case OP_CRQUERY:        case OP_CRQUERY:
352        case OP_CRMINQUERY:        case OP_CRMINQUERY:
353          case OP_CRPOSSTAR:
354          case OP_CRPOSQUERY:
355        cc++;        cc++;
356        break;        break;
357    
358        case OP_CRRANGE:        case OP_CRRANGE:
359        case OP_CRMINRANGE:        case OP_CRMINRANGE:
360          case OP_CRPOSRANGE:
361        branchlength += GET2(cc,1);        branchlength += GET2(cc,1);
362        cc += 5;        cc += 1 + 2 * IMM2_SIZE;
363        break;        break;
364    
365        default:        default:
# Line 365  for (;;) Line 380  for (;;)
380      matches an empty string (by default it causes a matching failure), so in      matches an empty string (by default it causes a matching failure), so in
381      that case we must set the minimum length to zero. */      that case we must set the minimum length to zero. */
382    
383      case OP_REF:      case OP_DNREF:     /* Duplicate named pattern back reference */
384        case OP_DNREFI:
385        if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
386          {
387          int count = GET2(cc, 1+IMM2_SIZE);
388          pcre_uchar *slot = (pcre_uchar *)re +
389            re->name_table_offset + GET2(cc, 1) * re->name_entry_size;
390          d = INT_MAX;
391          while (count-- > 0)
392            {
393            ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
394            if (cs == NULL) return -2;
395            do ce += GET(ce, 1); while (*ce == OP_ALT);
396            if (cc > cs && cc < ce)
397              {
398              d = 0;
399              had_recurse = TRUE;
400              break;
401              }
402            else
403              {
404              int dd = find_minlength(re, cs, startcode, options, recurse_depth);
405              if (dd < d) d = dd;
406              }
407            slot += re->name_entry_size;
408            }
409          }
410        else d = 0;
411        cc += 1 + 2*IMM2_SIZE;
412        goto REPEAT_BACK_REFERENCE;
413    
414        case OP_REF:      /* Single back reference */
415      case OP_REFI:      case OP_REFI:
416      if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)      if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
417        {        {
418        ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));        ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
419        if (cs == NULL) return -2;        if (cs == NULL) return -2;
420        do ce += GET(ce, 1); while (*ce == OP_ALT);        do ce += GET(ce, 1); while (*ce == OP_ALT);
421        if (cc > cs && cc < ce)        if (cc > cs && cc < ce)
# Line 379  for (;;) Line 425  for (;;)
425          }          }
426        else        else
427          {          {
428          d = find_minlength(cs, startcode, options, had_accept_ptr,          d = find_minlength(re, cs, startcode, options, recurse_depth);
           recurse_depth);  
         *had_accept_ptr = FALSE;  
429          }          }
430        }        }
431      else d = 0;      else d = 0;
432      cc += 3;      cc += 1 + IMM2_SIZE;
433    
434      /* Handle repeated back references */      /* Handle repeated back references */
435    
436        REPEAT_BACK_REFERENCE:
437      switch (*cc)      switch (*cc)
438        {        {
439        case OP_CRSTAR:        case OP_CRSTAR:
440        case OP_CRMINSTAR:        case OP_CRMINSTAR:
441        case OP_CRQUERY:        case OP_CRQUERY:
442        case OP_CRMINQUERY:        case OP_CRMINQUERY:
443          case OP_CRPOSSTAR:
444          case OP_CRPOSQUERY:
445        min = 0;        min = 0;
446        cc++;        cc++;
447        break;        break;
448    
449        case OP_CRPLUS:        case OP_CRPLUS:
450        case OP_CRMINPLUS:        case OP_CRMINPLUS:
451          case OP_CRPOSPLUS:
452        min = 1;        min = 1;
453        cc++;        cc++;
454        break;        break;
455    
456        case OP_CRRANGE:        case OP_CRRANGE:
457        case OP_CRMINRANGE:        case OP_CRMINRANGE:
458          case OP_CRPOSRANGE:
459        min = GET2(cc, 1);        min = GET2(cc, 1);
460        cc += 5;        cc += 1 + 2 * IMM2_SIZE;
461        break;        break;
462    
463        default:        default:
# Line 423  for (;;) Line 472  for (;;)
472      caught by a recursion depth count. */      caught by a recursion depth count. */
473    
474      case OP_RECURSE:      case OP_RECURSE:
475      cs = ce = (uschar *)startcode + GET(cc, 1);      cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
     if (cs == NULL) return -2;  
476      do ce += GET(ce, 1); while (*ce == OP_ALT);      do ce += GET(ce, 1); while (*ce == OP_ALT);
477      if ((cc > cs && cc < ce) || recurse_depth > 10)      if ((cc > cs && cc < ce) || recurse_depth > 10)
478        had_recurse = TRUE;        had_recurse = TRUE;
479      else      else
480        {        {
481        branchlength += find_minlength(cs, startcode, options, had_accept_ptr,        branchlength += find_minlength(re, cs, startcode, options,
482          recurse_depth + 1);          recurse_depth + 1);
       *had_accept_ptr = FALSE;  
483        }        }
484      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
485      break;      break;
# Line 484  for (;;) Line 531  for (;;)
531      case OP_NOTPOSQUERY:      case OP_NOTPOSQUERY:
532      case OP_NOTPOSQUERYI:      case OP_NOTPOSQUERYI:
533    
534      cc += _pcre_OP_lengths[op];      cc += PRIV(OP_lengths)[op];
535  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
536      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
537  #endif  #endif
538      break;      break;
539    
# Line 495  for (;;) Line 542  for (;;)
542      case OP_MARK:      case OP_MARK:
543      case OP_PRUNE_ARG:      case OP_PRUNE_ARG:
544      case OP_SKIP_ARG:      case OP_SKIP_ARG:
     cc += _pcre_OP_lengths[op] + cc[1];  
     break;  
   
545      case OP_THEN_ARG:      case OP_THEN_ARG:
546      cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];      cc += PRIV(OP_lengths)[op] + cc[1];
547      break;      break;
548    
549      /* The remaining opcodes are just skipped over. */      /* The remaining opcodes are just skipped over. */
# Line 511  for (;;) Line 555  for (;;)
555      case OP_SET_SOM:      case OP_SET_SOM:
556      case OP_SKIP:      case OP_SKIP:
557      case OP_THEN:      case OP_THEN:
558      cc += _pcre_OP_lengths[op];      cc += PRIV(OP_lengths)[op];
559      break;      break;
560    
561      /* This should not occur: we list all opcodes explicitly so that when      /* This should not occur: we list all opcodes explicitly so that when
# Line 540  Arguments: Line 584  Arguments:
584    p             points to the character    p             points to the character
585    caseless      the caseless flag    caseless      the caseless flag
586    cd            the block with char table pointers    cd            the block with char table pointers
587    utf8          TRUE for UTF-8 mode    utf           TRUE for UTF-8 / UTF-16 / UTF-32 mode
588    
589  Returns:        pointer after the character  Returns:        pointer after the character
590  */  */
591    
592  static const uschar *  static const pcre_uchar *
593  set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,  set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
594    compile_data *cd, BOOL utf8)    compile_data *cd, BOOL utf)
595  {  {
596  unsigned int c = *p;  pcre_uint32 c = *p;
597    
598    #ifdef COMPILE_PCRE8
599  SET_BIT(c);  SET_BIT(c);
600    
601  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
602  if (utf8 && c > 127)  if (utf && c > 127)
603    {    {
604    GETCHARINC(c, p);    GETCHARINC(c, p);
605  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
606    if (caseless)    if (caseless)
607      {      {
608      uschar buff[8];      pcre_uchar buff[6];
609      c = UCD_OTHERCASE(c);      c = UCD_OTHERCASE(c);
610      (void)_pcre_ord2utf8(c, buff);      (void)PRIV(ord2utf)(c, buff);
611      SET_BIT(buff[0]);      SET_BIT(buff[0]);
612      }      }
613  #endif  #endif  /* Not SUPPORT_UCP */
614    return p;    return p;
615    }    }
616  #endif  #else   /* Not SUPPORT_UTF */
617    (void)(utf);   /* Stops warning for unused parameter */
618    #endif  /* SUPPORT_UTF */
619    
620  /* Not UTF-8 mode, or character is less than 127. */  /* Not UTF-8 mode, or character is less than 127. */
621    
622  if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);  if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
623  return p + 1;  return p + 1;
624    #endif  /* COMPILE_PCRE8 */
625    
626    #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
627    if (c > 0xff)
628      {
629      c = 0xff;
630      caseless = FALSE;
631      }
632    SET_BIT(c);
633    
634    #ifdef SUPPORT_UTF
635    if (utf && c > 127)
636      {
637      GETCHARINC(c, p);
638    #ifdef SUPPORT_UCP
639      if (caseless)
640        {
641        c = UCD_OTHERCASE(c);
642        if (c > 0xff)
643          c = 0xff;
644        SET_BIT(c);
645        }
646    #endif  /* SUPPORT_UCP */
647      return p;
648      }
649    #else   /* Not SUPPORT_UTF */
650    (void)(utf);   /* Stops warning for unused parameter */
651    #endif  /* SUPPORT_UTF */
652    
653    if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
654    return p + 1;
655    #endif
656  }  }
657    
658    
# Line 599  Returns:         nothing Line 678  Returns:         nothing
678  */  */
679    
680  static void  static void
681  set_type_bits(uschar *start_bits, int cbit_type, int table_limit,  set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
682    compile_data *cd)    compile_data *cd)
683  {  {
684  register int c;  register pcre_uint32 c;
685  for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];  for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
686    #if defined SUPPORT_UTF && defined COMPILE_PCRE8
687  if (table_limit == 32) return;  if (table_limit == 32) return;
688  for (c = 128; c < 256; c++)  for (c = 128; c < 256; c++)
689    {    {
690    if ((cd->cbits[c/8] & (1 << (c&7))) != 0)    if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
691      {      {
692      uschar buff[8];      pcre_uchar buff[6];
693      (void)_pcre_ord2utf8(c, buff);      (void)PRIV(ord2utf)(c, buff);
694      SET_BIT(buff[0]);      SET_BIT(buff[0]);
695      }      }
696    }    }
697    #endif
698  }  }
699    
700    
# Line 639  Returns:         nothing Line 720  Returns:         nothing
720  */  */
721    
722  static void  static void
723  set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,  set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
724    compile_data *cd)    compile_data *cd)
725  {  {
726  register int c;  register pcre_uint32 c;
727  for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];  for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
728    #if defined SUPPORT_UTF && defined COMPILE_PCRE8
729  if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;  if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
730    #endif
731  }  }
732    
733    
# Line 664  function fails unless the result is SSB_ Line 747  function fails unless the result is SSB_
747  Arguments:  Arguments:
748    code         points to an expression    code         points to an expression
749    start_bits   points to a 32-byte table, initialized to 0    start_bits   points to a 32-byte table, initialized to 0
750    utf8         TRUE if in UTF-8 mode    utf          TRUE if in UTF-8 / UTF-16 / UTF-32 mode
751    cd           the block with char table pointers    cd           the block with char table pointers
752    
753  Returns:       SSB_FAIL     => Failed to find any starting bytes  Returns:       SSB_FAIL     => Failed to find any starting bytes
# Line 674  Returns:       SSB_FAIL     => Failed to Line 757  Returns:       SSB_FAIL     => Failed to
757  */  */
758    
759  static int  static int
760  set_start_bits(const uschar *code, uschar *start_bits, BOOL utf8,  set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
761    compile_data *cd)    compile_data *cd)
762  {  {
763  register int c;  register pcre_uint32 c;
764  int yield = SSB_DONE;  int yield = SSB_DONE;
765  int table_limit = utf8? 16:32;  #if defined SUPPORT_UTF && defined COMPILE_PCRE8
766    int table_limit = utf? 16:32;
767    #else
768    int table_limit = 32;
769    #endif
770    
771  #if 0  #if 0
772  /* ========================================================================= */  /* ========================================================================= */
# Line 701  volatile int dummy; Line 788  volatile int dummy;
788  do  do
789    {    {
790    BOOL try_next = TRUE;    BOOL try_next = TRUE;
791    const uschar *tcode = code + 1 + LINK_SIZE;    const pcre_uchar *tcode = code + 1 + LINK_SIZE;
792    
793    if (*code == OP_CBRA || *code == OP_SCBRA ||    if (*code == OP_CBRA || *code == OP_SCBRA ||
794        *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += 2;        *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
795    
796    while (try_next)    /* Loop for items in this branch */    while (try_next)    /* Loop for items in this branch */
797      {      {
# Line 733  do Line 820  do
820        case OP_COND:        case OP_COND:
821        case OP_CREF:        case OP_CREF:
822        case OP_DEF:        case OP_DEF:
823          case OP_DNCREF:
824          case OP_DNREF:
825          case OP_DNREFI:
826          case OP_DNRREF:
827        case OP_DOLL:        case OP_DOLL:
828        case OP_DOLLM:        case OP_DOLLM:
829        case OP_END:        case OP_END:
# Line 741  do Line 832  do
832        case OP_EXTUNI:        case OP_EXTUNI:
833        case OP_FAIL:        case OP_FAIL:
834        case OP_MARK:        case OP_MARK:
       case OP_NCREF:  
835        case OP_NOT:        case OP_NOT:
836        case OP_NOTEXACT:        case OP_NOTEXACT:
837        case OP_NOTEXACTI:        case OP_NOTEXACTI:
# Line 773  do Line 863  do
863        case OP_NOTUPTOI:        case OP_NOTUPTOI:
864        case OP_NOT_HSPACE:        case OP_NOT_HSPACE:
865        case OP_NOT_VSPACE:        case OP_NOT_VSPACE:
       case OP_NRREF:  
866        case OP_PROP:        case OP_PROP:
867        case OP_PRUNE:        case OP_PRUNE:
868        case OP_PRUNE_ARG:        case OP_PRUNE_ARG:
# Line 790  do Line 879  do
879        case OP_SOM:        case OP_SOM:
880        case OP_THEN:        case OP_THEN:
881        case OP_THEN_ARG:        case OP_THEN_ARG:
       case OP_XCLASS:  
882        return SSB_FAIL;        return SSB_FAIL;
883    
884        /* We can ignore word boundary tests. */        /* We can ignore word boundary tests. */
885    
886        case OP_WORD_BOUNDARY:        case OP_WORD_BOUNDARY:
887        case OP_NOT_WORD_BOUNDARY:        case OP_NOT_WORD_BOUNDARY:
888        tcode++;        tcode++;
889        break;        break;
890    
891        /* If we hit a bracket or a positive lookahead assertion, recurse to set        /* If we hit a bracket or a positive lookahead assertion, recurse to set
892        bits from within the subpattern. If it can't find anything, we have to        bits from within the subpattern. If it can't find anything, we have to
# Line 814  do Line 902  do
902        case OP_CBRAPOS:        case OP_CBRAPOS:
903        case OP_SCBRAPOS:        case OP_SCBRAPOS:
904        case OP_ONCE:        case OP_ONCE:
905          case OP_ONCE_NC:
906        case OP_ASSERT:        case OP_ASSERT:
907        rc = set_start_bits(tcode, start_bits, utf8, cd);        rc = set_start_bits(tcode, start_bits, utf, cd);
908        if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;        if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
909        if (rc == SSB_DONE) try_next = FALSE; else        if (rc == SSB_DONE) try_next = FALSE; else
910          {          {
# Line 862  do Line 951  do
951        case OP_BRAZERO:        case OP_BRAZERO:
952        case OP_BRAMINZERO:        case OP_BRAMINZERO:
953        case OP_BRAPOSZERO:        case OP_BRAPOSZERO:
954        rc = set_start_bits(++tcode, start_bits, utf8, cd);        rc = set_start_bits(++tcode, start_bits, utf, cd);
955        if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;        if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
956  /* =========================================================================  /* =========================================================================
957        See the comment at the head of this function concerning the next line,        See the comment at the head of this function concerning the next line,
# Line 889  do Line 978  do
978        case OP_QUERY:        case OP_QUERY:
979        case OP_MINQUERY:        case OP_MINQUERY:
980        case OP_POSQUERY:        case OP_POSQUERY:
981        tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);        tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
982        break;        break;
983    
984        case OP_STARI:        case OP_STARI:
# Line 898  do Line 987  do
987        case OP_QUERYI:        case OP_QUERYI:
988        case OP_MINQUERYI:        case OP_MINQUERYI:
989        case OP_POSQUERYI:        case OP_POSQUERYI:
990        tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);        tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
991        break;        break;
992    
993        /* Single-char upto sets the bit and tries the next */        /* Single-char upto sets the bit and tries the next */
# Line 906  do Line 995  do
995        case OP_UPTO:        case OP_UPTO:
996        case OP_MINUPTO:        case OP_MINUPTO:
997        case OP_POSUPTO:        case OP_POSUPTO:
998        tcode = set_table_bit(start_bits, tcode + 3, FALSE, cd, utf8);        tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
999        break;        break;
1000    
1001        case OP_UPTOI:        case OP_UPTOI:
1002        case OP_MINUPTOI:        case OP_MINUPTOI:
1003        case OP_POSUPTOI:        case OP_POSUPTOI:
1004        tcode = set_table_bit(start_bits, tcode + 3, TRUE, cd, utf8);        tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
1005        break;        break;
1006    
1007        /* At least one single char sets the bit and stops */        /* At least one single char sets the bit and stops */
1008    
1009        case OP_EXACT:        case OP_EXACT:
1010        tcode += 2;        tcode += IMM2_SIZE;
1011        /* Fall through */        /* Fall through */
1012        case OP_CHAR:        case OP_CHAR:
1013        case OP_PLUS:        case OP_PLUS:
1014        case OP_MINPLUS:        case OP_MINPLUS:
1015        case OP_POSPLUS:        case OP_POSPLUS:
1016        (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);        (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1017        try_next = FALSE;        try_next = FALSE;
1018        break;        break;
1019    
1020        case OP_EXACTI:        case OP_EXACTI:
1021        tcode += 2;        tcode += IMM2_SIZE;
1022        /* Fall through */        /* Fall through */
1023        case OP_CHARI:        case OP_CHARI:
1024        case OP_PLUSI:        case OP_PLUSI:
1025        case OP_MINPLUSI:        case OP_MINPLUSI:
1026        case OP_POSPLUSI:        case OP_POSPLUSI:
1027        (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);        (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1028        try_next = FALSE;        try_next = FALSE;
1029        break;        break;
1030    
# Line 946  do Line 1035  do
1035        identical. */        identical. */
1036    
1037        case OP_HSPACE:        case OP_HSPACE:
1038        SET_BIT(0x09);        SET_BIT(CHAR_HT);
1039        SET_BIT(0x20);        SET_BIT(CHAR_SPACE);
1040        if (utf8)  #ifdef SUPPORT_UTF
1041          if (utf)
1042          {          {
1043    #ifdef COMPILE_PCRE8
1044          SET_BIT(0xC2);  /* For U+00A0 */          SET_BIT(0xC2);  /* For U+00A0 */
1045          SET_BIT(0xE1);  /* For U+1680, U+180E */          SET_BIT(0xE1);  /* For U+1680, U+180E */
1046          SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */          SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1047          SET_BIT(0xE3);  /* For U+3000 */          SET_BIT(0xE3);  /* For U+3000 */
1048    #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1049            SET_BIT(0xA0);
1050            SET_BIT(0xFF);  /* For characters > 255 */
1051    #endif  /* COMPILE_PCRE[8|16|32] */
1052            }
1053          else
1054    #endif /* SUPPORT_UTF */
1055            {
1056    #ifndef EBCDIC
1057            SET_BIT(0xA0);
1058    #endif  /* Not EBCDIC */
1059    #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1060            SET_BIT(0xFF);  /* For characters > 255 */
1061    #endif  /* COMPILE_PCRE[16|32] */
1062          }          }
       else SET_BIT(0xA0);  
1063        try_next = FALSE;        try_next = FALSE;
1064        break;        break;
1065    
1066        case OP_ANYNL:        case OP_ANYNL:
1067        case OP_VSPACE:        case OP_VSPACE:
1068        SET_BIT(0x0A);        SET_BIT(CHAR_LF);
1069        SET_BIT(0x0B);        SET_BIT(CHAR_VT);
1070        SET_BIT(0x0C);        SET_BIT(CHAR_FF);
1071        SET_BIT(0x0D);        SET_BIT(CHAR_CR);
1072        if (utf8)  #ifdef SUPPORT_UTF
1073          if (utf)
1074          {          {
1075    #ifdef COMPILE_PCRE8
1076          SET_BIT(0xC2);  /* For U+0085 */          SET_BIT(0xC2);  /* For U+0085 */
1077          SET_BIT(0xE2);  /* For U+2028, U+2029 */          SET_BIT(0xE2);  /* For U+2028, U+2029 */
1078    #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1079            SET_BIT(CHAR_NEL);
1080            SET_BIT(0xFF);  /* For characters > 255 */
1081    #endif  /* COMPILE_PCRE[8|16|32] */
1082            }
1083          else
1084    #endif /* SUPPORT_UTF */
1085            {
1086            SET_BIT(CHAR_NEL);
1087    #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1088            SET_BIT(0xFF);  /* For characters > 255 */
1089    #endif
1090          }          }
       else SET_BIT(0x85);  
1091        try_next = FALSE;        try_next = FALSE;
1092        break;        break;
1093    
# Line 990  do Line 1107  do
1107        break;        break;
1108    
1109        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to
1110        ensure it is set as not whitespace. */        ensure it is set as not whitespace. Luckily, the code value is the same
1111          (0x0b) in ASCII and EBCDIC, so we can just adjust the appropriate bit. */
1112    
1113        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
1114        set_nottype_bits(start_bits, cbit_space, table_limit, cd);        set_nottype_bits(start_bits, cbit_space, table_limit, cd);
# Line 998  do Line 1116  do
1116        try_next = FALSE;        try_next = FALSE;
1117        break;        break;
1118    
1119        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to not
1120        not set it from the table. */        set it from the table. Luckily, the code value is the same (0x0b) in
1121          ASCII and EBCDIC, so we can just adjust the appropriate bit. */
1122    
1123        case OP_WHITESPACE:        case OP_WHITESPACE:
1124        c = start_bits[1];    /* Save in case it was already set */        c = start_bits[1];    /* Save in case it was already set */
# Line 1028  do Line 1147  do
1147        break;        break;
1148    
1149        case OP_TYPEEXACT:        case OP_TYPEEXACT:
1150        tcode += 3;        tcode += 1 + IMM2_SIZE;
1151        break;        break;
1152    
1153        /* Zero or more repeats of character types set the bits and then        /* Zero or more repeats of character types set the bits and then
# Line 1037  do Line 1156  do
1156        case OP_TYPEUPTO:        case OP_TYPEUPTO:
1157        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
1158        case OP_TYPEPOSUPTO:        case OP_TYPEPOSUPTO:
1159        tcode += 2;               /* Fall through */        tcode += IMM2_SIZE;  /* Fall through */
1160    
1161        case OP_TYPESTAR:        case OP_TYPESTAR:
1162        case OP_TYPEMINSTAR:        case OP_TYPEMINSTAR:
# Line 1053  do Line 1172  do
1172          return SSB_FAIL;          return SSB_FAIL;
1173    
1174          case OP_HSPACE:          case OP_HSPACE:
1175          SET_BIT(0x09);          SET_BIT(CHAR_HT);
1176          SET_BIT(0x20);          SET_BIT(CHAR_SPACE);
1177          if (utf8)  #ifdef SUPPORT_UTF
1178            if (utf)
1179            {            {
1180    #ifdef COMPILE_PCRE8
1181            SET_BIT(0xC2);  /* For U+00A0 */            SET_BIT(0xC2);  /* For U+00A0 */
1182            SET_BIT(0xE1);  /* For U+1680, U+180E */            SET_BIT(0xE1);  /* For U+1680, U+180E */
1183            SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */            SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1184            SET_BIT(0xE3);  /* For U+3000 */            SET_BIT(0xE3);  /* For U+3000 */
1185    #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1186              SET_BIT(0xA0);
1187              SET_BIT(0xFF);  /* For characters > 255 */
1188    #endif  /* COMPILE_PCRE[8|16|32] */
1189            }            }
1190          else SET_BIT(0xA0);          else
1191    #endif /* SUPPORT_UTF */
1192    #ifndef EBCDIC
1193              SET_BIT(0xA0);
1194    #endif  /* Not EBCDIC */
1195          break;          break;
1196    
1197          case OP_ANYNL:          case OP_ANYNL:
1198          case OP_VSPACE:          case OP_VSPACE:
1199          SET_BIT(0x0A);          SET_BIT(CHAR_LF);
1200          SET_BIT(0x0B);          SET_BIT(CHAR_VT);
1201          SET_BIT(0x0C);          SET_BIT(CHAR_FF);
1202          SET_BIT(0x0D);          SET_BIT(CHAR_CR);
1203          if (utf8)  #ifdef SUPPORT_UTF
1204            if (utf)
1205            {            {
1206    #ifdef COMPILE_PCRE8
1207            SET_BIT(0xC2);  /* For U+0085 */            SET_BIT(0xC2);  /* For U+0085 */
1208            SET_BIT(0xE2);  /* For U+2028, U+2029 */            SET_BIT(0xE2);  /* For U+2028, U+2029 */
1209    #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1210              SET_BIT(CHAR_NEL);
1211              SET_BIT(0xFF);  /* For characters > 255 */
1212    #endif  /* COMPILE_PCRE16 */
1213            }            }
1214          else SET_BIT(0x85);          else
1215    #endif /* SUPPORT_UTF */
1216              SET_BIT(CHAR_NEL);
1217          break;          break;
1218    
1219          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
# Line 1087  do Line 1224  do
1224          set_type_bits(start_bits, cbit_digit, table_limit, cd);          set_type_bits(start_bits, cbit_digit, table_limit, cd);
1225          break;          break;
1226    
1227          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we no longer
1228          ensure it gets set as not whitespace. */          have to play fancy tricks because Perl added VT to its whitespace at
1229            release 5.18. PCRE added it at release 8.34. */
1230    
1231          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
1232          set_nottype_bits(start_bits, cbit_space, table_limit, cd);          set_nottype_bits(start_bits, cbit_space, table_limit, cd);
         start_bits[1] |= 0x08;  
1233          break;          break;
1234    
         /* The cbit_space table has vertical tab as whitespace; we have to  
         avoid setting it. */  
   
1235          case OP_WHITESPACE:          case OP_WHITESPACE:
         c = start_bits[1];    /* Save in case it was already set */  
1236          set_type_bits(start_bits, cbit_space, table_limit, cd);          set_type_bits(start_bits, cbit_space, table_limit, cd);
         start_bits[1] = (start_bits[1] & ~0x08) | c;  
1237          break;          break;
1238    
1239          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
# Line 1122  do Line 1254  do
1254        with a value >= 0xc4 is a potentially valid starter because it starts a        with a value >= 0xc4 is a potentially valid starter because it starts a
1255        character with a value > 255. */        character with a value > 255. */
1256    
1257    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1258          case OP_XCLASS:
1259          if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0)
1260            return SSB_FAIL;
1261          /* All bits are set. */
1262          if ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0 && (tcode[1 + LINK_SIZE] & XCL_NOT) != 0)
1263            return SSB_FAIL;
1264    #endif
1265          /* Fall through */
1266    
1267        case OP_NCLASS:        case OP_NCLASS:
1268  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1269        if (utf8)        if (utf)
1270          {          {
1271          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
1272          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
1273          }          }
1274  #endif  #endif
1275    #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1276          SET_BIT(0xFF);                         /* For characters > 255 */
1277    #endif
1278        /* Fall through */        /* Fall through */
1279    
1280        case OP_CLASS:        case OP_CLASS:
1281          {          {
1282          tcode++;          pcre_uint8 *map;
1283    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1284            map = NULL;
1285            if (*tcode == OP_XCLASS)
1286              {
1287              if ((tcode[1 + LINK_SIZE] & XCL_MAP) != 0)
1288                map = (pcre_uint8 *)(tcode + 1 + LINK_SIZE + 1);
1289              tcode += GET(tcode, 1);
1290              }
1291            else
1292    #endif
1293              {
1294              tcode++;
1295              map = (pcre_uint8 *)tcode;
1296              tcode += 32 / sizeof(pcre_uchar);
1297              }
1298    
1299          /* In UTF-8 mode, the bits in a bit map correspond to character          /* In UTF-8 mode, the bits in a bit map correspond to character
1300          values, not to byte values. However, the bit map we are constructing is          values, not to byte values. However, the bit map we are constructing is
# Line 1142  do Line 1302  do
1302          value is > 127. In fact, there are only two possible starting bytes for          value is > 127. In fact, there are only two possible starting bytes for
1303          characters in the range 128 - 255. */          characters in the range 128 - 255. */
1304    
1305  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1306          if (utf8)          if (map != NULL)
1307    #endif
1308            {            {
1309            for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];  #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1310            for (c = 128; c < 256; c++)            if (utf)
1311              {              {
1312              if ((tcode[c/8] && (1 << (c&7))) != 0)              for (c = 0; c < 16; c++) start_bits[c] |= map[c];
1313                for (c = 128; c < 256; c++)
1314                {                {
1315                int d = (c >> 6) | 0xc0;            /* Set bit for this starter */                if ((map[c/8] && (1 << (c&7))) != 0)
1316                start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */                  {
1317                c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */                  int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
1318                    start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
1319                    c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
1320                    }
1321                }                }
1322              }              }
1323            }            else
   
         /* In non-UTF-8 mode, the two bit maps are completely compatible. */  
   
         else  
1324  #endif  #endif
1325            {              {
1326            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];              /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1327                for (c = 0; c < 32; c++) start_bits[c] |= map[c];
1328                }
1329            }            }
1330    
1331          /* Advance past the bit map, and act on what follows. For a zero          /* Advance past the bit map, and act on what follows. For a zero
1332          minimum repeat, continue; otherwise stop processing. */          minimum repeat, continue; otherwise stop processing. */
1333    
         tcode += 32;  
1334          switch (*tcode)          switch (*tcode)
1335            {            {
1336            case OP_CRSTAR:            case OP_CRSTAR:
1337            case OP_CRMINSTAR:            case OP_CRMINSTAR:
1338            case OP_CRQUERY:            case OP_CRQUERY:
1339            case OP_CRMINQUERY:            case OP_CRMINQUERY:
1340              case OP_CRPOSSTAR:
1341              case OP_CRPOSQUERY:
1342            tcode++;            tcode++;
1343            break;            break;
1344    
1345            case OP_CRRANGE:            case OP_CRRANGE:
1346            case OP_CRMINRANGE:            case OP_CRMINRANGE:
1347            if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;            case OP_CRPOSRANGE:
1348              if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1349              else try_next = FALSE;              else try_next = FALSE;
1350            break;            break;
1351    
# Line 1209  return yield; Line 1374  return yield;
1374  *************************************************/  *************************************************/
1375    
1376  /* This function is handed a compiled expression that it must study to produce  /* This function is handed a compiled expression that it must study to produce
1377  information that will speed up the matching. It returns a pcre_extra block  information that will speed up the matching. It returns a pcre[16]_extra block
1378  which then gets handed back to pcre_exec().  which then gets handed back to pcre_exec().
1379    
1380  Arguments:  Arguments:
# Line 1218  Arguments: Line 1383  Arguments:
1383    errorptr  points to where to place error messages;    errorptr  points to where to place error messages;
1384              set NULL unless error              set NULL unless error
1385    
1386  Returns:    pointer to a pcre_extra block, with study_data filled in and the  Returns:    pointer to a pcre[16]_extra block, with study_data filled in and
1387                appropriate flags set;                the appropriate flags set;
1388              NULL on error or if no optimization possible              NULL on error or if no optimization possible
1389  */  */
1390    
1391    #if defined COMPILE_PCRE8
1392  PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION  PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1393  pcre_study(const pcre *external_re, int options, const char **errorptr)  pcre_study(const pcre *external_re, int options, const char **errorptr)
1394    #elif defined COMPILE_PCRE16
1395    PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
1396    pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
1397    #elif defined COMPILE_PCRE32
1398    PCRE_EXP_DEFN pcre32_extra * PCRE_CALL_CONVENTION
1399    pcre32_study(const pcre32 *external_re, int options, const char **errorptr)
1400    #endif
1401  {  {
1402  int min;  int min;
1403  BOOL bits_set = FALSE;  BOOL bits_set = FALSE;
1404  BOOL had_accept = FALSE;  pcre_uint8 start_bits[32];
1405  uschar start_bits[32];  PUBL(extra) *extra = NULL;
 pcre_extra *extra;  
1406  pcre_study_data *study;  pcre_study_data *study;
1407  const uschar *tables;  const pcre_uint8 *tables;
1408  uschar *code;  pcre_uchar *code;
1409  compile_data compile_block;  compile_data compile_block;
1410  const real_pcre *re = (const real_pcre *)external_re;  const REAL_PCRE *re = (const REAL_PCRE *)external_re;
1411    
1412    
1413  *errorptr = NULL;  *errorptr = NULL;
1414    
# Line 1245  if (re == NULL || re->magic_number != MA Line 1418  if (re == NULL || re->magic_number != MA
1418    return NULL;    return NULL;
1419    }    }
1420    
1421    if ((re->flags & PCRE_MODE) == 0)
1422      {
1423    #if defined COMPILE_PCRE8
1424      *errorptr = "argument not compiled in 8 bit mode";
1425    #elif defined COMPILE_PCRE16
1426      *errorptr = "argument not compiled in 16 bit mode";
1427    #elif defined COMPILE_PCRE32
1428      *errorptr = "argument not compiled in 32 bit mode";
1429    #endif
1430      return NULL;
1431      }
1432    
1433  if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)  if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1434    {    {
1435    *errorptr = "unknown or incorrect option bit(s) set";    *errorptr = "unknown or incorrect option bit(s) set";
1436    return NULL;    return NULL;
1437    }    }
1438    
1439  code = (uschar *)re + re->name_table_offset +  code = (pcre_uchar *)re + re->name_table_offset +
1440    (re->name_count * re->name_entry_size);    (re->name_count * re->name_entry_size);
1441    
1442  /* For an anchored pattern, or an unanchored pattern that has a first char, or  /* For an anchored pattern, or an unanchored pattern that has a first char, or
# Line 1266  if ((re->options & PCRE_ANCHORED) == 0 & Line 1451  if ((re->options & PCRE_ANCHORED) == 0 &
1451    /* Set the character tables in the block that is passed around */    /* Set the character tables in the block that is passed around */
1452    
1453    tables = re->tables;    tables = re->tables;
1454    
1455    #if defined COMPILE_PCRE8
1456    if (tables == NULL)    if (tables == NULL)
1457      (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,      (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1458      (void *)(&tables));      (void *)(&tables));
1459    #elif defined COMPILE_PCRE16
1460      if (tables == NULL)
1461        (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1462        (void *)(&tables));
1463    #elif defined COMPILE_PCRE32
1464      if (tables == NULL)
1465        (void)pcre32_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1466        (void *)(&tables));
1467    #endif
1468    
1469    compile_block.lcc = tables + lcc_offset;    compile_block.lcc = tables + lcc_offset;
1470    compile_block.fcc = tables + fcc_offset;    compile_block.fcc = tables + fcc_offset;
# Line 1277  if ((re->options & PCRE_ANCHORED) == 0 & Line 1473  if ((re->options & PCRE_ANCHORED) == 0 &
1473    
1474    /* See if we can find a fixed set of initial characters for the pattern. */    /* See if we can find a fixed set of initial characters for the pattern. */
1475    
1476    memset(start_bits, 0, 32 * sizeof(uschar));    memset(start_bits, 0, 32 * sizeof(pcre_uint8));
1477    rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,    rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1478      &compile_block);      &compile_block);
1479    bits_set = rc == SSB_DONE;    bits_set = rc == SSB_DONE;
1480    if (rc == SSB_UNKNOWN) *errorptr = "internal error: opcode not recognized";    if (rc == SSB_UNKNOWN)
1481        {
1482        *errorptr = "internal error: opcode not recognized";
1483        return NULL;
1484        }
1485    }    }
1486    
1487  /* Find the minimum length of subject string. */  /* Find the minimum length of subject string. */
1488    
1489  switch(min = find_minlength(code, code, re->options, &had_accept, 0))  switch(min = find_minlength(re, code, code, re->options, 0))
1490    {    {
1491    case -2: *errorptr = "internal error: missing capturing bracket"; break;    case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
1492    case -3: *errorptr = "internal error: opcode not recognized"; break;    case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
1493    default: break;    default: break;
1494    }    }
1495    
1496  /* Return NULL if there's been an error or if no optimization is possible. */  /* If a set of starting bytes has been identified, or if the minimum length is
1497    greater than zero, or if JIT optimization has been requested, or if
1498    PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a
1499    pcre_study_data block. The study data is put in the latter, which is pointed to
1500    by the former, which may also get additional data set later by the calling
1501    program. At the moment, the size of pcre_study_data is fixed. We nevertheless
1502    save it in a field for returning via the pcre_fullinfo() function so that if it
1503    becomes variable in the future, we don't have to change that code. */
1504    
1505    if (bits_set || min > 0 || (options & (
1506    #ifdef SUPPORT_JIT
1507        PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE |
1508        PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE |
1509    #endif
1510        PCRE_STUDY_EXTRA_NEEDED)) != 0)
1511      {
1512      extra = (PUBL(extra) *)(PUBL(malloc))
1513        (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
1514      if (extra == NULL)
1515        {
1516        *errorptr = "failed to get memory";
1517        return NULL;
1518        }
1519    
1520  if (*errorptr != NULL || (!bits_set && min < 0)) return NULL;    study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
1521      extra->flags = PCRE_EXTRA_STUDY_DATA;
1522      extra->study_data = study;
1523    
1524      study->size = sizeof(pcre_study_data);
1525      study->flags = 0;
1526    
1527      /* Set the start bits always, to avoid unset memory errors if the
1528      study data is written to a file, but set the flag only if any of the bits
1529      are set, to save time looking when none are. */
1530    
1531  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in    if (bits_set)
1532  the latter, which is pointed to by the former, which may also get additional      {
1533  data set later by the calling program. At the moment, the size of      study->flags |= PCRE_STUDY_MAPPED;
1534  pcre_study_data is fixed. We nevertheless save it in a field for returning via      memcpy(study->start_bits, start_bits, sizeof(start_bits));
1535  the pcre_fullinfo() function so that if it becomes variable in the future, we      }
1536  don't have to change that code. */    else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
1537    
1538  extra = (pcre_extra *)(pcre_malloc)  #ifdef PCRE_DEBUG
1539    (sizeof(pcre_extra) + sizeof(pcre_study_data));    if (bits_set)
1540        {
1541        pcre_uint8 *ptr = start_bits;
1542        int i;
1543    
1544  if (extra == NULL)      printf("Start bits:\n");
1545    {      for (i = 0; i < 32; i++)
1546    *errorptr = "failed to get memory";        printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
1547    return NULL;      }
1548    }  #endif
1549    
1550  study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));    /* Always set the minlength value in the block, because the JIT compiler
1551  extra->flags = PCRE_EXTRA_STUDY_DATA;    makes use of it. However, don't set the bit unless the length is greater than
1552  extra->study_data = study;    zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
1553      checking the zero case. */
1554    
1555  study->size = sizeof(pcre_study_data);    if (min > 0)
1556  study->flags = 0;      {
1557        study->flags |= PCRE_STUDY_MINLEN;
1558        study->minlength = min;
1559        }
1560      else study->minlength = 0;
1561    
1562  if (bits_set)    /* If JIT support was compiled and requested, attempt the JIT compilation.
1563    {    If no starting bytes were found, and the minimum length is zero, and JIT
1564    study->flags |= PCRE_STUDY_MAPPED;    compilation fails, abandon the extra block and return NULL, unless
1565    memcpy(study->start_bits, start_bits, sizeof(start_bits));    PCRE_STUDY_EXTRA_NEEDED is set. */
1566    }  
1567    #ifdef SUPPORT_JIT
1568      extra->executable_jit = NULL;
1569      if ((options & PCRE_STUDY_JIT_COMPILE) != 0)
1570        PRIV(jit_compile)(re, extra, JIT_COMPILE);
1571      if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0)
1572        PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE);
1573      if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0)
1574        PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE);
1575    
1576  if (min >= 0)    if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0 &&
1577    {        (options & PCRE_STUDY_EXTRA_NEEDED) == 0)
1578    study->flags |= PCRE_STUDY_MINLEN;      {
1579    study->minlength = min;  #if defined COMPILE_PCRE8
1580        pcre_free_study(extra);
1581    #elif defined COMPILE_PCRE16
1582        pcre16_free_study(extra);
1583    #elif defined COMPILE_PCRE32
1584        pcre32_free_study(extra);
1585    #endif
1586        extra = NULL;
1587        }
1588    #endif
1589    }    }
1590    
1591  return extra;  return extra;
1592  }  }
1593    
1594    
1595    /*************************************************
1596    *          Free the study data                   *
1597    *************************************************/
1598    
1599    /* This function frees the memory that was obtained by pcre_study().
1600    
1601    Argument:   a pointer to the pcre[16]_extra block
1602    Returns:    nothing
1603    */
1604    
1605    #if defined COMPILE_PCRE8
1606    PCRE_EXP_DEFN void
1607    pcre_free_study(pcre_extra *extra)
1608    #elif defined COMPILE_PCRE16
1609    PCRE_EXP_DEFN void
1610    pcre16_free_study(pcre16_extra *extra)
1611    #elif defined COMPILE_PCRE32
1612    PCRE_EXP_DEFN void
1613    pcre32_free_study(pcre32_extra *extra)
1614    #endif
1615    {
1616    if (extra == NULL)
1617      return;
1618    #ifdef SUPPORT_JIT
1619    if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
1620         extra->executable_jit != NULL)
1621      PRIV(jit_free)(extra->executable_jit);
1622    #endif
1623    PUBL(free)(extra);
1624    }
1625    
1626  /* End of pcre_study.c */  /* End of pcre_study.c */

Legend:
Removed from v.657  
changed lines
  Added in v.1414

  ViewVC Help
Powered by ViewVC 1.1.5