/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory | Revision Log | View Patch

revision 657 by ph10, Mon Aug 15 17:39:09 2011 UTC revision 1379 by ph10, Mon Oct 14 13:54:07 2013 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2010 University of Cambridge             Copyright (c) 1997-2012 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 66  string of that length that matches. In U Line 66  string of that length that matches. In U
66  rather than bytes.  rather than bytes.
67    
68  Arguments:  Arguments:
69    code        pointer to start of group (the bracket)    re              compiled pattern block
70    startcode   pointer to start of the whole pattern    code            pointer to start of group (the bracket)
71    options     the compiling options    startcode       pointer to start of the whole pattern's code
72    had_accept  pointer to flag for (*ACCEPT) encountered    options         the compiling options
73    recurse_depth  RECURSE depth    recurse_depth   RECURSE depth
74    
75  Returns:   the minimum length  Returns:   the minimum length
76             -1 if \C was encountered             -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
77             -2 internal error (missing capturing bracket)             -2 internal error (missing capturing bracket)
78             -3 internal error (opcode not listed)             -3 internal error (opcode not listed)
79  */  */
80    
81  static int  static int
82  find_minlength(const uschar *code, const uschar *startcode, int options,  find_minlength(const REAL_PCRE *re, const pcre_uchar *code,
83    BOOL *had_accept_ptr, int recurse_depth)    const pcre_uchar *startcode, int options, int recurse_depth)
84  {  {
85  int length = -1;  int length = -1;
86  BOOL utf8 = (options & PCRE_UTF8) != 0;  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
87    BOOL utf = (options & PCRE_UTF8) != 0;
88  BOOL had_recurse = FALSE;  BOOL had_recurse = FALSE;
89  register int branchlength = 0;  register int branchlength = 0;
90  register uschar *cc = (uschar *)code + 1 + LINK_SIZE;  register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
91    
92  if (*code == OP_CBRA || *code == OP_SCBRA ||  if (*code == OP_CBRA || *code == OP_SCBRA ||
93      *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += 2;      *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
94    
95  /* Scan along the opcodes for this branch. If we get to the end of the  /* Scan along the opcodes for this branch. If we get to the end of the
96  branch, check the length against that of the other branches. */  branch, check the length against that of the other branches. */
# Line 97  branch, check the length against that of Line 98  branch, check the length against that of
98  for (;;)  for (;;)
99    {    {
100    int d, min;    int d, min;
101    uschar *cs, *ce;    pcre_uchar *cs, *ce;
102    register int op = *cc;    register pcre_uchar op = *cc;
103    
104    switch (op)    switch (op)
105      {      {
# Line 128  for (;;) Line 129  for (;;)
129      case OP_BRAPOS:      case OP_BRAPOS:
130      case OP_SBRAPOS:      case OP_SBRAPOS:
131      case OP_ONCE:      case OP_ONCE:
132      d = find_minlength(cc, startcode, options, had_accept_ptr, recurse_depth);      case OP_ONCE_NC:
133        d = find_minlength(re, cc, startcode, options, recurse_depth);
134      if (d < 0) return d;      if (d < 0) return d;
135      branchlength += d;      branchlength += d;
     if (*had_accept_ptr) return branchlength;  
136      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
137      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
138      break;      break;
139    
140      /* Reached end of a branch; if it's a ket it is the end of a nested      /* ACCEPT makes things far too complicated; we have to give up. */
     call. If it's ALT it is an alternation in a nested call. If it is END it's  
     the end of the outer call. All can be handled by the same code. If it is  
     ACCEPT, it is essentially the same as END, but we set a flag so that  
     counting stops. */  
141    
142      case OP_ACCEPT:      case OP_ACCEPT:
143      case OP_ASSERT_ACCEPT:      case OP_ASSERT_ACCEPT:
144      *had_accept_ptr = TRUE;      return -1;
145      /* Fall through */  
146        /* Reached end of a branch; if it's a ket it is the end of a nested
147        call. If it's ALT it is an alternation in a nested call. If it is END it's
148        the end of the outer call. All can be handled by the same code. (ACCEPT
149        is not handled here: it is dealt with above by giving up and returning
150        -1.) */
151    
152      case OP_ALT:      case OP_ALT:
153      case OP_KET:      case OP_KET:
154      case OP_KETRMAX:      case OP_KETRMAX:
# Line 173  for (;;) Line 176  for (;;)
176    
177      case OP_REVERSE:      case OP_REVERSE:
178      case OP_CREF:      case OP_CREF:
179      case OP_NCREF:      case OP_DNCREF:
180      case OP_RREF:      case OP_RREF:
181      case OP_NRREF:      case OP_DNRREF:
182      case OP_DEF:      case OP_DEF:
183      case OP_CALLOUT:      case OP_CALLOUT:
184      case OP_SOD:      case OP_SOD:
# Line 188  for (;;) Line 191  for (;;)
191      case OP_DOLLM:      case OP_DOLLM:
192      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
193      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
194      cc += _pcre_OP_lengths[*cc];      cc += PRIV(OP_lengths)[*cc];
195      break;      break;
196    
197      /* Skip over a subpattern that has a {0} or {0,x} quantifier */      /* Skip over a subpattern that has a {0} or {0,x} quantifier */
# Line 197  for (;;) Line 200  for (;;)
200      case OP_BRAMINZERO:      case OP_BRAMINZERO:
201      case OP_BRAPOSZERO:      case OP_BRAPOSZERO:
202      case OP_SKIPZERO:      case OP_SKIPZERO:
203      cc += _pcre_OP_lengths[*cc];      cc += PRIV(OP_lengths)[*cc];
204      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
205      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
206      break;      break;
# Line 222  for (;;) Line 225  for (;;)
225      case OP_NOTPOSPLUSI:      case OP_NOTPOSPLUSI:
226      branchlength++;      branchlength++;
227      cc += 2;      cc += 2;
228  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
229      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
230  #endif  #endif
231      break;      break;
232    
# Line 242  for (;;) Line 245  for (;;)
245      case OP_NOTEXACT:      case OP_NOTEXACT:
246      case OP_NOTEXACTI:      case OP_NOTEXACTI:
247      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
248      cc += 4;      cc += 2 + IMM2_SIZE;
249  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
250      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
251  #endif  #endif
252      break;      break;
253    
254      case OP_TYPEEXACT:      case OP_TYPEEXACT:
255      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
256      cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;      cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
257          || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
258      break;      break;
259    
260      /* Handle single-char non-literal matchers */      /* Handle single-char non-literal matchers */
# Line 285  for (;;) Line 289  for (;;)
289      cc++;      cc++;
290      break;      break;
291    
292      /* The single-byte matcher means we can't proceed in UTF-8 mode */      /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
293        non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
294        appear, but leave the code, just in case.) */
295    
296      case OP_ANYBYTE:      case OP_ANYBYTE:
297  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
298      if (utf8) return -1;      if (utf) return -1;
299  #endif  #endif
300      branchlength++;      branchlength++;
301      cc++;      cc++;
# Line 305  for (;;) Line 311  for (;;)
311      case OP_TYPEPOSSTAR:      case OP_TYPEPOSSTAR:
312      case OP_TYPEPOSQUERY:      case OP_TYPEPOSQUERY:
313      if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;      if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
314      cc += _pcre_OP_lengths[op];      cc += PRIV(OP_lengths)[op];
315      break;      break;
316    
317      case OP_TYPEUPTO:      case OP_TYPEUPTO:
318      case OP_TYPEMINUPTO:      case OP_TYPEMINUPTO:
319      case OP_TYPEPOSUPTO:      case OP_TYPEPOSUPTO:
320      if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;      if (cc[1 + IMM2_SIZE] == OP_PROP
321      cc += _pcre_OP_lengths[op];        || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
322        cc += PRIV(OP_lengths)[op];
323      break;      break;
324    
325      /* Check a class for variable quantification */      /* Check a class for variable quantification */
326    
 #ifdef SUPPORT_UTF8  
     case OP_XCLASS:  
     cc += GET(cc, 1) - 33;  
     /* Fall through */  
 #endif  
   
327      case OP_CLASS:      case OP_CLASS:
328      case OP_NCLASS:      case OP_NCLASS:
329      cc += 33;  #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
330        case OP_XCLASS:
331        /* The original code caused an unsigned overflow in 64 bit systems,
332        so now we use a conditional statement. */
333        if (op == OP_XCLASS)
334          cc += GET(cc, 1);
335        else
336          cc += PRIV(OP_lengths)[OP_CLASS];
337    #else
338        cc += PRIV(OP_lengths)[OP_CLASS];
339    #endif
340    
341      switch (*cc)      switch (*cc)
342        {        {
343        case OP_CRPLUS:        case OP_CRPLUS:
344        case OP_CRMINPLUS:        case OP_CRMINPLUS:
345          case OP_CRPOSPLUS:
346        branchlength++;        branchlength++;
347        /* Fall through */        /* Fall through */
348    
# Line 338  for (;;) Line 350  for (;;)
350        case OP_CRMINSTAR:        case OP_CRMINSTAR:
351        case OP_CRQUERY:        case OP_CRQUERY:
352        case OP_CRMINQUERY:        case OP_CRMINQUERY:
353          case OP_CRPOSSTAR:
354          case OP_CRPOSQUERY:
355        cc++;        cc++;
356        break;        break;
357    
358        case OP_CRRANGE:        case OP_CRRANGE:
359        case OP_CRMINRANGE:        case OP_CRMINRANGE:
360          case OP_CRPOSRANGE:
361        branchlength += GET2(cc,1);        branchlength += GET2(cc,1);
362        cc += 5;        cc += 1 + 2 * IMM2_SIZE;
363        break;        break;
364    
365        default:        default:
# Line 364  for (;;) Line 379  for (;;)
379      If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket      If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
380      matches an empty string (by default it causes a matching failure), so in      matches an empty string (by default it causes a matching failure), so in
381      that case we must set the minimum length to zero. */      that case we must set the minimum length to zero. */
382    
383        case OP_DNREF:     /* Duplicate named pattern back reference */
384        case OP_DNREFI:
385        if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
386          {
387          int count = GET2(cc, 1+IMM2_SIZE);
388          pcre_uchar *slot = (pcre_uchar *)re +
389            re->name_table_offset + GET2(cc, 1) * re->name_entry_size;
390          d = INT_MAX;
391          while (count-- > 0)
392            {
393            ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
394            if (cs == NULL) return -2;
395            do ce += GET(ce, 1); while (*ce == OP_ALT);
396            if (cc > cs && cc < ce)
397              {
398              d = 0;
399              had_recurse = TRUE;
400              break;
401              }
402            else
403              {
404              int dd = find_minlength(re, cs, startcode, options, recurse_depth);
405              if (dd < d) d = dd;
406              }
407            slot += re->name_entry_size;
408            }
409          }
410        else d = 0;
411        cc += 1 + 2*IMM2_SIZE;
412        goto REPEAT_BACK_REFERENCE;
413    
414      case OP_REF:      case OP_REF:      /* Single back reference */
415      case OP_REFI:      case OP_REFI:
416      if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)      if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
417        {        {
418        ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));        ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
419        if (cs == NULL) return -2;        if (cs == NULL) return -2;
420        do ce += GET(ce, 1); while (*ce == OP_ALT);        do ce += GET(ce, 1); while (*ce == OP_ALT);
421        if (cc > cs && cc < ce)        if (cc > cs && cc < ce)
# Line 379  for (;;) Line 425  for (;;)
425          }          }
426        else        else
427          {          {
428          d = find_minlength(cs, startcode, options, had_accept_ptr,          d = find_minlength(re, cs, startcode, options, recurse_depth);
           recurse_depth);  
         *had_accept_ptr = FALSE;  
429          }          }
430        }        }
431      else d = 0;      else d = 0;
432      cc += 3;      cc += 1 + IMM2_SIZE;
433    
434      /* Handle repeated back references */      /* Handle repeated back references */
435    
436        REPEAT_BACK_REFERENCE:
437      switch (*cc)      switch (*cc)
438        {        {
439        case OP_CRSTAR:        case OP_CRSTAR:
440        case OP_CRMINSTAR:        case OP_CRMINSTAR:
441        case OP_CRQUERY:        case OP_CRQUERY:
442        case OP_CRMINQUERY:        case OP_CRMINQUERY:
443          case OP_CRPOSSTAR:
444          case OP_CRPOSQUERY:
445        min = 0;        min = 0;
446        cc++;        cc++;
447        break;        break;
448    
449        case OP_CRPLUS:        case OP_CRPLUS:
450        case OP_CRMINPLUS:        case OP_CRMINPLUS:
451          case OP_CRPOSPLUS:
452        min = 1;        min = 1;
453        cc++;        cc++;
454        break;        break;
455    
456        case OP_CRRANGE:        case OP_CRRANGE:
457        case OP_CRMINRANGE:        case OP_CRMINRANGE:
458          case OP_CRPOSRANGE:
459        min = GET2(cc, 1);        min = GET2(cc, 1);
460        cc += 5;        cc += 1 + 2 * IMM2_SIZE;
461        break;        break;
462    
463        default:        default:
# Line 423  for (;;) Line 472  for (;;)
472      caught by a recursion depth count. */      caught by a recursion depth count. */
473    
474      case OP_RECURSE:      case OP_RECURSE:
475      cs = ce = (uschar *)startcode + GET(cc, 1);      cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
     if (cs == NULL) return -2;  
476      do ce += GET(ce, 1); while (*ce == OP_ALT);      do ce += GET(ce, 1); while (*ce == OP_ALT);
477      if ((cc > cs && cc < ce) || recurse_depth > 10)      if ((cc > cs && cc < ce) || recurse_depth > 10)
478        had_recurse = TRUE;        had_recurse = TRUE;
479      else      else
480        {        {
481        branchlength += find_minlength(cs, startcode, options, had_accept_ptr,        branchlength += find_minlength(re, cs, startcode, options,
482          recurse_depth + 1);          recurse_depth + 1);
       *had_accept_ptr = FALSE;  
483        }        }
484      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
485      break;      break;
# Line 484  for (;;) Line 531  for (;;)
531      case OP_NOTPOSQUERY:      case OP_NOTPOSQUERY:
532      case OP_NOTPOSQUERYI:      case OP_NOTPOSQUERYI:
533    
534      cc += _pcre_OP_lengths[op];      cc += PRIV(OP_lengths)[op];
535  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
536      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
537  #endif  #endif
538      break;      break;
539    
# Line 495  for (;;) Line 542  for (;;)
542      case OP_MARK:      case OP_MARK:
543      case OP_PRUNE_ARG:      case OP_PRUNE_ARG:
544      case OP_SKIP_ARG:      case OP_SKIP_ARG:
     cc += _pcre_OP_lengths[op] + cc[1];  
     break;  
   
545      case OP_THEN_ARG:      case OP_THEN_ARG:
546      cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];      cc += PRIV(OP_lengths)[op] + cc[1];
547      break;      break;
548    
549      /* The remaining opcodes are just skipped over. */      /* The remaining opcodes are just skipped over. */
# Line 511  for (;;) Line 555  for (;;)
555      case OP_SET_SOM:      case OP_SET_SOM:
556      case OP_SKIP:      case OP_SKIP:
557      case OP_THEN:      case OP_THEN:
558      cc += _pcre_OP_lengths[op];      cc += PRIV(OP_lengths)[op];
559      break;      break;
560    
561      /* This should not occur: we list all opcodes explicitly so that when      /* This should not occur: we list all opcodes explicitly so that when
# Line 540  Arguments: Line 584  Arguments:
584    p             points to the character    p             points to the character
585    caseless      the caseless flag    caseless      the caseless flag
586    cd            the block with char table pointers    cd            the block with char table pointers
587    utf8          TRUE for UTF-8 mode    utf           TRUE for UTF-8 / UTF-16 / UTF-32 mode
588    
589  Returns:        pointer after the character  Returns:        pointer after the character
590  */  */
591    
592  static const uschar *  static const pcre_uchar *
593  set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,  set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
594    compile_data *cd, BOOL utf8)    compile_data *cd, BOOL utf)
595  {  {
596  unsigned int c = *p;  pcre_uint32 c = *p;
597    
598    #ifdef COMPILE_PCRE8
599  SET_BIT(c);  SET_BIT(c);
600    
601  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
602  if (utf8 && c > 127)  if (utf && c > 127)
603    {    {
604    GETCHARINC(c, p);    GETCHARINC(c, p);
605  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
606    if (caseless)    if (caseless)
607      {      {
608      uschar buff[8];      pcre_uchar buff[6];
609      c = UCD_OTHERCASE(c);      c = UCD_OTHERCASE(c);
610      (void)_pcre_ord2utf8(c, buff);      (void)PRIV(ord2utf)(c, buff);
611      SET_BIT(buff[0]);      SET_BIT(buff[0]);
612      }      }
613  #endif  #endif  /* SUPPORT_UCP */
614    return p;    return p;
615    }    }
616  #endif  #else   /* Not SUPPORT_UTF */
617    (void)(utf);   /* Stops warning for unused parameter */
618    #endif  /* SUPPORT_UTF */
619    
620  /* Not UTF-8 mode, or character is less than 128. */  /* Not UTF-8 mode, or character is less than 128. */
621    
622  if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);  if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
623  return p + 1;  return p + 1;
624    #endif  /* COMPILE_PCRE8 */
625    
626    #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
627    if (c > 0xff)
628      {
629      c = 0xff;
630      caseless = FALSE;
631      }
632    SET_BIT(c);
633    
634    #ifdef SUPPORT_UTF
635    if (utf && c > 127)
636      {
637      GETCHARINC(c, p);
638    #ifdef SUPPORT_UCP
639      if (caseless)
640        {
641        c = UCD_OTHERCASE(c);
642        if (c > 0xff)
643          c = 0xff;
644        SET_BIT(c);
645        }
646    #endif  /* SUPPORT_UCP */
647      return p;
648      }
649    #else   /* Not SUPPORT_UTF */
650    (void)(utf);   /* Stops warning for unused parameter */
651    #endif  /* SUPPORT_UTF */
652    
653    if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
654    return p + 1;
655    #endif
656  }  }
657    
658    
# Line 599  Returns:         nothing Line 678  Returns:         nothing
678  */  */
679    
680  static void  static void
681  set_type_bits(uschar *start_bits, int cbit_type, int table_limit,  set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
682    compile_data *cd)    compile_data *cd)
683  {  {
684  register int c;  register pcre_uint32 c;
685  for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];  for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
686    #if defined SUPPORT_UTF && defined COMPILE_PCRE8
687  if (table_limit == 32) return;  if (table_limit == 32) return;
688  for (c = 128; c < 256; c++)  for (c = 128; c < 256; c++)
689    {    {
690    if ((cd->cbits[c/8] & (1 << (c&7))) != 0)    if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
691      {      {
692      uschar buff[8];      pcre_uchar buff[6];
693      (void)_pcre_ord2utf8(c, buff);      (void)PRIV(ord2utf)(c, buff);
694      SET_BIT(buff[0]);      SET_BIT(buff[0]);
695      }      }
696    }    }
697    #endif
698  }  }
699    
700    
# Line 639  Returns:         nothing Line 720  Returns:         nothing
720  */  */
721    
722  static void  static void
723  set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,  set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
724    compile_data *cd)    compile_data *cd)
725  {  {
726  register int c;  register pcre_uint32 c;
727  for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];  for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
728    #if defined SUPPORT_UTF && defined COMPILE_PCRE8
729  if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;  if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
730    #endif
731  }  }
732    
733    
# Line 664  function fails unless the result is SSB_ Line 747  function fails unless the result is SSB_
747  Arguments:  Arguments:
748    code         points to an expression    code         points to an expression
749    start_bits   points to a 32-byte table, initialized to 0    start_bits   points to a 32-byte table, initialized to 0
750    utf8         TRUE if in UTF-8 mode    utf          TRUE if in UTF-8 / UTF-16 / UTF-32 mode
751    cd           the block with char table pointers    cd           the block with char table pointers
752    
753  Returns:       SSB_FAIL     => Failed to find any starting bytes  Returns:       SSB_FAIL     => Failed to find any starting bytes
# Line 674  Returns:       SSB_FAIL     => Failed to Line 757  Returns:       SSB_FAIL     => Failed to
757  */  */
758    
759  static int  static int
760  set_start_bits(const uschar *code, uschar *start_bits, BOOL utf8,  set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
761    compile_data *cd)    compile_data *cd)
762  {  {
763  register int c;  register pcre_uint32 c;
764  int yield = SSB_DONE;  int yield = SSB_DONE;
765  int table_limit = utf8? 16:32;  #if defined SUPPORT_UTF && defined COMPILE_PCRE8
766    int table_limit = utf? 16:32;
767    #else
768    int table_limit = 32;
769    #endif
770    
771  #if 0  #if 0
772  /* ========================================================================= */  /* ========================================================================= */
# Line 701  volatile int dummy; Line 788  volatile int dummy;
788  do  do
789    {    {
790    BOOL try_next = TRUE;    BOOL try_next = TRUE;
791    const uschar *tcode = code + 1 + LINK_SIZE;    const pcre_uchar *tcode = code + 1 + LINK_SIZE;
792    
793    if (*code == OP_CBRA || *code == OP_SCBRA ||    if (*code == OP_CBRA || *code == OP_SCBRA ||
794        *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += 2;        *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
795    
796    while (try_next)    /* Loop for items in this branch */    while (try_next)    /* Loop for items in this branch */
797      {      {
# Line 733  do Line 820  do
820        case OP_COND:        case OP_COND:
821        case OP_CREF:        case OP_CREF:
822        case OP_DEF:        case OP_DEF:
823          case OP_DNCREF:
824          case OP_DNREF:
825          case OP_DNREFI:
826          case OP_DNRREF:
827        case OP_DOLL:        case OP_DOLL:
828        case OP_DOLLM:        case OP_DOLLM:
829        case OP_END:        case OP_END:
# Line 741  do Line 832  do
832        case OP_EXTUNI:        case OP_EXTUNI:
833        case OP_FAIL:        case OP_FAIL:
834        case OP_MARK:        case OP_MARK:
       case OP_NCREF:  
835        case OP_NOT:        case OP_NOT:
836        case OP_NOTEXACT:        case OP_NOTEXACT:
837        case OP_NOTEXACTI:        case OP_NOTEXACTI:
# Line 773  do Line 863  do
863        case OP_NOTUPTOI:        case OP_NOTUPTOI:
864        case OP_NOT_HSPACE:        case OP_NOT_HSPACE:
865        case OP_NOT_VSPACE:        case OP_NOT_VSPACE:
       case OP_NRREF:  
866        case OP_PROP:        case OP_PROP:
867        case OP_PRUNE:        case OP_PRUNE:
868        case OP_PRUNE_ARG:        case OP_PRUNE_ARG:
# Line 790  do Line 879  do
879        case OP_SOM:        case OP_SOM:
880        case OP_THEN:        case OP_THEN:
881        case OP_THEN_ARG:        case OP_THEN_ARG:
882    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
883        case OP_XCLASS:        case OP_XCLASS:
884    #endif
885        return SSB_FAIL;        return SSB_FAIL;
886    
887        /* We can ignore word boundary tests. */        /* We can ignore word boundary tests. */
888    
889        case OP_WORD_BOUNDARY:        case OP_WORD_BOUNDARY:
890        case OP_NOT_WORD_BOUNDARY:        case OP_NOT_WORD_BOUNDARY:
891        tcode++;        tcode++;
892        break;        break;
893    
894        /* If we hit a bracket or a positive lookahead assertion, recurse to set        /* If we hit a bracket or a positive lookahead assertion, recurse to set
895        bits from within the subpattern. If it can't find anything, we have to        bits from within the subpattern. If it can't find anything, we have to
# Line 814  do Line 905  do
905        case OP_CBRAPOS:        case OP_CBRAPOS:
906        case OP_SCBRAPOS:        case OP_SCBRAPOS:
907        case OP_ONCE:        case OP_ONCE:
908          case OP_ONCE_NC:
909        case OP_ASSERT:        case OP_ASSERT:
910        rc = set_start_bits(tcode, start_bits, utf8, cd);        rc = set_start_bits(tcode, start_bits, utf, cd);
911        if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;        if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
912        if (rc == SSB_DONE) try_next = FALSE; else        if (rc == SSB_DONE) try_next = FALSE; else
913          {          {
# Line 862  do Line 954  do
954        case OP_BRAZERO:        case OP_BRAZERO:
955        case OP_BRAMINZERO:        case OP_BRAMINZERO:
956        case OP_BRAPOSZERO:        case OP_BRAPOSZERO:
957        rc = set_start_bits(++tcode, start_bits, utf8, cd);        rc = set_start_bits(++tcode, start_bits, utf, cd);
958        if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;        if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
959  /* =========================================================================  /* =========================================================================
960        See the comment at the head of this function concerning the next line,        See the comment at the head of this function concerning the next line,
# Line 889  do Line 981  do
981        case OP_QUERY:        case OP_QUERY:
982        case OP_MINQUERY:        case OP_MINQUERY:
983        case OP_POSQUERY:        case OP_POSQUERY:
984        tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);        tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
985        break;        break;
986    
987        case OP_STARI:        case OP_STARI:
# Line 898  do Line 990  do
990        case OP_QUERYI:        case OP_QUERYI:
991        case OP_MINQUERYI:        case OP_MINQUERYI:
992        case OP_POSQUERYI:        case OP_POSQUERYI:
993        tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);        tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
994        break;        break;
995    
996        /* Single-char upto sets the bit and tries the next */        /* Single-char upto sets the bit and tries the next */
# Line 906  do Line 998  do
998        case OP_UPTO:        case OP_UPTO:
999        case OP_MINUPTO:        case OP_MINUPTO:
1000        case OP_POSUPTO:        case OP_POSUPTO:
1001        tcode = set_table_bit(start_bits, tcode + 3, FALSE, cd, utf8);        tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
1002        break;        break;
1003    
1004        case OP_UPTOI:        case OP_UPTOI:
1005        case OP_MINUPTOI:        case OP_MINUPTOI:
1006        case OP_POSUPTOI:        case OP_POSUPTOI:
1007        tcode = set_table_bit(start_bits, tcode + 3, TRUE, cd, utf8);        tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
1008        break;        break;
1009    
1010        /* At least one single char sets the bit and stops */        /* At least one single char sets the bit and stops */
1011    
1012        case OP_EXACT:        case OP_EXACT:
1013        tcode += 2;        tcode += IMM2_SIZE;
1014        /* Fall through */        /* Fall through */
1015        case OP_CHAR:        case OP_CHAR:
1016        case OP_PLUS:        case OP_PLUS:
1017        case OP_MINPLUS:        case OP_MINPLUS:
1018        case OP_POSPLUS:        case OP_POSPLUS:
1019        (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);        (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1020        try_next = FALSE;        try_next = FALSE;
1021        break;        break;
1022    
1023        case OP_EXACTI:        case OP_EXACTI:
1024        tcode += 2;        tcode += IMM2_SIZE;
1025        /* Fall through */        /* Fall through */
1026        case OP_CHARI:        case OP_CHARI:
1027        case OP_PLUSI:        case OP_PLUSI:
1028        case OP_MINPLUSI:        case OP_MINPLUSI:
1029        case OP_POSPLUSI:        case OP_POSPLUSI:
1030        (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);        (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1031        try_next = FALSE;        try_next = FALSE;
1032        break;        break;
1033    
# Line 946  do Line 1038  do
1038        identical. */        identical. */
1039    
1040        case OP_HSPACE:        case OP_HSPACE:
1041        SET_BIT(0x09);        SET_BIT(CHAR_HT);
1042        SET_BIT(0x20);        SET_BIT(CHAR_SPACE);
1043        if (utf8)  #ifdef SUPPORT_UTF
1044          if (utf)
1045          {          {
1046    #ifdef COMPILE_PCRE8
1047          SET_BIT(0xC2);  /* For U+00A0 */          SET_BIT(0xC2);  /* For U+00A0 */
1048          SET_BIT(0xE1);  /* For U+1680, U+180E */          SET_BIT(0xE1);  /* For U+1680, U+180E */
1049          SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */          SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1050          SET_BIT(0xE3);  /* For U+3000 */          SET_BIT(0xE3);  /* For U+3000 */
1051    #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1052            SET_BIT(0xA0);
1053            SET_BIT(0xFF);  /* For characters > 255 */
1054    #endif  /* COMPILE_PCRE[8|16|32] */
1055            }
1056          else
1057    #endif /* SUPPORT_UTF */
1058            {
1059    #ifndef EBCDIC
1060            SET_BIT(0xA0);
1061    #endif  /* Not EBCDIC */
1062    #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1063            SET_BIT(0xFF);  /* For characters > 255 */
1064    #endif  /* COMPILE_PCRE[16|32] */
1065          }          }
       else SET_BIT(0xA0);  
1066        try_next = FALSE;        try_next = FALSE;
1067        break;        break;
1068    
1069        case OP_ANYNL:        case OP_ANYNL:
1070        case OP_VSPACE:        case OP_VSPACE:
1071        SET_BIT(0x0A);        SET_BIT(CHAR_LF);
1072        SET_BIT(0x0B);        SET_BIT(CHAR_VT);
1073        SET_BIT(0x0C);        SET_BIT(CHAR_FF);
1074        SET_BIT(0x0D);        SET_BIT(CHAR_CR);
1075        if (utf8)  #ifdef SUPPORT_UTF
1076          if (utf)
1077          {          {
1078    #ifdef COMPILE_PCRE8
1079          SET_BIT(0xC2);  /* For U+0085 */          SET_BIT(0xC2);  /* For U+0085 */
1080          SET_BIT(0xE2);  /* For U+2028, U+2029 */          SET_BIT(0xE2);  /* For U+2028, U+2029 */
1081    #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1082            SET_BIT(CHAR_NEL);
1083            SET_BIT(0xFF);  /* For characters > 255 */
1084    #endif  /* COMPILE_PCRE[8|16|32] */
1085            }
1086          else
1087    #endif /* SUPPORT_UTF */
1088            {
1089            SET_BIT(CHAR_NEL);
1090    #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1091            SET_BIT(0xFF);  /* For characters > 255 */
1092    #endif
1093          }          }
       else SET_BIT(0x85);  
1094        try_next = FALSE;        try_next = FALSE;
1095        break;        break;
1096    
# Line 990  do Line 1110  do
1110        break;        break;
1111    
1112        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to
1113        ensure it is set as not whitespace. */        ensure it is set as not whitespace. Luckily, the code value is the same
1114          (0x0b) in ASCII and EBCDIC, so we can just adjust the appropriate bit. */
1115    
1116        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
1117        set_nottype_bits(start_bits, cbit_space, table_limit, cd);        set_nottype_bits(start_bits, cbit_space, table_limit, cd);
# Line 998  do Line 1119  do
1119        try_next = FALSE;        try_next = FALSE;
1120        break;        break;
1121    
1122        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to not
1123        not set it from the table. */        set it from the table. Luckily, the code value is the same (0x0b) in
1124          ASCII and EBCDIC, so we can just adjust the appropriate bit. */
1125    
1126        case OP_WHITESPACE:        case OP_WHITESPACE:
1127        c = start_bits[1];    /* Save in case it was already set */        c = start_bits[1];    /* Save in case it was already set */
# Line 1028  do Line 1150  do
1150        break;        break;
1151    
1152        case OP_TYPEEXACT:        case OP_TYPEEXACT:
1153        tcode += 3;        tcode += 1 + IMM2_SIZE;
1154        break;        break;
1155    
1156        /* Zero or more repeats of character types set the bits and then        /* Zero or more repeats of character types set the bits and then
# Line 1037  do Line 1159  do
1159        case OP_TYPEUPTO:        case OP_TYPEUPTO:
1160        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
1161        case OP_TYPEPOSUPTO:        case OP_TYPEPOSUPTO:
1162        tcode += 2;               /* Fall through */        tcode += IMM2_SIZE;  /* Fall through */
1163    
1164        case OP_TYPESTAR:        case OP_TYPESTAR:
1165        case OP_TYPEMINSTAR:        case OP_TYPEMINSTAR:
# Line 1053  do Line 1175  do
1175          return SSB_FAIL;          return SSB_FAIL;
1176    
1177          case OP_HSPACE:          case OP_HSPACE:
1178          SET_BIT(0x09);          SET_BIT(CHAR_HT);
1179          SET_BIT(0x20);          SET_BIT(CHAR_SPACE);
1180          if (utf8)  #ifdef SUPPORT_UTF
1181            if (utf)
1182            {            {
1183    #ifdef COMPILE_PCRE8
1184            SET_BIT(0xC2);  /* For U+00A0 */            SET_BIT(0xC2);  /* For U+00A0 */
1185            SET_BIT(0xE1);  /* For U+1680, U+180E */            SET_BIT(0xE1);  /* For U+1680, U+180E */
1186            SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */            SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1187            SET_BIT(0xE3);  /* For U+3000 */            SET_BIT(0xE3);  /* For U+3000 */
1188    #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1189              SET_BIT(0xA0);
1190              SET_BIT(0xFF);  /* For characters > 255 */
1191    #endif  /* COMPILE_PCRE[8|16|32] */
1192            }            }
1193          else SET_BIT(0xA0);          else
1194    #endif /* SUPPORT_UTF */
1195    #ifndef EBCDIC
1196              SET_BIT(0xA0);
1197    #endif  /* Not EBCDIC */
1198          break;          break;
1199    
1200          case OP_ANYNL:          case OP_ANYNL:
1201          case OP_VSPACE:          case OP_VSPACE:
1202          SET_BIT(0x0A);          SET_BIT(CHAR_LF);
1203          SET_BIT(0x0B);          SET_BIT(CHAR_VT);
1204          SET_BIT(0x0C);          SET_BIT(CHAR_FF);
1205          SET_BIT(0x0D);          SET_BIT(CHAR_CR);
1206          if (utf8)  #ifdef SUPPORT_UTF
1207            if (utf)
1208            {            {
1209    #ifdef COMPILE_PCRE8
1210            SET_BIT(0xC2);  /* For U+0085 */            SET_BIT(0xC2);  /* For U+0085 */
1211            SET_BIT(0xE2);  /* For U+2028, U+2029 */            SET_BIT(0xE2);  /* For U+2028, U+2029 */
1212    #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1213              SET_BIT(CHAR_NEL);
1214              SET_BIT(0xFF);  /* For characters > 255 */
1215    #endif  /* COMPILE_PCRE16 */
1216            }            }
1217          else SET_BIT(0x85);          else
1218    #endif /* SUPPORT_UTF */
1219              SET_BIT(CHAR_NEL);
1220          break;          break;
1221    
1222          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
# Line 1087  do Line 1227  do
1227          set_type_bits(start_bits, cbit_digit, table_limit, cd);          set_type_bits(start_bits, cbit_digit, table_limit, cd);
1228          break;          break;
1229    
1230          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we no longer
1231          ensure it gets set as not whitespace. */          have to play fancy tricks because Perl added VT to its whitespace at
1232            release 5.18. PCRE added it at release 8.34. */
1233    
1234          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
1235          set_nottype_bits(start_bits, cbit_space, table_limit, cd);          set_nottype_bits(start_bits, cbit_space, table_limit, cd);
         start_bits[1] |= 0x08;  
1236          break;          break;
1237    
         /* The cbit_space table has vertical tab as whitespace; we have to  
         avoid setting it. */  
   
1238          case OP_WHITESPACE:          case OP_WHITESPACE:
         c = start_bits[1];    /* Save in case it was already set */  
1239          set_type_bits(start_bits, cbit_space, table_limit, cd);          set_type_bits(start_bits, cbit_space, table_limit, cd);
         start_bits[1] = (start_bits[1] & ~0x08) | c;  
1240          break;          break;
1241    
1242          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
# Line 1123  do Line 1258  do
1258        character with a value > 255. */        character with a value > 255. */
1259    
1260        case OP_NCLASS:        case OP_NCLASS:
1261  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1262        if (utf8)        if (utf)
1263          {          {
1264          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
1265          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
1266          }          }
1267  #endif  #endif
1268    #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1269          SET_BIT(0xFF);                         /* For characters > 255 */
1270    #endif
1271        /* Fall through */        /* Fall through */
1272    
1273        case OP_CLASS:        case OP_CLASS:
1274          {          {
1275            pcre_uint8 *map;
1276          tcode++;          tcode++;
1277            map = (pcre_uint8 *)tcode;
1278    
1279          /* In UTF-8 mode, the bits in a bit map correspond to character          /* In UTF-8 mode, the bits in a bit map correspond to character
1280          values, not to byte values. However, the bit map we are constructing is          values, not to byte values. However, the bit map we are constructing is
# Line 1142  do Line 1282  do
1282          value is > 127. In fact, there are only two possible starting bytes for          value is > 127. In fact, there are only two possible starting bytes for
1283          characters in the range 128 - 255. */          characters in the range 128 - 255. */
1284    
1285  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1286          if (utf8)          if (utf)
1287            {            {
1288            for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];            for (c = 0; c < 16; c++) start_bits[c] |= map[c];
1289            for (c = 128; c < 256; c++)            for (c = 128; c < 256; c++)
1290              {              {
1291              if ((tcode[c/8] && (1 << (c&7))) != 0)              if ((map[c/8] && (1 << (c&7))) != 0)
1292                {                {
1293                int d = (c >> 6) | 0xc0;            /* Set bit for this starter */                int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
1294                start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */                start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
# Line 1156  do Line 1296  do
1296                }                }
1297              }              }
1298            }            }
   
         /* In non-UTF-8 mode, the two bit maps are completely compatible. */  
   
1299          else          else
1300  #endif  #endif
1301            {            {
1302            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];            /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1303              for (c = 0; c < 32; c++) start_bits[c] |= map[c];
1304            }            }
1305    
1306          /* Advance past the bit map, and act on what follows. For a zero          /* Advance past the bit map, and act on what follows. For a zero
1307          minimum repeat, continue; otherwise stop processing. */          minimum repeat, continue; otherwise stop processing. */
1308    
1309          tcode += 32;          tcode += 32 / sizeof(pcre_uchar);
1310          switch (*tcode)          switch (*tcode)
1311            {            {
1312            case OP_CRSTAR:            case OP_CRSTAR:
1313            case OP_CRMINSTAR:            case OP_CRMINSTAR:
1314            case OP_CRQUERY:            case OP_CRQUERY:
1315            case OP_CRMINQUERY:            case OP_CRMINQUERY:
1316              case OP_CRPOSSTAR:
1317              case OP_CRPOSQUERY:
1318            tcode++;            tcode++;
1319            break;            break;
1320    
1321            case OP_CRRANGE:            case OP_CRRANGE:
1322            case OP_CRMINRANGE:            case OP_CRMINRANGE:
1323            if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;            case OP_CRPOSRANGE:
1324              if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1325              else try_next = FALSE;              else try_next = FALSE;
1326            break;            break;
1327    
# Line 1209  return yield; Line 1350  return yield;
1350  *************************************************/  *************************************************/
1351    
1352  /* This function is handed a compiled expression that it must study to produce  /* This function is handed a compiled expression that it must study to produce
1353  information that will speed up the matching. It returns a pcre_extra block  information that will speed up the matching. It returns a pcre[16]_extra block
1354  which then gets handed back to pcre_exec().  which then gets handed back to pcre_exec().
1355    
1356  Arguments:  Arguments:
# Line 1218  Arguments: Line 1359  Arguments:
1359    errorptr  points to where to place error messages;    errorptr  points to where to place error messages;
1360              set NULL unless error              set NULL unless error
1361    
1362  Returns:    pointer to a pcre_extra block, with study_data filled in and the  Returns:    pointer to a pcre[16]_extra block, with study_data filled in and
1363                appropriate flags set;                the appropriate flags set;
1364              NULL on error or if no optimization possible              NULL on error or if no optimization possible
1365  */  */
1366    
1367    #if defined COMPILE_PCRE8
1368  PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION  PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1369  pcre_study(const pcre *external_re, int options, const char **errorptr)  pcre_study(const pcre *external_re, int options, const char **errorptr)
1370    #elif defined COMPILE_PCRE16
1371    PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
1372    pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
1373    #elif defined COMPILE_PCRE32
1374    PCRE_EXP_DEFN pcre32_extra * PCRE_CALL_CONVENTION
1375    pcre32_study(const pcre32 *external_re, int options, const char **errorptr)
1376    #endif
1377  {  {
1378  int min;  int min;
1379  BOOL bits_set = FALSE;  BOOL bits_set = FALSE;
1380  BOOL had_accept = FALSE;  pcre_uint8 start_bits[32];
1381  uschar start_bits[32];  PUBL(extra) *extra = NULL;
 pcre_extra *extra;  
1382  pcre_study_data *study;  pcre_study_data *study;
1383  const uschar *tables;  const pcre_uint8 *tables;
1384  uschar *code;  pcre_uchar *code;
1385  compile_data compile_block;  compile_data compile_block;
1386  const real_pcre *re = (const real_pcre *)external_re;  const REAL_PCRE *re = (const REAL_PCRE *)external_re;
1387    
1388    
1389  *errorptr = NULL;  *errorptr = NULL;
1390    
# Line 1245  if (re == NULL || re->magic_number != MA Line 1394  if (re == NULL || re->magic_number != MA
1394    return NULL;    return NULL;
1395    }    }
1396    
1397    if ((re->flags & PCRE_MODE) == 0)
1398      {
1399    #if defined COMPILE_PCRE8
1400      *errorptr = "argument not compiled in 8 bit mode";
1401    #elif defined COMPILE_PCRE16
1402      *errorptr = "argument not compiled in 16 bit mode";
1403    #elif defined COMPILE_PCRE32
1404      *errorptr = "argument not compiled in 32 bit mode";
1405    #endif
1406      return NULL;
1407      }
1408    
1409  if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)  if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1410    {    {
1411    *errorptr = "unknown or incorrect option bit(s) set";    *errorptr = "unknown or incorrect option bit(s) set";
1412    return NULL;    return NULL;
1413    }    }
1414    
1415  code = (uschar *)re + re->name_table_offset +  code = (pcre_uchar *)re + re->name_table_offset +
1416    (re->name_count * re->name_entry_size);    (re->name_count * re->name_entry_size);
1417    
1418  /* For an anchored pattern, or an unanchored pattern that has a first char, or  /* For an anchored pattern, or an unanchored pattern that has a first char, or
# Line 1266  if ((re->options & PCRE_ANCHORED) == 0 & Line 1427  if ((re->options & PCRE_ANCHORED) == 0 &
1427    /* Set the character tables in the block that is passed around */    /* Set the character tables in the block that is passed around */
1428    
1429    tables = re->tables;    tables = re->tables;
1430    
1431    #if defined COMPILE_PCRE8
1432    if (tables == NULL)    if (tables == NULL)
1433      (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,      (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1434      (void *)(&tables));      (void *)(&tables));
1435    #elif defined COMPILE_PCRE16
1436      if (tables == NULL)
1437        (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1438        (void *)(&tables));
1439    #elif defined COMPILE_PCRE32
1440      if (tables == NULL)
1441        (void)pcre32_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1442        (void *)(&tables));
1443    #endif
1444    
1445    compile_block.lcc = tables + lcc_offset;    compile_block.lcc = tables + lcc_offset;
1446    compile_block.fcc = tables + fcc_offset;    compile_block.fcc = tables + fcc_offset;
# Line 1277  if ((re->options & PCRE_ANCHORED) == 0 & Line 1449  if ((re->options & PCRE_ANCHORED) == 0 &
1449    
1450    /* See if we can find a fixed set of initial characters for the pattern. */    /* See if we can find a fixed set of initial characters for the pattern. */
1451    
1452    memset(start_bits, 0, 32 * sizeof(uschar));    memset(start_bits, 0, 32 * sizeof(pcre_uint8));
1453    rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,    rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1454      &compile_block);      &compile_block);
1455    bits_set = rc == SSB_DONE;    bits_set = rc == SSB_DONE;
1456    if (rc == SSB_UNKNOWN) *errorptr = "internal error: opcode not recognized";    if (rc == SSB_UNKNOWN)
1457        {
1458        *errorptr = "internal error: opcode not recognized";
1459        return NULL;
1460        }
1461    }    }
1462    
1463  /* Find the minimum length of subject string. */  /* Find the minimum length of subject string. */
1464    
1465  switch(min = find_minlength(code, code, re->options, &had_accept, 0))  switch(min = find_minlength(re, code, code, re->options, 0))
1466    {    {
1467    case -2: *errorptr = "internal error: missing capturing bracket"; break;    case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
1468    case -3: *errorptr = "internal error: opcode not recognized"; break;    case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
1469    default: break;    default: break;
1470    }    }
1471    
1472  /* Return NULL if there's been an error or if no optimization is possible. */  /* If a set of starting bytes has been identified, or if the minimum length is
1473    greater than zero, or if JIT optimization has been requested, or if
1474    PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a
1475    pcre_study_data block. The study data is put in the latter, which is pointed to
1476    by the former, which may also get additional data set later by the calling
1477    program. At the moment, the size of pcre_study_data is fixed. We nevertheless
1478    save it in a field for returning via the pcre_fullinfo() function so that if it
1479    becomes variable in the future, we don't have to change that code. */
1480    
1481    if (bits_set || min > 0 || (options & (
1482    #ifdef SUPPORT_JIT
1483        PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE |
1484        PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE |
1485    #endif
1486        PCRE_STUDY_EXTRA_NEEDED)) != 0)
1487      {
1488      extra = (PUBL(extra) *)(PUBL(malloc))
1489        (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
1490      if (extra == NULL)
1491        {
1492        *errorptr = "failed to get memory";
1493        return NULL;
1494        }
1495    
1496  if (*errorptr != NULL || (!bits_set && min < 0)) return NULL;    study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
1497      extra->flags = PCRE_EXTRA_STUDY_DATA;
1498      extra->study_data = study;
1499    
1500      study->size = sizeof(pcre_study_data);
1501      study->flags = 0;
1502    
1503      /* Set the start bits always, to avoid unset memory errors if the
1504      study data is written to a file, but set the flag only if any of the bits
1505      are set, to save time looking when none are. */
1506    
1507  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in    if (bits_set)
1508  the latter, which is pointed to by the former, which may also get additional      {
1509  data set later by the calling program. At the moment, the size of      study->flags |= PCRE_STUDY_MAPPED;
1510  pcre_study_data is fixed. We nevertheless save it in a field for returning via      memcpy(study->start_bits, start_bits, sizeof(start_bits));
1511  the pcre_fullinfo() function so that if it becomes variable in the future, we      }
1512  don't have to change that code. */    else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
1513    
1514  extra = (pcre_extra *)(pcre_malloc)  #ifdef PCRE_DEBUG
1515    (sizeof(pcre_extra) + sizeof(pcre_study_data));    if (bits_set)
1516        {
1517        pcre_uint8 *ptr = start_bits;
1518        int i;
1519    
1520  if (extra == NULL)      printf("Start bits:\n");
1521    {      for (i = 0; i < 32; i++)
1522    *errorptr = "failed to get memory";        printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
1523    return NULL;      }
1524    }  #endif
1525    
1526  study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));    /* Always set the minlength value in the block, because the JIT compiler
1527  extra->flags = PCRE_EXTRA_STUDY_DATA;    makes use of it. However, don't set the bit unless the length is greater than
1528  extra->study_data = study;    zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
1529      checking the zero case. */
1530    
1531  study->size = sizeof(pcre_study_data);    if (min > 0)
1532  study->flags = 0;      {
1533        study->flags |= PCRE_STUDY_MINLEN;
1534        study->minlength = min;
1535        }
1536      else study->minlength = 0;
1537    
1538  if (bits_set)    /* If JIT support was compiled and requested, attempt the JIT compilation.
1539    {    If no starting bytes were found, and the minimum length is zero, and JIT
1540    study->flags |= PCRE_STUDY_MAPPED;    compilation fails, abandon the extra block and return NULL, unless
1541    memcpy(study->start_bits, start_bits, sizeof(start_bits));    PCRE_STUDY_EXTRA_NEEDED is set. */
1542    }  
1543    #ifdef SUPPORT_JIT
1544      extra->executable_jit = NULL;
1545      if ((options & PCRE_STUDY_JIT_COMPILE) != 0)
1546        PRIV(jit_compile)(re, extra, JIT_COMPILE);
1547      if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0)
1548        PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE);
1549      if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0)
1550        PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE);
1551    
1552  if (min >= 0)    if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0 &&
1553    {        (options & PCRE_STUDY_EXTRA_NEEDED) == 0)
1554    study->flags |= PCRE_STUDY_MINLEN;      {
1555    study->minlength = min;  #if defined COMPILE_PCRE8
1556        pcre_free_study(extra);
1557    #elif defined COMPILE_PCRE16
1558        pcre16_free_study(extra);
1559    #elif defined COMPILE_PCRE32
1560        pcre32_free_study(extra);
1561    #endif
1562        extra = NULL;
1563        }
1564    #endif
1565    }    }
1566    
1567  return extra;  return extra;
1568  }  }
1569    
1570    
1571    /*************************************************
1572    *          Free the study data                   *
1573    *************************************************/
1574    
1575    /* This function frees the memory that was obtained by pcre_study().
1576    
1577    Argument:   a pointer to the pcre[16]_extra block
1578    Returns:    nothing
1579    */
1580    
1581    #if defined COMPILE_PCRE8
1582    PCRE_EXP_DEFN void
1583    pcre_free_study(pcre_extra *extra)
1584    #elif defined COMPILE_PCRE16
1585    PCRE_EXP_DEFN void
1586    pcre16_free_study(pcre16_extra *extra)
1587    #elif defined COMPILE_PCRE32
1588    PCRE_EXP_DEFN void
1589    pcre32_free_study(pcre32_extra *extra)
1590    #endif
1591    {
1592    if (extra == NULL)
1593      return;
1594    #ifdef SUPPORT_JIT
1595    if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
1596         extra->executable_jit != NULL)
1597      PRIV(jit_free)(extra->executable_jit);
1598    #endif
1599    PUBL(free)(extra);
1600    }
1601    
1602  /* End of pcre_study.c */  /* End of pcre_study.c */

Legend:
Removed from v.657  
changed lines
  Added in v.1379

  ViewVC Help
Powered by ViewVC 1.1.5