/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory | Revision Log | View Patch

revision 689 by ph10, Fri Sep 9 10:34:57 2011 UTC | revision 1148 by zherczeg, Sat Oct 20 20:52:52 2012 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2010 University of Cambridge             Copyright (c) 1997-2012 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 66  string of that length that matches. In U Line 66  string of that length that matches. In U
66  rather than bytes.  rather than bytes.
67    
68  Arguments:  Arguments:
69    code        pointer to start of group (the bracket)    code            pointer to start of group (the bracket)
70    startcode   pointer to start of the whole pattern    startcode       pointer to start of the whole pattern
71    options     the compiling options    options         the compiling options
72    had_accept  pointer to flag for (*ACCEPT) encountered    int             RECURSE depth
   int         RECURSE depth  
73    
74  Returns:   the minimum length  Returns:   the minimum length
75             -1 if \C was encountered             -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
76             -2 internal error (missing capturing bracket)             -2 internal error (missing capturing bracket)
77             -3 internal error (opcode not listed)             -3 internal error (opcode not listed)
78  */  */
79    
80  static int  static int
81  find_minlength(const uschar *code, const uschar *startcode, int options,  find_minlength(const pcre_uchar *code, const pcre_uchar *startcode, int options,
82    BOOL *had_accept_ptr, int recurse_depth)    int recurse_depth)
83  {  {
84  int length = -1;  int length = -1;
85  BOOL utf8 = (options & PCRE_UTF8) != 0;  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
86    BOOL utf = (options & PCRE_UTF8) != 0;
87  BOOL had_recurse = FALSE;  BOOL had_recurse = FALSE;
88  register int branchlength = 0;  register int branchlength = 0;
89  register uschar *cc = (uschar *)code + 1 + LINK_SIZE;  register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
90    
91  if (*code == OP_CBRA || *code == OP_SCBRA ||  if (*code == OP_CBRA || *code == OP_SCBRA ||
92      *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += 2;      *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
93    
94  /* Scan along the opcodes for this branch. If we get to the end of the  /* Scan along the opcodes for this branch. If we get to the end of the
95  branch, check the length against that of the other branches. */  branch, check the length against that of the other branches. */
# Line 97  branch, check the length against that of Line 97  branch, check the length against that of
97  for (;;)  for (;;)
98    {    {
99    int d, min;    int d, min;
100    uschar *cs, *ce;    pcre_uchar *cs, *ce;
101    register int op = *cc;    register pcre_uchar op = *cc;
102    
103    switch (op)    switch (op)
104      {      {
# Line 128  for (;;) Line 128  for (;;)
128      case OP_BRAPOS:      case OP_BRAPOS:
129      case OP_SBRAPOS:      case OP_SBRAPOS:
130      case OP_ONCE:      case OP_ONCE:
131      d = find_minlength(cc, startcode, options, had_accept_ptr, recurse_depth);      case OP_ONCE_NC:
132        d = find_minlength(cc, startcode, options, recurse_depth);
133      if (d < 0) return d;      if (d < 0) return d;
134      branchlength += d;      branchlength += d;
     if (*had_accept_ptr) return branchlength;  
135      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
136      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
137      break;      break;
138    
139      /* Reached end of a branch; if it's a ket it is the end of a nested      /* ACCEPT makes things far too complicated; we have to give up. */
     call. If it's ALT it is an alternation in a nested call. If it is END it's  
     the end of the outer call. All can be handled by the same code. If it is  
     ACCEPT, it is essentially the same as END, but we set a flag so that  
     counting stops. */  
140    
141      case OP_ACCEPT:      case OP_ACCEPT:
142      case OP_ASSERT_ACCEPT:      case OP_ASSERT_ACCEPT:
143      *had_accept_ptr = TRUE;      return -1;
144      /* Fall through */  
145        /* Reached end of a branch; if it's a ket it is the end of a nested
146        call. If it's ALT it is an alternation in a nested call. If it is END it's
147        the end of the outer call. All can be handled by the same code. If an
148        ACCEPT was previously encountered, use the length that was in force at that
149        time, and pass back the shortest ACCEPT length. */
150    
151      case OP_ALT:      case OP_ALT:
152      case OP_KET:      case OP_KET:
153      case OP_KETRMAX:      case OP_KETRMAX:
# Line 188  for (;;) Line 190  for (;;)
190      case OP_DOLLM:      case OP_DOLLM:
191      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
192      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
193      cc += _pcre_OP_lengths[*cc];      cc += PRIV(OP_lengths)[*cc];
194      break;      break;
195    
196      /* Skip over a subpattern that has a {0} or {0,x} quantifier */      /* Skip over a subpattern that has a {0} or {0,x} quantifier */
# Line 197  for (;;) Line 199  for (;;)
199      case OP_BRAMINZERO:      case OP_BRAMINZERO:
200      case OP_BRAPOSZERO:      case OP_BRAPOSZERO:
201      case OP_SKIPZERO:      case OP_SKIPZERO:
202      cc += _pcre_OP_lengths[*cc];      cc += PRIV(OP_lengths)[*cc];
203      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
204      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
205      break;      break;
# Line 222  for (;;) Line 224  for (;;)
224      case OP_NOTPOSPLUSI:      case OP_NOTPOSPLUSI:
225      branchlength++;      branchlength++;
226      cc += 2;      cc += 2;
227  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
228      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
229  #endif  #endif
230      break;      break;
231    
# Line 242  for (;;) Line 244  for (;;)
244      case OP_NOTEXACT:      case OP_NOTEXACT:
245      case OP_NOTEXACTI:      case OP_NOTEXACTI:
246      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
247      cc += 4;      cc += 2 + IMM2_SIZE;
248  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
249      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
250  #endif  #endif
251      break;      break;
252    
253      case OP_TYPEEXACT:      case OP_TYPEEXACT:
254      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
255      cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;      cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
256          || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
257      break;      break;
258    
259      /* Handle single-char non-literal matchers */      /* Handle single-char non-literal matchers */
# Line 285  for (;;) Line 288  for (;;)
288      cc++;      cc++;
289      break;      break;
290    
291      /* The single-byte matcher means we can't proceed in UTF-8 mode */      /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
292        non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
293        appear, but leave the code, just in case.) */
294    
295      case OP_ANYBYTE:      case OP_ANYBYTE:
296  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
297      if (utf8) return -1;      if (utf) return -1;
298  #endif  #endif
299      branchlength++;      branchlength++;
300      cc++;      cc++;
# Line 305  for (;;) Line 310  for (;;)
310      case OP_TYPEPOSSTAR:      case OP_TYPEPOSSTAR:
311      case OP_TYPEPOSQUERY:      case OP_TYPEPOSQUERY:
312      if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;      if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
313      cc += _pcre_OP_lengths[op];      cc += PRIV(OP_lengths)[op];
314      break;      break;
315    
316      case OP_TYPEUPTO:      case OP_TYPEUPTO:
317      case OP_TYPEMINUPTO:      case OP_TYPEMINUPTO:
318      case OP_TYPEPOSUPTO:      case OP_TYPEPOSUPTO:
319      if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;      if (cc[1 + IMM2_SIZE] == OP_PROP
320      cc += _pcre_OP_lengths[op];        || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
321        cc += PRIV(OP_lengths)[op];
322      break;      break;
323    
324      /* Check a class for variable quantification */      /* Check a class for variable quantification */
325    
 #ifdef SUPPORT_UTF8  
     case OP_XCLASS:  
     cc += GET(cc, 1) - 33;  
     /* Fall through */  
 #endif  
   
326      case OP_CLASS:      case OP_CLASS:
327      case OP_NCLASS:      case OP_NCLASS:
328      cc += 33;  #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
329        case OP_XCLASS:
330        /* The original code caused an unsigned overflow in 64 bit systems,
331        so now we use a conditional statement. */
332        if (op == OP_XCLASS)
333          cc += GET(cc, 1);
334        else
335          cc += PRIV(OP_lengths)[OP_CLASS];
336    #else
337        cc += PRIV(OP_lengths)[OP_CLASS];
338    #endif
339    
340      switch (*cc)      switch (*cc)
341        {        {
# Line 344  for (;;) Line 354  for (;;)
354        case OP_CRRANGE:        case OP_CRRANGE:
355        case OP_CRMINRANGE:        case OP_CRMINRANGE:
356        branchlength += GET2(cc,1);        branchlength += GET2(cc,1);
357        cc += 5;        cc += 1 + 2 * IMM2_SIZE;
358        break;        break;
359    
360        default:        default:
# Line 369  for (;;) Line 379  for (;;)
379      case OP_REFI:      case OP_REFI:
380      if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)      if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
381        {        {
382        ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));        ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
383        if (cs == NULL) return -2;        if (cs == NULL) return -2;
384        do ce += GET(ce, 1); while (*ce == OP_ALT);        do ce += GET(ce, 1); while (*ce == OP_ALT);
385        if (cc > cs && cc < ce)        if (cc > cs && cc < ce)
# Line 379  for (;;) Line 389  for (;;)
389          }          }
390        else        else
391          {          {
392          d = find_minlength(cs, startcode, options, had_accept_ptr,          d = find_minlength(cs, startcode, options, recurse_depth);
           recurse_depth);  
         *had_accept_ptr = FALSE;  
393          }          }
394        }        }
395      else d = 0;      else d = 0;
396      cc += 3;      cc += 1 + IMM2_SIZE;
397    
398      /* Handle repeated back references */      /* Handle repeated back references */
399    
# Line 408  for (;;) Line 416  for (;;)
416        case OP_CRRANGE:        case OP_CRRANGE:
417        case OP_CRMINRANGE:        case OP_CRMINRANGE:
418        min = GET2(cc, 1);        min = GET2(cc, 1);
419        cc += 5;        cc += 1 + 2 * IMM2_SIZE;
420        break;        break;
421    
422        default:        default:
# Line 423  for (;;) Line 431  for (;;)
431      caught by a recursion depth count. */      caught by a recursion depth count. */
432    
433      case OP_RECURSE:      case OP_RECURSE:
434      cs = ce = (uschar *)startcode + GET(cc, 1);      cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
     if (cs == NULL) return -2;  
435      do ce += GET(ce, 1); while (*ce == OP_ALT);      do ce += GET(ce, 1); while (*ce == OP_ALT);
436      if ((cc > cs && cc < ce) || recurse_depth > 10)      if ((cc > cs && cc < ce) || recurse_depth > 10)
437        had_recurse = TRUE;        had_recurse = TRUE;
438      else      else
439        {        {
440        branchlength += find_minlength(cs, startcode, options, had_accept_ptr,        branchlength += find_minlength(cs, startcode, options, recurse_depth + 1);
         recurse_depth + 1);  
       *had_accept_ptr = FALSE;  
441        }        }
442      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
443      break;      break;
# Line 484  for (;;) Line 489  for (;;)
489      case OP_NOTPOSQUERY:      case OP_NOTPOSQUERY:
490      case OP_NOTPOSQUERYI:      case OP_NOTPOSQUERYI:
491    
492      cc += _pcre_OP_lengths[op];      cc += PRIV(OP_lengths)[op];
493  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
494      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
495  #endif  #endif
496      break;      break;
497    
# Line 495  for (;;) Line 500  for (;;)
500      case OP_MARK:      case OP_MARK:
501      case OP_PRUNE_ARG:      case OP_PRUNE_ARG:
502      case OP_SKIP_ARG:      case OP_SKIP_ARG:
     cc += _pcre_OP_lengths[op] + cc[1];  
     break;  
   
503      case OP_THEN_ARG:      case OP_THEN_ARG:
504      cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];      cc += PRIV(OP_lengths)[op] + cc[1];
505      break;      break;
506    
507      /* The remaining opcodes are just skipped over. */      /* The remaining opcodes are just skipped over. */
# Line 511  for (;;) Line 513  for (;;)
513      case OP_SET_SOM:      case OP_SET_SOM:
514      case OP_SKIP:      case OP_SKIP:
515      case OP_THEN:      case OP_THEN:
516      cc += _pcre_OP_lengths[op];      cc += PRIV(OP_lengths)[op];
517      break;      break;
518    
519      /* This should not occur: we list all opcodes explicitly so that when      /* This should not occur: we list all opcodes explicitly so that when
# Line 540  Arguments: Line 542  Arguments:
542    p             points to the character    p             points to the character
543    caseless      the caseless flag    caseless      the caseless flag
544    cd            the block with char table pointers    cd            the block with char table pointers
545    utf8          TRUE for UTF-8 mode    utf           TRUE for UTF-8 / UTF-16 / UTF-32 mode
546    
547  Returns:        pointer after the character  Returns:        pointer after the character
548  */  */
549    
550  static const uschar *  static const pcre_uchar *
551  set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,  set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
552    compile_data *cd, BOOL utf8)    compile_data *cd, BOOL utf)
553  {  {
554  unsigned int c = *p;  pcre_uint32 c = *p;
555    
556    #ifdef COMPILE_PCRE8
557  SET_BIT(c);  SET_BIT(c);
558    
559  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
560  if (utf8 && c > 127)  if (utf && c > 127)
561    {    {
562    GETCHARINC(c, p);    GETCHARINC(c, p);
563  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
564    if (caseless)    if (caseless)
565      {      {
566      uschar buff[8];      pcre_uchar buff[6];
567      c = UCD_OTHERCASE(c);      c = UCD_OTHERCASE(c);
568      (void)_pcre_ord2utf8(c, buff);      (void)PRIV(ord2utf)(c, buff);
569      SET_BIT(buff[0]);      SET_BIT(buff[0]);
570      }      }
571  #endif  #endif  /* Not SUPPORT_UCP */
572    return p;    return p;
573    }    }
574  #endif  #else   /* Not SUPPORT_UTF */
575    (void)(utf);   /* Stops warning for unused parameter */
576    #endif  /* SUPPORT_UTF */
577    
578  /* Not UTF-8 mode, or character is less than 127. */  /* Not UTF-8 mode, or character is less than 127. */
579    
580  if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);  if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
581  return p + 1;  return p + 1;
582    #endif  /* COMPILE_PCRE8 */
583    
584    #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
585    if (c > 0xff)
586      {
587      c = 0xff;
588      caseless = FALSE;
589      }
590    SET_BIT(c);
591    
592    #ifdef SUPPORT_UTF
593    if (utf && c > 127)
594      {
595      GETCHARINC(c, p);
596    #ifdef SUPPORT_UCP
597      if (caseless)
598        {
599        c = UCD_OTHERCASE(c);
600        if (c > 0xff)
601          c = 0xff;
602        SET_BIT(c);
603        }
604    #endif  /* SUPPORT_UCP */
605      return p;
606      }
607    #else   /* Not SUPPORT_UTF */
608    (void)(utf);   /* Stops warning for unused parameter */
609    #endif  /* SUPPORT_UTF */
610    
611    if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
612    return p + 1;
613    #endif
614  }  }
615    
616    
# Line 599  Returns:         nothing Line 636  Returns:         nothing
636  */  */
637    
638  static void  static void
639  set_type_bits(uschar *start_bits, int cbit_type, int table_limit,  set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
640    compile_data *cd)    compile_data *cd)
641  {  {
642  register int c;  register pcre_uint32 c;
643  for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];  for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
644    #if defined SUPPORT_UTF && defined COMPILE_PCRE8
645  if (table_limit == 32) return;  if (table_limit == 32) return;
646  for (c = 128; c < 256; c++)  for (c = 128; c < 256; c++)
647    {    {
648    if ((cd->cbits[c/8] & (1 << (c&7))) != 0)    if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
649      {      {
650      uschar buff[8];      pcre_uchar buff[6];
651      (void)_pcre_ord2utf8(c, buff);      (void)PRIV(ord2utf)(c, buff);
652      SET_BIT(buff[0]);      SET_BIT(buff[0]);
653      }      }
654    }    }
655    #endif
656  }  }
657    
658    
# Line 639  Returns:         nothing Line 678  Returns:         nothing
678  */  */
679    
680  static void  static void
681  set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,  set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
682    compile_data *cd)    compile_data *cd)
683  {  {
684  register int c;  register pcre_uint32 c;
685  for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];  for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
686    #if defined SUPPORT_UTF && defined COMPILE_PCRE8
687  if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;  if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
688    #endif
689  }  }
690    
691    
# Line 664  function fails unless the result is SSB_ Line 705  function fails unless the result is SSB_
705  Arguments:  Arguments:
706    code         points to an expression    code         points to an expression
707    start_bits   points to a 32-byte table, initialized to 0    start_bits   points to a 32-byte table, initialized to 0
708    utf8         TRUE if in UTF-8 mode    utf          TRUE if in UTF-8 / UTF-16 / UTF-32 mode
709    cd           the block with char table pointers    cd           the block with char table pointers
710    
711  Returns:       SSB_FAIL     => Failed to find any starting bytes  Returns:       SSB_FAIL     => Failed to find any starting bytes
# Line 674  Returns:       SSB_FAIL     => Failed to Line 715  Returns:       SSB_FAIL     => Failed to
715  */  */
716    
717  static int  static int
718  set_start_bits(const uschar *code, uschar *start_bits, BOOL utf8,  set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
719    compile_data *cd)    compile_data *cd)
720  {  {
721  register int c;  register pcre_uint32 c;
722  int yield = SSB_DONE;  int yield = SSB_DONE;
723  int table_limit = utf8? 16:32;  #if defined SUPPORT_UTF && defined COMPILE_PCRE8
724    int table_limit = utf? 16:32;
725    #else
726    int table_limit = 32;
727    #endif
728    
729  #if 0  #if 0
730  /* ========================================================================= */  /* ========================================================================= */
# Line 701  volatile int dummy; Line 746  volatile int dummy;
746  do  do
747    {    {
748    BOOL try_next = TRUE;    BOOL try_next = TRUE;
749    const uschar *tcode = code + 1 + LINK_SIZE;    const pcre_uchar *tcode = code + 1 + LINK_SIZE;
750    
751    if (*code == OP_CBRA || *code == OP_SCBRA ||    if (*code == OP_CBRA || *code == OP_SCBRA ||
752        *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += 2;        *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
753    
754    while (try_next)    /* Loop for items in this branch */    while (try_next)    /* Loop for items in this branch */
755      {      {
# Line 790  do Line 835  do
835        case OP_SOM:        case OP_SOM:
836        case OP_THEN:        case OP_THEN:
837        case OP_THEN_ARG:        case OP_THEN_ARG:
838    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
839        case OP_XCLASS:        case OP_XCLASS:
840    #endif
841        return SSB_FAIL;        return SSB_FAIL;
842    
843        /* We can ignore word boundary tests. */        /* We can ignore word boundary tests. */
# Line 814  do Line 861  do
861        case OP_CBRAPOS:        case OP_CBRAPOS:
862        case OP_SCBRAPOS:        case OP_SCBRAPOS:
863        case OP_ONCE:        case OP_ONCE:
864          case OP_ONCE_NC:
865        case OP_ASSERT:        case OP_ASSERT:
866        rc = set_start_bits(tcode, start_bits, utf8, cd);        rc = set_start_bits(tcode, start_bits, utf, cd);
867        if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;        if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
868        if (rc == SSB_DONE) try_next = FALSE; else        if (rc == SSB_DONE) try_next = FALSE; else
869          {          {
# Line 862  do Line 910  do
910        case OP_BRAZERO:        case OP_BRAZERO:
911        case OP_BRAMINZERO:        case OP_BRAMINZERO:
912        case OP_BRAPOSZERO:        case OP_BRAPOSZERO:
913        rc = set_start_bits(++tcode, start_bits, utf8, cd);        rc = set_start_bits(++tcode, start_bits, utf, cd);
914        if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;        if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
915  /* =========================================================================  /* =========================================================================
916        See the comment at the head of this function concerning the next line,        See the comment at the head of this function concerning the next line,
# Line 889  do Line 937  do
937        case OP_QUERY:        case OP_QUERY:
938        case OP_MINQUERY:        case OP_MINQUERY:
939        case OP_POSQUERY:        case OP_POSQUERY:
940        tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);        tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
941        break;        break;
942    
943        case OP_STARI:        case OP_STARI:
# Line 898  do Line 946  do
946        case OP_QUERYI:        case OP_QUERYI:
947        case OP_MINQUERYI:        case OP_MINQUERYI:
948        case OP_POSQUERYI:        case OP_POSQUERYI:
949        tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);        tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
950        break;        break;
951    
952        /* Single-char upto sets the bit and tries the next */        /* Single-char upto sets the bit and tries the next */
# Line 906  do Line 954  do
954        case OP_UPTO:        case OP_UPTO:
955        case OP_MINUPTO:        case OP_MINUPTO:
956        case OP_POSUPTO:        case OP_POSUPTO:
957        tcode = set_table_bit(start_bits, tcode + 3, FALSE, cd, utf8);        tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
958        break;        break;
959    
960        case OP_UPTOI:        case OP_UPTOI:
961        case OP_MINUPTOI:        case OP_MINUPTOI:
962        case OP_POSUPTOI:        case OP_POSUPTOI:
963        tcode = set_table_bit(start_bits, tcode + 3, TRUE, cd, utf8);        tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
964        break;        break;
965    
966        /* At least one single char sets the bit and stops */        /* At least one single char sets the bit and stops */
967    
968        case OP_EXACT:        case OP_EXACT:
969        tcode += 2;        tcode += IMM2_SIZE;
970        /* Fall through */        /* Fall through */
971        case OP_CHAR:        case OP_CHAR:
972        case OP_PLUS:        case OP_PLUS:
973        case OP_MINPLUS:        case OP_MINPLUS:
974        case OP_POSPLUS:        case OP_POSPLUS:
975        (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);        (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
976        try_next = FALSE;        try_next = FALSE;
977        break;        break;
978    
979        case OP_EXACTI:        case OP_EXACTI:
980        tcode += 2;        tcode += IMM2_SIZE;
981        /* Fall through */        /* Fall through */
982        case OP_CHARI:        case OP_CHARI:
983        case OP_PLUSI:        case OP_PLUSI:
984        case OP_MINPLUSI:        case OP_MINPLUSI:
985        case OP_POSPLUSI:        case OP_POSPLUSI:
986        (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);        (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
987        try_next = FALSE;        try_next = FALSE;
988        break;        break;
989    
# Line 946  do Line 994  do
994        identical. */        identical. */
995    
996        case OP_HSPACE:        case OP_HSPACE:
997        SET_BIT(0x09);        SET_BIT(CHAR_HT);
998        SET_BIT(0x20);        SET_BIT(CHAR_SPACE);
999        if (utf8)  #ifdef SUPPORT_UTF
1000          if (utf)
1001          {          {
1002    #ifdef COMPILE_PCRE8
1003          SET_BIT(0xC2);  /* For U+00A0 */          SET_BIT(0xC2);  /* For U+00A0 */
1004          SET_BIT(0xE1);  /* For U+1680, U+180E */          SET_BIT(0xE1);  /* For U+1680, U+180E */
1005          SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */          SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1006          SET_BIT(0xE3);  /* For U+3000 */          SET_BIT(0xE3);  /* For U+3000 */
1007    #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1008            SET_BIT(0xA0);
1009            SET_BIT(0xFF);  /* For characters > 255 */
1010    #endif  /* COMPILE_PCRE[8|16|32] */
1011            }
1012          else
1013    #endif /* SUPPORT_UTF */
1014            {
1015    #ifndef EBCDIC
1016            SET_BIT(0xA0);
1017    #endif  /* Not EBCDIC */
1018    #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1019            SET_BIT(0xFF);  /* For characters > 255 */
1020    #endif  /* COMPILE_PCRE[16|32] */
1021          }          }
       else SET_BIT(0xA0);  
1022        try_next = FALSE;        try_next = FALSE;
1023        break;        break;
1024    
1025        case OP_ANYNL:        case OP_ANYNL:
1026        case OP_VSPACE:        case OP_VSPACE:
1027        SET_BIT(0x0A);        SET_BIT(CHAR_LF);
1028        SET_BIT(0x0B);        SET_BIT(CHAR_VT);
1029        SET_BIT(0x0C);        SET_BIT(CHAR_FF);
1030        SET_BIT(0x0D);        SET_BIT(CHAR_CR);
1031        if (utf8)  #ifdef SUPPORT_UTF
1032          if (utf)
1033          {          {
1034    #ifdef COMPILE_PCRE8
1035          SET_BIT(0xC2);  /* For U+0085 */          SET_BIT(0xC2);  /* For U+0085 */
1036          SET_BIT(0xE2);  /* For U+2028, U+2029 */          SET_BIT(0xE2);  /* For U+2028, U+2029 */
1037    #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1038            SET_BIT(CHAR_NEL);
1039            SET_BIT(0xFF);  /* For characters > 255 */
1040    #endif  /* COMPILE_PCRE[8|16|32] */
1041            }
1042          else
1043    #endif /* SUPPORT_UTF */
1044            {
1045            SET_BIT(CHAR_NEL);
1046    #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1047            SET_BIT(0xFF);  /* For characters > 255 */
1048    #endif
1049          }          }
       else SET_BIT(0x85);  
1050        try_next = FALSE;        try_next = FALSE;
1051        break;        break;
1052    
# Line 990  do Line 1066  do
1066        break;        break;
1067    
1068        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to
1069        ensure it is set as not whitespace. */        ensure it is set as not whitespace. Luckily, the code value is the same
1070          (0x0b) in ASCII and EBCDIC, so we can just adjust the appropriate bit. */
1071    
1072        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
1073        set_nottype_bits(start_bits, cbit_space, table_limit, cd);        set_nottype_bits(start_bits, cbit_space, table_limit, cd);
# Line 998  do Line 1075  do
1075        try_next = FALSE;        try_next = FALSE;
1076        break;        break;
1077    
1078        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to not
1079        not set it from the table. */        set it from the table. Luckily, the code value is the same (0x0b) in
1080          ASCII and EBCDIC, so we can just adjust the appropriate bit. */
1081    
1082        case OP_WHITESPACE:        case OP_WHITESPACE:
1083        c = start_bits[1];    /* Save in case it was already set */        c = start_bits[1];    /* Save in case it was already set */
# Line 1028  do Line 1106  do
1106        break;        break;
1107    
1108        case OP_TYPEEXACT:        case OP_TYPEEXACT:
1109        tcode += 3;        tcode += 1 + IMM2_SIZE;
1110        break;        break;
1111    
1112        /* Zero or more repeats of character types set the bits and then        /* Zero or more repeats of character types set the bits and then
# Line 1037  do Line 1115  do
1115        case OP_TYPEUPTO:        case OP_TYPEUPTO:
1116        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
1117        case OP_TYPEPOSUPTO:        case OP_TYPEPOSUPTO:
1118        tcode += 2;               /* Fall through */        tcode += IMM2_SIZE;  /* Fall through */
1119    
1120        case OP_TYPESTAR:        case OP_TYPESTAR:
1121        case OP_TYPEMINSTAR:        case OP_TYPEMINSTAR:
# Line 1053  do Line 1131  do
1131          return SSB_FAIL;          return SSB_FAIL;
1132    
1133          case OP_HSPACE:          case OP_HSPACE:
1134          SET_BIT(0x09);          SET_BIT(CHAR_HT);
1135          SET_BIT(0x20);          SET_BIT(CHAR_SPACE);
1136          if (utf8)  #ifdef SUPPORT_UTF
1137            if (utf)
1138            {            {
1139    #ifdef COMPILE_PCRE8
1140            SET_BIT(0xC2);  /* For U+00A0 */            SET_BIT(0xC2);  /* For U+00A0 */
1141            SET_BIT(0xE1);  /* For U+1680, U+180E */            SET_BIT(0xE1);  /* For U+1680, U+180E */
1142            SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */            SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1143            SET_BIT(0xE3);  /* For U+3000 */            SET_BIT(0xE3);  /* For U+3000 */
1144    #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1145              SET_BIT(0xA0);
1146              SET_BIT(0xFF);  /* For characters > 255 */
1147    #endif  /* COMPILE_PCRE[8|16|32] */
1148            }            }
1149          else SET_BIT(0xA0);          else
1150    #endif /* SUPPORT_UTF */
1151    #ifndef EBCDIC
1152              SET_BIT(0xA0);
1153    #endif  /* Not EBCDIC */
1154          break;          break;
1155    
1156          case OP_ANYNL:          case OP_ANYNL:
1157          case OP_VSPACE:          case OP_VSPACE:
1158          SET_BIT(0x0A);          SET_BIT(CHAR_LF);
1159          SET_BIT(0x0B);          SET_BIT(CHAR_VT);
1160          SET_BIT(0x0C);          SET_BIT(CHAR_FF);
1161          SET_BIT(0x0D);          SET_BIT(CHAR_CR);
1162          if (utf8)  #ifdef SUPPORT_UTF
1163            if (utf)
1164            {            {
1165    #ifdef COMPILE_PCRE8
1166            SET_BIT(0xC2);  /* For U+0085 */            SET_BIT(0xC2);  /* For U+0085 */
1167            SET_BIT(0xE2);  /* For U+2028, U+2029 */            SET_BIT(0xE2);  /* For U+2028, U+2029 */
1168    #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1169              SET_BIT(CHAR_NEL);
1170              SET_BIT(0xFF);  /* For characters > 255 */
1171    #endif  /* COMPILE_PCRE16 */
1172            }            }
1173          else SET_BIT(0x85);          else
1174    #endif /* SUPPORT_UTF */
1175              SET_BIT(CHAR_NEL);
1176          break;          break;
1177    
1178          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
# Line 1088  do Line 1184  do
1184          break;          break;
1185    
1186          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we have to
1187          ensure it gets set as not whitespace. */          ensure it gets set as not whitespace. Luckily, the code value is the
1188            same (0x0b) in ASCII and EBCDIC, so we can just adjust the appropriate
1189            bit. */
1190    
1191          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
1192          set_nottype_bits(start_bits, cbit_space, table_limit, cd);          set_nottype_bits(start_bits, cbit_space, table_limit, cd);
# Line 1096  do Line 1194  do
1194          break;          break;
1195    
1196          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we have to
1197          avoid setting it. */          avoid setting it. Luckily, the code value is the same (0x0b) in ASCII
1198            and EBCDIC, so we can just adjust the appropriate bit. */
1199    
1200          case OP_WHITESPACE:          case OP_WHITESPACE:
1201          c = start_bits[1];    /* Save in case it was already set */          c = start_bits[1];    /* Save in case it was already set */
# Line 1123  do Line 1222  do
1222        character with a value > 255. */        character with a value > 255. */
1223    
1224        case OP_NCLASS:        case OP_NCLASS:
1225  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1226        if (utf8)        if (utf)
1227          {          {
1228          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
1229          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
1230          }          }
1231  #endif  #endif
1232    #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1233          SET_BIT(0xFF);                         /* For characters > 255 */
1234    #endif
1235        /* Fall through */        /* Fall through */
1236    
1237        case OP_CLASS:        case OP_CLASS:
1238          {          {
1239            pcre_uint8 *map;
1240          tcode++;          tcode++;
1241            map = (pcre_uint8 *)tcode;
1242    
1243          /* In UTF-8 mode, the bits in a bit map correspond to character          /* In UTF-8 mode, the bits in a bit map correspond to character
1244          values, not to byte values. However, the bit map we are constructing is          values, not to byte values. However, the bit map we are constructing is
# Line 1142  do Line 1246  do
1246          value is > 127. In fact, there are only two possible starting bytes for          value is > 127. In fact, there are only two possible starting bytes for
1247          characters in the range 128 - 255. */          characters in the range 128 - 255. */
1248    
1249  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1250          if (utf8)          if (utf)
1251            {            {
1252            for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];            for (c = 0; c < 16; c++) start_bits[c] |= map[c];
1253            for (c = 128; c < 256; c++)            for (c = 128; c < 256; c++)
1254              {              {
1255              if ((tcode[c/8] && (1 << (c&7))) != 0)              if ((map[c/8] && (1 << (c&7))) != 0)
1256                {                {
1257                int d = (c >> 6) | 0xc0;            /* Set bit for this starter */                int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
1258                start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */                start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
# Line 1156  do Line 1260  do
1260                }                }
1261              }              }
1262            }            }
   
         /* In non-UTF-8 mode, the two bit maps are completely compatible. */  
   
1263          else          else
1264  #endif  #endif
1265            {            {
1266            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];            /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1267              for (c = 0; c < 32; c++) start_bits[c] |= map[c];
1268            }            }
1269    
1270          /* Advance past the bit map, and act on what follows. For a zero          /* Advance past the bit map, and act on what follows. For a zero
1271          minimum repeat, continue; otherwise stop processing. */          minimum repeat, continue; otherwise stop processing. */
1272    
1273          tcode += 32;          tcode += 32 / sizeof(pcre_uchar);
1274          switch (*tcode)          switch (*tcode)
1275            {            {
1276            case OP_CRSTAR:            case OP_CRSTAR:
# Line 1180  do Line 1282  do
1282    
1283            case OP_CRRANGE:            case OP_CRRANGE:
1284            case OP_CRMINRANGE:            case OP_CRMINRANGE:
1285            if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;            if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1286              else try_next = FALSE;              else try_next = FALSE;
1287            break;            break;
1288    
# Line 1209  return yield; Line 1311  return yield;
1311  *************************************************/  *************************************************/
1312    
1313  /* This function is handed a compiled expression that it must study to produce  /* This function is handed a compiled expression that it must study to produce
1314  information that will speed up the matching. It returns a pcre_extra block  information that will speed up the matching. It returns a pcre[16]_extra block
1315  which then gets handed back to pcre_exec().  which then gets handed back to pcre_exec().
1316    
1317  Arguments:  Arguments:
# Line 1218  Arguments: Line 1320  Arguments:
1320    errorptr  points to where to place error messages;    errorptr  points to where to place error messages;
1321              set NULL unless error              set NULL unless error
1322    
1323  Returns:    pointer to a pcre_extra block, with study_data filled in and the  Returns:    pointer to a pcre[16]_extra block, with study_data filled in and
1324                appropriate flags set;                the appropriate flags set;
1325              NULL on error or if no optimization possible              NULL on error or if no optimization possible
1326  */  */
1327    
1328    #if defined COMPILE_PCRE8
1329  PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION  PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1330  pcre_study(const pcre *external_re, int options, const char **errorptr)  pcre_study(const pcre *external_re, int options, const char **errorptr)
1331    #elif defined COMPILE_PCRE16
1332    PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
1333    pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
1334    #elif defined COMPILE_PCRE32
1335    PCRE_EXP_DEFN pcre32_extra * PCRE_CALL_CONVENTION
1336    pcre32_study(const pcre32 *external_re, int options, const char **errorptr)
1337    #endif
1338  {  {
1339  int min;  int min;
1340  BOOL bits_set = FALSE;  BOOL bits_set = FALSE;
1341  BOOL had_accept = FALSE;  pcre_uint8 start_bits[32];
1342  uschar start_bits[32];  PUBL(extra) *extra = NULL;
 pcre_extra *extra = NULL;  
1343  pcre_study_data *study;  pcre_study_data *study;
1344  const uschar *tables;  const pcre_uint8 *tables;
1345  uschar *code;  pcre_uchar *code;
1346  compile_data compile_block;  compile_data compile_block;
1347  const real_pcre *re = (const real_pcre *)external_re;  const REAL_PCRE *re = (const REAL_PCRE *)external_re;
1348    
1349  *errorptr = NULL;  *errorptr = NULL;
1350    
# Line 1245  if (re == NULL || re->magic_number != MA Line 1354  if (re == NULL || re->magic_number != MA
1354    return NULL;    return NULL;
1355    }    }
1356    
1357    if ((re->flags & PCRE_MODE) == 0)
1358      {
1359    #if defined COMPILE_PCRE8
1360      *errorptr = "argument not compiled in 8 bit mode";
1361    #elif defined COMPILE_PCRE16
1362      *errorptr = "argument not compiled in 16 bit mode";
1363    #elif defined COMPILE_PCRE32
1364      *errorptr = "argument not compiled in 32 bit mode";
1365    #endif
1366      return NULL;
1367      }
1368    
1369  if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)  if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1370    {    {
1371    *errorptr = "unknown or incorrect option bit(s) set";    *errorptr = "unknown or incorrect option bit(s) set";
1372    return NULL;    return NULL;
1373    }    }
1374    
1375  code = (uschar *)re + re->name_table_offset +  code = (pcre_uchar *)re + re->name_table_offset +
1376    (re->name_count * re->name_entry_size);    (re->name_count * re->name_entry_size);
1377    
1378  /* For an anchored pattern, or an unanchored pattern that has a first char, or  /* For an anchored pattern, or an unanchored pattern that has a first char, or
# Line 1266  if ((re->options & PCRE_ANCHORED) == 0 & Line 1387  if ((re->options & PCRE_ANCHORED) == 0 &
1387    /* Set the character tables in the block that is passed around */    /* Set the character tables in the block that is passed around */
1388    
1389    tables = re->tables;    tables = re->tables;
1390    
1391    #if defined COMPILE_PCRE8
1392    if (tables == NULL)    if (tables == NULL)
1393      (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,      (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1394      (void *)(&tables));      (void *)(&tables));
1395    #elif defined COMPILE_PCRE16
1396      if (tables == NULL)
1397        (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1398        (void *)(&tables));
1399    #elif defined COMPILE_PCRE32
1400      if (tables == NULL)
1401        (void)pcre32_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1402        (void *)(&tables));
1403    #endif
1404    
1405    compile_block.lcc = tables + lcc_offset;    compile_block.lcc = tables + lcc_offset;
1406    compile_block.fcc = tables + fcc_offset;    compile_block.fcc = tables + fcc_offset;
# Line 1277  if ((re->options & PCRE_ANCHORED) == 0 & Line 1409  if ((re->options & PCRE_ANCHORED) == 0 &
1409    
1410    /* See if we can find a fixed set of initial characters for the pattern. */    /* See if we can find a fixed set of initial characters for the pattern. */
1411    
1412    memset(start_bits, 0, 32 * sizeof(uschar));    memset(start_bits, 0, 32 * sizeof(pcre_uint8));
1413    rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,    rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1414      &compile_block);      &compile_block);
1415    bits_set = rc == SSB_DONE;    bits_set = rc == SSB_DONE;
1416    if (rc == SSB_UNKNOWN)    if (rc == SSB_UNKNOWN)
1417      {      {
1418      *errorptr = "internal error: opcode not recognized";      *errorptr = "internal error: opcode not recognized";
1419      return NULL;      return NULL;
1420      }      }
1421    }    }
1422    
1423  /* Find the minimum length of subject string. */  /* Find the minimum length of subject string. */
1424    
1425  switch(min = find_minlength(code, code, re->options, &had_accept, 0))  switch(min = find_minlength(code, code, re->options, 0))
1426    {    {
1427    case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;    case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
1428    case -3: *errorptr = "internal error: opcode not recognized"; return NULL;    case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
# Line 1298  switch(min = find_minlength(code, code, Line 1430  switch(min = find_minlength(code, code,
1430    }    }
1431    
1432  /* If a set of starting bytes has been identified, or if the minimum length is  /* If a set of starting bytes has been identified, or if the minimum length is
1433  greater than zero, or if JIT optimization has been requested, get a pcre_extra  greater than zero, or if JIT optimization has been requested, or if
1434  block and a pcre_study_data block. The study data is put in the latter, which  PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a
1435  is pointed to by the former, which may also get additional data set later by  pcre_study_data block. The study data is put in the latter, which is pointed to
1436  the calling program. At the moment, the size of pcre_study_data is fixed. We  by the former, which may also get additional data set later by the calling
1437  nevertheless save it in a field for returning via the pcre_fullinfo() function  program. At the moment, the size of pcre_study_data is fixed. We nevertheless
1438  so that if it becomes variable in the future, we don't have to change that  save it in a field for returning via the pcre_fullinfo() function so that if it
1439  code. */  becomes variable in the future, we don't have to change that code. */
1440    
1441  if (bits_set || min > 0  if (bits_set || min > 0 || (options & (
1442  #ifdef SUPPORT_JIT  #ifdef SUPPORT_JIT
1443      || (options & PCRE_STUDY_JIT_COMPILE) != 0      PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE |
1444        PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE |
1445  #endif  #endif
1446    )      PCRE_STUDY_EXTRA_NEEDED)) != 0)
1447    {    {
1448    extra = (pcre_extra *)(pcre_malloc)    extra = (PUBL(extra) *)(PUBL(malloc))
1449      (sizeof(pcre_extra) + sizeof(pcre_study_data));      (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
1450    if (extra == NULL)    if (extra == NULL)
1451      {      {
1452      *errorptr = "failed to get memory";      *errorptr = "failed to get memory";
1453      return NULL;      return NULL;
1454      }      }
1455    
1456    study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));    study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
1457    extra->flags = PCRE_EXTRA_STUDY_DATA;    extra->flags = PCRE_EXTRA_STUDY_DATA;
1458    extra->study_data = study;    extra->study_data = study;
1459    
1460    study->size = sizeof(pcre_study_data);    study->size = sizeof(pcre_study_data);
1461    study->flags = 0;    study->flags = 0;
1462    
1463      /* Set the start bits always, to avoid unset memory errors if the
1464      study data is written to a file, but set the flag only if any of the bits
1465      are set, to save time looking when none are. */
1466    
1467    if (bits_set)    if (bits_set)
1468      {      {
1469      study->flags |= PCRE_STUDY_MAPPED;      study->flags |= PCRE_STUDY_MAPPED;
1470      memcpy(study->start_bits, start_bits, sizeof(start_bits));      memcpy(study->start_bits, start_bits, sizeof(start_bits));
1471      }      }
1472      else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
1473    /* Always set the minlength value in the block, because the JIT compiler  
1474    makes use of it. However, don't set the bit unless the length is greater than  #ifdef PCRE_DEBUG
1475    zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time    if (bits_set)
1476    checking this case. */      {
1477        pcre_uint8 *ptr = start_bits;
1478    study->minlength = min;      int i;
1479    if (min > 0) study->flags |= PCRE_STUDY_MINLEN;  
1480        printf("Start bits:\n");
1481        for (i = 0; i < 32; i++)
1482          printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
1483        }
1484    #endif
1485    
1486      /* Always set the minlength value in the block, because the JIT compiler
1487      makes use of it. However, don't set the bit unless the length is greater than
1488      zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
1489      checking the zero case. */
1490    
1491      if (min > 0)
1492        {
1493        study->flags |= PCRE_STUDY_MINLEN;
1494        study->minlength = min;
1495        }
1496      else study->minlength = 0;
1497    
1498    /* If JIT support was compiled and requested, attempt the JIT compilation.    /* If JIT support was compiled and requested, attempt the JIT compilation.
1499    If no starting bytes were found, and the minimum length is zero, and JIT    If no starting bytes were found, and the minimum length is zero, and JIT
1500    compilation fails, abandon the extra block and return NULL. */    compilation fails, abandon the extra block and return NULL, unless
1501      PCRE_STUDY_EXTRA_NEEDED is set. */
1502    
1503  #ifdef SUPPORT_JIT  #ifdef SUPPORT_JIT
1504    extra->executable_jit = NULL;    extra->executable_jit = NULL;
1505    if ((options & PCRE_STUDY_JIT_COMPILE) != 0) _pcre_jit_compile(re, extra);    if ((options & PCRE_STUDY_JIT_COMPILE) != 0)
1506    if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0)      PRIV(jit_compile)(re, extra, JIT_COMPILE);
1507      if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0)
1508        PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE);
1509      if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0)
1510        PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE);
1511    
1512      if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0 &&
1513          (options & PCRE_STUDY_EXTRA_NEEDED) == 0)
1514      {      {
1515    #if defined COMPILE_PCRE8
1516      pcre_free_study(extra);      pcre_free_study(extra);
1517    #elif defined COMPILE_PCRE16
1518        pcre16_free_study(extra);
1519    #elif defined COMPILE_PCRE32
1520        pcre32_free_study(extra);
1521    #endif
1522      extra = NULL;      extra = NULL;
1523      }      }
1524  #endif  #endif
1525    }    }
1526    
# Line 1366  return extra; Line 1534  return extra;
1534    
1535  /* This function frees the memory that was obtained by pcre_study().  /* This function frees the memory that was obtained by pcre_study().
1536    
1537  Argument:   a pointer to the pcre_extra block  Argument:   a pointer to the pcre[16]_extra block
1538  Returns:    nothing  Returns:    nothing
1539  */  */
1540    
1541    #if defined COMPILE_PCRE8
1542  PCRE_EXP_DEFN void  PCRE_EXP_DEFN void
1543  pcre_free_study(pcre_extra *extra)  pcre_free_study(pcre_extra *extra)
1544    #elif defined COMPILE_PCRE16
1545    PCRE_EXP_DEFN void
1546    pcre16_free_study(pcre16_extra *extra)
1547    #elif defined COMPILE_PCRE32
1548    PCRE_EXP_DEFN void
1549    pcre32_free_study(pcre32_extra *extra)
1550    #endif
1551  {  {
1552    if (extra == NULL)
1553      return;
1554  #ifdef SUPPORT_JIT  #ifdef SUPPORT_JIT
1555  if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&  if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
1556       extra->executable_jit != NULL)       extra->executable_jit != NULL)
1557    _pcre_jit_free(extra->executable_jit);    PRIV(jit_free)(extra->executable_jit);
1558  #endif  #endif
1559  pcre_free(extra);  PUBL(free)(extra);
1560  }  }
1561    
1562  /* End of pcre_study.c */  /* End of pcre_study.c */

Legend:
Removed from v.689  
changed lines
  Added in v.1148

  ViewVC Help
Powered by ViewVC 1.1.5