/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory | Revision Log | View Patch

revision 613 by ph10, Sat Jul 2 16:59:52 2011 UTC revision 1221 by ph10, Sun Nov 11 20:27:03 2012 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2010 University of Cambridge             Copyright (c) 1997-2012 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 66  string of that length that matches. In U Line 66  string of that length that matches. In U
66  rather than bytes.  rather than bytes.
67    
68  Arguments:  Arguments:
69    code        pointer to start of group (the bracket)    code            pointer to start of group (the bracket)
70    startcode   pointer to start of the whole pattern    startcode       pointer to start of the whole pattern
71    options     the compiling options    options         the compiling options
72    had_accept  pointer to flag for (*ACCEPT) encountered    recurse_depth   RECURSE depth
73    
74  Returns:   the minimum length  Returns:   the minimum length
75             -1 if \C was encountered             -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
76             -2 internal error (missing capturing bracket)             -2 internal error (missing capturing bracket)
77             -3 internal error (opcode not listed)             -3 internal error (opcode not listed)
78  */  */
79    
80  static int  static int
81  find_minlength(const uschar *code, const uschar *startcode, int options,  find_minlength(const pcre_uchar *code, const pcre_uchar *startcode, int options,
82    BOOL *had_accept_ptr)    int recurse_depth)
83  {  {
84  int length = -1;  int length = -1;
85  BOOL utf8 = (options & PCRE_UTF8) != 0;  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
86    BOOL utf = (options & PCRE_UTF8) != 0;
87  BOOL had_recurse = FALSE;  BOOL had_recurse = FALSE;
88  register int branchlength = 0;  register int branchlength = 0;
89  register uschar *cc = (uschar *)code + 1 + LINK_SIZE;  register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
90    
91  if (*code == OP_CBRA || *code == OP_SCBRA ||  if (*code == OP_CBRA || *code == OP_SCBRA ||
92      *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += 2;      *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
93    
94  /* Scan along the opcodes for this branch. If we get to the end of the  /* Scan along the opcodes for this branch. If we get to the end of the
95  branch, check the length against that of the other branches. */  branch, check the length against that of the other branches. */
# Line 96  branch, check the length against that of Line 97  branch, check the length against that of
97  for (;;)  for (;;)
98    {    {
99    int d, min;    int d, min;
100    uschar *cs, *ce;    pcre_uchar *cs, *ce;
101    register int op = *cc;    register pcre_uchar op = *cc;
102    
103    switch (op)    switch (op)
104      {      {
# Line 127  for (;;) Line 128  for (;;)
128      case OP_BRAPOS:      case OP_BRAPOS:
129      case OP_SBRAPOS:      case OP_SBRAPOS:
130      case OP_ONCE:      case OP_ONCE:
131      d = find_minlength(cc, startcode, options, had_accept_ptr);      case OP_ONCE_NC:
132        d = find_minlength(cc, startcode, options, recurse_depth);
133      if (d < 0) return d;      if (d < 0) return d;
134      branchlength += d;      branchlength += d;
     if (*had_accept_ptr) return branchlength;  
135      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
136      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
137      break;      break;
138    
139        /* ACCEPT makes things far too complicated; we have to give up. */
140    
141        case OP_ACCEPT:
142        case OP_ASSERT_ACCEPT:
143        return -1;
144    
145      /* Reached end of a branch; if it's a ket it is the end of a nested      /* Reached end of a branch; if it's a ket it is the end of a nested
146      call. If it's ALT it is an alternation in a nested call. If it is END it's      call. If it's ALT it is an alternation in a nested call. If it is END it's
147      the end of the outer call. All can be handled by the same code. If it is      the end of the outer call. All can be handled by the same code. If an
148      ACCEPT, it is essentially the same as END, but we set a flag so that      ACCEPT was previously encountered, use the length that was in force at that
149      counting stops. */      time, and pass back the shortest ACCEPT length. */
150    
     case OP_ACCEPT:  
     case OP_ASSERT_ACCEPT:  
     *had_accept_ptr = TRUE;  
     /* Fall through */  
151      case OP_ALT:      case OP_ALT:
152      case OP_KET:      case OP_KET:
153      case OP_KETRMAX:      case OP_KETRMAX:
# Line 187  for (;;) Line 190  for (;;)
190      case OP_DOLLM:      case OP_DOLLM:
191      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
192      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
193      cc += _pcre_OP_lengths[*cc];      cc += PRIV(OP_lengths)[*cc];
194      break;      break;
195    
196      /* Skip over a subpattern that has a {0} or {0,x} quantifier */      /* Skip over a subpattern that has a {0} or {0,x} quantifier */
# Line 196  for (;;) Line 199  for (;;)
199      case OP_BRAMINZERO:      case OP_BRAMINZERO:
200      case OP_BRAPOSZERO:      case OP_BRAPOSZERO:
201      case OP_SKIPZERO:      case OP_SKIPZERO:
202      cc += _pcre_OP_lengths[*cc];      cc += PRIV(OP_lengths)[*cc];
203      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
204      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
205      break;      break;
# Line 221  for (;;) Line 224  for (;;)
224      case OP_NOTPOSPLUSI:      case OP_NOTPOSPLUSI:
225      branchlength++;      branchlength++;
226      cc += 2;      cc += 2;
227  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
228      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
229  #endif  #endif
230      break;      break;
231    
# Line 241  for (;;) Line 244  for (;;)
244      case OP_NOTEXACT:      case OP_NOTEXACT:
245      case OP_NOTEXACTI:      case OP_NOTEXACTI:
246      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
247      cc += 4;      cc += 2 + IMM2_SIZE;
248  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
249      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
250  #endif  #endif
251      break;      break;
252    
253      case OP_TYPEEXACT:      case OP_TYPEEXACT:
254      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
255      cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;      cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
256          || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
257      break;      break;
258    
259      /* Handle single-char non-literal matchers */      /* Handle single-char non-literal matchers */
# Line 276  for (;;) Line 280  for (;;)
280      cc++;      cc++;
281      break;      break;
282    
283      /* "Any newline" might match two characters, but it also might match just      /* "Any newline" might match two characters, but it also might match just
284      one. */      one. */
285    
286      case OP_ANYNL:      case OP_ANYNL:
# Line 284  for (;;) Line 288  for (;;)
288      cc++;      cc++;
289      break;      break;
290    
291      /* The single-byte matcher means we can't proceed in UTF-8 mode */      /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
292        non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
293        appear, but leave the code, just in case.) */
294    
295      case OP_ANYBYTE:      case OP_ANYBYTE:
296  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
297      if (utf8) return -1;      if (utf) return -1;
298  #endif  #endif
299      branchlength++;      branchlength++;
300      cc++;      cc++;
# Line 304  for (;;) Line 310  for (;;)
310      case OP_TYPEPOSSTAR:      case OP_TYPEPOSSTAR:
311      case OP_TYPEPOSQUERY:      case OP_TYPEPOSQUERY:
312      if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;      if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
313      cc += _pcre_OP_lengths[op];      cc += PRIV(OP_lengths)[op];
314      break;      break;
315    
316      case OP_TYPEUPTO:      case OP_TYPEUPTO:
317      case OP_TYPEMINUPTO:      case OP_TYPEMINUPTO:
318      case OP_TYPEPOSUPTO:      case OP_TYPEPOSUPTO:
319      if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;      if (cc[1 + IMM2_SIZE] == OP_PROP
320      cc += _pcre_OP_lengths[op];        || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
321        cc += PRIV(OP_lengths)[op];
322      break;      break;
323    
324      /* Check a class for variable quantification */      /* Check a class for variable quantification */
325    
 #ifdef SUPPORT_UTF8  
     case OP_XCLASS:  
     cc += GET(cc, 1) - 33;  
     /* Fall through */  
 #endif  
   
326      case OP_CLASS:      case OP_CLASS:
327      case OP_NCLASS:      case OP_NCLASS:
328      cc += 33;  #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
329        case OP_XCLASS:
330        /* The original code caused an unsigned overflow in 64 bit systems,
331        so now we use a conditional statement. */
332        if (op == OP_XCLASS)
333          cc += GET(cc, 1);
334        else
335          cc += PRIV(OP_lengths)[OP_CLASS];
336    #else
337        cc += PRIV(OP_lengths)[OP_CLASS];
338    #endif
339    
340      switch (*cc)      switch (*cc)
341        {        {
# Line 343  for (;;) Line 354  for (;;)
354        case OP_CRRANGE:        case OP_CRRANGE:
355        case OP_CRMINRANGE:        case OP_CRMINRANGE:
356        branchlength += GET2(cc,1);        branchlength += GET2(cc,1);
357        cc += 5;        cc += 1 + 2 * IMM2_SIZE;
358        break;        break;
359    
360        default:        default:
# Line 368  for (;;) Line 379  for (;;)
379      case OP_REFI:      case OP_REFI:
380      if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)      if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
381        {        {
382        ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));        ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
383        if (cs == NULL) return -2;        if (cs == NULL) return -2;
384        do ce += GET(ce, 1); while (*ce == OP_ALT);        do ce += GET(ce, 1); while (*ce == OP_ALT);
385        if (cc > cs && cc < ce)        if (cc > cs && cc < ce)
# Line 376  for (;;) Line 387  for (;;)
387          d = 0;          d = 0;
388          had_recurse = TRUE;          had_recurse = TRUE;
389          }          }
390        else        else
391          {          {
392          d = find_minlength(cs, startcode, options, had_accept_ptr);          d = find_minlength(cs, startcode, options, recurse_depth);
393          *had_accept_ptr = FALSE;          }
         }  
394        }        }
395      else d = 0;      else d = 0;
396      cc += 3;      cc += 1 + IMM2_SIZE;
397    
398      /* Handle repeated back references */      /* Handle repeated back references */
399    
# Line 406  for (;;) Line 416  for (;;)
416        case OP_CRRANGE:        case OP_CRRANGE:
417        case OP_CRMINRANGE:        case OP_CRMINRANGE:
418        min = GET2(cc, 1);        min = GET2(cc, 1);
419        cc += 5;        cc += 1 + 2 * IMM2_SIZE;
420        break;        break;
421    
422        default:        default:
# Line 417  for (;;) Line 427  for (;;)
427      branchlength += min * d;      branchlength += min * d;
428      break;      break;
429    
430        /* We can easily detect direct recursion, but not mutual recursion. This is
431        caught by a recursion depth count. */
432    
433      case OP_RECURSE:      case OP_RECURSE:
434      cs = ce = (uschar *)startcode + GET(cc, 1);      cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
     if (cs == NULL) return -2;  
435      do ce += GET(ce, 1); while (*ce == OP_ALT);      do ce += GET(ce, 1); while (*ce == OP_ALT);
436      if (cc > cs && cc < ce)      if ((cc > cs && cc < ce) || recurse_depth > 10)
437        had_recurse = TRUE;        had_recurse = TRUE;
438      else      else
439        {        {
440        branchlength += find_minlength(cs, startcode, options, had_accept_ptr);        branchlength += find_minlength(cs, startcode, options, recurse_depth + 1);
441        *had_accept_ptr = FALSE;        }
       }  
442      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
443      break;      break;
444    
# Line 478  for (;;) Line 489  for (;;)
489      case OP_NOTPOSQUERY:      case OP_NOTPOSQUERY:
490      case OP_NOTPOSQUERYI:      case OP_NOTPOSQUERYI:
491    
492      cc += _pcre_OP_lengths[op];      cc += PRIV(OP_lengths)[op];
493  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
494      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
495  #endif  #endif
496      break;      break;
497    
# Line 489  for (;;) Line 500  for (;;)
500      case OP_MARK:      case OP_MARK:
501      case OP_PRUNE_ARG:      case OP_PRUNE_ARG:
502      case OP_SKIP_ARG:      case OP_SKIP_ARG:
     cc += _pcre_OP_lengths[op] + cc[1];  
     break;  
   
503      case OP_THEN_ARG:      case OP_THEN_ARG:
504      cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];      cc += PRIV(OP_lengths)[op] + cc[1];
505      break;      break;
506    
507      /* The remaining opcodes are just skipped over. */      /* The remaining opcodes are just skipped over. */
508    
509      case OP_CLOSE:      case OP_CLOSE:
# Line 505  for (;;) Line 513  for (;;)
513      case OP_SET_SOM:      case OP_SET_SOM:
514      case OP_SKIP:      case OP_SKIP:
515      case OP_THEN:      case OP_THEN:
516      cc += _pcre_OP_lengths[op];      cc += PRIV(OP_lengths)[op];
517      break;      break;
518    
519      /* This should not occur: we list all opcodes explicitly so that when      /* This should not occur: we list all opcodes explicitly so that when
# Line 534  Arguments: Line 542  Arguments:
542    p             points to the character    p             points to the character
543    caseless      the caseless flag    caseless      the caseless flag
544    cd            the block with char table pointers    cd            the block with char table pointers
545    utf8          TRUE for UTF-8 mode    utf           TRUE for UTF-8 / UTF-16 / UTF-32 mode
546    
547  Returns:        pointer after the character  Returns:        pointer after the character
548  */  */
549    
550  static const uschar *  static const pcre_uchar *
551  set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,  set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
552    compile_data *cd, BOOL utf8)    compile_data *cd, BOOL utf)
553  {  {
554  unsigned int c = *p;  pcre_uint32 c = *p;
555    
556    #ifdef COMPILE_PCRE8
557  SET_BIT(c);  SET_BIT(c);
558    
559  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
560  if (utf8 && c > 127)  if (utf && c > 127)
561    {    {
562    GETCHARINC(c, p);    GETCHARINC(c, p);
563  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
564    if (caseless)    if (caseless)
565      {      {
566      uschar buff[8];      pcre_uchar buff[6];
567      c = UCD_OTHERCASE(c);      c = UCD_OTHERCASE(c);
568      (void)_pcre_ord2utf8(c, buff);      (void)PRIV(ord2utf)(c, buff);
569      SET_BIT(buff[0]);      SET_BIT(buff[0]);
570      }      }
571  #endif  #endif  /* SUPPORT_UCP */
572    return p;    return p;
573    }    }
574  #endif  #else   /* Not SUPPORT_UTF */
575    (void)(utf);   /* Stops warning for unused parameter */
576    #endif  /* SUPPORT_UTF */
577    
578  /* Not UTF-8 mode, or character is less than 128. */  /* Not UTF-8 mode, or character is less than 128. */
579    
580  if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);  if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
581  return p + 1;  return p + 1;
582    #endif  /* COMPILE_PCRE8 */
583    
584    #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
585    if (c > 0xff)
586      {
587      c = 0xff;
588      caseless = FALSE;
589      }
590    SET_BIT(c);
591    
592    #ifdef SUPPORT_UTF
593    if (utf && c > 127)
594      {
595      GETCHARINC(c, p);
596    #ifdef SUPPORT_UCP
597      if (caseless)
598        {
599        c = UCD_OTHERCASE(c);
600        if (c > 0xff)
601          c = 0xff;
602        SET_BIT(c);
603        }
604    #endif  /* SUPPORT_UCP */
605      return p;
606      }
607    #else   /* Not SUPPORT_UTF */
608    (void)(utf);   /* Stops warning for unused parameter */
609    #endif  /* SUPPORT_UTF */
610    
611    if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
612    return p + 1;
613    #endif
614  }  }
615    
616    
# Line 593  Returns:         nothing Line 636  Returns:         nothing
636  */  */
637    
638  static void  static void
639  set_type_bits(uschar *start_bits, int cbit_type, int table_limit,  set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
640    compile_data *cd)    compile_data *cd)
641  {  {
642  register int c;  register pcre_uint32 c;
643  for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];  for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
644    #if defined SUPPORT_UTF && defined COMPILE_PCRE8
645  if (table_limit == 32) return;  if (table_limit == 32) return;
646  for (c = 128; c < 256; c++)  for (c = 128; c < 256; c++)
647    {    {
648    if ((cd->cbits[c/8] & (1 << (c&7))) != 0)    if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
649      {      {
650      uschar buff[8];      pcre_uchar buff[6];
651      (void)_pcre_ord2utf8(c, buff);      (void)PRIV(ord2utf)(c, buff);
652      SET_BIT(buff[0]);      SET_BIT(buff[0]);
653      }      }
654    }    }
655    #endif
656  }  }
657    
658    
# Line 633  Returns:         nothing Line 678  Returns:         nothing
678  */  */
679    
680  static void  static void
681  set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,  set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
682    compile_data *cd)    compile_data *cd)
683  {  {
684  register int c;  register pcre_uint32 c;
685  for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];  for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
686    #if defined SUPPORT_UTF && defined COMPILE_PCRE8
687  if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;  if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
688    #endif
689  }  }
690    
691    
# Line 658  function fails unless the result is SSB_ Line 705  function fails unless the result is SSB_
705  Arguments:  Arguments:
706    code         points to an expression    code         points to an expression
707    start_bits   points to a 32-byte table, initialized to 0    start_bits   points to a 32-byte table, initialized to 0
708    utf8         TRUE if in UTF-8 mode    utf          TRUE if in UTF-8 / UTF-16 / UTF-32 mode
709    cd           the block with char table pointers    cd           the block with char table pointers
710    
711  Returns:       SSB_FAIL     => Failed to find any starting bytes  Returns:       SSB_FAIL     => Failed to find any starting bytes
# Line 668  Returns:       SSB_FAIL     => Failed to Line 715  Returns:       SSB_FAIL     => Failed to
715  */  */
716    
717  static int  static int
718  set_start_bits(const uschar *code, uschar *start_bits, BOOL utf8,  set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
719    compile_data *cd)    compile_data *cd)
720  {  {
721  register int c;  register pcre_uint32 c;
722  int yield = SSB_DONE;  int yield = SSB_DONE;
723  int table_limit = utf8? 16:32;  #if defined SUPPORT_UTF && defined COMPILE_PCRE8
724    int table_limit = utf? 16:32;
725    #else
726    int table_limit = 32;
727    #endif
728    
729  #if 0  #if 0
730  /* ========================================================================= */  /* ========================================================================= */
# Line 695  volatile int dummy; Line 746  volatile int dummy;
746  do  do
747    {    {
748    BOOL try_next = TRUE;    BOOL try_next = TRUE;
749    const uschar *tcode = code + 1 + LINK_SIZE;    const pcre_uchar *tcode = code + 1 + LINK_SIZE;
750    
751    if (*code == OP_CBRA || *code == OP_SCBRA ||    if (*code == OP_CBRA || *code == OP_SCBRA ||
752        *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += 2;        *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
753    
754    while (try_next)    /* Loop for items in this branch */    while (try_next)    /* Loop for items in this branch */
755      {      {
# Line 716  do Line 767  do
767        /* Fail for a valid opcode that implies no starting bits. */        /* Fail for a valid opcode that implies no starting bits. */
768    
769        case OP_ACCEPT:        case OP_ACCEPT:
770        case OP_ASSERT_ACCEPT:        case OP_ASSERT_ACCEPT:
771        case OP_ALLANY:        case OP_ALLANY:
772        case OP_ANY:        case OP_ANY:
773        case OP_ANYBYTE:        case OP_ANYBYTE:
774        case OP_CIRC:        case OP_CIRC:
775        case OP_CIRCM:        case OP_CIRCM:
776        case OP_CLOSE:        case OP_CLOSE:
777        case OP_COMMIT:        case OP_COMMIT:
778        case OP_COND:        case OP_COND:
779        case OP_CREF:        case OP_CREF:
780        case OP_DEF:        case OP_DEF:
781        case OP_DOLL:        case OP_DOLL:
782        case OP_DOLLM:        case OP_DOLLM:
783        case OP_END:        case OP_END:
784        case OP_EOD:        case OP_EOD:
785        case OP_EODN:        case OP_EODN:
786        case OP_EXTUNI:        case OP_EXTUNI:
787        case OP_FAIL:        case OP_FAIL:
788        case OP_MARK:        case OP_MARK:
# Line 739  do Line 790  do
790        case OP_NOT:        case OP_NOT:
791        case OP_NOTEXACT:        case OP_NOTEXACT:
792        case OP_NOTEXACTI:        case OP_NOTEXACTI:
793        case OP_NOTI:        case OP_NOTI:
794        case OP_NOTMINPLUS:        case OP_NOTMINPLUS:
795        case OP_NOTMINPLUSI:        case OP_NOTMINPLUSI:
796        case OP_NOTMINQUERY:        case OP_NOTMINQUERY:
# Line 767  do Line 818  do
818        case OP_NOTUPTOI:        case OP_NOTUPTOI:
819        case OP_NOT_HSPACE:        case OP_NOT_HSPACE:
820        case OP_NOT_VSPACE:        case OP_NOT_VSPACE:
       case OP_NOT_WORD_BOUNDARY:  
821        case OP_NRREF:        case OP_NRREF:
822        case OP_PROP:        case OP_PROP:
823        case OP_PRUNE:        case OP_PRUNE:
# Line 777  do Line 827  do
827        case OP_REFI:        case OP_REFI:
828        case OP_REVERSE:        case OP_REVERSE:
829        case OP_RREF:        case OP_RREF:
830        case OP_SCOND:        case OP_SCOND:
831        case OP_SET_SOM:        case OP_SET_SOM:
832        case OP_SKIP:        case OP_SKIP:
833        case OP_SKIP_ARG:        case OP_SKIP_ARG:
# Line 785  do Line 835  do
835        case OP_SOM:        case OP_SOM:
836        case OP_THEN:        case OP_THEN:
837        case OP_THEN_ARG:        case OP_THEN_ARG:
838        case OP_WORD_BOUNDARY:  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
839        case OP_XCLASS:        case OP_XCLASS:
840    #endif
841        return SSB_FAIL;        return SSB_FAIL;
842    
843          /* We can ignore word boundary tests. */
844    
845          case OP_WORD_BOUNDARY:
846          case OP_NOT_WORD_BOUNDARY:
847          tcode++;
848          break;
849    
850        /* If we hit a bracket or a positive lookahead assertion, recurse to set        /* If we hit a bracket or a positive lookahead assertion, recurse to set
851        bits from within the subpattern. If it can't find anything, we have to        bits from within the subpattern. If it can't find anything, we have to
852        give up. If it finds some mandatory character(s), we are done for this        give up. If it finds some mandatory character(s), we are done for this
# Line 803  do Line 861  do
861        case OP_CBRAPOS:        case OP_CBRAPOS:
862        case OP_SCBRAPOS:        case OP_SCBRAPOS:
863        case OP_ONCE:        case OP_ONCE:
864          case OP_ONCE_NC:
865        case OP_ASSERT:        case OP_ASSERT:
866        rc = set_start_bits(tcode, start_bits, utf8, cd);        rc = set_start_bits(tcode, start_bits, utf, cd);
867        if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;        if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
868        if (rc == SSB_DONE) try_next = FALSE; else        if (rc == SSB_DONE) try_next = FALSE; else
869          {          {
# Line 851  do Line 910  do
910        case OP_BRAZERO:        case OP_BRAZERO:
911        case OP_BRAMINZERO:        case OP_BRAMINZERO:
912        case OP_BRAPOSZERO:        case OP_BRAPOSZERO:
913        rc = set_start_bits(++tcode, start_bits, utf8, cd);        rc = set_start_bits(++tcode, start_bits, utf, cd);
914        if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;        if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
915  /* =========================================================================  /* =========================================================================
916        See the comment at the head of this function concerning the next line,        See the comment at the head of this function concerning the next line,
# Line 878  do Line 937  do
937        case OP_QUERY:        case OP_QUERY:
938        case OP_MINQUERY:        case OP_MINQUERY:
939        case OP_POSQUERY:        case OP_POSQUERY:
940        tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);        tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
941        break;        break;
942    
943        case OP_STARI:        case OP_STARI:
# Line 887  do Line 946  do
946        case OP_QUERYI:        case OP_QUERYI:
947        case OP_MINQUERYI:        case OP_MINQUERYI:
948        case OP_POSQUERYI:        case OP_POSQUERYI:
949        tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);        tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
950        break;        break;
951    
952        /* Single-char upto sets the bit and tries the next */        /* Single-char upto sets the bit and tries the next */
# Line 895  do Line 954  do
954        case OP_UPTO:        case OP_UPTO:
955        case OP_MINUPTO:        case OP_MINUPTO:
956        case OP_POSUPTO:        case OP_POSUPTO:
957        tcode = set_table_bit(start_bits, tcode + 3, FALSE, cd, utf8);        tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
958        break;        break;
959    
960        case OP_UPTOI:        case OP_UPTOI:
961        case OP_MINUPTOI:        case OP_MINUPTOI:
962        case OP_POSUPTOI:        case OP_POSUPTOI:
963        tcode = set_table_bit(start_bits, tcode + 3, TRUE, cd, utf8);        tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
964        break;        break;
965    
966        /* At least one single char sets the bit and stops */        /* At least one single char sets the bit and stops */
967    
968        case OP_EXACT:        case OP_EXACT:
969        tcode += 2;        tcode += IMM2_SIZE;
970        /* Fall through */        /* Fall through */
971        case OP_CHAR:        case OP_CHAR:
972        case OP_PLUS:        case OP_PLUS:
973        case OP_MINPLUS:        case OP_MINPLUS:
974        case OP_POSPLUS:        case OP_POSPLUS:
975        (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);        (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
976        try_next = FALSE;        try_next = FALSE;
977        break;        break;
978    
979        case OP_EXACTI:        case OP_EXACTI:
980        tcode += 2;        tcode += IMM2_SIZE;
981        /* Fall through */        /* Fall through */
982        case OP_CHARI:        case OP_CHARI:
983        case OP_PLUSI:        case OP_PLUSI:
984        case OP_MINPLUSI:        case OP_MINPLUSI:
985        case OP_POSPLUSI:        case OP_POSPLUSI:
986        (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);        (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
987        try_next = FALSE;        try_next = FALSE;
988        break;        break;
989    
# Line 935  do Line 994  do
994        identical. */        identical. */
995    
996        case OP_HSPACE:        case OP_HSPACE:
997        SET_BIT(0x09);        SET_BIT(CHAR_HT);
998        SET_BIT(0x20);        SET_BIT(CHAR_SPACE);
999        if (utf8)  #ifdef SUPPORT_UTF
1000          if (utf)
1001          {          {
1002    #ifdef COMPILE_PCRE8
1003          SET_BIT(0xC2);  /* For U+00A0 */          SET_BIT(0xC2);  /* For U+00A0 */
1004          SET_BIT(0xE1);  /* For U+1680, U+180E */          SET_BIT(0xE1);  /* For U+1680, U+180E */
1005          SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */          SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1006          SET_BIT(0xE3);  /* For U+3000 */          SET_BIT(0xE3);  /* For U+3000 */
1007    #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1008            SET_BIT(0xA0);
1009            SET_BIT(0xFF);  /* For characters > 255 */
1010    #endif  /* COMPILE_PCRE[8|16|32] */
1011            }
1012          else
1013    #endif /* SUPPORT_UTF */
1014            {
1015    #ifndef EBCDIC
1016            SET_BIT(0xA0);
1017    #endif  /* Not EBCDIC */
1018    #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1019            SET_BIT(0xFF);  /* For characters > 255 */
1020    #endif  /* COMPILE_PCRE[16|32] */
1021          }          }
       else SET_BIT(0xA0);  
1022        try_next = FALSE;        try_next = FALSE;
1023        break;        break;
1024    
1025        case OP_ANYNL:        case OP_ANYNL:
1026        case OP_VSPACE:        case OP_VSPACE:
1027        SET_BIT(0x0A);        SET_BIT(CHAR_LF);
1028        SET_BIT(0x0B);        SET_BIT(CHAR_VT);
1029        SET_BIT(0x0C);        SET_BIT(CHAR_FF);
1030        SET_BIT(0x0D);        SET_BIT(CHAR_CR);
1031        if (utf8)  #ifdef SUPPORT_UTF
1032          if (utf)
1033          {          {
1034    #ifdef COMPILE_PCRE8
1035          SET_BIT(0xC2);  /* For U+0085 */          SET_BIT(0xC2);  /* For U+0085 */
1036          SET_BIT(0xE2);  /* For U+2028, U+2029 */          SET_BIT(0xE2);  /* For U+2028, U+2029 */
1037    #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1038            SET_BIT(CHAR_NEL);
1039            SET_BIT(0xFF);  /* For characters > 255 */
1040    #endif  /* COMPILE_PCRE[8|16|32] */
1041            }
1042          else
1043    #endif /* SUPPORT_UTF */
1044            {
1045            SET_BIT(CHAR_NEL);
1046    #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1047            SET_BIT(0xFF);  /* For characters > 255 */
1048    #endif
1049          }          }
       else SET_BIT(0x85);  
1050        try_next = FALSE;        try_next = FALSE;
1051        break;        break;
1052    
# Line 979  do Line 1066  do
1066        break;        break;
1067    
1068        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to
1069        ensure it is set as not whitespace. */        ensure it is set as not whitespace. Luckily, the code value is the same
1070          (0x0b) in ASCII and EBCDIC, so we can just adjust the appropriate bit. */
1071    
1072        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
1073        set_nottype_bits(start_bits, cbit_space, table_limit, cd);        set_nottype_bits(start_bits, cbit_space, table_limit, cd);
# Line 987  do Line 1075  do
1075        try_next = FALSE;        try_next = FALSE;
1076        break;        break;
1077    
1078        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to not
1079        not set it from the table. */        set it from the table. Luckily, the code value is the same (0x0b) in
1080          ASCII and EBCDIC, so we can just adjust the appropriate bit. */
1081    
1082        case OP_WHITESPACE:        case OP_WHITESPACE:
1083        c = start_bits[1];    /* Save in case it was already set */        c = start_bits[1];    /* Save in case it was already set */
# Line 1017  do Line 1106  do
1106        break;        break;
1107    
1108        case OP_TYPEEXACT:        case OP_TYPEEXACT:
1109        tcode += 3;        tcode += 1 + IMM2_SIZE;
1110        break;        break;
1111    
1112        /* Zero or more repeats of character types set the bits and then        /* Zero or more repeats of character types set the bits and then
# Line 1026  do Line 1115  do
1115        case OP_TYPEUPTO:        case OP_TYPEUPTO:
1116        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
1117        case OP_TYPEPOSUPTO:        case OP_TYPEPOSUPTO:
1118        tcode += 2;               /* Fall through */        tcode += IMM2_SIZE;  /* Fall through */
1119    
1120        case OP_TYPESTAR:        case OP_TYPESTAR:
1121        case OP_TYPEMINSTAR:        case OP_TYPEMINSTAR:
# Line 1042  do Line 1131  do
1131          return SSB_FAIL;          return SSB_FAIL;
1132    
1133          case OP_HSPACE:          case OP_HSPACE:
1134          SET_BIT(0x09);          SET_BIT(CHAR_HT);
1135          SET_BIT(0x20);          SET_BIT(CHAR_SPACE);
1136          if (utf8)  #ifdef SUPPORT_UTF
1137            if (utf)
1138            {            {
1139    #ifdef COMPILE_PCRE8
1140            SET_BIT(0xC2);  /* For U+00A0 */            SET_BIT(0xC2);  /* For U+00A0 */
1141            SET_BIT(0xE1);  /* For U+1680, U+180E */            SET_BIT(0xE1);  /* For U+1680, U+180E */
1142            SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */            SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1143            SET_BIT(0xE3);  /* For U+3000 */            SET_BIT(0xE3);  /* For U+3000 */
1144    #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1145              SET_BIT(0xA0);
1146              SET_BIT(0xFF);  /* For characters > 255 */
1147    #endif  /* COMPILE_PCRE[8|16|32] */
1148            }            }
1149          else SET_BIT(0xA0);          else
1150    #endif /* SUPPORT_UTF */
1151    #ifndef EBCDIC
1152              SET_BIT(0xA0);
1153    #endif  /* Not EBCDIC */
1154          break;          break;
1155    
1156          case OP_ANYNL:          case OP_ANYNL:
1157          case OP_VSPACE:          case OP_VSPACE:
1158          SET_BIT(0x0A);          SET_BIT(CHAR_LF);
1159          SET_BIT(0x0B);          SET_BIT(CHAR_VT);
1160          SET_BIT(0x0C);          SET_BIT(CHAR_FF);
1161          SET_BIT(0x0D);          SET_BIT(CHAR_CR);
1162          if (utf8)  #ifdef SUPPORT_UTF
1163            if (utf)
1164            {            {
1165    #ifdef COMPILE_PCRE8
1166            SET_BIT(0xC2);  /* For U+0085 */            SET_BIT(0xC2);  /* For U+0085 */
1167            SET_BIT(0xE2);  /* For U+2028, U+2029 */            SET_BIT(0xE2);  /* For U+2028, U+2029 */
1168    #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1169              SET_BIT(CHAR_NEL);
1170              SET_BIT(0xFF);  /* For characters > 255 */
1171    #endif  /* COMPILE_PCRE16 */
1172            }            }
1173          else SET_BIT(0x85);          else
1174    #endif /* SUPPORT_UTF */
1175              SET_BIT(CHAR_NEL);
1176          break;          break;
1177    
1178          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
# Line 1077  do Line 1184  do
1184          break;          break;
1185    
1186          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we have to
1187          ensure it gets set as not whitespace. */          ensure it gets set as not whitespace. Luckily, the code value is the
1188            same (0x0b) in ASCII and EBCDIC, so we can just adjust the appropriate
1189            bit. */
1190    
1191          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
1192          set_nottype_bits(start_bits, cbit_space, table_limit, cd);          set_nottype_bits(start_bits, cbit_space, table_limit, cd);
# Line 1085  do Line 1194  do
1194          break;          break;
1195    
1196          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we have to
1197          avoid setting it. */          avoid setting it. Luckily, the code value is the same (0x0b) in ASCII
1198            and EBCDIC, so we can just adjust the appropriate bit. */
1199    
1200          case OP_WHITESPACE:          case OP_WHITESPACE:
1201          c = start_bits[1];    /* Save in case it was already set */          c = start_bits[1];    /* Save in case it was already set */
# Line 1112  do Line 1222  do
1222        character with a value > 255. */        character with a value > 255. */
1223    
1224        case OP_NCLASS:        case OP_NCLASS:
1225  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1226        if (utf8)        if (utf)
1227          {          {
1228          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
1229          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
1230          }          }
1231  #endif  #endif
1232    #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1233          SET_BIT(0xFF);                         /* For characters > 255 */
1234    #endif
1235        /* Fall through */        /* Fall through */
1236    
1237        case OP_CLASS:        case OP_CLASS:
1238          {          {
1239            pcre_uint8 *map;
1240          tcode++;          tcode++;
1241            map = (pcre_uint8 *)tcode;
1242    
1243          /* In UTF-8 mode, the bits in a bit map correspond to character          /* In UTF-8 mode, the bits in a bit map correspond to character
1244          values, not to byte values. However, the bit map we are constructing is          values, not to byte values. However, the bit map we are constructing is
# Line 1131  do Line 1246  do
1246          value is > 127. In fact, there are only two possible starting bytes for          value is > 127. In fact, there are only two possible starting bytes for
1247          characters in the range 128 - 255. */          characters in the range 128 - 255. */
1248    
1249  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1250          if (utf8)          if (utf)
1251            {            {
1252            for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];            for (c = 0; c < 16; c++) start_bits[c] |= map[c];
1253            for (c = 128; c < 256; c++)            for (c = 128; c < 256; c++)
1254              {              {
1255              if ((tcode[c/8] && (1 << (c&7))) != 0)              if ((map[c/8] && (1 << (c&7))) != 0)
1256                {                {
1257                int d = (c >> 6) | 0xc0;            /* Set bit for this starter */                int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
1258                start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */                start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
# Line 1145  do Line 1260  do
1260                }                }
1261              }              }
1262            }            }
   
         /* In non-UTF-8 mode, the two bit maps are completely compatible. */  
   
1263          else          else
1264  #endif  #endif
1265            {            {
1266            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];            /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1267              for (c = 0; c < 32; c++) start_bits[c] |= map[c];
1268            }            }
1269    
1270          /* Advance past the bit map, and act on what follows. For a zero          /* Advance past the bit map, and act on what follows. For a zero
1271          minimum repeat, continue; otherwise stop processing. */          minimum repeat, continue; otherwise stop processing. */
1272    
1273          tcode += 32;          tcode += 32 / sizeof(pcre_uchar);
1274          switch (*tcode)          switch (*tcode)
1275            {            {
1276            case OP_CRSTAR:            case OP_CRSTAR:
# Line 1169  do Line 1282  do
1282    
1283            case OP_CRRANGE:            case OP_CRRANGE:
1284            case OP_CRMINRANGE:            case OP_CRMINRANGE:
1285            if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;            if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1286              else try_next = FALSE;              else try_next = FALSE;
1287            break;            break;
1288    
1289            default:            default:
1290            try_next = FALSE;            try_next = FALSE;
1291            break;            break;
# Line 1198  return yield; Line 1311  return yield;
1311  *************************************************/  *************************************************/
1312    
1313  /* This function is handed a compiled expression that it must study to produce  /* This function is handed a compiled expression that it must study to produce
1314  information that will speed up the matching. It returns a pcre_extra block  information that will speed up the matching. It returns a pcre[16]_extra block
1315  which then gets handed back to pcre_exec().  which then gets handed back to pcre_exec().
1316    
1317  Arguments:  Arguments:
# Line 1207  Arguments: Line 1320  Arguments:
1320    errorptr  points to where to place error messages;    errorptr  points to where to place error messages;
1321              set NULL unless error              set NULL unless error
1322    
1323  Returns:    pointer to a pcre_extra block, with study_data filled in and the  Returns:    pointer to a pcre[16]_extra block, with study_data filled in and
1324                appropriate flags set;                the appropriate flags set;
1325              NULL on error or if no optimization possible              NULL on error or if no optimization possible
1326  */  */
1327    
1328    #if defined COMPILE_PCRE8
1329  PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION  PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1330  pcre_study(const pcre *external_re, int options, const char **errorptr)  pcre_study(const pcre *external_re, int options, const char **errorptr)
1331    #elif defined COMPILE_PCRE16
1332    PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
1333    pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
1334    #elif defined COMPILE_PCRE32
1335    PCRE_EXP_DEFN pcre32_extra * PCRE_CALL_CONVENTION
1336    pcre32_study(const pcre32 *external_re, int options, const char **errorptr)
1337    #endif
1338  {  {
1339  int min;  int min;
1340  BOOL bits_set = FALSE;  BOOL bits_set = FALSE;
1341  BOOL had_accept = FALSE;  pcre_uint8 start_bits[32];
1342  uschar start_bits[32];  PUBL(extra) *extra = NULL;
 pcre_extra *extra;  
1343  pcre_study_data *study;  pcre_study_data *study;
1344  const uschar *tables;  const pcre_uint8 *tables;
1345  uschar *code;  pcre_uchar *code;
1346  compile_data compile_block;  compile_data compile_block;
1347  const real_pcre *re = (const real_pcre *)external_re;  const REAL_PCRE *re = (const REAL_PCRE *)external_re;
1348    
1349  *errorptr = NULL;  *errorptr = NULL;
1350    
# Line 1234  if (re == NULL || re->magic_number != MA Line 1354  if (re == NULL || re->magic_number != MA
1354    return NULL;    return NULL;
1355    }    }
1356    
1357    if ((re->flags & PCRE_MODE) == 0)
1358      {
1359    #if defined COMPILE_PCRE8
1360      *errorptr = "argument not compiled in 8 bit mode";
1361    #elif defined COMPILE_PCRE16
1362      *errorptr = "argument not compiled in 16 bit mode";
1363    #elif defined COMPILE_PCRE32
1364      *errorptr = "argument not compiled in 32 bit mode";
1365    #endif
1366      return NULL;
1367      }
1368    
1369  if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)  if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1370    {    {
1371    *errorptr = "unknown or incorrect option bit(s) set";    *errorptr = "unknown or incorrect option bit(s) set";
1372    return NULL;    return NULL;
1373    }    }
1374    
1375  code = (uschar *)re + re->name_table_offset +  code = (pcre_uchar *)re + re->name_table_offset +
1376    (re->name_count * re->name_entry_size);    (re->name_count * re->name_entry_size);
1377    
1378  /* For an anchored pattern, or an unanchored pattern that has a first char, or  /* For an anchored pattern, or an unanchored pattern that has a first char, or
# Line 1255  if ((re->options & PCRE_ANCHORED) == 0 & Line 1387  if ((re->options & PCRE_ANCHORED) == 0 &
1387    /* Set the character tables in the block that is passed around */    /* Set the character tables in the block that is passed around */
1388    
1389    tables = re->tables;    tables = re->tables;
1390    
1391    #if defined COMPILE_PCRE8
1392    if (tables == NULL)    if (tables == NULL)
1393      (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,      (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1394      (void *)(&tables));      (void *)(&tables));
1395    #elif defined COMPILE_PCRE16
1396      if (tables == NULL)
1397        (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1398        (void *)(&tables));
1399    #elif defined COMPILE_PCRE32
1400      if (tables == NULL)
1401        (void)pcre32_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1402        (void *)(&tables));
1403    #endif
1404    
1405    compile_block.lcc = tables + lcc_offset;    compile_block.lcc = tables + lcc_offset;
1406    compile_block.fcc = tables + fcc_offset;    compile_block.fcc = tables + fcc_offset;
# Line 1266  if ((re->options & PCRE_ANCHORED) == 0 & Line 1409  if ((re->options & PCRE_ANCHORED) == 0 &
1409    
1410    /* See if we can find a fixed set of initial characters for the pattern. */    /* See if we can find a fixed set of initial characters for the pattern. */
1411    
1412    memset(start_bits, 0, 32 * sizeof(uschar));    memset(start_bits, 0, 32 * sizeof(pcre_uint8));
1413    rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,    rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1414      &compile_block);      &compile_block);
1415    bits_set = rc == SSB_DONE;    bits_set = rc == SSB_DONE;
1416    if (rc == SSB_UNKNOWN) *errorptr = "internal error: opcode not recognized";    if (rc == SSB_UNKNOWN)
1417        {
1418        *errorptr = "internal error: opcode not recognized";
1419        return NULL;
1420        }
1421    }    }
1422    
1423  /* Find the minimum length of subject string. */  /* Find the minimum length of subject string. */
1424    
1425  switch(min = find_minlength(code, code, re->options, &had_accept))  switch(min = find_minlength(code, code, re->options, 0))
1426    {    {
1427    case -2: *errorptr = "internal error: missing capturing bracket"; break;    case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
1428    case -3: *errorptr = "internal error: opcode not recognized"; break;    case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
1429    default: break;    default: break;
1430    }    }
1431    
1432  /* Return NULL if there's been an error or if no optimization is possible. */  /* If a set of starting bytes has been identified, or if the minimum length is
1433    greater than zero, or if JIT optimization has been requested, or if
1434    PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a
1435    pcre_study_data block. The study data is put in the latter, which is pointed to
1436    by the former, which may also get additional data set later by the calling
1437    program. At the moment, the size of pcre_study_data is fixed. We nevertheless
1438    save it in a field for returning via the pcre_fullinfo() function so that if it
1439    becomes variable in the future, we don't have to change that code. */
1440    
1441    if (bits_set || min > 0 || (options & (
1442    #ifdef SUPPORT_JIT
1443        PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE |
1444        PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE |
1445    #endif
1446        PCRE_STUDY_EXTRA_NEEDED)) != 0)
1447      {
1448      extra = (PUBL(extra) *)(PUBL(malloc))
1449        (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
1450      if (extra == NULL)
1451        {
1452        *errorptr = "failed to get memory";
1453        return NULL;
1454        }
1455    
1456  if (*errorptr != NULL || (!bits_set && min < 0)) return NULL;    study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
1457      extra->flags = PCRE_EXTRA_STUDY_DATA;
1458      extra->study_data = study;
1459    
1460      study->size = sizeof(pcre_study_data);
1461      study->flags = 0;
1462    
1463      /* Set the start bits always, to avoid unset memory errors if the
1464      study data is written to a file, but set the flag only if any of the bits
1465      are set, to save time looking when none are. */
1466    
1467  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in    if (bits_set)
1468  the latter, which is pointed to by the former, which may also get additional      {
1469  data set later by the calling program. At the moment, the size of      study->flags |= PCRE_STUDY_MAPPED;
1470  pcre_study_data is fixed. We nevertheless save it in a field for returning via      memcpy(study->start_bits, start_bits, sizeof(start_bits));
1471  the pcre_fullinfo() function so that if it becomes variable in the future, we      }
1472  don't have to change that code. */    else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
1473    
1474  extra = (pcre_extra *)(pcre_malloc)  #ifdef PCRE_DEBUG
1475    (sizeof(pcre_extra) + sizeof(pcre_study_data));    if (bits_set)
1476        {
1477        pcre_uint8 *ptr = start_bits;
1478        int i;
1479    
1480  if (extra == NULL)      printf("Start bits:\n");
1481    {      for (i = 0; i < 32; i++)
1482    *errorptr = "failed to get memory";        printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
1483    return NULL;      }
1484    }  #endif
1485    
1486  study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));    /* Always set the minlength value in the block, because the JIT compiler
1487  extra->flags = PCRE_EXTRA_STUDY_DATA;    makes use of it. However, don't set the bit unless the length is greater than
1488  extra->study_data = study;    zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
1489      checking the zero case. */
1490    
1491  study->size = sizeof(pcre_study_data);    if (min > 0)
1492  study->flags = 0;      {
1493        study->flags |= PCRE_STUDY_MINLEN;
1494        study->minlength = min;
1495        }
1496      else study->minlength = 0;
1497    
1498  if (bits_set)    /* If JIT support was compiled and requested, attempt the JIT compilation.
1499    {    If no starting bytes were found, and the minimum length is zero, and JIT
1500    study->flags |= PCRE_STUDY_MAPPED;    compilation fails, abandon the extra block and return NULL, unless
1501    memcpy(study->start_bits, start_bits, sizeof(start_bits));    PCRE_STUDY_EXTRA_NEEDED is set. */
1502    }  
1503    #ifdef SUPPORT_JIT
1504      extra->executable_jit = NULL;
1505      if ((options & PCRE_STUDY_JIT_COMPILE) != 0)
1506        PRIV(jit_compile)(re, extra, JIT_COMPILE);
1507      if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0)
1508        PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE);
1509      if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0)
1510        PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE);
1511    
1512  if (min >= 0)    if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0 &&
1513    {        (options & PCRE_STUDY_EXTRA_NEEDED) == 0)
1514    study->flags |= PCRE_STUDY_MINLEN;      {
1515    study->minlength = min;  #if defined COMPILE_PCRE8
1516        pcre_free_study(extra);
1517    #elif defined COMPILE_PCRE16
1518        pcre16_free_study(extra);
1519    #elif defined COMPILE_PCRE32
1520        pcre32_free_study(extra);
1521    #endif
1522        extra = NULL;
1523        }
1524    #endif
1525    }    }
1526    
1527  return extra;  return extra;
1528  }  }
1529    
1530    
1531    /*************************************************
1532    *          Free the study data                   *
1533    *************************************************/
1534    
1535    /* This function frees the memory that was obtained by pcre_study().
1536    
1537    Argument:   a pointer to the pcre[16]_extra block
1538    Returns:    nothing
1539    */
1540    
1541    #if defined COMPILE_PCRE8
1542    PCRE_EXP_DEFN void
1543    pcre_free_study(pcre_extra *extra)
1544    #elif defined COMPILE_PCRE16
1545    PCRE_EXP_DEFN void
1546    pcre16_free_study(pcre16_extra *extra)
1547    #elif defined COMPILE_PCRE32
1548    PCRE_EXP_DEFN void
1549    pcre32_free_study(pcre32_extra *extra)
1550    #endif
1551    {
1552    if (extra == NULL)
1553      return;
1554    #ifdef SUPPORT_JIT
1555    if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
1556         extra->executable_jit != NULL)
1557      PRIV(jit_free)(extra->executable_jit);
1558    #endif
1559    PUBL(free)(extra);
1560    }
1561    
1562  /* End of pcre_study.c */  /* End of pcre_study.c */

Legend:
Removed from v.613  
changed lines
  Added in v.1221

  ViewVC Help
Powered by ViewVC 1.1.5