/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory | Revision Log | View Patch

revision 1084 by chpe, Tue Oct 16 15:55:28 2012 UTC revision 1364 by ph10, Sat Oct 5 15:45:11 2013 UTC
# Line 66  string of that length that matches. In U Line 66  string of that length that matches. In U
66  rather than bytes.  rather than bytes.
67    
68  Arguments:  Arguments:
69      re              compiled pattern block
70    code            pointer to start of group (the bracket)    code            pointer to start of group (the bracket)
71    startcode       pointer to start of the whole pattern    startcode       pointer to start of the whole pattern's code
72    options         the compiling options    options         the compiling options
73    recurse_depth   RECURSE depth    recurse_depth   RECURSE depth
74    
# Line 78  Returns:   the minimum length Line 79  Returns:   the minimum length
79  */  */
80    
81  static int  static int
82  find_minlength(const pcre_uchar *code, const pcre_uchar *startcode, int options,  find_minlength(const REAL_PCRE *re, const pcre_uchar *code,
83    int recurse_depth)    const pcre_uchar *startcode, int options, int recurse_depth)
84  {  {
85  int length = -1;  int length = -1;
86  /* PCRE_UTF16 has the same value as PCRE_UTF8. */  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
# Line 129  for (;;) Line 130  for (;;)
130      case OP_SBRAPOS:      case OP_SBRAPOS:
131      case OP_ONCE:      case OP_ONCE:
132      case OP_ONCE_NC:      case OP_ONCE_NC:
133      d = find_minlength(cc, startcode, options, recurse_depth);      d = find_minlength(re, cc, startcode, options, recurse_depth);
134      if (d < 0) return d;      if (d < 0) return d;
135      branchlength += d;      branchlength += d;
136      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 224  for (;;) Line 225  for (;;)
225      case OP_NOTPOSPLUSI:      case OP_NOTPOSPLUSI:
226      branchlength++;      branchlength++;
227      cc += 2;      cc += 2;
228  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32  #ifdef SUPPORT_UTF
229      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
230  #endif  #endif
231      break;      break;
# Line 245  for (;;) Line 246  for (;;)
246      case OP_NOTEXACTI:      case OP_NOTEXACTI:
247      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
248      cc += 2 + IMM2_SIZE;      cc += 2 + IMM2_SIZE;
249  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32  #ifdef SUPPORT_UTF
250      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
251  #endif  #endif
252      break;      break;
# Line 323  for (;;) Line 324  for (;;)
324    
325      /* Check a class for variable quantification */      /* Check a class for variable quantification */
326    
 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8  
     case OP_XCLASS:  
     cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];  
     /* Fall through */  
 #endif  
   
327      case OP_CLASS:      case OP_CLASS:
328      case OP_NCLASS:      case OP_NCLASS:
329    #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
330        case OP_XCLASS:
331        /* The original code caused an unsigned overflow in 64 bit systems,
332        so now we use a conditional statement. */
333        if (op == OP_XCLASS)
334          cc += GET(cc, 1);
335        else
336          cc += PRIV(OP_lengths)[OP_CLASS];
337    #else
338      cc += PRIV(OP_lengths)[OP_CLASS];      cc += PRIV(OP_lengths)[OP_CLASS];
339    #endif
340    
341      switch (*cc)      switch (*cc)
342        {        {
# Line 370  for (;;) Line 375  for (;;)
375      If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket      If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
376      matches an empty string (by default it causes a matching failure), so in      matches an empty string (by default it causes a matching failure), so in
377      that case we must set the minimum length to zero. */      that case we must set the minimum length to zero. */
378    
379        case OP_DNREF:     /* Duplicate named pattern back reference */
380        case OP_DNREFI:
381        if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
382          {
383          int count = GET2(cc, 1+IMM2_SIZE);
384          pcre_uchar *slot = (pcre_uchar *)re +
385            re->name_table_offset + GET2(cc, 1) * re->name_entry_size;
386          d = INT_MAX;
387          while (count-- > 0)
388            {
389            ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
390            if (cs == NULL) return -2;
391            do ce += GET(ce, 1); while (*ce == OP_ALT);
392            if (cc > cs && cc < ce)
393              {
394              d = 0;
395              had_recurse = TRUE;
396              break;
397              }
398            else
399              {
400              int dd = find_minlength(re, cs, startcode, options, recurse_depth);
401              if (dd < d) d = dd;
402              }
403            slot += re->name_entry_size;
404            }
405          }
406        else d = 0;
407        cc += 1 + 2*IMM2_SIZE;
408        goto REPEAT_BACK_REFERENCE;
409    
410      case OP_REF:      case OP_REF:      /* Single back reference */
411      case OP_REFI:      case OP_REFI:
412      if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)      if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
413        {        {
# Line 385  for (;;) Line 421  for (;;)
421          }          }
422        else        else
423          {          {
424          d = find_minlength(cs, startcode, options, recurse_depth);          d = find_minlength(re, cs, startcode, options, recurse_depth);
425          }          }
426        }        }
427      else d = 0;      else d = 0;
# Line 393  for (;;) Line 429  for (;;)
429    
430      /* Handle repeated back references */      /* Handle repeated back references */
431    
432        REPEAT_BACK_REFERENCE:
433      switch (*cc)      switch (*cc)
434        {        {
435        case OP_CRSTAR:        case OP_CRSTAR:
# Line 433  for (;;) Line 470  for (;;)
470        had_recurse = TRUE;        had_recurse = TRUE;
471      else      else
472        {        {
473        branchlength += find_minlength(cs, startcode, options, recurse_depth + 1);        branchlength += find_minlength(re, cs, startcode, options,
474            recurse_depth + 1);
475        }        }
476      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
477      break;      break;
# Line 486  for (;;) Line 524  for (;;)
524      case OP_NOTPOSQUERYI:      case OP_NOTPOSQUERYI:
525    
526      cc += PRIV(OP_lengths)[op];      cc += PRIV(OP_lengths)[op];
527  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32  #ifdef SUPPORT_UTF
528      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
529  #endif  #endif
530      break;      break;
# Line 547  static const pcre_uchar * Line 585  static const pcre_uchar *
585  set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,  set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
586    compile_data *cd, BOOL utf)    compile_data *cd, BOOL utf)
587  {  {
588  unsigned int c = *p;  pcre_uint32 c = *p;
589    
590  #ifdef COMPILE_PCRE8  #ifdef COMPILE_PCRE8
591  SET_BIT(c);  SET_BIT(c);
# Line 568  if (utf && c > 127) Line 606  if (utf && c > 127)
606    return p;    return p;
607    }    }
608  #else   /* Not SUPPORT_UTF */  #else   /* Not SUPPORT_UTF */
609  (void)(utf);   /* Stops warning for unused parameter */  (void)(utf);   /* Stops warning for unused parameter */
610  #endif  /* SUPPORT_UTF */  #endif  /* SUPPORT_UTF */
611    
612  /* Not UTF-8 mode, or character is less than 128. */  /* Not UTF-8 mode, or character is less than 128. */
# Line 601  if (utf && c > 127) Line 639  if (utf && c > 127)
639    return p;    return p;
640    }    }
641  #else   /* Not SUPPORT_UTF */  #else   /* Not SUPPORT_UTF */
642  (void)(utf);   /* Stops warning for unused parameter */  (void)(utf);   /* Stops warning for unused parameter */
643  #endif  /* SUPPORT_UTF */  #endif  /* SUPPORT_UTF */
644    
645  if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);  if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
# Line 632  Returns:         nothing Line 670  Returns:         nothing
670  */  */
671    
672  static void  static void
673  set_type_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit,  set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
674    compile_data *cd)    compile_data *cd)
675  {  {
676  register int c;  register pcre_uint32 c;
677  for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];  for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
678  #if defined SUPPORT_UTF && defined COMPILE_PCRE8  #if defined SUPPORT_UTF && defined COMPILE_PCRE8
679  if (table_limit == 32) return;  if (table_limit == 32) return;
# Line 674  Returns:         nothing Line 712  Returns:         nothing
712  */  */
713    
714  static void  static void
715  set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit,  set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
716    compile_data *cd)    compile_data *cd)
717  {  {
718  register int c;  register pcre_uint32 c;
719  for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];  for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
720  #if defined SUPPORT_UTF && defined COMPILE_PCRE8  #if defined SUPPORT_UTF && defined COMPILE_PCRE8
721  if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;  if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
# Line 714  static int Line 752  static int
752  set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,  set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
753    compile_data *cd)    compile_data *cd)
754  {  {
755  register int c;  register pcre_uint32 c;
756  int yield = SSB_DONE;  int yield = SSB_DONE;
757  #if defined SUPPORT_UTF && defined COMPILE_PCRE8  #if defined SUPPORT_UTF && defined COMPILE_PCRE8
758  int table_limit = utf? 16:32;  int table_limit = utf? 16:32;
# Line 821  do Line 859  do
859        case OP_RECURSE:        case OP_RECURSE:
860        case OP_REF:        case OP_REF:
861        case OP_REFI:        case OP_REFI:
862          case OP_DNREF:
863          case OP_DNREFI:
864        case OP_REVERSE:        case OP_REVERSE:
865        case OP_RREF:        case OP_RREF:
866        case OP_SCOND:        case OP_SCOND:
# Line 1008  do Line 1048  do
1048        else        else
1049  #endif /* SUPPORT_UTF */  #endif /* SUPPORT_UTF */
1050          {          {
1051  #ifndef EBCDIC  #ifndef EBCDIC
1052          SET_BIT(0xA0);          SET_BIT(0xA0);
1053  #endif  /* Not EBCDIC */  #endif  /* Not EBCDIC */
1054  #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32  #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1055          SET_BIT(0xFF);  /* For characters > 255 */          SET_BIT(0xFF);  /* For characters > 255 */
1056  #endif  /* COMPILE_PCRE[16|32] */  #endif  /* COMPILE_PCRE[16|32] */
# Line 1062  do Line 1102  do
1102        break;        break;
1103    
1104        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to
1105        ensure it is set as not whitespace. Luckily, the code value is the same        ensure it is set as not whitespace. Luckily, the code value is the same
1106        (0x0b) in ASCII and EBCDIC, so we can just adjust the appropriate bit. */        (0x0b) in ASCII and EBCDIC, so we can just adjust the appropriate bit. */
1107    
1108        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
# Line 1146  do Line 1186  do
1186  #endif /* SUPPORT_UTF */  #endif /* SUPPORT_UTF */
1187  #ifndef EBCDIC  #ifndef EBCDIC
1188            SET_BIT(0xA0);            SET_BIT(0xA0);
1189  #endif  /* Not EBCDIC */  #endif  /* Not EBCDIC */
1190          break;          break;
1191    
1192          case OP_ANYNL:          case OP_ANYNL:
# Line 1179  do Line 1219  do
1219          set_type_bits(start_bits, cbit_digit, table_limit, cd);          set_type_bits(start_bits, cbit_digit, table_limit, cd);
1220          break;          break;
1221    
1222          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we no longer
1223          ensure it gets set as not whitespace. Luckily, the code value is the          have to play fancy tricks because Perl added VT to its whitespace at
1224          same (0x0b) in ASCII and EBCDIC, so we can just adjust the appropriate          release 5.18. PCRE added it at release 8.34. */
         bit. */  
1225    
1226          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
1227          set_nottype_bits(start_bits, cbit_space, table_limit, cd);          set_nottype_bits(start_bits, cbit_space, table_limit, cd);
         start_bits[1] |= 0x08;  
1228          break;          break;
1229    
         /* The cbit_space table has vertical tab as whitespace; we have to  
         avoid setting it. Luckily, the code value is the same (0x0b) in ASCII  
         and EBCDIC, so we can just adjust the appropriate bit. */  
   
1230          case OP_WHITESPACE:          case OP_WHITESPACE:
         c = start_bits[1];    /* Save in case it was already set */  
1231          set_type_bits(start_bits, cbit_space, table_limit, cd);          set_type_bits(start_bits, cbit_space, table_limit, cd);
         start_bits[1] = (start_bits[1] & ~0x08) | c;  
1232          break;          break;
1233    
1234          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
# Line 1342  pcre_uchar *code; Line 1374  pcre_uchar *code;
1374  compile_data compile_block;  compile_data compile_block;
1375  const REAL_PCRE *re = (const REAL_PCRE *)external_re;  const REAL_PCRE *re = (const REAL_PCRE *)external_re;
1376    
1377    
1378  *errorptr = NULL;  *errorptr = NULL;
1379    
1380  if (re == NULL || re->magic_number != MAGIC_NUMBER)  if (re == NULL || re->magic_number != MAGIC_NUMBER)
# Line 1418  if ((re->options & PCRE_ANCHORED) == 0 & Line 1451  if ((re->options & PCRE_ANCHORED) == 0 &
1451    
1452  /* Find the minimum length of subject string. */  /* Find the minimum length of subject string. */
1453    
1454  switch(min = find_minlength(code, code, re->options, 0))  switch(min = find_minlength(re, code, code, re->options, 0))
1455    {    {
1456    case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;    case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
1457    case -3: *errorptr = "internal error: opcode not recognized"; return NULL;    case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
# Line 1426  switch(min = find_minlength(code, code, Line 1459  switch(min = find_minlength(code, code,
1459    }    }
1460    
1461  /* If a set of starting bytes has been identified, or if the minimum length is  /* If a set of starting bytes has been identified, or if the minimum length is
1462  greater than zero, or if JIT optimization has been requested, or if  greater than zero, or if JIT optimization has been requested, or if
1463  PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a  PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a
1464  pcre_study_data block. The study data is put in the latter, which is pointed to  pcre_study_data block. The study data is put in the latter, which is pointed to
1465  by the former, which may also get additional data set later by the calling  by the former, which may also get additional data set later by the calling
# Line 1437  becomes variable in the future, we don't Line 1470  becomes variable in the future, we don't
1470  if (bits_set || min > 0 || (options & (  if (bits_set || min > 0 || (options & (
1471  #ifdef SUPPORT_JIT  #ifdef SUPPORT_JIT
1472      PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE |      PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE |
1473      PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE |      PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE |
1474  #endif  #endif
1475      PCRE_STUDY_EXTRA_NEEDED)) != 0)      PCRE_STUDY_EXTRA_NEEDED)) != 0)
1476    {    {
# Line 1493  if (bits_set || min > 0 || (options & ( Line 1526  if (bits_set || min > 0 || (options & (
1526    
1527    /* If JIT support was compiled and requested, attempt the JIT compilation.    /* If JIT support was compiled and requested, attempt the JIT compilation.
1528    If no starting bytes were found, and the minimum length is zero, and JIT    If no starting bytes were found, and the minimum length is zero, and JIT
1529    compilation fails, abandon the extra block and return NULL, unless    compilation fails, abandon the extra block and return NULL, unless
1530    PCRE_STUDY_EXTRA_NEEDED is set. */    PCRE_STUDY_EXTRA_NEEDED is set. */
1531    
1532  #ifdef SUPPORT_JIT  #ifdef SUPPORT_JIT

Legend:
Removed from v.1084  
changed lines
  Added in v.1364

  ViewVC Help
Powered by ViewVC 1.1.5