/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 93 by nigel, Sat Feb 24 21:41:42 2007 UTC | revision 166 by ph10, Wed May 9 14:48:28 2007 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2006 University of Cambridge             Copyright (c) 1997-2007 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 82  are simple data values; negative values Line 82  are simple data values; negative values
82  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
83  is invalid. */  is invalid. */
84    
85  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
86  static const short int escapes[] = {  static const short int escapes[] = {
87       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
88       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
# Line 96  static const short int escapes[] = { Line 96  static const short int escapes[] = {
96       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
97  };  };
98    
99  #else         /* This is the "abnormal" table for EBCDIC systems */  #else           /* This is the "abnormal" table for EBCDIC systems */
100  static const short int escapes[] = {  static const short int escapes[] = {
101  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
102  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 208  static const char *error_texts[] = { Line 208  static const char *error_texts[] = {
208    "malformed number or name after (?(",    "malformed number or name after (?(",
209    "conditional group contains more than two branches",    "conditional group contains more than two branches",
210    "assertion expected after (?(",    "assertion expected after (?(",
211    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )",
212    /* 30 */    /* 30 */
213    "unknown POSIX class name",    "unknown POSIX class name",
214    "POSIX collating elements are not supported",    "POSIX collating elements are not supported",
# Line 242  static const char *error_texts[] = { Line 242  static const char *error_texts[] = {
242    /* 55 */    /* 55 */
243    "repeating a DEFINE group is not allowed",    "repeating a DEFINE group is not allowed",
244    "inconsistent NEWLINE options",    "inconsistent NEWLINE options",
245    "\\g is not followed by an (optionally braced) non-zero number"    "\\g is not followed by an (optionally braced) non-zero number",
246      "(?+ or (?- must be followed by a non-zero number"
247  };  };
248    
249    
# Line 262  For convenience, we use the same bit def Line 263  For convenience, we use the same bit def
263    
264  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
265    
266  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
267  static const unsigned char digitab[] =  static const unsigned char digitab[] =
268    {    {
269    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 298  static const unsigned char digitab[] = Line 299  static const unsigned char digitab[] =
299    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
300    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
301    
302  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else           /* This is the "abnormal" case, for EBCDIC systems */
303  static const unsigned char digitab[] =  static const unsigned char digitab[] =
304    {    {
305    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 312  static const unsigned char digitab[] = Line 313  static const unsigned char digitab[] =
313    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
314    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
315    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
316    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
317    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
318    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
319    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 346  static const unsigned char ebcdic_charta Line 347  static const unsigned char ebcdic_charta
347    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
348    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
349    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
350    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
351    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
352    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
353    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 421  if (c == 0) *errorcodeptr = ERR1; Line 422  if (c == 0) *errorcodeptr = ERR1;
422  a table. A non-zero result is something that can be returned immediately.  a table. A non-zero result is something that can be returned immediately.
423  Otherwise further processing may be required. */  Otherwise further processing may be required. */
424    
425  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
426  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < '0' || c > 'z') {}                           /* Not alphameric */
427  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - '0']) != 0) c = i;
428    
429  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
430  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
431  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
432  #endif  #endif
# Line 562  else Line 563  else
563          if (c == 0 && cc == '0') continue;     /* Leading zeroes */          if (c == 0 && cc == '0') continue;     /* Leading zeroes */
564          count++;          count++;
565    
566  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
567          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= 'a') cc -= 32;               /* Convert to upper case */
568          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
569  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
570          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
571          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
572  #endif  #endif
# Line 589  else Line 590  else
590        {        {
591        int cc;                               /* Some compilers don't like ++ */        int cc;                               /* Some compilers don't like ++ */
592        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                        /* in initializers */
593  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
594        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
595        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
596  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
597        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= 'z') cc += 64;              /* Convert to upper case */
598        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
599  #endif  #endif
# Line 611  else Line 612  else
612        return 0;        return 0;
613        }        }
614    
615  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
616      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= 'a' && c <= 'z') c -= 32;
617      c ^= 0x40;      c ^= 0x40;
618  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
619      if (c >= 'a' && c <= 'z') c += 64;      if (c >= 'a' && c <= 'z') c += 64;
620      c ^= 0xC0;      c ^= 0xC0;
621  #endif  #endif
# Line 1246  for (;;) Line 1247  for (;;)
1247    else    else
1248      {      {
1249      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
1250    #ifdef SUPPORT_UTF8
1251      if (utf8) switch(c)      if (utf8) switch(c)
1252        {        {
1253        case OP_CHAR:        case OP_CHAR:
# Line 1266  for (;;) Line 1268  for (;;)
1268        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1269        break;        break;
1270        }        }
1271    #endif
1272      }      }
1273    }    }
1274  }  }
# Line 1309  for (;;) Line 1312  for (;;)
1312    else    else
1313      {      {
1314      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
1315    #ifdef SUPPORT_UTF8
1316      if (utf8) switch(c)      if (utf8) switch(c)
1317        {        {
1318        case OP_CHAR:        case OP_CHAR:
# Line 1329  for (;;) Line 1333  for (;;)
1333        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1334        break;        break;
1335        }        }
1336    #endif
1337      }      }
1338    }    }
1339  }  }
# Line 3995  for (;; ptr++) Line 4000  for (;; ptr++)
4000    
4001    
4002          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4003            case '-': case '+':
4004          case '0': case '1': case '2': case '3': case '4':   /* Recursion or */          case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4005          case '5': case '6': case '7': case '8': case '9':   /* subroutine */          case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4006            {            {
4007            const uschar *called;            const uschar *called;
4008              int sign = *ptr;
4009    
4010              if (sign == '+') ptr++;
4011              else if (sign == '-')
4012                {
4013                if ((digitab[ptr[1]] & ctype_digit) == 0)
4014                  goto OTHER_CHAR_AFTER_QUERY;
4015                ptr++;
4016                }
4017    
4018            recno = 0;            recno = 0;
4019            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4020              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4021    
4022            if (*ptr != ')')            if (*ptr != ')')
4023              {              {
4024              *errorcodeptr = ERR29;              *errorcodeptr = ERR29;
4025              goto FAILED;              goto FAILED;
4026              }              }
4027    
4028              if (sign == '-')
4029                {
4030                if (recno == 0)
4031                  {
4032                  *errorcodeptr = ERR58;
4033                  goto FAILED;
4034                  }
4035                recno = cd->bracount - recno + 1;
4036                if (recno <= 0)
4037                  {
4038                  *errorcodeptr = ERR15;
4039                  goto FAILED;
4040                  }
4041                }
4042              else if (sign == '+')
4043                {
4044                if (recno == 0)
4045                  {
4046                  *errorcodeptr = ERR58;
4047                  goto FAILED;
4048                  }
4049                recno += cd->bracount;
4050                }
4051    
4052            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
4053    
# Line 4080  for (;; ptr++) Line 4121  for (;; ptr++)
4121    
4122          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4123          default:              /* Other characters: check option setting */          default:              /* Other characters: check option setting */
4124            OTHER_CHAR_AFTER_QUERY:
4125          set = unset = 0;          set = unset = 0;
4126          optset = &set;          optset = &set;
4127    
# Line 5039  Returns:        pointer to compiled data Line 5081  Returns:        pointer to compiled data
5081                  with errorptr and erroroffset set                  with errorptr and erroroffset set
5082  */  */
5083    
5084  PCRE_DATA_SCOPE pcre *  PCRE_EXP_DEFN pcre *
5085  pcre_compile(const char *pattern, int options, const char **errorptr,  pcre_compile(const char *pattern, int options, const char **errorptr,
5086    int *erroroffset, const unsigned char *tables)    int *erroroffset, const unsigned char *tables)
5087  {  {
# Line 5047  return pcre_compile2(pattern, options, N Line 5089  return pcre_compile2(pattern, options, N
5089  }  }
5090    
5091    
5092  PCRE_DATA_SCOPE pcre *  PCRE_EXP_DEFN pcre *
5093  pcre_compile2(const char *pattern, int options, int *errorcodeptr,  pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5094    const char **errorptr, int *erroroffset, const unsigned char *tables)    const char **errorptr, int *erroroffset, const unsigned char *tables)
5095  {  {
# Line 5096  if (errorcodeptr != NULL) *errorcodeptr Line 5138  if (errorcodeptr != NULL) *errorcodeptr
5138  if (erroroffset == NULL)  if (erroroffset == NULL)
5139    {    {
5140    errorcode = ERR16;    errorcode = ERR16;
5141    goto PCRE_EARLY_ERROR_RETURN;    goto PCRE_EARLY_ERROR_RETURN2;
5142    }    }
5143    
5144  *erroroffset = 0;  *erroroffset = 0;
# Line 5109  if (utf8 && (options & PCRE_NO_UTF8_CHEC Line 5151  if (utf8 && (options & PCRE_NO_UTF8_CHEC
5151       (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)       (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5152    {    {
5153    errorcode = ERR44;    errorcode = ERR44;
5154    goto PCRE_UTF8_ERROR_RETURN;    goto PCRE_EARLY_ERROR_RETURN2;
5155    }    }
5156  #else  #else
5157  if ((options & PCRE_UTF8) != 0)  if ((options & PCRE_UTF8) != 0)
# Line 5134  cd->cbits = tables + cbits_offset; Line 5176  cd->cbits = tables + cbits_offset;
5176  cd->ctypes = tables + ctypes_offset;  cd->ctypes = tables + ctypes_offset;
5177    
5178  /* Handle different types of newline. The three bits give seven cases. The  /* Handle different types of newline. The three bits give seven cases. The
5179  current code allows for fixed one- or two-byte sequences, plus "any". */  current code allows for fixed one- or two-byte sequences, plus "any" and
5180    "anycrlf". */
5181    
5182  switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))  switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5183    {    {
# Line 5144  switch (options & (PCRE_NEWLINE_CRLF | P Line 5187  switch (options & (PCRE_NEWLINE_CRLF | P
5187    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
5188         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5189    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
5190      case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5191    default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;    default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5192    }    }
5193    
5194  if (newline < 0)  if (newline == -2)
5195      {
5196      cd->nltype = NLTYPE_ANYCRLF;
5197      }
5198    else if (newline < 0)
5199    {    {
5200    cd->nltype = NLTYPE_ANY;    cd->nltype = NLTYPE_ANY;
5201    }    }
# Line 5321  if (errorcode != 0) Line 5369  if (errorcode != 0)
5369    (pcre_free)(re);    (pcre_free)(re);
5370    PCRE_EARLY_ERROR_RETURN:    PCRE_EARLY_ERROR_RETURN:
5371    *erroroffset = ptr - (const uschar *)pattern;    *erroroffset = ptr - (const uschar *)pattern;
5372  #ifdef SUPPORT_UTF8    PCRE_EARLY_ERROR_RETURN2:
   PCRE_UTF8_ERROR_RETURN:  
 #endif  
5373    *errorptr = error_texts[errorcode];    *errorptr = error_texts[errorcode];
5374    if (errorcodeptr != NULL) *errorcodeptr = errorcode;    if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5375    return NULL;    return NULL;
# Line 5413  if ((re->options & PCRE_REQCHSET) != 0) Line 5459  if ((re->options & PCRE_REQCHSET) != 0)
5459      else printf("Req char = \\x%02x%s\n", ch, caseless);      else printf("Req char = \\x%02x%s\n", ch, caseless);
5460    }    }
5461    
5462  pcre_printint(re, stdout);  pcre_printint(re, stdout, TRUE);
5463    
5464  /* This check is done here in the debugging case so that the code that  /* This check is done here in the debugging case so that the code that
5465  was compiled can be seen. */  was compiled can be seen. */

Legend:
Removed from v.93  
changed lines
  Added in v.166

  ViewVC Help
Powered by ViewVC 1.1.5