/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 96 by nigel, Fri Mar 2 13:10:43 2007 UTC  |  revision 176 by ph10, Mon Jun 11 13:48:37 2007 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2006 University of Cambridge             Copyright (c) 1997-2007 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 82  are simple data values; negative values Line 82  are simple data values; negative values
82  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
83  is invalid. */  is invalid. */
84    
85  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
86  static const short int escapes[] = {  static const short int escapes[] = {
87       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
88       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
89     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
90       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */       0,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
91  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
92  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
93     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
# Line 96  static const short int escapes[] = { Line 96  static const short int escapes[] = {
96       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
97  };  };
98    
99  #else         /* This is the "abnormal" table for EBCDIC systems */  #else           /* This is the "abnormal" table for EBCDIC systems */
100  static const short int escapes[] = {  static const short int escapes[] = {
101  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
102  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 208  static const char *error_texts[] = { Line 208  static const char *error_texts[] = {
208    "malformed number or name after (?(",    "malformed number or name after (?(",
209    "conditional group contains more than two branches",    "conditional group contains more than two branches",
210    "assertion expected after (?(",    "assertion expected after (?(",
211    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )",
212    /* 30 */    /* 30 */
213    "unknown POSIX class name",    "unknown POSIX class name",
214    "POSIX collating elements are not supported",    "POSIX collating elements are not supported",
# Line 242  static const char *error_texts[] = { Line 242  static const char *error_texts[] = {
242    /* 55 */    /* 55 */
243    "repeating a DEFINE group is not allowed",    "repeating a DEFINE group is not allowed",
244    "inconsistent NEWLINE options",    "inconsistent NEWLINE options",
245    "\\g is not followed by an (optionally braced) non-zero number"    "\\g is not followed by a braced name or an optionally braced non-zero number",
246      "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
247  };  };
248    
249    
# Line 262  For convenience, we use the same bit def Line 263  For convenience, we use the same bit def
263    
264  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
265    
266  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
267  static const unsigned char digitab[] =  static const unsigned char digitab[] =
268    {    {
269    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 298  static const unsigned char digitab[] = Line 299  static const unsigned char digitab[] =
299    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
300    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
301    
302  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else           /* This is the "abnormal" case, for EBCDIC systems */
303  static const unsigned char digitab[] =  static const unsigned char digitab[] =
304    {    {
305    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 312  static const unsigned char digitab[] = Line 313  static const unsigned char digitab[] =
313    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
314    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
315    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
316    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
317    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
318    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
319    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 346  static const unsigned char ebcdic_charta Line 347  static const unsigned char ebcdic_charta
347    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
348    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
349    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
350    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
351    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
352    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
353    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 373  static const unsigned char ebcdic_charta Line 374  static const unsigned char ebcdic_charta
374  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
375    
376  static BOOL  static BOOL
377    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
378      int *, branch_chain *, compile_data *, int *);      int *, int *, branch_chain *, compile_data *, int *);
379    
380    
381    
# Line 421  if (c == 0) *errorcodeptr = ERR1; Line 422  if (c == 0) *errorcodeptr = ERR1;
422  a table. A non-zero result is something that can be returned immediately.  a table. A non-zero result is something that can be returned immediately.
423  Otherwise further processing may be required. */  Otherwise further processing may be required. */
424    
425  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
426  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < '0' || c > 'z') {}                           /* Not alphameric */
427  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - '0']) != 0) c = i;
428    
429  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
430  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
431  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
432  #endif  #endif
# Line 452  else Line 453  else
453    
454      /* \g must be followed by a number, either plain or braced. If positive, it      /* \g must be followed by a number, either plain or braced. If positive, it
455      is an absolute backreference. If negative, it is a relative backreference.      is an absolute backreference. If negative, it is a relative backreference.
456      This is a Perl 5.10 feature. */      This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
457        reference to a named group. This is part of Perl's movement towards a
458        unified syntax for back references. As this is synonymous with \k{name}, we
459        fudge it up by pretending it really was \k. */
460    
461      case 'g':      case 'g':
462      if (ptr[1] == '{')      if (ptr[1] == '{')
463        {        {
464          const uschar *p;
465          for (p = ptr+2; *p != 0 && *p != '}'; p++)
466            if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
467          if (*p != 0 && *p != '}')
468            {
469            c = -ESC_k;
470            break;
471            }
472        braced = TRUE;        braced = TRUE;
473        ptr++;        ptr++;
474        }        }
# Line 562  else Line 574  else
574          if (c == 0 && cc == '0') continue;     /* Leading zeroes */          if (c == 0 && cc == '0') continue;     /* Leading zeroes */
575          count++;          count++;
576    
577  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
578          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= 'a') cc -= 32;               /* Convert to upper case */
579          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
580  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
581          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
582          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
583  #endif  #endif
# Line 589  else Line 601  else
601        {        {
602        int cc;                               /* Some compilers don't like ++ */        int cc;                               /* Some compilers don't like ++ */
603        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                        /* in initializers */
604  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
605        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
606        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
607  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
608        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= 'z') cc += 64;              /* Convert to upper case */
609        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
610  #endif  #endif
# Line 611  else Line 623  else
623        return 0;        return 0;
624        }        }
625    
626  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
627      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= 'a' && c <= 'z') c -= 32;
628      c ^= 0x40;      c ^= 0x40;
629  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
630      if (c >= 'a' && c <= 'z') c += 64;      if (c >= 'a' && c <= 'z') c += 64;
631      c ^= 0xC0;      c ^= 0xC0;
632  #endif  #endif
# Line 1246  for (;;) Line 1258  for (;;)
1258    else    else
1259      {      {
1260      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
1261    #ifdef SUPPORT_UTF8
1262      if (utf8) switch(c)      if (utf8) switch(c)
1263        {        {
1264        case OP_CHAR:        case OP_CHAR:
# Line 1266  for (;;) Line 1279  for (;;)
1279        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1280        break;        break;
1281        }        }
1282    #endif
1283      }      }
1284    }    }
1285  }  }
# Line 1309  for (;;) Line 1323  for (;;)
1323    else    else
1324      {      {
1325      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
1326    #ifdef SUPPORT_UTF8
1327      if (utf8) switch(c)      if (utf8) switch(c)
1328        {        {
1329        case OP_CHAR:        case OP_CHAR:
# Line 1329  for (;;) Line 1344  for (;;)
1344        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1345        break;        break;
1346        }        }
1347    #endif
1348      }      }
1349    }    }
1350  }  }
# Line 1366  for (code = first_significant_code(code Line 1382  for (code = first_significant_code(code
1382    
1383    c = *code;    c = *code;
1384    
1385      /* Groups with zero repeats can of course be empty; skip them. */
1386    
1387      if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1388        {
1389        code += _pcre_OP_lengths[c];
1390        do code += GET(code, 1); while (*code == OP_ALT);
1391        c = *code;
1392        continue;
1393        }
1394    
1395      /* For other groups, scan the branches. */
1396    
1397    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1398      {      {
1399      BOOL empty_branch;      BOOL empty_branch;
# Line 1382  for (code = first_significant_code(code Line 1410  for (code = first_significant_code(code
1410        }        }
1411      while (*code == OP_ALT);      while (*code == OP_ALT);
1412      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
1413        c = *code;
     /* Move past the KET and fudge things so that the increment in the "for"  
     above has no effect. */  
   
     c = OP_END;  
     code += 1 + LINK_SIZE - _pcre_OP_lengths[c];  
1414      continue;      continue;
1415      }      }
1416    
# Line 2087  for (;; ptr++) Line 2110  for (;; ptr++)
2110    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2111    BOOL is_quantifier;    BOOL is_quantifier;
2112    BOOL is_recurse;    BOOL is_recurse;
2113      BOOL reset_bracount;
2114    int class_charcount;    int class_charcount;
2115    int class_lastchar;    int class_lastchar;
2116    int newoptions;    int newoptions;
2117    int recno;    int recno;
2118      int refsign;
2119    int skipbytes;    int skipbytes;
2120    int subreqbyte;    int subreqbyte;
2121    int subfirstbyte;    int subfirstbyte;
# Line 2655  for (;; ptr++) Line 2680  for (;; ptr++)
2680              unsigned int origd = d;              unsigned int origd = d;
2681              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
2682                {                {
2683                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= (unsigned int)c &&
2684                      ocd <= (unsigned int)d)
2685                    continue;                          /* Skip embedded ranges */
2686    
2687                if (occ < c  && ocd >= c - 1)        /* Extend the basic range */                if (occ < (unsigned int)c  &&
2688                      ocd >= (unsigned int)c - 1)      /* Extend the basic range */
2689                  {                                  /* if there is overlap,   */                  {                                  /* if there is overlap,   */
2690                  c = occ;                           /* noting that if occ < c */                  c = occ;                           /* noting that if occ < c */
2691                  continue;                          /* we can't have ocd > d  */                  continue;                          /* we can't have ocd > d  */
2692                  }                                  /* because a subrange is  */                  }                                  /* because a subrange is  */
2693                if (ocd > d && occ <= d + 1)         /* always shorter than    */                if (ocd > (unsigned int)d &&
2694                      occ <= (unsigned int)d + 1)      /* always shorter than    */
2695                  {                                  /* the basic range.       */                  {                                  /* the basic range.       */
2696                  d = ocd;                  d = ocd;
2697                  continue;                  continue;
# Line 3560  for (;; ptr++) Line 3589  for (;; ptr++)
3589      skipbytes = 0;      skipbytes = 0;
3590      bravalue = OP_CBRA;      bravalue = OP_CBRA;
3591      save_hwm = cd->hwm;      save_hwm = cd->hwm;
3592        reset_bracount = FALSE;
3593    
3594      if (*(++ptr) == '?')      if (*(++ptr) == '?')
3595        {        {
# Line 3582  for (;; ptr++) Line 3612  for (;; ptr++)
3612    
3613    
3614          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
3615            case '|':                 /* Reset capture count for each branch */
3616            reset_bracount = TRUE;
3617            /* Fall through */
3618    
3619            /* ------------------------------------------------------------ */
3620          case ':':                 /* Non-capturing bracket */          case ':':                 /* Non-capturing bracket */
3621          bravalue = OP_BRA;          bravalue = OP_BRA;
3622          ptr++;          ptr++;
# Line 3617  for (;; ptr++) Line 3652  for (;; ptr++)
3652    
3653          code[1+LINK_SIZE] = OP_CREF;          code[1+LINK_SIZE] = OP_CREF;
3654          skipbytes = 3;          skipbytes = 3;
3655            refsign = -1;
3656    
3657          /* Check for a test for recursion in a named group. */          /* Check for a test for recursion in a named group. */
3658    
# Line 3640  for (;; ptr++) Line 3676  for (;; ptr++)
3676            terminator = '\'';            terminator = '\'';
3677            ptr++;            ptr++;
3678            }            }
3679          else terminator = 0;          else
3680              {
3681              terminator = 0;
3682              if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3683              }
3684    
3685          /* We now expect to read a name; any thing else is an error */          /* We now expect to read a name; any thing else is an error */
3686    
# Line 3676  for (;; ptr++) Line 3716  for (;; ptr++)
3716          if (lengthptr != NULL) break;          if (lengthptr != NULL) break;
3717    
3718          /* In the real compile we do the work of looking for the actual          /* In the real compile we do the work of looking for the actual
3719          reference. */          reference. If the string started with "+" or "-" we require the rest to
3720            be digits, in which case recno will be set. */
3721    
3722            if (refsign > 0)
3723              {
3724              if (recno <= 0)
3725                {
3726                *errorcodeptr = ERR58;
3727                goto FAILED;
3728                }
3729              if (refsign == '-')
3730                {
3731                recno = cd->bracount - recno + 1;
3732                if (recno <= 0)
3733                  {
3734                  *errorcodeptr = ERR15;
3735                  goto FAILED;
3736                  }
3737                }
3738              else recno += cd->bracount;
3739              PUT2(code, 2+LINK_SIZE, recno);
3740              break;
3741              }
3742    
3743            /* Otherwise (did not start with "+" or "-"), start by looking for the
3744            name. */
3745    
3746          slot = cd->name_table;          slot = cd->name_table;
3747          for (i = 0; i < cd->names_found; i++)          for (i = 0; i < cd->names_found; i++)
# Line 3995  for (;; ptr++) Line 4060  for (;; ptr++)
4060    
4061    
4062          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4063            case '-': case '+':
4064          case '0': case '1': case '2': case '3': case '4':   /* Recursion or */          case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4065          case '5': case '6': case '7': case '8': case '9':   /* subroutine */          case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4066            {            {
4067            const uschar *called;            const uschar *called;
4068    
4069              if ((refsign = *ptr) == '+') ptr++;
4070              else if (refsign == '-')
4071                {
4072                if ((digitab[ptr[1]] & ctype_digit) == 0)
4073                  goto OTHER_CHAR_AFTER_QUERY;
4074                ptr++;
4075                }
4076    
4077            recno = 0;            recno = 0;
4078            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4079              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4080    
4081            if (*ptr != ')')            if (*ptr != ')')
4082              {              {
4083              *errorcodeptr = ERR29;              *errorcodeptr = ERR29;
4084              goto FAILED;              goto FAILED;
4085              }              }
4086    
4087              if (refsign == '-')
4088                {
4089                if (recno == 0)
4090                  {
4091                  *errorcodeptr = ERR58;
4092                  goto FAILED;
4093                  }
4094                recno = cd->bracount - recno + 1;
4095                if (recno <= 0)
4096                  {
4097                  *errorcodeptr = ERR15;
4098                  goto FAILED;
4099                  }
4100                }
4101              else if (refsign == '+')
4102                {
4103                if (recno == 0)
4104                  {
4105                  *errorcodeptr = ERR58;
4106                  goto FAILED;
4107                  }
4108                recno += cd->bracount;
4109                }
4110    
4111            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
4112    
4113            HANDLE_RECURSION:            HANDLE_RECURSION:
# Line 4080  for (;; ptr++) Line 4180  for (;; ptr++)
4180    
4181          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4182          default:              /* Other characters: check option setting */          default:              /* Other characters: check option setting */
4183            OTHER_CHAR_AFTER_QUERY:
4184          set = unset = 0;          set = unset = 0;
4185          optset = &set;          optset = &set;
4186    
# Line 4214  for (;; ptr++) Line 4315  for (;; ptr++)
4315           errorcodeptr,                 /* Where to put an error message */           errorcodeptr,                 /* Where to put an error message */
4316           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
4317            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4318             reset_bracount,               /* True if (?| group */
4319           skipbytes,                    /* Skip over bracket number */           skipbytes,                    /* Skip over bracket number */
4320           &subfirstbyte,                /* For possible first char */           &subfirstbyte,                /* For possible first char */
4321           &subreqbyte,                  /* For possible last char */           &subreqbyte,                  /* For possible last char */
# Line 4230  for (;; ptr++) Line 4332  for (;; ptr++)
4332      is on the bracket. */      is on the bracket. */
4333    
4334      /* If this is a conditional bracket, check that there are no more than      /* If this is a conditional bracket, check that there are no more than
4335      two branches in the group, or just one if it's a DEFINE group. */      two branches in the group, or just one if it's a DEFINE group. We do this
4336        in the real compile phase, not in the pre-pass, where the whole group may
4337        not be available. */
4338    
4339      if (bravalue == OP_COND)      if (bravalue == OP_COND && lengthptr == NULL)
4340        {        {
4341        uschar *tc = code;        uschar *tc = code;
4342        int condcount = 0;        int condcount = 0;
# Line 4392  for (;; ptr++) Line 4496  for (;; ptr++)
4496        zerofirstbyte = firstbyte;        zerofirstbyte = firstbyte;
4497        zeroreqbyte = reqbyte;        zeroreqbyte = reqbyte;
4498    
4499        /* \k<name> or \k'name' is a back reference by name (Perl syntax) */        /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4500          We also support \k{name} (.NET syntax) */
4501    
4502        if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\''))        if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
4503          {          {
4504          is_recurse = FALSE;          is_recurse = FALSE;
4505          terminator = (*(++ptr) == '<')? '>' : '\'';          terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
4506          goto NAMED_REF_OR_RECURSE;          goto NAMED_REF_OR_RECURSE;
4507          }          }
4508    
# Line 4563  This function is used during the pre-com Line 4668  This function is used during the pre-com
4668  out the amount of memory needed, as well as during the real compile phase. The  out the amount of memory needed, as well as during the real compile phase. The
4669  value of lengthptr distinguishes the two phases.  value of lengthptr distinguishes the two phases.
4670    
4671  Argument:  Arguments:
4672    options        option bits, including any changes for this subpattern    options        option bits, including any changes for this subpattern
4673    oldims         previous settings of ims option bits    oldims         previous settings of ims option bits
4674    codeptr        -> the address of the current code pointer    codeptr        -> the address of the current code pointer
4675    ptrptr         -> the address of the current pattern pointer    ptrptr         -> the address of the current pattern pointer
4676    errorcodeptr   -> pointer to error code variable    errorcodeptr   -> pointer to error code variable
4677    lookbehind     TRUE if this is a lookbehind assertion    lookbehind     TRUE if this is a lookbehind assertion
4678      reset_bracount TRUE to reset the count for each branch
4679    skipbytes      skip this many bytes at start (for brackets and OP_COND)    skipbytes      skip this many bytes at start (for brackets and OP_COND)
4680    firstbyteptr   place to put the first required character, or a negative number    firstbyteptr   place to put the first required character, or a negative number
4681    reqbyteptr     place to put the last required character, or a negative number    reqbyteptr     place to put the last required character, or a negative number
# Line 4583  Returns:         TRUE on success Line 4689  Returns:         TRUE on success
4689    
4690  static BOOL  static BOOL
4691  compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,  compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
4692    int *errorcodeptr, BOOL lookbehind, int skipbytes, int *firstbyteptr,    int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
4693    int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr)    int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
4694      int *lengthptr)
4695  {  {
4696  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
4697  uschar *code = *codeptr;  uschar *code = *codeptr;
# Line 4594  uschar *reverse_count = NULL; Line 4701  uschar *reverse_count = NULL;
4701  int firstbyte, reqbyte;  int firstbyte, reqbyte;
4702  int branchfirstbyte, branchreqbyte;  int branchfirstbyte, branchreqbyte;
4703  int length;  int length;
4704    int orig_bracount;
4705    int max_bracount;
4706  branch_chain bc;  branch_chain bc;
4707    
4708  bc.outer = bcptr;  bc.outer = bcptr;
# Line 4622  code += 1 + LINK_SIZE + skipbytes; Line 4731  code += 1 + LINK_SIZE + skipbytes;
4731    
4732  /* Loop for each alternative branch */  /* Loop for each alternative branch */
4733    
4734    orig_bracount = max_bracount = cd->bracount;
4735  for (;;)  for (;;)
4736    {    {
4737      /* For a (?| group, reset the capturing bracket count so that each branch
4738      uses the same numbers. */
4739    
4740      if (reset_bracount) cd->bracount = orig_bracount;
4741    
4742    /* Handle a change of ims options at the start of the branch */    /* Handle a change of ims options at the start of the branch */
4743    
4744    if ((options & PCRE_IMS) != oldims)    if ((options & PCRE_IMS) != oldims)
# Line 4652  for (;;) Line 4767  for (;;)
4767      *ptrptr = ptr;      *ptrptr = ptr;
4768      return FALSE;      return FALSE;
4769      }      }
4770    
4771      /* Keep the highest bracket count in case (?| was used and some branch
4772      has fewer than the rest. */
4773    
4774      if (cd->bracount > max_bracount) max_bracount = cd->bracount;
4775    
4776    /* In the real compile phase, there is some post-processing to be done. */    /* In the real compile phase, there is some post-processing to be done. */
4777    
# Line 4716  for (;;) Line 4836  for (;;)
4836        }        }
4837      }      }
4838    
4839    /* Reached end of expression, either ')' or end of pattern. Go back through    /* Reached end of expression, either ')' or end of pattern. In the real
4840    the alternative branches and reverse the chain of offsets, with the field in    compile phase, go back through the alternative branches and reverse the chain
4841    the BRA item now becoming an offset to the first alternative. If there are    of offsets, with the field in the BRA item now becoming an offset to the
4842    no alternatives, it points to the end of the group. The length in the    first alternative. If there are no alternatives, it points to the end of the
4843    terminating ket is always the length of the whole bracketed item. If any of    group. The length in the terminating ket is always the length of the whole
4844    the ims options were changed inside the group, compile a resetting op-code    bracketed item. If any of the ims options were changed inside the group,
4845    following, except at the very end of the pattern. Return leaving the pointer    compile a resetting op-code following, except at the very end of the pattern.
4846    at the terminating char. */    Return leaving the pointer at the terminating char. */
4847    
4848    if (*ptr != '|')    if (*ptr != '|')
4849      {      {
4850      int branch_length = code - last_branch;      if (lengthptr == NULL)
     do  
4851        {        {
4852        int prev_length = GET(last_branch, 1);        int branch_length = code - last_branch;
4853        PUT(last_branch, 1, branch_length);        do
4854        branch_length = prev_length;          {
4855        last_branch -= branch_length;          int prev_length = GET(last_branch, 1);
4856            PUT(last_branch, 1, branch_length);
4857            branch_length = prev_length;
4858            last_branch -= branch_length;
4859            }
4860          while (branch_length > 0);
4861        }        }
     while (branch_length > 0);  
4862    
4863      /* Fill in the ket */      /* Fill in the ket */
4864    
# Line 4751  for (;;) Line 4874  for (;;)
4874        *code++ = oldims;        *code++ = oldims;
4875        length += 2;        length += 2;
4876        }        }
4877    
4878        /* Retain the highest bracket number, in case resetting was used. */
4879    
4880        cd->bracount = max_bracount;
4881    
4882      /* Set values to pass back */      /* Set values to pass back */
4883    
# Line 4762  for (;;) Line 4889  for (;;)
4889      return TRUE;      return TRUE;
4890      }      }
4891    
4892    /* Another branch follows; insert an "or" node. Its length field points back    /* Another branch follows. In the pre-compile phase, we can move the code
4893      pointer back to where it was for the start of the first branch. (That is,
4894      pretend that each branch is the only one.)
4895    
4896      In the real compile phase, insert an ALT node. Its length field points back
4897    to the previous branch while the bracket remains open. At the end the chain    to the previous branch while the bracket remains open. At the end the chain
4898    is reversed. It's done like this so that the start of the bracket has a    is reversed. It's done like this so that the start of the bracket has a
4899    zero offset until it is closed, making it possible to detect recursion. */    zero offset until it is closed, making it possible to detect recursion. */
4900    
4901    *code = OP_ALT;    if (lengthptr != NULL)
4902    PUT(code, 1, code - last_branch);      {
4903    bc.current = last_branch = code;      code = *codeptr + 1 + LINK_SIZE + skipbytes;
4904    code += 1 + LINK_SIZE;      length += 1 + LINK_SIZE;
4905        }
4906      else
4907        {
4908        *code = OP_ALT;
4909        PUT(code, 1, code - last_branch);
4910        bc.current = last_branch = code;
4911        code += 1 + LINK_SIZE;
4912        }
4913    
4914    ptr++;    ptr++;
   length += 1 + LINK_SIZE;  
4915    }    }
4916  /* Control never reaches here */  /* Control never reaches here */
4917  }  }
# Line 5039  Returns:        pointer to compiled data Line 5178  Returns:        pointer to compiled data
5178                  with errorptr and erroroffset set                  with errorptr and erroroffset set
5179  */  */
5180    
5181  PCRE_DATA_SCOPE pcre *  PCRE_EXP_DEFN pcre *
5182  pcre_compile(const char *pattern, int options, const char **errorptr,  pcre_compile(const char *pattern, int options, const char **errorptr,
5183    int *erroroffset, const unsigned char *tables)    int *erroroffset, const unsigned char *tables)
5184  {  {
# Line 5047  return pcre_compile2(pattern, options, N Line 5186  return pcre_compile2(pattern, options, N
5186  }  }
5187    
5188    
5189  PCRE_DATA_SCOPE pcre *  PCRE_EXP_DEFN pcre *
5190  pcre_compile2(const char *pattern, int options, int *errorcodeptr,  pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5191    const char **errorptr, int *erroroffset, const unsigned char *tables)    const char **errorptr, int *erroroffset, const unsigned char *tables)
5192  {  {
# Line 5096  if (errorcodeptr != NULL) *errorcodeptr Line 5235  if (errorcodeptr != NULL) *errorcodeptr
5235  if (erroroffset == NULL)  if (erroroffset == NULL)
5236    {    {
5237    errorcode = ERR16;    errorcode = ERR16;
5238    goto PCRE_EARLY_ERROR_RETURN;    goto PCRE_EARLY_ERROR_RETURN2;
5239    }    }
5240    
5241  *erroroffset = 0;  *erroroffset = 0;
# Line 5109  if (utf8 && (options & PCRE_NO_UTF8_CHEC Line 5248  if (utf8 && (options & PCRE_NO_UTF8_CHEC
5248       (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)       (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5249    {    {
5250    errorcode = ERR44;    errorcode = ERR44;
5251    goto PCRE_UTF8_ERROR_RETURN;    goto PCRE_EARLY_ERROR_RETURN2;
5252    }    }
5253  #else  #else
5254  if ((options & PCRE_UTF8) != 0)  if ((options & PCRE_UTF8) != 0)
# Line 5134  cd->cbits = tables + cbits_offset; Line 5273  cd->cbits = tables + cbits_offset;
5273  cd->ctypes = tables + ctypes_offset;  cd->ctypes = tables + ctypes_offset;
5274    
5275  /* Handle different types of newline. The three bits give seven cases. The  /* Handle different types of newline. The three bits give seven cases. The
5276  current code allows for fixed one- or two-byte sequences, plus "any". */  current code allows for fixed one- or two-byte sequences, plus "any" and
5277    "anycrlf". */
5278    
5279  switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))  switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5280    {    {
# Line 5144  switch (options & (PCRE_NEWLINE_CRLF | P Line 5284  switch (options & (PCRE_NEWLINE_CRLF | P
5284    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
5285         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5286    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
5287      case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5288    default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;    default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5289    }    }
5290    
5291  if (newline < 0)  if (newline == -2)
5292      {
5293      cd->nltype = NLTYPE_ANYCRLF;
5294      }
5295    else if (newline < 0)
5296    {    {
5297    cd->nltype = NLTYPE_ANY;    cd->nltype = NLTYPE_ANY;
5298    }    }
# Line 5208  outside can help speed up starting point Line 5353  outside can help speed up starting point
5353  code = cworkspace;  code = cworkspace;
5354  *code = OP_BRA;  *code = OP_BRA;
5355  (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,  (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5356    &code, &ptr, &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, &length);    &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5357      &length);
5358  if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;  if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5359    
5360  DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,  DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
# Line 5276  ptr = (const uschar *)pattern; Line 5422  ptr = (const uschar *)pattern;
5422  code = (uschar *)codestart;  code = (uschar *)codestart;
5423  *code = OP_BRA;  *code = OP_BRA;
5424  (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,  (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
5425    &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);    &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
5426  re->top_bracket = cd->bracount;  re->top_bracket = cd->bracount;
5427  re->top_backref = cd->top_backref;  re->top_backref = cd->top_backref;
5428    
# Line 5321  if (errorcode != 0) Line 5467  if (errorcode != 0)
5467    (pcre_free)(re);    (pcre_free)(re);
5468    PCRE_EARLY_ERROR_RETURN:    PCRE_EARLY_ERROR_RETURN:
5469    *erroroffset = ptr - (const uschar *)pattern;    *erroroffset = ptr - (const uschar *)pattern;
5470  #ifdef SUPPORT_UTF8    PCRE_EARLY_ERROR_RETURN2:
   PCRE_UTF8_ERROR_RETURN:  
 #endif  
5471    *errorptr = error_texts[errorcode];    *errorptr = error_texts[errorcode];
5472    if (errorcodeptr != NULL) *errorcodeptr = errorcode;    if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5473    return NULL;    return NULL;
# Line 5413  if ((re->options & PCRE_REQCHSET) != 0) Line 5557  if ((re->options & PCRE_REQCHSET) != 0)
5557      else printf("Req char = \\x%02x%s\n", ch, caseless);      else printf("Req char = \\x%02x%s\n", ch, caseless);
5558    }    }
5559    
5560  pcre_printint(re, stdout);  pcre_printint(re, stdout, TRUE);
5561    
5562  /* This check is done here in the debugging case so that the code that  /* This check is done here in the debugging case so that the code that
5563  was compiled can be seen. */  was compiled can be seen. */

Legend:
Removed from v.96  
changed lines
  Added in v.176

  ViewVC Help
Powered by ViewVC 1.1.5