/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 87 by nigel, Sat Feb 24 21:41:21 2007 UTC revision 172 by ph10, Tue Jun 5 10:40:13 2007 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2006 University of Cambridge             Copyright (c) 1997-2007 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45    #define NLBLOCK cd             /* Block containing newline information */
46    #define PSSTART start_pattern  /* Field containing processed string start */
47    #define PSEND   end_pattern    /* Field containing processed string end */
48    
49    
50  #include "pcre_internal.h"  #include "pcre_internal.h"
51    
52    
# Line 53  used by pcretest. DEBUG is not defined w Line 58  used by pcretest. DEBUG is not defined w
58  #endif  #endif
59    
60    
   
61  /*************************************************  /*************************************************
62  *      Code parameters and static tables         *  *      Code parameters and static tables         *
63  *************************************************/  *************************************************/
64    
65  /* Maximum number of items on the nested bracket stacks at compile time. This  /* This value specifies the size of stack workspace that is used during the
66  applies to the nesting of all kinds of parentheses. It does not limit  first pre-compile phase that determines how much memory is required. The regex
67  un-nested, non-capturing parentheses. This number can be made bigger if  is partly compiled into this space, but the compiled parts are discarded as
68  necessary - it is used to dimension one int and one unsigned char vector at  soon as they can be, so that hopefully there will never be an overrun. The code
69  compile time. */  does, however, check for an overrun. The largest amount I've seen used is 218,
70    so this number is very generous.
71    
72    The same workspace is used during the second, actual compile phase for
73    remembering forward references to groups so that they can be filled in at the
74    end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
75    is 4 there is plenty of room. */
76    
77  #define BRASTACK_SIZE 200  #define COMPILE_WORK_SIZE (4096)
78    
79    
80  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 72  are simple data values; negative values Line 82  are simple data values; negative values
82  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
83  is invalid. */  is invalid. */
84    
85  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
86  static const short int escapes[] = {  static const short int escapes[] = {
87       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
88       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
89     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
90       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */       0,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
91  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
92  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
93     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
94       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */       0,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
95  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */
96       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
97  };  };
98    
99  #else         /* This is the "abnormal" table for EBCDIC systems */  #else           /* This is the "abnormal" table for EBCDIC systems */
100  static const short int escapes[] = {  static const short int escapes[] = {
101  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
102  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 97  static const short int escapes[] = { Line 107  static const short int escapes[] = {
107  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
108  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
109  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,
110  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
111  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
112  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,
113  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
# Line 106  static const short int escapes[] = { Line 116  static const short int escapes[] = {
116  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
117  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,
118  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,
119  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
120  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,
121  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
122  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
# Line 155  static const int posix_class_maps[] = { Line 165  static const int posix_class_maps[] = {
165  };  };
166    
167    
168    #define STRING(a)  # a
169    #define XSTRING(s) STRING(s)
170    
171  /* The texts of compile-time error messages. These are "char *" because they  /* The texts of compile-time error messages. These are "char *" because they
172  are passed to the outside world. */  are passed to the outside world. Do not ever re-use any error number, because
173    they are documented. Always add a new error instead. Messages marked DEAD below
174    are no longer used. */
175    
176  static const char *error_texts[] = {  static const char *error_texts[] = {
177    "no error",    "no error",
# Line 171  static const char *error_texts[] = { Line 186  static const char *error_texts[] = {
186    "range out of order in character class",    "range out of order in character class",
187    "nothing to repeat",    "nothing to repeat",
188    /* 10 */    /* 10 */
189    "operand of unlimited repeat could match the empty string",    "operand of unlimited repeat could match the empty string",  /** DEAD **/
190    "internal error: unexpected repeat",    "internal error: unexpected repeat",
191    "unrecognized character after (?",    "unrecognized character after (?",
192    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class",
# Line 181  static const char *error_texts[] = { Line 196  static const char *error_texts[] = {
196    "erroffset passed as NULL",    "erroffset passed as NULL",
197    "unknown option bit(s) set",    "unknown option bit(s) set",
198    "missing ) after comment",    "missing ) after comment",
199    "parentheses nested too deeply",    "parentheses nested too deeply",  /** DEAD **/
200    /* 20 */    /* 20 */
201    "regular expression too large",    "regular expression too large",
202    "failed to get memory",    "failed to get memory",
# Line 190  static const char *error_texts[] = { Line 205  static const char *error_texts[] = {
205    "unrecognized character after (?<",    "unrecognized character after (?<",
206    /* 25 */    /* 25 */
207    "lookbehind assertion is not fixed length",    "lookbehind assertion is not fixed length",
208    "malformed number after (?(",    "malformed number or name after (?(",
209    "conditional group contains more than two branches",    "conditional group contains more than two branches",
210    "assertion expected after (?(",    "assertion expected after (?(",
211    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )",
212    /* 30 */    /* 30 */
213    "unknown POSIX class name",    "unknown POSIX class name",
214    "POSIX collating elements are not supported",    "POSIX collating elements are not supported",
215    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support",
216    "spare error",    "spare error",  /** DEAD **/
217    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large",
218    /* 35 */    /* 35 */
219    "invalid condition (?(0)",    "invalid condition (?(0)",
# Line 209  static const char *error_texts[] = { Line 224  static const char *error_texts[] = {
224    /* 40 */    /* 40 */
225    "recursive call could loop indefinitely",    "recursive call could loop indefinitely",
226    "unrecognized character after (?P",    "unrecognized character after (?P",
227    "syntax error after (?P",    "syntax error in subpattern name (missing terminator)",
228    "two named groups have the same name",    "two named subpatterns have the same name",
229    "invalid UTF-8 string",    "invalid UTF-8 string",
230    /* 45 */    /* 45 */
231    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled",
232    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence",
233    "unknown property name after \\P or \\p"    "unknown property name after \\P or \\p",
234      "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
235      "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
236      /* 50 */
237      "repeated subpattern is too long",
238      "octal value is greater than \\377 (not in UTF-8 mode)",
239      "internal error: overran compiling workspace",
240      "internal error: previously-checked referenced subpattern not found",
241      "DEFINE group contains more than one branch",
242      /* 55 */
243      "repeating a DEFINE group is not allowed",
244      "inconsistent NEWLINE options",
245      "\\g is not followed by a braced name or an optionally braced non-zero number",
246      "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
247  };  };
248    
249    
# Line 235  For convenience, we use the same bit def Line 263  For convenience, we use the same bit def
263    
264  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
265    
266  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
267  static const unsigned char digitab[] =  static const unsigned char digitab[] =
268    {    {
269    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 271  static const unsigned char digitab[] = Line 299  static const unsigned char digitab[] =
299    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
300    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
301    
302  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else           /* This is the "abnormal" case, for EBCDIC systems */
303  static const unsigned char digitab[] =  static const unsigned char digitab[] =
304    {    {
305    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 285  static const unsigned char digitab[] = Line 313  static const unsigned char digitab[] =
313    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
314    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
315    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
316    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
317    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
318    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
319    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 319  static const unsigned char ebcdic_charta Line 347  static const unsigned char ebcdic_charta
347    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
348    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
349    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
350    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
351    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
352    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
353    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 346  static const unsigned char ebcdic_charta Line 374  static const unsigned char ebcdic_charta
374  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
375    
376  static BOOL  static BOOL
377    compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *,
378      int *, int *, branch_chain *, compile_data *);      int *, branch_chain *, compile_data *, int *);
379    
380    
381    
# Line 357  static BOOL Line 385  static BOOL
385    
386  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
387  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
388  encodes one of the more complicated things such as \d. When UTF-8 is enabled,  encodes one of the more complicated things such as \d. A backreference to group
389  a positive value greater than 255 may be returned. On entry, ptr is pointing at  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
390  the \. On exit, it is on the final character of the escape sequence.  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
391    ptr is pointing at the \. On exit, it is on the final character of the escape
392    sequence.
393    
394  Arguments:  Arguments:
395    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
# Line 392  if (c == 0) *errorcodeptr = ERR1; Line 422  if (c == 0) *errorcodeptr = ERR1;
422  a table. A non-zero result is something that can be returned immediately.  a table. A non-zero result is something that can be returned immediately.
423  Otherwise further processing may be required. */  Otherwise further processing may be required. */
424    
425  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
426  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < '0' || c > 'z') {}                           /* Not alphameric */
427  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - '0']) != 0) c = i;
428    
429  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
430  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
431  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
432  #endif  #endif
# Line 406  else if ((i = escapes[c - 0x48]) != 0) Line 436  else if ((i = escapes[c - 0x48]) != 0)
436  else  else
437    {    {
438    const uschar *oldptr;    const uschar *oldptr;
439      BOOL braced, negated;
440    
441    switch (c)    switch (c)
442      {      {
443      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
# Line 419  else Line 451  else
451      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
452      break;      break;
453    
454        /* \g must be followed by a number, either plain or braced. If positive, it
455        is an absolute backreference. If negative, it is a relative backreference.
456        This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
457        reference to a named group. This is part of Perl's movement towards a
458        unified syntax for back references. As this is synonymous with \k{name}, we
459        fudge it up by pretending it really was \k. */
460    
461        case 'g':
462        if (ptr[1] == '{')
463          {
464          const uschar *p;
465          for (p = ptr+2; *p != 0 && *p != '}'; p++)
466            if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
467          if (*p != 0 && *p != '}')
468            {
469            c = -ESC_k;
470            break;
471            }
472          braced = TRUE;
473          ptr++;
474          }
475        else braced = FALSE;
476    
477        if (ptr[1] == '-')
478          {
479          negated = TRUE;
480          ptr++;
481          }
482        else negated = FALSE;
483    
484        c = 0;
485        while ((digitab[ptr[1]] & ctype_digit) != 0)
486          c = c * 10 + *(++ptr) - '0';
487    
488        if (c == 0 || (braced && *(++ptr) != '}'))
489          {
490          *errorcodeptr = ERR57;
491          return 0;
492          }
493    
494        if (negated)
495          {
496          if (c > bracount)
497            {
498            *errorcodeptr = ERR15;
499            return 0;
500            }
501          c = bracount - (c - 1);
502          }
503    
504        c = -(ESC_REF + c);
505        break;
506    
507      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
508      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
509      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 460  else Line 545  else
545        }        }
546    
547      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
548      larger first octal digit. */      larger first octal digit. The original code used just to take the least
549        significant 8 bits of octal numbers (I think this is what early Perls used
550        to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
551        than 3 octal digits. */
552    
553      case '0':      case '0':
554      c -= '0';      c -= '0';
555      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
556          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
557      c &= 255;     /* Take least significant 8 bits */      if (!utf8 && c > 255) *errorcodeptr = ERR51;
558      break;      break;
559    
560      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \x is complicated. \x{ddd} is a character number which can be greater
# Line 486  else Line 574  else
574          if (c == 0 && cc == '0') continue;     /* Leading zeroes */          if (c == 0 && cc == '0') continue;     /* Leading zeroes */
575          count++;          count++;
576    
577  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
578          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= 'a') cc -= 32;               /* Convert to upper case */
579          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
580  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
581          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
582          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
583  #endif  #endif
# Line 513  else Line 601  else
601        {        {
602        int cc;                               /* Some compilers don't like ++ */        int cc;                               /* Some compilers don't like ++ */
603        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                        /* in initializers */
604  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
605        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
606        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
607  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
608        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= 'z') cc += 64;              /* Convert to upper case */
609        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
610  #endif  #endif
611        }        }
612      break;      break;
613    
614      /* Other special escapes not starting with a digit are straightforward */      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
615        This coding is ASCII-specific, but then the whole concept of \cx is
616        ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
617    
618      case 'c':      case 'c':
619      c = *(++ptr);      c = *(++ptr);
# Line 533  else Line 623  else
623        return 0;        return 0;
624        }        }
625    
626      /* A letter is upper-cased; then the 0x40 bit is flipped. This coding  #ifndef EBCDIC  /* ASCII coding */
     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.  
     (However, an EBCDIC equivalent has now been added.) */  
   
 #if !EBCDIC    /* ASCII coding */  
627      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= 'a' && c <= 'z') c -= 32;
628      c ^= 0x40;      c ^= 0x40;
629  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
630      if (c >= 'a' && c <= 'z') c += 64;      if (c >= 'a' && c <= 'z') c += 64;
631      c ^= 0xC0;      c ^= 0xC0;
632  #endif  #endif
# Line 763  return p; Line 849  return p;
849    
850    
851  /*************************************************  /*************************************************
852    *       Find forward referenced subpattern       *
853    *************************************************/
854    
855    /* This function scans along a pattern's text looking for capturing
856    subpatterns, and counting them. If it finds a named pattern that matches the
857    name it is given, it returns its number. Alternatively, if the name is NULL, it
858    returns when it reaches a given numbered subpattern. This is used for forward
859    references to subpatterns. We know that if (?P< is encountered, the name will
860    be terminated by '>' because that is checked in the first pass.
861    
862    Arguments:
863      ptr          current position in the pattern
864      count        current count of capturing parens so far encountered
865      name         name to seek, or NULL if seeking a numbered subpattern
866      lorn         name length, or subpattern number if name is NULL
867      xmode        TRUE if we are in /x mode
868    
869    Returns:       the number of the named subpattern, or -1 if not found
870    */
871    
872    static int
873    find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
874      BOOL xmode)
875    {
876    const uschar *thisname;
877    
878    for (; *ptr != 0; ptr++)
879      {
880      int term;
881    
882      /* Skip over backslashed characters and also entire \Q...\E */
883    
884      if (*ptr == '\\')
885        {
886        if (*(++ptr) == 0) return -1;
887        if (*ptr == 'Q') for (;;)
888          {
889          while (*(++ptr) != 0 && *ptr != '\\');
890          if (*ptr == 0) return -1;
891          if (*(++ptr) == 'E') break;
892          }
893        continue;
894        }
895    
896      /* Skip over character classes */
897    
898      if (*ptr == '[')
899        {
900        while (*(++ptr) != ']')
901          {
902          if (*ptr == '\\')
903            {
904            if (*(++ptr) == 0) return -1;
905            if (*ptr == 'Q') for (;;)
906              {
907              while (*(++ptr) != 0 && *ptr != '\\');
908              if (*ptr == 0) return -1;
909              if (*(++ptr) == 'E') break;
910              }
911            continue;
912            }
913          }
914        continue;
915        }
916    
917      /* Skip comments in /x mode */
918    
919      if (xmode && *ptr == '#')
920        {
921        while (*(++ptr) != 0 && *ptr != '\n');
922        if (*ptr == 0) return -1;
923        continue;
924        }
925    
926      /* An opening parens must now be a real metacharacter */
927    
928      if (*ptr != '(') continue;
929      if (ptr[1] != '?')
930        {
931        count++;
932        if (name == NULL && count == lorn) return count;
933        continue;
934        }
935    
936      ptr += 2;
937      if (*ptr == 'P') ptr++;                      /* Allow optional P */
938    
939      /* We have to disambiguate (?<! and (?<= from (?<name> */
940    
941      if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
942           *ptr != '\'')
943        continue;
944    
945      count++;
946    
947      if (name == NULL && count == lorn) return count;
948      term = *ptr++;
949      if (term == '<') term = '>';
950      thisname = ptr;
951      while (*ptr != term) ptr++;
952      if (name != NULL && lorn == ptr - thisname &&
953          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
954        return count;
955      }
956    
957    return -1;
958    }
959    
960    
961    
962    /*************************************************
963  *      Find first significant op code            *  *      Find first significant op code            *
964  *************************************************/  *************************************************/
965    
# Line 811  for (;;) Line 1008  for (;;)
1008    
1009      case OP_CALLOUT:      case OP_CALLOUT:
1010      case OP_CREF:      case OP_CREF:
1011      case OP_BRANUMBER:      case OP_RREF:
1012        case OP_DEF:
1013      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1014      break;      break;
1015    
# Line 856  for (;;) Line 1054  for (;;)
1054    {    {
1055    int d;    int d;
1056    register int op = *cc;    register int op = *cc;
   if (op >= OP_BRA) op = OP_BRA;  
1057    
1058    switch (op)    switch (op)
1059      {      {
1060        case OP_CBRA:
1061      case OP_BRA:      case OP_BRA:
1062      case OP_ONCE:      case OP_ONCE:
1063      case OP_COND:      case OP_COND:
1064      d = find_fixedlength(cc, options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1065      if (d < 0) return d;      if (d < 0) return d;
1066      branchlength += d;      branchlength += d;
1067      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 898  for (;;) Line 1096  for (;;)
1096      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1097    
1098      case OP_REVERSE:      case OP_REVERSE:
     case OP_BRANUMBER:  
1099      case OP_CREF:      case OP_CREF:
1100        case OP_RREF:
1101        case OP_DEF:
1102      case OP_OPT:      case OP_OPT:
1103      case OP_CALLOUT:      case OP_CALLOUT:
1104      case OP_SOD:      case OP_SOD:
# Line 917  for (;;) Line 1116  for (;;)
1116    
1117      case OP_CHAR:      case OP_CHAR:
1118      case OP_CHARNC:      case OP_CHARNC:
1119        case OP_NOT:
1120      branchlength++;      branchlength++;
1121      cc += 2;      cc += 2;
1122  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 1031  Returns:      pointer to the opcode for Line 1231  Returns:      pointer to the opcode for
1231  static const uschar *  static const uschar *
1232  find_bracket(const uschar *code, BOOL utf8, int number)  find_bracket(const uschar *code, BOOL utf8, int number)
1233  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1234  for (;;)  for (;;)
1235    {    {
1236    register int c = *code;    register int c = *code;
1237    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1238    else if (c > OP_BRA)  
1239      /* XCLASS is used for classes that cannot be represented just by a bit
1240      map. This includes negated single high-valued characters. The length in
1241      the table is zero; the actual length is stored in the compiled code. */
1242    
1243      if (c == OP_XCLASS) code += GET(code, 1);
1244    
1245      /* Handle capturing bracket */
1246    
1247      else if (c == OP_CBRA)
1248      {      {
1249      int n = c - OP_BRA;      int n = GET2(code, 1+LINK_SIZE);
     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  
1250      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1251      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[c];
1252      }      }
1253    
1254      /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1255      a multi-byte character. The length in the table is a minimum, so we have to
1256      arrange to skip the extra bytes. */
1257    
1258    else    else
1259      {      {
1260      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
1261  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1262      if (utf8) switch(c)      if (utf8) switch(c)
1263        {        {
1264        case OP_CHAR:        case OP_CHAR:
# Line 1064  for (;;) Line 1266  for (;;)
1266        case OP_EXACT:        case OP_EXACT:
1267        case OP_UPTO:        case OP_UPTO:
1268        case OP_MINUPTO:        case OP_MINUPTO:
1269          case OP_POSUPTO:
1270        case OP_STAR:        case OP_STAR:
1271        case OP_MINSTAR:        case OP_MINSTAR:
1272          case OP_POSSTAR:
1273        case OP_PLUS:        case OP_PLUS:
1274        case OP_MINPLUS:        case OP_MINPLUS:
1275          case OP_POSPLUS:
1276        case OP_QUERY:        case OP_QUERY:
1277        case OP_MINQUERY:        case OP_MINQUERY:
1278        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1279        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1280        break;        break;
1281        }        }
1282  #endif  #endif
# Line 1105  Returns:      pointer to the opcode for Line 1303  Returns:      pointer to the opcode for
1303  static const uschar *  static const uschar *
1304  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const uschar *code, BOOL utf8)
1305  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1306  for (;;)  for (;;)
1307    {    {
1308    register int c = *code;    register int c = *code;
1309    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1310    else if (c == OP_RECURSE) return code;    if (c == OP_RECURSE) return code;
1311    else if (c > OP_BRA)  
1312      {    /* XCLASS is used for classes that cannot be represented just by a bit
1313      code += _pcre_OP_lengths[OP_BRA];    map. This includes negated single high-valued characters. The length in
1314      }    the table is zero; the actual length is stored in the compiled code. */
1315    
1316      if (c == OP_XCLASS) code += GET(code, 1);
1317    
1318      /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1319      that are followed by a character may be followed by a multi-byte character.
1320      The length in the table is a minimum, so we have to arrange to skip the extra
1321      bytes. */
1322    
1323    else    else
1324      {      {
1325      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
1326  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1327      if (utf8) switch(c)      if (utf8) switch(c)
1328        {        {
1329        case OP_CHAR:        case OP_CHAR:
# Line 1136  for (;;) Line 1331  for (;;)
1331        case OP_EXACT:        case OP_EXACT:
1332        case OP_UPTO:        case OP_UPTO:
1333        case OP_MINUPTO:        case OP_MINUPTO:
1334          case OP_POSUPTO:
1335        case OP_STAR:        case OP_STAR:
1336        case OP_MINSTAR:        case OP_MINSTAR:
1337          case OP_POSSTAR:
1338        case OP_PLUS:        case OP_PLUS:
1339        case OP_MINPLUS:        case OP_MINPLUS:
1340          case OP_POSPLUS:
1341        case OP_QUERY:        case OP_QUERY:
1342        case OP_MINQUERY:        case OP_MINQUERY:
1343        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1344        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1345        break;        break;
1346        }        }
1347  #endif  #endif
# Line 1165  for (;;) Line 1356  for (;;)
1356  *************************************************/  *************************************************/
1357    
1358  /* This function scans through a branch of a compiled pattern to see whether it  /* This function scans through a branch of a compiled pattern to see whether it
1359  can match the empty string or not. It is called only from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1360  below. Note that first_significant_code() skips over assertions. If we hit an  below and from compile_branch() when checking for an unlimited repeat of a
1361  unclosed bracket, we return "empty" - this means we've struck an inner bracket  group that can match nothing. Note that first_significant_code() skips over
1362  whose current branch will already have been scanned.  assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1363    struck an inner bracket whose current branch will already have been scanned.
1364    
1365  Arguments:  Arguments:
1366    code        points to start of search    code        points to start of search
# Line 1182  static BOOL Line 1374  static BOOL
1374  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1375  {  {
1376  register int c;  register int c;
1377  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1378       code < endcode;       code < endcode;
1379       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1380    {    {
# Line 1190  for (code = first_significant_code(code Line 1382  for (code = first_significant_code(code
1382    
1383    c = *code;    c = *code;
1384    
1385    if (c >= OP_BRA)    /* Groups with zero repeats can of course be empty; skip them. */
1386    
1387      if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1388        {
1389        code += _pcre_OP_lengths[c];
1390        do code += GET(code, 1); while (*code == OP_ALT);
1391        c = *code;
1392        continue;
1393        }
1394    
1395      /* For other groups, scan the branches. */
1396    
1397      if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1398      {      {
1399      BOOL empty_branch;      BOOL empty_branch;
1400      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1206  for (code = first_significant_code(code Line 1410  for (code = first_significant_code(code
1410        }        }
1411      while (*code == OP_ALT);      while (*code == OP_ALT);
1412      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
     code += 1 + LINK_SIZE;  
1413      c = *code;      c = *code;
1414        continue;
1415      }      }
1416    
1417    else switch (c)    /* Handle the other opcodes */
1418    
1419      switch (c)
1420      {      {
1421      /* Check for quantifiers after a class */      /* Check for quantifiers after a class */
1422    
# Line 1266  for (code = first_significant_code(code Line 1472  for (code = first_significant_code(code
1472      case OP_NOT:      case OP_NOT:
1473      case OP_PLUS:      case OP_PLUS:
1474      case OP_MINPLUS:      case OP_MINPLUS:
1475        case OP_POSPLUS:
1476      case OP_EXACT:      case OP_EXACT:
1477      case OP_NOTPLUS:      case OP_NOTPLUS:
1478      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
1479        case OP_NOTPOSPLUS:
1480      case OP_NOTEXACT:      case OP_NOTEXACT:
1481      case OP_TYPEPLUS:      case OP_TYPEPLUS:
1482      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
1483        case OP_TYPEPOSPLUS:
1484      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1485      return FALSE;      return FALSE;
1486    
# Line 1283  for (code = first_significant_code(code Line 1492  for (code = first_significant_code(code
1492      case OP_ALT:      case OP_ALT:
1493      return TRUE;      return TRUE;
1494    
1495      /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1496      followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
1497    
1498  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1499      case OP_STAR:      case OP_STAR:
1500      case OP_MINSTAR:      case OP_MINSTAR:
1501        case OP_POSSTAR:
1502      case OP_QUERY:      case OP_QUERY:
1503      case OP_MINQUERY:      case OP_MINQUERY:
1504        case OP_POSQUERY:
1505      case OP_UPTO:      case OP_UPTO:
1506      case OP_MINUPTO:      case OP_MINUPTO:
1507        case OP_POSUPTO:
1508      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1509      break;      break;
1510  #endif  #endif
# Line 1410  earlier groups that are outside the curr Line 1622  earlier groups that are outside the curr
1622  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1623  it, after it has been compiled. This means that any OP_RECURSE items within it  it, after it has been compiled. This means that any OP_RECURSE items within it
1624  that refer to the group itself or any contained groups have to have their  that refer to the group itself or any contained groups have to have their
1625  offsets adjusted. That is the job of this function. Before it is called, the  offsets adjusted. That is one of the jobs of this function. Before it is called,
1626  partially compiled regex must be temporarily terminated with OP_END.  the partially compiled regex must be temporarily terminated with OP_END.
1627    
1628    This function has been extended with the possibility of forward references for
1629    recursions and subroutine calls. It must also check the list of such references
1630    for the group we are dealing with. If it finds that one of the recursions in
1631    the current group is on this list, it adjusts the offset in the list, not the
1632    value in the reference (which is a group number).
1633    
1634  Arguments:  Arguments:
1635    group      points to the start of the group    group      points to the start of the group
1636    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
1637    utf8       TRUE in UTF-8 mode    utf8       TRUE in UTF-8 mode
1638    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
1639      save_hwm   the hwm forward reference pointer at the start of the group
1640    
1641  Returns:     nothing  Returns:     nothing
1642  */  */
1643    
1644  static void  static void
1645  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1646      uschar *save_hwm)
1647  {  {
1648  uschar *ptr = group;  uschar *ptr = group;
1649  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1650    {    {
1651    int offset = GET(ptr, 1);    int offset;
1652    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    uschar *hc;
1653    
1654      /* See if this recursion is on the forward reference list. If so, adjust the
1655      reference. */
1656    
1657      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1658        {
1659        offset = GET(hc, 0);
1660        if (cd->start_code + offset == ptr + 1)
1661          {
1662          PUT(hc, 0, offset + adjust);
1663          break;
1664          }
1665        }
1666    
1667      /* Otherwise, adjust the recursion offset if it's after the start of this
1668      group. */
1669    
1670      if (hc >= cd->hwm)
1671        {
1672        offset = GET(ptr, 1);
1673        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1674        }
1675    
1676    ptr += 1 + LINK_SIZE;    ptr += 1 + LINK_SIZE;
1677    }    }
1678  }  }
# Line 1508  Yield:        TRUE when range returned; Line 1751  Yield:        TRUE when range returned;
1751  */  */
1752    
1753  static BOOL  static BOOL
1754  get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1755      unsigned int *odptr)
1756  {  {
1757  int c, othercase, next;  unsigned int c, othercase, next;
1758    
1759  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
1760    { if ((othercase = _pcre_ucp_othercase(c)) >= 0) break; }    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1761    
1762  if (c > d) return FALSE;  if (c > d) return FALSE;
1763    
# Line 1534  return TRUE; Line 1778  return TRUE;
1778  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1779    
1780    
1781    
1782    /*************************************************
1783    *     Check if auto-possessifying is possible    *
1784    *************************************************/
1785    
1786    /* This function is called for unlimited repeats of certain items, to see
1787    whether the next thing could possibly match the repeated item. If not, it makes
1788    sense to automatically possessify the repeated item.
1789    
1790    Arguments:
1791      op_code       the repeated op code
1792      this          data for this item, depends on the opcode
1793      utf8          TRUE in UTF-8 mode
1794      utf8_char     used for utf8 character bytes, NULL if not relevant
1795      ptr           next character in pattern
1796      options       options bits
1797      cd            contains pointers to tables etc.
1798    
1799    Returns:        TRUE if possessifying is wanted
1800    */
1801    
1802    static BOOL
1803    check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1804      const uschar *ptr, int options, compile_data *cd)
1805    {
1806    int next;
1807    
1808    /* Skip whitespace and comments in extended mode */
1809    
1810    if ((options & PCRE_EXTENDED) != 0)
1811      {
1812      for (;;)
1813        {
1814        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1815        if (*ptr == '#')
1816          {
1817          while (*(++ptr) != 0)
1818            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1819          }
1820        else break;
1821        }
1822      }
1823    
1824    /* If the next item is one that we can handle, get its value. A non-negative
1825    value is a character, a negative value is an escape value. */
1826    
1827    if (*ptr == '\\')
1828      {
1829      int temperrorcode = 0;
1830      next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1831      if (temperrorcode != 0) return FALSE;
1832      ptr++;    /* Point after the escape sequence */
1833      }
1834    
1835    else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1836      {
1837    #ifdef SUPPORT_UTF8
1838      if (utf8) { GETCHARINC(next, ptr); } else
1839    #endif
1840      next = *ptr++;
1841      }
1842    
1843    else return FALSE;
1844    
1845    /* Skip whitespace and comments in extended mode */
1846    
1847    if ((options & PCRE_EXTENDED) != 0)
1848      {
1849      for (;;)
1850        {
1851        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1852        if (*ptr == '#')
1853          {
1854          while (*(++ptr) != 0)
1855            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1856          }
1857        else break;
1858        }
1859      }
1860    
1861    /* If the next thing is itself optional, we have to give up. */
1862    
1863    if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1864      return FALSE;
1865    
1866    /* Now compare the next item with the previous opcode. If the previous is a
1867    positive single character match, "item" either contains the character or, if
1868    "item" is greater than 127 in utf8 mode, the character's bytes are in
1869    utf8_char. */
1870    
1871    
1872    /* Handle cases when the next item is a character. */
1873    
1874    if (next >= 0) switch(op_code)
1875      {
1876      case OP_CHAR:
1877    #ifdef SUPPORT_UTF8
1878      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1879    #endif
1880      return item != next;
1881    
1882      /* For CHARNC (caseless character) we must check the other case. If we have
1883      Unicode property support, we can use it to test the other case of
1884      high-valued characters. */
1885    
1886      case OP_CHARNC:
1887    #ifdef SUPPORT_UTF8
1888      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1889    #endif
1890      if (item == next) return FALSE;
1891    #ifdef SUPPORT_UTF8
1892      if (utf8)
1893        {
1894        unsigned int othercase;
1895        if (next < 128) othercase = cd->fcc[next]; else
1896    #ifdef SUPPORT_UCP
1897        othercase = _pcre_ucp_othercase((unsigned int)next);
1898    #else
1899        othercase = NOTACHAR;
1900    #endif
1901        return (unsigned int)item != othercase;
1902        }
1903      else
1904    #endif  /* SUPPORT_UTF8 */
1905      return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
1906    
1907      /* For OP_NOT, "item" must be a single-byte character. */
1908    
1909      case OP_NOT:
1910      if (next < 0) return FALSE;  /* Not a character */
1911      if (item == next) return TRUE;
1912      if ((options & PCRE_CASELESS) == 0) return FALSE;
1913    #ifdef SUPPORT_UTF8
1914      if (utf8)
1915        {
1916        unsigned int othercase;
1917        if (next < 128) othercase = cd->fcc[next]; else
1918    #ifdef SUPPORT_UCP
1919        othercase = _pcre_ucp_othercase(next);
1920    #else
1921        othercase = NOTACHAR;
1922    #endif
1923        return (unsigned int)item == othercase;
1924        }
1925      else
1926    #endif  /* SUPPORT_UTF8 */
1927      return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
1928    
1929      case OP_DIGIT:
1930      return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1931    
1932      case OP_NOT_DIGIT:
1933      return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1934    
1935      case OP_WHITESPACE:
1936      return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1937    
1938      case OP_NOT_WHITESPACE:
1939      return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1940    
1941      case OP_WORDCHAR:
1942      return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1943    
1944      case OP_NOT_WORDCHAR:
1945      return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1946    
1947      default:
1948      return FALSE;
1949      }
1950    
1951    
1952    /* Handle the case when the next item is \d, \s, etc. */
1953    
1954    switch(op_code)
1955      {
1956      case OP_CHAR:
1957      case OP_CHARNC:
1958    #ifdef SUPPORT_UTF8
1959      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1960    #endif
1961      switch(-next)
1962        {
1963        case ESC_d:
1964        return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
1965    
1966        case ESC_D:
1967        return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
1968    
1969        case ESC_s:
1970        return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
1971    
1972        case ESC_S:
1973        return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
1974    
1975        case ESC_w:
1976        return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
1977    
1978        case ESC_W:
1979        return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
1980    
1981        default:
1982        return FALSE;
1983        }
1984    
1985      case OP_DIGIT:
1986      return next == -ESC_D || next == -ESC_s || next == -ESC_W;
1987    
1988      case OP_NOT_DIGIT:
1989      return next == -ESC_d;
1990    
1991      case OP_WHITESPACE:
1992      return next == -ESC_S || next == -ESC_d || next == -ESC_w;
1993    
1994      case OP_NOT_WHITESPACE:
1995      return next == -ESC_s;
1996    
1997      case OP_WORDCHAR:
1998      return next == -ESC_W || next == -ESC_s;
1999    
2000      case OP_NOT_WORDCHAR:
2001      return next == -ESC_w || next == -ESC_d;
2002    
2003      default:
2004      return FALSE;
2005      }
2006    
2007    /* Control does not reach here */
2008    }
2009    
2010    
2011    
2012  /*************************************************  /*************************************************
2013  *           Compile one branch                   *  *           Compile one branch                   *
2014  *************************************************/  *************************************************/
2015    
2016  /* Scan the pattern, compiling it into the code vector. If the options are  /* Scan the pattern, compiling it into a vector. If the options are
2017  changed during the branch, the pointer is used to change the external options  changed during the branch, the pointer is used to change the external options
2018  bits.  bits. This function is used during the pre-compile phase when we are trying
2019    to find out the amount of memory needed, as well as during the real compile
2020    phase. The value of lengthptr distinguishes the two phases.
2021    
2022  Arguments:  Arguments:
2023    optionsptr     pointer to the option bits    optionsptr     pointer to the option bits
   brackets       points to number of extracting brackets used  
2024    codeptr        points to the pointer to the current code point    codeptr        points to the pointer to the current code point
2025    ptrptr         points to the current pattern pointer    ptrptr         points to the current pattern pointer
2026    errorcodeptr   points to error code variable    errorcodeptr   points to error code variable
# Line 1552  Arguments: Line 2028  Arguments:
2028    reqbyteptr     set to the last literal character required, else < 0    reqbyteptr     set to the last literal character required, else < 0
2029    bcptr          points to current branch chain    bcptr          points to current branch chain
2030    cd             contains pointers to tables etc.    cd             contains pointers to tables etc.
2031      lengthptr      NULL during the real compile phase
2032                     points to length accumulator during pre-compile phase
2033    
2034  Returns:         TRUE on success  Returns:         TRUE on success
2035                   FALSE, with *errorcodeptr set non-zero on error                   FALSE, with *errorcodeptr set non-zero on error
2036  */  */
2037    
2038  static BOOL  static BOOL
2039  compile_branch(int *optionsptr, int *brackets, uschar **codeptr,  compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2040    const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,    int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2041    int *reqbyteptr, branch_chain *bcptr, compile_data *cd)    compile_data *cd, int *lengthptr)
2042  {  {
2043  int repeat_type, op_type;  int repeat_type, op_type;
2044  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
# Line 1569  int greedy_default, greedy_non_default; Line 2047  int greedy_default, greedy_non_default;
2047  int firstbyte, reqbyte;  int firstbyte, reqbyte;
2048  int zeroreqbyte, zerofirstbyte;  int zeroreqbyte, zerofirstbyte;
2049  int req_caseopt, reqvary, tempreqvary;  int req_caseopt, reqvary, tempreqvary;
 int condcount = 0;  
2050  int options = *optionsptr;  int options = *optionsptr;
2051  int after_manual_callout = 0;  int after_manual_callout = 0;
2052    int length_prevgroup = 0;
2053  register int c;  register int c;
2054  register uschar *code = *codeptr;  register uschar *code = *codeptr;
2055    uschar *last_code = code;
2056    uschar *orig_code = code;
2057  uschar *tempcode;  uschar *tempcode;
2058  BOOL inescq = FALSE;  BOOL inescq = FALSE;
2059  BOOL groupsetfirstbyte = FALSE;  BOOL groupsetfirstbyte = FALSE;
# Line 1581  const uschar *ptr = *ptrptr; Line 2061  const uschar *ptr = *ptrptr;
2061  const uschar *tempptr;  const uschar *tempptr;
2062  uschar *previous = NULL;  uschar *previous = NULL;
2063  uschar *previous_callout = NULL;  uschar *previous_callout = NULL;
2064    uschar *save_hwm = NULL;
2065  uschar classbits[32];  uschar classbits[32];
2066    
2067  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 1590  uschar *class_utf8data; Line 2071  uschar *class_utf8data;
2071  uschar utf8_char[6];  uschar utf8_char[6];
2072  #else  #else
2073  BOOL utf8 = FALSE;  BOOL utf8 = FALSE;
2074    uschar *utf8_char = NULL;
2075    #endif
2076    
2077    #ifdef DEBUG
2078    if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2079  #endif  #endif
2080    
2081  /* Set up the default and non-default settings for greediness */  /* Set up the default and non-default settings for greediness */
# Line 1623  for (;; ptr++) Line 2109  for (;; ptr++)
2109    BOOL negate_class;    BOOL negate_class;
2110    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2111    BOOL is_quantifier;    BOOL is_quantifier;
2112      BOOL is_recurse;
2113    int class_charcount;    int class_charcount;
2114    int class_lastchar;    int class_lastchar;
2115    int newoptions;    int newoptions;
2116    int recno;    int recno;
2117      int refsign;
2118    int skipbytes;    int skipbytes;
2119    int subreqbyte;    int subreqbyte;
2120    int subfirstbyte;    int subfirstbyte;
2121      int terminator;
2122    int mclength;    int mclength;
2123    uschar mcbuffer[8];    uschar mcbuffer[8];
2124    
2125    /* Next byte in the pattern */    /* Get next byte in the pattern */
2126    
2127    c = *ptr;    c = *ptr;
2128    
2129    /* If in \Q...\E, check for the end; if not, we have a literal */    /* If we are in the pre-compile phase, accumulate the length used for the
2130      previous cycle of this loop. */
2131    
2132    if (inescq && c != 0)    if (lengthptr != NULL)
2133      {      {
2134      if (c == '\\' && ptr[1] == 'E')  #ifdef DEBUG
2135        if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2136    #endif
2137        if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2138        {        {
2139        inescq = FALSE;        *errorcodeptr = ERR52;
2140        ptr++;        goto FAILED;
2141          }
2142    
2143        /* There is at least one situation where code goes backwards: this is the
2144        case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2145        the class is simply eliminated. However, it is created first, so we have to
2146        allow memory for it. Therefore, don't ever reduce the length at this point.
2147        */
2148    
2149        if (code < last_code) code = last_code;
2150        *lengthptr += code - last_code;
2151        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2152    
2153        /* If "previous" is set and it is not at the start of the work space, move
2154        it back to there, in order to avoid filling up the work space. Otherwise,
2155        if "previous" is NULL, reset the current code pointer to the start. */
2156    
2157        if (previous != NULL)
2158          {
2159          if (previous > orig_code)
2160            {
2161            memmove(orig_code, previous, code - previous);
2162            code -= previous - orig_code;
2163            previous = orig_code;
2164            }
2165          }
2166        else code = orig_code;
2167    
2168        /* Remember where this code item starts so we can pick up the length
2169        next time round. */
2170    
2171        last_code = code;
2172        }
2173    
2174      /* In the real compile phase, just check the workspace used by the forward
2175      reference list. */
2176    
2177      else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2178        {
2179        *errorcodeptr = ERR52;
2180        goto FAILED;
2181        }
2182    
2183      /* If in \Q...\E, check for the end; if not, we have a literal */
2184    
2185      if (inescq && c != 0)
2186        {
2187        if (c == '\\' && ptr[1] == 'E')
2188          {
2189          inescq = FALSE;
2190          ptr++;
2191        continue;        continue;
2192        }        }
2193      else      else
2194        {        {
2195        if (previous_callout != NULL)        if (previous_callout != NULL)
2196          {          {
2197          complete_callout(previous_callout, ptr, cd);          if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2198              complete_callout(previous_callout, ptr, cd);
2199          previous_callout = NULL;          previous_callout = NULL;
2200          }          }
2201        if ((options & PCRE_AUTO_CALLOUT) != 0)        if ((options & PCRE_AUTO_CALLOUT) != 0)
# Line 1672  for (;; ptr++) Line 2216  for (;; ptr++)
2216    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2217         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
2218      {      {
2219      complete_callout(previous_callout, ptr, cd);      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2220          complete_callout(previous_callout, ptr, cd);
2221      previous_callout = NULL;      previous_callout = NULL;
2222      }      }
2223    
# Line 1683  for (;; ptr++) Line 2228  for (;; ptr++)
2228      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2229      if (c == '#')      if (c == '#')
2230        {        {
2231        /* The space before the ; is to avoid a warning on a silly compiler        while (*(++ptr) != 0)
2232        on the Macintosh. */          {
2233        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2234        if (c != 0) continue;   /* Else fall through to handle end of string */          }
2235          if (*ptr != 0) continue;
2236    
2237          /* Else fall through to handle end of string */
2238          c = 0;
2239        }        }
2240      }      }
2241    
# Line 1700  for (;; ptr++) Line 2249  for (;; ptr++)
2249    
2250    switch(c)    switch(c)
2251      {      {
2252      /* The branch terminates at end of string, |, or ). */      /* ===================================================================*/
2253        case 0:                        /* The branch terminates at string end */
2254      case 0:      case '|':                      /* or | or ) */
     case '|':  
2255      case ')':      case ')':
2256      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2257      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2258      *codeptr = code;      *codeptr = code;
2259      *ptrptr = ptr;      *ptrptr = ptr;
2260        if (lengthptr != NULL)
2261          {
2262          *lengthptr += code - last_code;   /* To include callout length */
2263          DPRINTF((">> end branch\n"));
2264          }
2265      return TRUE;      return TRUE;
2266    
2267    
2268        /* ===================================================================*/
2269      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2270      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2271    
# Line 1739  for (;; ptr++) Line 2294  for (;; ptr++)
2294      *code++ = OP_ANY;      *code++ = OP_ANY;
2295      break;      break;
2296    
2297    
2298        /* ===================================================================*/
2299      /* Character classes. If the included characters are all < 256, we build a      /* Character classes. If the included characters are all < 256, we build a
2300      32-byte bitmap of the permitted characters, except in the special case      32-byte bitmap of the permitted characters, except in the special case
2301      where there is only one such character. For negated classes, we build the      where there is only one such character. For negated classes, we build the
# Line 1777  for (;; ptr++) Line 2334  for (;; ptr++)
2334        }        }
2335    
2336      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2337      of just a single character (as long as it's < 256). For higher valued UTF-8      of just a single character (as long as it's < 256). However, for higher
2338      characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
2339    
2340      class_charcount = 0;      class_charcount = 0;
2341      class_lastchar = -1;      class_lastchar = -1;
2342    
2343        /* Initialize the 32-char bit map to all zeros. We build the map in a
2344        temporary bit of memory, in case the class contains only 1 character (less
2345        than 256), because in that case the compiled code doesn't use the bit map.
2346        */
2347    
2348        memset(classbits, 0, 32 * sizeof(uschar));
2349    
2350  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2351      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2352      class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2353  #endif  #endif
2354    
     /* Initialize the 32-char bit map to all zeros. We have to build the  
     map in a temporary bit of store, in case the class contains only 1  
     character (< 256), because in that case the compiled code doesn't use the  
     bit map. */  
   
     memset(classbits, 0, 32 * sizeof(uschar));  
   
2355      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
2356      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. At the start of the
2357      through the regex checked the overall syntax, so we don't need to be very      loop, c contains the first byte of the character. */
     strict here. At the start of the loop, c contains the first byte of the  
     character. */  
2358    
2359      do      if (c != 0) do
2360        {        {
2361          const uschar *oldptr;
2362    
2363  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2364        if (utf8 && c > 127)        if (utf8 && c > 127)
2365          {                           /* Braces are required because the */          {                           /* Braces are required because the */
# Line 1814  for (;; ptr++) Line 2371  for (;; ptr++)
2371    
2372        if (inescq)        if (inescq)
2373          {          {
2374          if (c == '\\' && ptr[1] == 'E')          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2375            {            {
2376            inescq = FALSE;            inescq = FALSE;                   /* Reset literal state */
2377            ptr++;            ptr++;                            /* Skip the 'E' */
2378            continue;            continue;                         /* Carry on with next */
2379            }            }
2380          else goto LONE_SINGLE_CHARACTER;          goto CHECK_RANGE;                   /* Could be range if \E follows */
2381          }          }
2382    
2383        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
# Line 1911  for (;; ptr++) Line 2468  for (;; ptr++)
2468          }          }
2469    
2470        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
2471        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. The sequence \b is a special
2472        validity in the pre-compiling pass. The sequence \b is a special case.        case. Inside a class (and only there) it is treated as backspace.
2473        Inside a class (and only there) it is treated as backspace. Elsewhere        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2474        it marks a word boundary. Other escapes have preset maps ready to        to or into the one we are building. We assume they have more than one
       or into the one we are building. We assume they have more than one  
2475        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2476    
2477        if (c == '\\')        if (c == '\\')
2478          {          {
2479          c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2480            if (*errorcodeptr != 0) goto FAILED;
2481    
2482          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2483          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2484            else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2485          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
2486            {            {
2487            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == '\\' && ptr[2] == 'E')
# Line 1938  for (;; ptr++) Line 2496  for (;; ptr++)
2496            {            {
2497            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
2498            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
2499            switch (-c)  
2500              /* Save time by not doing this in the pre-compile phase. */
2501    
2502              if (lengthptr == NULL) switch (-c)
2503              {              {
2504              case ESC_d:              case ESC_d:
2505              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
# Line 1966  for (;; ptr++) Line 2527  for (;; ptr++)
2527              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2528              continue;              continue;
2529    
2530                case ESC_E: /* Perl ignores an orphan \E */
2531                continue;
2532    
2533                default:    /* Not recognized; fall through */
2534                break;      /* Need "default" setting to stop compiler warning. */
2535                }
2536    
2537              /* In the pre-compile phase, just do the recognition. */
2538    
2539              else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2540                       c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2541    
2542              /* We need to deal with \P and \p in both phases. */
2543    
2544  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2545              case ESC_p:            if (-c == ESC_p || -c == ESC_P)
2546              case ESC_P:              {
2547                {              BOOL negated;
2548                BOOL negated;              int pdata;
2549                int pdata;              int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2550                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);              if (ptype < 0) goto FAILED;
2551                if (ptype < 0) goto FAILED;              class_utf8 = TRUE;
2552                class_utf8 = TRUE;              *class_utf8data++ = ((-c == ESC_p) != negated)?
2553                *class_utf8data++ = ((-c == ESC_p) != negated)?                XCL_PROP : XCL_NOTPROP;
2554                  XCL_PROP : XCL_NOTPROP;              *class_utf8data++ = ptype;
2555                *class_utf8data++ = ptype;              *class_utf8data++ = pdata;
2556                *class_utf8data++ = pdata;              class_charcount -= 2;   /* Not a < 256 character */
               class_charcount -= 2;   /* Not a < 256 character */  
               }  
2557              continue;              continue;
2558                }
2559  #endif  #endif
2560              /* Unrecognized escapes are faulted if PCRE is running in its
2561              strict mode. By default, for compatibility with Perl, they are
2562              treated as literals. */
2563    
2564              /* Unrecognized escapes are faulted if PCRE is running in its            if ((options & PCRE_EXTRA) != 0)
2565              strict mode. By default, for compatibility with Perl, they are              {
2566              treated as literals. */              *errorcodeptr = ERR7;
2567                goto FAILED;
             default:  
             if ((options & PCRE_EXTRA) != 0)  
               {  
               *errorcodeptr = ERR7;  
               goto FAILED;  
               }  
             c = *ptr;              /* The final character */  
             class_charcount -= 2;  /* Undo the default count from above */  
2568              }              }
2569    
2570              class_charcount -= 2;  /* Undo the default count from above */
2571              c = *ptr;              /* Get the final character and fall through */
2572            }            }
2573    
2574          /* Fall through if we have a single character (c >= 0). This may be          /* Fall through if we have a single character (c >= 0). This may be
2575          > 256 in UTF-8 mode. */          greater than 256 in UTF-8 mode. */
2576    
2577          }   /* End of backslash handling */          }   /* End of backslash handling */
2578    
2579        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
2580        Perl does not permit ']' to be the end of the range. A '-' character        Perl does not permit ']' to be the end of the range. A '-' character
2581        here is treated as a literal. */        at the end is treated as a literal. Perl ignores orphaned \E sequences
2582          entirely. The code for handling \Q and \E is messy. */
2583    
2584          CHECK_RANGE:
2585          while (ptr[1] == '\\' && ptr[2] == 'E')
2586            {
2587            inescq = FALSE;
2588            ptr += 2;
2589            }
2590    
2591          oldptr = ptr;
2592    
2593        if (ptr[1] == '-' && ptr[2] != ']')        if (!inescq && ptr[1] == '-')
2594          {          {
2595          int d;          int d;
2596          ptr += 2;          ptr += 2;
2597            while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2598    
2599            /* If we hit \Q (not followed by \E) at this point, go into escaped
2600            mode. */
2601    
2602            while (*ptr == '\\' && ptr[1] == 'Q')
2603              {
2604              ptr += 2;
2605              if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2606              inescq = TRUE;
2607              break;
2608              }
2609    
2610            if (*ptr == 0 || (!inescq && *ptr == ']'))
2611              {
2612              ptr = oldptr;
2613              goto LONE_SINGLE_CHARACTER;
2614              }
2615    
2616  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2617          if (utf8)          if (utf8)
# Line 2026  for (;; ptr++) Line 2626  for (;; ptr++)
2626          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2627          in such circumstances. */          in such circumstances. */
2628    
2629          if (d == '\\')          if (!inescq && d == '\\')
2630            {            {
2631            const uschar *oldptr = ptr;            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2632            d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);            if (*errorcodeptr != 0) goto FAILED;
2633    
2634            /* \b is backspace; \X is literal X; any other special means the '-'            /* \b is backspace; \X is literal X; \R is literal R; any other
2635            was literal */            special means the '-' was literal */
2636    
2637            if (d < 0)            if (d < 0)
2638              {              {
2639              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = '\b';
2640              else if (d == -ESC_X) d = 'X'; else              else if (d == -ESC_X) d = 'X';
2641                else if (d == -ESC_R) d = 'R'; else
2642                {                {
2643                ptr = oldptr - 2;                ptr = oldptr;
2644                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2645                }                }
2646              }              }
2647            }            }
2648    
2649          /* The check that the two values are in the correct order happens in          /* Check that the two values are in the correct order. Optimize
2650          the pre-pass. Optimize one-character ranges */          one-character ranges */
2651    
2652            if (d < c)
2653              {
2654              *errorcodeptr = ERR8;
2655              goto FAILED;
2656              }
2657    
2658          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2659    
# Line 2067  for (;; ptr++) Line 2674  for (;; ptr++)
2674  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2675            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
2676              {              {
2677              int occ, ocd;              unsigned int occ, ocd;
2678              int cc = c;              unsigned int cc = c;
2679              int origd = d;              unsigned int origd = d;
2680              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
2681                {                {
2682                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */
# Line 2127  for (;; ptr++) Line 2734  for (;; ptr++)
2734          ranges that lie entirely within 0-127 when there is UCP support; else          ranges that lie entirely within 0-127 when there is UCP support; else
2735          for partial ranges without UCP support. */          for partial ranges without UCP support. */
2736    
2737          for (; c <= d; c++)          class_charcount += d - c + 1;
2738            class_lastchar = d;
2739    
2740            /* We can save a bit of time by skipping this in the pre-compile. */
2741    
2742            if (lengthptr == NULL) for (; c <= d; c++)
2743            {            {
2744            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
2745            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
# Line 2135  for (;; ptr++) Line 2747  for (;; ptr++)
2747              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
2748              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
2749              }              }
           class_charcount++;                /* in case a one-char range */  
           class_lastchar = c;  
2750            }            }
2751    
2752          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
# Line 2160  for (;; ptr++) Line 2770  for (;; ptr++)
2770  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2771          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
2772            {            {
2773            int othercase;            unsigned int othercase;
2774            if ((othercase = _pcre_ucp_othercase(c)) >= 0)            if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
2775              {              {
2776              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
2777              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 2186  for (;; ptr++) Line 2796  for (;; ptr++)
2796          }          }
2797        }        }
2798    
2799      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached. This "while" is the end of the "do" above. */
     loop. This "while" is the end of the "do" above. */  
2800    
2801      while ((c = *(++ptr)) != ']' || inescq);      while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
2802    
2803        if (c == 0)                          /* Missing terminating ']' */
2804          {
2805          *errorcodeptr = ERR6;
2806          goto FAILED;
2807          }
2808    
2809      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
2810      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
# Line 2253  for (;; ptr++) Line 2868  for (;; ptr++)
2868    
2869      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
2870      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode. If there are no characters < 256,
2871      we can omit the bitmap. */      we can omit the bitmap in the actual compiled code. */
2872    
2873  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2874      if (class_utf8)      if (class_utf8)
# Line 2263  for (;; ptr++) Line 2878  for (;; ptr++)
2878        code += LINK_SIZE;        code += LINK_SIZE;
2879        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT : 0;
2880    
2881        /* If the map is required, install it, and move on to the end of        /* If the map is required, move up the extra data to make room for it;
2882        the extra data */        otherwise just move the code pointer to the end of the extra data. */
2883    
2884        if (class_charcount > 0)        if (class_charcount > 0)
2885          {          {
2886          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
2887            memmove(code + 32, code, class_utf8data - code);
2888          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
2889          code = class_utf8data;          code = class_utf8data + 32;
         }  
   
       /* If the map is not required, slide down the extra data. */  
   
       else  
         {  
         int len = class_utf8data - (code + 33);  
         memmove(code + 1, code + 33, len);  
         code += len + 1;  
2890          }          }
2891          else code = class_utf8data;
2892    
2893        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
2894    
# Line 2297  for (;; ptr++) Line 2905  for (;; ptr++)
2905      if (negate_class)      if (negate_class)
2906        {        {
2907        *code++ = OP_NCLASS;        *code++ = OP_NCLASS;
2908        for (c = 0; c < 32; c++) code[c] = ~classbits[c];        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
2909            for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2910        }        }
2911      else      else
2912        {        {
# Line 2307  for (;; ptr++) Line 2916  for (;; ptr++)
2916      code += 32;      code += 32;
2917      break;      break;
2918    
2919    
2920        /* ===================================================================*/
2921      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2922      has been tested above. */      has been tested above. */
2923    
# Line 2374  for (;; ptr++) Line 2985  for (;; ptr++)
2985        }        }
2986      else repeat_type = greedy_default;      else repeat_type = greedy_default;
2987    
     /* If previous was a recursion, we need to wrap it inside brackets so that  
     it can be replicated if necessary. */  
   
     if (*previous == OP_RECURSE)  
       {  
       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);  
       code += 1 + LINK_SIZE;  
       *previous = OP_BRA;  
       PUT(previous, 1, code - previous);  
       *code = OP_KET;  
       PUT(code, 1, code - previous);  
       code += 1 + LINK_SIZE;  
       }  
   
2988      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
2989      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
2990      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
# Line 2421  for (;; ptr++) Line 3018  for (;; ptr++)
3018          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3019          }          }
3020    
3021          /* If the repetition is unlimited, it pays to see if the next thing on
3022          the line is something that cannot possibly match this character. If so,
3023          automatically possessifying this item gains some performance in the case
3024          where the match fails. */
3025    
3026          if (!possessive_quantifier &&
3027              repeat_max < 0 &&
3028              check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3029                options, cd))
3030            {
3031            repeat_type = 0;    /* Force greedy */
3032            possessive_quantifier = TRUE;
3033            }
3034    
3035        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3036        }        }
3037    
3038      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
3039      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
3040      character repeats by setting opt_type to add a suitable offset into      character repeats by setting opt_type to add a suitable offset into
3041      repeat_type. OP_NOT is currently used only for single-byte chars. */      repeat_type. We can also test for auto-possessification. OP_NOT is
3042        currently used only for single-byte chars. */
3043    
3044      else if (*previous == OP_NOT)      else if (*previous == OP_NOT)
3045        {        {
3046        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3047        c = previous[1];        c = previous[1];
3048          if (!possessive_quantifier &&
3049              repeat_max < 0 &&
3050              check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3051            {
3052            repeat_type = 0;    /* Force greedy */
3053            possessive_quantifier = TRUE;
3054            }
3055        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
3056        }        }
3057    
# Line 2450  for (;; ptr++) Line 3069  for (;; ptr++)
3069        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3070        c = *previous;        c = *previous;
3071    
3072          if (!possessive_quantifier &&
3073              repeat_max < 0 &&
3074              check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3075            {
3076            repeat_type = 0;    /* Force greedy */
3077            possessive_quantifier = TRUE;
3078            }
3079    
3080        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
3081        if (*previous == OP_PROP || *previous == OP_NOTPROP)        if (*previous == OP_PROP || *previous == OP_NOTPROP)
3082          {          {
# Line 2490  for (;; ptr++) Line 3117  for (;; ptr++)
3117          }          }
3118    
3119        /* A repeat minimum of 1 is optimized into some special cases. If the        /* A repeat minimum of 1 is optimized into some special cases. If the
3120        maximum is unlimited, we use OP_PLUS. Otherwise, the original item it        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3121        left in place and, if the maximum is greater than 1, we use OP_UPTO with        left in place and, if the maximum is greater than 1, we use OP_UPTO with
3122        one less than the maximum. */        one less than the maximum. */
3123    
# Line 2543  for (;; ptr++) Line 3170  for (;; ptr++)
3170            }            }
3171    
3172          /* Else insert an UPTO if the max is greater than the min, again          /* Else insert an UPTO if the max is greater than the min, again
3173          preceded by the character, for the previously inserted code. */          preceded by the character, for the previously inserted code. If the
3174            UPTO is just for 1 instance, we can use QUERY instead. */
3175    
3176          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
3177            {            {
# Line 2562  for (;; ptr++) Line 3190  for (;; ptr++)
3190              *code++ = prop_value;              *code++ = prop_value;
3191              }              }
3192            repeat_max -= repeat_min;            repeat_max -= repeat_min;
3193            *code++ = OP_UPTO + repeat_type;  
3194            PUT2INC(code, 0, repeat_max);            if (repeat_max == 1)
3195                {
3196                *code++ = OP_QUERY + repeat_type;
3197                }
3198              else
3199                {
3200                *code++ = OP_UPTO + repeat_type;
3201                PUT2INC(code, 0, repeat_max);
3202                }
3203            }            }
3204          }          }
3205    
# Line 2630  for (;; ptr++) Line 3266  for (;; ptr++)
3266      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
3267      cases. */      cases. */
3268    
3269      else if (*previous >= OP_BRA || *previous == OP_ONCE ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3270               *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
3271        {        {
3272        register int i;        register int i;
3273        int ketoffset = 0;        int ketoffset = 0;
3274        int len = code - previous;        int len = code - previous;
3275        uschar *bralink = NULL;        uschar *bralink = NULL;
3276    
3277          /* Repeating a DEFINE group is pointless */
3278    
3279          if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3280            {
3281            *errorcodeptr = ERR55;
3282            goto FAILED;
3283            }
3284    
3285          /* This is a paranoid check to stop integer overflow later on */
3286    
3287          if (len > MAX_DUPLENGTH)
3288            {
3289            *errorcodeptr = ERR50;
3290            goto FAILED;
3291            }
3292    
3293        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3294        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3295        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 2672  for (;; ptr++) Line 3324  for (;; ptr++)
3324          /* If the maximum is 1 or unlimited, we just have to stick in the          /* If the maximum is 1 or unlimited, we just have to stick in the
3325          BRAZERO and do no more at this point. However, we do need to adjust          BRAZERO and do no more at this point. However, we do need to adjust
3326          any OP_RECURSE calls inside the group that refer to the group itself or          any OP_RECURSE calls inside the group that refer to the group itself or
3327          any internal group, because the offset is from the start of the whole          any internal or forward referenced group, because the offset is from
3328          regex. Temporarily terminate the pattern while doing this. */          the start of the whole regex. Temporarily terminate the pattern while
3329            doing this. */
3330    
3331          if (repeat_max <= 1)          if (repeat_max <= 1)
3332            {            {
3333            *code = OP_END;            *code = OP_END;
3334            adjust_recurse(previous, 1, utf8, cd);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3335            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3336            code++;            code++;
3337            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2696  for (;; ptr++) Line 3349  for (;; ptr++)
3349            {            {
3350            int offset;            int offset;
3351            *code = OP_END;            *code = OP_END;
3352            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3353            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
3354            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
3355            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2716  for (;; ptr++) Line 3369  for (;; ptr++)
3369        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
3370        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
3371        copies that we need. If we set a first char from the group, and didn't        copies that we need. If we set a first char from the group, and didn't
3372        set a required char, copy the latter from the former. */        set a required char, copy the latter from the former. If there are any
3373          forward reference subroutine calls in the group, there will be entries on
3374          the workspace list; replicate these with an appropriate increment. */
3375    
3376        else        else
3377          {          {
3378          if (repeat_min > 1)          if (repeat_min > 1)
3379            {            {
3380            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;            /* In the pre-compile phase, we don't actually do the replication. We
3381            for (i = 1; i < repeat_min; i++)            just adjust the length as if we had. */
3382    
3383              if (lengthptr != NULL)
3384                *lengthptr += (repeat_min - 1)*length_prevgroup;
3385    
3386              /* This is compiling for real */
3387    
3388              else
3389              {              {
3390              memcpy(code, previous, len);              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3391              code += len;              for (i = 1; i < repeat_min; i++)
3392                  {
3393                  uschar *hc;
3394                  uschar *this_hwm = cd->hwm;
3395                  memcpy(code, previous, len);
3396                  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3397                    {
3398                    PUT(cd->hwm, 0, GET(hc, 0) + len);
3399                    cd->hwm += LINK_SIZE;
3400                    }
3401                  save_hwm = this_hwm;
3402                  code += len;
3403                  }
3404              }              }
3405            }            }
3406    
3407          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
3408          }          }
3409    
# Line 2736  for (;; ptr++) Line 3411  for (;; ptr++)
3411        the maximum is limited, it replicates the group in a nested fashion,        the maximum is limited, it replicates the group in a nested fashion,
3412        remembering the bracket starts on a stack. In the case of a zero minimum,        remembering the bracket starts on a stack. In the case of a zero minimum,
3413        the first one was set up above. In all cases the repeat_max now specifies        the first one was set up above. In all cases the repeat_max now specifies
3414        the number of additional copies needed. */        the number of additional copies needed. Again, we must remember to
3415          replicate entries on the forward reference list. */
3416    
3417        if (repeat_max >= 0)        if (repeat_max >= 0)
3418          {          {
3419          for (i = repeat_max - 1; i >= 0; i--)          /* In the pre-compile phase, we don't actually do the replication. We
3420            just adjust the length as if we had. For each repetition we must add 1
3421            to the length for BRAZERO and for all but the last repetition we must
3422            add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3423    
3424            if (lengthptr != NULL && repeat_max > 0)
3425              *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3426                2 - 2*LINK_SIZE;  /* Last one doesn't nest */
3427    
3428            /* This is compiling for real */
3429    
3430            else for (i = repeat_max - 1; i >= 0; i--)
3431            {            {
3432              uschar *hc;
3433              uschar *this_hwm = cd->hwm;
3434    
3435            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
3436    
3437            /* All but the final copy start a new nesting, maintaining the            /* All but the final copy start a new nesting, maintaining the
# Line 2757  for (;; ptr++) Line 3447  for (;; ptr++)
3447              }              }
3448    
3449            memcpy(code, previous, len);            memcpy(code, previous, len);
3450              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3451                {
3452                PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3453                cd->hwm += LINK_SIZE;
3454                }
3455              save_hwm = this_hwm;
3456            code += len;            code += len;
3457            }            }
3458    
# Line 2779  for (;; ptr++) Line 3475  for (;; ptr++)
3475        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
3476        can't just offset backwards from the current code point, because we        can't just offset backwards from the current code point, because we
3477        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
3478        correct offset was computed above. */        correct offset was computed above.
3479    
3480        else code[-ketoffset] = OP_KETRMAX + repeat_type;        Then, when we are doing the actual compile phase, check to see whether
3481          this group is a non-atomic one that could match an empty string. If so,
3482          convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3483          that runtime checking can be done. [This check is also applied to
3484          atomic groups at runtime, but in a different way.] */
3485    
3486          else
3487            {
3488            uschar *ketcode = code - ketoffset;
3489            uschar *bracode = ketcode - GET(ketcode, 1);
3490            *ketcode = OP_KETRMAX + repeat_type;
3491            if (lengthptr == NULL && *bracode != OP_ONCE)
3492              {
3493              uschar *scode = bracode;
3494              do
3495                {
3496                if (could_be_empty_branch(scode, ketcode, utf8))
3497                  {
3498                  *bracode += OP_SBRA - OP_BRA;
3499                  break;
3500                  }
3501                scode += GET(scode, 1);
3502                }
3503              while (*scode == OP_ALT);
3504              }
3505            }
3506        }        }
3507    
3508      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 2792  for (;; ptr++) Line 3513  for (;; ptr++)
3513        goto FAILED;        goto FAILED;
3514        }        }
3515    
3516      /* If the character following a repeat is '+', we wrap the entire repeated      /* If the character following a repeat is '+', or if certain optimization
3517      item inside OP_ONCE brackets. This is just syntactic sugar, taken from      tests above succeeded, possessive_quantifier is TRUE. For some of the
3518      Sun's Java package. The repeated item starts at tempcode, not at previous,      simpler opcodes, there is a special alternative opcode for this. For
3519      which might be the first part of a string whose (former) last char we      anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3520      repeated. However, we don't support '+' after a greediness '?'. */      The '+' notation is just syntactic sugar, taken from Sun's Java package,
3521        but the special opcodes can optimize it a bit. The repeated item starts at
3522        tempcode, not at previous, which might be the first part of a string whose
3523        (former) last char we repeated.
3524    
3525        Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3526        an 'upto' may follow. We skip over an 'exact' item, and then test the
3527        length of what remains before proceeding. */
3528    
3529      if (possessive_quantifier)      if (possessive_quantifier)
3530        {        {
3531        int len = code - tempcode;        int len;
3532        memmove(tempcode + 1+LINK_SIZE, tempcode, len);        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3533        code += 1 + LINK_SIZE;            *tempcode == OP_NOTEXACT)
3534        len += 1 + LINK_SIZE;          tempcode += _pcre_OP_lengths[*tempcode];
3535        tempcode[0] = OP_ONCE;        len = code - tempcode;
3536        *code++ = OP_KET;        if (len > 0) switch (*tempcode)
3537        PUTINC(code, 0, len);          {
3538        PUT(tempcode, 1, len);          case OP_STAR:  *tempcode = OP_POSSTAR; break;
3539            case OP_PLUS:  *tempcode = OP_POSPLUS; break;
3540            case OP_QUERY: *tempcode = OP_POSQUERY; break;
3541            case OP_UPTO:  *tempcode = OP_POSUPTO; break;
3542    
3543            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
3544            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
3545            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3546            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
3547    
3548            case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
3549            case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
3550            case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3551            case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
3552    
3553            default:
3554            memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3555            code += 1 + LINK_SIZE;
3556            len += 1 + LINK_SIZE;
3557            tempcode[0] = OP_ONCE;
3558            *code++ = OP_KET;
3559            PUTINC(code, 0, len);
3560            PUT(tempcode, 1, len);
3561            break;
3562            }
3563        }        }
3564    
3565      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 2820  for (;; ptr++) Line 3572  for (;; ptr++)
3572      break;      break;
3573    
3574    
3575      /* Start of nested bracket sub-expression, or comment or lookahead or      /* ===================================================================*/
3576      lookbehind or option setting or condition. First deal with special things      /* Start of nested parenthesized sub-expression, or comment or lookahead or
3577      that can come after a bracket; all are introduced by ?, and the appearance      lookbehind or option setting or condition or all the other extended
3578      of any of them means that this is not a referencing group. They were      parenthesis forms. First deal with the specials; all are introduced by ?,
3579      checked for validity in the first pass over the string, so we don't have to      and the appearance of any of them means that this is not a capturing
3580      check for syntax errors here.  */      group. */
3581    
3582      case '(':      case '(':
3583      newoptions = options;      newoptions = options;
3584      skipbytes = 0;      skipbytes = 0;
3585        bravalue = OP_CBRA;
3586        save_hwm = cd->hwm;
3587    
3588      if (*(++ptr) == '?')      if (*(++ptr) == '?')
3589        {        {
3590        int set, unset;        int i, set, unset, namelen;
3591        int *optset;        int *optset;
3592          const uschar *name;
3593          uschar *slot;
3594    
3595        switch (*(++ptr))        switch (*(++ptr))
3596          {          {
3597          case '#':                 /* Comment; skip to ket */          case '#':                 /* Comment; skip to ket */
3598          ptr++;          ptr++;
3599          while (*ptr != ')') ptr++;          while (*ptr != 0 && *ptr != ')') ptr++;
3600            if (*ptr == 0)
3601              {
3602              *errorcodeptr = ERR18;
3603              goto FAILED;
3604              }
3605          continue;          continue;
3606    
3607          case ':':                 /* Non-extracting bracket */  
3608            /* ------------------------------------------------------------ */
3609            case ':':                 /* Non-capturing bracket */
3610          bravalue = OP_BRA;          bravalue = OP_BRA;
3611          ptr++;          ptr++;
3612          break;          break;
3613    
3614    
3615            /* ------------------------------------------------------------ */
3616          case '(':          case '(':
3617          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
3618    
3619          /* Condition to test for recursion */          /* A condition can be an assertion, a number (referring to a numbered
3620            group), a name (referring to a named group), or 'R', referring to
3621            recursion. R<digits> and R&name are also permitted for recursion tests.
3622    
3623            There are several syntaxes for testing a named group: (?(name)) is used
3624            by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3625    
3626            There are two unfortunate ambiguities, caused by history. (a) 'R' can
3627            be the recursive thing or the name 'R' (and similarly for 'R' followed
3628            by digits), and (b) a number could be a name that consists of digits.
3629            In both cases, we look for a name first; if not found, we try the other
3630            cases. */
3631    
3632            /* For conditions that are assertions, check the syntax, and then exit
3633            the switch. This will take control down to where bracketed groups,
3634            including assertions, are processed. */
3635    
3636            if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3637              break;
3638    
3639            /* Most other conditions use OP_CREF (a couple change to OP_RREF
3640            below), and all need to skip 3 bytes at the start of the group. */
3641    
3642            code[1+LINK_SIZE] = OP_CREF;
3643            skipbytes = 3;
3644            refsign = -1;
3645    
3646            /* Check for a test for recursion in a named group. */
3647    
3648            if (ptr[1] == 'R' && ptr[2] == '&')
3649              {
3650              terminator = -1;
3651              ptr += 2;
3652              code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
3653              }
3654    
3655            /* Check for a test for a named group's having been set, using the Perl
3656            syntax (?(<name>) or (?('name') */
3657    
3658            else if (ptr[1] == '<')
3659              {
3660              terminator = '>';
3661              ptr++;
3662              }
3663            else if (ptr[1] == '\'')
3664              {
3665              terminator = '\'';
3666              ptr++;
3667              }
3668            else
3669              {
3670              terminator = 0;
3671              if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3672              }
3673    
3674            /* We now expect to read a name; anything else is an error */
3675    
3676            if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3677              {
3678              ptr += 1;  /* To get the right offset */
3679              *errorcodeptr = ERR28;
3680              goto FAILED;
3681              }
3682    
3683            /* Read the name, but also get it as a number if it's all digits */
3684    
3685            recno = 0;
3686            name = ++ptr;
3687            while ((cd->ctypes[*ptr] & ctype_word) != 0)
3688              {
3689              if (recno >= 0)
3690                recno = ((digitab[*ptr] & ctype_digit) != 0)?
3691                  recno * 10 + *ptr - '0' : -1;
3692              ptr++;
3693              }
3694            namelen = ptr - name;
3695    
3696          if (ptr[1] == 'R')          if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3697            {            {
3698            code[1+LINK_SIZE] = OP_CREF;            ptr--;      /* Error offset */
3699            PUT2(code, 2+LINK_SIZE, CREF_RECURSE);            *errorcodeptr = ERR26;
3700            skipbytes = 3;            goto FAILED;
           ptr += 3;  
3701            }            }
3702    
3703          /* Condition to test for a numbered subpattern match. We know that          /* Do no further checking in the pre-compile phase. */
3704          if a digit follows ( then there will just be digits until ) because  
3705          the syntax was checked in the first pass. */          if (lengthptr != NULL) break;
3706    
3707          else if ((digitab[ptr[1]] && ctype_digit) != 0)          /* In the real compile we do the work of looking for the actual
3708            reference. If the string started with "+" or "-" we require the rest to
3709            be digits, in which case recno will be set. */
3710    
3711            if (refsign > 0)
3712            {            {
3713            int condref;                 /* Don't amalgamate; some compilers */            if (recno <= 0)
           condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */  
           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';  
           if (condref == 0)  
3714              {              {
3715              *errorcodeptr = ERR35;              *errorcodeptr = ERR58;
3716              goto FAILED;              goto FAILED;
3717              }              }
3718            ptr++;            if (refsign == '-')
3719            code[1+LINK_SIZE] = OP_CREF;              {
3720            PUT2(code, 2+LINK_SIZE, condref);              recno = cd->bracount - recno + 1;
3721            skipbytes = 3;              if (recno <= 0)
3722                  {
3723                  *errorcodeptr = ERR15;
3724                  goto FAILED;
3725                  }
3726                }
3727              else recno += cd->bracount;
3728              PUT2(code, 2+LINK_SIZE, recno);
3729              break;
3730              }
3731    
3732            /* Otherwise (did not start with "+" or "-"), start by looking for the
3733            name. */
3734    
3735            slot = cd->name_table;
3736            for (i = 0; i < cd->names_found; i++)
3737              {
3738              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3739              slot += cd->name_entry_size;
3740              }
3741    
3742            /* Found a previous named subpattern */
3743    
3744            if (i < cd->names_found)
3745              {
3746              recno = GET2(slot, 0);
3747              PUT2(code, 2+LINK_SIZE, recno);
3748              }
3749    
3750            /* Search the pattern for a forward reference */
3751    
3752            else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3753                            (options & PCRE_EXTENDED) != 0)) > 0)
3754              {
3755              PUT2(code, 2+LINK_SIZE, i);
3756              }
3757    
3758            /* If terminator == 0 it means that the name followed directly after
3759            the opening parenthesis [e.g. (?(abc)...] and in this case there are
3760            some further alternatives to try. For the cases where terminator != 0
3761            [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
3762            now checked all the possibilities, so give an error. */
3763    
3764            else if (terminator != 0)
3765              {
3766              *errorcodeptr = ERR15;
3767              goto FAILED;
3768              }
3769    
3770            /* Check for (?(R) for recursion. Allow digits after R to specify a
3771            specific group number. */
3772    
3773            else if (*name == 'R')
3774              {
3775              recno = 0;
3776              for (i = 1; i < namelen; i++)
3777                {
3778                if ((digitab[name[i]] & ctype_digit) == 0)
3779                  {
3780                  *errorcodeptr = ERR15;
3781                  goto FAILED;
3782                  }
3783                recno = recno * 10 + name[i] - '0';
3784                }
3785              if (recno == 0) recno = RREF_ANY;
3786              code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
3787              PUT2(code, 2+LINK_SIZE, recno);
3788              }
3789    
3790            /* Similarly, check for the (?(DEFINE) "condition", which is always
3791            false. */
3792    
3793            else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
3794              {
3795              code[1+LINK_SIZE] = OP_DEF;
3796              skipbytes = 1;
3797              }
3798    
3799            /* Check for the "name" actually being a subpattern number. */
3800    
3801            else if (recno > 0)
3802              {
3803              PUT2(code, 2+LINK_SIZE, recno);
3804              }
3805    
3806            /* Either an unidentified subpattern, or a reference to (?(0) */
3807    
3808            else
3809              {
3810              *errorcodeptr = (recno == 0)? ERR35: ERR15;
3811              goto FAILED;
3812            }            }
         /* For conditions that are assertions, we just fall through, having  
         set bravalue above. */  
3813          break;          break;
3814    
3815    
3816            /* ------------------------------------------------------------ */
3817          case '=':                 /* Positive lookahead */          case '=':                 /* Positive lookahead */
3818          bravalue = OP_ASSERT;          bravalue = OP_ASSERT;
3819          ptr++;          ptr++;
3820          break;          break;
3821    
3822    
3823            /* ------------------------------------------------------------ */
3824          case '!':                 /* Negative lookahead */          case '!':                 /* Negative lookahead */
3825          bravalue = OP_ASSERT_NOT;          bravalue = OP_ASSERT_NOT;
3826          ptr++;          ptr++;
3827          break;          break;
3828    
3829          case '<':                 /* Lookbehinds */  
3830          switch (*(++ptr))          /* ------------------------------------------------------------ */
3831            case '<':                 /* Lookbehind or named define */
3832            switch (ptr[1])
3833            {            {
3834            case '=':               /* Positive lookbehind */            case '=':               /* Positive lookbehind */
3835            bravalue = OP_ASSERTBACK;            bravalue = OP_ASSERTBACK;
3836            ptr++;            ptr += 2;
3837            break;            break;
3838    
3839            case '!':               /* Negative lookbehind */            case '!':               /* Negative lookbehind */
3840            bravalue = OP_ASSERTBACK_NOT;            bravalue = OP_ASSERTBACK_NOT;
3841            ptr++;            ptr += 2;
3842            break;            break;
3843            }  
3844              default:                /* Could be name define, else bad */
3845              if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
3846              ptr++;                  /* Correct offset for error */
3847              *errorcodeptr = ERR24;
3848              goto FAILED;
3849              }
3850          break;          break;
3851    
3852    
3853            /* ------------------------------------------------------------ */
3854          case '>':                 /* One-time brackets */          case '>':                 /* One-time brackets */
3855          bravalue = OP_ONCE;          bravalue = OP_ONCE;
3856          ptr++;          ptr++;
3857          break;          break;
3858    
3859    
3860            /* ------------------------------------------------------------ */
3861          case 'C':                 /* Callout - may be followed by digits; */          case 'C':                 /* Callout - may be followed by digits; */
3862          previous_callout = code;  /* Save for later completion */          previous_callout = code;  /* Save for later completion */
3863          after_manual_callout = 1; /* Skip one item before completing */          after_manual_callout = 1; /* Skip one item before completing */
3864          *code++ = OP_CALLOUT;     /* Already checked that the terminating */          *code++ = OP_CALLOUT;
3865            {                       /* closing parenthesis is present. */            {
3866            int n = 0;            int n = 0;
3867            while ((digitab[*(++ptr)] & ctype_digit) != 0)            while ((digitab[*(++ptr)] & ctype_digit) != 0)
3868              n = n * 10 + *ptr - '0';              n = n * 10 + *ptr - '0';
3869              if (*ptr != ')')
3870                {
3871                *errorcodeptr = ERR39;
3872                goto FAILED;
3873                }
3874            if (n > 255)            if (n > 255)
3875              {              {
3876              *errorcodeptr = ERR38;              *errorcodeptr = ERR38;
# Line 2935  for (;; ptr++) Line 3884  for (;; ptr++)
3884          previous = NULL;          previous = NULL;
3885          continue;          continue;
3886    
3887          case 'P':                 /* Named subpattern handling */  
3888          if (*(++ptr) == '<')      /* Definition */          /* ------------------------------------------------------------ */
3889            case 'P':                 /* Python-style named subpattern handling */
3890            if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
3891              {
3892              is_recurse = *ptr == '>';
3893              terminator = ')';
3894              goto NAMED_REF_OR_RECURSE;
3895              }
3896            else if (*ptr != '<')    /* Test for Python-style definition */
3897            {            {
3898            int i, namelen;            *errorcodeptr = ERR41;
3899            uschar *slot = cd->name_table;            goto FAILED;
3900            const uschar *name;     /* Don't amalgamate; some compilers */            }
3901            name = ++ptr;           /* grumble at autoincrement in declaration */          /* Fall through to handle (?P< as (?< is handled */
3902    
           while (*ptr++ != '>');  
           namelen = ptr - name - 1;  
3903    
3904            for (i = 0; i < cd->names_found; i++)          /* ------------------------------------------------------------ */
3905            DEFINE_NAME:    /* Come here from (?< handling */
3906            case '\'':
3907              {
3908              terminator = (*ptr == '<')? '>' : '\'';
3909              name = ++ptr;
3910    
3911              while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3912              namelen = ptr - name;
3913    
3914              /* In the pre-compile phase, just do a syntax check. */
3915    
3916              if (lengthptr != NULL)
3917              {              {
3918              int crc = memcmp(name, slot+2, namelen);              if (*ptr != terminator)
3919              if (crc == 0)                {
3920                  *errorcodeptr = ERR42;
3921                  goto FAILED;
3922                  }
3923                if (cd->names_found >= MAX_NAME_COUNT)
3924                {                {
3925                if (slot[2+namelen] == 0)                *errorcodeptr = ERR49;
3926                  goto FAILED;
3927                  }
3928                if (namelen + 3 > cd->name_entry_size)
3929                  {
3930                  cd->name_entry_size = namelen + 3;
3931                  if (namelen > MAX_NAME_SIZE)
3932                  {                  {
3933                  *errorcodeptr = ERR43;                  *errorcodeptr = ERR48;
3934                  goto FAILED;                  goto FAILED;
3935                  }                  }
               crc = -1;             /* Current name is substring */  
3936                }                }
3937              if (crc < 0)              }
3938    
3939              /* In the real compile, create the entry in the table */
3940    
3941              else
3942                {
3943                slot = cd->name_table;
3944                for (i = 0; i < cd->names_found; i++)
3945                {                {
3946                memmove(slot + cd->name_entry_size, slot,                int crc = memcmp(name, slot+2, namelen);
3947                  (cd->names_found - i) * cd->name_entry_size);                if (crc == 0)
3948                break;                  {
3949                    if (slot[2+namelen] == 0)
3950                      {
3951                      if ((options & PCRE_DUPNAMES) == 0)
3952                        {
3953                        *errorcodeptr = ERR43;
3954                        goto FAILED;
3955                        }
3956                      }
3957                    else crc = -1;      /* Current name is substring */
3958                    }
3959                  if (crc < 0)
3960                    {
3961                    memmove(slot + cd->name_entry_size, slot,
3962                      (cd->names_found - i) * cd->name_entry_size);
3963                    break;
3964                    }
3965                  slot += cd->name_entry_size;
3966                }                }
             slot += cd->name_entry_size;  
             }  
3967    
3968            PUT2(slot, 0, *brackets + 1);              PUT2(slot, 0, cd->bracount + 1);
3969            memcpy(slot + 2, name, namelen);              memcpy(slot + 2, name, namelen);
3970            slot[2+namelen] = 0;              slot[2+namelen] = 0;
3971            cd->names_found++;              }
           goto NUMBERED_GROUP;  
3972            }            }
3973    
3974          if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */          /* In both cases, count the number of names we've encountered. */
3975    
3976            ptr++;                    /* Move past > or ' */
3977            cd->names_found++;
3978            goto NUMBERED_GROUP;
3979    
3980    
3981            /* ------------------------------------------------------------ */
3982            case '&':                 /* Perl recursion/subroutine syntax */
3983            terminator = ')';
3984            is_recurse = TRUE;
3985            /* Fall through */
3986    
3987            /* We come here from the Python syntax above that handles both
3988            references (?P=name) and recursion (?P>name), as well as falling
3989            through from the Perl recursion syntax (?&name). */
3990    
3991            NAMED_REF_OR_RECURSE:
3992            name = ++ptr;
3993            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3994            namelen = ptr - name;
3995    
3996            /* In the pre-compile phase, do a syntax check and set a dummy
3997            reference number. */
3998    
3999            if (lengthptr != NULL)
4000            {            {
4001            int i, namelen;            if (*ptr != terminator)
4002            int type = *ptr++;              {
4003            const uschar *name = ptr;              *errorcodeptr = ERR42;
4004            uschar *slot = cd->name_table;              goto FAILED;
4005                }
4006              if (namelen > MAX_NAME_SIZE)
4007                {
4008                *errorcodeptr = ERR48;
4009                goto FAILED;
4010                }
4011              recno = 0;
4012              }
4013    
4014            while (*ptr != ')') ptr++;          /* In the real compile, seek the name in the table */
           namelen = ptr - name;  
4015    
4016            else
4017              {
4018              slot = cd->name_table;
4019            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
4020              {              {
4021              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4022              slot += cd->name_entry_size;              slot += cd->name_entry_size;
4023              }              }
4024            if (i >= cd->names_found)  
4025              if (i < cd->names_found)         /* Back reference */
4026                {
4027                recno = GET2(slot, 0);
4028                }
4029              else if ((recno =                /* Forward back reference */
4030                        find_parens(ptr, cd->bracount, name, namelen,
4031                          (options & PCRE_EXTENDED) != 0)) <= 0)
4032              {              {
4033              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
4034              goto FAILED;              goto FAILED;
4035              }              }
4036              }
4037    
4038            recno = GET2(slot, 0);          /* In both phases, we can now go to the code that handles numerical
4039            recursion or backreferences. */
           if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */  
   
           /* Back reference */  
4040    
4041            previous = code;          if (is_recurse) goto HANDLE_RECURSION;
4042            *code++ = OP_REF;            else goto HANDLE_REFERENCE;
           PUT2INC(code, 0, recno);  
           cd->backref_map |= (recno < 32)? (1 << recno) : 1;  
           if (recno > cd->top_backref) cd->top_backref = recno;  
           continue;  
           }  
4043    
         /* Should never happen */  
         break;  
4044    
4045          case 'R':                 /* Pattern recursion */          /* ------------------------------------------------------------ */
4046            case 'R':                 /* Recursion */
4047          ptr++;                    /* Same as (?0)      */          ptr++;                    /* Same as (?0)      */
4048          /* Fall through */          /* Fall through */
4049    
         /* Recursion or "subroutine" call */  
4050    
4051          case '0': case '1': case '2': case '3': case '4':          /* ------------------------------------------------------------ */
4052          case '5': case '6': case '7': case '8': case '9':          case '-': case '+':
4053            case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4054            case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4055            {            {
4056            const uschar *called;            const uschar *called;
4057    
4058              if ((refsign = *ptr) == '+') ptr++;
4059              else if (refsign == '-')
4060                {
4061                if ((digitab[ptr[1]] & ctype_digit) == 0)
4062                  goto OTHER_CHAR_AFTER_QUERY;
4063                ptr++;
4064                }
4065    
4066            recno = 0;            recno = 0;
4067            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4068              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4069    
4070              if (*ptr != ')')
4071                {
4072                *errorcodeptr = ERR29;
4073                goto FAILED;
4074                }
4075    
4076              if (refsign == '-')
4077                {
4078                if (recno == 0)
4079                  {
4080                  *errorcodeptr = ERR58;
4081                  goto FAILED;
4082                  }
4083                recno = cd->bracount - recno + 1;
4084                if (recno <= 0)
4085                  {
4086                  *errorcodeptr = ERR15;
4087                  goto FAILED;
4088                  }
4089                }
4090              else if (refsign == '+')
4091                {
4092                if (recno == 0)
4093                  {
4094                  *errorcodeptr = ERR58;
4095                  goto FAILED;
4096                  }
4097                recno += cd->bracount;
4098                }
4099    
4100            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
4101    
4102            HANDLE_RECURSION:            HANDLE_RECURSION:
4103    
4104            previous = code;            previous = code;
4105              called = cd->start_code;
4106    
4107            /* Find the bracket that is being referenced. Temporarily end the            /* When we are actually compiling, find the bracket that is being
4108            regex in case it doesn't exist. */            referenced. Temporarily end the regex in case it doesn't exist before
4109              this point. If we end up with a forward reference, first check that
4110            *code = OP_END;            the bracket does occur later so we can give the error (and position)
4111            called = (recno == 0)?            now. Then remember this forward reference in the workspace so it can
4112              cd->start_code : find_bracket(cd->start_code, utf8, recno);            be filled in at the end. */
4113    
4114            if (called == NULL)            if (lengthptr == NULL)
4115              {              {
4116              *errorcodeptr = ERR15;              *code = OP_END;
4117              goto FAILED;              if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
             }  
4118    
4119            /* If the subpattern is still open, this is a recursive call. We              /* Forward reference */
           check to see if this is a left recursion that could loop for ever,  
           and diagnose that case. */  
4120    
4121            if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))              if (called == NULL)
4122              {                {
4123              *errorcodeptr = ERR40;                if (find_parens(ptr, cd->bracount, NULL, recno,
4124              goto FAILED;                     (options & PCRE_EXTENDED) != 0) < 0)
4125                    {
4126                    *errorcodeptr = ERR15;
4127                    goto FAILED;
4128                    }
4129                  called = cd->start_code + recno;
4130                  PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4131                  }
4132    
4133                /* If not a forward reference, and the subpattern is still open,
4134                this is a recursive call. We check to see if this is a left
4135                recursion that could loop for ever, and diagnose that case. */
4136    
4137                else if (GET(called, 1) == 0 &&
4138                         could_be_empty(called, code, bcptr, utf8))
4139                  {
4140                  *errorcodeptr = ERR40;
4141                  goto FAILED;
4142                  }
4143              }              }
4144    
4145            /* Insert the recursion/subroutine item, automatically wrapped inside            /* Insert the recursion/subroutine item, automatically wrapped inside
4146            "once" brackets. */            "once" brackets. Set up a "previous group" length so that a
4147              subsequent quantifier will work. */
4148    
4149            *code = OP_ONCE;            *code = OP_ONCE;
4150            PUT(code, 1, 2 + 2*LINK_SIZE);            PUT(code, 1, 2 + 2*LINK_SIZE);
# Line 3069  for (;; ptr++) Line 4157  for (;; ptr++)
4157            *code = OP_KET;            *code = OP_KET;
4158            PUT(code, 1, 2 + 2*LINK_SIZE);            PUT(code, 1, 2 + 2*LINK_SIZE);
4159            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
4160    
4161              length_prevgroup = 3 + 3*LINK_SIZE;
4162            }            }
4163    
4164            /* Can't determine a first byte now */
4165    
4166            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4167          continue;          continue;
4168    
         /* Character after (? not specially recognized */  
4169    
4170          default:                  /* Option setting */          /* ------------------------------------------------------------ */
4171            default:              /* Other characters: check option setting */
4172            OTHER_CHAR_AFTER_QUERY:
4173          set = unset = 0;          set = unset = 0;
4174          optset = &set;          optset = &set;
4175    
# Line 3084  for (;; ptr++) Line 4179  for (;; ptr++)
4179              {              {
4180              case '-': optset = &unset; break;              case '-': optset = &unset; break;
4181    
4182                case 'J':    /* Record that it changed in the external options */
4183                *optset |= PCRE_DUPNAMES;
4184                cd->external_options |= PCRE_JCHANGED;
4185                break;
4186    
4187              case 'i': *optset |= PCRE_CASELESS; break;              case 'i': *optset |= PCRE_CASELESS; break;
4188              case 'm': *optset |= PCRE_MULTILINE; break;              case 'm': *optset |= PCRE_MULTILINE; break;
4189              case 's': *optset |= PCRE_DOTALL; break;              case 's': *optset |= PCRE_DOTALL; break;
4190              case 'x': *optset |= PCRE_EXTENDED; break;              case 'x': *optset |= PCRE_EXTENDED; break;
4191              case 'U': *optset |= PCRE_UNGREEDY; break;              case 'U': *optset |= PCRE_UNGREEDY; break;
4192              case 'X': *optset |= PCRE_EXTRA; break;              case 'X': *optset |= PCRE_EXTRA; break;
4193    
4194                default:  *errorcodeptr = ERR12;
4195                          ptr--;    /* Correct the offset */
4196                          goto FAILED;
4197              }              }
4198            }            }
4199    
# Line 3098  for (;; ptr++) Line 4202  for (;; ptr++)
4202          newoptions = (options | set) & (~unset);          newoptions = (options | set) & (~unset);
4203    
4204          /* If the options ended with ')' this is not the start of a nested          /* If the options ended with ')' this is not the start of a nested
4205          group with option changes, so the options change at this level. Compile          group with option changes, so the options change at this level. If this
4206          code to change the ims options if this setting actually changes any of          item is right at the start of the pattern, the options can be
4207          them. We also pass the new setting back so that it can be put at the          abstracted and made external in the pre-compile phase, and ignored in
4208          start of any following branches, and when this group ends (if we are in          the compile phase. This can be helpful when matching -- for instance in
4209          a group), a resetting item can be compiled.          caseless checking of required bytes.
4210    
4211          Note that if this item is right at the start of the pattern, the          If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4212          options will have been abstracted and made global, so there will be no          definitely *not* at the start of the pattern because something has been
4213          change to compile. */          compiled. In the pre-compile phase, however, the code pointer can have
4214            that value after the start, because it gets reset as code is discarded
4215            during the pre-compile. However, this can happen only at top level - if
4216            we are within parentheses, the starting BRA will still be present. At
4217            any parenthesis level, the length value can be used to test if anything
4218            has been compiled at that level. Thus, a test for both these conditions
4219            is necessary to ensure we correctly detect the start of the pattern in
4220            both phases.
4221    
4222            If we are not at the pattern start, compile code to change the ims
4223            options if this setting actually changes any of them. We also pass the
4224            new setting back so that it can be put at the start of any following
4225            branches, and when this group ends (if we are in a group), a resetting
4226            item can be compiled. */
4227    
4228          if (*ptr == ')')          if (*ptr == ')')
4229            {            {
4230            if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))            if (code == cd->start_code + 1 + LINK_SIZE &&
4231                   (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4232              {              {
4233              *code++ = OP_OPT;              cd->external_options = newoptions;
4234              *code++ = newoptions & PCRE_IMS;              options = newoptions;
4235              }              }
4236             else
4237                {
4238                if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4239                  {
4240                  *code++ = OP_OPT;
4241                  *code++ = newoptions & PCRE_IMS;
4242                  }
4243    
4244            /* Change options at this level, and pass them back for use              /* Change options at this level, and pass them back for use
4245            in subsequent branches. Reset the greedy defaults and the case              in subsequent branches. Reset the greedy defaults and the case
4246            value for firstbyte and reqbyte. */              value for firstbyte and reqbyte. */
4247    
4248            *optionsptr = options = newoptions;              *optionsptr = options = newoptions;
4249            greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4250            greedy_non_default = greedy_default ^ 1;              greedy_non_default = greedy_default ^ 1;
4251            req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;              req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4252                }
4253    
4254            previous = NULL;       /* This item can't be repeated */            previous = NULL;       /* This item can't be repeated */
4255            continue;              /* It is complete */            continue;              /* It is complete */
# Line 3136  for (;; ptr++) Line 4262  for (;; ptr++)
4262    
4263          bravalue = OP_BRA;          bravalue = OP_BRA;
4264          ptr++;          ptr++;
4265          }          }     /* End of switch for character following (? */
4266        }        }       /* End of (? handling */
4267    
4268      /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become      /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4269      non-capturing and behave like (?:...) brackets */      all unadorned brackets become non-capturing and behave like (?:...)
4270        brackets. */
4271    
4272      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4273        {        {
4274        bravalue = OP_BRA;        bravalue = OP_BRA;
4275        }        }
4276    
4277      /* Else we have a referencing group; adjust the opcode. If the bracket      /* Else we have a capturing group. */
     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and  
     arrange for the true number to follow later, in an OP_BRANUMBER item. */  
4278    
4279      else      else
4280        {        {
4281        NUMBERED_GROUP:        NUMBERED_GROUP:
4282        if (++(*brackets) > EXTRACT_BASIC_MAX)        cd->bracount += 1;
4283          {        PUT2(code, 1+LINK_SIZE, cd->bracount);
4284          bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;        skipbytes = 2;
         code[1+LINK_SIZE] = OP_BRANUMBER;  
         PUT2(code, 2+LINK_SIZE, *brackets);  
         skipbytes = 3;  
         }  
       else bravalue = OP_BRA + *brackets;  
4285        }        }
4286    
4287      /* Process nested bracketed re. Assertions may not be repeated, but other      /* Process nested bracketed regex. Assertions may not be repeated, but
4288      kinds can be. We copy code into a non-register variable in order to be able      other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4289      to pass its address because some compilers complain otherwise. Pass in a      non-register variable in order to be able to pass its address because some
4290      new setting for the ims options if they have changed. */      compilers complain otherwise. Pass in a new setting for the ims options if
4291        they have changed. */
4292    
4293      previous = (bravalue >= OP_ONCE)? code : NULL;      previous = (bravalue >= OP_ONCE)? code : NULL;
4294      *code = bravalue;      *code = bravalue;
4295      tempcode = code;      tempcode = code;
4296      tempreqvary = cd->req_varyopt;     /* Save value before bracket */      tempreqvary = cd->req_varyopt;     /* Save value before bracket */
4297        length_prevgroup = 0;              /* Initialize for pre-compile phase */
4298    
4299      if (!compile_regex(      if (!compile_regex(
4300           newoptions,                   /* The complete new option state */           newoptions,                   /* The complete new option state */
4301           options & PCRE_IMS,           /* The previous ims option state */           options & PCRE_IMS,           /* The previous ims option state */
          brackets,                     /* Extracting bracket count */  
4302           &tempcode,                    /* Where to put code (updated) */           &tempcode,                    /* Where to put code (updated) */
4303           &ptr,                         /* Input pointer (updated) */           &ptr,                         /* Input pointer (updated) */
4304           errorcodeptr,                 /* Where to put an error message */           errorcodeptr,                 /* Where to put an error message */
4305           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
4306            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4307           skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */           skipbytes,                    /* Skip over bracket number */
4308           &subfirstbyte,                /* For possible first char */           &subfirstbyte,                /* For possible first char */
4309           &subreqbyte,                  /* For possible last char */           &subreqbyte,                  /* For possible last char */
4310           bcptr,                        /* Current branch chain */           bcptr,                        /* Current branch chain */
4311           cd))                          /* Tables block */           cd,                           /* Tables block */
4312             (lengthptr == NULL)? NULL :   /* Actual compile phase */
4313               &length_prevgroup           /* Pre-compile phase */
4314             ))
4315        goto FAILED;        goto FAILED;
4316    
4317      /* At the end of compiling, code is still pointing to the start of the      /* At the end of compiling, code is still pointing to the start of the
# Line 3196  for (;; ptr++) Line 4320  for (;; ptr++)
4320      is on the bracket. */      is on the bracket. */
4321    
4322      /* If this is a conditional bracket, check that there are no more than      /* If this is a conditional bracket, check that there are no more than
4323      two branches in the group. */      two branches in the group, or just one if it's a DEFINE group. We do this
4324        in the real compile phase, not in the pre-pass, where the whole group may
4325        not be available. */
4326    
4327      else if (bravalue == OP_COND)      if (bravalue == OP_COND && lengthptr == NULL)
4328        {        {
4329        uschar *tc = code;        uschar *tc = code;
4330        condcount = 0;        int condcount = 0;
4331    
4332        do {        do {
4333           condcount++;           condcount++;
# Line 3209  for (;; ptr++) Line 4335  for (;; ptr++)
4335           }           }
4336        while (*tc != OP_KET);        while (*tc != OP_KET);
4337    
4338        if (condcount > 2)        /* A DEFINE group is never obeyed inline (the "condition" is always
4339          false). It must have only one branch. */
4340    
4341          if (code[LINK_SIZE+1] == OP_DEF)
4342          {          {
4343          *errorcodeptr = ERR27;          if (condcount > 1)
4344          goto FAILED;            {
4345              *errorcodeptr = ERR54;
4346              goto FAILED;
4347              }
4348            bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
4349            }
4350    
4351          /* A "normal" conditional group. If there is just one branch, we must not
4352          make use of its firstbyte or reqbyte, because this is equivalent to an
4353          empty second branch. */
4354    
4355          else
4356            {
4357            if (condcount > 2)
4358              {
4359              *errorcodeptr = ERR27;
4360              goto FAILED;
4361              }
4362            if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4363          }          }
4364          }
4365    
4366        /* Error if hit end of pattern */
4367    
4368        /* If there is just one branch, we must not make use of its firstbyte or      if (*ptr != ')')
4369        reqbyte, because this is equivalent to an empty second branch. */        {
4370          *errorcodeptr = ERR14;
4371          goto FAILED;
4372          }
4373    
4374        if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;      /* In the pre-compile phase, update the length by the length of the nested
4375        group, less the brackets at either end. Then reduce the compiled code to
4376        just the brackets so that it doesn't use much memory if it is duplicated by
4377        a quantifier. */
4378    
4379        if (lengthptr != NULL)
4380          {
4381          *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4382          code++;
4383          PUTINC(code, 0, 1 + LINK_SIZE);
4384          *code++ = OP_KET;
4385          PUTINC(code, 0, 1 + LINK_SIZE);
4386        }        }
4387    
4388      /* Handle updating of the required and first characters. Update for normal      /* Otherwise update the main code pointer to the end of the group. */
4389      brackets of all kinds, and conditions with two branches (see code above).  
4390      If the bracket is followed by a quantifier with zero repeat, we have to      else code = tempcode;
4391      back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the  
4392      main loop so that they can be accessed for the back off. */      /* For a DEFINE group, required and first character settings are not
4393        relevant. */
4394    
4395        if (bravalue == OP_DEF) break;
4396    
4397        /* Handle updating of the required and first characters for other types of
4398        group. Update for normal brackets of all kinds, and conditions with two
4399        branches (see code above). If the bracket is followed by a quantifier with
4400        zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4401        zerofirstbyte outside the main loop so that they can be accessed for the
4402        back off. */
4403    
4404      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
4405      zerofirstbyte = firstbyte;      zerofirstbyte = firstbyte;
4406      groupsetfirstbyte = FALSE;      groupsetfirstbyte = FALSE;
4407    
4408      if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)      if (bravalue >= OP_ONCE)
4409        {        {
4410        /* If we have not yet set a firstbyte in this branch, take it from the        /* If we have not yet set a firstbyte in this branch, take it from the
4411        subpattern, remembering that it was set here so that a repeat of more        subpattern, remembering that it was set here so that a repeat of more
# Line 3272  for (;; ptr++) Line 4446  for (;; ptr++)
4446      firstbyte, looking for an asserted first char. */      firstbyte, looking for an asserted first char. */
4447    
4448      else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;      else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4449        break;     /* End of processing '(' */
4450    
     /* Now update the main code pointer to the end of the group. */  
   
     code = tempcode;  
   
     /* Error if hit end of pattern */  
   
     if (*ptr != ')')  
       {  
       *errorcodeptr = ERR14;  
       goto FAILED;  
       }  
     break;  
   
     /* Check \ for being a real metacharacter; if not, fall through and handle  
     it as a data character at the start of a string. Escape items are checked  
     for validity in the pre-compiling pass. */  
4451    
4452      case '\\':      /* ===================================================================*/
4453      tempptr = ptr;      /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
     c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);  
   
     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values  
4454      are arranged to be the negation of the corresponding OP_values. For the      are arranged to be the negation of the corresponding OP_values. For the
4455      back references, the values are ESC_REF plus the reference number. Only      back references, the values are ESC_REF plus the reference number. Only
4456      back references and those types that consume a character may be repeated.      back references and those types that consume a character may be repeated.
4457      We can test for values between ESC_b and ESC_Z for the latter; this may      We can test for values between ESC_b and ESC_Z for the latter; this may
4458      have to change if any new ones are ever created. */      have to change if any new ones are ever created. */
4459    
4460        case '\\':
4461        tempptr = ptr;
4462        c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4463        if (*errorcodeptr != 0) goto FAILED;
4464    
4465      if (c < 0)      if (c < 0)
4466        {        {
4467        if (-c == ESC_Q)            /* Handle start of quoted string */        if (-c == ESC_Q)            /* Handle start of quoted string */
# Line 3310  for (;; ptr++) Line 4471  for (;; ptr++)
4471          continue;          continue;
4472          }          }
4473    
4474          if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
4475    
4476        /* For metasequences that actually match a character, we disable the        /* For metasequences that actually match a character, we disable the
4477        setting of a first character if it hasn't already been set. */        setting of a first character if it hasn't already been set. */
4478    
# Line 3321  for (;; ptr++) Line 4484  for (;; ptr++)
4484        zerofirstbyte = firstbyte;        zerofirstbyte = firstbyte;
4485        zeroreqbyte = reqbyte;        zeroreqbyte = reqbyte;
4486    
4487        /* Back references are handled specially */        /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4488          We also support \k{name} (.NET syntax) */
4489    
4490          if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
4491            {
4492            is_recurse = FALSE;
4493            terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
4494            goto NAMED_REF_OR_RECURSE;
4495            }
4496    
4497          /* Back references are handled specially; must disable firstbyte if
4498          not set to cope with cases like (?=(\w+))\1: which would otherwise set
4499          ':' later. */
4500    
4501        if (-c >= ESC_REF)        if (-c >= ESC_REF)
4502          {          {
4503          int number = -c - ESC_REF;          recno = -c - ESC_REF;
4504    
4505            HANDLE_REFERENCE:    /* Come here from named backref handling */
4506            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4507          previous = code;          previous = code;
4508          *code++ = OP_REF;          *code++ = OP_REF;
4509          PUT2INC(code, 0, number);          PUT2INC(code, 0, recno);
4510            cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4511            if (recno > cd->top_backref) cd->top_backref = recno;
4512          }          }
4513    
4514        /* So are Unicode property matches, if supported. We know that get_ucp        /* So are Unicode property matches, if supported. */
       won't fail because it was tested in the pre-pass. */  
4515    
4516  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4517        else if (-c == ESC_P || -c == ESC_p)        else if (-c == ESC_P || -c == ESC_p)
# Line 3340  for (;; ptr++) Line 4519  for (;; ptr++)
4519          BOOL negated;          BOOL negated;
4520          int pdata;          int pdata;
4521          int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);          int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4522            if (ptype < 0) goto FAILED;
4523          previous = code;          previous = code;
4524          *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;          *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4525          *code++ = ptype;          *code++ = ptype;
4526          *code++ = pdata;          *code++ = pdata;
4527          }          }
4528    #else
4529    
4530          /* If Unicode properties are not supported, \X, \P, and \p are not
4531          allowed. */
4532    
4533          else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4534            {
4535            *errorcodeptr = ERR45;
4536            goto FAILED;
4537            }
4538  #endif  #endif
4539    
4540        /* For the rest, we can obtain the OP value by negating the escape        /* For the rest (including \X when Unicode properties are supported), we
4541        value */        can obtain the OP value by negating the escape value. */
4542    
4543        else        else
4544          {          {
# Line 3372  for (;; ptr++) Line 4562  for (;; ptr++)
4562       mcbuffer[0] = c;       mcbuffer[0] = c;
4563       mclength = 1;       mclength = 1;
4564       }       }
   
4565      goto ONE_CHAR;      goto ONE_CHAR;
4566    
4567    
4568        /* ===================================================================*/
4569      /* Handle a literal character. It is guaranteed not to be whitespace or #      /* Handle a literal character. It is guaranteed not to be whitespace or #
4570      when the extended flag is set. If we are in UTF-8 mode, it may be a      when the extended flag is set. If we are in UTF-8 mode, it may be a
4571      multi-byte literal character. */      multi-byte literal character. */
# Line 3385  for (;; ptr++) Line 4576  for (;; ptr++)
4576      mcbuffer[0] = c;      mcbuffer[0] = c;
4577    
4578  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
4579      if (utf8 && (c & 0xc0) == 0xc0)      if (utf8 && c >= 0xc0)
4580        {        {
4581        while ((ptr[1] & 0xc0) == 0x80)        while ((ptr[1] & 0xc0) == 0x80)
4582          mcbuffer[mclength++] = *(++ptr);          mcbuffer[mclength++] = *(++ptr);
# Line 3436  for (;; ptr++) Line 4627  for (;; ptr++)
4627      }      }
4628    }                   /* end of big loop */    }                   /* end of big loop */
4629    
4630    
4631  /* Control never reaches here by falling through, only by a goto for all the  /* Control never reaches here by falling through, only by a goto for all the
4632  error states. Pass back the position in the pattern so that it can be displayed  error states. Pass back the position in the pattern so that it can be displayed
4633  to the user for diagnosing the error. */  to the user for diagnosing the error. */
# Line 3452  return FALSE; Line 4644  return FALSE;
4644  *     Compile sequence of alternatives           *  *     Compile sequence of alternatives           *
4645  *************************************************/  *************************************************/
4646    
4647  /* On entry, ptr is pointing past the bracket character, but on return  /* On entry, ptr is pointing past the bracket character, but on return it
4648  it points to the closing bracket, or vertical bar, or end of string.  points to the closing bracket, or vertical bar, or end of string. The code
4649  The code variable is pointing at the byte into which the BRA operator has been  variable is pointing at the byte into which the BRA operator has been stored.
4650  stored. If the ims options are changed at the start (for a (?ims: group) or  If the ims options are changed at the start (for a (?ims: group) or during any
4651  during any branch, we need to insert an OP_OPT item at the start of every  branch, we need to insert an OP_OPT item at the start of every following branch
4652  following branch to ensure they get set correctly at run time, and also pass  to ensure they get set correctly at run time, and also pass the new options
4653  the new options into every subsequent branch compile.  into every subsequent branch compile.
4654    
4655    This function is used during the pre-compile phase when we are trying to find
4656    out the amount of memory needed, as well as during the real compile phase. The
4657    value of lengthptr distinguishes the two phases.
4658    
4659  Argument:  Arguments:
4660    options        option bits, including any changes for this subpattern    options        option bits, including any changes for this subpattern
4661    oldims         previous settings of ims option bits    oldims         previous settings of ims option bits
   brackets       -> int containing the number of extracting brackets used  
4662    codeptr        -> the address of the current code pointer    codeptr        -> the address of the current code pointer
4663    ptrptr         -> the address of the current pattern pointer    ptrptr         -> the address of the current pattern pointer
4664    errorcodeptr   -> pointer to error code variable    errorcodeptr   -> pointer to error code variable
4665    lookbehind     TRUE if this is a lookbehind assertion    lookbehind     TRUE if this is a lookbehind assertion
4666    skipbytes      skip this many bytes at start (for OP_COND, OP_BRANUMBER)    skipbytes      skip this many bytes at start (for brackets and OP_COND)
4667    firstbyteptr   place to put the first required character, or a negative number    firstbyteptr   place to put the first required character, or a negative number
4668    reqbyteptr     place to put the last required character, or a negative number    reqbyteptr     place to put the last required character, or a negative number
4669    bcptr          pointer to the chain of currently open branches    bcptr          pointer to the chain of currently open branches
4670    cd             points to the data block with tables pointers etc.    cd             points to the data block with tables pointers etc.
4671      lengthptr      NULL during the real compile phase
4672                     points to length accumulator during pre-compile phase
4673    
4674  Returns:      TRUE on success  Returns:         TRUE on success
4675  */  */
4676    
4677  static BOOL  static BOOL
4678  compile_regex(int options, int oldims, int *brackets, uschar **codeptr,  compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
4679    const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,    int *errorcodeptr, BOOL lookbehind, int skipbytes, int *firstbyteptr,
4680    int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)    int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr)
4681  {  {
4682  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
4683  uschar *code = *codeptr;  uschar *code = *codeptr;
# Line 3489  uschar *start_bracket = code; Line 4686  uschar *start_bracket = code;
4686  uschar *reverse_count = NULL;  uschar *reverse_count = NULL;
4687  int firstbyte, reqbyte;  int firstbyte, reqbyte;
4688  int branchfirstbyte, branchreqbyte;  int branchfirstbyte, branchreqbyte;
4689    int length;
4690  branch_chain bc;  branch_chain bc;
4691    
4692  bc.outer = bcptr;  bc.outer = bcptr;
# Line 3496  bc.current = code; Line 4694  bc.current = code;
4694    
4695  firstbyte = reqbyte = REQ_UNSET;  firstbyte = reqbyte = REQ_UNSET;
4696    
4697    /* Accumulate the length for use in the pre-compile phase. Start with the
4698    length of the BRA and KET and any extra bytes that are required at the
4699    beginning. We accumulate in a local variable to save frequent testing of
4700    lengthptr for NULL. We cannot do this by looking at the value of code at the
4701    start and end of each alternative, because compiled items are discarded during
4702    the pre-compile phase so that the work space is not exceeded. */
4703    
4704    length = 2 + 2*LINK_SIZE + skipbytes;
4705    
4706    /* WARNING: If the above line is changed for any reason, you must also change
4707    the code that abstracts option settings at the start of the pattern and makes
4708    them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
4709    pre-compile phase to find out whether anything has yet been compiled or not. */
4710    
4711  /* Offset is set zero to mark that this bracket is still open */  /* Offset is set zero to mark that this bracket is still open */
4712    
4713  PUT(code, 1, 0);  PUT(code, 1, 0);
# Line 3511  for (;;) Line 4723  for (;;)
4723      {      {
4724      *code++ = OP_OPT;      *code++ = OP_OPT;
4725      *code++ = options & PCRE_IMS;      *code++ = options & PCRE_IMS;
4726        length += 2;
4727      }      }
4728    
4729    /* Set up dummy OP_REVERSE if lookbehind assertion */    /* Set up dummy OP_REVERSE if lookbehind assertion */
# Line 3520  for (;;) Line 4733  for (;;)
4733      *code++ = OP_REVERSE;      *code++ = OP_REVERSE;
4734      reverse_count = code;      reverse_count = code;
4735      PUTINC(code, 0, 0);      PUTINC(code, 0, 0);
4736        length += 1 + LINK_SIZE;
4737      }      }
4738    
4739    /* Now compile the branch */    /* Now compile the branch; in the pre-compile phase its length gets added
4740      into the length. */
4741    
4742    if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr,    if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
4743          &branchfirstbyte, &branchreqbyte, &bc, cd))          &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
4744      {      {
4745      *ptrptr = ptr;      *ptrptr = ptr;
4746      return FALSE;      return FALSE;
4747      }      }
4748    
4749    /* If this is the first branch, the firstbyte and reqbyte values for the    /* In the real compile phase, there is some post-processing to be done. */
   branch become the values for the regex. */  
4750    
4751    if (*last_branch != OP_ALT)    if (lengthptr == NULL)
4752      {      {
4753      firstbyte = branchfirstbyte;      /* If this is the first branch, the firstbyte and reqbyte values for the
4754      reqbyte = branchreqbyte;      branch become the values for the regex. */
     }  
4755    
4756    /* If this is not the first branch, the first char and reqbyte have to      if (*last_branch != OP_ALT)
4757    match the values from all the previous branches, except that if the previous        {
4758    value for reqbyte didn't have REQ_VARY set, it can still match, and we set        firstbyte = branchfirstbyte;
4759    REQ_VARY for the regex. */        reqbyte = branchreqbyte;
4760          }
4761    
4762    else      /* If this is not the first branch, the first char and reqbyte have to
4763      {      match the values from all the previous branches, except that if the
4764      /* If we previously had a firstbyte, but it doesn't match the new branch,      previous value for reqbyte didn't have REQ_VARY set, it can still match,
4765      we have to abandon the firstbyte for the regex, but if there was previously      and we set REQ_VARY for the regex. */
     no reqbyte, it takes on the value of the old firstbyte. */  
4766    
4767      if (firstbyte >= 0 && firstbyte != branchfirstbyte)      else
4768        {        {
4769        if (reqbyte < 0) reqbyte = firstbyte;        /* If we previously had a firstbyte, but it doesn't match the new branch,
4770        firstbyte = REQ_NONE;        we have to abandon the firstbyte for the regex, but if there was
4771        }        previously no reqbyte, it takes on the value of the old firstbyte. */
4772    
4773          if (firstbyte >= 0 && firstbyte != branchfirstbyte)
4774            {
4775            if (reqbyte < 0) reqbyte = firstbyte;
4776            firstbyte = REQ_NONE;
4777            }
4778    
4779      /* If we (now or from before) have no firstbyte, a firstbyte from the        /* If we (now or from before) have no firstbyte, a firstbyte from the
4780      branch becomes a reqbyte if there isn't a branch reqbyte. */        branch becomes a reqbyte if there isn't a branch reqbyte. */
4781    
4782      if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)        if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
4783          branchreqbyte = branchfirstbyte;            branchreqbyte = branchfirstbyte;
4784    
4785      /* Now ensure that the reqbytes match */        /* Now ensure that the reqbytes match */
4786    
4787      if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))        if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
4788        reqbyte = REQ_NONE;          reqbyte = REQ_NONE;
4789      else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */        else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
4790      }        }
4791    
4792    /* If lookbehind, check that this branch matches a fixed-length string,      /* If lookbehind, check that this branch matches a fixed-length string, and
4793    and put the length into the OP_REVERSE item. Temporarily mark the end of      put the length into the OP_REVERSE item. Temporarily mark the end of the
4794    the branch with OP_END. */      branch with OP_END. */
4795    
4796    if (lookbehind)      if (lookbehind)
     {  
     int length;  
     *code = OP_END;  
     length = find_fixedlength(last_branch, options);  
     DPRINTF(("fixed length = %d\n", length));  
     if (length < 0)  
4797        {        {
4798        *errorcodeptr = (length == -2)? ERR36 : ERR25;        int fixed_length;
4799        *ptrptr = ptr;        *code = OP_END;
4800        return FALSE;        fixed_length = find_fixedlength(last_branch, options);
4801          DPRINTF(("fixed length = %d\n", fixed_length));
4802          if (fixed_length < 0)
4803            {
4804            *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
4805            *ptrptr = ptr;
4806            return FALSE;
4807            }
4808          PUT(reverse_count, 0, fixed_length);
4809        }        }
     PUT(reverse_count, 0, length);  
4810      }      }
4811    
4812    /* Reached end of expression, either ')' or end of pattern. Go back through    /* Reached end of expression, either ')' or end of pattern. In the real
4813    the alternative branches and reverse the chain of offsets, with the field in    compile phase, go back through the alternative branches and reverse the chain
4814    the BRA item now becoming an offset to the first alternative. If there are    of offsets, with the field in the BRA item now becoming an offset to the
4815    no alternatives, it points to the end of the group. The length in the    first alternative. If there are no alternatives, it points to the end of the
4816    terminating ket is always the length of the whole bracketed item. If any of    group. The length in the terminating ket is always the length of the whole
4817    the ims options were changed inside the group, compile a resetting op-code    bracketed item. If any of the ims options were changed inside the group,
4818    following, except at the very end of the pattern. Return leaving the pointer    compile a resetting op-code following, except at the very end of the pattern.
4819    at the terminating char. */    Return leaving the pointer at the terminating char. */
4820    
4821    if (*ptr != '|')    if (*ptr != '|')
4822      {      {
4823      int length = code - last_branch;      if (lengthptr == NULL)
     do  
4824        {        {
4825        int prev_length = GET(last_branch, 1);        int branch_length = code - last_branch;
4826        PUT(last_branch, 1, length);        do
4827        length = prev_length;          {
4828        last_branch -= length;          int prev_length = GET(last_branch, 1);
4829            PUT(last_branch, 1, branch_length);
4830            branch_length = prev_length;
4831            last_branch -= branch_length;
4832            }
4833          while (branch_length > 0);
4834        }        }
     while (length > 0);  
4835    
4836      /* Fill in the ket */      /* Fill in the ket */
4837    
# Line 3622  for (;;) Line 4845  for (;;)
4845        {        {
4846        *code++ = OP_OPT;        *code++ = OP_OPT;
4847        *code++ = oldims;        *code++ = oldims;
4848          length += 2;
4849        }        }
4850    
4851      /* Set values to pass back */      /* Set values to pass back */
# Line 3630  for (;;) Line 4854  for (;;)
4854      *ptrptr = ptr;      *ptrptr = ptr;
4855      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
4856      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
4857        if (lengthptr != NULL) *lengthptr += length;
4858      return TRUE;      return TRUE;
4859      }      }
4860    
4861    /* Another branch follows; insert an "or" node. Its length field points back    /* Another branch follows. In the pre-compile phase, we can move the code
4862      pointer back to where it was for the start of the first branch. (That is,
4863      pretend that each branch is the only one.)
4864    
4865      In the real compile phase, insert an ALT node. Its length field points back
4866    to the previous branch while the bracket remains open. At the end the chain    to the previous branch while the bracket remains open. At the end the chain
4867    is reversed. It's done like this so that the start of the bracket has a    is reversed. It's done like this so that the start of the bracket has a
4868    zero offset until it is closed, making it possible to detect recursion. */    zero offset until it is closed, making it possible to detect recursion. */
4869    
4870    *code = OP_ALT;    if (lengthptr != NULL)
4871    PUT(code, 1, code - last_branch);      {
4872    bc.current = last_branch = code;      code = *codeptr + 1 + LINK_SIZE + skipbytes;
4873    code += 1 + LINK_SIZE;      length += 1 + LINK_SIZE;
4874        }
4875      else
4876        {
4877        *code = OP_ALT;
4878        PUT(code, 1, code - last_branch);
4879        bc.current = last_branch = code;
4880        code += 1 + LINK_SIZE;
4881        }
4882    
4883    ptr++;    ptr++;
4884    }    }
4885  /* Control never reaches here */  /* Control never reaches here */
# Line 3693  is_anchored(register const uschar *code, Line 4931  is_anchored(register const uschar *code,
4931    unsigned int backref_map)    unsigned int backref_map)
4932  {  {
4933  do {  do {
4934     const uschar *scode =     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
4935       first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);       options, PCRE_MULTILINE, FALSE);
4936     register int op = *scode;     register int op = *scode;
4937    
4938       /* Non-capturing brackets */
4939    
4940       if (op == OP_BRA)
4941         {
4942         if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4943         }
4944    
4945     /* Capturing brackets */     /* Capturing brackets */
4946    
4947     if (op > OP_BRA)     else if (op == OP_CBRA)
4948       {       {
4949       int new_map;       int n = GET2(scode, 1+LINK_SIZE);
4950       op -= OP_BRA;       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
      if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);  
      new_map = bracket_map | ((op < 32)? (1 << op) : 1);  
4951       if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;       if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
4952       }       }
4953    
4954     /* Other brackets */     /* Other brackets */
4955    
4956     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4957       {       {
4958       if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;       if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4959       }       }
# Line 3718  do { Line 4961  do {
4961     /* .* is not anchored unless DOTALL is set and it isn't in brackets that     /* .* is not anchored unless DOTALL is set and it isn't in brackets that
4962     are or may be referenced. */     are or may be referenced. */
4963    
4964     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
4965                 op == OP_TYPEPOSSTAR) &&
4966              (*options & PCRE_DOTALL) != 0)              (*options & PCRE_DOTALL) != 0)
4967       {       {
4968       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
# Line 3763  is_startline(const uschar *code, unsigne Line 5007  is_startline(const uschar *code, unsigne
5007    unsigned int backref_map)    unsigned int backref_map)
5008  {  {
5009  do {  do {
5010     const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5011       FALSE);       NULL, 0, FALSE);
5012     register int op = *scode;     register int op = *scode;
5013    
5014       /* Non-capturing brackets */
5015    
5016       if (op == OP_BRA)
5017         {
5018         if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5019         }
5020    
5021     /* Capturing brackets */     /* Capturing brackets */
5022    
5023     if (op > OP_BRA)     else if (op == OP_CBRA)
5024       {       {
5025       int new_map;       int n = GET2(scode, 1+LINK_SIZE);
5026       op -= OP_BRA;       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
      if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);  
      new_map = bracket_map | ((op < 32)? (1 << op) : 1);  
5027       if (!is_startline(scode, new_map, backref_map)) return FALSE;       if (!is_startline(scode, new_map, backref_map)) return FALSE;
5028       }       }
5029    
5030     /* Other brackets */     /* Other brackets */
5031    
5032     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5033       { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }       { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5034    
5035     /* .* means "start at start or after \n" if it isn't in brackets that     /* .* means "start at start or after \n" if it isn't in brackets that
5036     may be referenced. */     may be referenced. */
5037    
5038     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5039       {       {
5040       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5041       }       }
# Line 3835  do { Line 5084  do {
5084       first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);       first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5085     register int op = *scode;     register int op = *scode;
5086    
    if (op >= OP_BRA) op = OP_BRA;  
   
5087     switch(op)     switch(op)
5088       {       {
5089       default:       default:
5090       return -1;       return -1;
5091    
5092       case OP_BRA:       case OP_BRA:
5093         case OP_CBRA:
5094       case OP_ASSERT:       case OP_ASSERT:
5095       case OP_ONCE:       case OP_ONCE:
5096       case OP_COND:       case OP_COND:
# Line 3858  do { Line 5106  do {
5106       case OP_CHARNC:       case OP_CHARNC:
5107       case OP_PLUS:       case OP_PLUS:
5108       case OP_MINPLUS:       case OP_MINPLUS:
5109         case OP_POSPLUS:
5110       if (!inassert) return -1;       if (!inassert) return -1;
5111       if (c < 0)       if (c < 0)
5112         {         {
# Line 3898  Returns:        pointer to compiled data Line 5147  Returns:        pointer to compiled data
5147                  with errorptr and erroroffset set                  with errorptr and erroroffset set
5148  */  */
5149    
5150  PCRE_DATA_SCOPE pcre *  PCRE_EXP_DEFN pcre *
5151  pcre_compile(const char *pattern, int options, const char **errorptr,  pcre_compile(const char *pattern, int options, const char **errorptr,
5152    int *erroroffset, const unsigned char *tables)    int *erroroffset, const unsigned char *tables)
5153  {  {
# Line 3906  return pcre_compile2(pattern, options, N Line 5155  return pcre_compile2(pattern, options, N
5155  }  }
5156    
5157    
5158  PCRE_DATA_SCOPE pcre *  PCRE_EXP_DEFN pcre *
5159  pcre_compile2(const char *pattern, int options, int *errorcodeptr,  pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5160    const char **errorptr, int *erroroffset, const unsigned char *tables)    const char **errorptr, int *erroroffset, const unsigned char *tables)
5161  {  {
5162  real_pcre *re;  real_pcre *re;
5163  int length = 1 + LINK_SIZE;      /* For initial BRA plus length */  int length = 1;  /* For final END opcode */
5164  int c, firstbyte, reqbyte;  int firstbyte, reqbyte, newline;
 int bracount = 0;  
 int branch_extra = 0;  
 int branch_newextra;  
 int item_count = -1;  
 int name_count = 0;  
 int max_name_size = 0;  
 int lastitemlength = 0;  
5165  int errorcode = 0;  int errorcode = 0;
5166  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
5167  BOOL utf8;  BOOL utf8;
 BOOL class_utf8;  
5168  #endif  #endif
 BOOL inescq = FALSE;  
 BOOL capturing;  
 unsigned int brastackptr = 0;  
5169  size_t size;  size_t size;
5170  uschar *code;  uschar *code;
5171  const uschar *codestart;  const uschar *codestart;
5172  const uschar *ptr;  const uschar *ptr;
5173  compile_data compile_block;  compile_data compile_block;
5174  int brastack[BRASTACK_SIZE];  compile_data *cd = &compile_block;
5175  uschar bralenstack[BRASTACK_SIZE];  
5176    /* This space is used for "compiling" into during the first phase, when we are
5177    computing the amount of memory that is needed. Compiled items are thrown away
5178    as soon as possible, so that a fairly large buffer should be sufficient for
5179    this purpose. The same space is used in the second phase for remembering where
5180    to fill in forward references to subpatterns. */
5181    
5182    uschar cworkspace[COMPILE_WORK_SIZE];
5183    
5184    
5185    /* Set this early so that early errors get offset 0. */
5186    
5187    ptr = (const uschar *)pattern;
5188    
5189  /* We can't pass back an error message if errorptr is NULL; I guess the best we  /* We can't pass back an error message if errorptr is NULL; I guess the best we
5190  can do is just return NULL, but we can set a code value if there is a code  can do is just return NULL, but we can set a code value if there is a code
# Line 3954  if (errorcodeptr != NULL) *errorcodeptr Line 5204  if (errorcodeptr != NULL) *errorcodeptr
5204  if (erroroffset == NULL)  if (erroroffset == NULL)
5205    {    {
5206    errorcode = ERR16;    errorcode = ERR16;
5207    goto PCRE_EARLY_ERROR_RETURN;    goto PCRE_EARLY_ERROR_RETURN2;
5208    }    }
5209    
5210  *erroroffset = 0;  *erroroffset = 0;
# Line 3967  if (utf8 && (options & PCRE_NO_UTF8_CHEC Line 5217  if (utf8 && (options & PCRE_NO_UTF8_CHEC
5217       (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)       (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5218    {    {
5219    errorcode = ERR44;    errorcode = ERR44;
5220    goto PCRE_EARLY_ERROR_RETURN;    goto PCRE_EARLY_ERROR_RETURN2;
5221    }    }
5222  #else  #else
5223  if ((options & PCRE_UTF8) != 0)  if ((options & PCRE_UTF8) != 0)
# Line 3986  if ((options & ~PUBLIC_OPTIONS) != 0) Line 5236  if ((options & ~PUBLIC_OPTIONS) != 0)
5236  /* Set up pointers to the individual character tables */  /* Set up pointers to the individual character tables */
5237    
5238  if (tables == NULL) tables = _pcre_default_tables;  if (tables == NULL) tables = _pcre_default_tables;
5239  compile_block.lcc = tables + lcc_offset;  cd->lcc = tables + lcc_offset;
5240  compile_block.fcc = tables + fcc_offset;  cd->fcc = tables + fcc_offset;
5241  compile_block.cbits = tables + cbits_offset;  cd->cbits = tables + cbits_offset;
5242  compile_block.ctypes = tables + ctypes_offset;  cd->ctypes = tables + ctypes_offset;
5243    
5244  /* Maximum back reference and backref bitmap. This is updated for numeric  /* Handle different types of newline. The three bits give seven cases. The
5245  references during the first pass, but for named references during the actual  current code allows for fixed one- or two-byte sequences, plus "any" and
5246  compile pass. The bitmap records up to 31 back references to help in deciding  "anycrlf". */
 whether (.*) can be treated as anchored or not. */  
   
 compile_block.top_backref = 0;  
 compile_block.backref_map = 0;  
   
 /* Reflect pattern for debugging output */  
   
 DPRINTF(("------------------------------------------------------------------\n"));  
 DPRINTF(("%s\n", pattern));  
   
 /* The first thing to do is to make a pass over the pattern to compute the  
 amount of store required to hold the compiled code. This does not have to be  
 perfect as long as errors are overestimates. At the same time we can detect any  
 flag settings right at the start, and extract them. Make an attempt to correct  
 for any counted white space if an "extended" flag setting appears late in the  
 pattern. We can't be so clever for #-comments. */  
5247    
5248  ptr = (const uschar *)(pattern - 1);  switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
 while ((c = *(++ptr)) != 0)  
5249    {    {
5250    int min, max;    case 0: newline = NEWLINE; break;   /* Compile-time default */
5251    int class_optcount;    case PCRE_NEWLINE_CR: newline = '\r'; break;
5252    int bracket_length;    case PCRE_NEWLINE_LF: newline = '\n'; break;
5253    int duplength;    case PCRE_NEWLINE_CR+
5254           PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5255    /* If we are inside a \Q...\E sequence, all chars are literal */    case PCRE_NEWLINE_ANY: newline = -1; break;
5256      case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5257    if (inescq)    default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5258      {    }
     if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;  
     goto NORMAL_CHAR;  
     }  
   
   /* Otherwise, first check for ignored whitespace and comments */  
5259    
5260    if ((options & PCRE_EXTENDED) != 0)  if (newline == -2)
5261      {
5262      cd->nltype = NLTYPE_ANYCRLF;
5263      }
5264    else if (newline < 0)
5265      {
5266      cd->nltype = NLTYPE_ANY;
5267      }
5268    else
5269      {
5270      cd->nltype = NLTYPE_FIXED;
5271      if (newline > 255)
5272      {      {
5273      if ((compile_block.ctypes[c] & ctype_space) != 0) continue;      cd->nllen = 2;
5274      if (c == '#')      cd->nl[0] = (newline >> 8) & 255;
5275        {      cd->nl[1] = newline & 255;
       /* The space before the ; is to avoid a warning on a silly compiler  
       on the Macintosh. */  
       while ((c = *(++ptr)) != 0 && c != NEWLINE) ;  
       if (c == 0) break;  
       continue;  
       }  
5276      }      }
5277      else
   item_count++;    /* Is zero for the first non-comment item */  
   
   /* Allow space for auto callout before every item except quantifiers. */  
   
   if ((options & PCRE_AUTO_CALLOUT) != 0 &&  
        c != '*' && c != '+' && c != '?' &&  
        (c != '{' || !is_counted_repeat(ptr + 1)))  
     length += 2 + 2*LINK_SIZE;  
   
   switch(c)  
5278      {      {
5279      /* A backslashed item may be an escaped data character or it may be a      cd->nllen = 1;
5280      character type. */      cd->nl[0] = newline;
5281        }
5282      case '\\':    }
     c = check_escape(&ptr, &errorcode, bracount, options, FALSE);  
     if (errorcode != 0) goto PCRE_ERROR_RETURN;  
   
     lastitemlength = 1;     /* Default length of last item for repeats */  
   
     if (c >= 0)             /* Data character */  
       {  
       length += 2;          /* For a one-byte character */  
   
 #ifdef SUPPORT_UTF8  
       if (utf8 && c > 127)  
         {  
         int i;  
         for (i = 0; i < _pcre_utf8_table1_size; i++)  
           if (c <= _pcre_utf8_table1[i]) break;  
         length += i;  
         lastitemlength += i;  
         }  
 #endif  
   
       continue;  
       }  
   
     /* If \Q, enter "literal" mode */  
   
     if (-c == ESC_Q)  
       {  
       inescq = TRUE;  
       continue;  
       }  
   
     /* \X is supported only if Unicode property support is compiled */  
   
 #ifndef SUPPORT_UCP  
     if (-c == ESC_X)  
       {  
       errorcode = ERR45;  
       goto PCRE_ERROR_RETURN;  
       }  
 #endif  
   
     /* \P and \p are for Unicode properties, but only when the support has  
     been compiled. Each item needs 3 bytes. */  
   
     else if (-c == ESC_P || -c == ESC_p)  
       {  
 #ifdef SUPPORT_UCP  
       BOOL negated;  
       BOOL pdata;  
       length += 3;  
       lastitemlength = 3;  
       if (get_ucp(&ptr, &negated, &pdata, &errorcode) < 0)  
         goto PCRE_ERROR_RETURN;  
       continue;  
 #else  
       errorcode = ERR45;  
       goto PCRE_ERROR_RETURN;  
 #endif  
       }  
   
     /* Other escapes need one byte */  
   
     length++;  
   
     /* A back reference needs an additional 2 bytes, plus either one or 5  
     bytes for a repeat. We also need to keep the value of the highest  
     back reference. */  
   
     if (c <= -ESC_REF)  
       {  
       int refnum = -c - ESC_REF;  
       compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;  
       if (refnum > compile_block.top_backref)  
         compile_block.top_backref = refnum;  
       length += 2;   /* For single back reference */  
       if (ptr[1] == '{' && is_counted_repeat(ptr+2))  
         {  
         ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);  
         if (errorcode != 0) goto PCRE_ERROR_RETURN;  
         if ((min == 0 && (max == 1 || max == -1)) ||  
           (min == 1 && max == -1))  
             length++;  
         else length += 5;  
         if (ptr[1] == '?') ptr++;  
         }  
       }  
     continue;  
   
     case '^':     /* Single-byte metacharacters */  
     case '.':  
     case '$':  
     length++;  
     lastitemlength = 1;  
     continue;  
   
     case '*':            /* These repeats won't be after brackets; */  
     case '+':            /* those are handled separately */  
     case '?':  
     length++;  
     goto POSESSIVE;      /* A few lines below */  <