/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 91 by nigel, Sat Feb 24 21:41:34 2007 UTC | revision 178 by ph10, Wed Jun 13 08:44:34 2007 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2006 University of Cambridge             Copyright (c) 1997-2007 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45  #define NLBLOCK cd            /* The block containing newline information */  #define NLBLOCK cd             /* Block containing newline information */
46    #define PSSTART start_pattern  /* Field containing processed string start */
47    #define PSEND   end_pattern    /* Field containing processed string end */
48    
49    
50  #include "pcre_internal.h"  #include "pcre_internal.h"
51    
52    
# Line 54  used by pcretest. DEBUG is not defined w Line 58  used by pcretest. DEBUG is not defined w
58  #endif  #endif
59    
60    
61    /* Macro for setting individual bits in class bitmaps. */
62    
63    #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
64    
65    
66  /*************************************************  /*************************************************
67  *      Code parameters and static tables         *  *      Code parameters and static tables         *
68  *************************************************/  *************************************************/
69    
70  /* Maximum number of items on the nested bracket stacks at compile time. This  /* This value specifies the size of stack workspace that is used during the
71  applies to the nesting of all kinds of parentheses. It does not limit  first pre-compile phase that determines how much memory is required. The regex
72  un-nested, non-capturing parentheses. This number can be made bigger if  is partly compiled into this space, but the compiled parts are discarded as
73  necessary - it is used to dimension one int and one unsigned char vector at  soon as they can be, so that hopefully there will never be an overrun. The code
74  compile time. */  does, however, check for an overrun. The largest amount I've seen used is 218,
75    so this number is very generous.
76    
77    The same workspace is used during the second, actual compile phase for
78    remembering forward references to groups so that they can be filled in at the
79    end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
80    is 4 there is plenty of room. */
81    
82  #define BRASTACK_SIZE 200  #define COMPILE_WORK_SIZE (4096)
83    
84    
85  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 73  are simple data values; negative values Line 87  are simple data values; negative values
87  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
88  is invalid. */  is invalid. */
89    
90  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
91  static const short int escapes[] = {  static const short int escapes[] = {
92       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
93       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
94     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
95       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */  -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
96  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
97  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
98     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
99       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */  -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
100  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
101       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
102  };  };
103    
104  #else         /* This is the "abnormal" table for EBCDIC systems */  #else           /* This is the "abnormal" table for EBCDIC systems */
105  static const short int escapes[] = {  static const short int escapes[] = {
106  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
107  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 97  static const short int escapes[] = { Line 111  static const short int escapes[] = {
111  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
112  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
113  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
114  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
115  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
116  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
117  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
118  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
119  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
120  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
121  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
122  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
123  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,
124  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
125  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
126  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
127  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
128  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
# Line 156  static const int posix_class_maps[] = { Line 170  static const int posix_class_maps[] = {
170  };  };
171    
172    
173    #define STRING(a)  # a
174    #define XSTRING(s) STRING(s)
175    
176  /* The texts of compile-time error messages. These are "char *" because they  /* The texts of compile-time error messages. These are "char *" because they
177  are passed to the outside world. */  are passed to the outside world. Do not ever re-use any error number, because
178    they are documented. Always add a new error instead. Messages marked DEAD below
179    are no longer used. */
180    
181  static const char *error_texts[] = {  static const char *error_texts[] = {
182    "no error",    "no error",
# Line 172  static const char *error_texts[] = { Line 191  static const char *error_texts[] = {
191    "range out of order in character class",    "range out of order in character class",
192    "nothing to repeat",    "nothing to repeat",
193    /* 10 */    /* 10 */
194    "operand of unlimited repeat could match the empty string",    "operand of unlimited repeat could match the empty string",  /** DEAD **/
195    "internal error: unexpected repeat",    "internal error: unexpected repeat",
196    "unrecognized character after (?",    "unrecognized character after (?",
197    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class",
# Line 182  static const char *error_texts[] = { Line 201  static const char *error_texts[] = {
201    "erroffset passed as NULL",    "erroffset passed as NULL",
202    "unknown option bit(s) set",    "unknown option bit(s) set",
203    "missing ) after comment",    "missing ) after comment",
204    "parentheses nested too deeply",    "parentheses nested too deeply",  /** DEAD **/
205    /* 20 */    /* 20 */
206    "regular expression too large",    "regular expression too large",
207    "failed to get memory",    "failed to get memory",
# Line 194  static const char *error_texts[] = { Line 213  static const char *error_texts[] = {
213    "malformed number or name after (?(",    "malformed number or name after (?(",
214    "conditional group contains more than two branches",    "conditional group contains more than two branches",
215    "assertion expected after (?(",    "assertion expected after (?(",
216    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )",
217    /* 30 */    /* 30 */
218    "unknown POSIX class name",    "unknown POSIX class name",
219    "POSIX collating elements are not supported",    "POSIX collating elements are not supported",
220    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support",
221    "spare error",    "spare error",  /** DEAD **/
222    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large",
223    /* 35 */    /* 35 */
224    "invalid condition (?(0)",    "invalid condition (?(0)",
# Line 210  static const char *error_texts[] = { Line 229  static const char *error_texts[] = {
229    /* 40 */    /* 40 */
230    "recursive call could loop indefinitely",    "recursive call could loop indefinitely",
231    "unrecognized character after (?P",    "unrecognized character after (?P",
232    "syntax error after (?P",    "syntax error in subpattern name (missing terminator)",
233    "two named subpatterns have the same name",    "two named subpatterns have the same name",
234    "invalid UTF-8 string",    "invalid UTF-8 string",
235    /* 45 */    /* 45 */
236    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled",
237    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence",
238    "unknown property name after \\P or \\p",    "unknown property name after \\P or \\p",
239    "subpattern name is too long (maximum 32 characters)",    "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
240    "too many named subpatterns (maximum 10,000)",    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
241    /* 50 */    /* 50 */
242    "repeated subpattern is too long",    "repeated subpattern is too long",
243    "octal value is greater than \\377 (not in UTF-8 mode)"    "octal value is greater than \\377 (not in UTF-8 mode)",
244      "internal error: overran compiling workspace",
245      "internal error: previously-checked referenced subpattern not found",
246      "DEFINE group contains more than one branch",
247      /* 55 */
248      "repeating a DEFINE group is not allowed",
249      "inconsistent NEWLINE options",
250      "\\g is not followed by a braced name or an optionally braced non-zero number",
251      "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
252  };  };
253    
254    
# Line 241  For convenience, we use the same bit def Line 268  For convenience, we use the same bit def
268    
269  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
270    
271  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
272  static const unsigned char digitab[] =  static const unsigned char digitab[] =
273    {    {
274    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 277  static const unsigned char digitab[] = Line 304  static const unsigned char digitab[] =
304    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
305    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
306    
307  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else           /* This is the "abnormal" case, for EBCDIC systems */
308  static const unsigned char digitab[] =  static const unsigned char digitab[] =
309    {    {
310    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 291  static const unsigned char digitab[] = Line 318  static const unsigned char digitab[] =
318    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
319    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
320    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
321    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
322    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
323    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
324    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 325  static const unsigned char ebcdic_charta Line 352  static const unsigned char ebcdic_charta
352    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
353    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
354    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
355    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
356    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
357    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
358    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 352  static const unsigned char ebcdic_charta Line 379  static const unsigned char ebcdic_charta
379  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
380    
381  static BOOL  static BOOL
382    compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
383      int *, int *, branch_chain *, compile_data *);      int *, int *, branch_chain *, compile_data *, int *);
384    
385    
386    
# Line 363  static BOOL Line 390  static BOOL
390    
391  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
392  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
393  encodes one of the more complicated things such as \d. When UTF-8 is enabled,  encodes one of the more complicated things such as \d. A backreference to group
394  a positive value greater than 255 may be returned. On entry, ptr is pointing at  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
395  the \. On exit, it is on the final character of the escape sequence.  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
396    ptr is pointing at the \. On exit, it is on the final character of the escape
397    sequence.
398    
399  Arguments:  Arguments:
400    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
# Line 398  if (c == 0) *errorcodeptr = ERR1; Line 427  if (c == 0) *errorcodeptr = ERR1;
427  a table. A non-zero result is something that can be returned immediately.  a table. A non-zero result is something that can be returned immediately.
428  Otherwise further processing may be required. */  Otherwise further processing may be required. */
429    
430  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
431  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < '0' || c > 'z') {}                           /* Not alphameric */
432  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - '0']) != 0) c = i;
433    
434  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
435  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
436  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
437  #endif  #endif
# Line 412  else if ((i = escapes[c - 0x48]) != 0) Line 441  else if ((i = escapes[c - 0x48]) != 0)
441  else  else
442    {    {
443    const uschar *oldptr;    const uschar *oldptr;
444      BOOL braced, negated;
445    
446    switch (c)    switch (c)
447      {      {
448      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
# Line 425  else Line 456  else
456      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
457      break;      break;
458    
459        /* \g must be followed by a number, either plain or braced. If positive, it
460        is an absolute backreference. If negative, it is a relative backreference.
461        This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
462        reference to a named group. This is part of Perl's movement towards a
463        unified syntax for back references. As this is synonymous with \k{name}, we
464        fudge it up by pretending it really was \k. */
465    
466        case 'g':
467        if (ptr[1] == '{')
468          {
469          const uschar *p;
470          for (p = ptr+2; *p != 0 && *p != '}'; p++)
471            if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
472          if (*p != 0 && *p != '}')
473            {
474            c = -ESC_k;
475            break;
476            }
477          braced = TRUE;
478          ptr++;
479          }
480        else braced = FALSE;
481    
482        if (ptr[1] == '-')
483          {
484          negated = TRUE;
485          ptr++;
486          }
487        else negated = FALSE;
488    
489        c = 0;
490        while ((digitab[ptr[1]] & ctype_digit) != 0)
491          c = c * 10 + *(++ptr) - '0';
492    
493        if (c == 0 || (braced && *(++ptr) != '}'))
494          {
495          *errorcodeptr = ERR57;
496          return 0;
497          }
498    
499        if (negated)
500          {
501          if (c > bracount)
502            {
503            *errorcodeptr = ERR15;
504            return 0;
505            }
506          c = bracount - (c - 1);
507          }
508    
509        c = -(ESC_REF + c);
510        break;
511    
512      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
513      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
514      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 495  else Line 579  else
579          if (c == 0 && cc == '0') continue;     /* Leading zeroes */          if (c == 0 && cc == '0') continue;     /* Leading zeroes */
580          count++;          count++;
581    
582  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
583          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= 'a') cc -= 32;               /* Convert to upper case */
584          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
585  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
586          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
587          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
588  #endif  #endif
# Line 522  else Line 606  else
606        {        {
607        int cc;                               /* Some compilers don't like ++ */        int cc;                               /* Some compilers don't like ++ */
608        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                        /* in initializers */
609  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
610        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
611        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
612  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
613        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= 'z') cc += 64;              /* Convert to upper case */
614        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
615  #endif  #endif
616        }        }
617      break;      break;
618    
619      /* Other special escapes not starting with a digit are straightforward */      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
620        This coding is ASCII-specific, but then the whole concept of \cx is
621        ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
622    
623      case 'c':      case 'c':
624      c = *(++ptr);      c = *(++ptr);
# Line 542  else Line 628  else
628        return 0;        return 0;
629        }        }
630    
631      /* A letter is upper-cased; then the 0x40 bit is flipped. This coding  #ifndef EBCDIC  /* ASCII coding */
     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.  
     (However, an EBCDIC equivalent has now been added.) */  
   
 #if !EBCDIC    /* ASCII coding */  
632      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= 'a' && c <= 'z') c -= 32;
633      c ^= 0x40;      c ^= 0x40;
634  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
635      if (c >= 'a' && c <= 'z') c += 64;      if (c >= 'a' && c <= 'z') c += 64;
636      c ^= 0xC0;      c ^= 0xC0;
637  #endif  #endif
# Line 772  return p; Line 854  return p;
854    
855    
856  /*************************************************  /*************************************************
857  *     Find forward referenced named subpattern   *  *       Find forward referenced subpattern       *
858  *************************************************/  *************************************************/
859    
860  /* This function scans along a pattern looking for capturing subpatterns, and  /* This function scans along a pattern's text looking for capturing
861  counting them. If it finds a named pattern that matches the name it is given,  subpatterns, and counting them. If it finds a named pattern that matches the
862  it returns its number. This is used for forward references to named  name it is given, it returns its number. Alternatively, if the name is NULL, it
863  subpatterns. We know that if (?P< is encountered, the name will be terminated  returns when it reaches a given numbered subpattern. This is used for forward
864  by '>' because that is checked in the first pass.  references to subpatterns. We know that if (?P< is encountered, the name will
865    be terminated by '>' because that is checked in the first pass.
866    
867  Arguments:  Arguments:
868    pointer      current position in the pattern    ptr          current position in the pattern
869    count        current count of capturing parens    count        current count of capturing parens so far encountered
870    name         name to seek    name         name to seek, or NULL if seeking a numbered subpattern
871    namelen      name length    lorn         name length, or subpattern number if name is NULL
872      xmode        TRUE if we are in /x mode
873    
874  Returns:       the number of the named subpattern, or -1 if not found  Returns:       the number of the named subpattern, or -1 if not found
875  */  */
876    
877  static int  static int
878  find_named_parens(const uschar *ptr, int count, const uschar *name, int namelen)  find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
879      BOOL xmode)
880  {  {
881  const uschar *thisname;  const uschar *thisname;
882    
883  for (; *ptr != 0; ptr++)  for (; *ptr != 0; ptr++)
884    {    {
885    if (*ptr == '\\' && ptr[1] != 0) { ptr++; continue; }    int term;
886    
887      /* Skip over backslashed characters and also entire \Q...\E */
888    
889      if (*ptr == '\\')
890        {
891        if (*(++ptr) == 0) return -1;
892        if (*ptr == 'Q') for (;;)
893          {
894          while (*(++ptr) != 0 && *ptr != '\\');
895          if (*ptr == 0) return -1;
896          if (*(++ptr) == 'E') break;
897          }
898        continue;
899        }
900    
901      /* Skip over character classes */
902    
903      if (*ptr == '[')
904        {
905        while (*(++ptr) != ']')
906          {
907          if (*ptr == '\\')
908            {
909            if (*(++ptr) == 0) return -1;
910            if (*ptr == 'Q') for (;;)
911              {
912              while (*(++ptr) != 0 && *ptr != '\\');
913              if (*ptr == 0) return -1;
914              if (*(++ptr) == 'E') break;
915              }
916            continue;
917            }
918          }
919        continue;
920        }
921    
922      /* Skip comments in /x mode */
923    
924      if (xmode && *ptr == '#')
925        {
926        while (*(++ptr) != 0 && *ptr != '\n');
927        if (*ptr == 0) return -1;
928        continue;
929        }
930    
931      /* An opening parens must now be a real metacharacter */
932    
933    if (*ptr != '(') continue;    if (*ptr != '(') continue;
934    if (ptr[1] != '?') { count++; continue; }    if (ptr[1] != '?')
935    if (ptr[2] == '(') { ptr += 2; continue; }      {
936    if (ptr[2] != 'P' || ptr[3] != '<') continue;      count++;
937        if (name == NULL && count == lorn) return count;
938        continue;
939        }
940    
941      ptr += 2;
942      if (*ptr == 'P') ptr++;                      /* Allow optional P */
943    
944      /* We have to disambiguate (?<! and (?<= from (?<name> */
945    
946      if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
947           *ptr != '\'')
948        continue;
949    
950    count++;    count++;
951    ptr += 4;  
952      if (name == NULL && count == lorn) return count;
953      term = *ptr++;
954      if (term == '<') term = '>';
955    thisname = ptr;    thisname = ptr;
956    while (*ptr != '>') ptr++;    while (*ptr != term) ptr++;
957    if (namelen == ptr - thisname && strncmp(name, thisname, namelen) == 0)    if (name != NULL && lorn == ptr - thisname &&
958          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
959      return count;      return count;
960    }    }
961    
962  return -1;  return -1;
963  }  }
964    
# Line 862  for (;;) Line 1013  for (;;)
1013    
1014      case OP_CALLOUT:      case OP_CALLOUT:
1015      case OP_CREF:      case OP_CREF:
1016      case OP_BRANUMBER:      case OP_RREF:
1017        case OP_DEF:
1018      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1019      break;      break;
1020    
# Line 907  for (;;) Line 1059  for (;;)
1059    {    {
1060    int d;    int d;
1061    register int op = *cc;    register int op = *cc;
   if (op >= OP_BRA) op = OP_BRA;  
1062    
1063    switch (op)    switch (op)
1064      {      {
1065        case OP_CBRA:
1066      case OP_BRA:      case OP_BRA:
1067      case OP_ONCE:      case OP_ONCE:
1068      case OP_COND:      case OP_COND:
1069      d = find_fixedlength(cc, options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1070      if (d < 0) return d;      if (d < 0) return d;
1071      branchlength += d;      branchlength += d;
1072      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 949  for (;;) Line 1101  for (;;)
1101      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1102    
1103      case OP_REVERSE:      case OP_REVERSE:
     case OP_BRANUMBER:  
1104      case OP_CREF:      case OP_CREF:
1105        case OP_RREF:
1106        case OP_DEF:
1107      case OP_OPT:      case OP_OPT:
1108      case OP_CALLOUT:      case OP_CALLOUT:
1109      case OP_SOD:      case OP_SOD:
# Line 1094  for (;;) Line 1247  for (;;)
1247    
1248    if (c == OP_XCLASS) code += GET(code, 1);    if (c == OP_XCLASS) code += GET(code, 1);
1249    
1250    /* Handle bracketed group */    /* Handle capturing bracket */
1251    
1252    else if (c > OP_BRA)    else if (c == OP_CBRA)
1253      {      {
1254      int n = c - OP_BRA;      int n = GET2(code, 1+LINK_SIZE);
     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  
1255      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1256      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[c];
1257      }      }
1258    
1259    /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes    /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1260    that are followed by a character may be followed by a multi-byte character.    a multi-byte character. The length in the table is a minimum, so we have to
1261    The length in the table is a minimum, so we have to scan along to skip the    arrange to skip the extra bytes. */
   extra bytes. All opcodes are less than 128, so we can use relatively  
   efficient code. */  
1262    
1263    else    else
1264      {      {
1265      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
1266    #ifdef SUPPORT_UTF8
1267      if (utf8) switch(c)      if (utf8) switch(c)
1268        {        {
1269        case OP_CHAR:        case OP_CHAR:
# Line 1120  for (;;) Line 1271  for (;;)
1271        case OP_EXACT:        case OP_EXACT:
1272        case OP_UPTO:        case OP_UPTO:
1273        case OP_MINUPTO:        case OP_MINUPTO:
1274          case OP_POSUPTO:
1275        case OP_STAR:        case OP_STAR:
1276        case OP_MINSTAR:        case OP_MINSTAR:
1277          case OP_POSSTAR:
1278        case OP_PLUS:        case OP_PLUS:
1279        case OP_MINPLUS:        case OP_MINPLUS:
1280          case OP_POSPLUS:
1281        case OP_QUERY:        case OP_QUERY:
1282        case OP_MINQUERY:        case OP_MINQUERY:
1283        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1284          if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1285        break;        break;
1286        }        }
1287    #endif
1288      }      }
1289    }    }
1290  }  }
# Line 1164  for (;;) Line 1320  for (;;)
1320    
1321    if (c == OP_XCLASS) code += GET(code, 1);    if (c == OP_XCLASS) code += GET(code, 1);
1322    
   /* All bracketed groups have the same length. */  
   
   else if (c > OP_BRA)  
     {  
     code += _pcre_OP_lengths[OP_BRA];  
     }  
   
1323    /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes    /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1324    that are followed by a character may be followed by a multi-byte character.    that are followed by a character may be followed by a multi-byte character.
1325    The length in the table is a minimum, so we have to scan along to skip the    The length in the table is a minimum, so we have to arrange to skip the extra
1326    extra bytes. All opcodes are less than 128, so we can use relatively    bytes. */
   efficient code. */  
1327    
1328    else    else
1329      {      {
1330      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
1331    #ifdef SUPPORT_UTF8
1332      if (utf8) switch(c)      if (utf8) switch(c)
1333        {        {
1334        case OP_CHAR:        case OP_CHAR:
# Line 1187  for (;;) Line 1336  for (;;)
1336        case OP_EXACT:        case OP_EXACT:
1337        case OP_UPTO:        case OP_UPTO:
1338        case OP_MINUPTO:        case OP_MINUPTO:
1339          case OP_POSUPTO:
1340        case OP_STAR:        case OP_STAR:
1341        case OP_MINSTAR:        case OP_MINSTAR:
1342          case OP_POSSTAR:
1343        case OP_PLUS:        case OP_PLUS:
1344        case OP_MINPLUS:        case OP_MINPLUS:
1345          case OP_POSPLUS:
1346        case OP_QUERY:        case OP_QUERY:
1347        case OP_MINQUERY:        case OP_MINQUERY:
1348        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1349          if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1350        break;        break;
1351        }        }
1352    #endif
1353      }      }
1354    }    }
1355  }  }
# Line 1207  for (;;) Line 1361  for (;;)
1361  *************************************************/  *************************************************/
1362    
1363  /* This function scans through a branch of a compiled pattern to see whether it  /* This function scans through a branch of a compiled pattern to see whether it
1364  can match the empty string or not. It is called only from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1365  below. Note that first_significant_code() skips over assertions. If we hit an  below and from compile_branch() when checking for an unlimited repeat of a
1366  unclosed bracket, we return "empty" - this means we've struck an inner bracket  group that can match nothing. Note that first_significant_code() skips over
1367  whose current branch will already have been scanned.  assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1368    struck an inner bracket whose current branch will already have been scanned.
1369    
1370  Arguments:  Arguments:
1371    code        points to start of search    code        points to start of search
# Line 1224  static BOOL Line 1379  static BOOL
1379  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1380  {  {
1381  register int c;  register int c;
1382  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1383       code < endcode;       code < endcode;
1384       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1385    {    {
# Line 1232  for (code = first_significant_code(code Line 1387  for (code = first_significant_code(code
1387    
1388    c = *code;    c = *code;
1389    
1390    if (c >= OP_BRA)    /* Groups with zero repeats can of course be empty; skip them. */
1391    
1392      if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1393        {
1394        code += _pcre_OP_lengths[c];
1395        do code += GET(code, 1); while (*code == OP_ALT);
1396        c = *code;
1397        continue;
1398        }
1399    
1400      /* For other groups, scan the branches. */
1401    
1402      if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1403      {      {
1404      BOOL empty_branch;      BOOL empty_branch;
1405      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1248  for (code = first_significant_code(code Line 1415  for (code = first_significant_code(code
1415        }        }
1416      while (*code == OP_ALT);      while (*code == OP_ALT);
1417      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
     code += 1 + LINK_SIZE;  
1418      c = *code;      c = *code;
1419        continue;
1420      }      }
1421    
1422    else switch (c)    /* Handle the other opcodes */
1423    
1424      switch (c)
1425      {      {
1426      /* Check for quantifiers after a class */      /* Check for quantifiers after a class */
1427    
# Line 1308  for (code = first_significant_code(code Line 1477  for (code = first_significant_code(code
1477      case OP_NOT:      case OP_NOT:
1478      case OP_PLUS:      case OP_PLUS:
1479      case OP_MINPLUS:      case OP_MINPLUS:
1480        case OP_POSPLUS:
1481      case OP_EXACT:      case OP_EXACT:
1482      case OP_NOTPLUS:      case OP_NOTPLUS:
1483      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
1484        case OP_NOTPOSPLUS:
1485      case OP_NOTEXACT:      case OP_NOTEXACT:
1486      case OP_TYPEPLUS:      case OP_TYPEPLUS:
1487      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
1488        case OP_TYPEPOSPLUS:
1489      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1490      return FALSE;      return FALSE;
1491    
# Line 1325  for (code = first_significant_code(code Line 1497  for (code = first_significant_code(code
1497      case OP_ALT:      case OP_ALT:
1498      return TRUE;      return TRUE;
1499    
1500      /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1501      followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
1502    
1503  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1504      case OP_STAR:      case OP_STAR:
1505      case OP_MINSTAR:      case OP_MINSTAR:
1506        case OP_POSSTAR:
1507      case OP_QUERY:      case OP_QUERY:
1508      case OP_MINQUERY:      case OP_MINQUERY:
1509        case OP_POSQUERY:
1510      case OP_UPTO:      case OP_UPTO:
1511      case OP_MINUPTO:      case OP_MINUPTO:
1512        case OP_POSUPTO:
1513      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1514      break;      break;
1515  #endif  #endif
# Line 1452  earlier groups that are outside the curr Line 1627  earlier groups that are outside the curr
1627  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1628  it, after it has been compiled. This means that any OP_RECURSE items within it  it, after it has been compiled. This means that any OP_RECURSE items within it
1629  that refer to the group itself or any contained groups have to have their  that refer to the group itself or any contained groups have to have their
1630  offsets adjusted. That is the job of this function. Before it is called, the  offsets adjusted. That is one of the jobs of this function. Before it is called,
1631  partially compiled regex must be temporarily terminated with OP_END.  the partially compiled regex must be temporarily terminated with OP_END.
1632    
1633    This function has been extended with the possibility of forward references for
1634    recursions and subroutine calls. It must also check the list of such references
1635    for the group we are dealing with. If it finds that one of the recursions in
1636    the current group is on this list, it adjusts the offset in the list, not the
1637    value in the reference (which is a group number).
1638    
1639  Arguments:  Arguments:
1640    group      points to the start of the group    group      points to the start of the group
1641    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
1642    utf8       TRUE in UTF-8 mode    utf8       TRUE in UTF-8 mode
1643    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
1644      save_hwm   the hwm forward reference pointer at the start of the group
1645    
1646  Returns:     nothing  Returns:     nothing
1647  */  */
1648    
1649  static void  static void
1650  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1651      uschar *save_hwm)
1652  {  {
1653  uschar *ptr = group;  uschar *ptr = group;
1654  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1655    {    {
1656    int offset = GET(ptr, 1);    int offset;
1657    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    uschar *hc;
1658    
1659      /* See if this recursion is on the forward reference list. If so, adjust the
1660      reference. */
1661    
1662      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1663        {
1664        offset = GET(hc, 0);
1665        if (cd->start_code + offset == ptr + 1)
1666          {
1667          PUT(hc, 0, offset + adjust);
1668          break;
1669          }
1670        }
1671    
1672      /* Otherwise, adjust the recursion offset if it's after the start of this
1673      group. */
1674    
1675      if (hc >= cd->hwm)
1676        {
1677        offset = GET(ptr, 1);
1678        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1679        }
1680    
1681    ptr += 1 + LINK_SIZE;    ptr += 1 + LINK_SIZE;
1682    }    }
1683  }  }
# Line 1550  Yield:        TRUE when range returned; Line 1756  Yield:        TRUE when range returned;
1756  */  */
1757    
1758  static BOOL  static BOOL
1759  get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1760      unsigned int *odptr)
1761  {  {
1762  int c, othercase, next;  unsigned int c, othercase, next;
1763    
1764  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
1765    { if ((othercase = _pcre_ucp_othercase(c)) >= 0) break; }    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1766    
1767  if (c > d) return FALSE;  if (c > d) return FALSE;
1768    
# Line 1576  return TRUE; Line 1783  return TRUE;
1783  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1784    
1785    
1786    
1787    /*************************************************
1788    *     Check if auto-possessifying is possible    *
1789    *************************************************/
1790    
1791    /* This function is called for unlimited repeats of certain items, to see
1792    whether the next thing could possibly match the repeated item. If not, it makes
1793    sense to automatically possessify the repeated item.
1794    
1795    Arguments:
1796      op_code       the repeated op code
1797      item          data for this item, depends on the opcode
1798      utf8          TRUE in UTF-8 mode
1799      utf8_char     used for utf8 character bytes, NULL if not relevant
1800      ptr           next character in pattern
1801      options       options bits
1802      cd            contains pointers to tables etc.
1803    
1804    Returns:        TRUE if possessifying is wanted
1805    */
1806    
1807    static BOOL
1808    check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1809      const uschar *ptr, int options, compile_data *cd)
1810    {
1811    int next;
1812    
1813    /* Skip whitespace and comments in extended mode */
1814    
1815    if ((options & PCRE_EXTENDED) != 0)
1816      {
1817      for (;;)
1818        {
1819        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1820        if (*ptr == '#')
1821          {
1822          while (*(++ptr) != 0)
1823            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1824          }
1825        else break;
1826        }
1827      }
1828    
1829    /* If the next item is one that we can handle, get its value. A non-negative
1830    value is a character, a negative value is an escape value. */
1831    
1832    if (*ptr == '\\')
1833      {
1834      int temperrorcode = 0;
1835      next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1836      if (temperrorcode != 0) return FALSE;
1837      ptr++;    /* Point after the escape sequence */
1838      }
1839    
1840    else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1841      {
1842    #ifdef SUPPORT_UTF8
1843      if (utf8) { GETCHARINC(next, ptr); } else
1844    #endif
1845      next = *ptr++;
1846      }
1847    
1848    else return FALSE;
1849    
1850    /* Skip whitespace and comments in extended mode */
1851    
1852    if ((options & PCRE_EXTENDED) != 0)
1853      {
1854      for (;;)
1855        {
1856        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1857        if (*ptr == '#')
1858          {
1859          while (*(++ptr) != 0)
1860            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1861          }
1862        else break;
1863        }
1864      }
1865    
1866    /* If the next thing is itself optional, we have to give up. */
1867    
1868    if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1869      return FALSE;
1870    
1871    /* Now compare the next item with the previous opcode. If the previous is a
1872    positive single character match, "item" either contains the character or, if
1873    "item" is greater than 127 in utf8 mode, the character's bytes are in
1874    utf8_char. */
1875    
1876    
1877    /* Handle cases when the next item is a character. */
1878    
1879    if (next >= 0) switch(op_code)
1880      {
1881      case OP_CHAR:
1882    #ifdef SUPPORT_UTF8
1883      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1884    #endif
1885      return item != next;
1886    
1887      /* For CHARNC (caseless character) we must check the other case. If we have
1888      Unicode property support, we can use it to test the other case of
1889      high-valued characters. */
1890    
1891      case OP_CHARNC:
1892    #ifdef SUPPORT_UTF8
1893      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1894    #endif
1895      if (item == next) return FALSE;
1896    #ifdef SUPPORT_UTF8
1897      if (utf8)
1898        {
1899        unsigned int othercase;
1900        if (next < 128) othercase = cd->fcc[next]; else
1901    #ifdef SUPPORT_UCP
1902        othercase = _pcre_ucp_othercase((unsigned int)next);
1903    #else
1904        othercase = NOTACHAR;
1905    #endif
1906        return (unsigned int)item != othercase;
1907        }
1908      else
1909    #endif  /* SUPPORT_UTF8 */
1910      return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
1911    
1912      /* For OP_NOT, "item" must be a single-byte character. */
1913    
1914      case OP_NOT:
1915      if (next < 0) return FALSE;  /* Not a character */
1916      if (item == next) return TRUE;
1917      if ((options & PCRE_CASELESS) == 0) return FALSE;
1918    #ifdef SUPPORT_UTF8
1919      if (utf8)
1920        {
1921        unsigned int othercase;
1922        if (next < 128) othercase = cd->fcc[next]; else
1923    #ifdef SUPPORT_UCP
1924        othercase = _pcre_ucp_othercase(next);
1925    #else
1926        othercase = NOTACHAR;
1927    #endif
1928        return (unsigned int)item == othercase;
1929        }
1930      else
1931    #endif  /* SUPPORT_UTF8 */
1932      return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
1933    
1934      case OP_DIGIT:
1935      return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1936    
1937      case OP_NOT_DIGIT:
1938      return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1939    
1940      case OP_WHITESPACE:
1941      return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1942    
1943      case OP_NOT_WHITESPACE:
1944      return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1945    
1946      case OP_WORDCHAR:
1947      return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1948    
1949      case OP_NOT_WORDCHAR:
1950      return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1951    
1952      default:
1953      return FALSE;
1954      }
1955    
1956    
1957    /* Handle the case when the next item is \d, \s, etc. */
1958    
1959    switch(op_code)
1960      {
1961      case OP_CHAR:
1962      case OP_CHARNC:
1963    #ifdef SUPPORT_UTF8
1964      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1965    #endif
1966      switch(-next)
1967        {
1968        case ESC_d:
1969        return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
1970    
1971        case ESC_D:
1972        return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
1973    
1974        case ESC_s:
1975        return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
1976    
1977        case ESC_S:
1978        return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
1979    
1980        case ESC_w:
1981        return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
1982    
1983        case ESC_W:
1984        return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
1985    
1986        default:
1987        return FALSE;
1988        }
1989    
1990      case OP_DIGIT:
1991      return next == -ESC_D || next == -ESC_s || next == -ESC_W;
1992    
1993      case OP_NOT_DIGIT:
1994      return next == -ESC_d;
1995    
1996      case OP_WHITESPACE:
1997      return next == -ESC_S || next == -ESC_d || next == -ESC_w;
1998    
1999      case OP_NOT_WHITESPACE:
2000      return next == -ESC_s;
2001    
2002      case OP_WORDCHAR:
2003      return next == -ESC_W || next == -ESC_s;
2004    
2005      case OP_NOT_WORDCHAR:
2006      return next == -ESC_w || next == -ESC_d;
2007    
2008      default:
2009      return FALSE;
2010      }
2011    
2012    /* Control does not reach here */
2013    }
2014    
2015    
2016    
2017  /*************************************************  /*************************************************
2018  *           Compile one branch                   *  *           Compile one branch                   *
2019  *************************************************/  *************************************************/
2020    
2021  /* Scan the pattern, compiling it into the code vector. If the options are  /* Scan the pattern, compiling it into a vector. If the options are
2022  changed during the branch, the pointer is used to change the external options  changed during the branch, the pointer is used to change the external options
2023  bits.  bits. This function is used during the pre-compile phase when we are trying
2024    to find out the amount of memory needed, as well as during the real compile
2025    phase. The value of lengthptr distinguishes the two phases.
2026    
2027  Arguments:  Arguments:
2028    optionsptr     pointer to the option bits    optionsptr     pointer to the option bits
   brackets       points to number of extracting brackets used  
2029    codeptr        points to the pointer to the current code point    codeptr        points to the pointer to the current code point
2030    ptrptr         points to the current pattern pointer    ptrptr         points to the current pattern pointer
2031    errorcodeptr   points to error code variable    errorcodeptr   points to error code variable
# Line 1594  Arguments: Line 2033  Arguments:
2033    reqbyteptr     set to the last literal character required, else < 0    reqbyteptr     set to the last literal character required, else < 0
2034    bcptr          points to current branch chain    bcptr          points to current branch chain
2035    cd             contains pointers to tables etc.    cd             contains pointers to tables etc.
2036      lengthptr      NULL during the real compile phase
2037                     points to length accumulator during pre-compile phase
2038    
2039  Returns:         TRUE on success  Returns:         TRUE on success
2040                   FALSE, with *errorcodeptr set non-zero on error                   FALSE, with *errorcodeptr set non-zero on error
2041  */  */
2042    
2043  static BOOL  static BOOL
2044  compile_branch(int *optionsptr, int *brackets, uschar **codeptr,  compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2045    const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,    int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2046    int *reqbyteptr, branch_chain *bcptr, compile_data *cd)    compile_data *cd, int *lengthptr)
2047  {  {
2048  int repeat_type, op_type;  int repeat_type, op_type;
2049  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
# Line 1613  int zeroreqbyte, zerofirstbyte; Line 2054  int zeroreqbyte, zerofirstbyte;
2054  int req_caseopt, reqvary, tempreqvary;  int req_caseopt, reqvary, tempreqvary;
2055  int options = *optionsptr;  int options = *optionsptr;
2056  int after_manual_callout = 0;  int after_manual_callout = 0;
2057    int length_prevgroup = 0;
2058  register int c;  register int c;
2059  register uschar *code = *codeptr;  register uschar *code = *codeptr;
2060    uschar *last_code = code;
2061    uschar *orig_code = code;
2062  uschar *tempcode;  uschar *tempcode;
2063  BOOL inescq = FALSE;  BOOL inescq = FALSE;
2064  BOOL groupsetfirstbyte = FALSE;  BOOL groupsetfirstbyte = FALSE;
# Line 1622  const uschar *ptr = *ptrptr; Line 2066  const uschar *ptr = *ptrptr;
2066  const uschar *tempptr;  const uschar *tempptr;
2067  uschar *previous = NULL;  uschar *previous = NULL;
2068  uschar *previous_callout = NULL;  uschar *previous_callout = NULL;
2069    uschar *save_hwm = NULL;
2070  uschar classbits[32];  uschar classbits[32];
2071    
2072  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 1631  uschar *class_utf8data; Line 2076  uschar *class_utf8data;
2076  uschar utf8_char[6];  uschar utf8_char[6];
2077  #else  #else
2078  BOOL utf8 = FALSE;  BOOL utf8 = FALSE;
2079    uschar *utf8_char = NULL;
2080    #endif
2081    
2082    #ifdef DEBUG
2083    if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2084  #endif  #endif
2085    
2086  /* Set up the default and non-default settings for greediness */  /* Set up the default and non-default settings for greediness */
# Line 1664  for (;; ptr++) Line 2114  for (;; ptr++)
2114    BOOL negate_class;    BOOL negate_class;
2115    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2116    BOOL is_quantifier;    BOOL is_quantifier;
2117      BOOL is_recurse;
2118      BOOL reset_bracount;
2119    int class_charcount;    int class_charcount;
2120    int class_lastchar;    int class_lastchar;
2121    int newoptions;    int newoptions;
2122    int recno;    int recno;
2123      int refsign;
2124    int skipbytes;    int skipbytes;
2125    int subreqbyte;    int subreqbyte;
2126    int subfirstbyte;    int subfirstbyte;
2127      int terminator;
2128    int mclength;    int mclength;
2129    uschar mcbuffer[8];    uschar mcbuffer[8];
2130    
2131    /* Next byte in the pattern */    /* Get next byte in the pattern */
2132    
2133    c = *ptr;    c = *ptr;
2134    
2135      /* If we are in the pre-compile phase, accumulate the length used for the
2136      previous cycle of this loop. */
2137    
2138      if (lengthptr != NULL)
2139        {
2140    #ifdef DEBUG
2141        if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2142    #endif
2143        if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2144          {
2145          *errorcodeptr = ERR52;
2146          goto FAILED;
2147          }
2148    
2149        /* There is at least one situation where code goes backwards: this is the
2150        case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2151        the class is simply eliminated. However, it is created first, so we have to
2152        allow memory for it. Therefore, don't ever reduce the length at this point.
2153        */
2154    
2155        if (code < last_code) code = last_code;
2156        *lengthptr += code - last_code;
2157        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2158    
2159        /* If "previous" is set and it is not at the start of the work space, move
2160        it back to there, in order to avoid filling up the work space. Otherwise,
2161        if "previous" is NULL, reset the current code pointer to the start. */
2162    
2163        if (previous != NULL)
2164          {
2165          if (previous > orig_code)
2166            {
2167            memmove(orig_code, previous, code - previous);
2168            code -= previous - orig_code;
2169            previous = orig_code;
2170            }
2171          }
2172        else code = orig_code;
2173    
2174        /* Remember where this code item starts so we can pick up the length
2175        next time round. */
2176    
2177        last_code = code;
2178        }
2179    
2180      /* In the real compile phase, just check the workspace used by the forward
2181      reference list. */
2182    
2183      else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2184        {
2185        *errorcodeptr = ERR52;
2186        goto FAILED;
2187        }
2188    
2189    /* If in \Q...\E, check for the end; if not, we have a literal */    /* If in \Q...\E, check for the end; if not, we have a literal */
2190    
2191    if (inescq && c != 0)    if (inescq && c != 0)
# Line 1692  for (;; ptr++) Line 2200  for (;; ptr++)
2200        {        {
2201        if (previous_callout != NULL)        if (previous_callout != NULL)
2202          {          {
2203          complete_callout(previous_callout, ptr, cd);          if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2204              complete_callout(previous_callout, ptr, cd);
2205          previous_callout = NULL;          previous_callout = NULL;
2206          }          }
2207        if ((options & PCRE_AUTO_CALLOUT) != 0)        if ((options & PCRE_AUTO_CALLOUT) != 0)
# Line 1713  for (;; ptr++) Line 2222  for (;; ptr++)
2222    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2223         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
2224      {      {
2225      complete_callout(previous_callout, ptr, cd);      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2226          complete_callout(previous_callout, ptr, cd);
2227      previous_callout = NULL;      previous_callout = NULL;
2228      }      }
2229    
# Line 1724  for (;; ptr++) Line 2234  for (;; ptr++)
2234      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2235      if (c == '#')      if (c == '#')
2236        {        {
2237        while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break;        while (*(++ptr) != 0)
       if (*ptr != 0)  
2238          {          {
2239          ptr += cd->nllen - 1;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
         continue;  
2240          }          }
2241          if (*ptr != 0) continue;
2242    
2243        /* Else fall through to handle end of string */        /* Else fall through to handle end of string */
2244        c = 0;        c = 0;
2245        }        }
# Line 1745  for (;; ptr++) Line 2255  for (;; ptr++)
2255    
2256    switch(c)    switch(c)
2257      {      {
2258      /* The branch terminates at end of string, |, or ). */      /* ===================================================================*/
2259        case 0:                        /* The branch terminates at string end */
2260      case 0:      case '|':                      /* or | or ) */
     case '|':  
2261      case ')':      case ')':
2262      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2263      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2264      *codeptr = code;      *codeptr = code;
2265      *ptrptr = ptr;      *ptrptr = ptr;
2266        if (lengthptr != NULL)
2267          {
2268          *lengthptr += code - last_code;   /* To include callout length */
2269          DPRINTF((">> end branch\n"));
2270          }
2271      return TRUE;      return TRUE;
2272    
2273    
2274        /* ===================================================================*/
2275      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2276      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2277    
# Line 1784  for (;; ptr++) Line 2300  for (;; ptr++)
2300      *code++ = OP_ANY;      *code++ = OP_ANY;
2301      break;      break;
2302    
2303    
2304        /* ===================================================================*/
2305      /* Character classes. If the included characters are all < 256, we build a      /* Character classes. If the included characters are all < 256, we build a
2306      32-byte bitmap of the permitted characters, except in the special case      32-byte bitmap of the permitted characters, except in the special case
2307      where there is only one such character. For negated classes, we build the      where there is only one such character. For negated classes, we build the
# Line 1822  for (;; ptr++) Line 2340  for (;; ptr++)
2340        }        }
2341    
2342      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2343      of just a single character (as long as it's < 256). For higher valued UTF-8      of just a single character (as long as it's < 256). However, for higher
2344      characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
2345    
2346      class_charcount = 0;      class_charcount = 0;
2347      class_lastchar = -1;      class_lastchar = -1;
2348    
2349        /* Initialize the 32-char bit map to all zeros. We build the map in a
2350        temporary bit of memory, in case the class contains only 1 character (less
2351        than 256), because in that case the compiled code doesn't use the bit map.
2352        */
2353    
2354        memset(classbits, 0, 32 * sizeof(uschar));
2355    
2356  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2357      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2358      class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2359  #endif  #endif
2360    
     /* Initialize the 32-char bit map to all zeros. We have to build the  
     map in a temporary bit of store, in case the class contains only 1  
     character (< 256), because in that case the compiled code doesn't use the  
     bit map. */  
   
     memset(classbits, 0, 32 * sizeof(uschar));  
   
2361      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
2362      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. At the start of the
2363      through the regex checked the overall syntax, so we don't need to be very      loop, c contains the first byte of the character. */
     strict here. At the start of the loop, c contains the first byte of the  
     character. */  
2364    
2365      do      if (c != 0) do
2366        {        {
2367          const uschar *oldptr;
2368    
2369  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2370        if (utf8 && c > 127)        if (utf8 && c > 127)
2371          {                           /* Braces are required because the */          {                           /* Braces are required because the */
# Line 1859  for (;; ptr++) Line 2377  for (;; ptr++)
2377    
2378        if (inescq)        if (inescq)
2379          {          {
2380          if (c == '\\' && ptr[1] == 'E')          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2381            {            {
2382            inescq = FALSE;            inescq = FALSE;                   /* Reset literal state */
2383            ptr++;            ptr++;                            /* Skip the 'E' */
2384            continue;            continue;                         /* Carry on with next */
2385            }            }
2386          else goto LONE_SINGLE_CHARACTER;          goto CHECK_RANGE;                   /* Could be range if \E follows */
2387          }          }
2388    
2389        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
# Line 1956  for (;; ptr++) Line 2474  for (;; ptr++)
2474          }          }
2475    
2476        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
2477        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. The sequence \b is a special
2478        validity in the pre-compiling pass. The sequence \b is a special case.        case. Inside a class (and only there) it is treated as backspace.
2479        Inside a class (and only there) it is treated as backspace. Elsewhere        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2480        it marks a word boundary. Other escapes have preset maps ready to        to or into the one we are building. We assume they have more than one
       or into the one we are building. We assume they have more than one  
2481        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2482    
2483        if (c == '\\')        if (c == '\\')
2484          {          {
2485          c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2486            if (*errorcodeptr != 0) goto FAILED;
2487    
2488          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2489          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2490            else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2491          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
2492            {            {
2493            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == '\\' && ptr[2] == 'E')
# Line 1983  for (;; ptr++) Line 2502  for (;; ptr++)
2502            {            {
2503            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
2504            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
2505            switch (-c)  
2506              /* Save time by not doing this in the pre-compile phase. */
2507    
2508              if (lengthptr == NULL) switch (-c)
2509              {              {
2510              case ESC_d:              case ESC_d:
2511              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
# Line 2011  for (;; ptr++) Line 2533  for (;; ptr++)
2533              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2534              continue;              continue;
2535    
2536  #ifdef SUPPORT_UCP              case ESC_E: /* Perl ignores an orphan \E */
2537              case ESC_p:              continue;
2538              case ESC_P:  
2539                default:    /* Not recognized; fall through */
2540                break;      /* Need "default" setting to stop compiler warning. */
2541                }
2542    
2543              /* In the pre-compile phase, just do the recognition. */
2544    
2545              else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2546                       c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2547    
2548              /* We need to deal with \H, \h, \V, and \v in both phases because
2549              they use extra memory. */
2550    
2551              if (-c == ESC_h)
2552                {
2553                SETBIT(classbits, 0x09); /* HT */
2554                SETBIT(classbits, 0x20); /* SPACE */
2555                SETBIT(classbits, 0xa0); /* NBSP */
2556    #ifdef SUPPORT_UTF8
2557                if (utf8)
2558                  {
2559                  class_utf8 = TRUE;
2560                  *class_utf8data++ = XCL_SINGLE;
2561                  class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2562                  *class_utf8data++ = XCL_SINGLE;
2563                  class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2564                  *class_utf8data++ = XCL_RANGE;
2565                  class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2566                  class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2567                  *class_utf8data++ = XCL_SINGLE;
2568                  class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2569                  *class_utf8data++ = XCL_SINGLE;
2570                  class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2571                  *class_utf8data++ = XCL_SINGLE;
2572                  class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2573                  }
2574    #endif
2575                continue;
2576                }
2577    
2578              if (-c == ESC_H)
2579                {
2580                for (c = 0; c < 32; c++)
2581                {                {
2582                BOOL negated;                int x = 0xff;
2583                int pdata;                switch (c)
2584                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);                  {
2585                if (ptype < 0) goto FAILED;                  case 0x09/8: x ^= 1 << (0x09%8); break;
2586                    case 0x20/8: x ^= 1 << (0x20%8); break;
2587                    case 0xa0/8: x ^= 1 << (0xa0%8); break;
2588                    default: break;
2589                    }
2590                  classbits[c] |= x;
2591                  }
2592    
2593    #ifdef SUPPORT_UTF8
2594                if (utf8)
2595                  {
2596                class_utf8 = TRUE;                class_utf8 = TRUE;
2597                *class_utf8data++ = ((-c == ESC_p) != negated)?                *class_utf8data++ = XCL_RANGE;
2598                  XCL_PROP : XCL_NOTPROP;                class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2599                *class_utf8data++ = ptype;                class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2600                *class_utf8data++ = pdata;                *class_utf8data++ = XCL_RANGE;
2601                class_charcount -= 2;   /* Not a < 256 character */                class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2602                }                class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2603                  *class_utf8data++ = XCL_RANGE;
2604                  class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2605                  class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2606                  *class_utf8data++ = XCL_RANGE;
2607                  class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2608                  class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2609                  *class_utf8data++ = XCL_RANGE;
2610                  class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2611                  class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2612                  *class_utf8data++ = XCL_RANGE;
2613                  class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2614                  class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2615                  *class_utf8data++ = XCL_RANGE;
2616                  class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2617                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2618                  }
2619    #endif
2620                continue;
2621                }
2622    
2623              if (-c == ESC_v)
2624                {
2625                SETBIT(classbits, 0x0a); /* LF */
2626                SETBIT(classbits, 0x0b); /* VT */
2627                SETBIT(classbits, 0x0c); /* FF */
2628                SETBIT(classbits, 0x0d); /* CR */
2629                SETBIT(classbits, 0x85); /* NEL */
2630    #ifdef SUPPORT_UTF8
2631                if (utf8)
2632                  {
2633                  class_utf8 = TRUE;
2634                  *class_utf8data++ = XCL_RANGE;
2635                  class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2636                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2637                  }
2638    #endif
2639                continue;
2640                }
2641    
2642              if (-c == ESC_V)
2643                {
2644                for (c = 0; c < 32; c++)
2645                  {
2646                  int x = 0xff;
2647                  switch (c)
2648                    {
2649                    case 0x0a/8: x ^= 1 << (0x0a%8);
2650                                 x ^= 1 << (0x0b%8);
2651                                 x ^= 1 << (0x0c%8);
2652                                 x ^= 1 << (0x0d%8);
2653                                 break;
2654                    case 0x85/8: x ^= 1 << (0x85%8); break;
2655                    default: break;
2656                    }
2657                  classbits[c] |= x;
2658                  }
2659    
2660    #ifdef SUPPORT_UTF8
2661                if (utf8)
2662                  {
2663                  class_utf8 = TRUE;
2664                  *class_utf8data++ = XCL_RANGE;
2665                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2666                  class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2667                  *class_utf8data++ = XCL_RANGE;
2668                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2669                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2670                  }
2671    #endif
2672                continue;
2673                }
2674    
2675              /* We need to deal with \P and \p in both phases. */
2676    
2677    #ifdef SUPPORT_UCP
2678              if (-c == ESC_p || -c == ESC_P)
2679                {
2680                BOOL negated;
2681                int pdata;
2682                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2683                if (ptype < 0) goto FAILED;
2684                class_utf8 = TRUE;
2685                *class_utf8data++ = ((-c == ESC_p) != negated)?
2686                  XCL_PROP : XCL_NOTPROP;
2687                *class_utf8data++ = ptype;
2688                *class_utf8data++ = pdata;
2689                class_charcount -= 2;   /* Not a < 256 character */
2690              continue;              continue;
2691                }
2692  #endif  #endif
2693              /* Unrecognized escapes are faulted if PCRE is running in its
2694              strict mode. By default, for compatibility with Perl, they are
2695              treated as literals. */
2696    
2697              /* Unrecognized escapes are faulted if PCRE is running in its            if ((options & PCRE_EXTRA) != 0)
2698              strict mode. By default, for compatibility with Perl, they are              {
2699              treated as literals. */              *errorcodeptr = ERR7;
2700                goto FAILED;
             default:  
             if ((options & PCRE_EXTRA) != 0)  
               {  
               *errorcodeptr = ERR7;  
               goto FAILED;  
               }  
             c = *ptr;              /* The final character */  
             class_charcount -= 2;  /* Undo the default count from above */  
2701              }              }
2702    
2703              class_charcount -= 2;  /* Undo the default count from above */
2704              c = *ptr;              /* Get the final character and fall through */
2705            }            }
2706    
2707          /* Fall through if we have a single character (c >= 0). This may be          /* Fall through if we have a single character (c >= 0). This may be
2708          > 256 in UTF-8 mode. */          greater than 256 in UTF-8 mode. */
2709    
2710          }   /* End of backslash handling */          }   /* End of backslash handling */
2711    
2712        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
2713        Perl does not permit ']' to be the end of the range. A '-' character        Perl does not permit ']' to be the end of the range. A '-' character
2714        here is treated as a literal. */        at the end is treated as a literal. Perl ignores orphaned \E sequences
2715          entirely. The code for handling \Q and \E is messy. */
2716    
2717          CHECK_RANGE:
2718          while (ptr[1] == '\\' && ptr[2] == 'E')
2719            {
2720            inescq = FALSE;
2721            ptr += 2;
2722            }
2723    
2724          oldptr = ptr;
2725    
2726        if (ptr[1] == '-' && ptr[2] != ']')        if (!inescq && ptr[1] == '-')
2727          {          {
2728          int d;          int d;
2729          ptr += 2;          ptr += 2;
2730            while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2731    
2732            /* If we hit \Q (not followed by \E) at this point, go into escaped
2733            mode. */
2734    
2735            while (*ptr == '\\' && ptr[1] == 'Q')
2736              {
2737              ptr += 2;
2738              if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2739              inescq = TRUE;
2740              break;
2741              }
2742    
2743            if (*ptr == 0 || (!inescq && *ptr == ']'))
2744              {
2745              ptr = oldptr;
2746              goto LONE_SINGLE_CHARACTER;
2747              }
2748    
2749  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2750          if (utf8)          if (utf8)
# Line 2071  for (;; ptr++) Line 2759  for (;; ptr++)
2759          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2760          in such circumstances. */          in such circumstances. */
2761    
2762          if (d == '\\')          if (!inescq && d == '\\')
2763            {            {
2764            const uschar *oldptr = ptr;            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2765            d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);            if (*errorcodeptr != 0) goto FAILED;
2766    
2767            /* \b is backspace; \X is literal X; any other special means the '-'            /* \b is backspace; \X is literal X; \R is literal R; any other
2768            was literal */            special means the '-' was literal */
2769    
2770            if (d < 0)            if (d < 0)
2771              {              {
2772              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = '\b';
2773              else if (d == -ESC_X) d = 'X'; else              else if (d == -ESC_X) d = 'X';
2774                else if (d == -ESC_R) d = 'R'; else
2775                {                {
2776                ptr = oldptr - 2;                ptr = oldptr;
2777                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2778                }                }
2779              }              }
2780            }            }
2781    
2782          /* The check that the two values are in the correct order happens in          /* Check that the two values are in the correct order. Optimize
2783          the pre-pass. Optimize one-character ranges */          one-character ranges */
2784    
2785            if (d < c)
2786              {
2787              *errorcodeptr = ERR8;
2788              goto FAILED;
2789              }
2790    
2791          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2792    
# Line 2112  for (;; ptr++) Line 2807  for (;; ptr++)
2807  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2808            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
2809              {              {
2810              int occ, ocd;              unsigned int occ, ocd;
2811              int cc = c;              unsigned int cc = c;
2812              int origd = d;              unsigned int origd = d;
2813              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
2814                {                {
2815                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= (unsigned int)c &&
2816                      ocd <= (unsigned int)d)
2817                    continue;                          /* Skip embedded ranges */
2818    
2819                if (occ < c  && ocd >= c - 1)        /* Extend the basic range */                if (occ < (unsigned int)c  &&
2820                      ocd >= (unsigned int)c - 1)      /* Extend the basic range */
2821                  {                                  /* if there is overlap,   */                  {                                  /* if there is overlap,   */
2822                  c = occ;                           /* noting that if occ < c */                  c = occ;                           /* noting that if occ < c */
2823                  continue;                          /* we can't have ocd > d  */                  continue;                          /* we can't have ocd > d  */
2824                  }                                  /* because a subrange is  */                  }                                  /* because a subrange is  */
2825                if (ocd > d && occ <= d + 1)         /* always shorter than    */                if (ocd > (unsigned int)d &&
2826                      occ <= (unsigned int)d + 1)      /* always shorter than    */
2827                  {                                  /* the basic range.       */                  {                                  /* the basic range.       */
2828                  d = ocd;                  d = ocd;
2829                  continue;                  continue;
# Line 2172  for (;; ptr++) Line 2871  for (;; ptr++)
2871          ranges that lie entirely within 0-127 when there is UCP support; else          ranges that lie entirely within 0-127 when there is UCP support; else
2872          for partial ranges without UCP support. */          for partial ranges without UCP support. */
2873    
2874          for (; c <= d; c++)          class_charcount += d - c + 1;
2875            class_lastchar = d;
2876    
2877            /* We can save a bit of time by skipping this in the pre-compile. */
2878    
2879            if (lengthptr == NULL) for (; c <= d; c++)
2880            {            {
2881            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
2882            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
# Line 2180  for (;; ptr++) Line 2884  for (;; ptr++)
2884              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
2885              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
2886              }              }
           class_charcount++;                /* in case a one-char range */  
           class_lastchar = c;  
2887            }            }
2888    
2889          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
# Line 2205  for (;; ptr++) Line 2907  for (;; ptr++)
2907  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2908          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
2909            {            {
2910            int othercase;            unsigned int othercase;
2911            if ((othercase = _pcre_ucp_othercase(c)) >= 0)            if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
2912              {              {
2913              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
2914              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 2231  for (;; ptr++) Line 2933  for (;; ptr++)
2933          }          }
2934        }        }
2935    
2936      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached. This "while" is the end of the "do" above. */
     loop. This "while" is the end of the "do" above. */  
2937    
2938      while ((c = *(++ptr)) != ']' || inescq);      while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
2939    
2940        if (c == 0)                          /* Missing terminating ']' */
2941          {
2942          *errorcodeptr = ERR6;
2943          goto FAILED;
2944          }
2945    
2946      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
2947      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
# Line 2298  for (;; ptr++) Line 3005  for (;; ptr++)
3005    
3006      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
3007      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode. If there are no characters < 256,
3008      we can omit the bitmap. */      we can omit the bitmap in the actual compiled code. */
3009    
3010  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3011      if (class_utf8)      if (class_utf8)
# Line 2308  for (;; ptr++) Line 3015  for (;; ptr++)
3015        code += LINK_SIZE;        code += LINK_SIZE;
3016        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT : 0;
3017    
3018        /* If the map is required, install it, and move on to the end of        /* If the map is required, move up the extra data to make room for it;
3019        the extra data */        otherwise just move the code pointer to the end of the extra data. */
3020    
3021        if (class_charcount > 0)        if (class_charcount > 0)
3022          {          {
3023          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
3024            memmove(code + 32, code, class_utf8data - code);
3025          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
3026          code = class_utf8data;          code = class_utf8data + 32;
         }  
   
       /* If the map is not required, slide down the extra data. */  
   
       else  
         {  
         int len = class_utf8data - (code + 33);  
         memmove(code + 1, code + 33, len);  
         code += len + 1;  
3027          }          }
3028          else code = class_utf8data;
3029    
3030        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
3031    
# Line 2342  for (;; ptr++) Line 3042  for (;; ptr++)
3042      if (negate_class)      if (negate_class)
3043        {        {
3044        *code++ = OP_NCLASS;        *code++ = OP_NCLASS;
3045        for (c = 0; c < 32; c++) code[c] = ~classbits[c];        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3046            for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3047        }        }
3048      else      else
3049        {        {
# Line 2352  for (;; ptr++) Line 3053  for (;; ptr++)
3053      code += 32;      code += 32;
3054      break;      break;
3055    
3056    
3057        /* ===================================================================*/
3058      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3059      has been tested above. */      has been tested above. */
3060    
# Line 2419  for (;; ptr++) Line 3122  for (;; ptr++)
3122        }        }
3123      else repeat_type = greedy_default;      else repeat_type = greedy_default;
3124    
     /* If previous was a recursion, we need to wrap it inside brackets so that  
     it can be replicated if necessary. */  
   
     if (*previous == OP_RECURSE)  
       {  
       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);  
       code += 1 + LINK_SIZE;  
       *previous = OP_BRA;  
       PUT(previous, 1, code - previous);  
       *code = OP_KET;  
       PUT(code, 1, code - previous);  
       code += 1 + LINK_SIZE;  
       }  
   
3125      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
3126      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
3127      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
# Line 2466  for (;; ptr++) Line 3155  for (;; ptr++)
3155          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3156          }          }
3157    
3158          /* If the repetition is unlimited, it pays to see if the next thing on
3159          the line is something that cannot possibly match this character. If so,
3160          automatically possessifying this item gains some performance in the case
3161          where the match fails. */
3162    
3163          if (!possessive_quantifier &&
3164              repeat_max < 0 &&
3165              check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3166                options, cd))
3167            {
3168            repeat_type = 0;    /* Force greedy */
3169            possessive_quantifier = TRUE;
3170            }
3171    
3172        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3173        }        }
3174    
3175      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
3176      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
3177      character repeats by setting opt_type to add a suitable offset into      character repeats by setting opt_type to add a suitable offset into
3178      repeat_type. OP_NOT is currently used only for single-byte chars. */      repeat_type. We can also test for auto-possessification. OP_NOT is
3179        currently used only for single-byte chars. */
3180    
3181      else if (*previous == OP_NOT)      else if (*previous == OP_NOT)
3182        {        {
3183        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3184        c = previous[1];        c = previous[1];
3185          if (!possessive_quantifier &&
3186              repeat_max < 0 &&
3187              check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3188            {
3189            repeat_type = 0;    /* Force greedy */
3190            possessive_quantifier = TRUE;
3191            }
3192        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
3193        }        }
3194    
# Line 2495  for (;; ptr++) Line 3206  for (;; ptr++)
3206        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3207        c = *previous;        c = *previous;
3208    
3209          if (!possessive_quantifier &&
3210              repeat_max < 0 &&
3211              check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3212            {
3213            repeat_type = 0;    /* Force greedy */
3214            possessive_quantifier = TRUE;
3215            }
3216    
3217        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
3218        if (*previous == OP_PROP || *previous == OP_NOTPROP)        if (*previous == OP_PROP || *previous == OP_NOTPROP)
3219          {          {
# Line 2535  for (;; ptr++) Line 3254  for (;; ptr++)
3254          }          }
3255    
3256        /* A repeat minimum of 1 is optimized into some special cases. If the        /* A repeat minimum of 1 is optimized into some special cases. If the
3257        maximum is unlimited, we use OP_PLUS. Otherwise, the original item it        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3258        left in place and, if the maximum is greater than 1, we use OP_UPTO with        left in place and, if the maximum is greater than 1, we use OP_UPTO with
3259        one less than the maximum. */        one less than the maximum. */
3260    
# Line 2588  for (;; ptr++) Line 3307  for (;; ptr++)
3307            }            }
3308    
3309          /* Else insert an UPTO if the max is greater than the min, again          /* Else insert an UPTO if the max is greater than the min, again
3310          preceded by the character, for the previously inserted code. */          preceded by the character, for the previously inserted code. If the
3311            UPTO is just for 1 instance, we can use QUERY instead. */
3312    
3313          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
3314            {            {
# Line 2607  for (;; ptr++) Line 3327  for (;; ptr++)
3327              *code++ = prop_value;              *code++ = prop_value;
3328              }              }
3329            repeat_max -= repeat_min;            repeat_max -= repeat_min;
3330            *code++ = OP_UPTO + repeat_type;  
3331            PUT2INC(code, 0, repeat_max);            if (repeat_max == 1)
3332                {
3333                *code++ = OP_QUERY + repeat_type;
3334                }
3335              else
3336                {
3337                *code++ = OP_UPTO + repeat_type;
3338                PUT2INC(code, 0, repeat_max);
3339                }
3340            }            }
3341          }          }
3342    
# Line 2675  for (;; ptr++) Line 3403  for (;; ptr++)
3403      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
3404      cases. */      cases. */
3405    
3406      else if (*previous >= OP_BRA || *previous == OP_ONCE ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3407               *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
3408        {        {
3409        register int i;        register int i;
3410        int ketoffset = 0;        int ketoffset = 0;
3411        int len = code - previous;        int len = code - previous;
3412        uschar *bralink = NULL;        uschar *bralink = NULL;
3413    
3414          /* Repeating a DEFINE group is pointless */
3415    
3416          if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3417            {
3418            *errorcodeptr = ERR55;
3419            goto FAILED;
3420            }
3421    
3422          /* This is a paranoid check to stop integer overflow later on */
3423    
3424          if (len > MAX_DUPLENGTH)
3425            {
3426            *errorcodeptr = ERR50;
3427            goto FAILED;
3428            }
3429    
3430        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3431        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3432        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 2717  for (;; ptr++) Line 3461  for (;; ptr++)
3461          /* If the maximum is 1 or unlimited, we just have to stick in the          /* If the maximum is 1 or unlimited, we just have to stick in the
3462          BRAZERO and do no more at this point. However, we do need to adjust          BRAZERO and do no more at this point. However, we do need to adjust
3463          any OP_RECURSE calls inside the group that refer to the group itself or          any OP_RECURSE calls inside the group that refer to the group itself or
3464          any internal group, because the offset is from the start of the whole          any internal or forward referenced group, because the offset is from
3465          regex. Temporarily terminate the pattern while doing this. */          the start of the whole regex. Temporarily terminate the pattern while
3466            doing this. */
3467    
3468          if (repeat_max <= 1)          if (repeat_max <= 1)
3469            {            {
3470            *code = OP_END;            *code = OP_END;
3471            adjust_recurse(previous, 1, utf8, cd);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3472            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3473            code++;            code++;
3474            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2741  for (;; ptr++) Line 3486  for (;; ptr++)
3486            {            {
3487            int offset;            int offset;
3488            *code = OP_END;            *code = OP_END;
3489            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3490            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
3491            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
3492            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2761  for (;; ptr++) Line 3506  for (;; ptr++)
3506        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
3507        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
3508        copies that we need. If we set a first char from the group, and didn't        copies that we need. If we set a first char from the group, and didn't
3509        set a required char, copy the latter from the former. */        set a required char, copy the latter from the former. If there are any
3510          forward reference subroutine calls in the group, there will be entries on
3511          the workspace list; replicate these with an appropriate increment. */
3512    
3513        else        else
3514          {          {
3515          if (repeat_min > 1)          if (repeat_min > 1)
3516            {            {
3517            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;            /* In the pre-compile phase, we don't actually do the replication. We
3518            for (i = 1; i < repeat_min; i++)            just adjust the length as if we had. */
3519    
3520              if (lengthptr != NULL)
3521                *lengthptr += (repeat_min - 1)*length_prevgroup;
3522    
3523              /* This is compiling for real */
3524    
3525              else
3526              {              {
3527              memcpy(code, previous, len);              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3528              code += len;              for (i = 1; i < repeat_min; i++)
3529                  {
3530                  uschar *hc;
3531                  uschar *this_hwm = cd->hwm;
3532                  memcpy(code, previous, len);
3533                  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3534                    {
3535                    PUT(cd->hwm, 0, GET(hc, 0) + len);
3536                    cd->hwm += LINK_SIZE;
3537                    }
3538                  save_hwm = this_hwm;
3539                  code += len;
3540                  }
3541              }              }
3542            }            }
3543    
3544          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
3545          }          }
3546    
# Line 2781  for (;; ptr++) Line 3548  for (;; ptr++)
3548        the maximum is limited, it replicates the group in a nested fashion,        the maximum is limited, it replicates the group in a nested fashion,
3549        remembering the bracket starts on a stack. In the case of a zero minimum,        remembering the bracket starts on a stack. In the case of a zero minimum,
3550        the first one was set up above. In all cases the repeat_max now specifies        the first one was set up above. In all cases the repeat_max now specifies
3551        the number of additional copies needed. */        the number of additional copies needed. Again, we must remember to
3552          replicate entries on the forward reference list. */
3553    
3554        if (repeat_max >= 0)        if (repeat_max >= 0)
3555          {          {
3556          for (i = repeat_max - 1; i >= 0; i--)          /* In the pre-compile phase, we don't actually do the replication. We
3557            just adjust the length as if we had. For each repetition we must add 1
3558            to the length for BRAZERO and for all but the last repetition we must
3559            add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3560    
3561            if (lengthptr != NULL && repeat_max > 0)
3562              *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3563                2 - 2*LINK_SIZE;  /* Last one doesn't nest */
3564    
3565            /* This is compiling for real */
3566    
3567            else for (i = repeat_max - 1; i >= 0; i--)
3568            {            {
3569              uschar *hc;
3570              uschar *this_hwm = cd->hwm;
3571    
3572            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
3573    
3574            /* All but the final copy start a new nesting, maintaining the            /* All but the final copy start a new nesting, maintaining the
# Line 2802  for (;; ptr++) Line 3584  for (;; ptr++)
3584              }              }
3585    
3586            memcpy(code, previous, len);            memcpy(code, previous, len);
3587              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3588                {
3589                PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3590                cd->hwm += LINK_SIZE;
3591                }
3592              save_hwm = this_hwm;
3593            code += len;            code += len;
3594            }            }
3595    
# Line 2824  for (;; ptr++) Line 3612  for (;; ptr++)
3612        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
3613        can't just offset backwards from the current code point, because we        can't just offset backwards from the current code point, because we
3614        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
3615        correct offset was computed above. */        correct offset was computed above.
3616    
3617          Then, when we are doing the actual compile phase, check to see whether
3618          this group is a non-atomic one that could match an empty string. If so,
3619          convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3620          that runtime checking can be done. [This check is also applied to
3621          atomic groups at runtime, but in a different way.] */
3622    
3623        else code[-ketoffset] = OP_KETRMAX + repeat_type;        else
3624            {
3625            uschar *ketcode = code - ketoffset;
3626            uschar *bracode = ketcode - GET(ketcode, 1);
3627            *ketcode = OP_KETRMAX + repeat_type;
3628            if (lengthptr == NULL && *bracode != OP_ONCE)
3629              {
3630              uschar *scode = bracode;
3631              do
3632                {
3633                if (could_be_empty_branch(scode, ketcode, utf8))
3634                  {
3635                  *bracode += OP_SBRA - OP_BRA;
3636                  break;
3637                  }
3638                scode += GET(scode, 1);
3639                }
3640              while (*scode == OP_ALT);
3641              }
3642            }
3643        }        }
3644    
3645      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 2837  for (;; ptr++) Line 3650  for (;; ptr++)
3650        goto FAILED;        goto FAILED;
3651        }        }
3652    
3653      /* If the character following a repeat is '+', we wrap the entire repeated      /* If the character following a repeat is '+', or if certain optimization
3654      item inside OP_ONCE brackets. This is just syntactic sugar, taken from      tests above succeeded, possessive_quantifier is TRUE. For some of the
3655      Sun's Java package. The repeated item starts at tempcode, not at previous,      simpler opcodes, there is a special alternative opcode for this. For
3656      which might be the first part of a string whose (former) last char we      anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3657      repeated. However, we don't support '+' after a greediness '?'. */      The '+' notation is just syntactic sugar, taken from Sun's Java package,
3658        but the special opcodes can optimize it a bit. The repeated item starts at
3659        tempcode, not at previous, which might be the first part of a string whose
3660        (former) last char we repeated.
3661    
3662        Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3663        an 'upto' may follow. We skip over an 'exact' item, and then test the
3664        length of what remains before proceeding. */
3665    
3666      if (possessive_quantifier)      if (possessive_quantifier)
3667        {        {
3668        int len = code - tempcode;        int len;
3669        memmove(tempcode + 1+LINK_SIZE, tempcode, len);        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3670        code += 1 + LINK_SIZE;            *tempcode == OP_NOTEXACT)
3671        len += 1 + LINK_SIZE;          tempcode += _pcre_OP_lengths[*tempcode];
3672        tempcode[0] = OP_ONCE;        len = code - tempcode;
3673        *code++ = OP_KET;        if (len > 0) switch (*tempcode)
3674        PUTINC(code, 0, len);          {
3675        PUT(tempcode, 1, len);          case OP_STAR:  *tempcode = OP_POSSTAR; break;
3676            case OP_PLUS:  *tempcode = OP_POSPLUS; break;
3677            case OP_QUERY: *tempcode = OP_POSQUERY; break;
3678            case OP_UPTO:  *tempcode = OP_POSUPTO; break;
3679    
3680            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
3681            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
3682            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3683            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
3684    
3685            case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
3686            case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
3687            case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3688            case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
3689    
3690            default:
3691            memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3692            code += 1 + LINK_SIZE;
3693            len += 1 + LINK_SIZE;
3694            tempcode[0] = OP_ONCE;
3695            *code++ = OP_KET;
3696            PUTINC(code, 0, len);
3697            PUT(tempcode, 1, len);
3698            break;
3699            }
3700        }        }
3701    
3702      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 2865  for (;; ptr++) Line 3709  for (;; ptr++)
3709      break;      break;
3710    
3711    
3712      /* Start of nested bracket sub-expression, or comment or lookahead or      /* ===================================================================*/
3713      lookbehind or option setting or condition. First deal with special things      /* Start of nested parenthesized sub-expression, or comment or lookahead or
3714      that can come after a bracket; all are introduced by ?, and the appearance      lookbehind or option setting or condition or all the other extended
3715      of any of them means that this is not a referencing group. They were      parenthesis forms. First deal with the specials; all are introduced by ?,
3716      checked for validity in the first pass over the string, so we don't have to      and the appearance of any of them means that this is not a capturing
3717      check for syntax errors here.  */      group. */
3718    
3719      case '(':      case '(':
3720      newoptions = options;      newoptions = options;
3721      skipbytes = 0;      skipbytes = 0;
3722        bravalue = OP_CBRA;
3723        save_hwm = cd->hwm;
3724        reset_bracount = FALSE;
3725    
3726      if (*(++ptr) == '?')      if (*(++ptr) == '?')
3727        {        {
3728        int set, unset;        int i, set, unset, namelen;
3729        int *optset;        int *optset;
3730          const uschar *name;
3731          uschar *slot;
3732    
3733        switch (*(++ptr))        switch (*(++ptr))
3734          {          {
3735          case '#':                 /* Comment; skip to ket */          case '#':                 /* Comment; skip to ket */
3736          ptr++;          ptr++;
3737          while (*ptr != ')') ptr++;          while (*ptr != 0 && *ptr != ')') ptr++;
3738            if (*ptr == 0)
3739              {
3740              *errorcodeptr = ERR18;
3741              goto FAILED;
3742              }
3743          continue;          continue;
3744    
3745          case ':':                 /* Non-extracting bracket */  
3746            /* ------------------------------------------------------------ */
3747            case '|':                 /* Reset capture count for each branch */
3748            reset_bracount = TRUE;
3749            /* Fall through */
3750    
3751            /* ------------------------------------------------------------ */
3752            case ':':                 /* Non-capturing bracket */
3753          bravalue = OP_BRA;          bravalue = OP_BRA;
3754          ptr++;          ptr++;
3755          break;          break;
3756    
3757    
3758            /* ------------------------------------------------------------ */
3759          case '(':          case '(':
3760          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
3761    
3762          /* A condition can be a number, referring to a numbered group, a name,          /* A condition can be an assertion, a number (referring to a numbered
3763          referring to a named group, 'R', referring to recursion, or an          group), a name (referring to a named group), or 'R', referring to
3764          assertion. There are two unfortunate ambiguities, caused by history.          recursion. R<digits> and R&name are also permitted for recursion tests.
3765          (a) 'R' can be the recursive thing or the name 'R', and (b) a number  
3766          could be a name that consists of digits. In both cases, we look for a          There are several syntaxes for testing a named group: (?(name)) is used
3767          name first; if not found, we try the other cases. If the first          by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3768          character after (?( is a word character, we know the rest up to ) will  
3769          also be word characters because the syntax was checked in the first          There are two unfortunate ambiguities, caused by history. (a) 'R' can
3770          pass. */          be the recursive thing or the name 'R' (and similarly for 'R' followed
3771            by digits), and (b) a number could be a name that consists of digits.
3772          if ((cd->ctypes[ptr[1]] & ctype_word) != 0)          In both cases, we look for a name first; if not found, we try the other
3773            {          cases. */
3774            int i, namelen;  
3775            int condref = 0;          /* For conditions that are assertions, check the syntax, and then exit
3776            const uschar *name;          the switch. This will take control down to where bracketed groups,
3777            uschar *slot = cd->name_table;          including assertions, are processed. */
3778    
3779            /* This is needed for all successful cases. */          if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3780              break;
3781    
3782            skipbytes = 3;          /* Most other conditions use OP_CREF (a couple change to OP_RREF
3783            below), and all need to skip 3 bytes at the start of the group. */
3784    
3785            /* Read the name, but also get it as a number if it's all digits */          code[1+LINK_SIZE] = OP_CREF;
3786            skipbytes = 3;
3787            refsign = -1;
3788    
3789            name = ++ptr;          /* Check for a test for recursion in a named group. */
3790            while (*ptr != ')')  
3791              {          if (ptr[1] == 'R' && ptr[2] == '&')
3792              if (condref >= 0)            {
3793                condref = ((digitab[*ptr] & ctype_digit) != 0)?            terminator = -1;
3794                  condref * 10 + *ptr - '0' : -1;            ptr += 2;
3795              ptr++;            code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
3796              }            }
3797            namelen = ptr - name;  
3798            /* Check for a test for a named group's having been set, using the Perl
3799            syntax (?(<name>) or (?('name') */
3800    
3801            else if (ptr[1] == '<')
3802              {
3803              terminator = '>';
3804              ptr++;
3805              }
3806            else if (ptr[1] == '\'')
3807              {
3808              terminator = '\'';
3809            ptr++;            ptr++;
3810              }
3811            else
3812              {
3813              terminator = 0;
3814              if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3815              }
3816    
3817            for (i = 0; i < cd->names_found; i++)          /* We now expect to read a name; anything else is an error */
             {  
             if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;  
             slot += cd->name_entry_size;  
             }  
3818    
3819            /* Found a previous named subpattern */          if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3820              {
3821              ptr += 1;  /* To get the right offset */
3822              *errorcodeptr = ERR28;
3823              goto FAILED;
3824              }
3825    
3826            if (i < cd->names_found)          /* Read the name, but also get it as a number if it's all digits */
             {  
             condref = GET2(slot, 0);  
             code[1+LINK_SIZE] = OP_CREF;  
             PUT2(code, 2+LINK_SIZE, condref);  
             }  
3827    
3828            /* Search the pattern for a forward reference */          recno = 0;
3829            name = ++ptr;
3830            while ((cd->ctypes[*ptr] & ctype_word) != 0)
3831              {
3832              if (recno >= 0)
3833                recno = ((digitab[*ptr] & ctype_digit) != 0)?
3834                  recno * 10 + *ptr - '0' : -1;
3835              ptr++;
3836              }
3837            namelen = ptr - name;
3838    
3839            else if ((i = find_named_parens(ptr, *brackets, name, namelen)) > 0)          if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3840              {            {
3841              code[1+LINK_SIZE] = OP_CREF;            ptr--;      /* Error offset */
3842              PUT2(code, 2+LINK_SIZE, i);            *errorcodeptr = ERR26;
3843              }            goto FAILED;
3844              }
3845    
3846            /* Check for 'R' for recursion */          /* Do no further checking in the pre-compile phase. */
3847    
3848            else if (namelen == 1 && *name == 'R')          if (lengthptr != NULL) break;
             {  
             code[1+LINK_SIZE] = OP_CREF;  
             PUT2(code, 2+LINK_SIZE, CREF_RECURSE);  
             }  
3849    
3850            /* Check for a subpattern number */          /* In the real compile we do the work of looking for the actual
3851            reference. If the string started with "+" or "-" we require the rest to
3852            be digits, in which case recno will be set. */
3853    
3854            else if (condref > 0)          if (refsign > 0)
3855              {
3856              if (recno <= 0)
3857                {
3858                *errorcodeptr = ERR58;
3859                goto FAILED;
3860                }
3861              if (refsign == '-')
3862              {              {
3863              code[1+LINK_SIZE] = OP_CREF;              recno = cd->bracount - recno + 1;
3864              PUT2(code, 2+LINK_SIZE, condref);              if (recno <= 0)
3865                  {
3866                  *errorcodeptr = ERR15;
3867                  goto FAILED;
3868                  }
3869              }              }
3870              else recno += cd->bracount;
3871              PUT2(code, 2+LINK_SIZE, recno);
3872              break;
3873              }
3874    
3875            /* Either an unidentified subpattern, or a reference to (?(0) */          /* Otherwise (did not start with "+" or "-"), start by looking for the
3876            name. */
3877    
3878            else          slot = cd->name_table;
3879            for (i = 0; i < cd->names_found; i++)
3880              {
3881              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3882              slot += cd->name_entry_size;
3883              }
3884    
3885            /* Found a previous named subpattern */
3886    
3887            if (i < cd->names_found)
3888              {
3889              recno = GET2(slot, 0);
3890              PUT2(code, 2+LINK_SIZE, recno);
3891              }
3892    
3893            /* Search the pattern for a forward reference */
3894    
3895            else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3896                            (options & PCRE_EXTENDED) != 0)) > 0)
3897              {
3898              PUT2(code, 2+LINK_SIZE, i);
3899              }
3900    
3901            /* If terminator == 0 it means that the name followed directly after
3902            the opening parenthesis [e.g. (?(abc)...] and in this case there are
3903            some further alternatives to try. For the cases where terminator != 0
3904            [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
3905            now checked all the possibilities, so give an error. */
3906    
3907            else if (terminator != 0)
3908              {
3909              *errorcodeptr = ERR15;
3910              goto FAILED;
3911              }
3912    
3913            /* Check for (?(R) for recursion. Allow digits after R to specify a
3914            specific group number. */
3915    
3916            else if (*name == 'R')
3917              {
3918              recno = 0;
3919              for (i = 1; i < namelen; i++)
3920              {              {
3921              *errorcodeptr = (condref == 0)? ERR35: ERR15;              if ((digitab[name[i]] & ctype_digit) == 0)
3922              goto FAILED;                {
3923                  *errorcodeptr = ERR15;
3924                  goto FAILED;
3925                  }
3926                recno = recno * 10 + name[i] - '0';
3927              }              }
3928              if (recno == 0) recno = RREF_ANY;
3929              code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
3930              PUT2(code, 2+LINK_SIZE, recno);
3931              }
3932    
3933            /* Similarly, check for the (?(DEFINE) "condition", which is always
3934            false. */
3935    
3936            else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
3937              {
3938              code[1+LINK_SIZE] = OP_DEF;
3939              skipbytes = 1;
3940              }
3941    
3942            /* Check for the "name" actually being a subpattern number. */
3943    
3944            else if (recno > 0)
3945              {
3946              PUT2(code, 2+LINK_SIZE, recno);
3947            }            }
3948    
3949          /* For conditions that are assertions, we just fall through, having          /* Either an unidentified subpattern, or a reference to (?(0) */
         set bravalue above. */  
3950    
3951            else
3952              {
3953              *errorcodeptr = (recno == 0)? ERR35: ERR15;
3954              goto FAILED;
3955              }
3956          break;          break;
3957    
3958    
3959            /* ------------------------------------------------------------ */
3960          case '=':                 /* Positive lookahead */          case '=':                 /* Positive lookahead */
3961          bravalue = OP_ASSERT;          bravalue = OP_ASSERT;
3962          ptr++;          ptr++;
3963          break;          break;
3964    
3965    
3966            /* ------------------------------------------------------------ */
3967          case '!':                 /* Negative lookahead */          case '!':                 /* Negative lookahead */
3968          bravalue = OP_ASSERT_NOT;          bravalue = OP_ASSERT_NOT;
3969          ptr++;          ptr++;
3970          break;          break;
3971    
3972          case '<':                 /* Lookbehinds */  
3973          switch (*(++ptr))          /* ------------------------------------------------------------ */
3974            case '<':                 /* Lookbehind or named define */
3975            switch (ptr[1])
3976            {            {
3977            case '=':               /* Positive lookbehind */            case '=':               /* Positive lookbehind */
3978            bravalue = OP_ASSERTBACK;            bravalue = OP_ASSERTBACK;
3979            ptr++;            ptr += 2;
3980            break;            break;
3981    
3982            case '!':               /* Negative lookbehind */            case '!':               /* Negative lookbehind */
3983            bravalue = OP_ASSERTBACK_NOT;            bravalue = OP_ASSERTBACK_NOT;
3984            ptr++;            ptr += 2;
3985            break;            break;
3986    
3987              default:                /* Could be name define, else bad */
3988              if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
3989              ptr++;                  /* Correct offset for error */
3990              *errorcodeptr = ERR24;
3991              goto FAILED;
3992            }            }
3993          break;          break;
3994    
3995    
3996            /* ------------------------------------------------------------ */
3997          case '>':                 /* One-time brackets */          case '>':                 /* One-time brackets */
3998          bravalue = OP_ONCE;          bravalue = OP_ONCE;
3999          ptr++;          ptr++;
4000          break;          break;
4001    
4002    
4003            /* ------------------------------------------------------------ */
4004          case 'C':                 /* Callout - may be followed by digits; */          case 'C':                 /* Callout - may be followed by digits; */
4005          previous_callout = code;  /* Save for later completion */          previous_callout = code;  /* Save for later completion */
4006          after_manual_callout = 1; /* Skip one item before completing */          after_manual_callout = 1; /* Skip one item before completing */
4007          *code++ = OP_CALLOUT;     /* Already checked that the terminating */          *code++ = OP_CALLOUT;
4008            {                       /* closing parenthesis is present. */            {
4009            int n = 0;            int n = 0;
4010            while ((digitab[*(++ptr)] & ctype_digit) != 0)            while ((digitab[*(++ptr)] & ctype_digit) != 0)
4011              n = n * 10 + *ptr - '0';              n = n * 10 + *ptr - '0';
4012              if (*ptr != ')')
4013                {
4014                *errorcodeptr = ERR39;
4015                goto FAILED;
4016                }
4017            if (n > 255)            if (n > 255)
4018              {              {
4019              *errorcodeptr = ERR38;              *errorcodeptr = ERR38;
# Line 3034  for (;; ptr++) Line 4027  for (;; ptr++)
4027          previous = NULL;          previous = NULL;
4028          continue;          continue;
4029    
4030          case 'P':                 /* Named subpattern handling */  
4031          if (*(++ptr) == '<')      /* Definition */          /* ------------------------------------------------------------ */
4032            case 'P':                 /* Python-style named subpattern handling */
4033            if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
4034            {            {
4035            int i, namelen;            is_recurse = *ptr == '>';
4036            uschar *slot = cd->name_table;            terminator = ')';
4037            const uschar *name;     /* Don't amalgamate; some compilers */            goto NAMED_REF_OR_RECURSE;
4038            name = ++ptr;           /* grumble at autoincrement in declaration */            }
4039            else if (*ptr != '<')    /* Test for Python-style definition */
4040              {
4041              *errorcodeptr = ERR41;
4042              goto FAILED;
4043              }
4044            /* Fall through to handle (?P< as (?< is handled */
4045    
           while (*ptr++ != '>');  
           namelen = ptr - name - 1;  
4046    
4047            for (i = 0; i < cd->names_found; i++)          /* ------------------------------------------------------------ */
4048            DEFINE_NAME:    /* Come here from (?< handling */
4049            case '\'':
4050              {
4051              terminator = (*ptr == '<')? '>' : '\'';
4052              name = ++ptr;
4053    
4054              while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4055              namelen = ptr - name;
4056    
4057              /* In the pre-compile phase, just do a syntax check. */
4058    
4059              if (lengthptr != NULL)
4060              {              {
4061              int crc = memcmp(name, slot+2, namelen);              if (*ptr != terminator)
             if (crc == 0)  
4062                {                {
4063                if (slot[2+namelen] == 0)                *errorcodeptr = ERR42;
4064                  {                goto FAILED;
                 if ((options & PCRE_DUPNAMES) == 0)  
                   {  
                   *errorcodeptr = ERR43;  
                   goto FAILED;  
                   }  
                 }  
               else crc = -1;      /* Current name is substring */  
4065                }                }
4066              if (crc < 0)              if (cd->names_found >= MAX_NAME_COUNT)
4067                {                {
4068                memmove(slot + cd->name_entry_size, slot,                *errorcodeptr = ERR49;
4069                  (cd->names_found - i) * cd->name_entry_size);                goto FAILED;
4070                break;                }
4071                if (namelen + 3 > cd->name_entry_size)
4072                  {
4073                  cd->name_entry_size = namelen + 3;
4074                  if (namelen > MAX_NAME_SIZE)
4075                    {
4076                    *errorcodeptr = ERR48;
4077                    goto FAILED;
4078                    }
4079                }                }
             slot += cd->name_entry_size;  
4080              }              }
4081    
4082            PUT2(slot, 0, *brackets + 1);            /* In the real compile, create the entry in the table */
4083            memcpy(slot + 2, name, namelen);  
4084            slot[2+namelen] = 0;            else
4085            cd->names_found++;              {
4086            goto NUMBERED_GROUP;              slot = cd->name_table;
4087                for (i = 0; i < cd->names_found; i++)
4088                  {
4089                  int crc = memcmp(name, slot+2, namelen);
4090                  if (crc == 0)
4091                    {
4092                    if (slot[2+namelen] == 0)
4093                      {
4094                      if ((options & PCRE_DUPNAMES) == 0)
4095                        {
4096                        *errorcodeptr = ERR43;
4097                        goto FAILED;
4098                        }
4099                      }
4100                    else crc = -1;      /* Current name is substring */
4101                    }
4102                  if (crc < 0)
4103                    {
4104                    memmove(slot + cd->name_entry_size, slot,
4105                      (cd->names_found - i) * cd->name_entry_size);
4106                    break;
4107                    }
4108                  slot += cd->name_entry_size;
4109                  }
4110    
4111                PUT2(slot, 0, cd->bracount + 1);
4112                memcpy(slot + 2, name, namelen);
4113                slot[2+namelen] = 0;
4114                }
4115            }            }
4116    
4117          if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */          /* In both cases, count the number of names we've encountered. */
4118    
4119            ptr++;                    /* Move past > or ' */
4120            cd->names_found++;
4121            goto NUMBERED_GROUP;
4122    
4123    
4124            /* ------------------------------------------------------------ */
4125            case '&':                 /* Perl recursion/subroutine syntax */
4126            terminator = ')';
4127            is_recurse = TRUE;
4128            /* Fall through */
4129    
4130            /* We come here from the Python syntax above that handles both
4131            references (?P=name) and recursion (?P>name), as well as falling
4132            through from the Perl recursion syntax (?&name). */
4133    
4134            NAMED_REF_OR_RECURSE:
4135            name = ++ptr;
4136            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4137            namelen = ptr - name;
4138    
4139            /* In the pre-compile phase, do a syntax check and set a dummy
4140            reference number. */
4141    
4142            if (lengthptr != NULL)
4143            {            {
4144            int i, namelen;            if (*ptr != terminator)
4145            int type = *ptr++;              {
4146            const uschar *name = ptr;              *errorcodeptr = ERR42;
4147            uschar *slot = cd->name_table;              goto FAILED;
4148                }
4149              if (namelen > MAX_NAME_SIZE)
4150                {
4151                *errorcodeptr = ERR48;
4152                goto FAILED;
4153                }
4154              recno = 0;
4155              }
4156    
4157            while (*ptr != ')') ptr++;          /* In the real compile, seek the name in the table */
           namelen = ptr - name;  
4158    
4159            else
4160              {
4161              slot = cd->name_table;
4162            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
4163              {              {
4164              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
# Line 3097  for (;; ptr++) Line 4170  for (;; ptr++)
4170              recno = GET2(slot, 0);              recno = GET2(slot, 0);
4171              }              }
4172            else if ((recno =                /* Forward back reference */            else if ((recno =                /* Forward back reference */
4173                      find_named_parens(ptr, *brackets, name, namelen)) <= 0)                      find_parens(ptr, cd->bracount, name, namelen,
4174                          (options & PCRE_EXTENDED) != 0)) <= 0)
4175              {              {
4176              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
4177              goto FAILED;              goto FAILED;
4178              }              }
4179              }
4180    
4181            if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */          /* In both phases, we can now go to the code that handles numerical
4182            recursion or backreferences. */
           /* Back reference */  
4183    
4184            previous = code;          if (is_recurse) goto HANDLE_RECURSION;
4185            *code++ = OP_REF;            else goto HANDLE_REFERENCE;
           PUT2INC(code, 0, recno);  
           cd->backref_map |= (recno < 32)? (1 << recno) : 1;  
           if (recno > cd->top_backref) cd->top_backref = recno;  
           continue;  
           }  
4186    
         /* Should never happen */  
         break;  
4187    
4188          case 'R':                 /* Pattern recursion */          /* ------------------------------------------------------------ */
4189            case 'R':                 /* Recursion */
4190          ptr++;                    /* Same as (?0)      */          ptr++;                    /* Same as (?0)      */
4191          /* Fall through */          /* Fall through */
4192    
         /* Recursion or "subroutine" call */  
4193    
4194          case '0': case '1': case '2': case '3': case '4':          /* ------------------------------------------------------------ */
4195          case '5': case '6': case '7': case '8': case '9':          case '-': case '+':
4196            case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4197            case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4198            {            {
4199            const uschar *called;            const uschar *called;
4200    
4201              if ((refsign = *ptr) == '+') ptr++;
4202              else if (refsign == '-')
4203                {
4204                if ((digitab[ptr[1]] & ctype_digit) == 0)
4205                  goto OTHER_CHAR_AFTER_QUERY;
4206                ptr++;
4207                }
4208    
4209            recno = 0;            recno = 0;
4210            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4211              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4212    
4213              if (*ptr != ')')
4214                {
4215                *errorcodeptr = ERR29;
4216                goto FAILED;
4217                }
4218    
4219              if (refsign == '-')
4220                {
4221                if (recno == 0)
4222                  {
4223                  *errorcodeptr = ERR58;
4224                  goto FAILED;
4225                  }
4226                recno = cd->bracount - recno + 1;
4227                if (recno <= 0)
4228                  {
4229                  *errorcodeptr = ERR15;
4230                  goto FAILED;
4231                  }
4232                }
4233              else if (refsign == '+')
4234                {
4235                if (recno == 0)
4236                  {
4237                  *errorcodeptr = ERR58;
4238                  goto FAILED;
4239                  }
4240                recno += cd->bracount;
4241                }
4242    
4243            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
4244    
4245            HANDLE_RECURSION:            HANDLE_RECURSION:
4246    
4247            previous = code;            previous = code;
4248              called = cd->start_code;
4249    
4250            /* Find the bracket that is being referenced. Temporarily end the            /* When we are actually compiling, find the bracket that is being
4251            regex in case it doesn't exist. */            referenced. Temporarily end the regex in case it doesn't exist before
4252              this point. If we end up with a forward reference, first check that
4253              the bracket does occur later so we can give the error (and position)
4254              now. Then remember this forward reference in the workspace so it can
4255              be filled in at the end. */
4256    
4257            *code = OP_END;            if (lengthptr == NULL)
           called = (recno == 0)? cd->start_code :  
             find_bracket(cd->start_code, utf8, recno);  
           if (called == NULL)  
4258              {              {
4259              *errorcodeptr = ERR15;              *code = OP_END;
4260              goto FAILED;              if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
             }  
4261    
4262            /* If the subpattern is still open, this is a recursive call. We              /* Forward reference */
           check to see if this is a left recursion that could loop for ever,  
           and diagnose that case. */  
4263    
4264            if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))              if (called == NULL)
4265              {                {
4266              *errorcodeptr = ERR40;                if (find_parens(ptr, cd->bracount, NULL, recno,
4267              goto FAILED;                     (options & PCRE_EXTENDED) != 0) < 0)
4268                    {
4269                    *errorcodeptr = ERR15;
4270                    goto FAILED;
4271                    }
4272                  called = cd->start_code + recno;
4273                  PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4274                  }
4275    
4276                /* If not a forward reference, and the subpattern is still open,
4277                this is a recursive call. We check to see if this is a left
4278                recursion that could loop for ever, and diagnose that case. */
4279    
4280                else if (GET(called, 1) == 0 &&
4281                         could_be_empty(called, code, bcptr, utf8))
4282                  {
4283                  *errorcodeptr = ERR40;
4284                  goto FAILED;
4285                  }
4286              }              }
4287    
4288            /* Insert the recursion/subroutine item, automatically wrapped inside            /* Insert the recursion/subroutine item, automatically wrapped inside
4289            "once" brackets. */            "once" brackets. Set up a "previous group" length so that a
4290              subsequent quantifier will work. */
4291    
4292            *code = OP_ONCE;            *code = OP_ONCE;
4293            PUT(code, 1, 2 + 2*LINK_SIZE);            PUT(code, 1, 2 + 2*LINK_SIZE);
# Line 3174  for (;; ptr++) Line 4300  for (;; ptr++)
4300            *code = OP_KET;            *code = OP_KET;
4301            PUT(code, 1, 2 + 2*LINK_SIZE);            PUT(code, 1, 2 + 2*LINK_SIZE);
4302            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
4303    
4304              length_prevgroup = 3 + 3*LINK_SIZE;
4305            }            }
4306    
4307            /* Can't determine a first byte now */
4308    
4309            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4310          continue;          continue;
4311    
         /* Character after (? not specially recognized */  
4312    
4313          default:                  /* Option setting */          /* ------------------------------------------------------------ */
4314            default:              /* Other characters: check option setting */
4315            OTHER_CHAR_AFTER_QUERY:
4316          set = unset = 0;          set = unset = 0;
4317          optset = &set;          optset = &set;
4318    
# Line 3189  for (;; ptr++) Line 4322  for (;; ptr++)
4322              {              {
4323              case '-': optset = &unset; break;              case '-': optset = &unset; break;
4324    
4325                case 'J':    /* Record that it changed in the external options */
4326                *optset |= PCRE_DUPNAMES;
4327                cd->external_options |= PCRE_JCHANGED;
4328                break;
4329    
4330              case 'i': *optset |= PCRE_CASELESS; break;              case 'i': *optset |= PCRE_CASELESS; break;
             case 'J': *optset |= PCRE_DUPNAMES; break;  
4331              case 'm': *optset |= PCRE_MULTILINE; break;              case 'm': *optset |= PCRE_MULTILINE; break;
4332              case 's': *optset |= PCRE_DOTALL; break;              case 's': *optset |= PCRE_DOTALL; break;
4333              case 'x': *optset |= PCRE_EXTENDED; break;              case 'x': *optset |= PCRE_EXTENDED; break;
4334              case 'U': *optset |= PCRE_UNGREEDY; break;              case 'U': *optset |= PCRE_UNGREEDY; break;
4335              case 'X': *optset |= PCRE_EXTRA; break;              case 'X': *optset |= PCRE_EXTRA; break;
4336    
4337                default:  *errorcodeptr = ERR12;
4338                          ptr--;    /* Correct the offset */
4339                          goto FAILED;
4340              }              }
4341            }            }
4342    
# Line 3204  for (;; ptr++) Line 4345  for (;; ptr++)
4345          newoptions = (options | set) & (~unset);          newoptions = (options | set) & (~unset);
4346    
4347          /* If the options ended with ')' this is not the start of a nested          /* If the options ended with ')' this is not the start of a nested
4348          group with option changes, so the options change at this level. Compile          group with option changes, so the options change at this level. If this
4349          code to change the ims options if this setting actually changes any of          item is right at the start of the pattern, the options can be
4350          them. We also pass the new setting back so that it can be put at the          abstracted and made external in the pre-compile phase, and ignored in
4351          start of any following branches, and when this group ends (if we are in          the compile phase. This can be helpful when matching -- for instance in
4352          a group), a resetting item can be compiled.          caseless checking of required bytes.
4353    
4354          Note that if this item is right at the start of the pattern, the          If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4355          options will have been abstracted and made global, so there will be no          definitely *not* at the start of the pattern because something has been
4356          change to compile. */          compiled. In the pre-compile phase, however, the code pointer can have
4357            that value after the start, because it gets reset as code is discarded
4358            during the pre-compile. However, this can happen only at top level - if
4359            we are within parentheses, the starting BRA will still be present. At
4360            any parenthesis level, the length value can be used to test if anything
4361            has been compiled at that level. Thus, a test for both these conditions
4362            is necessary to ensure we correctly detect the start of the pattern in
4363            both phases.
4364    
4365            If we are not at the pattern start, compile code to change the ims
4366            options if this setting actually changes any of them. We also pass the
4367            new setting back so that it can be put at the start of any following
4368            branches, and when this group ends (if we are in a group), a resetting
4369            item can be compiled. */
4370    
4371          if (*ptr == ')')          if (*ptr == ')')
4372            {            {
4373            if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))            if (code == cd->start_code + 1 + LINK_SIZE &&
4374                   (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4375              {              {
4376              *code++ = OP_OPT;              cd->external_options = newoptions;
4377              *code++ = newoptions & PCRE_IMS;              options = newoptions;
4378              }              }
4379             else
4380                {
4381                if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4382                  {
4383                  *code++ = OP_OPT;
4384                  *code++ = newoptions & PCRE_IMS;
4385                  }
4386    
4387            /* Change options at this level, and pass them back for use              /* Change options at this level, and pass them back for use
4388            in subsequent branches. Reset the greedy defaults and the case              in subsequent branches. Reset the greedy defaults and the case
4389            value for firstbyte and reqbyte. */              value for firstbyte and reqbyte. */
4390    
4391            *optionsptr = options = newoptions;              *optionsptr = options = newoptions;
4392            greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4393            greedy_non_default = greedy_default ^ 1;              greedy_non_default = greedy_default ^ 1;
4394            req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;              req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4395                }
4396    
4397            previous = NULL;       /* This item can't be repeated */            previous = NULL;       /* This item can't be repeated */
4398            continue;              /* It is complete */            continue;              /* It is complete */
# Line 3242  for (;; ptr++) Line 4405  for (;; ptr++)
4405    
4406          bravalue = OP_BRA;          bravalue = OP_BRA;
4407          ptr++;          ptr++;
4408          }          }     /* End of switch for character following (? */
4409        }        }       /* End of (? handling */
4410    
4411      /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become      /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4412      non-capturing and behave like (?:...) brackets */      all unadorned brackets become non-capturing and behave like (?:...)
4413        brackets. */
4414    
4415      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4416        {        {
4417        bravalue = OP_BRA;        bravalue = OP_BRA;
4418        }        }
4419    
4420      /* Else we have a referencing group; adjust the opcode. If the bracket      /* Else we have a capturing group. */
     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and  
     arrange for the true number to follow later, in an OP_BRANUMBER item. */  
4421    
4422      else      else
4423        {        {
4424        NUMBERED_GROUP:        NUMBERED_GROUP:
4425        if (++(*brackets) > EXTRACT_BASIC_MAX)        cd->bracount += 1;
4426          {        PUT2(code, 1+LINK_SIZE, cd->bracount);
4427          bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;        skipbytes = 2;
         code[1+LINK_SIZE] = OP_BRANUMBER;  
         PUT2(code, 2+LINK_SIZE, *brackets);  
         skipbytes = 3;  
         }  
       else bravalue = OP_BRA + *brackets;  
4428        }        }
4429    
4430      /* Process nested bracketed re. Assertions may not be repeated, but other      /* Process nested bracketed regex. Assertions may not be repeated, but
4431      kinds can be. We copy code into a non-register variable in order to be able      other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4432      to pass its address because some compilers complain otherwise. Pass in a      non-register variable in order to be able to pass its address because some
4433      new setting for the ims options if they have changed. */      compilers complain otherwise. Pass in a new setting for the ims options if
4434        they have changed. */
4435    
4436      previous = (bravalue >= OP_ONCE)? code : NULL;      previous = (bravalue >= OP_ONCE)? code : NULL;
4437      *code = bravalue;      *code = bravalue;
4438      tempcode = code;      tempcode = code;
4439      tempreqvary = cd->req_varyopt;     /* Save value before bracket */      tempreqvary = cd->req_varyopt;     /* Save value before bracket */
4440        length_prevgroup = 0;              /* Initialize for pre-compile phase */
4441    
4442      if (!compile_regex(      if (!compile_regex(
4443           newoptions,                   /* The complete new option state */           newoptions,                   /* The complete new option state */
4444           options & PCRE_IMS,           /* The previous ims option state */           options & PCRE_IMS,           /* The previous ims option state */
          brackets,                     /* Extracting bracket count */  
4445           &tempcode,                    /* Where to put code (updated) */           &tempcode,                    /* Where to put code (updated) */
4446           &ptr,                         /* Input pointer (updated) */           &ptr,                         /* Input pointer (updated) */
4447           errorcodeptr,                 /* Where to put an error message */           errorcodeptr,                 /* Where to put an error message */
4448           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
4449            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4450           skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */           reset_bracount,               /* True if (?| group */
4451             skipbytes,                    /* Skip over bracket number */
4452           &subfirstbyte,                /* For possible first char */           &subfirstbyte,                /* For possible first char */
4453           &subreqbyte,                  /* For possible last char */           &subreqbyte,                  /* For possible last char */
4454           bcptr,                        /* Current branch chain */           bcptr,                        /* Current branch chain */
4455           cd))                          /* Tables block */           cd,                           /* Tables block */
4456             (lengthptr == NULL)? NULL :   /* Actual compile phase */
4457               &length_prevgroup           /* Pre-compile phase */
4458             ))
4459        goto FAILED;        goto FAILED;
4460    
4461      /* At the end of compiling, code is still pointing to the start of the      /* At the end of compiling, code is still pointing to the start of the
# Line 3302  for (;; ptr++) Line 4464  for (;; ptr++)
4464      is on the bracket. */      is on the bracket. */
4465    
4466      /* If this is a conditional bracket, check that there are no more than      /* If this is a conditional bracket, check that there are no more than
4467      two branches in the group. */      two branches in the group, or just one if it's a DEFINE group. We do this
4468        in the real compile phase, not in the pre-pass, where the whole group may
4469        not be available. */
4470    
4471      else if (bravalue == OP_COND)      if (bravalue == OP_COND && lengthptr == NULL)
4472        {        {
4473        uschar *tc = code;        uschar *tc = code;
4474        int condcount = 0;        int condcount = 0;
# Line 3315  for (;; ptr++) Line 4479  for (;; ptr++)
4479           }           }
4480        while (*tc != OP_KET);        while (*tc != OP_KET);
4481    
4482        if (condcount > 2)        /* A DEFINE group is never obeyed inline (the "condition" is always
4483          false). It must have only one branch. */
4484    
4485          if (code[LINK_SIZE+1] == OP_DEF)
4486          {          {
4487          *errorcodeptr = ERR27;          if (condcount > 1)
4488          goto FAILED;            {
4489              *errorcodeptr = ERR54;
4490              goto FAILED;
4491              }
4492            bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
4493            }
4494    
4495          /* A "normal" conditional group. If there is just one branch, we must not
4496          make use of its firstbyte or reqbyte, because this is equivalent to an
4497          empty second branch. */
4498    
4499          else
4500            {
4501            if (condcount > 2)
4502              {
4503              *errorcodeptr = ERR27;
4504              goto FAILED;
4505              }
4506            if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4507          }          }
4508          }
4509    
4510        /* Error if hit end of pattern */
4511    
4512        /* If there is just one branch, we must not make use of its firstbyte or      if (*ptr != ')')
4513        reqbyte, because this is equivalent to an empty second branch. */        {
4514          *errorcodeptr = ERR14;
4515          goto FAILED;
4516          }
4517    
4518        if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;      /* In the pre-compile phase, update the length by the length of the nested
4519        group, less the brackets at either end. Then reduce the compiled code to
4520        just the brackets so that it doesn't use much memory if it is duplicated by
4521        a quantifier. */
4522    
4523        if (lengthptr != NULL)
4524          {
4525          *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4526          code++;
4527          PUTINC(code, 0, 1 + LINK_SIZE);
4528          *code++ = OP_KET;
4529          PUTINC(code, 0, 1 + LINK_SIZE);
4530        }        }
4531    
4532      /* Handle updating of the required and first characters. Update for normal      /* Otherwise update the main code pointer to the end of the group. */
4533      brackets of all kinds, and conditions with two branches (see code above).  
4534      If the bracket is followed by a quantifier with zero repeat, we have to      else code = tempcode;
4535      back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the  
4536      main loop so that they can be accessed for the back off. */      /* For a DEFINE group, required and first character settings are not
4537        relevant. */
4538    
4539        if (bravalue == OP_DEF) break;
4540    
4541        /* Handle updating of the required and first characters for other types of
4542        group. Update for normal brackets of all kinds, and conditions with two
4543        branches (see code above). If the bracket is followed by a quantifier with
4544        zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4545        zerofirstbyte outside the main loop so that they can be accessed for the
4546        back off. */
4547    
4548      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
4549      zerofirstbyte = firstbyte;      zerofirstbyte = firstbyte;
4550      groupsetfirstbyte = FALSE;      groupsetfirstbyte = FALSE;
4551    
4552      if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)      if (bravalue >= OP_ONCE)
4553        {        {
4554        /* If we have not yet set a firstbyte in this branch, take it from the        /* If we have not yet set a firstbyte in this branch, take it from the
4555        subpattern, remembering that it was set here so that a repeat of more        subpattern, remembering that it was set here so that a repeat of more
# Line 3378  for (;; ptr++) Line 4590  for (;; ptr++)
4590      firstbyte, looking for an asserted first char. */      firstbyte, looking for an asserted first char. */
4591    
4592      else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;      else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4593        break;     /* End of processing '(' */
4594    
     /* Now update the main code pointer to the end of the group. */  
   
     code = tempcode;  
4595    
4596      /* Error if hit end of pattern */      /* ===================================================================*/
4597        /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
     if (*ptr != ')')  
       {  
       *errorcodeptr = ERR14;  
       goto FAILED;  
       }  
     break;  
   
     /* Check \ for being a real metacharacter; if not, fall through and handle  
     it as a data character at the start of a string. Escape items are checked  
     for validity in the pre-compiling pass. */  
   
     case '\\':  
     tempptr = ptr;  
     c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);  
   
     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values  
4598      are arranged to be the negation of the corresponding OP_values. For the      are arranged to be the negation of the corresponding OP_values. For the
4599      back references, the values are ESC_REF plus the reference number. Only      back references, the values are ESC_REF plus the reference number. Only
4600      back references and those types that consume a character may be repeated.      back references and those types that consume a character may be repeated.
4601      We can test for values between ESC_b and ESC_Z for the latter; this may      We can test for values between ESC_b and ESC_Z for the latter; this may
4602      have to change if any new ones are ever created. */      have to change if any new ones are ever created. */
4603    
4604        case '\\':
4605        tempptr = ptr;
4606        c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4607        if (*errorcodeptr != 0) goto FAILED;
4608    
4609      if (c < 0)      if (c < 0)
4610        {        {
4611        if (-c == ESC_Q)            /* Handle start of quoted string */        if (-c == ESC_Q)            /* Handle start of quoted string */
# Line 3416  for (;; ptr++) Line 4615  for (;; ptr++)
4615          continue;          continue;
4616          }          }
4617    
4618          if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
4619    
4620        /* For metasequences that actually match a character, we disable the        /* For metasequences that actually match a character, we disable the
4621        setting of a first character if it hasn't already been set. */        setting of a first character if it hasn't already been set. */
4622    
# Line 3427  for (;; ptr++) Line 4628  for (;; ptr++)
4628        zerofirstbyte = firstbyte;        zerofirstbyte = firstbyte;
4629        zeroreqbyte = reqbyte;        zeroreqbyte = reqbyte;
4630    
4631        /* Back references are handled specially */        /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4632          We also support \k{name} (.NET syntax) */
4633    
4634          if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
4635            {
4636            is_recurse = FALSE;
4637            terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
4638            goto NAMED_REF_OR_RECURSE;
4639            }
4640    
4641          /* Back references are handled specially; must disable firstbyte if
4642          not set to cope with cases like (?=(\w+))\1: which would otherwise set
4643          ':' later. */
4644    
4645        if (-c >= ESC_REF)        if (-c >= ESC_REF)
4646          {          {
4647          int number = -c - ESC_REF;          recno = -c - ESC_REF;
4648    
4649            HANDLE_REFERENCE:    /* Come here from named backref handling */
4650            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4651          previous = code;          previous = code;
4652          *code++ = OP_REF;          *code++ = OP_REF;
4653          PUT2INC(code, 0, number);          PUT2INC(code, 0, recno);
4654            cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4655            if (recno > cd->top_backref) cd->top_backref = recno;
4656          }          }
4657    
4658        /* So are Unicode property matches, if supported. We know that get_ucp        /* So are Unicode property matches, if supported. */
       won't fail because it was tested in the pre-pass. */  
4659    
4660  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4661        else if (-c == ESC_P || -c == ESC_p)        else if (-c == ESC_P || -c == ESC_p)
# Line 3446  for (;; ptr++) Line 4663  for (;; ptr++)
4663          BOOL negated;          BOOL negated;
4664          int pdata;          int pdata;
4665          int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);          int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4666            if (ptype < 0) goto FAILED;
4667          previous = code;          previous = code;
4668          *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;          *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4669          *code++ = ptype;          *code++ = ptype;
4670          *code++ = pdata;          *code++ = pdata;
4671          }          }
4672    #else
4673    
4674          /* If Unicode properties are not supported, \X, \P, and \p are not
4675          allowed. */
4676    
4677          else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4678            {
4679            *errorcodeptr = ERR45;
4680            goto FAILED;
4681            }
4682  #endif  #endif
4683    
4684        /* For the rest, we can obtain the OP value by negating the escape        /* For the rest (including \X when Unicode properties are supported), we
4685        value */        can obtain the OP value by negating the escape value. */
4686    
4687        else        else
4688          {          {
# Line 3478  for (;; ptr++) Line 4706  for (;; ptr++)
4706       mcbuffer[0] = c;       mcbuffer[0] = c;
4707       mclength = 1;       mclength = 1;
4708       }       }
   
4709      goto ONE_CHAR;      goto ONE_CHAR;
4710    
4711    
4712        /* ===================================================================*/
4713      /* Handle a literal character. It is guaranteed not to be whitespace or #      /* Handle a literal character. It is guaranteed not to be whitespace or #
4714      when the extended flag is set. If we are in UTF-8 mode, it may be a      when the extended flag is set. If we are in UTF-8 mode, it may be a
4715      multi-byte literal character. */      multi-byte literal character. */
# Line 3491  for (;; ptr++) Line 4720  for (;; ptr++)
4720      mcbuffer[0] = c;      mcbuffer[0] = c;
4721    
4722  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
4723      if (utf8 && (c & 0xc0) == 0xc0)      if (utf8 && c >= 0xc0)
4724        {        {
4725        while ((ptr[1] & 0xc0) == 0x80)        while ((ptr[1] & 0xc0) == 0x80)
4726          mcbuffer[mclength++] = *(++ptr);          mcbuffer[mclength++] = *(++ptr);
# Line 3542  for (;; ptr++) Line 4771  for (;; ptr++)
4771      }      }
4772    }                   /* end of big loop */    }                   /* end of big loop */
4773    
4774    
4775  /* Control never reaches here by falling through, only by a goto for all the  /* Control never reaches here by falling through, only by a goto for all the
4776  error states. Pass back the position in the pattern so that it can be displayed  error states. Pass back the position in the pattern so that it can be displayed
4777  to the user for diagnosing the error. */  to the user for diagnosing the error. */
# Line 3558  return FALSE; Line 4788  return FALSE;
4788  *     Compile sequence of alternatives           *  *     Compile sequence of alternatives           *
4789  *************************************************/  *************************************************/
4790    
4791  /* On entry, ptr is pointing past the bracket character, but on return  /* On entry, ptr is pointing past the bracket character, but on return it
4792  it points to the closing bracket, or vertical bar, or end of string.  points to the closing bracket, or vertical bar, or end of string. The code
4793  The code variable is pointing at the byte into which the BRA operator has been  variable is pointing at the byte into which the BRA operator has been stored.
4794  stored. If the ims options are changed at the start (for a (?ims: group) or  If the ims options are changed at the start (for a (?ims: group) or during any
4795  during any branch, we need to insert an OP_OPT item at the start of every  branch, we need to insert an OP_OPT item at the start of every following branch
4796  following branch to ensure they get set correctly at run time, and also pass  to ensure they get set correctly at run time, and also pass the new options
4797  the new options into every subsequent branch compile.  into every subsequent branch compile.
4798    
4799    This function is used during the pre-compile phase when we are trying to find
4800    out the amount of memory needed, as well as during the real compile phase. The
4801    value of lengthptr distinguishes the two phases.
4802    
4803  Argument:  Arguments:
4804    options        option bits, including any changes for this subpattern    options        option bits, including any changes for this subpattern
4805    oldims         previous settings of ims option bits    oldims         previous settings of ims option bits
   brackets       -> int containing the number of extracting brackets used  
4806    codeptr        -> the address of the current code pointer    codeptr        -> the address of the current code pointer
4807    ptrptr         -> the address of the current pattern pointer    ptrptr         -> the address of the current pattern pointer
4808    errorcodeptr   -> pointer to error code variable    errorcodeptr   -> pointer to error code variable
4809    lookbehind     TRUE if this is a lookbehind assertion    lookbehind     TRUE if this is a lookbehind assertion
4810    skipbytes      skip this many bytes at start (for OP_COND, OP_BRANUMBER)    reset_bracount TRUE to reset the count for each branch
4811      skipbytes      skip this many bytes at start (for brackets and OP_COND)
4812    firstbyteptr   place to put the first required character, or a negative number    firstbyteptr   place to put the first required character, or a negative number
4813    reqbyteptr     place to put the last required character, or a negative number    reqbyteptr     place to put the last required character, or a negative number
4814    bcptr          pointer to the chain of currently open branches    bcptr          pointer to the chain of currently open branches
4815    cd             points to the data block with tables pointers etc.    cd             points to the data block with tables pointers etc.
4816      lengthptr      NULL during the real compile phase
4817                     points to length accumulator during pre-compile phase
4818    
4819  Returns:      TRUE on success  Returns:         TRUE on success
4820  */  */
4821    
4822  static BOOL  static BOOL
4823  compile_regex(int options, int oldims, int *brackets, uschar **codeptr,  compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
4824    const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,    int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
4825    int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)    int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
4826      int *lengthptr)
4827  {  {
4828  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
4829  uschar *code = *codeptr;  uschar *code = *codeptr;
# Line 3595  uschar *start_bracket = code; Line 4832  uschar *start_bracket = code;
4832  uschar *reverse_count = NULL;  uschar *reverse_count = NULL;
4833  int firstbyte, reqbyte;  int firstbyte, reqbyte;
4834  int branchfirstbyte, branchreqbyte;  int branchfirstbyte, branchreqbyte;
4835    int length;
4836    int orig_bracount;
4837    int max_bracount;
4838  branch_chain bc;  branch_chain bc;
4839    
4840  bc.outer = bcptr;  bc.outer = bcptr;
# Line 3602  bc.current = code; Line 4842  bc.current = code;
4842    
4843  firstbyte = reqbyte = REQ_UNSET;  firstbyte = reqbyte = REQ_UNSET;
4844    
4845    /* Accumulate the length for use in the pre-compile phase. Start with the
4846    length of the BRA and KET and any extra bytes that are required at the
4847    beginning. We accumulate in a local variable to save frequent testing of
4848    lengthptr for NULL. We cannot do this by looking at the value of code at the
4849    start and end of each alternative, because compiled items are discarded during
4850    the pre-compile phase so that the work space is not exceeded. */
4851    
4852    length = 2 + 2*LINK_SIZE + skipbytes;
4853    
4854    /* WARNING: If the above line is changed for any reason, you must also change
4855    the code that abstracts option settings at the start of the pattern and makes
4856    them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
4857    pre-compile phase to find out whether anything has yet been compiled or not. */
4858    
4859  /* Offset is set zero to mark that this bracket is still open */  /* Offset is set zero to mark that this bracket is still open */
4860    
4861  PUT(code, 1, 0);  PUT(code, 1, 0);
# Line 3609  code += 1 + LINK_SIZE + skipbytes; Line 4863  code += 1 + LINK_SIZE + skipbytes;
4863    
4864  /* Loop for each alternative branch */  /* Loop for each alternative branch */
4865    
4866    orig_bracount = max_bracount = cd->bracount;
4867  for (;;)  for (;;)
4868    {    {
4869      /* For a (?| group, reset the capturing bracket count so that each branch
4870      uses the same numbers. */
4871    
4872      if (reset_bracount) cd->bracount = orig_bracount;
4873    
4874    /* Handle a change of ims options at the start of the branch */    /* Handle a change of ims options at the start of the branch */
4875    
4876    if ((options & PCRE_IMS) != oldims)    if ((options & PCRE_IMS) != oldims)
4877      {      {
4878      *code++ = OP_OPT;      *code++ = OP_OPT;
4879      *code++ = options & PCRE_IMS;      *code++ = options & PCRE_IMS;
4880        length += 2;
4881      }      }
4882    
4883    /* Set up dummy OP_REVERSE if lookbehind assertion */    /* Set up dummy OP_REVERSE if lookbehind assertion */
# Line 3626  for (;;) Line 4887  for (;;)
4887      *code++ = OP_REVERSE;      *code++ = OP_REVERSE;
4888      reverse_count = code;      reverse_count = code;
4889      PUTINC(code, 0, 0);      PUTINC(code, 0, 0);
4890        length += 1 + LINK_SIZE;
4891      }      }
4892    
4893    /* Now compile the branch */    /* Now compile the branch; in the pre-compile phase its length gets added
4894      into the length. */
4895    
4896    if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr,    if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
4897          &branchfirstbyte, &branchreqbyte, &bc, cd))          &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
4898      {      {
4899      *ptrptr = ptr;      *ptrptr = ptr;
4900      return FALSE;      return FALSE;
4901      }      }
4902    
4903      /* Keep the highest bracket count in case (?| was used and some branch
4904      has fewer than the rest. */
4905    
4906      if (cd->bracount > max_bracount) max_bracount = cd->bracount;
4907    
4908    /* If this is the first branch, the firstbyte and reqbyte values for the    /* In the real compile phase, there is some post-processing to be done. */
   branch become the values for the regex. */  
4909    
4910    if (*last_branch != OP_ALT)    if (lengthptr == NULL)
4911      {      {
4912      firstbyte = branchfirstbyte;      /* If this is the first branch, the firstbyte and reqbyte values for the
4913      reqbyte = branchreqbyte;      branch become the values for the regex. */
     }  
4914    
4915    /* If this is not the first branch, the first char and reqbyte have to      if (*last_branch != OP_ALT)
4916    match the values from all the previous branches, except that if the previous        {
4917    value for reqbyte didn't have REQ_VARY set, it can still match, and we set        firstbyte = branchfirstbyte;
4918    REQ_VARY for the regex. */        reqbyte = branchreqbyte;
4919          }
4920    
4921    else      /* If this is not the first branch, the first char and reqbyte have to
4922      {      match the values from all the previous branches, except that if the
4923      /* If we previously had a firstbyte, but it doesn't match the new branch,      previous value for reqbyte didn't have REQ_VARY set, it can still match,
4924      we have to abandon the firstbyte for the regex, but if there was previously      and we set REQ_VARY for the regex. */
     no reqbyte, it takes on the value of the old firstbyte. */  
4925    
4926      if (firstbyte >= 0 && firstbyte != branchfirstbyte)      else
4927        {        {
4928        if (reqbyte < 0) reqbyte = firstbyte;        /* If we previously had a firstbyte, but it doesn't match the new branch,
4929        firstbyte = REQ_NONE;        we have to abandon the firstbyte for the regex, but if there was
4930        }        previously no reqbyte, it takes on the value of the old firstbyte. */
4931    
4932          if (firstbyte >= 0 && firstbyte != branchfirstbyte)
4933            {
4934            if (reqbyte < 0) reqbyte = firstbyte;
4935            firstbyte = REQ_NONE;
4936            }
4937    
4938      /* If we (now or from before) have no firstbyte, a firstbyte from the        /* If we (now or from before) have no firstbyte, a firstbyte from the
4939      branch becomes a reqbyte if there isn't a branch reqbyte. */        branch becomes a reqbyte if there isn't a branch reqbyte. */
4940    
4941      if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)        if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
4942          branchreqbyte = branchfirstbyte;            branchreqbyte = branchfirstbyte;
4943    
4944      /* Now ensure that the reqbytes match */        /* Now ensure that the reqbytes match */
4945    
4946      if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))        if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
4947        reqbyte = REQ_NONE;          reqbyte = REQ_NONE;
4948      else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */        else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
4949      }        }
4950    
4951    /* If lookbehind, check that this branch matches a fixed-length string,      /* If lookbehind, check that this branch matches a fixed-length string, and
4952    and put the length into the OP_REVERSE item. Temporarily mark the end of      put the length into the OP_REVERSE item. Temporarily mark the end of the
4953    the branch with OP_END. */      branch with OP_END. */
4954    
4955    if (lookbehind)      if (lookbehind)
     {  
     int length;  
     *code = OP_END;  
     length = find_fixedlength(last_branch, options);  
     DPRINTF(("fixed length = %d\n", length));  
     if (length < 0)  
4956        {        {
4957        *errorcodeptr = (length == -2)? ERR36 : ERR25;        int fixed_length;
4958        *ptrptr = ptr;        *code = OP_END;
4959        return FALSE;        fixed_length = find_fixedlength(last_branch, options);
4960          DPRINTF(("fixed length = %d\n", fixed_length));
4961          if (fixed_length < 0)
4962            {
4963            *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
4964            *ptrptr = ptr;
4965            return FALSE;
4966            }
4967          PUT(reverse_count, 0, fixed_length);
4968        }        }
     PUT(reverse_count, 0, length);  
4969      }      }
4970    
4971    /* Reached end of expression, either ')' or end of pattern. Go back through    /* Reached end of expression, either ')' or end of pattern. In the real
4972    the alternative branches and reverse the chain of offsets, with the field in    compile phase, go back through the alternative branches and reverse the chain
4973    the BRA item now becoming an offset to the first alternative. If there are    of offsets, with the field in the BRA item now becoming an offset to the
4974    no alternatives, it points to the end of the group. The length in the    first alternative. If there are no alternatives, it points to the end of the
4975    terminating ket is always the length of the whole bracketed item. If any of    group. The length in the terminating ket is always the length of the whole
4976    the ims options were changed inside the group, compile a resetting op-code    bracketed item. If any of the ims options were changed inside the group,
4977    following, except at the very end of the pattern. Return leaving the pointer    compile a resetting op-code following, except at the very end of the pattern.
4978    at the terminating char. */    Return leaving the pointer at the terminating char. */
4979    
4980    if (*ptr != '|')    if (*ptr != '|')
4981      {      {
4982      int length = code - last_branch;      if (lengthptr == NULL)
     do  
4983        {        {
4984        int prev_length = GET(last_branch, 1);        int branch_length = code - last_branch;
4985        PUT(last_branch, 1, length);        do
4986        length = prev_length;          {
4987        last_branch -= length;          int prev_length = GET(last_branch, 1);
4988            PUT(last_branch, 1, branch_length);
4989            branch_length = prev_length;
4990            last_branch -= branch_length;
4991            }
4992          while (branch_length > 0);
4993        }        }
     while (length > 0);  
4994    
4995      /* Fill in the ket */      /* Fill in the ket */
4996    
# Line 3728  for (;;) Line 5004  for (;;)
5004        {        {
5005        *code++ = OP_OPT;        *code++ = OP_OPT;
5006        *code++ = oldims;        *code++ = oldims;
5007          length += 2;
5008        }        }
5009    
5010        /* Retain the highest bracket number, in case resetting was used. */
5011    
5012        cd->bracount = max_bracount;
5013    
5014      /* Set values to pass back */      /* Set values to pass back */
5015    
# Line 3736  for (;;) Line 5017  for (;;)
5017      *ptrptr = ptr;      *ptrptr = ptr;
5018      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
5019      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
5020        if (lengthptr != NULL) *lengthptr += length;
5021      return TRUE;      return TRUE;
5022      }      }
5023    
5024    /* Another branch follows; insert an "or" node. Its length field points back    /* Another branch follows. In the pre-compile phase, we can move the code
5025      pointer back to where it was for the start of the first branch. (That is,
5026      pretend that each branch is the only one.)
5027    
5028      In the real compile phase, insert an ALT node. Its length field points back
5029    to the previous branch while the bracket remains open. At the end the chain    to the previous branch while the bracket remains open. At the end the chain
5030    is reversed. It's done like this so that the start of the bracket has a    is reversed. It's done like this so that the start of the bracket has a
5031    zero offset until it is closed, making it possible to detect recursion. */    zero offset until it is closed, making it possible to detect recursion. */
5032    
5033    *code = OP_ALT;    if (lengthptr != NULL)
5034    PUT(code, 1, code - last_branch);      {
5035    bc.current = last_branch = code;      code = *codeptr + 1 + LINK_SIZE + skipbytes;
5036    code += 1 + LINK_SIZE;      length += 1 + LINK_SIZE;
5037        }
5038      else
5039        {
5040        *code = OP_ALT;
5041        PUT(code, 1, code - last_branch);
5042        bc.current = last_branch = code;
5043        code += 1 + LINK_SIZE;
5044        }
5045    
5046    ptr++;    ptr++;
5047    }    }
5048  /* Control never reaches here */  /* Control never reaches here */
# Line 3799  is_anchored(register const uschar *code, Line 5094  is_anchored(register const uschar *code,
5094    unsigned int backref_map)    unsigned int backref_map)
5095  {  {
5096  do {  do {
5097     const uschar *scode =     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5098       first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);       options, PCRE_MULTILINE, FALSE);
5099     register int op = *scode;     register int op = *scode;
5100    
5101       /* Non-capturing brackets */
5102    
5103       if (op == OP_BRA)
5104         {
5105         if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5106         }
5107    
5108     /* Capturing brackets */     /* Capturing brackets */
5109    
5110     if (op > OP_BRA)     else if (op == OP_CBRA)
5111       {       {
5112       int new_map;       int n = GET2(scode, 1+LINK_SIZE);
5113       op -= OP_BRA;       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
      if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);  
      new_map = bracket_map | ((op < 32)? (1 << op) : 1);  
5114       if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;       if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5115       }       }
5116    
5117     /* Other brackets */     /* Other brackets */
5118    
5119     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5120       {       {
5121       if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;       if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5122       }       }
# Line 3824  do { Line 5124  do {
5124     /* .* is not anchored unless DOTALL is set and it isn't in brackets that     /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5125     are or may be referenced. */     are or may be referenced. */
5126    
5127     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5128                 op == OP_TYPEPOSSTAR) &&
5129              (*options & PCRE_DOTALL) != 0)              (*options & PCRE_DOTALL) != 0)
5130       {       {
5131       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
# Line 3869  is_startline(const uschar *code, unsigne Line 5170  is_startline(const uschar *code, unsigne
5170    unsigned int backref_map)    unsigned int backref_map)
5171  {  {
5172  do {  do {
5173     const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5174       FALSE);       NULL, 0, FALSE);
5175     register int op = *scode;     register int op = *scode;
5176    
5177       /* Non-capturing brackets */
5178    
5179       if (op == OP_BRA)
5180         {
5181         if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5182         }
5183    
5184     /* Capturing brackets */     /* Capturing brackets */
5185    
5186     if (op > OP_BRA)     else if (op == OP_CBRA)
5187       {       {
5188       int new_map;       int n = GET2(scode, 1+LINK_SIZE);
5189       op -= OP_BRA;       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
      if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);  
      new_map = bracket_map | ((op < 32)? (1 << op) : 1);  
5190       if (!is_startline(scode, new_map, backref_map)) return FALSE;       if (!is_startline(scode, new_map, backref_map)) return FALSE;
5191       }       }
5192    
5193     /* Other brackets */     /* Other brackets */
5194    
5195     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5196       { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }       { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5197    
5198     /* .* means "start at start or after \n" if it isn't in brackets that     /* .* means "start at start or after \n" if it isn't in brackets that
5199     may be referenced. */     may be referenced. */
5200    
5201     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5202       {       {
5203       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5204       }       }
# Line 3941  do { Line 5247  do {
5247       first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);       first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5248     register int op = *scode;     register int op = *scode;
5249    
    if (op >= OP_BRA) op = OP_BRA;  
   
5250     switch(op)     switch(op)
5251       {       {
5252       default:       default:
5253       return -1;       return -1;
5254    
5255       case OP_BRA:       case OP_BRA:
5256         case OP_CBRA:
5257       case OP_ASSERT:       case OP_ASSERT:
5258       case OP_ONCE:       case OP_ONCE:
5259       case OP_COND:       case OP_COND:
# Line 3964  do { Line 5269  do {
5269       case OP_CHARNC:       case OP_CHARNC:
5270       case OP_PLUS:       case OP_PLUS:
5271       case OP_MINPLUS:       case OP_MINPLUS:
5272         case OP_POSPLUS:
5273       if (!inassert) return -1;       if (!inassert) return -1;
5274       if (c < 0)       if (c < 0)
5275         {         {
# Line 4004  Returns:        pointer to compiled data Line 5310  Returns:        pointer to compiled data
5310                  with errorptr and erroroffset set                  with errorptr and erroroffset set
5311  */  */
5312    
5313  PCRE_DATA_SCOPE pcre *  PCRE_EXP_DEFN pcre *
5314  pcre_compile(const char *pattern, int options, const char **errorptr,  pcre_compile(const char *pattern, int options, const char **errorptr,
5315    int *erroroffset, const unsigned char *tables)    int *erroroffset, const unsigned char *tables)
5316  {  {
# Line 4012  return pcre_compile2(pattern, options, N Line 5318  return pcre_compile2(pattern, options, N
5318  }  }
5319    
5320    
5321    PCRE_EXP_DEFN pcre *
 PCRE_DATA_SCOPE pcre *  
5322  pcre_compile2(const char *pattern, int options, int *errorcodeptr,  pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5323    const char **errorptr, int *erroroffset, const unsigned char *tables)    const char **errorptr, int *erroroffset, const unsigned char *tables)
5324  {  {
5325  real_pcre *re;  real_pcre *re;
5326  int length = 1 + LINK_SIZE;      /* For initial BRA plus length */  int length = 1;  /* For final END opcode */
5327  int c, firstbyte, reqbyte, newline;  int firstbyte, reqbyte, newline;
 int bracount = 0;  
 int branch_extra = 0;  
 int branch_newextra;  
 int item_count = -1;  
 int name_count = 0;  
 int max_name_size = 0;  
 int lastitemlength = 0;  
5328  int errorcode = 0;  int errorcode = 0;
5329  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
5330  BOOL utf8;  BOOL utf8;
 BOOL class_utf8;  
5331  #endif  #endif
 BOOL inescq = FALSE;  
 BOOL capturing;  
 unsigned int brastackptr = 0;  
5332  size_t size;  size_t size;
5333  uschar *code;  uschar *code;
5334  const uschar *codestart;  const uschar *codestart;
5335  const uschar *ptr;  const uschar *ptr;
5336  compile_data compile_block;  compile_data compile_block;
5337  compile_data *cd = &compile_block;  compile_data *cd = &compile_block;
5338  int brastack[BRASTACK_SIZE];  
5339  uschar bralenstack[BRASTACK_SIZE];  /* This space is used for "compiling" into during the first phase, when we are
5340    computing the amount of memory that is needed. Compiled items are thrown away
5341    as soon as possible, so that a fairly large buffer should be sufficient for
5342    this purpose. The same space is used in the second phase for remembering where
5343    to fill in forward references to subpatterns. */
5344    
5345    uschar cworkspace[COMPILE_WORK_SIZE];
5346    
5347    
5348    /* Set this early so that early errors get offset 0. */
5349    
5350    ptr = (const uschar *)pattern;
5351