/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 91 by nigel, Sat Feb 24 21:41:34 2007 UTC | revision 171 by ph10, Mon Jun 4 14:28:58 2007 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2006 University of Cambridge             Copyright (c) 1997-2007 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45  #define NLBLOCK cd            /* The block containing newline information */  #define NLBLOCK cd             /* Block containing newline information */
46    #define PSSTART start_pattern  /* Field containing processed string start */
47    #define PSEND   end_pattern    /* Field containing processed string end */
48    
49    
50  #include "pcre_internal.h"  #include "pcre_internal.h"
51    
52    
# Line 54  used by pcretest. DEBUG is not defined w Line 58  used by pcretest. DEBUG is not defined w
58  #endif  #endif
59    
60    
   
61  /*************************************************  /*************************************************
62  *      Code parameters and static tables         *  *      Code parameters and static tables         *
63  *************************************************/  *************************************************/
64    
65  /* Maximum number of items on the nested bracket stacks at compile time. This  /* This value specifies the size of stack workspace that is used during the
66  applies to the nesting of all kinds of parentheses. It does not limit  first pre-compile phase that determines how much memory is required. The regex
67  un-nested, non-capturing parentheses. This number can be made bigger if  is partly compiled into this space, but the compiled parts are discarded as
68  necessary - it is used to dimension one int and one unsigned char vector at  soon as they can be, so that hopefully there will never be an overrun. The code
69  compile time. */  does, however, check for an overrun. The largest amount I've seen used is 218,
70    so this number is very generous.
71    
72    The same workspace is used during the second, actual compile phase for
73    remembering forward references to groups so that they can be filled in at the
74    end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
75    is 4 there is plenty of room. */
76    
77  #define BRASTACK_SIZE 200  #define COMPILE_WORK_SIZE (4096)
78    
79    
80  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 73  are simple data values; negative values Line 82  are simple data values; negative values
82  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
83  is invalid. */  is invalid. */
84    
85  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
86  static const short int escapes[] = {  static const short int escapes[] = {
87       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
88       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
89     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
90       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */       0,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
91  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
92  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
93     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
94       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */       0,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
95  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */
96       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
97  };  };
98    
99  #else         /* This is the "abnormal" table for EBCDIC systems */  #else           /* This is the "abnormal" table for EBCDIC systems */
100  static const short int escapes[] = {  static const short int escapes[] = {
101  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
102  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 98  static const short int escapes[] = { Line 107  static const short int escapes[] = {
107  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
108  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
109  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,
110  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
111  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
112  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,
113  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
# Line 107  static const short int escapes[] = { Line 116  static const short int escapes[] = {
116  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
117  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,
118  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,
119  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
120  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,
121  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
122  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
# Line 156  static const int posix_class_maps[] = { Line 165  static const int posix_class_maps[] = {
165  };  };
166    
167    
168    #define STRING(a)  # a
169    #define XSTRING(s) STRING(s)
170    
171  /* The texts of compile-time error messages. These are "char *" because they  /* The texts of compile-time error messages. These are "char *" because they
172  are passed to the outside world. */  are passed to the outside world. Do not ever re-use any error number, because
173    they are documented. Always add a new error instead. Messages marked DEAD below
174    are no longer used. */
175    
176  static const char *error_texts[] = {  static const char *error_texts[] = {
177    "no error",    "no error",
# Line 172  static const char *error_texts[] = { Line 186  static const char *error_texts[] = {
186    "range out of order in character class",    "range out of order in character class",
187    "nothing to repeat",    "nothing to repeat",
188    /* 10 */    /* 10 */
189    "operand of unlimited repeat could match the empty string",    "operand of unlimited repeat could match the empty string",  /** DEAD **/
190    "internal error: unexpected repeat",    "internal error: unexpected repeat",
191    "unrecognized character after (?",    "unrecognized character after (?",
192    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class",
# Line 182  static const char *error_texts[] = { Line 196  static const char *error_texts[] = {
196    "erroffset passed as NULL",    "erroffset passed as NULL",
197    "unknown option bit(s) set",    "unknown option bit(s) set",
198    "missing ) after comment",    "missing ) after comment",
199    "parentheses nested too deeply",    "parentheses nested too deeply",  /** DEAD **/
200    /* 20 */    /* 20 */
201    "regular expression too large",    "regular expression too large",
202    "failed to get memory",    "failed to get memory",
# Line 194  static const char *error_texts[] = { Line 208  static const char *error_texts[] = {
208    "malformed number or name after (?(",    "malformed number or name after (?(",
209    "conditional group contains more than two branches",    "conditional group contains more than two branches",
210    "assertion expected after (?(",    "assertion expected after (?(",
211    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )",
212    /* 30 */    /* 30 */
213    "unknown POSIX class name",    "unknown POSIX class name",
214    "POSIX collating elements are not supported",    "POSIX collating elements are not supported",
215    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support",
216    "spare error",    "spare error",  /** DEAD **/
217    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large",
218    /* 35 */    /* 35 */
219    "invalid condition (?(0)",    "invalid condition (?(0)",
# Line 210  static const char *error_texts[] = { Line 224  static const char *error_texts[] = {
224    /* 40 */    /* 40 */
225    "recursive call could loop indefinitely",    "recursive call could loop indefinitely",
226    "unrecognized character after (?P",    "unrecognized character after (?P",
227    "syntax error after (?P",    "syntax error in subpattern name (missing terminator)",
228    "two named subpatterns have the same name",    "two named subpatterns have the same name",
229    "invalid UTF-8 string",    "invalid UTF-8 string",
230    /* 45 */    /* 45 */
231    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled",
232    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence",
233    "unknown property name after \\P or \\p",    "unknown property name after \\P or \\p",
234    "subpattern name is too long (maximum 32 characters)",    "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
235    "too many named subpatterns (maximum 10,000)",    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
236    /* 50 */    /* 50 */
237    "repeated subpattern is too long",    "repeated subpattern is too long",
238    "octal value is greater than \\377 (not in UTF-8 mode)"    "octal value is greater than \\377 (not in UTF-8 mode)",
239      "internal error: overran compiling workspace",
240      "internal error: previously-checked referenced subpattern not found",
241      "DEFINE group contains more than one branch",
242      /* 55 */
243      "repeating a DEFINE group is not allowed",
244      "inconsistent NEWLINE options",
245      "\\g is not followed by a braced name or an optionally braced non-zero number",
246      "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
247  };  };
248    
249    
# Line 241  For convenience, we use the same bit def Line 263  For convenience, we use the same bit def
263    
264  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
265    
266  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
267  static const unsigned char digitab[] =  static const unsigned char digitab[] =
268    {    {
269    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 277  static const unsigned char digitab[] = Line 299  static const unsigned char digitab[] =
299    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
300    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
301    
302  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else           /* This is the "abnormal" case, for EBCDIC systems */
303  static const unsigned char digitab[] =  static const unsigned char digitab[] =
304    {    {
305    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 291  static const unsigned char digitab[] = Line 313  static const unsigned char digitab[] =
313    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
314    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
315    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
316    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
317    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
318    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
319    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 325  static const unsigned char ebcdic_charta Line 347  static const unsigned char ebcdic_charta
347    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
348    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
349    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
350    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
351    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
352    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
353    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 352  static const unsigned char ebcdic_charta Line 374  static const unsigned char ebcdic_charta
374  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
375    
376  static BOOL  static BOOL
377    compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *,
378      int *, int *, branch_chain *, compile_data *);      int *, branch_chain *, compile_data *, int *);
379    
380    
381    
# Line 363  static BOOL Line 385  static BOOL
385    
386  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
387  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
388  encodes one of the more complicated things such as \d. When UTF-8 is enabled,  encodes one of the more complicated things such as \d. A backreference to group
389  a positive value greater than 255 may be returned. On entry, ptr is pointing at  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
390  the \. On exit, it is on the final character of the escape sequence.  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
391    ptr is pointing at the \. On exit, it is on the final character of the escape
392    sequence.
393    
394  Arguments:  Arguments:
395    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
# Line 398  if (c == 0) *errorcodeptr = ERR1; Line 422  if (c == 0) *errorcodeptr = ERR1;
422  a table. A non-zero result is something that can be returned immediately.  a table. A non-zero result is something that can be returned immediately.
423  Otherwise further processing may be required. */  Otherwise further processing may be required. */
424    
425  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
426  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < '0' || c > 'z') {}                           /* Not alphameric */
427  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - '0']) != 0) c = i;
428    
429  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
430  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
431  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
432  #endif  #endif
# Line 412  else if ((i = escapes[c - 0x48]) != 0) Line 436  else if ((i = escapes[c - 0x48]) != 0)
436  else  else
437    {    {
438    const uschar *oldptr;    const uschar *oldptr;
439      BOOL braced, negated;
440    
441    switch (c)    switch (c)
442      {      {
443      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
# Line 425  else Line 451  else
451      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
452      break;      break;
453    
454        /* \g must be followed by a number, either plain or braced. If positive, it
455        is an absolute backreference. If negative, it is a relative backreference.
456        This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
457        reference to a named group. This is part of Perl's movement towards a
458        unified syntax for back references. As this is synonymous with \k{name}, we
459        fudge it up by pretending it really was \k. */
460    
461        case 'g':
462        if (ptr[1] == '{')
463          {
464          const uschar *p;
465          for (p = ptr+2; *p != 0 && *p != '}'; p++)
466            if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
467          if (*p != 0 && *p != '}')
468            {
469            c = -ESC_k;
470            break;
471            }
472          braced = TRUE;
473          ptr++;
474          }
475        else braced = FALSE;
476    
477        if (ptr[1] == '-')
478          {
479          negated = TRUE;
480          ptr++;
481          }
482        else negated = FALSE;
483    
484        c = 0;
485        while ((digitab[ptr[1]] & ctype_digit) != 0)
486          c = c * 10 + *(++ptr) - '0';
487    
488        if (c == 0 || (braced && *(++ptr) != '}'))
489          {
490          *errorcodeptr = ERR57;
491          return 0;
492          }
493    
494        if (negated)
495          {
496          if (c > bracount)
497            {
498            *errorcodeptr = ERR15;
499            return 0;
500            }
501          c = bracount - (c - 1);
502          }
503    
504        c = -(ESC_REF + c);
505        break;
506    
507      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
508      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
509      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 495  else Line 574  else
574          if (c == 0 && cc == '0') continue;     /* Leading zeroes */          if (c == 0 && cc == '0') continue;     /* Leading zeroes */
575          count++;          count++;
576    
577  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
578          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= 'a') cc -= 32;               /* Convert to upper case */
579          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
580  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
581          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
582          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
583  #endif  #endif
# Line 522  else Line 601  else
601        {        {
602        int cc;                               /* Some compilers don't like ++ */        int cc;                               /* Some compilers don't like ++ */
603        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                        /* in initializers */
604  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
605        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
606        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
607  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
608        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= 'z') cc += 64;              /* Convert to upper case */
609        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
610  #endif  #endif
611        }        }
612      break;      break;
613    
614      /* Other special escapes not starting with a digit are straightforward */      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
615        This coding is ASCII-specific, but then the whole concept of \cx is
616        ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
617    
618      case 'c':      case 'c':
619      c = *(++ptr);      c = *(++ptr);
# Line 542  else Line 623  else
623        return 0;        return 0;
624        }        }
625    
626      /* A letter is upper-cased; then the 0x40 bit is flipped. This coding  #ifndef EBCDIC  /* ASCII coding */
     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.  
     (However, an EBCDIC equivalent has now been added.) */  
   
 #if !EBCDIC    /* ASCII coding */  
627      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= 'a' && c <= 'z') c -= 32;
628      c ^= 0x40;      c ^= 0x40;
629  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
630      if (c >= 'a' && c <= 'z') c += 64;      if (c >= 'a' && c <= 'z') c += 64;
631      c ^= 0xC0;      c ^= 0xC0;
632  #endif  #endif
# Line 772  return p; Line 849  return p;
849    
850    
851  /*************************************************  /*************************************************
852  *     Find forward referenced named subpattern   *  *       Find forward referenced subpattern       *
853  *************************************************/  *************************************************/
854    
855  /* This function scans along a pattern looking for capturing subpatterns, and  /* This function scans along a pattern's text looking for capturing
856  counting them. If it finds a named pattern that matches the name it is given,  subpatterns, and counting them. If it finds a named pattern that matches the
857  it returns its number. This is used for forward references to named  name it is given, it returns its number. Alternatively, if the name is NULL, it
858  subpatterns. We know that if (?P< is encountered, the name will be terminated  returns when it reaches a given numbered subpattern. This is used for forward
859  by '>' because that is checked in the first pass.  references to subpatterns. We know that if (?P< is encountered, the name will
860    be terminated by '>' because that is checked in the first pass.
861    
862  Arguments:  Arguments:
863    pointer      current position in the pattern    ptr          current position in the pattern
864    count        current count of capturing parens    count        current count of capturing parens so far encountered
865    name         name to seek    name         name to seek, or NULL if seeking a numbered subpattern
866    namelen      name length    lorn         name length, or subpattern number if name is NULL
867      xmode        TRUE if we are in /x mode
868    
869  Returns:       the number of the named subpattern, or -1 if not found  Returns:       the number of the named subpattern, or -1 if not found
870  */  */
871    
872  static int  static int
873  find_named_parens(const uschar *ptr, int count, const uschar *name, int namelen)  find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
874      BOOL xmode)
875  {  {
876  const uschar *thisname;  const uschar *thisname;
877    
878  for (; *ptr != 0; ptr++)  for (; *ptr != 0; ptr++)
879    {    {
880    if (*ptr == '\\' && ptr[1] != 0) { ptr++; continue; }    int term;
881    
882      /* Skip over backslashed characters and also entire \Q...\E */
883    
884      if (*ptr == '\\')
885        {
886        if (*(++ptr) == 0) return -1;
887        if (*ptr == 'Q') for (;;)
888          {
889          while (*(++ptr) != 0 && *ptr != '\\');
890          if (*ptr == 0) return -1;
891          if (*(++ptr) == 'E') break;
892          }
893        continue;
894        }
895    
896      /* Skip over character classes */
897    
898      if (*ptr == '[')
899        {
900        while (*(++ptr) != ']')
901          {
902          if (*ptr == '\\')
903            {
904            if (*(++ptr) == 0) return -1;
905            if (*ptr == 'Q') for (;;)
906              {
907              while (*(++ptr) != 0 && *ptr != '\\');
908              if (*ptr == 0) return -1;
909              if (*(++ptr) == 'E') break;
910              }
911            continue;
912            }
913          }
914        continue;
915        }
916    
917      /* Skip comments in /x mode */
918    
919      if (xmode && *ptr == '#')
920        {
921        while (*(++ptr) != 0 && *ptr != '\n');
922        if (*ptr == 0) return -1;
923        continue;
924        }
925    
926      /* An opening parens must now be a real metacharacter */
927    
928    if (*ptr != '(') continue;    if (*ptr != '(') continue;
929    if (ptr[1] != '?') { count++; continue; }    if (ptr[1] != '?')
930    if (ptr[2] == '(') { ptr += 2; continue; }      {
931    if (ptr[2] != 'P' || ptr[3] != '<') continue;      count++;
932        if (name == NULL && count == lorn) return count;
933        continue;
934        }
935    
936      ptr += 2;
937      if (*ptr == 'P') ptr++;                      /* Allow optional P */
938    
939      /* We have to disambiguate (?<! and (?<= from (?<name> */
940    
941      if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
942           *ptr != '\'')
943        continue;
944    
945    count++;    count++;
946    ptr += 4;  
947      if (name == NULL && count == lorn) return count;
948      term = *ptr++;
949      if (term == '<') term = '>';
950    thisname = ptr;    thisname = ptr;
951    while (*ptr != '>') ptr++;    while (*ptr != term) ptr++;
952    if (namelen == ptr - thisname && strncmp(name, thisname, namelen) == 0)    if (name != NULL && lorn == ptr - thisname &&
953          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
954      return count;      return count;
955    }    }
956    
957  return -1;  return -1;
958  }  }
959    
# Line 862  for (;;) Line 1008  for (;;)
1008    
1009      case OP_CALLOUT:      case OP_CALLOUT:
1010      case OP_CREF:      case OP_CREF:
1011      case OP_BRANUMBER:      case OP_RREF:
1012        case OP_DEF:
1013      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1014      break;      break;
1015    
# Line 907  for (;;) Line 1054  for (;;)
1054    {    {
1055    int d;    int d;
1056    register int op = *cc;    register int op = *cc;
   if (op >= OP_BRA) op = OP_BRA;  
1057    
1058    switch (op)    switch (op)
1059      {      {
1060        case OP_CBRA:
1061      case OP_BRA:      case OP_BRA:
1062      case OP_ONCE:      case OP_ONCE:
1063      case OP_COND:      case OP_COND:
1064      d = find_fixedlength(cc, options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1065      if (d < 0) return d;      if (d < 0) return d;
1066      branchlength += d;      branchlength += d;
1067      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 949  for (;;) Line 1096  for (;;)
1096      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1097    
1098      case OP_REVERSE:      case OP_REVERSE:
     case OP_BRANUMBER:  
1099      case OP_CREF:      case OP_CREF:
1100        case OP_RREF:
1101        case OP_DEF:
1102      case OP_OPT:      case OP_OPT:
1103      case OP_CALLOUT:      case OP_CALLOUT:
1104      case OP_SOD:      case OP_SOD:
# Line 1094  for (;;) Line 1242  for (;;)
1242    
1243    if (c == OP_XCLASS) code += GET(code, 1);    if (c == OP_XCLASS) code += GET(code, 1);
1244    
1245    /* Handle bracketed group */    /* Handle capturing bracket */
1246    
1247    else if (c > OP_BRA)    else if (c == OP_CBRA)
1248      {      {
1249      int n = c - OP_BRA;      int n = GET2(code, 1+LINK_SIZE);
     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  
1250      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1251      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[c];
1252      }      }
1253    
1254    /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes    /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1255    that are followed by a character may be followed by a multi-byte character.    a multi-byte character. The length in the table is a minimum, so we have to
1256    The length in the table is a minimum, so we have to scan along to skip the    arrange to skip the extra bytes. */
   extra bytes. All opcodes are less than 128, so we can use relatively  
   efficient code. */  
1257    
1258    else    else
1259      {      {
1260      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
1261    #ifdef SUPPORT_UTF8
1262      if (utf8) switch(c)      if (utf8) switch(c)
1263        {        {
1264        case OP_CHAR:        case OP_CHAR:
# Line 1120  for (;;) Line 1266  for (;;)
1266        case OP_EXACT:        case OP_EXACT:
1267        case OP_UPTO:        case OP_UPTO:
1268        case OP_MINUPTO:        case OP_MINUPTO:
1269          case OP_POSUPTO:
1270        case OP_STAR:        case OP_STAR:
1271        case OP_MINSTAR:        case OP_MINSTAR:
1272          case OP_POSSTAR:
1273        case OP_PLUS:        case OP_PLUS:
1274        case OP_MINPLUS:        case OP_MINPLUS:
1275          case OP_POSPLUS:
1276        case OP_QUERY:        case OP_QUERY:
1277        case OP_MINQUERY:        case OP_MINQUERY:
1278        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1279          if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1280        break;        break;
1281        }        }
1282    #endif
1283      }      }
1284    }    }
1285  }  }
# Line 1164  for (;;) Line 1315  for (;;)
1315    
1316    if (c == OP_XCLASS) code += GET(code, 1);    if (c == OP_XCLASS) code += GET(code, 1);
1317    
   /* All bracketed groups have the same length. */  
   
   else if (c > OP_BRA)  
     {  
     code += _pcre_OP_lengths[OP_BRA];  
     }  
   
1318    /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes    /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1319    that are followed by a character may be followed by a multi-byte character.    that are followed by a character may be followed by a multi-byte character.
1320    The length in the table is a minimum, so we have to scan along to skip the    The length in the table is a minimum, so we have to arrange to skip the extra
1321    extra bytes. All opcodes are less than 128, so we can use relatively    bytes. */
   efficient code. */  
1322    
1323    else    else
1324      {      {
1325      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
1326    #ifdef SUPPORT_UTF8
1327      if (utf8) switch(c)      if (utf8) switch(c)
1328        {        {
1329        case OP_CHAR:        case OP_CHAR:
# Line 1187  for (;;) Line 1331  for (;;)
1331        case OP_EXACT:        case OP_EXACT:
1332        case OP_UPTO:        case OP_UPTO:
1333        case OP_MINUPTO:        case OP_MINUPTO:
1334          case OP_POSUPTO:
1335        case OP_STAR:        case OP_STAR:
1336        case OP_MINSTAR:        case OP_MINSTAR:
1337          case OP_POSSTAR:
1338        case OP_PLUS:        case OP_PLUS:
1339        case OP_MINPLUS:        case OP_MINPLUS:
1340          case OP_POSPLUS:
1341        case OP_QUERY:        case OP_QUERY:
1342        case OP_MINQUERY:        case OP_MINQUERY:
1343        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1344          if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1345        break;        break;
1346        }        }
1347    #endif
1348      }      }
1349    }    }
1350  }  }
# Line 1207  for (;;) Line 1356  for (;;)
1356  *************************************************/  *************************************************/
1357    
1358  /* This function scans through a branch of a compiled pattern to see whether it  /* This function scans through a branch of a compiled pattern to see whether it
1359  can match the empty string or not. It is called only from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1360  below. Note that first_significant_code() skips over assertions. If we hit an  below and from compile_branch() when checking for an unlimited repeat of a
1361  unclosed bracket, we return "empty" - this means we've struck an inner bracket  group that can match nothing. Note that first_significant_code() skips over
1362  whose current branch will already have been scanned.  assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1363    struck an inner bracket whose current branch will already have been scanned.
1364    
1365  Arguments:  Arguments:
1366    code        points to start of search    code        points to start of search
# Line 1224  static BOOL Line 1374  static BOOL
1374  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1375  {  {
1376  register int c;  register int c;
1377  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1378       code < endcode;       code < endcode;
1379       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1380    {    {
1381    const uschar *ccode;    const uschar *ccode;
1382    
1383    c = *code;    c = *code;
1384    
1385      /* Groups with zero repeats can of course be empty; skip them. */
1386    
1387    if (c >= OP_BRA)    if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1388        {
1389        do code += GET(code, 1); while (*code == OP_ALT);
1390        c = *code;
1391        continue;
1392        }
1393    
1394      /* For other groups, scan the branches. */
1395    
1396      if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1397      {      {
1398      BOOL empty_branch;      BOOL empty_branch;
1399      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1248  for (code = first_significant_code(code Line 1409  for (code = first_significant_code(code
1409        }        }
1410      while (*code == OP_ALT);      while (*code == OP_ALT);
1411      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
1412      code += 1 + LINK_SIZE;      c = *code;
1413      c = *code;      continue;
1414      }      }
1415    
1416    else switch (c)    /* Handle the other opcodes */
1417    
1418      switch (c)
1419      {      {
1420      /* Check for quantifiers after a class */      /* Check for quantifiers after a class */
1421    
# Line 1308  for (code = first_significant_code(code Line 1471  for (code = first_significant_code(code
1471      case OP_NOT:      case OP_NOT:
1472      case OP_PLUS:      case OP_PLUS:
1473      case OP_MINPLUS:      case OP_MINPLUS:
1474        case OP_POSPLUS:
1475      case OP_EXACT:      case OP_EXACT:
1476      case OP_NOTPLUS:      case OP_NOTPLUS:
1477      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
1478        case OP_NOTPOSPLUS:
1479      case OP_NOTEXACT:      case OP_NOTEXACT:
1480      case OP_TYPEPLUS:      case OP_TYPEPLUS:
1481      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
1482        case OP_TYPEPOSPLUS:
1483      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1484      return FALSE;      return FALSE;
1485    
# Line 1325  for (code = first_significant_code(code Line 1491  for (code = first_significant_code(code
1491      case OP_ALT:      case OP_ALT:
1492      return TRUE;      return TRUE;
1493    
1494      /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1495      followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
1496    
1497  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1498      case OP_STAR:      case OP_STAR:
1499      case OP_MINSTAR:      case OP_MINSTAR:
1500        case OP_POSSTAR:
1501      case OP_QUERY:      case OP_QUERY:
1502      case OP_MINQUERY:      case OP_MINQUERY:
1503        case OP_POSQUERY:
1504      case OP_UPTO:      case OP_UPTO:
1505      case OP_MINUPTO:      case OP_MINUPTO:
1506        case OP_POSUPTO:
1507      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1508      break;      break;
1509  #endif  #endif
# Line 1452  earlier groups that are outside the curr Line 1621  earlier groups that are outside the curr
1621  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1622  it, after it has been compiled. This means that any OP_RECURSE items within it  it, after it has been compiled. This means that any OP_RECURSE items within it
1623  that refer to the group itself or any contained groups have to have their  that refer to the group itself or any contained groups have to have their
1624  offsets adjusted. That is the job of this function. Before it is called, the  offsets adjusted. That is one of the jobs of this function. Before it is called,
1625  partially compiled regex must be temporarily terminated with OP_END.  the partially compiled regex must be temporarily terminated with OP_END.
1626    
1627    This function has been extended with the possibility of forward references for
1628    recursions and subroutine calls. It must also check the list of such references
1629    for the group we are dealing with. If it finds that one of the recursions in
1630    the current group is on this list, it adjusts the offset in the list, not the
1631    value in the reference (which is a group number).
1632    
1633  Arguments:  Arguments:
1634    group      points to the start of the group    group      points to the start of the group
1635    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
1636    utf8       TRUE in UTF-8 mode    utf8       TRUE in UTF-8 mode
1637    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
1638      save_hwm   the hwm forward reference pointer at the start of the group
1639    
1640  Returns:     nothing  Returns:     nothing
1641  */  */
1642    
1643  static void  static void
1644  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1645      uschar *save_hwm)
1646  {  {
1647  uschar *ptr = group;  uschar *ptr = group;
1648  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1649    {    {
1650    int offset = GET(ptr, 1);    int offset;
1651    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    uschar *hc;
1652    
1653      /* See if this recursion is on the forward reference list. If so, adjust the
1654      reference. */
1655    
1656      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1657        {
1658        offset = GET(hc, 0);
1659        if (cd->start_code + offset == ptr + 1)
1660          {
1661          PUT(hc, 0, offset + adjust);
1662          break;
1663          }
1664        }
1665    
1666      /* Otherwise, adjust the recursion offset if it's after the start of this
1667      group. */
1668    
1669      if (hc >= cd->hwm)
1670        {
1671        offset = GET(ptr, 1);
1672        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1673        }
1674    
1675    ptr += 1 + LINK_SIZE;    ptr += 1 + LINK_SIZE;
1676    }    }
1677  }  }
# Line 1550  Yield:        TRUE when range returned; Line 1750  Yield:        TRUE when range returned;
1750  */  */
1751    
1752  static BOOL  static BOOL
1753  get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1754      unsigned int *odptr)
1755  {  {
1756  int c, othercase, next;  unsigned int c, othercase, next;
1757    
1758  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
1759    { if ((othercase = _pcre_ucp_othercase(c)) >= 0) break; }    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1760    
1761  if (c > d) return FALSE;  if (c > d) return FALSE;
1762    
# Line 1576  return TRUE; Line 1777  return TRUE;
1777  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1778    
1779    
1780    
1781    /*************************************************
1782    *     Check if auto-possessifying is possible    *
1783    *************************************************/
1784    
1785    /* This function is called for unlimited repeats of certain items, to see
1786    whether the next thing could possibly match the repeated item. If not, it makes
1787    sense to automatically possessify the repeated item.
1788    
1789    Arguments:
1790      op_code       the repeated op code
1791      item          data for this item, depends on the opcode
1792      utf8          TRUE in UTF-8 mode
1793      utf8_char     used for utf8 character bytes, NULL if not relevant
1794      ptr           next character in pattern
1795      options       options bits
1796      cd            contains pointers to tables etc.
1797    
1798    Returns:        TRUE if possessifying is wanted
1799    */
1800    
1801    static BOOL
1802    check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1803      const uschar *ptr, int options, compile_data *cd)
1804    {
1805    int next;
1806    
1807    /* Skip whitespace and comments in extended mode */
1808    
1809    if ((options & PCRE_EXTENDED) != 0)
1810      {
1811      for (;;)
1812        {
1813        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1814        if (*ptr == '#')
1815          {
1816          while (*(++ptr) != 0)
1817            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1818          }
1819        else break;
1820        }
1821      }
1822    
1823    /* If the next item is one that we can handle, get its value. A non-negative
1824    value is a character, a negative value is an escape value. */
1825    
1826    if (*ptr == '\\')
1827      {
1828      int temperrorcode = 0;
1829      next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1830      if (temperrorcode != 0) return FALSE;
1831      ptr++;    /* Point after the escape sequence */
1832      }
1833    
1834    else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1835      {
1836    #ifdef SUPPORT_UTF8
1837      if (utf8) { GETCHARINC(next, ptr); } else
1838    #endif
1839      next = *ptr++;
1840      }
1841    
1842    else return FALSE;
1843    
1844    /* Skip whitespace and comments in extended mode */
1845    
1846    if ((options & PCRE_EXTENDED) != 0)
1847      {
1848      for (;;)
1849        {
1850        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1851        if (*ptr == '#')
1852          {
1853          while (*(++ptr) != 0)
1854            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1855          }
1856        else break;
1857        }
1858      }
1859    
1860    /* If the next thing is itself optional, we have to give up. */
1861    
1862    if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1863      return FALSE;
1864    
1865    /* Now compare the next item with the previous opcode. If the previous is a
1866    positive single character match, "item" either contains the character or, if
1867    "item" is greater than 127 in utf8 mode, the character's bytes are in
1868    utf8_char. */
1869    
1870    
1871    /* Handle cases when the next item is a character. */
1872    
1873    if (next >= 0) switch(op_code)
1874      {
1875      case OP_CHAR:
1876    #ifdef SUPPORT_UTF8
1877      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1878    #endif
1879      return item != next;
1880    
1881      /* For CHARNC (caseless character) we must check the other case. If we have
1882      Unicode property support, we can use it to test the other case of
1883      high-valued characters. */
1884    
1885      case OP_CHARNC:
1886    #ifdef SUPPORT_UTF8
1887      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1888    #endif
1889      if (item == next) return FALSE;
1890    #ifdef SUPPORT_UTF8
1891      if (utf8)
1892        {
1893        unsigned int othercase;
1894        if (next < 128) othercase = cd->fcc[next]; else
1895    #ifdef SUPPORT_UCP
1896        othercase = _pcre_ucp_othercase((unsigned int)next);
1897    #else
1898        othercase = NOTACHAR;
1899    #endif
1900        return (unsigned int)item != othercase;
1901        }
1902      else
1903    #endif  /* SUPPORT_UTF8 */
1904      return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
1905    
1906      /* For OP_NOT, "item" must be a single-byte character. */
1907    
1908      case OP_NOT:
1909      if (next < 0) return FALSE;  /* Not a character */
1910      if (item == next) return TRUE;
1911      if ((options & PCRE_CASELESS) == 0) return FALSE;
1912    #ifdef SUPPORT_UTF8
1913      if (utf8)
1914        {
1915        unsigned int othercase;
1916        if (next < 128) othercase = cd->fcc[next]; else
1917    #ifdef SUPPORT_UCP
1918        othercase = _pcre_ucp_othercase(next);
1919    #else
1920        othercase = NOTACHAR;
1921    #endif
1922        return (unsigned int)item == othercase;
1923        }
1924      else
1925    #endif  /* SUPPORT_UTF8 */
1926      return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
1927    
1928      case OP_DIGIT:
1929      return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1930    
1931      case OP_NOT_DIGIT:
1932      return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1933    
1934      case OP_WHITESPACE:
1935      return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1936    
1937      case OP_NOT_WHITESPACE:
1938      return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1939    
1940      case OP_WORDCHAR:
1941      return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1942    
1943      case OP_NOT_WORDCHAR:
1944      return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1945    
1946      default:
1947      return FALSE;
1948      }
1949    
1950    
1951    /* Handle the case when the next item is \d, \s, etc. */
1952    
1953    switch(op_code)
1954      {
1955      case OP_CHAR:
1956      case OP_CHARNC:
1957    #ifdef SUPPORT_UTF8
1958      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1959    #endif
1960      switch(-next)
1961        {
1962        case ESC_d:
1963        return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
1964    
1965        case ESC_D:
1966        return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
1967    
1968        case ESC_s:
1969        return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
1970    
1971        case ESC_S:
1972        return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
1973    
1974        case ESC_w:
1975        return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
1976    
1977        case ESC_W:
1978        return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
1979    
1980        default:
1981        return FALSE;
1982        }
1983    
1984      case OP_DIGIT:
1985      return next == -ESC_D || next == -ESC_s || next == -ESC_W;
1986    
1987      case OP_NOT_DIGIT:
1988      return next == -ESC_d;
1989    
1990      case OP_WHITESPACE:
1991      return next == -ESC_S || next == -ESC_d || next == -ESC_w;
1992    
1993      case OP_NOT_WHITESPACE:
1994      return next == -ESC_s;
1995    
1996      case OP_WORDCHAR:
1997      return next == -ESC_W || next == -ESC_s;
1998    
1999      case OP_NOT_WORDCHAR:
2000      return next == -ESC_w || next == -ESC_d;
2001    
2002      default:
2003      return FALSE;
2004      }
2005    
2006    /* Control does not reach here */
2007    }
2008    
2009    
2010    
2011  /*************************************************  /*************************************************
2012  *           Compile one branch                   *  *           Compile one branch                   *
2013  *************************************************/  *************************************************/
2014    
2015  /* Scan the pattern, compiling it into the code vector. If the options are  /* Scan the pattern, compiling it into the code vector. If the options are
2016  changed during the branch, the pointer is used to change the external options  changed during the branch, the pointer is used to change the external options
2017  bits.  bits. This function is used during the pre-compile phase when we are trying
2018    to find out the amount of memory needed, as well as during the real compile
2019    phase. The value of lengthptr distinguishes the two phases.
2020    
2021  Arguments:  Arguments:
2022    optionsptr     pointer to the option bits    optionsptr     pointer to the option bits
   brackets       points to number of extracting brackets used  
2023    codeptr        points to the pointer to the current code point    codeptr        points to the pointer to the current code point
2024    ptrptr         points to the current pattern pointer    ptrptr         points to the current pattern pointer
2025    errorcodeptr   points to error code variable    errorcodeptr   points to error code variable
# Line 1594  Arguments: Line 2027  Arguments:
2027    reqbyteptr     set to the last literal character required, else < 0    reqbyteptr     set to the last literal character required, else < 0
2028    bcptr          points to current branch chain    bcptr          points to current branch chain
2029    cd             contains pointers to tables etc.    cd             contains pointers to tables etc.
2030      lengthptr      NULL during the real compile phase
2031                     points to length accumulator during pre-compile phase
2032    
2033  Returns:         TRUE on success  Returns:         TRUE on success
2034                   FALSE, with *errorcodeptr set non-zero on error                   FALSE, with *errorcodeptr set non-zero on error
2035  */  */
2036    
2037  static BOOL  static BOOL
2038  compile_branch(int *optionsptr, int *brackets, uschar **codeptr,  compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2039    const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,    int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2040    int *reqbyteptr, branch_chain *bcptr, compile_data *cd)    compile_data *cd, int *lengthptr)
2041  {  {
2042  int repeat_type, op_type;  int repeat_type, op_type;
2043  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
# Line 1613  int zeroreqbyte, zerofirstbyte; Line 2048  int zeroreqbyte, zerofirstbyte;
2048  int req_caseopt, reqvary, tempreqvary;  int req_caseopt, reqvary, tempreqvary;
2049  int options = *optionsptr;  int options = *optionsptr;
2050  int after_manual_callout = 0;  int after_manual_callout = 0;
2051    int length_prevgroup = 0;
2052  register int c;  register int c;
2053  register uschar *code = *codeptr;  register uschar *code = *codeptr;
2054    uschar *last_code = code;
2055    uschar *orig_code = code;
2056  uschar *tempcode;  uschar *tempcode;
2057  BOOL inescq = FALSE;  BOOL inescq = FALSE;
2058  BOOL groupsetfirstbyte = FALSE;  BOOL groupsetfirstbyte = FALSE;
# Line 1622  const uschar *ptr = *ptrptr; Line 2060  const uschar *ptr = *ptrptr;
2060  const uschar *tempptr;  const uschar *tempptr;
2061  uschar *previous = NULL;  uschar *previous = NULL;
2062  uschar *previous_callout = NULL;  uschar *previous_callout = NULL;
2063    uschar *save_hwm = NULL;
2064  uschar classbits[32];  uschar classbits[32];
2065    
2066  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 1631  uschar *class_utf8data; Line 2070  uschar *class_utf8data;
2070  uschar utf8_char[6];  uschar utf8_char[6];
2071  #else  #else
2072  BOOL utf8 = FALSE;  BOOL utf8 = FALSE;
2073    uschar *utf8_char = NULL;
2074    #endif
2075    
2076    #ifdef DEBUG
2077    if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2078  #endif  #endif
2079    
2080  /* Set up the default and non-default settings for greediness */  /* Set up the default and non-default settings for greediness */
# Line 1664  for (;; ptr++) Line 2108  for (;; ptr++)
2108    BOOL negate_class;    BOOL negate_class;
2109    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2110    BOOL is_quantifier;    BOOL is_quantifier;
2111      BOOL is_recurse;
2112    int class_charcount;    int class_charcount;
2113    int class_lastchar;    int class_lastchar;
2114    int newoptions;    int newoptions;
2115    int recno;    int recno;
2116      int refsign;
2117    int skipbytes;    int skipbytes;
2118    int subreqbyte;    int subreqbyte;
2119    int subfirstbyte;    int subfirstbyte;
2120      int terminator;
2121    int mclength;    int mclength;
2122    uschar mcbuffer[8];    uschar mcbuffer[8];
2123    
2124    /* Next byte in the pattern */    /* Get next byte in the pattern */
2125    
2126    c = *ptr;    c = *ptr;
2127    
2128      /* If we are in the pre-compile phase, accumulate the length used for the
2129      previous cycle of this loop. */
2130    
2131      if (lengthptr != NULL)
2132        {
2133    #ifdef DEBUG
2134        if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2135    #endif
2136        if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2137          {
2138          *errorcodeptr = ERR52;
2139          goto FAILED;
2140          }
2141    
2142        /* There is at least one situation where code goes backwards: this is the
2143        case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2144        the class is simply eliminated. However, it is created first, so we have to
2145        allow memory for it. Therefore, don't ever reduce the length at this point.
2146        */
2147    
2148        if (code < last_code) code = last_code;
2149        *lengthptr += code - last_code;
2150        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2151    
2152        /* If "previous" is set and it is not at the start of the work space, move
2153        it back to there, in order to avoid filling up the work space. Otherwise,
2154        if "previous" is NULL, reset the current code pointer to the start. */
2155    
2156        if (previous != NULL)
2157          {
2158          if (previous > orig_code)
2159            {
2160            memmove(orig_code, previous, code - previous);
2161            code -= previous - orig_code;
2162            previous = orig_code;
2163            }
2164          }
2165        else code = orig_code;
2166    
2167        /* Remember where this code item starts so we can pick up the length
2168        next time round. */
2169    
2170        last_code = code;
2171        }
2172    
2173      /* In the real compile phase, just check the workspace used by the forward
2174      reference list. */
2175    
2176      else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2177        {
2178        *errorcodeptr = ERR52;
2179        goto FAILED;
2180        }
2181    
2182    /* If in \Q...\E, check for the end; if not, we have a literal */    /* If in \Q...\E, check for the end; if not, we have a literal */
2183    
2184    if (inescq && c != 0)    if (inescq && c != 0)
# Line 1692  for (;; ptr++) Line 2193  for (;; ptr++)
2193        {        {
2194        if (previous_callout != NULL)        if (previous_callout != NULL)
2195          {          {
2196          complete_callout(previous_callout, ptr, cd);          if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2197              complete_callout(previous_callout, ptr, cd);
2198          previous_callout = NULL;          previous_callout = NULL;
2199          }          }
2200        if ((options & PCRE_AUTO_CALLOUT) != 0)        if ((options & PCRE_AUTO_CALLOUT) != 0)
# Line 1713  for (;; ptr++) Line 2215  for (;; ptr++)
2215    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2216         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
2217      {      {
2218      complete_callout(previous_callout, ptr, cd);      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2219          complete_callout(previous_callout, ptr, cd);
2220      previous_callout = NULL;      previous_callout = NULL;
2221      }      }
2222    
# Line 1724  for (;; ptr++) Line 2227  for (;; ptr++)
2227      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2228      if (c == '#')      if (c == '#')
2229        {        {
2230        while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break;        while (*(++ptr) != 0)
       if (*ptr != 0)  
2231          {          {
2232          ptr += cd->nllen - 1;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
         continue;  
2233          }          }
2234          if (*ptr != 0) continue;
2235    
2236        /* Else fall through to handle end of string */        /* Else fall through to handle end of string */
2237        c = 0;        c = 0;
2238        }        }
# Line 1745  for (;; ptr++) Line 2248  for (;; ptr++)
2248    
2249    switch(c)    switch(c)
2250      {      {
2251      /* The branch terminates at end of string, |, or ). */      /* ===================================================================*/
2252        case 0:                        /* The branch terminates at string end */
2253      case 0:      case '|':                      /* or | or ) */
     case '|':  
2254      case ')':      case ')':
2255      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2256      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2257      *codeptr = code;      *codeptr = code;
2258      *ptrptr = ptr;      *ptrptr = ptr;
2259        if (lengthptr != NULL)
2260          {
2261          *lengthptr += code - last_code;   /* To include callout length */
2262          DPRINTF((">> end branch\n"));
2263          }
2264      return TRUE;      return TRUE;
2265    
2266    
2267        /* ===================================================================*/
2268      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2269      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2270    
# Line 1784  for (;; ptr++) Line 2293  for (;; ptr++)
2293      *code++ = OP_ANY;      *code++ = OP_ANY;
2294      break;      break;
2295    
2296    
2297        /* ===================================================================*/
2298      /* Character classes. If the included characters are all < 256, we build a      /* Character classes. If the included characters are all < 256, we build a
2299      32-byte bitmap of the permitted characters, except in the special case      32-byte bitmap of the permitted characters, except in the special case
2300      where there is only one such character. For negated classes, we build the      where there is only one such character. For negated classes, we build the
# Line 1822  for (;; ptr++) Line 2333  for (;; ptr++)
2333        }        }
2334    
2335      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2336      of just a single character (as long as it's < 256). For higher valued UTF-8      of just a single character (as long as it's < 256). However, for higher
2337      characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
2338    
2339      class_charcount = 0;      class_charcount = 0;
2340      class_lastchar = -1;      class_lastchar = -1;
2341    
2342        /* Initialize the 32-char bit map to all zeros. We build the map in a
2343        temporary bit of memory, in case the class contains only 1 character (less
2344        than 256), because in that case the compiled code doesn't use the bit map.
2345        */
2346    
2347        memset(classbits, 0, 32 * sizeof(uschar));
2348    
2349  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2350      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2351      class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2352  #endif  #endif
2353    
     /* Initialize the 32-char bit map to all zeros. We have to build the  
     map in a temporary bit of store, in case the class contains only 1  
     character (< 256), because in that case the compiled code doesn't use the  
     bit map. */  
   
     memset(classbits, 0, 32 * sizeof(uschar));  
   
2354      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
2355      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. At the start of the
2356      through the regex checked the overall syntax, so we don't need to be very      loop, c contains the first byte of the character. */
     strict here. At the start of the loop, c contains the first byte of the  
     character. */  
2357    
2358      do      if (c != 0) do
2359        {        {
2360          const uschar *oldptr;
2361    
2362  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2363        if (utf8 && c > 127)        if (utf8 && c > 127)
2364          {                           /* Braces are required because the */          {                           /* Braces are required because the */
# Line 1859  for (;; ptr++) Line 2370  for (;; ptr++)
2370    
2371        if (inescq)        if (inescq)
2372          {          {
2373          if (c == '\\' && ptr[1] == 'E')          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2374            {            {
2375            inescq = FALSE;            inescq = FALSE;                   /* Reset literal state */
2376            ptr++;            ptr++;                            /* Skip the 'E' */
2377            continue;            continue;                         /* Carry on with next */
2378            }            }
2379          else goto LONE_SINGLE_CHARACTER;          goto CHECK_RANGE;                   /* Could be range if \E follows */
2380          }          }
2381    
2382        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
# Line 1956  for (;; ptr++) Line 2467  for (;; ptr++)
2467          }          }
2468    
2469        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
2470        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. The sequence \b is a special
2471        validity in the pre-compiling pass. The sequence \b is a special case.        case. Inside a class (and only there) it is treated as backspace.
2472        Inside a class (and only there) it is treated as backspace. Elsewhere        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2473        it marks a word boundary. Other escapes have preset maps ready to        to or into the one we are building. We assume they have more than one
       or into the one we are building. We assume they have more than one  
2474        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2475    
2476        if (c == '\\')        if (c == '\\')
2477          {          {
2478          c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2479            if (*errorcodeptr != 0) goto FAILED;
2480    
2481          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2482          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2483            else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2484          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
2485            {            {
2486            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == '\\' && ptr[2] == 'E')
# Line 1983  for (;; ptr++) Line 2495  for (;; ptr++)
2495            {            {
2496            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
2497            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
2498            switch (-c)  
2499              /* Save time by not doing this in the pre-compile phase. */
2500    
2501              if (lengthptr == NULL) switch (-c)
2502              {              {
2503              case ESC_d:              case ESC_d:
2504              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
# Line 2011  for (;; ptr++) Line 2526  for (;; ptr++)
2526              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2527              continue;              continue;
2528    
2529                case ESC_E: /* Perl ignores an orphan \E */
2530                continue;
2531    
2532                default:    /* Not recognized; fall through */
2533                break;      /* Need "default" setting to stop compiler warning. */
2534                }
2535    
2536              /* In the pre-compile phase, just do the recognition. */
2537    
2538              else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2539                       c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2540    
2541              /* We need to deal with \P and \p in both phases. */
2542    
2543  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2544              case ESC_p:            if (-c == ESC_p || -c == ESC_P)
2545              case ESC_P:              {
2546                {              BOOL negated;
2547                BOOL negated;              int pdata;
2548                int pdata;              int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2549                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);              if (ptype < 0) goto FAILED;
2550                if (ptype < 0) goto FAILED;              class_utf8 = TRUE;
2551                class_utf8 = TRUE;              *class_utf8data++ = ((-c == ESC_p) != negated)?
2552                *class_utf8data++ = ((-c == ESC_p) != negated)?                XCL_PROP : XCL_NOTPROP;
2553                  XCL_PROP : XCL_NOTPROP;              *class_utf8data++ = ptype;
2554                *class_utf8data++ = ptype;              *class_utf8data++ = pdata;
2555                *class_utf8data++ = pdata;              class_charcount -= 2;   /* Not a < 256 character */
               class_charcount -= 2;   /* Not a < 256 character */  
               }  
2556              continue;              continue;
2557                }
2558  #endif  #endif
2559              /* Unrecognized escapes are faulted if PCRE is running in its
2560              strict mode. By default, for compatibility with Perl, they are
2561              treated as literals. */
2562    
2563              /* Unrecognized escapes are faulted if PCRE is running in its            if ((options & PCRE_EXTRA) != 0)
2564              strict mode. By default, for compatibility with Perl, they are              {
2565              treated as literals. */              *errorcodeptr = ERR7;
2566                goto FAILED;
             default:  
             if ((options & PCRE_EXTRA) != 0)  
               {  
               *errorcodeptr = ERR7;  
               goto FAILED;  
               }  
             c = *ptr;              /* The final character */  
             class_charcount -= 2;  /* Undo the default count from above */  
2567              }              }
2568    
2569              class_charcount -= 2;  /* Undo the default count from above */
2570              c = *ptr;              /* Get the final character and fall through */
2571            }            }
2572    
2573          /* Fall through if we have a single character (c >= 0). This may be          /* Fall through if we have a single character (c >= 0). This may be
2574          > 256 in UTF-8 mode. */          greater than 256 in UTF-8 mode. */
2575    
2576          }   /* End of backslash handling */          }   /* End of backslash handling */
2577    
2578        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
2579        Perl does not permit ']' to be the end of the range. A '-' character        Perl does not permit ']' to be the end of the range. A '-' character
2580        here is treated as a literal. */        at the end is treated as a literal. Perl ignores orphaned \E sequences
2581          entirely. The code for handling \Q and \E is messy. */
2582    
2583          CHECK_RANGE:
2584          while (ptr[1] == '\\' && ptr[2] == 'E')
2585            {
2586            inescq = FALSE;
2587            ptr += 2;
2588            }
2589    
2590        if (ptr[1] == '-' && ptr[2] != ']')        oldptr = ptr;
2591    
2592          if (!inescq && ptr[1] == '-')
2593          {          {
2594          int d;          int d;
2595          ptr += 2;          ptr += 2;
2596            while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2597    
2598            /* If we hit \Q (not followed by \E) at this point, go into escaped
2599            mode. */
2600    
2601            while (*ptr == '\\' && ptr[1] == 'Q')
2602              {
2603              ptr += 2;
2604              if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2605              inescq = TRUE;
2606              break;
2607              }
2608    
2609            if (*ptr == 0 || (!inescq && *ptr == ']'))
2610              {
2611              ptr = oldptr;
2612              goto LONE_SINGLE_CHARACTER;
2613              }
2614    
2615  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2616          if (utf8)          if (utf8)
# Line 2071  for (;; ptr++) Line 2625  for (;; ptr++)
2625          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2626          in such circumstances. */          in such circumstances. */
2627    
2628          if (d == '\\')          if (!inescq && d == '\\')
2629            {            {
2630            const uschar *oldptr = ptr;            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2631            d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);            if (*errorcodeptr != 0) goto FAILED;
2632    
2633            /* \b is backspace; \X is literal X; any other special means the '-'            /* \b is backspace; \X is literal X; \R is literal R; any other
2634            was literal */            special means the '-' was literal */
2635    
2636            if (d < 0)            if (d < 0)
2637              {              {
2638              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = '\b';
2639              else if (d == -ESC_X) d = 'X'; else              else if (d == -ESC_X) d = 'X';
2640                else if (d == -ESC_R) d = 'R'; else
2641                {                {
2642                ptr = oldptr - 2;                ptr = oldptr;
2643                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2644                }                }
2645              }              }
2646            }            }
2647    
2648          /* The check that the two values are in the correct order happens in          /* Check that the two values are in the correct order. Optimize
2649          the pre-pass. Optimize one-character ranges */          one-character ranges */
2650    
2651            if (d < c)
2652              {
2653              *errorcodeptr = ERR8;
2654              goto FAILED;
2655              }
2656    
2657          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2658    
# Line 2112  for (;; ptr++) Line 2673  for (;; ptr++)
2673  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2674            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
2675              {              {
2676              int occ, ocd;              unsigned int occ, ocd;
2677              int cc = c;              unsigned int cc = c;
2678              int origd = d;              unsigned int origd = d;
2679              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
2680                {                {
2681                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */
# Line 2172  for (;; ptr++) Line 2733  for (;; ptr++)
2733          ranges that lie entirely within 0-127 when there is UCP support; else          ranges that lie entirely within 0-127 when there is UCP support; else
2734          for partial ranges without UCP support. */          for partial ranges without UCP support. */
2735    
2736          for (; c <= d; c++)          class_charcount += d - c + 1;
2737            class_lastchar = d;
2738    
2739            /* We can save a bit of time by skipping this in the pre-compile. */
2740    
2741            if (lengthptr == NULL) for (; c <= d; c++)
2742            {            {
2743            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
2744            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
# Line 2180  for (;; ptr++) Line 2746  for (;; ptr++)
2746              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
2747              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
2748              }              }
           class_charcount++;                /* in case a one-char range */  
           class_lastchar = c;  
2749            }            }
2750    
2751          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
# Line 2205  for (;; ptr++) Line 2769  for (;; ptr++)
2769  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2770          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
2771            {            {
2772            int othercase;            unsigned int othercase;
2773            if ((othercase = _pcre_ucp_othercase(c)) >= 0)            if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
2774              {              {
2775              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
2776              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 2231  for (;; ptr++) Line 2795  for (;; ptr++)
2795          }          }
2796        }        }
2797    
2798      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached. This "while" is the end of the "do" above. */
2799      loop. This "while" is the end of the "do" above. */  
2800        while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
2801    
2802      while ((c = *(++ptr)) != ']' || inescq);      if (c == 0)                          /* Missing terminating ']' */
2803          {
2804          *errorcodeptr = ERR6;
2805          goto FAILED;
2806          }
2807    
2808      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
2809      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
# Line 2298  for (;; ptr++) Line 2867  for (;; ptr++)
2867    
2868      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
2869      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode. If there are no characters < 256,
2870      we can omit the bitmap. */      we can omit the bitmap in the actual compiled code. */
2871    
2872  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2873      if (class_utf8)      if (class_utf8)
# Line 2308  for (;; ptr++) Line 2877  for (;; ptr++)
2877        code += LINK_SIZE;        code += LINK_SIZE;
2878        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT : 0;
2879    
2880        /* If the map is required, install it, and move on to the end of        /* If the map is required, move up the extra data to make room for it;
2881        the extra data */        otherwise just move the code pointer to the end of the extra data. */
2882    
2883        if (class_charcount > 0)        if (class_charcount > 0)
2884          {          {
2885          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
2886            memmove(code + 32, code, class_utf8data - code);
2887          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
2888          code = class_utf8data;          code = class_utf8data + 32;
         }  
   
       /* If the map is not required, slide down the extra data. */  
   
       else  
         {  
         int len = class_utf8data - (code + 33);  
         memmove(code + 1, code + 33, len);  
         code += len + 1;  
2889          }          }
2890          else code = class_utf8data;
2891    
2892        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
2893    
# Line 2342  for (;; ptr++) Line 2904  for (;; ptr++)
2904      if (negate_class)      if (negate_class)
2905        {        {
2906        *code++ = OP_NCLASS;        *code++ = OP_NCLASS;
2907        for (c = 0; c < 32; c++) code[c] = ~classbits[c];        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
2908            for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2909        }        }
2910      else      else
2911        {        {
# Line 2352  for (;; ptr++) Line 2915  for (;; ptr++)
2915      code += 32;      code += 32;
2916      break;      break;
2917    
2918    
2919        /* ===================================================================*/
2920      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2921      has been tested above. */      has been tested above. */
2922    
# Line 2419  for (;; ptr++) Line 2984  for (;; ptr++)
2984        }        }
2985      else repeat_type = greedy_default;      else repeat_type = greedy_default;
2986    
     /* If previous was a recursion, we need to wrap it inside brackets so that  
     it can be replicated if necessary. */  
   
     if (*previous == OP_RECURSE)  
       {  
       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);  
       code += 1 + LINK_SIZE;  
       *previous = OP_BRA;  
       PUT(previous, 1, code - previous);  
       *code = OP_KET;  
       PUT(code, 1, code - previous);  
       code += 1 + LINK_SIZE;  
       }  
   
2987      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
2988      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
2989      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
# Line 2466  for (;; ptr++) Line 3017  for (;; ptr++)
3017          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3018          }          }
3019    
3020          /* If the repetition is unlimited, it pays to see if the next thing on
3021          the line is something that cannot possibly match this character. If so,
3022          automatically possessifying this item gains some performance in the case
3023          where the match fails. */
3024    
3025          if (!possessive_quantifier &&
3026              repeat_max < 0 &&
3027              check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3028                options, cd))
3029            {
3030            repeat_type = 0;    /* Force greedy */
3031            possessive_quantifier = TRUE;
3032            }
3033    
3034        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3035        }        }
3036    
3037      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
3038      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
3039      character repeats by setting opt_type to add a suitable offset into      character repeats by setting opt_type to add a suitable offset into
3040      repeat_type. OP_NOT is currently used only for single-byte chars. */      repeat_type. We can also test for auto-possessification. OP_NOT is
3041        currently used only for single-byte chars. */
3042    
3043      else if (*previous == OP_NOT)      else if (*previous == OP_NOT)
3044        {        {
3045        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3046        c = previous[1];        c = previous[1];
3047          if (!possessive_quantifier &&
3048              repeat_max < 0 &&
3049              check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3050            {
3051            repeat_type = 0;    /* Force greedy */
3052            possessive_quantifier = TRUE;
3053            }
3054        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
3055        }        }
3056    
# Line 2495  for (;; ptr++) Line 3068  for (;; ptr++)
3068        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3069        c = *previous;        c = *previous;
3070    
3071          if (!possessive_quantifier &&
3072              repeat_max < 0 &&
3073              check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3074            {
3075            repeat_type = 0;    /* Force greedy */
3076            possessive_quantifier = TRUE;
3077            }
3078    
3079        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
3080        if (*previous == OP_PROP || *previous == OP_NOTPROP)        if (*previous == OP_PROP || *previous == OP_NOTPROP)
3081          {          {
# Line 2535  for (;; ptr++) Line 3116  for (;; ptr++)
3116          }          }
3117    
3118        /* A repeat minimum of 1 is optimized into some special cases. If the        /* A repeat minimum of 1 is optimized into some special cases. If the
3119        maximum is unlimited, we use OP_PLUS. Otherwise, the original item it        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3120        left in place and, if the maximum is greater than 1, we use OP_UPTO with        left in place and, if the maximum is greater than 1, we use OP_UPTO with
3121        one less than the maximum. */        one less than the maximum. */
3122    
# Line 2588  for (;; ptr++) Line 3169  for (;; ptr++)
3169            }            }
3170    
3171          /* Else insert an UPTO if the max is greater than the min, again          /* Else insert an UPTO if the max is greater than the min, again
3172          preceded by the character, for the previously inserted code. */          preceded by the character, for the previously inserted code. If the
3173            UPTO is just for 1 instance, we can use QUERY instead. */
3174    
3175          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
3176            {            {
# Line 2607  for (;; ptr++) Line 3189  for (;; ptr++)
3189              *code++ = prop_value;              *code++ = prop_value;
3190              }              }
3191            repeat_max -= repeat_min;            repeat_max -= repeat_min;
3192            *code++ = OP_UPTO + repeat_type;  
3193            PUT2INC(code, 0, repeat_max);            if (repeat_max == 1)
3194                {
3195                *code++ = OP_QUERY + repeat_type;
3196                }
3197              else
3198                {
3199                *code++ = OP_UPTO + repeat_type;
3200                PUT2INC(code, 0, repeat_max);
3201                }
3202            }            }
3203          }          }
3204    
# Line 2675  for (;; ptr++) Line 3265  for (;; ptr++)
3265      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
3266      cases. */      cases. */
3267    
3268      else if (*previous >= OP_BRA || *previous == OP_ONCE ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3269               *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
3270        {        {
3271        register int i;        register int i;
3272        int ketoffset = 0;        int ketoffset = 0;
3273        int len = code - previous;        int len = code - previous;
3274        uschar *bralink = NULL;        uschar *bralink = NULL;
3275    
3276          /* Repeating a DEFINE group is pointless */
3277    
3278          if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3279            {
3280            *errorcodeptr = ERR55;
3281            goto FAILED;
3282            }
3283    
3284          /* This is a paranoid check to stop integer overflow later on */
3285    
3286          if (len > MAX_DUPLENGTH)
3287            {
3288            *errorcodeptr = ERR50;
3289            goto FAILED;
3290            }
3291    
3292        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3293        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3294        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 2717  for (;; ptr++) Line 3323  for (;; ptr++)
3323          /* If the maximum is 1 or unlimited, we just have to stick in the          /* If the maximum is 1 or unlimited, we just have to stick in the
3324          BRAZERO and do no more at this point. However, we do need to adjust          BRAZERO and do no more at this point. However, we do need to adjust
3325          any OP_RECURSE calls inside the group that refer to the group itself or          any OP_RECURSE calls inside the group that refer to the group itself or
3326          any internal group, because the offset is from the start of the whole          any internal or forward referenced group, because the offset is from
3327          regex. Temporarily terminate the pattern while doing this. */          the start of the whole regex. Temporarily terminate the pattern while
3328            doing this. */
3329    
3330          if (repeat_max <= 1)          if (repeat_max <= 1)
3331            {            {
3332            *code = OP_END;            *code = OP_END;
3333            adjust_recurse(previous, 1, utf8, cd);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3334            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3335            code++;            code++;
3336            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2741  for (;; ptr++) Line 3348  for (;; ptr++)
3348            {            {
3349            int offset;            int offset;
3350            *code = OP_END;            *code = OP_END;
3351            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3352            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
3353            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
3354            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2761  for (;; ptr++) Line 3368  for (;; ptr++)
3368        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
3369        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
3370        copies that we need. If we set a first char from the group, and didn't        copies that we need. If we set a first char from the group, and didn't
3371        set a required char, copy the latter from the former. */        set a required char, copy the latter from the former. If there are any
3372          forward reference subroutine calls in the group, there will be entries on
3373          the workspace list; replicate these with an appropriate increment. */
3374    
3375        else        else
3376          {          {
3377          if (repeat_min > 1)          if (repeat_min > 1)
3378            {            {
3379            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;            /* In the pre-compile phase, we don't actually do the replication. We
3380            for (i = 1; i < repeat_min; i++)            just adjust the length as if we had. */
3381    
3382              if (lengthptr != NULL)
3383                *lengthptr += (repeat_min - 1)*length_prevgroup;
3384    
3385              /* This is compiling for real */
3386    
3387              else
3388              {              {
3389              memcpy(code, previous, len);              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3390              code += len;              for (i = 1; i < repeat_min; i++)
3391                  {
3392                  uschar *hc;
3393                  uschar *this_hwm = cd->hwm;
3394                  memcpy(code, previous, len);
3395                  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3396                    {
3397                    PUT(cd->hwm, 0, GET(hc, 0) + len);
3398                    cd->hwm += LINK_SIZE;
3399                    }
3400                  save_hwm = this_hwm;
3401                  code += len;
3402                  }
3403              }              }
3404            }            }
3405    
3406          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
3407          }          }
3408    
# Line 2781  for (;; ptr++) Line 3410  for (;; ptr++)
3410        the maximum is limited, it replicates the group in a nested fashion,        the maximum is limited, it replicates the group in a nested fashion,
3411        remembering the bracket starts on a stack. In the case of a zero minimum,        remembering the bracket starts on a stack. In the case of a zero minimum,
3412        the first one was set up above. In all cases the repeat_max now specifies        the first one was set up above. In all cases the repeat_max now specifies
3413        the number of additional copies needed. */        the number of additional copies needed. Again, we must remember to
3414          replicate entries on the forward reference list. */
3415    
3416        if (repeat_max >= 0)        if (repeat_max >= 0)
3417          {          {
3418          for (i = repeat_max - 1; i >= 0; i--)          /* In the pre-compile phase, we don't actually do the replication. We
3419            just adjust the length as if we had. For each repetition we must add 1
3420            to the length for BRAZERO and for all but the last repetition we must
3421            add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3422    
3423            if (lengthptr != NULL && repeat_max > 0)
3424              *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3425                2 - 2*LINK_SIZE;  /* Last one doesn't nest */
3426    
3427            /* This is compiling for real */
3428    
3429            else for (i = repeat_max - 1; i >= 0; i--)
3430            {            {
3431              uschar *hc;
3432              uschar *this_hwm = cd->hwm;
3433    
3434            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
3435    
3436            /* All but the final copy start a new nesting, maintaining the            /* All but the final copy start a new nesting, maintaining the
# Line 2802  for (;; ptr++) Line 3446  for (;; ptr++)
3446              }              }
3447    
3448            memcpy(code, previous, len);            memcpy(code, previous, len);
3449              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3450                {
3451                PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3452                cd->hwm += LINK_SIZE;
3453                }
3454              save_hwm = this_hwm;
3455            code += len;            code += len;
3456            }            }
3457    
# Line 2824  for (;; ptr++) Line 3474  for (;; ptr++)
3474        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
3475        can't just offset backwards from the current code point, because we        can't just offset backwards from the current code point, because we
3476        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
3477        correct offset was computed above. */        correct offset was computed above.
3478    
3479        else code[-ketoffset] = OP_KETRMAX + repeat_type;        Then, when we are doing the actual compile phase, check to see whether
3480          this group is a non-atomic one that could match an empty string. If so,
3481          convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3482          that runtime checking can be done. [This check is also applied to
3483          atomic groups at runtime, but in a different way.] */
3484    
3485          else
3486            {
3487            uschar *ketcode = code - ketoffset;
3488            uschar *bracode = ketcode - GET(ketcode, 1);
3489            *ketcode = OP_KETRMAX + repeat_type;
3490            if (lengthptr == NULL && *bracode != OP_ONCE)
3491              {
3492              uschar *scode = bracode;
3493              do
3494                {
3495                if (could_be_empty_branch(scode, ketcode, utf8))
3496                  {
3497                  *bracode += OP_SBRA - OP_BRA;
3498                  break;
3499                  }
3500                scode += GET(scode, 1);
3501                }
3502              while (*scode == OP_ALT);
3503              }
3504            }
3505        }        }
3506    
3507      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 2837  for (;; ptr++) Line 3512  for (;; ptr++)
3512        goto FAILED;        goto FAILED;
3513        }        }
3514    
3515      /* If the character following a repeat is '+', we wrap the entire repeated      /* If the character following a repeat is '+', or if certain optimization
3516      item inside OP_ONCE brackets. This is just syntactic sugar, taken from      tests above succeeded, possessive_quantifier is TRUE. For some of the
3517      Sun's Java package. The repeated item starts at tempcode, not at previous,      simpler opcodes, there is a special alternative opcode for this. For
3518      which might be the first part of a string whose (former) last char we      anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3519      repeated. However, we don't support '+' after a greediness '?'. */      The '+' notation is just syntactic sugar, taken from Sun's Java package,
3520        but the special opcodes can optimize it a bit. The repeated item starts at
3521        tempcode, not at previous, which might be the first part of a string whose
3522        (former) last char we repeated.
3523    
3524        Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3525        an 'upto' may follow. We skip over an 'exact' item, and then test the
3526        length of what remains before proceeding. */
3527    
3528      if (possessive_quantifier)      if (possessive_quantifier)
3529        {        {
3530        int len = code - tempcode;        int len;
3531        memmove(tempcode + 1+LINK_SIZE, tempcode, len);        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3532        code += 1 + LINK_SIZE;            *tempcode == OP_NOTEXACT)
3533        len += 1 + LINK_SIZE;          tempcode += _pcre_OP_lengths[*tempcode];
3534        tempcode[0] = OP_ONCE;        len = code - tempcode;
3535        *code++ = OP_KET;        if (len > 0) switch (*tempcode)
3536        PUTINC(code, 0, len);          {
3537        PUT(tempcode, 1, len);          case OP_STAR:  *tempcode = OP_POSSTAR; break;
3538            case OP_PLUS:  *tempcode = OP_POSPLUS; break;
3539            case OP_QUERY: *tempcode = OP_POSQUERY; break;
3540            case OP_UPTO:  *tempcode = OP_POSUPTO; break;
3541    
3542            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
3543            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
3544            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3545            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
3546    
3547            case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
3548            case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
3549            case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3550            case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
3551    
3552            default:
3553            memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3554            code += 1 + LINK_SIZE;
3555            len += 1 + LINK_SIZE;
3556            tempcode[0] = OP_ONCE;
3557            *code++ = OP_KET;
3558            PUTINC(code, 0, len);
3559            PUT(tempcode, 1, len);
3560            break;
3561            }
3562        }        }
3563    
3564      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 2865  for (;; ptr++) Line 3571  for (;; ptr++)
3571      break;      break;
3572    
3573    
3574      /* Start of nested bracket sub-expression, or comment or lookahead or      /* ===================================================================*/
3575      lookbehind or option setting or condition. First deal with special things      /* Start of nested parenthesized sub-expression, or comment or lookahead or
3576      that can come after a bracket; all are introduced by ?, and the appearance      lookbehind or option setting or condition or all the other extended
3577      of any of them means that this is not a referencing group. They were      parenthesis forms. First deal with the specials; all are introduced by ?,
3578      checked for validity in the first pass over the string, so we don't have to      and the appearance of any of them means that this is not a capturing
3579      check for syntax errors here.  */      group. */
3580    
3581      case '(':      case '(':
3582      newoptions = options;      newoptions = options;
3583      skipbytes = 0;      skipbytes = 0;
3584        bravalue = OP_CBRA;
3585        save_hwm = cd->hwm;
3586    
3587      if (*(++ptr) == '?')      if (*(++ptr) == '?')
3588        {        {
3589        int set, unset;        int i, set, unset, namelen;
3590        int *optset;        int *optset;
3591          const uschar *name;
3592          uschar *slot;
3593    
3594        switch (*(++ptr))        switch (*(++ptr))
3595          {          {
3596          case '#':                 /* Comment; skip to ket */          case '#':                 /* Comment; skip to ket */
3597          ptr++;          ptr++;
3598          while (*ptr != ')') ptr++;          while (*ptr != 0 && *ptr != ')') ptr++;
3599            if (*ptr == 0)
3600              {
3601              *errorcodeptr = ERR18;
3602              goto FAILED;
3603              }
3604          continue;          continue;
3605    
3606          case ':':                 /* Non-extracting bracket */  
3607            /* ------------------------------------------------------------ */
3608            case ':':                 /* Non-capturing bracket */
3609          bravalue = OP_BRA;          bravalue = OP_BRA;
3610          ptr++;          ptr++;
3611          break;          break;
3612    
3613    
3614            /* ------------------------------------------------------------ */
3615          case '(':          case '(':
3616          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
3617    
3618          /* A condition can be a number, referring to a numbered group, a name,          /* A condition can be an assertion, a number (referring to a numbered
3619          referring to a named group, 'R', referring to recursion, or an          group), a name (referring to a named group), or 'R', referring to
3620          assertion. There are two unfortunate ambiguities, caused by history.          recursion. R<digits> and R&name are also permitted for recursion tests.
3621          (a) 'R' can be the recursive thing or the name 'R', and (b) a number  
3622          could be a name that consists of digits. In both cases, we look for a          There are several syntaxes for testing a named group: (?(name)) is used
3623          name first; if not found, we try the other cases. If the first          by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3624          character after (?( is a word character, we know the rest up to ) will  
3625          also be word characters because the syntax was checked in the first          There are two unfortunate ambiguities, caused by history. (a) 'R' can
3626          pass. */          be the recursive thing or the name 'R' (and similarly for 'R' followed
3627            by digits), and (b) a number could be a name that consists of digits.
3628          if ((cd->ctypes[ptr[1]] & ctype_word) != 0)          In both cases, we look for a name first; if not found, we try the other
3629            {          cases. */
3630            int i, namelen;  
3631            int condref = 0;          /* For conditions that are assertions, check the syntax, and then exit
3632            const uschar *name;          the switch. This will take control down to where bracketed groups,
3633            uschar *slot = cd->name_table;          including assertions, are processed. */
3634    
3635            /* This is needed for all successful cases. */          if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3636              break;
3637    
3638            skipbytes = 3;          /* Most other conditions use OP_CREF (a couple change to OP_RREF
3639            below), and all need to skip 3 bytes at the start of the group. */
3640    
3641            /* Read the name, but also get it as a number if it's all digits */          code[1+LINK_SIZE] = OP_CREF;
3642            skipbytes = 3;
3643            refsign = -1;
3644    
3645            name = ++ptr;          /* Check for a test for recursion in a named group. */
3646            while (*ptr != ')')  
3647              {          if (ptr[1] == 'R' && ptr[2] == '&')
3648              if (condref >= 0)            {
3649                condref = ((digitab[*ptr] & ctype_digit) != 0)?            terminator = -1;
3650                  condref * 10 + *ptr - '0' : -1;            ptr += 2;
3651              ptr++;            code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
3652              }            }
3653            namelen = ptr - name;  
3654            /* Check for a test for a named group's having been set, using the Perl
3655            syntax (?(<name>) or (?('name') */
3656    
3657            else if (ptr[1] == '<')
3658              {
3659              terminator = '>';
3660              ptr++;
3661              }
3662            else if (ptr[1] == '\'')
3663              {
3664              terminator = '\'';
3665            ptr++;            ptr++;
3666              }
3667            else
3668              {
3669              terminator = 0;
3670              if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3671              }
3672    
3673            for (i = 0; i < cd->names_found; i++)          /* We now expect to read a name; anything else is an error */
3674              {  
3675              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;          if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3676              slot += cd->name_entry_size;            {
3677              }            ptr += 1;  /* To get the right offset */
3678              *errorcodeptr = ERR28;
3679              goto FAILED;
3680              }
3681    
3682            /* Found a previous named subpattern */          /* Read the name, but also get it as a number if it's all digits */
3683    
3684            if (i < cd->names_found)          recno = 0;
3685              {          name = ++ptr;
3686              condref = GET2(slot, 0);          while ((cd->ctypes[*ptr] & ctype_word) != 0)
3687              code[1+LINK_SIZE] = OP_CREF;            {
3688              PUT2(code, 2+LINK_SIZE, condref);            if (recno >= 0)
3689              }              recno = ((digitab[*ptr] & ctype_digit) != 0)?
3690                  recno * 10 + *ptr - '0' : -1;
3691              ptr++;
3692              }
3693            namelen = ptr - name;
3694    
3695            /* Search the pattern for a forward reference */          if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3696              {
3697              ptr--;      /* Error offset */
3698              *errorcodeptr = ERR26;
3699              goto FAILED;
3700              }
3701    
3702            else if ((i = find_named_parens(ptr, *brackets, name, namelen)) > 0)          /* Do no further checking in the pre-compile phase. */
             {  
             code[1+LINK_SIZE] = OP_CREF;  
             PUT2(code, 2+LINK_SIZE, i);  
             }  
3703    
3704            /* Check for 'R' for recursion */          if (lengthptr != NULL) break;
3705    
3706            else if (namelen == 1 && *name == 'R')          /* In the real compile we do the work of looking for the actual
3707            reference. If the string started with "+" or "-" we require the rest to
3708            be digits, in which case recno will be set. */
3709    
3710            if (refsign > 0)
3711              {
3712              if (recno <= 0)
3713                {
3714                *errorcodeptr = ERR58;
3715                goto FAILED;
3716                }
3717              if (refsign == '-')
3718              {              {
3719              code[1+LINK_SIZE] = OP_CREF;              recno = cd->bracount - recno + 1;
3720              PUT2(code, 2+LINK_SIZE, CREF_RECURSE);              if (recno <= 0)
3721                  {
3722                  *errorcodeptr = ERR15;
3723                  goto FAILED;
3724                  }
3725              }              }
3726              else recno += cd->bracount;
3727              PUT2(code, 2+LINK_SIZE, recno);
3728              break;
3729              }
3730    
3731            /* Otherwise (did not start with "+" or "-"), start by looking for the
3732            name. */
3733    
3734            slot = cd->name_table;
3735            for (i = 0; i < cd->names_found; i++)
3736              {
3737              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3738              slot += cd->name_entry_size;
3739              }
3740    
3741            /* Check for a subpattern number */          /* Found a previous named subpattern */
3742    
3743            else if (condref > 0)          if (i < cd->names_found)
3744              {            {
3745              code[1+LINK_SIZE] = OP_CREF;            recno = GET2(slot, 0);
3746              PUT2(code, 2+LINK_SIZE, condref);            PUT2(code, 2+LINK_SIZE, recno);
3747              }            }
3748    
3749            /* Either an unidentified subpattern, or a reference to (?(0) */          /* Search the pattern for a forward reference */
3750    
3751            else          else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3752                            (options & PCRE_EXTENDED) != 0)) > 0)
3753              {
3754              PUT2(code, 2+LINK_SIZE, i);
3755              }
3756    
3757            /* If terminator == 0 it means that the name followed directly after
3758            the opening parenthesis [e.g. (?(abc)...] and in this case there are
3759            some further alternatives to try. For the cases where terminator != 0
3760            [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
3761            now checked all the possibilities, so give an error. */
3762    
3763            else if (terminator != 0)
3764              {
3765              *errorcodeptr = ERR15;
3766              goto FAILED;
3767              }
3768    
3769            /* Check for (?(R) for recursion. Allow digits after R to specify a
3770            specific group number. */
3771    
3772            else if (*name == 'R')
3773              {
3774              recno = 0;
3775              for (i = 1; i < namelen; i++)
3776              {              {
3777              *errorcodeptr = (condref == 0)? ERR35: ERR15;              if ((digitab[name[i]] & ctype_digit) == 0)
3778              goto FAILED;                {
3779                  *errorcodeptr = ERR15;
3780                  goto FAILED;
3781                  }
3782                recno = recno * 10 + name[i] - '0';
3783              }              }
3784              if (recno == 0) recno = RREF_ANY;
3785              code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
3786              PUT2(code, 2+LINK_SIZE, recno);
3787              }
3788    
3789            /* Similarly, check for the (?(DEFINE) "condition", which is always
3790            false. */
3791    
3792            else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
3793              {
3794              code[1+LINK_SIZE] = OP_DEF;
3795              skipbytes = 1;
3796              }
3797    
3798            /* Check for the "name" actually being a subpattern number. */
3799    
3800            else if (recno > 0)
3801              {
3802              PUT2(code, 2+LINK_SIZE, recno);
3803            }            }
3804    
3805          /* For conditions that are assertions, we just fall through, having          /* Either an unidentified subpattern, or a reference to (?(0) */
         set bravalue above. */  
3806    
3807            else
3808              {
3809              *errorcodeptr = (recno == 0)? ERR35: ERR15;
3810              goto FAILED;
3811              }
3812          break;          break;
3813    
3814    
3815            /* ------------------------------------------------------------ */
3816          case '=':                 /* Positive lookahead */          case '=':                 /* Positive lookahead */
3817          bravalue = OP_ASSERT;          bravalue = OP_ASSERT;
3818          ptr++;          ptr++;
3819          break;          break;
3820    
3821    
3822            /* ------------------------------------------------------------ */
3823          case '!':                 /* Negative lookahead */          case '!':                 /* Negative lookahead */
3824          bravalue = OP_ASSERT_NOT;          bravalue = OP_ASSERT_NOT;
3825          ptr++;          ptr++;
3826          break;          break;
3827    
3828          case '<':                 /* Lookbehinds */  
3829          switch (*(++ptr))          /* ------------------------------------------------------------ */
3830            case '<':                 /* Lookbehind or named define */
3831            switch (ptr[1])
3832            {            {
3833            case '=':               /* Positive lookbehind */            case '=':               /* Positive lookbehind */
3834            bravalue = OP_ASSERTBACK;            bravalue = OP_ASSERTBACK;
3835            ptr++;            ptr += 2;
3836            break;            break;
3837    
3838            case '!':               /* Negative lookbehind */            case '!':               /* Negative lookbehind */
3839            bravalue = OP_ASSERTBACK_NOT;            bravalue = OP_ASSERTBACK_NOT;
3840            ptr++;            ptr += 2;
3841            break;            break;
3842    
3843              default:                /* Could be name define, else bad */
3844              if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
3845              ptr++;                  /* Correct offset for error */
3846              *errorcodeptr = ERR24;
3847              goto FAILED;
3848            }            }
3849          break;          break;
3850    
3851    
3852            /* ------------------------------------------------------------ */
3853          case '>':                 /* One-time brackets */          case '>':                 /* One-time brackets */
3854          bravalue = OP_ONCE;          bravalue = OP_ONCE;
3855          ptr++;          ptr++;
3856          break;          break;
3857    
3858    
3859            /* ------------------------------------------------------------ */
3860          case 'C':                 /* Callout - may be followed by digits; */          case 'C':                 /* Callout - may be followed by digits; */
3861          previous_callout = code;  /* Save for later completion */          previous_callout = code;  /* Save for later completion */
3862          after_manual_callout = 1; /* Skip one item before completing */          after_manual_callout = 1; /* Skip one item before completing */
3863          *code++ = OP_CALLOUT;     /* Already checked that the terminating */          *code++ = OP_CALLOUT;
3864            {                       /* closing parenthesis is present. */            {
3865            int n = 0;            int n = 0;
3866            while ((digitab[*(++ptr)] & ctype_digit) != 0)            while ((digitab[*(++ptr)] & ctype_digit) != 0)
3867              n = n * 10 + *ptr - '0';              n = n * 10 + *ptr - '0';
3868              if (*ptr != ')')
3869                {
3870                *errorcodeptr = ERR39;
3871                goto FAILED;
3872                }
3873            if (n > 255)            if (n > 255)
3874              {              {
3875              *errorcodeptr = ERR38;              *errorcodeptr = ERR38;
# Line 3034  for (;; ptr++) Line 3883  for (;; ptr++)
3883          previous = NULL;          previous = NULL;
3884          continue;          continue;
3885    
3886          case 'P':                 /* Named subpattern handling */  
3887          if (*(++ptr) == '<')      /* Definition */          /* ------------------------------------------------------------ */
3888            case 'P':                 /* Python-style named subpattern handling */
3889            if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
3890              {
3891              is_recurse = *ptr == '>';
3892              terminator = ')';
3893              goto NAMED_REF_OR_RECURSE;
3894              }
3895            else if (*ptr != '<')    /* Test for Python-style definition */
3896              {
3897              *errorcodeptr = ERR41;
3898              goto FAILED;
3899              }
3900            /* Fall through to handle (?P< as (?< is handled */
3901    
3902    
3903            /* ------------------------------------------------------------ */
3904            DEFINE_NAME:    /* Come here from (?< handling */
3905            case '\'':
3906            {            {
3907            int i, namelen;            terminator = (*ptr == '<')? '>' : '\'';
3908            uschar *slot = cd->name_table;            name = ++ptr;
           const uschar *name;     /* Don't amalgamate; some compilers */  
           name = ++ptr;           /* grumble at autoincrement in declaration */  
3909    
3910            while (*ptr++ != '>');            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3911            namelen = ptr - name - 1;            namelen = ptr - name;
3912    
3913            for (i = 0; i < cd->names_found; i++)            /* In the pre-compile phase, just do a syntax check. */
3914    
3915              if (lengthptr != NULL)
3916              {              {
3917              int crc = memcmp(name, slot+2, namelen);              if (*ptr != terminator)
             if (crc == 0)  
3918                {                {
3919                if (slot[2+namelen] == 0)                *errorcodeptr = ERR42;
3920                  goto FAILED;
3921                  }
3922                if (cd->names_found >= MAX_NAME_COUNT)
3923                  {
3924                  *errorcodeptr = ERR49;
3925                  goto FAILED;
3926                  }
3927                if (namelen + 3 > cd->name_entry_size)
3928                  {
3929                  cd->name_entry_size = namelen + 3;
3930                  if (namelen > MAX_NAME_SIZE)
3931                  {                  {
3932                  if ((options & PCRE_DUPNAMES) == 0)                  *errorcodeptr = ERR48;
3933                    {                  goto FAILED;
                   *errorcodeptr = ERR43;  
                   goto FAILED;  
                   }  
3934                  }                  }
               else crc = -1;      /* Current name is substring */  
3935                }                }
3936              if (crc < 0)              }
3937    
3938              /* In the real compile, create the entry in the table */
3939    
3940              else
3941                {
3942                slot = cd->name_table;
3943                for (i = 0; i < cd->names_found; i++)
3944                {                {
3945                memmove(slot + cd->name_entry_size, slot,                int crc = memcmp(name, slot+2, namelen);
3946                  (cd->names_found - i) * cd->name_entry_size);                if (crc == 0)
3947                break;                  {
3948                    if (slot[2+namelen] == 0)
3949                      {
3950                      if ((options & PCRE_DUPNAMES) == 0)
3951                        {
3952                        *errorcodeptr = ERR43;
3953                        goto FAILED;
3954                        }
3955                      }
3956                    else crc = -1;      /* Current name is substring */
3957                    }
3958                  if (crc < 0)
3959                    {
3960                    memmove(slot + cd->name_entry_size, slot,
3961                      (cd->names_found - i) * cd->name_entry_size);
3962                    break;
3963                    }
3964                  slot += cd->name_entry_size;
3965                }                }
             slot += cd->name_entry_size;  
             }  
3966    
3967            PUT2(slot, 0, *brackets + 1);              PUT2(slot, 0, cd->bracount + 1);
3968            memcpy(slot + 2, name, namelen);              memcpy(slot + 2, name, namelen);
3969            slot[2+namelen] = 0;              slot[2+namelen] = 0;
3970            cd->names_found++;              }
           goto NUMBERED_GROUP;  
3971            }            }
3972    
3973          if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */          /* In both cases, count the number of names we've encountered. */
           {  
           int i, namelen;  
           int type = *ptr++;  
           const uschar *name = ptr;  
           uschar *slot = cd->name_table;  
3974    
3975            while (*ptr != ')') ptr++;          ptr++;                    /* Move past > or ' */
3976            namelen = ptr - name;          cd->names_found++;
3977            goto NUMBERED_GROUP;
3978    
           for (i = 0; i < cd->names_found; i++)  
             {  
             if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;  
             slot += cd->name_entry_size;  
             }  
3979    
3980            if (i < cd->names_found)         /* Back reference */          /* ------------------------------------------------------------ */
3981              {          case '&':                 /* Perl recursion/subroutine syntax */
3982            terminator = ')';
3983            is_recurse = TRUE;
3984            /* Fall through */
3985    
3986            /* We come here from the Python syntax above that handles both
3987            references (?P=name) and recursion (?P>name), as well as falling
3988            through from the Perl recursion syntax (?&name). */
3989    
3990            NAMED_REF_OR_RECURSE:
3991            name = ++ptr;
3992            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3993            namelen = ptr - name;
3994    
3995            /* In the pre-compile phase, do a syntax check and set a dummy
3996            reference number. */
3997    
3998            if (lengthptr != NULL)
3999              {
4000              if (*ptr != terminator)
4001                {
4002                *errorcodeptr = ERR42;
4003                goto FAILED;
4004                }
4005              if (namelen > MAX_NAME_SIZE)
4006                {
4007                *errorcodeptr = ERR48;
4008                goto FAILED;
4009                }
4010              recno = 0;
4011              }
4012    
4013            /* In the real compile, seek the name in the table */
4014    
4015            else
4016              {
4017              slot = cd->name_table;
4018              for (i = 0; i < cd->names_found; i++)
4019                {
4020                if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4021                slot += cd->name_entry_size;
4022                }
4023    
4024              if (i < cd->names_found)         /* Back reference */
4025                {
4026              recno = GET2(slot, 0);              recno = GET2(slot, 0);
4027              }              }
4028            else if ((recno =                /* Forward back reference */            else if ((recno =                /* Forward back reference */
4029                      find_named_parens(ptr, *brackets, name, namelen)) <= 0)                      find_parens(ptr, cd->bracount, name, namelen,
4030                          (options & PCRE_EXTENDED) != 0)) <= 0)
4031              {              {
4032              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
4033              goto FAILED;              goto FAILED;
4034              }              }
4035              }
4036    
4037            if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */          /* In both phases, we can now go to the code that handles numerical
4038            recursion or backreferences. */
           /* Back reference */  
4039    
4040            previous = code;          if (is_recurse) goto HANDLE_RECURSION;
4041            *code++ = OP_REF;            else goto HANDLE_REFERENCE;
           PUT2INC(code, 0, recno);  
           cd->backref_map |= (recno < 32)? (1 << recno) : 1;  
           if (recno > cd->top_backref) cd->top_backref = recno;  
           continue;  
           }  
4042    
         /* Should never happen */  
         break;  
4043    
4044          case 'R':                 /* Pattern recursion */          /* ------------------------------------------------------------ */
4045            case 'R':                 /* Recursion */
4046          ptr++;                    /* Same as (?0)      */          ptr++;                    /* Same as (?0)      */
4047          /* Fall through */          /* Fall through */
4048    
         /* Recursion or "subroutine" call */  
4049    
4050          case '0': case '1': case '2': case '3': case '4':          /* ------------------------------------------------------------ */
4051          case '5': case '6': case '7': case '8': case '9':          case '-': case '+':
4052            case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4053            case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4054            {            {
4055            const uschar *called;            const uschar *called;
4056    
4057              if ((refsign = *ptr) == '+') ptr++;
4058              else if (refsign == '-')
4059                {
4060                if ((digitab[ptr[1]] & ctype_digit) == 0)
4061                  goto OTHER_CHAR_AFTER_QUERY;
4062                ptr++;
4063                }
4064    
4065            recno = 0;            recno = 0;
4066            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4067              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4068    
4069              if (*ptr != ')')
4070                {
4071                *errorcodeptr = ERR29;
4072                goto FAILED;
4073                }
4074    
4075              if (refsign == '-')
4076                {
4077                if (recno == 0)
4078                  {
4079                  *errorcodeptr = ERR58;
4080                  goto FAILED;
4081                  }
4082                recno = cd->bracount - recno + 1;
4083                if (recno <= 0)
4084                  {
4085                  *errorcodeptr = ERR15;
4086                  goto FAILED;
4087                  }
4088                }
4089              else if (refsign == '+')
4090                {
4091                if (recno == 0)
4092                  {
4093                  *errorcodeptr = ERR58;
4094                  goto FAILED;
4095                  }
4096                recno += cd->bracount;
4097                }
4098    
4099            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
4100    
4101            HANDLE_RECURSION:            HANDLE_RECURSION:
4102    
4103            previous = code;            previous = code;
4104              called = cd->start_code;
4105    
4106            /* Find the bracket that is being referenced. Temporarily end the            /* When we are actually compiling, find the bracket that is being
4107            regex in case it doesn't exist. */            referenced. Temporarily end the regex in case it doesn't exist before
4108              this point. If we end up with a forward reference, first check that
4109              the bracket does occur later so we can give the error (and position)
4110              now. Then remember this forward reference in the workspace so it can
4111              be filled in at the end. */
4112    
4113            *code = OP_END;            if (lengthptr == NULL)
           called = (recno == 0)? cd->start_code :  
             find_bracket(cd->start_code, utf8, recno);  
           if (called == NULL)  
4114              {              {
4115              *errorcodeptr = ERR15;              *code = OP_END;
4116              goto FAILED;              if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
             }  
4117    
4118            /* If the subpattern is still open, this is a recursive call. We              /* Forward reference */
           check to see if this is a left recursion that could loop for ever,  
           and diagnose that case. */  
4119    
4120            if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))              if (called == NULL)
4121              {                {
4122              *errorcodeptr = ERR40;                if (find_parens(ptr, cd->bracount, NULL, recno,
4123              goto FAILED;                     (options & PCRE_EXTENDED) != 0) < 0)
4124                    {
4125                    *errorcodeptr = ERR15;
4126                    goto FAILED;
4127                    }
4128                  called = cd->start_code + recno;
4129                  PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4130                  }
4131    
4132                /* If not a forward reference, and the subpattern is still open,
4133                this is a recursive call. We check to see if this is a left
4134                recursion that could loop for ever, and diagnose that case. */
4135    
4136                else if (GET(called, 1) == 0 &&
4137                         could_be_empty(called, code, bcptr, utf8))
4138                  {
4139                  *errorcodeptr = ERR40;
4140                  goto FAILED;
4141                  }
4142              }              }
4143    
4144            /* Insert the recursion/subroutine item, automatically wrapped inside            /* Insert the recursion/subroutine item, automatically wrapped inside
4145            "once" brackets. */            "once" brackets. Set up a "previous group" length so that a
4146              subsequent quantifier will work. */
4147    
4148            *code = OP_ONCE;            *code = OP_ONCE;
4149            PUT(code, 1, 2 + 2*LINK_SIZE);            PUT(code, 1, 2 + 2*LINK_SIZE);
# Line 3174  for (;; ptr++) Line 4156  for (;; ptr++)
4156            *code = OP_KET;            *code = OP_KET;
4157            PUT(code, 1, 2 + 2*LINK_SIZE);            PUT(code, 1, 2 + 2*LINK_SIZE);
4158            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
4159    
4160              length_prevgroup = 3 + 3*LINK_SIZE;
4161            }            }
4162    
4163            /* Can't determine a first byte now */
4164    
4165            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4166          continue;          continue;
4167    
         /* Character after (? not specially recognized */  
4168    
4169          default:                  /* Option setting */          /* ------------------------------------------------------------ */
4170            default:              /* Other characters: check option setting */
4171            OTHER_CHAR_AFTER_QUERY:
4172          set = unset = 0;          set = unset = 0;
4173          optset = &set;          optset = &set;
4174    
# Line 3189  for (;; ptr++) Line 4178  for (;; ptr++)
4178              {              {
4179              case '-': optset = &unset; break;              case '-': optset = &unset; break;
4180    
4181                case 'J':    /* Record that it changed in the external options */
4182                *optset |= PCRE_DUPNAMES;
4183                cd->external_options |= PCRE_JCHANGED;
4184                break;
4185    
4186              case 'i': *optset |= PCRE_CASELESS; break;              case 'i': *optset |= PCRE_CASELESS; break;
             case 'J': *optset |= PCRE_DUPNAMES; break;  
4187              case 'm': *optset |= PCRE_MULTILINE; break;              case 'm': *optset |= PCRE_MULTILINE; break;
4188              case 's': *optset |= PCRE_DOTALL; break;              case 's': *optset |= PCRE_DOTALL; break;
4189              case 'x': *optset |= PCRE_EXTENDED; break;              case 'x': *optset |= PCRE_EXTENDED; break;
4190              case 'U': *optset |= PCRE_UNGREEDY; break;              case 'U': *optset |= PCRE_UNGREEDY; break;
4191              case 'X': *optset |= PCRE_EXTRA; break;              case 'X': *optset |= PCRE_EXTRA; break;
4192    
4193                default:  *errorcodeptr = ERR12;
4194                          ptr--;    /* Correct the offset */
4195                          goto FAILED;
4196              }              }
4197            }            }
4198    
# Line 3204  for (;; ptr++) Line 4201  for (;; ptr++)
4201          newoptions = (options | set) & (~unset);          newoptions = (options | set) & (~unset);
4202    
4203          /* If the options ended with ')' this is not the start of a nested          /* If the options ended with ')' this is not the start of a nested
4204          group with option changes, so the options change at this level. Compile          group with option changes, so the options change at this level. If this
4205          code to change the ims options if this setting actually changes any of          item is right at the start of the pattern, the options can be
4206          them. We also pass the new setting back so that it can be put at the          abstracted and made external in the pre-compile phase, and ignored in
4207          start of any following branches, and when this group ends (if we are in          the compile phase. This can be helpful when matching -- for instance in
4208          a group), a resetting item can be compiled.          caseless checking of required bytes.
4209    
4210          Note that if this item is right at the start of the pattern, the          If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4211          options will have been abstracted and made global, so there will be no          definitely *not* at the start of the pattern because something has been
4212          change to compile. */          compiled. In the pre-compile phase, however, the code pointer can have
4213            that value after the start, because it gets reset as code is discarded
4214            during the pre-compile. However, this can happen only at top level - if
4215            we are within parentheses, the starting BRA will still be present. At
4216            any parenthesis level, the length value can be used to test if anything
4217            has been compiled at that level. Thus, a test for both these conditions
4218            is necessary to ensure we correctly detect the start of the pattern in
4219            both phases.
4220    
4221            If we are not at the pattern start, compile code to change the ims
4222            options if this setting actually changes any of them. We also pass the
4223            new setting back so that it can be put at the start of any following
4224            branches, and when this group ends (if we are in a group), a resetting
4225            item can be compiled. */
4226    
4227          if (*ptr == ')')          if (*ptr == ')')
4228            {            {
4229            if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))            if (code == cd->start_code + 1 + LINK_SIZE &&
4230                   (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4231              {              {
4232              *code++ = OP_OPT;              cd->external_options = newoptions;
4233              *code++ = newoptions & PCRE_IMS;              options = newoptions;
4234              }              }
4235             else
4236                {
4237                if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4238                  {
4239                  *code++ = OP_OPT;
4240                  *code++ = newoptions & PCRE_IMS;
4241                  }
4242    
4243            /* Change options at this level, and pass them back for use              /* Change options at this level, and pass them back for use
4244            in subsequent branches. Reset the greedy defaults and the case              in subsequent branches. Reset the greedy defaults and the case
4245            value for firstbyte and reqbyte. */              value for firstbyte and reqbyte. */
4246    
4247            *optionsptr = options = newoptions;              *optionsptr = options = newoptions;
4248            greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4249            greedy_non_default = greedy_default ^ 1;              greedy_non_default = greedy_default ^ 1;
4250            req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;              req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4251                }
4252    
4253            previous = NULL;       /* This item can't be repeated */            previous = NULL;       /* This item can't be repeated */
4254            continue;              /* It is complete */            continue;              /* It is complete */
# Line 3242  for (;; ptr++) Line 4261  for (;; ptr++)
4261    
4262          bravalue = OP_BRA;          bravalue = OP_BRA;
4263          ptr++;          ptr++;
4264          }          }     /* End of switch for character following (? */
4265        }        }       /* End of (? handling */
4266    
4267      /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become      /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4268      non-capturing and behave like (?:...) brackets */      all unadorned brackets become non-capturing and behave like (?:...)
4269        brackets. */
4270    
4271      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4272        {        {
4273        bravalue = OP_BRA;        bravalue = OP_BRA;
4274        }        }
4275    
4276      /* Else we have a referencing group; adjust the opcode. If the bracket      /* Else we have a capturing group. */
     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and  
     arrange for the true number to follow later, in an OP_BRANUMBER item. */  
4277    
4278      else      else
4279        {        {
4280        NUMBERED_GROUP:        NUMBERED_GROUP:
4281        if (++(*brackets) > EXTRACT_BASIC_MAX)        cd->bracount += 1;
4282          {        PUT2(code, 1+LINK_SIZE, cd->bracount);
4283          bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;        skipbytes = 2;
         code[1+LINK_SIZE] = OP_BRANUMBER;  
         PUT2(code, 2+LINK_SIZE, *brackets);  
         skipbytes = 3;  
         }  
       else bravalue = OP_BRA + *brackets;  
4284        }        }
4285    
4286      /* Process nested bracketed re. Assertions may not be repeated, but other      /* Process nested bracketed regex. Assertions may not be repeated, but
4287      kinds can be. We copy code into a non-register variable in order to be able      other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4288      to pass its address because some compilers complain otherwise. Pass in a      non-register variable in order to be able to pass its address because some
4289      new setting for the ims options if they have changed. */      compilers complain otherwise. Pass in a new setting for the ims options if
4290        they have changed. */
4291    
4292      previous = (bravalue >= OP_ONCE)? code : NULL;      previous = (bravalue >= OP_ONCE)? code : NULL;
4293      *code = bravalue;      *code = bravalue;
4294      tempcode = code;      tempcode = code;
4295      tempreqvary = cd->req_varyopt;     /* Save value before bracket */      tempreqvary = cd->req_varyopt;     /* Save value before bracket */
4296        length_prevgroup = 0;              /* Initialize for pre-compile phase */
4297    
4298      if (!compile_regex(      if (!compile_regex(
4299           newoptions,                   /* The complete new option state */           newoptions,                   /* The complete new option state */
4300           options & PCRE_IMS,           /* The previous ims option state */           options & PCRE_IMS,           /* The previous ims option state */
          brackets,                     /* Extracting bracket count */  
4301           &tempcode,                    /* Where to put code (updated) */           &tempcode,                    /* Where to put code (updated) */
4302           &ptr,                         /* Input pointer (updated) */           &ptr,                         /* Input pointer (updated) */
4303           errorcodeptr,                 /* Where to put an error message */           errorcodeptr,                 /* Where to put an error message */
4304           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
4305            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4306           skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */           skipbytes,                    /* Skip over bracket number */
4307           &subfirstbyte,                /* For possible first char */           &subfirstbyte,                /* For possible first char */
4308           &subreqbyte,                  /* For possible last char */           &subreqbyte,                  /* For possible last char */
4309           bcptr,                        /* Current branch chain */           bcptr,                        /* Current branch chain */
4310           cd))                          /* Tables block */           cd,                           /* Tables block */
4311             (lengthptr == NULL)? NULL :   /* Actual compile phase */
4312               &length_prevgroup           /* Pre-compile phase */
4313             ))
4314        goto FAILED;        goto FAILED;
4315    
4316      /* At the end of compiling, code is still pointing to the start of the      /* At the end of compiling, code is still pointing to the start of the
# Line 3302  for (;; ptr++) Line 4319  for (;; ptr++)
4319      is on the bracket. */      is on the bracket. */
4320    
4321      /* If this is a conditional bracket, check that there are no more than      /* If this is a conditional bracket, check that there are no more than
4322      two branches in the group. */      two branches in the group, or just one if it's a DEFINE group. */
4323    
4324      else if (bravalue == OP_COND)      if (bravalue == OP_COND)
4325        {        {
4326        uschar *tc = code;        uschar *tc = code;
4327        int condcount = 0;        int condcount = 0;
# Line 3315  for (;; ptr++) Line 4332  for (;; ptr++)
4332           }           }
4333        while (*tc != OP_KET);        while (*tc != OP_KET);
4334    
4335        if (condcount > 2)        /* A DEFINE group is never obeyed inline (the "condition" is always
4336          false). It must have only one branch. */
4337    
4338          if (code[LINK_SIZE+1] == OP_DEF)
4339          {          {
4340          *errorcodeptr = ERR27;          if (condcount > 1)
4341          goto FAILED;            {
4342              *errorcodeptr = ERR54;
4343              goto FAILED;
4344              }
4345            bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
4346            }
4347    
4348          /* A "normal" conditional group. If there is just one branch, we must not
4349          make use of its firstbyte or reqbyte, because this is equivalent to an
4350          empty second branch. */
4351    
4352          else
4353            {
4354            if (condcount > 2)
4355              {
4356              *errorcodeptr = ERR27;
4357              goto FAILED;
4358              }
4359            if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4360          }          }
4361          }
4362    
4363        /* Error if hit end of pattern */
4364    
4365        /* If there is just one branch, we must not make use of its firstbyte or      if (*ptr != ')')
4366        reqbyte, because this is equivalent to an empty second branch. */        {
4367          *errorcodeptr = ERR14;
4368          goto FAILED;
4369          }
4370    
4371        if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;      /* In the pre-compile phase, update the length by the length of the nested
4372        group, less the brackets at either end. Then reduce the compiled code to
4373        just the brackets so that it doesn't use much memory if it is duplicated by
4374        a quantifier. */
4375    
4376        if (lengthptr != NULL)
4377          {
4378          *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4379          code++;
4380          PUTINC(code, 0, 1 + LINK_SIZE);
4381          *code++ = OP_KET;
4382          PUTINC(code, 0, 1 + LINK_SIZE);
4383        }        }
4384    
4385      /* Handle updating of the required and first characters. Update for normal      /* Otherwise update the main code pointer to the end of the group. */
4386      brackets of all kinds, and conditions with two branches (see code above).  
4387      If the bracket is followed by a quantifier with zero repeat, we have to      else code = tempcode;
4388      back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the  
4389      main loop so that they can be accessed for the back off. */      /* For a DEFINE group, required and first character settings are not
4390        relevant. */
4391    
4392        if (bravalue == OP_DEF) break;
4393    
4394        /* Handle updating of the required and first characters for other types of
4395        group. Update for normal brackets of all kinds, and conditions with two
4396        branches (see code above). If the bracket is followed by a quantifier with
4397        zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4398        zerofirstbyte outside the main loop so that they can be accessed for the
4399        back off. */
4400    
4401      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
4402      zerofirstbyte = firstbyte;      zerofirstbyte = firstbyte;
4403      groupsetfirstbyte = FALSE;      groupsetfirstbyte = FALSE;
4404    
4405      if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)      if (bravalue >= OP_ONCE)
4406        {        {
4407        /* If we have not yet set a firstbyte in this branch, take it from the        /* If we have not yet set a firstbyte in this branch, take it from the
4408        subpattern, remembering that it was set here so that a repeat of more        subpattern, remembering that it was set here so that a repeat of more
# Line 3378  for (;; ptr++) Line 4443  for (;; ptr++)
4443      firstbyte, looking for an asserted first char. */      firstbyte, looking for an asserted first char. */
4444    
4445      else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;      else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4446        break;     /* End of processing '(' */
4447    
     /* Now update the main code pointer to the end of the group. */  
   
     code = tempcode;  
   
     /* Error if hit end of pattern */  
   
     if (*ptr != ')')  
       {  
       *errorcodeptr = ERR14;  
       goto FAILED;  
       }  
     break;  
   
     /* Check \ for being a real metacharacter; if not, fall through and handle  
     it as a data character at the start of a string. Escape items are checked  
     for validity in the pre-compiling pass. */  
4448    
4449      case '\\':      /* ===================================================================*/
4450      tempptr = ptr;      /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
     c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);  
   
     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values  
4451      are arranged to be the negation of the corresponding OP_values. For the      are arranged to be the negation of the corresponding OP_values. For the
4452      back references, the values are ESC_REF plus the reference number. Only      back references, the values are ESC_REF plus the reference number. Only
4453      back references and those types that consume a character may be repeated.      back references and those types that consume a character may be repeated.
4454      We can test for values between ESC_b and ESC_Z for the latter; this may      We can test for values between ESC_b and ESC_Z for the latter; this may
4455      have to change if any new ones are ever created. */      have to change if any new ones are ever created. */
4456    
4457        case '\\':
4458        tempptr = ptr;
4459        c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4460        if (*errorcodeptr != 0) goto FAILED;
4461    
4462      if (c < 0)      if (c < 0)
4463        {        {
4464        if (-c == ESC_Q)            /* Handle start of quoted string */        if (-c == ESC_Q)            /* Handle start of quoted string */
# Line 3416  for (;; ptr++) Line 4468  for (;; ptr++)
4468          continue;          continue;
4469          }          }
4470    
4471          if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
4472    
4473        /* For metasequences that actually match a character, we disable the        /* For metasequences that actually match a character, we disable the
4474        setting of a first character if it hasn't already been set. */        setting of a first character if it hasn't already been set. */
4475    
# Line 3427  for (;; ptr++) Line 4481  for (;; ptr++)
4481        zerofirstbyte = firstbyte;        zerofirstbyte = firstbyte;
4482        zeroreqbyte = reqbyte;        zeroreqbyte = reqbyte;
4483    
4484        /* Back references are handled specially */        /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4485          We also support \k{name} (.NET syntax) */
4486    
4487          if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
4488            {
4489            is_recurse = FALSE;
4490            terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
4491            goto NAMED_REF_OR_RECURSE;
4492            }
4493    
4494          /* Back references are handled specially; must disable firstbyte if
4495          not set to cope with cases like (?=(\w+))\1: which would otherwise set
4496          ':' later. */
4497    
4498        if (-c >= ESC_REF)        if (-c >= ESC_REF)
4499          {          {
4500          int number = -c - ESC_REF;          recno = -c - ESC_REF;
4501    
4502            HANDLE_REFERENCE:    /* Come here from named backref handling */
4503            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4504          previous = code;          previous = code;
4505          *code++ = OP_REF;          *code++ = OP_REF;
4506          PUT2INC(code, 0, number);          PUT2INC(code, 0, recno);
4507            cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4508            if (recno > cd->top_backref) cd->top_backref = recno;
4509          }          }
4510    
4511        /* So are Unicode property matches, if supported. We know that get_ucp        /* So are Unicode property matches, if supported. */
       won't fail because it was tested in the pre-pass. */  
4512    
4513  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4514        else if (-c == ESC_P || -c == ESC_p)        else if (-c == ESC_P || -c == ESC_p)
# Line 3446  for (;; ptr++) Line 4516  for (;; ptr++)
4516          BOOL negated;          BOOL negated;
4517          int pdata;          int pdata;
4518          int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);          int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4519            if (ptype < 0) goto FAILED;
4520          previous = code;          previous = code;
4521          *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;          *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4522          *code++ = ptype;          *code++ = ptype;
4523          *code++ = pdata;          *code++ = pdata;
4524          }          }
4525    #else
4526    
4527          /* If Unicode properties are not supported, \X, \P, and \p are not
4528          allowed. */
4529    
4530          else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4531            {
4532            *errorcodeptr = ERR45;
4533            goto FAILED;
4534            }
4535  #endif  #endif
4536    
4537        /* For the rest, we can obtain the OP value by negating the escape        /* For the rest (including \X when Unicode properties are supported), we
4538        value */        can obtain the OP value by negating the escape value. */
4539    
4540        else        else
4541          {          {
# Line 3478  for (;; ptr++) Line 4559  for (;; ptr++)
4559       mcbuffer[0] = c;       mcbuffer[0] = c;
4560       mclength = 1;       mclength = 1;
4561       }       }
   
4562      goto ONE_CHAR;      goto ONE_CHAR;
4563    
4564    
4565        /* ===================================================================*/
4566      /* Handle a literal character. It is guaranteed not to be whitespace or #      /* Handle a literal character. It is guaranteed not to be whitespace or #
4567      when the extended flag is set. If we are in UTF-8 mode, it may be a      when the extended flag is set. If we are in UTF-8 mode, it may be a
4568      multi-byte literal character. */      multi-byte literal character. */
# Line 3491  for (;; ptr++) Line 4573  for (;; ptr++)
4573      mcbuffer[0] = c;      mcbuffer[0] = c;
4574    
4575  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
4576      if (utf8 && (c & 0xc0) == 0xc0)      if (utf8 && c >= 0xc0)
4577        {        {
4578        while ((ptr[1] & 0xc0) == 0x80)        while ((ptr[1] & 0xc0) == 0x80)
4579          mcbuffer[mclength++] = *(++ptr);          mcbuffer[mclength++] = *(++ptr);
# Line 3542  for (;; ptr++) Line 4624  for (;; ptr++)
4624      }      }
4625    }                   /* end of big loop */    }                   /* end of big loop */
4626    
4627    
4628  /* Control never reaches here by falling through, only by a goto for all the  /* Control never reaches here by falling through, only by a goto for all the
4629  error states. Pass back the position in the pattern so that it can be displayed  error states. Pass back the position in the pattern so that it can be displayed
4630  to the user for diagnosing the error. */  to the user for diagnosing the error. */
# Line 3558  return FALSE; Line 4641  return FALSE;
4641  *     Compile sequence of alternatives           *  *     Compile sequence of alternatives           *
4642  *************************************************/  *************************************************/
4643    
4644  /* On entry, ptr is pointing past the bracket character, but on return  /* On entry, ptr is pointing past the bracket character, but on return it
4645  it points to the closing bracket, or vertical bar, or end of string.  points to the closing bracket, or vertical bar, or end of string. The code
4646  The code variable is pointing at the byte into which the BRA operator has been  variable is pointing at the byte into which the BRA operator has been stored.
4647  stored. If the ims options are changed at the start (for a (?ims: group) or  If the ims options are changed at the start (for a (?ims: group) or during any
4648  during any branch, we need to insert an OP_OPT item at the start of every  branch, we need to insert an OP_OPT item at the start of every following branch
4649  following branch to ensure they get set correctly at run time, and also pass  to ensure they get set correctly at run time, and also pass the new options
4650  the new options into every subsequent branch compile.  into every subsequent branch compile.
4651    
4652    This function is used during the pre-compile phase when we are trying to find
4653    out the amount of memory needed, as well as during the real compile phase. The
4654    value of lengthptr distinguishes the two phases.
4655    
4656  Argument:  Argument:
4657    options        option bits, including any changes for this subpattern    options        option bits, including any changes for this subpattern
4658    oldims         previous settings of ims option bits    oldims         previous settings of ims option bits
   brackets       -> int containing the number of extracting brackets used  
4659    codeptr        -> the address of the current code pointer    codeptr        -> the address of the current code pointer
4660    ptrptr         -> the address of the current pattern pointer    ptrptr         -> the address of the current pattern pointer
4661    errorcodeptr   -> pointer to error code variable    errorcodeptr   -> pointer to error code variable
4662    lookbehind     TRUE if this is a lookbehind assertion    lookbehind     TRUE if this is a lookbehind assertion
4663    skipbytes      skip this many bytes at start (for OP_COND, OP_BRANUMBER)    skipbytes      skip this many bytes at start (for brackets and OP_COND)
4664    firstbyteptr   place to put the first required character, or a negative number    firstbyteptr   place to put the first required character, or a negative number
4665    reqbyteptr     place to put the last required character, or a negative number    reqbyteptr     place to put the last required character, or a negative number
4666    bcptr          pointer to the chain of currently open branches    bcptr          pointer to the chain of currently open branches
4667    cd             points to the data block with tables pointers etc.    cd             points to the data block with tables pointers etc.
4668      lengthptr      NULL during the real compile phase
4669                     points to length accumulator during pre-compile phase
4670    
4671  Returns:      TRUE on success  Returns:         TRUE on success
4672  */  */
4673    
4674  static BOOL  static BOOL
4675  compile_regex(int options, int oldims, int *brackets, uschar **codeptr,  compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
4676    const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,    int *errorcodeptr, BOOL lookbehind, int skipbytes, int *firstbyteptr,
4677    int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)    int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr)
4678  {  {
4679  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
4680  uschar *code = *codeptr;  uschar *code = *codeptr;
# Line 3595  uschar *start_bracket = code; Line 4683  uschar *start_bracket = code;
4683  uschar *reverse_count = NULL;  uschar *reverse_count = NULL;
4684  int firstbyte, reqbyte;  int firstbyte, reqbyte;
4685  int branchfirstbyte, branchreqbyte;  int branchfirstbyte, branchreqbyte;
4686    int length;
4687  branch_chain bc;  branch_chain bc;
4688    
4689  bc.outer = bcptr;  bc.outer = bcptr;
# Line 3602  bc.current = code; Line 4691  bc.current = code;
4691    
4692  firstbyte = reqbyte = REQ_UNSET;  firstbyte = reqbyte = REQ_UNSET;
4693    
4694    /* Accumulate the length for use in the pre-compile phase. Start with the
4695    length of the BRA and KET and any extra bytes that are required at the
4696    beginning. We accumulate in a local variable to save frequent testing of
4697    lengthptr for NULL. We cannot do this by looking at the value of code at the
4698    start and end of each alternative, because compiled items are discarded during
4699    the pre-compile phase so that the work space is not exceeded. */
4700    
4701    length = 2 + 2*LINK_SIZE + skipbytes;
4702    
4703    /* WARNING: If the above line is changed for any reason, you must also change
4704    the code that abstracts option settings at the start of the pattern and makes
4705    them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
4706    pre-compile phase to find out whether anything has yet been compiled or not. */
4707    
4708  /* Offset is set zero to mark that this bracket is still open */  /* Offset is set zero to mark that this bracket is still open */
4709    
4710  PUT(code, 1, 0);  PUT(code, 1, 0);
# Line 3617  for (;;) Line 4720  for (;;)
4720      {      {
4721      *code++ = OP_OPT;      *code++ = OP_OPT;
4722      *code++ = options & PCRE_IMS;      *code++ = options & PCRE_IMS;
4723        length += 2;
4724      }      }
4725    
4726    /* Set up dummy OP_REVERSE if lookbehind assertion */    /* Set up dummy OP_REVERSE if lookbehind assertion */
# Line 3626  for (;;) Line 4730  for (;;)
4730      *code++ = OP_REVERSE;      *code++ = OP_REVERSE;
4731      reverse_count = code;      reverse_count = code;
4732      PUTINC(code, 0, 0);      PUTINC(code, 0, 0);
4733        length += 1 + LINK_SIZE;
4734      }      }
4735    
4736    /* Now compile the branch */    /* Now compile the branch; in the pre-compile phase its length gets added
4737      into the length. */
4738    
4739    if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr,    if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
4740          &branchfirstbyte, &branchreqbyte, &bc, cd))          &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
4741      {      {
4742      *ptrptr = ptr;      *ptrptr = ptr;
4743      return FALSE;      return FALSE;
4744      }      }
4745    
4746    /* If this is the first branch, the firstbyte and reqbyte values for the    /* In the real compile phase, there is some post-processing to be done. */
   branch become the values for the regex. */  
4747    
4748    if (*last_branch != OP_ALT)    if (lengthptr == NULL)
4749      {      {
4750      firstbyte = branchfirstbyte;      /* If this is the first branch, the firstbyte and reqbyte values for the
4751      reqbyte = branchreqbyte;      branch become the values for the regex. */
     }  
4752    
4753    /* If this is not the first branch, the first char and reqbyte have to      if (*last_branch != OP_ALT)
4754    match the values from all the previous branches, except that if the previous        {
4755    value for reqbyte didn't have REQ_VARY set, it can still match, and we set        firstbyte = branchfirstbyte;
4756    REQ_VARY for the regex. */        reqbyte = branchreqbyte;
4757          }
4758    
4759    else      /* If this is not the first branch, the first char and reqbyte have to
4760      {      match the values from all the previous branches, except that if the
4761      /* If we previously had a firstbyte, but it doesn't match the new branch,      previous value for reqbyte didn't have REQ_VARY set, it can still match,
4762      we have to abandon the firstbyte for the regex, but if there was previously      and we set REQ_VARY for the regex. */
     no reqbyte, it takes on the value of the old firstbyte. */  
4763    
4764      if (firstbyte >= 0 && firstbyte != branchfirstbyte)      else
4765        {        {
4766        if (reqbyte < 0) reqbyte = firstbyte;        /* If we previously had a firstbyte, but it doesn't match the new branch,
4767        firstbyte = REQ_NONE;        we have to abandon the firstbyte for the regex, but if there was
4768        }        previously no reqbyte, it takes on the value of the old firstbyte. */
4769    
4770          if (firstbyte >= 0 && firstbyte != branchfirstbyte)
4771            {
4772            if (reqbyte < 0) reqbyte = firstbyte;
4773            firstbyte = REQ_NONE;
4774            }
4775    
4776      /* If we (now or from before) have no firstbyte, a firstbyte from the        /* If we (now or from before) have no firstbyte, a firstbyte from the
4777      branch becomes a reqbyte if there isn't a branch reqbyte. */        branch becomes a reqbyte if there isn't a branch reqbyte. */
4778    
4779      if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)        if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
4780          branchreqbyte = branchfirstbyte;            branchreqbyte = branchfirstbyte;
4781    
4782      /* Now ensure that the reqbytes match */        /* Now ensure that the reqbytes match */
4783    
4784      if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))        if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
4785        reqbyte = REQ_NONE;          reqbyte = REQ_NONE;
4786      else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */        else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
4787      }        }
4788    
4789    /* If lookbehind, check that this branch matches a fixed-length string,      /* If lookbehind, check that this branch matches a fixed-length string, and
4790    and put the length into the OP_REVERSE item. Temporarily mark the end of      put the length into the OP_REVERSE item. Temporarily mark the end of the
4791    the branch with OP_END. */      branch with OP_END. */
4792    
4793    if (lookbehind)      if (lookbehind)
     {  
     int length;  
     *code = OP_END;  
     length = find_fixedlength(last_branch, options);  
     DPRINTF(("fixed length = %d\n", length));  
     if (length < 0)  
4794        {        {
4795        *errorcodeptr = (length == -2)? ERR36 : ERR25;        int fixed_length;
4796        *ptrptr = ptr;        *code = OP_END;
4797        return FALSE;        fixed_length = find_fixedlength(last_branch, options);
4798          DPRINTF(("fixed length = %d\n", fixed_length));
4799          if (fixed_length < 0)
4800            {
4801            *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
4802            *ptrptr = ptr;
4803            return FALSE;
4804            }
4805          PUT(reverse_count, 0, fixed_length);
4806        }        }
     PUT(reverse_count, 0, length);  
4807      }      }
4808    
4809    /* Reached end of expression, either ')' or end of pattern. Go back through    /* Reached end of expression, either ')' or end of pattern. Go back through
# Line 3706  for (;;) Line 4817  for (;;)
4817    
4818    if (*ptr != '|')    if (*ptr != '|')
4819      {      {
4820      int length = code - last_branch;      int branch_length = code - last_branch;
4821      do      do
4822        {        {
4823        int prev_length = GET(last_branch, 1);        int prev_length = GET(last_branch, 1);
4824        PUT(last_branch, 1, length);        PUT(last_branch, 1, branch_length);
4825        length = prev_length;        branch_length = prev_length;
4826        last_branch -= length;        last_branch -= branch_length;
4827        }        }
4828      while (length > 0);      while (branch_length > 0);
4829    
4830      /* Fill in the ket */      /* Fill in the ket */
4831    
# Line 3728  for (;;) Line 4839  for (;;)
4839        {        {
4840        *code++ = OP_OPT;        *code++ = OP_OPT;
4841        *code++ = oldims;        *code++ = oldims;
4842          length += 2;
4843        }        }
4844    
4845      /* Set values to pass back */      /* Set values to pass back */
# Line 3736  for (;;) Line 4848  for (;;)
4848      *ptrptr = ptr;      *ptrptr = ptr;
4849      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
4850      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
4851        if (lengthptr != NULL) *lengthptr += length;
4852      return TRUE;      return TRUE;
4853      }      }
4854    
# Line 3749  for (;;) Line 4862  for (;;)
4862    bc.current = last_branch = code;    bc.current = last_branch = code;
4863    code += 1 + LINK_SIZE;    code += 1 + LINK_SIZE;
4864    ptr++;    ptr++;
4865      length += 1 + LINK_SIZE;
4866    }    }
4867  /* Control never reaches here */  /* Control never reaches here */
4868  }  }
# Line 3799  is_anchored(register const uschar *code, Line 4913  is_anchored(register const uschar *code,
4913    unsigned int backref_map)    unsigned int backref_map)
4914  {  {
4915  do {  do {
4916     const uschar *scode =     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
4917       first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);       options, PCRE_MULTILINE, FALSE);
4918     register int op = *scode;     register int op = *scode;
4919    
4920       /* Non-capturing brackets */
4921    
4922       if (op == OP_BRA)
4923         {
4924         if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4925         }
4926    
4927     /* Capturing brackets */     /* Capturing brackets */
4928    
4929     if (op > OP_BRA)     else if (op == OP_CBRA)
4930       {       {
4931       int new_map;       int n = GET2(scode, 1+LINK_SIZE);
4932       op -= OP_BRA;       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
      if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);  
      new_map = bracket_map | ((op < 32)? (1 << op) : 1);  
4933       if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;       if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
4934       }       }
4935    
4936     /* Other brackets */     /* Other brackets */
4937    
4938     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4939       {       {
4940       if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;       if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4941       }       }
# Line 3824  do { Line 4943  do {
4943     /* .* is not anchored unless DOTALL is set and it isn't in brackets that     /* .* is not anchored unless DOTALL is set and it isn't in brackets that
4944     are or may be referenced. */     are or may be referenced. */
4945    
4946     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
4947                 op == OP_TYPEPOSSTAR) &&
4948              (*options & PCRE_DOTALL) != 0)              (*options & PCRE_DOTALL) != 0)
4949       {       {
4950       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
# Line 3869  is_startline(const uschar *code, unsigne Line 4989  is_startline(const uschar *code, unsigne
4989    unsigned int backref_map)    unsigned int backref_map)
4990  {  {
4991  do {  do {
4992     const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
4993       FALSE);       NULL, 0, FALSE);
4994     register int op = *scode;     register int op = *scode;
4995    
4996       /* Non-capturing brackets */
4997    
4998       if (op == OP_BRA)
4999         {
5000         if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5001         }
5002    
5003     /* Capturing brackets */     /* Capturing brackets */
5004    
5005     if (op > OP_BRA)     else if (op == OP_CBRA)
5006       {       {
5007       int new_map;       int n = GET2(scode, 1+LINK_SIZE);
5008       op -= OP_BRA;       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
      if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);  
      new_map = bracket_map | ((op < 32)? (1 << op) : 1);  
5009       if (!is_startline(scode, new_map, backref_map)) return FALSE;       if (!is_startline(scode, new_map, backref_map)) return FALSE;
5010       }       }
5011    
5012     /* Other brackets */     /* Other brackets */
5013    
5014     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5015       { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }       { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5016    
5017     /* .* means "start at start or after \n" if it isn't in brackets that     /* .* means "start at start or after \n" if it isn't in brackets that
5018     may be referenced. */     may be referenced. */
5019    
5020     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5021       {       {
5022       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5023       }       }
# Line 3941  do { Line 5066  do {
5066       first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);       first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5067     register int op = *scode;     register int op = *scode;
5068    
    if (op >= OP_BRA) op = OP_BRA;  
   
5069     switch(op)     switch(op)
5070       {       {
5071       default:       default:
5072       return -1;       return -1;
5073    
5074       case OP_BRA:       case OP_BRA:
5075         case OP_CBRA:
5076       case OP_ASSERT:       case OP_ASSERT:
5077       case OP_ONCE:       case OP_ONCE:
5078       case OP_COND:       case OP_COND:
# Line 3964  do { Line 5088  do {
5088       case OP_CHARNC:       case OP_CHARNC:
5089       case OP_PLUS:       case OP_PLUS:
5090       case OP_MINPLUS:       case OP_MINPLUS:
5091         case OP_POSPLUS:
5092       if (!inassert) return -1;       if (!inassert) return -1;
5093       if (c < 0)       if (c < 0)
5094         {         {
# Line 4004  Returns:        pointer to compiled data Line 5129  Returns:        pointer to compiled data
5129                  with errorptr and erroroffset set                  with errorptr and erroroffset set
5130  */  */
5131    
5132  PCRE_DATA_SCOPE pcre *  PCRE_EXP_DEFN pcre *
5133  pcre_compile(const char *pattern, int options, const char **errorptr,  pcre_compile(const char *pattern, int options, const char **errorptr,
5134    int *erroroffset, const unsigned char *tables)    int *erroroffset, const unsigned char *tables)
5135  {  {
# Line 4012  return pcre_compile2(pattern, options, N Line 5137  return pcre_compile2(pattern, options, N
5137  }  }
5138    
5139    
5140    PCRE_EXP_DEFN pcre *
 PCRE_DATA_SCOPE pcre *  
5141  pcre_compile2(const char *pattern, int options, int *errorcodeptr,  pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5142    const char **errorptr, int *erroroffset, const unsigned char *tables)    const char **errorptr, int *erroroffset, const unsigned char *tables)
5143  {  {
5144  real_pcre *re;  real_pcre *re;
5145  int length = 1 + LINK_SIZE;      /* For initial BRA plus length */  int length = 1;  /* For final END opcode */
5146  int c, firstbyte, reqbyte, newline;  int firstbyte, reqbyte, newline;
 int bracount = 0;  
 int branch_extra = 0;  
 int branch_newextra;  
 int item_count = -1;  
 int name_count = 0;  
 int max_name_size = 0;  
 int lastitemlength = 0;  
5147  int errorcode = 0;  int errorcode = 0;
5148  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
5149  BOOL utf8;  BOOL utf8;
 BOOL class_utf8;  
5150  #endif  #endif
 BOOL inescq = FALSE;  
 BOOL capturing;  
 unsigned int brastackptr = 0;  
5151  size_t size;  size_t size;
5152  uschar *code;  uschar *code;
5153  const uschar *codestart;  const uschar *codestart;
5154  const uschar *ptr;  const uschar *ptr;
5155  compile_data compile_block;  compile_data compile_block;
5156  compile_data *cd = &compile_block;  compile_data *cd = &compile_block;
5157  int brastack[BRASTACK_SIZE];  
5158  uschar bralenstack[BRASTACK_SIZE];  /* This space is used for "compiling" into during the first phase, when we are
5159    computing the amount of memory that is needed. Compiled items are thrown away
5160    as soon as possible, so that a fairly large buffer should be sufficient for
5161    this purpose. The same space is used in the second phase for remembering where
5162    to fill in forward references to subpatterns. */
5163    
5164    uschar cworkspace[COMPILE_WORK_SIZE];
5165    
5166    
5167    /* Set this early so that early errors get offset 0. */
5168    
5169    ptr = (const uschar *)pattern;
5170    
5171  /* We can't pass back an error message if errorptr is NULL; I guess the best we  /* We can't pass back an error message if errorptr is NULL; I guess the best we
5172  can do is just return NULL, but we can set a code value if there is a code  can do is just return NULL, but we can set a code value if there is a code
# Line 4062  if (errorcodeptr != NULL) *errorcodeptr Line 5186  if (errorcodeptr != NULL) *errorcodeptr
5186  if (erroroffset == NULL)  if (erroroffset == NULL)
5187    {    {
5188    errorcode = ERR16;    errorcode = ERR16;
5189    goto PCRE_EARLY_ERROR_RETURN;    goto PCRE_EARLY_ERROR_RETURN2;
5190    }    }
5191    
5192  *erroroffset = 0;  *erroroffset = 0;
# Line 4075  if (utf8 && (options & PCRE_NO_UTF8_CHEC Line 5199  if (utf8 && (options & PCRE_NO_UTF8_CHEC
5199       (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)       (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5200    {    {
5201    errorcode = ERR44;    errorcode = ERR44;
5202    goto PCRE_EARLY_ERROR_RETURN;    goto PCRE_EARLY_ERROR_RETURN2;
5203    }    }
5204  #else  #else
5205  if ((options & PCRE_UTF8) != 0)  if ((options & PCRE_UTF8) != 0)
# Line 4099  cd->fcc = tables + fcc_offset; Line 5223  cd->fcc = tables + fcc_offset;
5223  cd->cbits = tables + cbits_offset;  cd->cbits = tables + cbits_offset;
5224  cd->ctypes = tables + ctypes_offset;  cd->ctypes = tables + ctypes_offset;
5225    
5226  /* Handle different types of newline. The two bits give four cases. The current  /* Handle different types of newline. The three bits give seven cases. The
5227  code allows for one- or two-byte sequences. */  current code allows for fixed one- or two-byte sequences, plus "any" and
5228    "anycrlf". */
5229    
5230  switch (options & PCRE_NEWLINE_CRLF)  switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5231    {    {
5232    default:              newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
5233    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = '\r'; break;
5234    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = '\n'; break;
5235    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
5236         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5237      case PCRE_NEWLINE_ANY: newline = -1; break;
5238      case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5239      default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5240    }    }
5241    
5242  if (newline > 255)  if (newline == -2)
5243      {
5244      cd->nltype = NLTYPE_ANYCRLF;
5245      }
5246    else if (newline < 0)
5247    {    {
5248    cd->nllen = 2;    cd->nltype = NLTYPE_ANY;
   cd->nl[0] = (newline >> 8) & 255;  
   cd->nl[1] = newline & 255;  
5249    }    }
5250  else  else
5251    {    {
5252    cd->nllen = 1;    cd->nltype = NLTYPE_FIXED;
5253    cd->nl[0] = newline;    if (newline > 255)
5254        {
5255        cd->nllen = 2;
5256        cd->nl[0] = (newline >> 8) & 255;
5257        cd->nl[1] = newline & 255;
5258        }
5259      else
5260        {
5261        cd->nllen = 1;
5262        cd->nl[0] = newline;
5263        }
5264    }    }
5265    
5266  /* Maximum back reference and backref bitmap. This is updated for numeric  /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5267  references during the first pass, but for named references during the actual  references to help in deciding whether (.*) can be treated as anchored or not.
5268  compile pass. The bitmap records up to 31 back references to help in deciding  */
 whether (.*) can be treated as anchored or not. */  
5269