/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 91 by nigel, Sat Feb 24 21:41:34 2007 UTC | revision 168 by ph10, Tue May 29 15:18:18 2007 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2006 University of Cambridge             Copyright (c) 1997-2007 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45  #define NLBLOCK cd            /* The block containing newline information */  #define NLBLOCK cd             /* Block containing newline information */
46    #define PSSTART start_pattern  /* Field containing processed string start */
47    #define PSEND   end_pattern    /* Field containing processed string end */
48    
49    
50  #include "pcre_internal.h"  #include "pcre_internal.h"
51    
52    
# Line 54  used by pcretest. DEBUG is not defined w Line 58  used by pcretest. DEBUG is not defined w
58  #endif  #endif
59    
60    
   
61  /*************************************************  /*************************************************
62  *      Code parameters and static tables         *  *      Code parameters and static tables         *
63  *************************************************/  *************************************************/
64    
65  /* Maximum number of items on the nested bracket stacks at compile time. This  /* This value specifies the size of stack workspace that is used during the
66  applies to the nesting of all kinds of parentheses. It does not limit  first pre-compile phase that determines how much memory is required. The regex
67  un-nested, non-capturing parentheses. This number can be made bigger if  is partly compiled into this space, but the compiled parts are discarded as
68  necessary - it is used to dimension one int and one unsigned char vector at  soon as they can be, so that hopefully there will never be an overrun. The code
69  compile time. */  does, however, check for an overrun. The largest amount I've seen used is 218,
70    so this number is very generous.
71    
72    The same workspace is used during the second, actual compile phase for
73    remembering forward references to groups so that they can be filled in at the
74    end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
75    is 4 there is plenty of room. */
76    
77  #define BRASTACK_SIZE 200  #define COMPILE_WORK_SIZE (4096)
78    
79    
80  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 73  are simple data values; negative values Line 82  are simple data values; negative values
82  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
83  is invalid. */  is invalid. */
84    
85  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
86  static const short int escapes[] = {  static const short int escapes[] = {
87       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
88       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
89     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
90       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */       0,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
91  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
92  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
93     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
94       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */       0,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
95  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */
96       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
97  };  };
98    
99  #else         /* This is the "abnormal" table for EBCDIC systems */  #else           /* This is the "abnormal" table for EBCDIC systems */
100  static const short int escapes[] = {  static const short int escapes[] = {
101  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
102  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 98  static const short int escapes[] = { Line 107  static const short int escapes[] = {
107  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
108  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
109  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,
110  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
111  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
112  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,
113  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
# Line 107  static const short int escapes[] = { Line 116  static const short int escapes[] = {
116  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
117  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,
118  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,
119  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
120  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,
121  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
122  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
# Line 156  static const int posix_class_maps[] = { Line 165  static const int posix_class_maps[] = {
165  };  };
166    
167    
168    #define STRING(a)  # a
169    #define XSTRING(s) STRING(s)
170    
171  /* The texts of compile-time error messages. These are "char *" because they  /* The texts of compile-time error messages. These are "char *" because they
172  are passed to the outside world. */  are passed to the outside world. Do not ever re-use any error number, because
173    they are documented. Always add a new error instead. Messages marked DEAD below
174    are no longer used. */
175    
176  static const char *error_texts[] = {  static const char *error_texts[] = {
177    "no error",    "no error",
# Line 172  static const char *error_texts[] = { Line 186  static const char *error_texts[] = {
186    "range out of order in character class",    "range out of order in character class",
187    "nothing to repeat",    "nothing to repeat",
188    /* 10 */    /* 10 */
189    "operand of unlimited repeat could match the empty string",    "operand of unlimited repeat could match the empty string",  /** DEAD **/
190    "internal error: unexpected repeat",    "internal error: unexpected repeat",
191    "unrecognized character after (?",    "unrecognized character after (?",
192    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class",
# Line 182  static const char *error_texts[] = { Line 196  static const char *error_texts[] = {
196    "erroffset passed as NULL",    "erroffset passed as NULL",
197    "unknown option bit(s) set",    "unknown option bit(s) set",
198    "missing ) after comment",    "missing ) after comment",
199    "parentheses nested too deeply",    "parentheses nested too deeply",  /** DEAD **/
200    /* 20 */    /* 20 */
201    "regular expression too large",    "regular expression too large",
202    "failed to get memory",    "failed to get memory",
# Line 194  static const char *error_texts[] = { Line 208  static const char *error_texts[] = {
208    "malformed number or name after (?(",    "malformed number or name after (?(",
209    "conditional group contains more than two branches",    "conditional group contains more than two branches",
210    "assertion expected after (?(",    "assertion expected after (?(",
211    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )",
212    /* 30 */    /* 30 */
213    "unknown POSIX class name",    "unknown POSIX class name",
214    "POSIX collating elements are not supported",    "POSIX collating elements are not supported",
215    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support",
216    "spare error",    "spare error",  /** DEAD **/
217    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large",
218    /* 35 */    /* 35 */
219    "invalid condition (?(0)",    "invalid condition (?(0)",
# Line 210  static const char *error_texts[] = { Line 224  static const char *error_texts[] = {
224    /* 40 */    /* 40 */
225    "recursive call could loop indefinitely",    "recursive call could loop indefinitely",
226    "unrecognized character after (?P",    "unrecognized character after (?P",
227    "syntax error after (?P",    "syntax error in subpattern name (missing terminator)",
228    "two named subpatterns have the same name",    "two named subpatterns have the same name",
229    "invalid UTF-8 string",    "invalid UTF-8 string",
230    /* 45 */    /* 45 */
231    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled",
232    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence",
233    "unknown property name after \\P or \\p",    "unknown property name after \\P or \\p",
234    "subpattern name is too long (maximum 32 characters)",    "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
235    "too many named subpatterns (maximum 10,000)",    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
236    /* 50 */    /* 50 */
237    "repeated subpattern is too long",    "repeated subpattern is too long",
238    "octal value is greater than \\377 (not in UTF-8 mode)"    "octal value is greater than \\377 (not in UTF-8 mode)",
239      "internal error: overran compiling workspace",
240      "internal error: previously-checked referenced subpattern not found",
241      "DEFINE group contains more than one branch",
242      /* 55 */
243      "repeating a DEFINE group is not allowed",
244      "inconsistent NEWLINE options",
245      "\\g is not followed by an (optionally braced) non-zero number",
246      "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
247  };  };
248    
249    
# Line 241  For convenience, we use the same bit def Line 263  For convenience, we use the same bit def
263    
264  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
265    
266  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
267  static const unsigned char digitab[] =  static const unsigned char digitab[] =
268    {    {
269    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 277  static const unsigned char digitab[] = Line 299  static const unsigned char digitab[] =
299    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
300    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
301    
302  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else           /* This is the "abnormal" case, for EBCDIC systems */
303  static const unsigned char digitab[] =  static const unsigned char digitab[] =
304    {    {
305    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 291  static const unsigned char digitab[] = Line 313  static const unsigned char digitab[] =
313    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
314    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
315    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
316    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
317    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
318    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
319    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 325  static const unsigned char ebcdic_charta Line 347  static const unsigned char ebcdic_charta
347    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
348    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
349    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
350    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
351    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
352    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
353    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 352  static const unsigned char ebcdic_charta Line 374  static const unsigned char ebcdic_charta
374  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
375    
376  static BOOL  static BOOL
377    compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *,
378      int *, int *, branch_chain *, compile_data *);      int *, branch_chain *, compile_data *, int *);
379    
380    
381    
# Line 363  static BOOL Line 385  static BOOL
385    
386  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
387  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
388  encodes one of the more complicated things such as \d. When UTF-8 is enabled,  encodes one of the more complicated things such as \d. A backreference to group
389  a positive value greater than 255 may be returned. On entry, ptr is pointing at  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
390  the \. On exit, it is on the final character of the escape sequence.  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
391    ptr is pointing at the \. On exit, it is on the final character of the escape
392    sequence.
393    
394  Arguments:  Arguments:
395    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
# Line 398  if (c == 0) *errorcodeptr = ERR1; Line 422  if (c == 0) *errorcodeptr = ERR1;
422  a table. A non-zero result is something that can be returned immediately.  a table. A non-zero result is something that can be returned immediately.
423  Otherwise further processing may be required. */  Otherwise further processing may be required. */
424    
425  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
426  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < '0' || c > 'z') {}                           /* Not alphameric */
427  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - '0']) != 0) c = i;
428    
429  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
430  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
431  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
432  #endif  #endif
# Line 412  else if ((i = escapes[c - 0x48]) != 0) Line 436  else if ((i = escapes[c - 0x48]) != 0)
436  else  else
437    {    {
438    const uschar *oldptr;    const uschar *oldptr;
439      BOOL braced, negated;
440    
441    switch (c)    switch (c)
442      {      {
443      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
# Line 425  else Line 451  else
451      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
452      break;      break;
453    
454        /* \g must be followed by a number, either plain or braced. If positive, it
455        is an absolute backreference. If negative, it is a relative backreference.
456        This is a Perl 5.10 feature. */
457    
458        case 'g':
459        if (ptr[1] == '{')
460          {
461          braced = TRUE;
462          ptr++;
463          }
464        else braced = FALSE;
465    
466        if (ptr[1] == '-')
467          {
468          negated = TRUE;
469          ptr++;
470          }
471        else negated = FALSE;
472    
473        c = 0;
474        while ((digitab[ptr[1]] & ctype_digit) != 0)
475          c = c * 10 + *(++ptr) - '0';
476    
477        if (c == 0 || (braced && *(++ptr) != '}'))
478          {
479          *errorcodeptr = ERR57;
480          return 0;
481          }
482    
483        if (negated)
484          {
485          if (c > bracount)
486            {
487            *errorcodeptr = ERR15;
488            return 0;
489            }
490          c = bracount - (c - 1);
491          }
492    
493        c = -(ESC_REF + c);
494        break;
495    
496      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
497      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
498      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 495  else Line 563  else
563          if (c == 0 && cc == '0') continue;     /* Leading zeroes */          if (c == 0 && cc == '0') continue;     /* Leading zeroes */
564          count++;          count++;
565    
566  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
567          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= 'a') cc -= 32;               /* Convert to upper case */
568          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
569  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
570          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
571          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
572  #endif  #endif
# Line 522  else Line 590  else
590        {        {
591        int cc;                               /* Some compilers don't like ++ */        int cc;                               /* Some compilers don't like ++ */
592        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                        /* in initializers */
593  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
594        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
595        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
596  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
597        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= 'z') cc += 64;              /* Convert to upper case */
598        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
599  #endif  #endif
600        }        }
601      break;      break;
602    
603      /* Other special escapes not starting with a digit are straightforward */      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
604        This coding is ASCII-specific, but then the whole concept of \cx is
605        ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
606    
607      case 'c':      case 'c':
608      c = *(++ptr);      c = *(++ptr);
# Line 542  else Line 612  else
612        return 0;        return 0;
613        }        }
614    
615      /* A letter is upper-cased; then the 0x40 bit is flipped. This coding  #ifndef EBCDIC  /* ASCII coding */
     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.  
     (However, an EBCDIC equivalent has now been added.) */  
   
 #if !EBCDIC    /* ASCII coding */  
616      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= 'a' && c <= 'z') c -= 32;
617      c ^= 0x40;      c ^= 0x40;
618  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
619      if (c >= 'a' && c <= 'z') c += 64;      if (c >= 'a' && c <= 'z') c += 64;
620      c ^= 0xC0;      c ^= 0xC0;
621  #endif  #endif
# Line 772  return p; Line 838  return p;
838    
839    
840  /*************************************************  /*************************************************
841  *     Find forward referenced named subpattern   *  *       Find forward referenced subpattern       *
842  *************************************************/  *************************************************/
843    
844  /* This function scans along a pattern looking for capturing subpatterns, and  /* This function scans along a pattern's text looking for capturing
845  counting them. If it finds a named pattern that matches the name it is given,  subpatterns, and counting them. If it finds a named pattern that matches the
846  it returns its number. This is used for forward references to named  name it is given, it returns its number. Alternatively, if the name is NULL, it
847  subpatterns. We know that if (?P< is encountered, the name will be terminated  returns when it reaches a given numbered subpattern. This is used for forward
848  by '>' because that is checked in the first pass.  references to subpatterns. We know that if (?P< is encountered, the name will
849    be terminated by '>' because that is checked in the first pass.
850    
851  Arguments:  Arguments:
852    pointer      current position in the pattern    ptr          current position in the pattern
853    count        current count of capturing parens    count        current count of capturing parens so far encountered
854    name         name to seek    name         name to seek, or NULL if seeking a numbered subpattern
855    namelen      name length    lorn         name length, or subpattern number if name is NULL
856      xmode        TRUE if we are in /x mode
857    
858  Returns:       the number of the named subpattern, or -1 if not found  Returns:       the number of the named subpattern, or -1 if not found
859  */  */
860    
861  static int  static int
862  find_named_parens(const uschar *ptr, int count, const uschar *name, int namelen)  find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
863      BOOL xmode)
864  {  {
865  const uschar *thisname;  const uschar *thisname;
866    
867  for (; *ptr != 0; ptr++)  for (; *ptr != 0; ptr++)
868    {    {
869    if (*ptr == '\\' && ptr[1] != 0) { ptr++; continue; }    int term;
870    
871      /* Skip over backslashed characters and also entire \Q...\E */
872    
873      if (*ptr == '\\')
874        {
875        if (*(++ptr) == 0) return -1;
876        if (*ptr == 'Q') for (;;)
877          {
878          while (*(++ptr) != 0 && *ptr != '\\');
879          if (*ptr == 0) return -1;
880          if (*(++ptr) == 'E') break;
881          }
882        continue;
883        }
884    
885      /* Skip over character classes */
886    
887      if (*ptr == '[')
888        {
889        while (*(++ptr) != ']')
890          {
891          if (*ptr == '\\')
892            {
893            if (*(++ptr) == 0) return -1;
894            if (*ptr == 'Q') for (;;)
895              {
896              while (*(++ptr) != 0 && *ptr != '\\');
897              if (*ptr == 0) return -1;
898              if (*(++ptr) == 'E') break;
899              }
900            continue;
901            }
902          }
903        continue;
904        }
905    
906      /* Skip comments in /x mode */
907    
908      if (xmode && *ptr == '#')
909        {
910        while (*(++ptr) != 0 && *ptr != '\n');
911        if (*ptr == 0) return -1;
912        continue;
913        }
914    
915      /* An opening parens must now be a real metacharacter */
916    
917    if (*ptr != '(') continue;    if (*ptr != '(') continue;
918    if (ptr[1] != '?') { count++; continue; }    if (ptr[1] != '?')
919    if (ptr[2] == '(') { ptr += 2; continue; }      {
920    if (ptr[2] != 'P' || ptr[3] != '<') continue;      count++;
921        if (name == NULL && count == lorn) return count;
922        continue;
923        }
924    
925      ptr += 2;
926      if (*ptr == 'P') ptr++;                      /* Allow optional P */
927    
928      /* We have to disambiguate (?<! and (?<= from (?<name> */
929    
930      if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
931           *ptr != '\'')
932        continue;
933    
934    count++;    count++;
935    ptr += 4;  
936      if (name == NULL && count == lorn) return count;
937      term = *ptr++;
938      if (term == '<') term = '>';
939    thisname = ptr;    thisname = ptr;
940    while (*ptr != '>') ptr++;    while (*ptr != term) ptr++;
941    if (namelen == ptr - thisname && strncmp(name, thisname, namelen) == 0)    if (name != NULL && lorn == ptr - thisname &&
942          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
943      return count;      return count;
944    }    }
945    
946  return -1;  return -1;
947  }  }
948    
# Line 862  for (;;) Line 997  for (;;)
997    
998      case OP_CALLOUT:      case OP_CALLOUT:
999      case OP_CREF:      case OP_CREF:
1000      case OP_BRANUMBER:      case OP_RREF:
1001        case OP_DEF:
1002      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1003      break;      break;
1004    
# Line 907  for (;;) Line 1043  for (;;)
1043    {    {
1044    int d;    int d;
1045    register int op = *cc;    register int op = *cc;
   if (op >= OP_BRA) op = OP_BRA;  
1046    
1047    switch (op)    switch (op)
1048      {      {
1049        case OP_CBRA:
1050      case OP_BRA:      case OP_BRA:
1051      case OP_ONCE:      case OP_ONCE:
1052      case OP_COND:      case OP_COND:
1053      d = find_fixedlength(cc, options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1054      if (d < 0) return d;      if (d < 0) return d;
1055      branchlength += d;      branchlength += d;
1056      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 949  for (;;) Line 1085  for (;;)
1085      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1086    
1087      case OP_REVERSE:      case OP_REVERSE:
     case OP_BRANUMBER:  
1088      case OP_CREF:      case OP_CREF:
1089        case OP_RREF:
1090        case OP_DEF:
1091      case OP_OPT:      case OP_OPT:
1092      case OP_CALLOUT:      case OP_CALLOUT:
1093      case OP_SOD:      case OP_SOD:
# Line 1094  for (;;) Line 1231  for (;;)
1231    
1232    if (c == OP_XCLASS) code += GET(code, 1);    if (c == OP_XCLASS) code += GET(code, 1);
1233    
1234    /* Handle bracketed group */    /* Handle capturing bracket */
1235    
1236    else if (c > OP_BRA)    else if (c == OP_CBRA)
1237      {      {
1238      int n = c - OP_BRA;      int n = GET2(code, 1+LINK_SIZE);
     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  
1239      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1240      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[c];
1241      }      }
1242    
1243    /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes    /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1244    that are followed by a character may be followed by a multi-byte character.    a multi-byte character. The length in the table is a minimum, so we have to
1245    The length in the table is a minimum, so we have to scan along to skip the    arrange to skip the extra bytes. */
   extra bytes. All opcodes are less than 128, so we can use relatively  
   efficient code. */  
1246    
1247    else    else
1248      {      {
1249      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
1250    #ifdef SUPPORT_UTF8
1251      if (utf8) switch(c)      if (utf8) switch(c)
1252        {        {
1253        case OP_CHAR:        case OP_CHAR:
# Line 1120  for (;;) Line 1255  for (;;)
1255        case OP_EXACT:        case OP_EXACT:
1256        case OP_UPTO:        case OP_UPTO:
1257        case OP_MINUPTO:        case OP_MINUPTO:
1258          case OP_POSUPTO:
1259        case OP_STAR:        case OP_STAR:
1260        case OP_MINSTAR:        case OP_MINSTAR:
1261          case OP_POSSTAR:
1262        case OP_PLUS:        case OP_PLUS:
1263        case OP_MINPLUS:        case OP_MINPLUS:
1264          case OP_POSPLUS:
1265        case OP_QUERY:        case OP_QUERY:
1266        case OP_MINQUERY:        case OP_MINQUERY:
1267        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1268          if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1269        break;        break;
1270        }        }
1271    #endif
1272      }      }
1273    }    }
1274  }  }
# Line 1164  for (;;) Line 1304  for (;;)
1304    
1305    if (c == OP_XCLASS) code += GET(code, 1);    if (c == OP_XCLASS) code += GET(code, 1);
1306    
   /* All bracketed groups have the same length. */  
   
   else if (c > OP_BRA)  
     {  
     code += _pcre_OP_lengths[OP_BRA];  
     }  
   
1307    /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes    /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1308    that are followed by a character may be followed by a multi-byte character.    that are followed by a character may be followed by a multi-byte character.
1309    The length in the table is a minimum, so we have to scan along to skip the    The length in the table is a minimum, so we have to arrange to skip the extra
1310    extra bytes. All opcodes are less than 128, so we can use relatively    bytes. */
   efficient code. */  
1311    
1312    else    else
1313      {      {
1314      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
1315    #ifdef SUPPORT_UTF8
1316      if (utf8) switch(c)      if (utf8) switch(c)
1317        {        {
1318        case OP_CHAR:        case OP_CHAR:
# Line 1187  for (;;) Line 1320  for (;;)
1320        case OP_EXACT:        case OP_EXACT:
1321        case OP_UPTO:        case OP_UPTO:
1322        case OP_MINUPTO:        case OP_MINUPTO:
1323          case OP_POSUPTO:
1324        case OP_STAR:        case OP_STAR:
1325        case OP_MINSTAR:        case OP_MINSTAR:
1326          case OP_POSSTAR:
1327        case OP_PLUS:        case OP_PLUS:
1328        case OP_MINPLUS:        case OP_MINPLUS:
1329          case OP_POSPLUS:
1330        case OP_QUERY:        case OP_QUERY:
1331        case OP_MINQUERY:        case OP_MINQUERY:
1332        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1333          if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1334        break;        break;
1335        }        }
1336    #endif
1337      }      }
1338    }    }
1339  }  }
# Line 1207  for (;;) Line 1345  for (;;)
1345  *************************************************/  *************************************************/
1346    
1347  /* This function scans through a branch of a compiled pattern to see whether it  /* This function scans through a branch of a compiled pattern to see whether it
1348  can match the empty string or not. It is called only from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1349  below. Note that first_significant_code() skips over assertions. If we hit an  below and from compile_branch() when checking for an unlimited repeat of a
1350  unclosed bracket, we return "empty" - this means we've struck an inner bracket  group that can match nothing. Note that first_significant_code() skips over
1351  whose current branch will already have been scanned.  assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1352    struck an inner bracket whose current branch will already have been scanned.
1353    
1354  Arguments:  Arguments:
1355    code        points to start of search    code        points to start of search
# Line 1224  static BOOL Line 1363  static BOOL
1363  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1364  {  {
1365  register int c;  register int c;
1366  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1367       code < endcode;       code < endcode;
1368       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1369    {    {
# Line 1232  for (code = first_significant_code(code Line 1371  for (code = first_significant_code(code
1371    
1372    c = *code;    c = *code;
1373    
1374    if (c >= OP_BRA)    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1375      {      {
1376      BOOL empty_branch;      BOOL empty_branch;
1377      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1248  for (code = first_significant_code(code Line 1387  for (code = first_significant_code(code
1387        }        }
1388      while (*code == OP_ALT);      while (*code == OP_ALT);
1389      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
1390      code += 1 + LINK_SIZE;  
1391      c = *code;      /* Move past the KET and fudge things so that the increment in the "for"
1392        above has no effect. */
1393    
1394        c = OP_END;
1395        code += 1 + LINK_SIZE - _pcre_OP_lengths[c];
1396        continue;
1397      }      }
1398    
1399    else switch (c)    /* Handle the other opcodes */
1400    
1401      switch (c)
1402      {      {
1403      /* Check for quantifiers after a class */      /* Check for quantifiers after a class */
1404    
# Line 1308  for (code = first_significant_code(code Line 1454  for (code = first_significant_code(code
1454      case OP_NOT:      case OP_NOT:
1455      case OP_PLUS:      case OP_PLUS:
1456      case OP_MINPLUS:      case OP_MINPLUS:
1457        case OP_POSPLUS:
1458      case OP_EXACT:      case OP_EXACT:
1459      case OP_NOTPLUS:      case OP_NOTPLUS:
1460      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
1461        case OP_NOTPOSPLUS:
1462      case OP_NOTEXACT:      case OP_NOTEXACT:
1463      case OP_TYPEPLUS:      case OP_TYPEPLUS:
1464      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
1465        case OP_TYPEPOSPLUS:
1466      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1467      return FALSE;      return FALSE;
1468    
# Line 1325  for (code = first_significant_code(code Line 1474  for (code = first_significant_code(code
1474      case OP_ALT:      case OP_ALT:
1475      return TRUE;      return TRUE;
1476    
1477      /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1478      followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
1479    
1480  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1481      case OP_STAR:      case OP_STAR:
1482      case OP_MINSTAR:      case OP_MINSTAR:
1483        case OP_POSSTAR:
1484      case OP_QUERY:      case OP_QUERY:
1485      case OP_MINQUERY:      case OP_MINQUERY:
1486        case OP_POSQUERY:
1487      case OP_UPTO:      case OP_UPTO:
1488      case OP_MINUPTO:      case OP_MINUPTO:
1489        case OP_POSUPTO:
1490      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1491      break;      break;
1492  #endif  #endif
# Line 1452  earlier groups that are outside the curr Line 1604  earlier groups that are outside the curr
1604  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1605  it, after it has been compiled. This means that any OP_RECURSE items within it  it, after it has been compiled. This means that any OP_RECURSE items within it
1606  that refer to the group itself or any contained groups have to have their  that refer to the group itself or any contained groups have to have their
1607  offsets adjusted. That is the job of this function. Before it is called, the  offsets adjusted. That is one of the jobs of this function. Before it is called,
1608  partially compiled regex must be temporarily terminated with OP_END.  the partially compiled regex must be temporarily terminated with OP_END.
1609    
1610    This function has been extended with the possibility of forward references for
1611    recursions and subroutine calls. It must also check the list of such references
1612    for the group we are dealing with. If it finds that one of the recursions in
1613    the current group is on this list, it adjusts the offset in the list, not the
1614    value in the reference (which is a group number).
1615    
1616  Arguments:  Arguments:
1617    group      points to the start of the group    group      points to the start of the group
1618    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
1619    utf8       TRUE in UTF-8 mode    utf8       TRUE in UTF-8 mode
1620    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
1621      save_hwm   the hwm forward reference pointer at the start of the group
1622    
1623  Returns:     nothing  Returns:     nothing
1624  */  */
1625    
1626  static void  static void
1627  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1628      uschar *save_hwm)
1629  {  {
1630  uschar *ptr = group;  uschar *ptr = group;
1631  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1632    {    {
1633    int offset = GET(ptr, 1);    int offset;
1634    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    uschar *hc;
1635    
1636      /* See if this recursion is on the forward reference list. If so, adjust the
1637      reference. */
1638    
1639      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1640        {
1641        offset = GET(hc, 0);
1642        if (cd->start_code + offset == ptr + 1)
1643          {
1644          PUT(hc, 0, offset + adjust);
1645          break;
1646          }
1647        }
1648    
1649      /* Otherwise, adjust the recursion offset if it's after the start of this
1650      group. */
1651    
1652      if (hc >= cd->hwm)
1653        {
1654        offset = GET(ptr, 1);
1655        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1656        }
1657    
1658    ptr += 1 + LINK_SIZE;    ptr += 1 + LINK_SIZE;
1659    }    }
1660  }  }
# Line 1550  Yield:        TRUE when range returned; Line 1733  Yield:        TRUE when range returned;
1733  */  */
1734    
1735  static BOOL  static BOOL
1736  get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1737      unsigned int *odptr)
1738  {  {
1739  int c, othercase, next;  unsigned int c, othercase, next;
1740    
1741  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
1742    { if ((othercase = _pcre_ucp_othercase(c)) >= 0) break; }    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1743    
1744  if (c > d) return FALSE;  if (c > d) return FALSE;
1745    
# Line 1576  return TRUE; Line 1760  return TRUE;
1760  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1761    
1762    
1763    
1764    /*************************************************
1765    *     Check if auto-possessifying is possible    *
1766    *************************************************/
1767    
1768    /* This function is called for unlimited repeats of certain items, to see
1769    whether the next thing could possibly match the repeated item. If not, it makes
1770    sense to automatically possessify the repeated item.
1771    
1772    Arguments:
1773      op_code       the repeated op code
1774      item          data for this item, depends on the opcode
1775      utf8          TRUE in UTF-8 mode
1776      utf8_char     used for utf8 character bytes, NULL if not relevant
1777      ptr           next character in pattern
1778      options       options bits
1779      cd            contains pointers to tables etc.
1780    
1781    Returns:        TRUE if possessifying is wanted
1782    */
1783    
1784    static BOOL
1785    check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1786      const uschar *ptr, int options, compile_data *cd)
1787    {
1788    int next;
1789    
1790    /* Skip whitespace and comments in extended mode */
1791    
1792    if ((options & PCRE_EXTENDED) != 0)
1793      {
1794      for (;;)
1795        {
1796        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1797        if (*ptr == '#')
1798          {
1799          while (*(++ptr) != 0)
1800            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1801          }
1802        else break;
1803        }
1804      }
1805    
1806    /* If the next item is one that we can handle, get its value. A non-negative
1807    value is a character, a negative value is an escape value. */
1808    
1809    if (*ptr == '\\')
1810      {
1811      int temperrorcode = 0;
1812      next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1813      if (temperrorcode != 0) return FALSE;
1814      ptr++;    /* Point after the escape sequence */
1815      }
1816    
1817    else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1818      {
1819    #ifdef SUPPORT_UTF8
1820      if (utf8) { GETCHARINC(next, ptr); } else
1821    #endif
1822      next = *ptr++;
1823      }
1824    
1825    else return FALSE;
1826    
1827    /* Skip whitespace and comments in extended mode */
1828    
1829    if ((options & PCRE_EXTENDED) != 0)
1830      {
1831      for (;;)
1832        {
1833        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1834        if (*ptr == '#')
1835          {
1836          while (*(++ptr) != 0)
1837            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1838          }
1839        else break;
1840        }
1841      }
1842    
1843    /* If the next thing is itself optional, we have to give up. */
1844    
1845    if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1846      return FALSE;
1847    
1848    /* Now compare the next item with the previous opcode. If the previous is a
1849    positive single character match, "item" either contains the character or, if
1850    "item" is greater than 127 in utf8 mode, the character's bytes are in
1851    utf8_char. */
1852    
1853    
1854    /* Handle cases when the next item is a character. */
1855    
1856    if (next >= 0) switch(op_code)
1857      {
1858      case OP_CHAR:
1859    #ifdef SUPPORT_UTF8
1860      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1861    #endif
1862      return item != next;
1863    
1864      /* For CHARNC (caseless character) we must check the other case. If we have
1865      Unicode property support, we can use it to test the other case of
1866      high-valued characters. */
1867    
1868      case OP_CHARNC:
1869    #ifdef SUPPORT_UTF8
1870      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1871    #endif
1872      if (item == next) return FALSE;
1873    #ifdef SUPPORT_UTF8
1874      if (utf8)
1875        {
1876        unsigned int othercase;
1877        if (next < 128) othercase = cd->fcc[next]; else
1878    #ifdef SUPPORT_UCP
1879        othercase = _pcre_ucp_othercase((unsigned int)next);
1880    #else
1881        othercase = NOTACHAR;
1882    #endif
1883        return (unsigned int)item != othercase;
1884        }
1885      else
1886    #endif  /* SUPPORT_UTF8 */
1887      return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
1888    
1889      /* For OP_NOT, "item" must be a single-byte character. */
1890    
1891      case OP_NOT:
1892      if (next < 0) return FALSE;  /* Not a character */
1893      if (item == next) return TRUE;
1894      if ((options & PCRE_CASELESS) == 0) return FALSE;
1895    #ifdef SUPPORT_UTF8
1896      if (utf8)
1897        {
1898        unsigned int othercase;
1899        if (next < 128) othercase = cd->fcc[next]; else
1900    #ifdef SUPPORT_UCP
1901        othercase = _pcre_ucp_othercase(next);
1902    #else
1903        othercase = NOTACHAR;
1904    #endif
1905        return (unsigned int)item == othercase;
1906        }
1907      else
1908    #endif  /* SUPPORT_UTF8 */
1909      return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
1910    
1911      case OP_DIGIT:
1912      return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1913    
1914      case OP_NOT_DIGIT:
1915      return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1916    
1917      case OP_WHITESPACE:
1918      return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1919    
1920      case OP_NOT_WHITESPACE:
1921      return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1922    
1923      case OP_WORDCHAR:
1924      return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1925    
1926      case OP_NOT_WORDCHAR:
1927      return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1928    
1929      default:
1930      return FALSE;
1931      }
1932    
1933    
1934    /* Handle the case when the next item is \d, \s, etc. */
1935    
1936    switch(op_code)
1937      {
1938      case OP_CHAR:
1939      case OP_CHARNC:
1940    #ifdef SUPPORT_UTF8
1941      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1942    #endif
1943      switch(-next)
1944        {
1945        case ESC_d:
1946        return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
1947    
1948        case ESC_D:
1949        return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
1950    
1951        case ESC_s:
1952        return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
1953    
1954        case ESC_S:
1955        return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
1956    
1957        case ESC_w:
1958        return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
1959    
1960        case ESC_W:
1961        return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
1962    
1963        default:
1964        return FALSE;
1965        }
1966    
1967      case OP_DIGIT:
1968      return next == -ESC_D || next == -ESC_s || next == -ESC_W;
1969    
1970      case OP_NOT_DIGIT:
1971      return next == -ESC_d;
1972    
1973      case OP_WHITESPACE:
1974      return next == -ESC_S || next == -ESC_d || next == -ESC_w;
1975    
1976      case OP_NOT_WHITESPACE:
1977      return next == -ESC_s;
1978    
1979      case OP_WORDCHAR:
1980      return next == -ESC_W || next == -ESC_s;
1981    
1982      case OP_NOT_WORDCHAR:
1983      return next == -ESC_w || next == -ESC_d;
1984    
1985      default:
1986      return FALSE;
1987      }
1988    
1989    /* Control does not reach here */
1990    }
1991    
1992    
1993    
1994  /*************************************************  /*************************************************
1995  *           Compile one branch                   *  *           Compile one branch                   *
1996  *************************************************/  *************************************************/
1997    
1998  /* Scan the pattern, compiling it into the code vector. If the options are  /* Scan the pattern, compiling it into the code vector. If the options are
1999  changed during the branch, the pointer is used to change the external options  changed during the branch, the pointer is used to change the external options
2000  bits.  bits. This function is used during the pre-compile phase when we are trying
2001    to find out the amount of memory needed, as well as during the real compile
2002    phase. The value of lengthptr distinguishes the two phases.
2003    
2004  Arguments:  Arguments:
2005    optionsptr     pointer to the option bits    optionsptr     pointer to the option bits
   brackets       points to number of extracting brackets used  
2006    codeptr        points to the pointer to the current code point    codeptr        points to the pointer to the current code point
2007    ptrptr         points to the current pattern pointer    ptrptr         points to the current pattern pointer
2008    errorcodeptr   points to error code variable    errorcodeptr   points to error code variable
# Line 1594  Arguments: Line 2010  Arguments:
2010    reqbyteptr     set to the last literal character required, else < 0    reqbyteptr     set to the last literal character required, else < 0
2011    bcptr          points to current branch chain    bcptr          points to current branch chain
2012    cd             contains pointers to tables etc.    cd             contains pointers to tables etc.
2013      lengthptr      NULL during the real compile phase
2014                     points to length accumulator during pre-compile phase
2015    
2016  Returns:         TRUE on success  Returns:         TRUE on success
2017                   FALSE, with *errorcodeptr set non-zero on error                   FALSE, with *errorcodeptr set non-zero on error
2018  */  */
2019    
2020  static BOOL  static BOOL
2021  compile_branch(int *optionsptr, int *brackets, uschar **codeptr,  compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2022    const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,    int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2023    int *reqbyteptr, branch_chain *bcptr, compile_data *cd)    compile_data *cd, int *lengthptr)
2024  {  {
2025  int repeat_type, op_type;  int repeat_type, op_type;
2026  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
# Line 1613  int zeroreqbyte, zerofirstbyte; Line 2031  int zeroreqbyte, zerofirstbyte;
2031  int req_caseopt, reqvary, tempreqvary;  int req_caseopt, reqvary, tempreqvary;
2032  int options = *optionsptr;  int options = *optionsptr;
2033  int after_manual_callout = 0;  int after_manual_callout = 0;
2034    int length_prevgroup = 0;
2035  register int c;  register int c;
2036  register uschar *code = *codeptr;  register uschar *code = *codeptr;
2037    uschar *last_code = code;
2038    uschar *orig_code = code;
2039  uschar *tempcode;  uschar *tempcode;
2040  BOOL inescq = FALSE;  BOOL inescq = FALSE;
2041  BOOL groupsetfirstbyte = FALSE;  BOOL groupsetfirstbyte = FALSE;
# Line 1622  const uschar *ptr = *ptrptr; Line 2043  const uschar *ptr = *ptrptr;
2043  const uschar *tempptr;  const uschar *tempptr;
2044  uschar *previous = NULL;  uschar *previous = NULL;
2045  uschar *previous_callout = NULL;  uschar *previous_callout = NULL;
2046    uschar *save_hwm = NULL;
2047  uschar classbits[32];  uschar classbits[32];
2048    
2049  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 1631  uschar *class_utf8data; Line 2053  uschar *class_utf8data;
2053  uschar utf8_char[6];  uschar utf8_char[6];
2054  #else  #else
2055  BOOL utf8 = FALSE;  BOOL utf8 = FALSE;
2056    uschar *utf8_char = NULL;
2057    #endif
2058    
2059    #ifdef DEBUG
2060    if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2061  #endif  #endif
2062    
2063  /* Set up the default and non-default settings for greediness */  /* Set up the default and non-default settings for greediness */
# Line 1664  for (;; ptr++) Line 2091  for (;; ptr++)
2091    BOOL negate_class;    BOOL negate_class;
2092    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2093    BOOL is_quantifier;    BOOL is_quantifier;
2094      BOOL is_recurse;
2095    int class_charcount;    int class_charcount;
2096    int class_lastchar;    int class_lastchar;
2097    int newoptions;    int newoptions;
2098    int recno;    int recno;
2099      int refsign;
2100    int skipbytes;    int skipbytes;
2101    int subreqbyte;    int subreqbyte;
2102    int subfirstbyte;    int subfirstbyte;
2103      int terminator;
2104    int mclength;    int mclength;
2105    uschar mcbuffer[8];    uschar mcbuffer[8];
2106    
2107    /* Next byte in the pattern */    /* Get next byte in the pattern */
2108    
2109    c = *ptr;    c = *ptr;
2110    
2111      /* If we are in the pre-compile phase, accumulate the length used for the
2112      previous cycle of this loop. */
2113    
2114      if (lengthptr != NULL)
2115        {
2116    #ifdef DEBUG
2117        if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2118    #endif
2119        if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2120          {
2121          *errorcodeptr = ERR52;
2122          goto FAILED;
2123          }
2124    
2125        /* There is at least one situation where code goes backwards: this is the
2126        case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2127        the class is simply eliminated. However, it is created first, so we have to
2128        allow memory for it. Therefore, don't ever reduce the length at this point.
2129        */
2130    
2131        if (code < last_code) code = last_code;
2132        *lengthptr += code - last_code;
2133        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2134    
2135        /* If "previous" is set and it is not at the start of the work space, move
2136        it back to there, in order to avoid filling up the work space. Otherwise,
2137        if "previous" is NULL, reset the current code pointer to the start. */
2138    
2139        if (previous != NULL)
2140          {
2141          if (previous > orig_code)
2142            {
2143            memmove(orig_code, previous, code - previous);
2144            code -= previous - orig_code;
2145            previous = orig_code;
2146            }
2147          }
2148        else code = orig_code;
2149    
2150        /* Remember where this code item starts so we can pick up the length
2151        next time round. */
2152    
2153        last_code = code;
2154        }
2155    
2156      /* In the real compile phase, just check the workspace used by the forward
2157      reference list. */
2158    
2159      else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2160        {
2161        *errorcodeptr = ERR52;
2162        goto FAILED;
2163        }
2164    
2165    /* If in \Q...\E, check for the end; if not, we have a literal */    /* If in \Q...\E, check for the end; if not, we have a literal */
2166    
2167    if (inescq && c != 0)    if (inescq && c != 0)
# Line 1692  for (;; ptr++) Line 2176  for (;; ptr++)
2176        {        {
2177        if (previous_callout != NULL)        if (previous_callout != NULL)
2178          {          {
2179          complete_callout(previous_callout, ptr, cd);          if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2180              complete_callout(previous_callout, ptr, cd);
2181          previous_callout = NULL;          previous_callout = NULL;
2182          }          }
2183        if ((options & PCRE_AUTO_CALLOUT) != 0)        if ((options & PCRE_AUTO_CALLOUT) != 0)
# Line 1713  for (;; ptr++) Line 2198  for (;; ptr++)
2198    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2199         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
2200      {      {
2201      complete_callout(previous_callout, ptr, cd);      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2202          complete_callout(previous_callout, ptr, cd);
2203      previous_callout = NULL;      previous_callout = NULL;
2204      }      }
2205    
# Line 1724  for (;; ptr++) Line 2210  for (;; ptr++)
2210      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2211      if (c == '#')      if (c == '#')
2212        {        {
2213        while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break;        while (*(++ptr) != 0)
       if (*ptr != 0)  
2214          {          {
2215          ptr += cd->nllen - 1;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
         continue;  
2216          }          }
2217          if (*ptr != 0) continue;
2218    
2219        /* Else fall through to handle end of string */        /* Else fall through to handle end of string */
2220        c = 0;        c = 0;
2221        }        }
# Line 1745  for (;; ptr++) Line 2231  for (;; ptr++)
2231    
2232    switch(c)    switch(c)
2233      {      {
2234      /* The branch terminates at end of string, |, or ). */      /* ===================================================================*/
2235        case 0:                        /* The branch terminates at string end */
2236      case 0:      case '|':                      /* or | or ) */
     case '|':  
2237      case ')':      case ')':
2238      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2239      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2240      *codeptr = code;      *codeptr = code;
2241      *ptrptr = ptr;      *ptrptr = ptr;
2242        if (lengthptr != NULL)
2243          {
2244          *lengthptr += code - last_code;   /* To include callout length */
2245          DPRINTF((">> end branch\n"));
2246          }
2247      return TRUE;      return TRUE;
2248    
2249    
2250        /* ===================================================================*/
2251      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2252      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2253    
# Line 1784  for (;; ptr++) Line 2276  for (;; ptr++)
2276      *code++ = OP_ANY;      *code++ = OP_ANY;
2277      break;      break;
2278    
2279    
2280        /* ===================================================================*/
2281      /* Character classes. If the included characters are all < 256, we build a      /* Character classes. If the included characters are all < 256, we build a
2282      32-byte bitmap of the permitted characters, except in the special case      32-byte bitmap of the permitted characters, except in the special case
2283      where there is only one such character. For negated classes, we build the      where there is only one such character. For negated classes, we build the
# Line 1822  for (;; ptr++) Line 2316  for (;; ptr++)
2316        }        }
2317    
2318      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2319      of just a single character (as long as it's < 256). For higher valued UTF-8      of just a single character (as long as it's < 256). However, for higher
2320      characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
2321    
2322      class_charcount = 0;      class_charcount = 0;
2323      class_lastchar = -1;      class_lastchar = -1;
2324    
2325        /* Initialize the 32-char bit map to all zeros. We build the map in a
2326        temporary bit of memory, in case the class contains only 1 character (less
2327        than 256), because in that case the compiled code doesn't use the bit map.
2328        */
2329    
2330        memset(classbits, 0, 32 * sizeof(uschar));
2331    
2332  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2333      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2334      class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2335  #endif  #endif
2336    
     /* Initialize the 32-char bit map to all zeros. We have to build the  
     map in a temporary bit of store, in case the class contains only 1  
     character (< 256), because in that case the compiled code doesn't use the  
     bit map. */  
   
     memset(classbits, 0, 32 * sizeof(uschar));  
   
2337      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
2338      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. At the start of the
2339      through the regex checked the overall syntax, so we don't need to be very      loop, c contains the first byte of the character. */
     strict here. At the start of the loop, c contains the first byte of the  
     character. */  
2340    
2341      do      if (c != 0) do
2342        {        {
2343          const uschar *oldptr;
2344    
2345  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2346        if (utf8 && c > 127)        if (utf8 && c > 127)
2347          {                           /* Braces are required because the */          {                           /* Braces are required because the */
# Line 1859  for (;; ptr++) Line 2353  for (;; ptr++)
2353    
2354        if (inescq)        if (inescq)
2355          {          {
2356          if (c == '\\' && ptr[1] == 'E')          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2357            {            {
2358            inescq = FALSE;            inescq = FALSE;                   /* Reset literal state */
2359            ptr++;            ptr++;                            /* Skip the 'E' */
2360            continue;            continue;                         /* Carry on with next */
2361            }            }
2362          else goto LONE_SINGLE_CHARACTER;          goto CHECK_RANGE;                   /* Could be range if \E follows */
2363          }          }
2364    
2365        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
# Line 1956  for (;; ptr++) Line 2450  for (;; ptr++)
2450          }          }
2451    
2452        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
2453        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. The sequence \b is a special
2454        validity in the pre-compiling pass. The sequence \b is a special case.        case. Inside a class (and only there) it is treated as backspace.
2455        Inside a class (and only there) it is treated as backspace. Elsewhere        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2456        it marks a word boundary. Other escapes have preset maps ready to        to or into the one we are building. We assume they have more than one
       or into the one we are building. We assume they have more than one  
2457        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2458    
2459        if (c == '\\')        if (c == '\\')
2460          {          {
2461          c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2462            if (*errorcodeptr != 0) goto FAILED;
2463    
2464          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2465          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2466            else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2467          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
2468            {            {
2469            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == '\\' && ptr[2] == 'E')
# Line 1983  for (;; ptr++) Line 2478  for (;; ptr++)
2478            {            {
2479            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
2480            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
2481            switch (-c)  
2482              /* Save time by not doing this in the pre-compile phase. */
2483    
2484              if (lengthptr == NULL) switch (-c)
2485              {              {
2486              case ESC_d:              case ESC_d:
2487              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
# Line 2011  for (;; ptr++) Line 2509  for (;; ptr++)
2509              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2510              continue;              continue;
2511    
2512  #ifdef SUPPORT_UCP              case ESC_E: /* Perl ignores an orphan \E */
             case ESC_p:  
             case ESC_P:  
               {  
               BOOL negated;  
               int pdata;  
               int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);  
               if (ptype < 0) goto FAILED;  
               class_utf8 = TRUE;  
               *class_utf8data++ = ((-c == ESC_p) != negated)?  
                 XCL_PROP : XCL_NOTPROP;  
               *class_utf8data++ = ptype;  
               *class_utf8data++ = pdata;  
               class_charcount -= 2;   /* Not a < 256 character */  
               }  
2513              continue;              continue;
 #endif  
2514    
2515              /* Unrecognized escapes are faulted if PCRE is running in its              default:    /* Not recognized; fall through */
2516              strict mode. By default, for compatibility with Perl, they are              break;      /* Need "default" setting to stop compiler warning. */
2517              treated as literals. */              }
2518    
2519              default:            /* In the pre-compile phase, just do the recognition. */
2520              if ((options & PCRE_EXTRA) != 0)  
2521                {            else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2522                *errorcodeptr = ERR7;                     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2523                goto FAILED;  
2524                }            /* We need to deal with \P and \p in both phases. */
2525              c = *ptr;              /* The final character */  
2526              class_charcount -= 2;  /* Undo the default count from above */  #ifdef SUPPORT_UCP
2527              if (-c == ESC_p || -c == ESC_P)
2528                {
2529                BOOL negated;
2530                int pdata;
2531                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2532                if (ptype < 0) goto FAILED;
2533                class_utf8 = TRUE;
2534                *class_utf8data++ = ((-c == ESC_p) != negated)?
2535                  XCL_PROP : XCL_NOTPROP;
2536                *class_utf8data++ = ptype;
2537                *class_utf8data++ = pdata;
2538                class_charcount -= 2;   /* Not a < 256 character */
2539                continue;
2540              }              }
2541    #endif
2542              /* Unrecognized escapes are faulted if PCRE is running in its
2543              strict mode. By default, for compatibility with Perl, they are
2544              treated as literals. */
2545    
2546              if ((options & PCRE_EXTRA) != 0)
2547                {
2548                *errorcodeptr = ERR7;
2549                goto FAILED;
2550                }
2551    
2552              class_charcount -= 2;  /* Undo the default count from above */
2553              c = *ptr;              /* Get the final character and fall through */
2554            }            }
2555    
2556          /* Fall through if we have a single character (c >= 0). This may be          /* Fall through if we have a single character (c >= 0). This may be
2557          > 256 in UTF-8 mode. */          greater than 256 in UTF-8 mode. */
2558    
2559          }   /* End of backslash handling */          }   /* End of backslash handling */
2560    
2561        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
2562        Perl does not permit ']' to be the end of the range. A '-' character        Perl does not permit ']' to be the end of the range. A '-' character
2563        here is treated as a literal. */        at the end is treated as a literal. Perl ignores orphaned \E sequences
2564          entirely. The code for handling \Q and \E is messy. */
2565    
2566          CHECK_RANGE:
2567          while (ptr[1] == '\\' && ptr[2] == 'E')
2568            {
2569            inescq = FALSE;
2570            ptr += 2;
2571            }
2572    
2573          oldptr = ptr;
2574    
2575        if (ptr[1] == '-' && ptr[2] != ']')        if (!inescq && ptr[1] == '-')
2576          {          {
2577          int d;          int d;
2578          ptr += 2;          ptr += 2;
2579            while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2580    
2581            /* If we hit \Q (not followed by \E) at this point, go into escaped
2582            mode. */
2583    
2584            while (*ptr == '\\' && ptr[1] == 'Q')
2585              {
2586              ptr += 2;
2587              if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2588              inescq = TRUE;
2589              break;
2590              }
2591    
2592            if (*ptr == 0 || (!inescq && *ptr == ']'))
2593              {
2594              ptr = oldptr;
2595              goto LONE_SINGLE_CHARACTER;
2596              }
2597    
2598  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2599          if (utf8)          if (utf8)
# Line 2071  for (;; ptr++) Line 2608  for (;; ptr++)
2608          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2609          in such circumstances. */          in such circumstances. */
2610    
2611          if (d == '\\')          if (!inescq && d == '\\')
2612            {            {
2613            const uschar *oldptr = ptr;            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2614            d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);            if (*errorcodeptr != 0) goto FAILED;
2615    
2616            /* \b is backspace; \X is literal X; any other special means the '-'            /* \b is backspace; \X is literal X; \R is literal R; any other
2617            was literal */            special means the '-' was literal */
2618    
2619            if (d < 0)            if (d < 0)
2620              {              {
2621              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = '\b';
2622              else if (d == -ESC_X) d = 'X'; else              else if (d == -ESC_X) d = 'X';
2623                else if (d == -ESC_R) d = 'R'; else
2624                {                {
2625                ptr = oldptr - 2;                ptr = oldptr;
2626                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2627                }                }
2628              }              }
2629            }            }
2630    
2631          /* The check that the two values are in the correct order happens in          /* Check that the two values are in the correct order. Optimize
2632          the pre-pass. Optimize one-character ranges */          one-character ranges */
2633    
2634            if (d < c)
2635              {
2636              *errorcodeptr = ERR8;
2637              goto FAILED;
2638              }
2639    
2640          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2641    
# Line 2112  for (;; ptr++) Line 2656  for (;; ptr++)
2656  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2657            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
2658              {              {
2659              int occ, ocd;              unsigned int occ, ocd;
2660              int cc = c;              unsigned int cc = c;
2661              int origd = d;              unsigned int origd = d;
2662              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
2663                {                {
2664                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */
# Line 2172  for (;; ptr++) Line 2716  for (;; ptr++)
2716          ranges that lie entirely within 0-127 when there is UCP support; else          ranges that lie entirely within 0-127 when there is UCP support; else
2717          for partial ranges without UCP support. */          for partial ranges without UCP support. */
2718    
2719          for (; c <= d; c++)          class_charcount += d - c + 1;
2720            class_lastchar = d;
2721    
2722            /* We can save a bit of time by skipping this in the pre-compile. */
2723    
2724            if (lengthptr == NULL) for (; c <= d; c++)
2725            {            {
2726            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
2727            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
# Line 2180  for (;; ptr++) Line 2729  for (;; ptr++)
2729              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
2730              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
2731              }              }
           class_charcount++;                /* in case a one-char range */  
           class_lastchar = c;  
2732            }            }
2733    
2734          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
# Line 2205  for (;; ptr++) Line 2752  for (;; ptr++)
2752  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2753          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
2754            {            {
2755            int othercase;            unsigned int othercase;
2756            if ((othercase = _pcre_ucp_othercase(c)) >= 0)            if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
2757              {              {
2758              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
2759              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 2231  for (;; ptr++) Line 2778  for (;; ptr++)
2778          }          }
2779        }        }
2780    
2781      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached. This "while" is the end of the "do" above. */
     loop. This "while" is the end of the "do" above. */  
2782    
2783      while ((c = *(++ptr)) != ']' || inescq);      while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
2784    
2785        if (c == 0)                          /* Missing terminating ']' */
2786          {
2787          *errorcodeptr = ERR6;
2788          goto FAILED;
2789          }
2790    
2791      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
2792      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
# Line 2298  for (;; ptr++) Line 2850  for (;; ptr++)
2850    
2851      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
2852      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode. If there are no characters < 256,
2853      we can omit the bitmap. */      we can omit the bitmap in the actual compiled code. */
2854    
2855  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2856      if (class_utf8)      if (class_utf8)
# Line 2308  for (;; ptr++) Line 2860  for (;; ptr++)
2860        code += LINK_SIZE;        code += LINK_SIZE;
2861        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT : 0;
2862    
2863        /* If the map is required, install it, and move on to the end of        /* If the map is required, move up the extra data to make room for it;
2864        the extra data */        otherwise just move the code pointer to the end of the extra data. */
2865    
2866        if (class_charcount > 0)        if (class_charcount > 0)
2867          {          {
2868          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
2869            memmove(code + 32, code, class_utf8data - code);
2870          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
2871          code = class_utf8data;          code = class_utf8data + 32;
         }  
   
       /* If the map is not required, slide down the extra data. */  
   
       else  
         {  
         int len = class_utf8data - (code + 33);  
         memmove(code + 1, code + 33, len);  
         code += len + 1;  
2872          }          }
2873          else code = class_utf8data;
2874    
2875        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
2876    
# Line 2342  for (;; ptr++) Line 2887  for (;; ptr++)
2887      if (negate_class)      if (negate_class)
2888        {        {
2889        *code++ = OP_NCLASS;        *code++ = OP_NCLASS;
2890        for (c = 0; c < 32; c++) code[c] = ~classbits[c];        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
2891            for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2892        }        }
2893      else      else
2894        {        {
# Line 2352  for (;; ptr++) Line 2898  for (;; ptr++)
2898      code += 32;      code += 32;
2899      break;      break;
2900    
2901    
2902        /* ===================================================================*/
2903      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2904      has been tested above. */      has been tested above. */
2905    
# Line 2419  for (;; ptr++) Line 2967  for (;; ptr++)
2967        }        }
2968      else repeat_type = greedy_default;      else repeat_type = greedy_default;
2969    
     /* If previous was a recursion, we need to wrap it inside brackets so that  
     it can be replicated if necessary. */  
   
     if (*previous == OP_RECURSE)  
       {  
       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);  
       code += 1 + LINK_SIZE;  
       *previous = OP_BRA;  
       PUT(previous, 1, code - previous);  
       *code = OP_KET;  
       PUT(code, 1, code - previous);  
       code += 1 + LINK_SIZE;  
       }  
   
2970      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
2971      repeat item instead. If a char item has a minumum of more than one, ensure      repeat item instead. If a char item has a minumum of more than one, ensure
2972      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
# Line 2466  for (;; ptr++) Line 3000  for (;; ptr++)
3000          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3001          }          }
3002    
3003          /* If the repetition is unlimited, it pays to see if the next thing on
3004          the line is something that cannot possibly match this character. If so,
3005          automatically possessifying this item gains some performance in the case
3006          where the match fails. */
3007    
3008          if (!possessive_quantifier &&
3009              repeat_max < 0 &&
3010              check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3011                options, cd))
3012            {
3013            repeat_type = 0;    /* Force greedy */
3014            possessive_quantifier = TRUE;
3015            }
3016    
3017        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3018        }        }
3019    
3020      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
3021      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
3022      character repeats by setting opt_type to add a suitable offset into      character repeats by setting opt_type to add a suitable offset into
3023      repeat_type. OP_NOT is currently used only for single-byte chars. */      repeat_type. We can also test for auto-possessification. OP_NOT is
3024        currently used only for single-byte chars. */
3025    
3026      else if (*previous == OP_NOT)      else if (*previous == OP_NOT)
3027        {        {
3028        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3029        c = previous[1];        c = previous[1];
3030          if (!possessive_quantifier &&
3031              repeat_max < 0 &&
3032              check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3033            {
3034            repeat_type = 0;    /* Force greedy */
3035            possessive_quantifier = TRUE;
3036            }
3037        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
3038        }        }
3039    
# Line 2495  for (;; ptr++) Line 3051  for (;; ptr++)
3051        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3052        c = *previous;        c = *previous;
3053    
3054          if (!possessive_quantifier &&
3055              repeat_max < 0 &&
3056              check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3057            {
3058            repeat_type = 0;    /* Force greedy */
3059            possessive_quantifier = TRUE;
3060            }
3061    
3062        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
3063        if (*previous == OP_PROP || *previous == OP_NOTPROP)        if (*previous == OP_PROP || *previous == OP_NOTPROP)
3064          {          {
# Line 2535  for (;; ptr++) Line 3099  for (;; ptr++)
3099          }          }
3100    
3101        /* A repeat minimum of 1 is optimized into some special cases. If the        /* A repeat minimum of 1 is optimized into some special cases. If the
3102        maximum is unlimited, we use OP_PLUS. Otherwise, the original item it        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3103        left in place and, if the maximum is greater than 1, we use OP_UPTO with        left in place and, if the maximum is greater than 1, we use OP_UPTO with
3104        one less than the maximum. */        one less than the maximum. */
3105    
# Line 2588  for (;; ptr++) Line 3152  for (;; ptr++)
3152            }            }
3153    
3154          /* Else insert an UPTO if the max is greater than the min, again          /* Else insert an UPTO if the max is greater than the min, again
3155          preceded by the character, for the previously inserted code. */          preceded by the character, for the previously inserted code. If the
3156            UPTO is just for 1 instance, we can use QUERY instead. */
3157    
3158          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
3159            {            {
# Line 2607  for (;; ptr++) Line 3172  for (;; ptr++)
3172              *code++ = prop_value;              *code++ = prop_value;
3173              }              }
3174            repeat_max -= repeat_min;            repeat_max -= repeat_min;
3175            *code++ = OP_UPTO + repeat_type;  
3176            PUT2INC(code, 0, repeat_max);            if (repeat_max == 1)
3177                {
3178                *code++ = OP_QUERY + repeat_type;
3179                }
3180              else
3181                {
3182                *code++ = OP_UPTO + repeat_type;
3183                PUT2INC(code, 0, repeat_max);
3184                }
3185            }            }
3186          }          }
3187    
# Line 2675  for (;; ptr++) Line 3248  for (;; ptr++)
3248      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
3249      cases. */      cases. */
3250    
3251      else if (*previous >= OP_BRA || *previous == OP_ONCE ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3252               *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
3253        {        {
3254        register int i;        register int i;
3255        int ketoffset = 0;        int ketoffset = 0;
3256        int len = code - previous;        int len = code - previous;
3257        uschar *bralink = NULL;        uschar *bralink = NULL;
3258    
3259          /* Repeating a DEFINE group is pointless */
3260    
3261          if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3262            {
3263            *errorcodeptr = ERR55;
3264            goto FAILED;
3265            }
3266    
3267          /* This is a paranoid check to stop integer overflow later on */
3268    
3269          if (len > MAX_DUPLENGTH)
3270            {
3271            *errorcodeptr = ERR50;
3272            goto FAILED;
3273            }
3274    
3275        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3276        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3277        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 2717  for (;; ptr++) Line 3306  for (;; ptr++)
3306          /* If the maximum is 1 or unlimited, we just have to stick in the          /* If the maximum is 1 or unlimited, we just have to stick in the
3307          BRAZERO and do no more at this point. However, we do need to adjust          BRAZERO and do no more at this point. However, we do need to adjust
3308          any OP_RECURSE calls inside the group that refer to the group itself or          any OP_RECURSE calls inside the group that refer to the group itself or
3309          any internal group, because the offset is from the start of the whole          any internal or forward referenced group, because the offset is from
3310          regex. Temporarily terminate the pattern while doing this. */          the start of the whole regex. Temporarily terminate the pattern while
3311            doing this. */
3312    
3313          if (repeat_max <= 1)          if (repeat_max <= 1)
3314            {            {
3315            *code = OP_END;            *code = OP_END;
3316            adjust_recurse(previous, 1, utf8, cd);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3317            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3318            code++;            code++;
3319            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2741  for (;; ptr++) Line 3331  for (;; ptr++)
3331            {            {
3332            int offset;            int offset;
3333            *code = OP_END;            *code = OP_END;
3334            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3335            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
3336            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
3337            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2761  for (;; ptr++) Line 3351  for (;; ptr++)
3351        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
3352        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
3353        copies that we need. If we set a first char from the group, and didn't        copies that we need. If we set a first char from the group, and didn't
3354        set a required char, copy the latter from the former. */        set a required char, copy the latter from the former. If there are any
3355          forward reference subroutine calls in the group, there will be entries on
3356          the workspace list; replicate these with an appropriate increment. */
3357    
3358        else        else
3359          {          {
3360          if (repeat_min > 1)          if (repeat_min > 1)
3361            {            {
3362            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;            /* In the pre-compile phase, we don't actually do the replication. We
3363            for (i = 1; i < repeat_min; i++)            just adjust the length as if we had. */
3364    
3365              if (lengthptr != NULL)
3366                *lengthptr += (repeat_min - 1)*length_prevgroup;
3367    
3368              /* This is compiling for real */
3369    
3370              else
3371              {              {
3372              memcpy(code, previous, len);              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3373              code += len;              for (i = 1; i < repeat_min; i++)
3374                  {
3375                  uschar *hc;
3376                  uschar *this_hwm = cd->hwm;
3377                  memcpy(code, previous, len);
3378                  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3379                    {
3380                    PUT(cd->hwm, 0, GET(hc, 0) + len);
3381                    cd->hwm += LINK_SIZE;
3382                    }
3383                  save_hwm = this_hwm;
3384                  code += len;
3385                  }
3386              }              }
3387            }            }
3388    
3389          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
3390          }          }
3391    
# Line 2781  for (;; ptr++) Line 3393  for (;; ptr++)
3393        the maximum is limited, it replicates the group in a nested fashion,        the maximum is limited, it replicates the group in a nested fashion,
3394        remembering the bracket starts on a stack. In the case of a zero minimum,        remembering the bracket starts on a stack. In the case of a zero minimum,
3395        the first one was set up above. In all cases the repeat_max now specifies        the first one was set up above. In all cases the repeat_max now specifies
3396        the number of additional copies needed. */        the number of additional copies needed. Again, we must remember to
3397          replicate entries on the forward reference list. */
3398    
3399        if (repeat_max >= 0)        if (repeat_max >= 0)
3400          {          {
3401          for (i = repeat_max - 1; i >= 0; i--)          /* In the pre-compile phase, we don't actually do the replication. We
3402            just adjust the length as if we had. For each repetition we must add 1
3403            to the length for BRAZERO and for all but the last repetition we must
3404            add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3405    
3406            if (lengthptr != NULL && repeat_max > 0)
3407              *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3408                2 - 2*LINK_SIZE;  /* Last one doesn't nest */
3409    
3410            /* This is compiling for real */
3411    
3412            else for (i = repeat_max - 1; i >= 0; i--)
3413            {            {
3414              uschar *hc;
3415              uschar *this_hwm = cd->hwm;
3416    
3417            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
3418    
3419            /* All but the final copy start a new nesting, maintaining the            /* All but the final copy start a new nesting, maintaining the
# Line 2802  for (;; ptr++) Line 3429  for (;; ptr++)
3429              }              }
3430    
3431            memcpy(code, previous, len);            memcpy(code, previous, len);
3432              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3433                {
3434                PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3435                cd->hwm += LINK_SIZE;
3436                }
3437              save_hwm = this_hwm;
3438            code += len;            code += len;
3439            }            }
3440    
# Line 2824  for (;; ptr++) Line 3457  for (;; ptr++)
3457        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
3458        can't just offset backwards from the current code point, because we        can't just offset backwards from the current code point, because we
3459        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
3460        correct offset was computed above. */        correct offset was computed above.
3461    
3462          Then, when we are doing the actual compile phase, check to see whether
3463          this group is a non-atomic one that could match an empty string. If so,
3464          convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3465          that runtime checking can be done. [This check is also applied to
3466          atomic groups at runtime, but in a different way.] */
3467    
3468        else code[-ketoffset] = OP_KETRMAX + repeat_type;        else
3469            {
3470            uschar *ketcode = code - ketoffset;
3471            uschar *bracode = ketcode - GET(ketcode, 1);
3472            *ketcode = OP_KETRMAX + repeat_type;
3473            if (lengthptr == NULL && *bracode != OP_ONCE)
3474              {
3475              uschar *scode = bracode;
3476              do
3477                {
3478                if (could_be_empty_branch(scode, ketcode, utf8))
3479                  {
3480                  *bracode += OP_SBRA - OP_BRA;
3481                  break;
3482                  }
3483                scode += GET(scode, 1);
3484                }
3485              while (*scode == OP_ALT);
3486              }
3487            }
3488        }        }
3489    
3490      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 2837  for (;; ptr++) Line 3495  for (;; ptr++)
3495        goto FAILED;        goto FAILED;
3496        }        }
3497    
3498      /* If the character following a repeat is '+', we wrap the entire repeated      /* If the character following a repeat is '+', or if certain optimization
3499      item inside OP_ONCE brackets. This is just syntactic sugar, taken from      tests above succeeded, possessive_quantifier is TRUE. For some of the
3500      Sun's Java package. The repeated item starts at tempcode, not at previous,      simpler opcodes, there is a special alternative opcode for this. For
3501      which might be the first part of a string whose (former) last char we      anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3502      repeated. However, we don't support '+' after a greediness '?'. */      The '+' notation is just syntactic sugar, taken from Sun's Java package,
3503        but the special opcodes can optimize it a bit. The repeated item starts at
3504        tempcode, not at previous, which might be the first part of a string whose
3505        (former) last char we repeated.
3506    
3507        Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3508        an 'upto' may follow. We skip over an 'exact' item, and then test the
3509        length of what remains before proceeding. */
3510    
3511      if (possessive_quantifier)      if (possessive_quantifier)
3512        {        {
3513        int len = code - tempcode;        int len;
3514        memmove(tempcode + 1+LINK_SIZE, tempcode, len);        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3515        code += 1 + LINK_SIZE;            *tempcode == OP_NOTEXACT)
3516        len += 1 + LINK_SIZE;          tempcode += _pcre_OP_lengths[*tempcode];
3517        tempcode[0] = OP_ONCE;        len = code - tempcode;
3518        *code++ = OP_KET;        if (len > 0) switch (*tempcode)
3519        PUTINC(code, 0, len);          {
3520        PUT(tempcode, 1, len);          case OP_STAR:  *tempcode = OP_POSSTAR; break;
3521            case OP_PLUS:  *tempcode = OP_POSPLUS; break;
3522            case OP_QUERY: *tempcode = OP_POSQUERY; break;
3523            case OP_UPTO:  *tempcode = OP_POSUPTO; break;
3524    
3525            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
3526            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
3527            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3528            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
3529    
3530            case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
3531            case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
3532            case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3533            case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
3534    
3535            default:
3536            memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3537            code += 1 + LINK_SIZE;
3538            len += 1 + LINK_SIZE;
3539            tempcode[0] = OP_ONCE;
3540            *code++ = OP_KET;
3541            PUTINC(code, 0, len);
3542            PUT(tempcode, 1, len);
3543            break;
3544            }
3545        }        }
3546    
3547      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 2865  for (;; ptr++) Line 3554  for (;; ptr++)
3554      break;      break;
3555    
3556    
3557      /* Start of nested bracket sub-expression, or comment or lookahead or      /* ===================================================================*/
3558      lookbehind or option setting or condition. First deal with special things      /* Start of nested parenthesized sub-expression, or comment or lookahead or
3559      that can come after a bracket; all are introduced by ?, and the appearance      lookbehind or option setting or condition or all the other extended
3560      of any of them means that this is not a referencing group. They were      parenthesis forms. First deal with the specials; all are introduced by ?,
3561      checked for validity in the first pass over the string, so we don't have to      and the appearance of any of them means that this is not a capturing
3562      check for syntax errors here.  */      group. */
3563    
3564      case '(':      case '(':
3565      newoptions = options;      newoptions = options;
3566      skipbytes = 0;      skipbytes = 0;
3567        bravalue = OP_CBRA;
3568        save_hwm = cd->hwm;
3569    
3570      if (*(++ptr) == '?')      if (*(++ptr) == '?')
3571        {        {
3572        int set, unset;        int i, set, unset, namelen;
3573        int *optset;        int *optset;
3574          const uschar *name;
3575          uschar *slot;
3576    
3577        switch (*(++ptr))        switch (*(++ptr))
3578          {          {
3579          case '#':                 /* Comment; skip to ket */          case '#':                 /* Comment; skip to ket */
3580          ptr++;          ptr++;
3581          while (*ptr != ')') ptr++;          while (*ptr != 0 && *ptr != ')') ptr++;
3582            if (*ptr == 0)
3583              {
3584              *errorcodeptr = ERR18;
3585              goto FAILED;
3586              }
3587          continue;          continue;
3588    
3589          case ':':                 /* Non-extracting bracket */  
3590            /* ------------------------------------------------------------ */
3591            case ':':                 /* Non-capturing bracket */
3592          bravalue = OP_BRA;          bravalue = OP_BRA;
3593          ptr++;          ptr++;
3594          break;          break;
3595    
3596    
3597            /* ------------------------------------------------------------ */
3598          case '(':          case '(':
3599          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
3600    
3601          /* A condition can be a number, referring to a numbered group, a name,          /* A condition can be an assertion, a number (referring to a numbered
3602          referring to a named group, 'R', referring to recursion, or an          group), a name (referring to a named group), or 'R', referring to
3603          assertion. There are two unfortunate ambiguities, caused by history.          recursion. R<digits> and R&name are also permitted for recursion tests.
3604          (a) 'R' can be the recursive thing or the name 'R', and (b) a number  
3605          could be a name that consists of digits. In both cases, we look for a          There are several syntaxes for testing a named group: (?(name)) is used
3606          name first; if not found, we try the other cases. If the first          by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3607          character after (?( is a word character, we know the rest up to ) will  
3608          also be word characters because the syntax was checked in the first          There are two unfortunate ambiguities, caused by history. (a) 'R' can
3609          pass. */          be the recursive thing or the name 'R' (and similarly for 'R' followed
3610            by digits), and (b) a number could be a name that consists of digits.
3611          if ((cd->ctypes[ptr[1]] & ctype_word) != 0)          In both cases, we look for a name first; if not found, we try the other
3612            {          cases. */
3613            int i, namelen;  
3614            int condref = 0;          /* For conditions that are assertions, check the syntax, and then exit
3615            const uschar *name;          the switch. This will take control down to where bracketed groups,
3616            uschar *slot = cd->name_table;          including assertions, are processed. */
3617    
3618            /* This is needed for all successful cases. */          if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3619              break;
3620    
3621            skipbytes = 3;          /* Most other conditions use OP_CREF (a couple change to OP_RREF
3622            below), and all need to skip 3 bytes at the start of the group. */
3623    
3624            /* Read the name, but also get it as a number if it's all digits */          code[1+LINK_SIZE] = OP_CREF;
3625            skipbytes = 3;
3626            refsign = -1;
3627    
3628            name = ++ptr;          /* Check for a test for recursion in a named group. */
3629            while (*ptr != ')')  
3630              {          if (ptr[1] == 'R' && ptr[2] == '&')
3631              if (condref >= 0)            {
3632                condref = ((digitab[*ptr] & ctype_digit) != 0)?            terminator = -1;
3633                  condref * 10 + *ptr - '0' : -1;            ptr += 2;
3634              ptr++;            code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
3635              }            }
3636            namelen = ptr - name;  
3637            /* Check for a test for a named group's having been set, using the Perl
3638            syntax (?(<name>) or (?('name') */
3639    
3640            else if (ptr[1] == '<')
3641              {
3642              terminator = '>';
3643            ptr++;            ptr++;
3644              }
3645            else if (ptr[1] == '\'')
3646              {
3647              terminator = '\'';
3648              ptr++;
3649              }
3650            else
3651              {
3652              terminator = 0;
3653              if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3654              }
3655    
3656            for (i = 0; i < cd->names_found; i++)          /* We now expect to read a name; anything else is an error */
3657              {  
3658              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;          if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3659              slot += cd->name_entry_size;            {
3660              }            ptr += 1;  /* To get the right offset */
3661              *errorcodeptr = ERR28;
3662              goto FAILED;
3663              }
3664    
3665            /* Found a previous named subpattern */          /* Read the name, but also get it as a number if it's all digits */
3666    
3667            if (i < cd->names_found)          recno = 0;
3668              {          name = ++ptr;
3669              condref = GET2(slot, 0);          while ((cd->ctypes[*ptr] & ctype_word) != 0)
3670              code[1+LINK_SIZE] = OP_CREF;            {
3671              PUT2(code, 2+LINK_SIZE, condref);            if (recno >= 0)
3672              }              recno = ((digitab[*ptr] & ctype_digit) != 0)?
3673                  recno * 10 + *ptr - '0' : -1;
3674              ptr++;
3675              }
3676            namelen = ptr - name;
3677    
3678            /* Search the pattern for a forward reference */          if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3679              {
3680              ptr--;      /* Error offset */
3681              *errorcodeptr = ERR26;
3682              goto FAILED;
3683              }
3684    
3685            else if ((i = find_named_parens(ptr, *brackets, name, namelen)) > 0)          /* Do no further checking in the pre-compile phase. */
             {  
             code[1+LINK_SIZE] = OP_CREF;  
             PUT2(code, 2+LINK_SIZE, i);  
             }  
3686    
3687            /* Check for 'R' for recursion */          if (lengthptr != NULL) break;
3688    
3689            else if (namelen == 1 && *name == 'R')          /* In the real compile we do the work of looking for the actual
3690            reference. If the string started with "+" or "-" we require the rest to
3691            be digits, in which case recno will be set. */
3692    
3693            if (refsign > 0)
3694              {
3695              if (recno <= 0)
3696              {              {
3697              code[1+LINK_SIZE] = OP_CREF;              *errorcodeptr = ERR58;
3698              PUT2(code, 2+LINK_SIZE, CREF_RECURSE);              goto FAILED;
3699                }
3700              if (refsign == '-')
3701                {
3702                recno = cd->bracount - recno + 1;
3703                if (recno <= 0)
3704                  {
3705                  *errorcodeptr = ERR15;
3706                  goto FAILED;
3707                  }
3708              }              }
3709              else recno += cd->bracount;
3710              PUT2(code, 2+LINK_SIZE, recno);
3711              break;
3712              }
3713    
3714            /* Check for a subpattern number */          /* Otherwise (did not start with "+" or "-"), start by looking for the
3715            name. */
3716    
3717            slot = cd->name_table;
3718            for (i = 0; i < cd->names_found; i++)
3719              {
3720              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3721              slot += cd->name_entry_size;
3722              }
3723    
3724            else if (condref > 0)          /* Found a previous named subpattern */
             {  
             code[1+LINK_SIZE] = OP_CREF;  
             PUT2(code, 2+LINK_SIZE, condref);  
             }  
3725    
3726            /* Either an unidentified subpattern, or a reference to (?(0) */          if (i < cd->names_found)
3727              {
3728              recno = GET2(slot, 0);
3729              PUT2(code, 2+LINK_SIZE, recno);
3730              }
3731    
3732            else          /* Search the pattern for a forward reference */
3733    
3734            else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3735                            (options & PCRE_EXTENDED) != 0)) > 0)
3736              {
3737              PUT2(code, 2+LINK_SIZE, i);
3738              }
3739    
3740            /* If terminator == 0 it means that the name followed directly after
3741            the opening parenthesis [e.g. (?(abc)...] and in this case there are
3742            some further alternatives to try. For the cases where terminator != 0
3743            [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
3744            now checked all the possibilities, so give an error. */
3745    
3746            else if (terminator != 0)
3747              {
3748              *errorcodeptr = ERR15;
3749              goto FAILED;
3750              }
3751    
3752            /* Check for (?(R) for recursion. Allow digits after R to specify a
3753            specific group number. */
3754    
3755            else if (*name == 'R')
3756              {
3757              recno = 0;
3758              for (i = 1; i < namelen; i++)
3759              {              {
3760              *errorcodeptr = (condref == 0)? ERR35: ERR15;              if ((digitab[name[i]] & ctype_digit) == 0)
3761              goto FAILED;                {
3762                  *errorcodeptr = ERR15;
3763                  goto FAILED;
3764                  }
3765                recno = recno * 10 + name[i] - '0';
3766              }              }
3767              if (recno == 0) recno = RREF_ANY;
3768              code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
3769              PUT2(code, 2+LINK_SIZE, recno);
3770              }
3771    
3772            /* Similarly, check for the (?(DEFINE) "condition", which is always
3773            false. */
3774    
3775            else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
3776              {
3777              code[1+LINK_SIZE] = OP_DEF;
3778              skipbytes = 1;
3779            }            }
3780    
3781          /* For conditions that are assertions, we just fall through, having          /* Check for the "name" actually being a subpattern number. */
         set bravalue above. */  
3782    
3783            else if (recno > 0)
3784              {
3785              PUT2(code, 2+LINK_SIZE, recno);
3786              }
3787    
3788            /* Either an unidentified subpattern, or a reference to (?(0) */
3789    
3790            else
3791              {
3792              *errorcodeptr = (recno == 0)? ERR35: ERR15;
3793              goto FAILED;
3794              }
3795          break;          break;
3796    
3797    
3798            /* ------------------------------------------------------------ */
3799          case '=':                 /* Positive lookahead */          case '=':                 /* Positive lookahead */
3800          bravalue = OP_ASSERT;          bravalue = OP_ASSERT;
3801          ptr++;          ptr++;
3802          break;          break;
3803    
3804    
3805            /* ------------------------------------------------------------ */
3806          case '!':                 /* Negative lookahead */          case '!':                 /* Negative lookahead */
3807          bravalue = OP_ASSERT_NOT;          bravalue = OP_ASSERT_NOT;
3808          ptr++;          ptr++;
3809          break;          break;
3810    
3811          case '<':                 /* Lookbehinds */  
3812          switch (*(++ptr))          /* ------------------------------------------------------------ */
3813            case '<':                 /* Lookbehind or named define */
3814            switch (ptr[1])
3815            {            {
3816            case '=':               /* Positive lookbehind */            case '=':               /* Positive lookbehind */
3817            bravalue = OP_ASSERTBACK;            bravalue = OP_ASSERTBACK;
3818            ptr++;            ptr += 2;
3819            break;            break;
3820    
3821            case '!':               /* Negative lookbehind */            case '!':               /* Negative lookbehind */
3822            bravalue = OP_ASSERTBACK_NOT;            bravalue = OP_ASSERTBACK_NOT;
3823            ptr++;            ptr += 2;
3824            break;            break;
3825    
3826              default:                /* Could be name define, else bad */
3827              if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
3828              ptr++;                  /* Correct offset for error */
3829              *errorcodeptr = ERR24;
3830              goto FAILED;
3831            }            }
3832          break;          break;
3833    
3834    
3835            /* ------------------------------------------------------------ */
3836          case '>':                 /* One-time brackets */          case '>':                 /* One-time brackets */
3837          bravalue = OP_ONCE;          bravalue = OP_ONCE;
3838          ptr++;          ptr++;
3839          break;          break;
3840    
3841    
3842            /* ------------------------------------------------------------ */
3843          case 'C':                 /* Callout - may be followed by digits; */          case 'C':                 /* Callout - may be followed by digits; */
3844          previous_callout = code;  /* Save for later completion */          previous_callout = code;  /* Save for later completion */
3845          after_manual_callout = 1; /* Skip one item before completing */          after_manual_callout = 1; /* Skip one item before completing */
3846          *code++ = OP_CALLOUT;     /* Already checked that the terminating */          *code++ = OP_CALLOUT;
3847            {                       /* closing parenthesis is present. */            {
3848            int n = 0;            int n = 0;
3849            while ((digitab[*(++ptr)] & ctype_digit) != 0)            while ((digitab[*(++ptr)] & ctype_digit) != 0)
3850              n = n * 10 + *ptr - '0';              n = n * 10 + *ptr - '0';
3851              if (*ptr != ')')
3852                {
3853                *errorcodeptr = ERR39;
3854                goto FAILED;
3855                }
3856            if (n > 255)            if (n > 255)
3857              {              {
3858              *errorcodeptr = ERR38;              *errorcodeptr = ERR38;
# Line 3034  for (;; ptr++) Line 3866  for (;; ptr++)
3866          previous = NULL;          previous = NULL;
3867          continue;          continue;
3868    
3869          case 'P':                 /* Named subpattern handling */  
3870          if (*(++ptr) == '<')      /* Definition */          /* ------------------------------------------------------------ */
3871            case 'P':                 /* Python-style named subpattern handling */
3872            if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
3873              {
3874              is_recurse = *ptr == '>';
3875              terminator = ')';
3876              goto NAMED_REF_OR_RECURSE;
3877              }
3878            else if (*ptr != '<')    /* Test for Python-style definition */
3879              {
3880              *errorcodeptr = ERR41;
3881              goto FAILED;
3882              }
3883            /* Fall through to handle (?P< as (?< is handled */
3884    
3885    
3886            /* ------------------------------------------------------------ */
3887            DEFINE_NAME:    /* Come here from (?< handling */
3888            case '\'':
3889            {            {
3890            int i, namelen;            terminator = (*ptr == '<')? '>' : '\'';
3891            uschar *slot = cd->name_table;            name = ++ptr;
3892            const uschar *name;     /* Don't amalgamate; some compilers */  
3893            name = ++ptr;           /* grumble at autoincrement in declaration */            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3894              namelen = ptr - name;
3895    
3896            while (*ptr++ != '>');            /* In the pre-compile phase, just do a syntax check. */
           namelen = ptr - name - 1;  
3897    
3898            for (i = 0; i < cd->names_found; i++)            if (lengthptr != NULL)
3899                {
3900                if (*ptr != terminator)
3901                  {
3902                  *errorcodeptr = ERR42;
3903                  goto FAILED;
3904                  }
3905                if (cd->names_found >= MAX_NAME_COUNT)
3906                  {
3907                  *errorcodeptr = ERR49;
3908                  goto FAILED;
3909                  }
3910                if (namelen + 3 > cd->name_entry_size)
3911                  {
3912                  cd->name_entry_size = namelen + 3;
3913                  if (namelen > MAX_NAME_SIZE)
3914                    {
3915                    *errorcodeptr = ERR48;
3916                    goto FAILED;
3917                    }
3918                  }
3919                }
3920    
3921              /* In the real compile, create the entry in the table */
3922    
3923              else
3924              {              {
3925              int crc = memcmp(name, slot+2, namelen);              slot = cd->name_table;
3926              if (crc == 0)              for (i = 0; i < cd->names_found; i++)
3927                {                {
3928                if (slot[2+namelen] == 0)                int crc = memcmp(name, slot+2, namelen);
3929                  if (crc == 0)
3930                  {                  {
3931                  if ((options & PCRE_DUPNAMES) == 0)                  if (slot[2+namelen] == 0)
3932                    {                    {
3933                    *errorcodeptr = ERR43;                    if ((options & PCRE_DUPNAMES) == 0)
3934                    goto FAILED;                      {
3935                        *errorcodeptr = ERR43;
3936                        goto FAILED;
3937                        }
3938                    }                    }
3939                    else crc = -1;      /* Current name is substring */
3940                  }                  }
3941                else crc = -1;      /* Current name is substring */                if (crc < 0)
3942                }                  {
3943              if (crc < 0)                  memmove(slot + cd->name_entry_size, slot,
3944                {                    (cd->names_found - i) * cd->name_entry_size);
3945                memmove(slot + cd->name_entry_size, slot,                  break;
3946                  (cd->names_found - i) * cd->name_entry_size);                  }
3947                break;                slot += cd->name_entry_size;
3948                }                }
             slot += cd->name_entry_size;  
             }  
3949    
3950            PUT2(slot, 0, *brackets + 1);              PUT2(slot, 0, cd->bracount + 1);
3951            memcpy(slot + 2, name, namelen);              memcpy(slot + 2, name, namelen);
3952            slot[2+namelen] = 0;              slot[2+namelen] = 0;
3953            cd->names_found++;              }
           goto NUMBERED_GROUP;  
3954            }            }
3955    
3956          if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */          /* In both cases, count the number of names we've encountered. */
           {  
           int i, namelen;  
           int type = *ptr++;  
           const uschar *name = ptr;  
           uschar *slot = cd->name_table;  
3957    
3958            while (*ptr != ')') ptr++;          ptr++;                    /* Move past > or ' */
3959            namelen = ptr - name;          cd->names_found++;
3960            goto NUMBERED_GROUP;
3961    
           for (i = 0; i < cd->names_found; i++)  
             {  
             if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;  
             slot += cd->name_entry_size;  
             }  
3962    
3963            if (i < cd->names_found)         /* Back reference */          /* ------------------------------------------------------------ */
3964            case '&':                 /* Perl recursion/subroutine syntax */
3965            terminator = ')';
3966            is_recurse = TRUE;
3967            /* Fall through */
3968    
3969            /* We come here from the Python syntax above that handles both
3970            references (?P=name) and recursion (?P>name), as well as falling
3971            through from the Perl recursion syntax (?&name). */
3972    
3973            NAMED_REF_OR_RECURSE:
3974            name = ++ptr;
3975            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3976            namelen = ptr - name;
3977    
3978            /* In the pre-compile phase, do a syntax check and set a dummy
3979            reference number. */
3980    
3981            if (lengthptr != NULL)
3982              {
3983              if (*ptr != terminator)
3984                {
3985                *errorcodeptr = ERR42;
3986                goto FAILED;
3987                }
3988              if (namelen > MAX_NAME_SIZE)
3989                {
3990                *errorcodeptr = ERR48;
3991                goto FAILED;
3992                }
3993              recno = 0;
3994              }
3995    
3996            /* In the real compile, seek the name in the table */
3997    
3998            else
3999              {
4000              slot = cd->name_table;
4001              for (i = 0; i < cd->names_found; i++)
4002                {
4003                if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4004                slot += cd->name_entry_size;
4005                }
4006    
4007              if (i < cd->names_found)         /* Back reference */
4008              {              {
4009              recno = GET2(slot, 0);              recno = GET2(slot, 0);
4010              }              }
4011            else if ((recno =                /* Forward back reference */            else if ((recno =                /* Forward back reference */
4012                      find_named_parens(ptr, *brackets, name, namelen)) <= 0)                      find_parens(ptr, cd->bracount, name, namelen,
4013                          (options & PCRE_EXTENDED) != 0)) <= 0)
4014              {              {
4015              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
4016              goto FAILED;              goto FAILED;
4017              }              }
4018              }
4019    
4020            if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */          /* In both phases, we can now go to the code that handles numerical
4021            recursion or backreferences. */
           /* Back reference */  
4022    
4023            previous = code;          if (is_recurse) goto HANDLE_RECURSION;
4024            *code++ = OP_REF;            else goto HANDLE_REFERENCE;
           PUT2INC(code, 0, recno);  
           cd->backref_map |= (recno < 32)? (1 << recno) : 1;  
           if (recno > cd->top_backref) cd->top_backref = recno;  
           continue;  
           }  
4025    
         /* Should never happen */  
         break;  
4026    
4027          case 'R':                 /* Pattern recursion */          /* ------------------------------------------------------------ */
4028            case 'R':                 /* Recursion */
4029          ptr++;                    /* Same as (?0)      */          ptr++;                    /* Same as (?0)      */
4030          /* Fall through */          /* Fall through */
4031    
         /* Recursion or "subroutine" call */  
4032    
4033          case '0': case '1': case '2': case '3': case '4':          /* ------------------------------------------------------------ */
4034          case '5': case '6': case '7': case '8': case '9':          case '-': case '+':
4035            case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4036            case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4037            {            {
4038            const uschar *called;            const uschar *called;
4039    
4040              if ((refsign = *ptr) == '+') ptr++;
4041              else if (refsign == '-')
4042                {
4043                if ((digitab[ptr[1]] & ctype_digit) == 0)
4044                  goto OTHER_CHAR_AFTER_QUERY;
4045                ptr++;
4046                }
4047    
4048            recno = 0;            recno = 0;
4049            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4050              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4051    
4052              if (*ptr != ')')
4053                {
4054                *errorcodeptr = ERR29;
4055                goto FAILED;
4056                }
4057    
4058              if (refsign == '-')
4059                {
4060                if (recno == 0)
4061                  {
4062                  *errorcodeptr = ERR58;
4063                  goto FAILED;
4064                  }
4065                recno = cd->bracount - recno + 1;
4066                if (recno <= 0)
4067                  {
4068                  *errorcodeptr = ERR15;
4069                  goto FAILED;
4070                  }
4071                }
4072              else if (refsign == '+')
4073                {
4074                if (recno == 0)
4075                  {
4076                  *errorcodeptr = ERR58;
4077                  goto FAILED;
4078                  }
4079                recno += cd->bracount;
4080                }
4081    
4082            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
4083    
4084            HANDLE_RECURSION:            HANDLE_RECURSION:
4085    
4086            previous = code;            previous = code;
4087              called = cd->start_code;
4088    
4089            /* Find the bracket that is being referenced. Temporarily end the            /* When we are actually compiling, find the bracket that is being
4090            regex in case it doesn't exist. */            referenced. Temporarily end the regex in case it doesn't exist before
4091              this point. If we end up with a forward reference, first check that
4092              the bracket does occur later so we can give the error (and position)
4093              now. Then remember this forward reference in the workspace so it can
4094              be filled in at the end. */
4095    
4096            *code = OP_END;            if (lengthptr == NULL)
           called = (recno == 0)? cd->start_code :  
             find_bracket(cd->start_code, utf8, recno);  
           if (called == NULL)  
4097              {              {
4098              *errorcodeptr = ERR15;              *code = OP_END;
4099              goto FAILED;              if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
             }  
4100    
4101            /* If the subpattern is still open, this is a recursive call. We              /* Forward reference */
           check to see if this is a left recursion that could loop for ever,  
           and diagnose that case. */  
4102    
4103            if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))              if (called == NULL)
4104              {                {
4105              *errorcodeptr = ERR40;                if (find_parens(ptr, cd->bracount, NULL, recno,
4106              goto FAILED;                     (options & PCRE_EXTENDED) != 0) < 0)
4107                    {
4108                    *errorcodeptr = ERR15;
4109                    goto FAILED;
4110                    }
4111                  called = cd->start_code + recno;
4112                  PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4113                  }
4114    
4115                /* If not a forward reference, and the subpattern is still open,
4116                this is a recursive call. We check to see if this is a left
4117                recursion that could loop for ever, and diagnose that case. */
4118    
4119                else if (GET(called, 1) == 0 &&
4120                         could_be_empty(called, code, bcptr, utf8))
4121                  {
4122                  *errorcodeptr = ERR40;
4123                  goto FAILED;
4124                  }
4125              }              }
4126    
4127            /* Insert the recursion/subroutine item, automatically wrapped inside            /* Insert the recursion/subroutine item, automatically wrapped inside
4128            "once" brackets. */            "once" brackets. Set up a "previous group" length so that a
4129              subsequent quantifier will work. */
4130    
4131            *code = OP_ONCE;            *code = OP_ONCE;
4132            PUT(code, 1, 2 + 2*LINK_SIZE);            PUT(code, 1, 2 + 2*LINK_SIZE);
# Line 3174  for (;; ptr++) Line 4139  for (;; ptr++)
4139            *code = OP_KET;            *code = OP_KET;
4140            PUT(code, 1, 2 + 2*LINK_SIZE);            PUT(code, 1, 2 + 2*LINK_SIZE);
4141            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
4142    
4143              length_prevgroup = 3 + 3*LINK_SIZE;
4144            }            }
4145    
4146            /* Can't determine a first byte now */
4147    
4148            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4149          continue;          continue;
4150    
         /* Character after (? not specially recognized */  
4151    
4152          default:                  /* Option setting */          /* ------------------------------------------------------------ */
4153            default:              /* Other characters: check option setting */
4154            OTHER_CHAR_AFTER_QUERY:
4155          set = unset = 0;          set = unset = 0;
4156          optset = &set;          optset = &set;
4157    
# Line 3189  for (;; ptr++) Line 4161  for (;; ptr++)
4161              {              {
4162              case '-': optset = &unset; break;              case '-': optset = &unset; break;
4163    
4164                case 'J':    /* Record that it changed in the external options */
4165                *optset |= PCRE_DUPNAMES;
4166                cd->external_options |= PCRE_JCHANGED;
4167                break;
4168    
4169              case 'i': *optset |= PCRE_CASELESS; break;              case 'i': *optset |= PCRE_CASELESS; break;
             case 'J': *optset |= PCRE_DUPNAMES; break;  
4170              case 'm': *optset |= PCRE_MULTILINE; break;              case 'm': *optset |= PCRE_MULTILINE; break;
4171              case 's': *optset |= PCRE_DOTALL; break;              case 's': *optset |= PCRE_DOTALL; break;
4172              case 'x': *optset |= PCRE_EXTENDED; break;              case 'x': *optset |= PCRE_EXTENDED; break;
4173              case 'U': *optset |= PCRE_UNGREEDY; break;              case 'U': *optset |= PCRE_UNGREEDY; break;
4174              case 'X': *optset |= PCRE_EXTRA; break;              case 'X': *optset |= PCRE_EXTRA; break;
4175    
4176                default:  *errorcodeptr = ERR12;
4177                          ptr--;    /* Correct the offset */
4178                          goto FAILED;
4179              }              }
4180            }            }
4181    
# Line 3204  for (;; ptr++) Line 4184  for (;; ptr++)
4184          newoptions = (options | set) & (~unset);          newoptions = (options | set) & (~unset);
4185    
4186          /* If the options ended with ')' this is not the start of a nested          /* If the options ended with ')' this is not the start of a nested
4187          group with option changes, so the options change at this level. Compile          group with option changes, so the options change at this level. If this
4188          code to change the ims options if this setting actually changes any of          item is right at the start of the pattern, the options can be
4189          them. We also pass the new setting back so that it can be put at the          abstracted and made external in the pre-compile phase, and ignored in
4190          start of any following branches, and when this group ends (if we are in          the compile phase. This can be helpful when matching -- for instance in
4191          a group), a resetting item can be compiled.          caseless checking of required bytes.
4192    
4193          Note that if this item is right at the start of the pattern, the          If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4194          options will have been abstracted and made global, so there will be no          definitely *not* at the start of the pattern because something has been
4195          change to compile. */          compiled. In the pre-compile phase, however, the code pointer can have
4196            that value after the start, because it gets reset as code is discarded
4197            during the pre-compile. However, this can happen only at top level - if
4198            we are within parentheses, the starting BRA will still be present. At
4199            any parenthesis level, the length value can be used to test if anything
4200            has been compiled at that level. Thus, a test for both these conditions
4201            is necessary to ensure we correctly detect the start of the pattern in
4202            both phases.
4203    
4204            If we are not at the pattern start, compile code to change the ims
4205            options if this setting actually changes any of them. We also pass the
4206            new setting back so that it can be put at the start of any following
4207            branches, and when this group ends (if we are in a group), a resetting
4208            item can be compiled. */
4209    
4210          if (*ptr == ')')          if (*ptr == ')')
4211            {            {
4212            if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))            if (code == cd->start_code + 1 + LINK_SIZE &&
4213                   (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4214              {              {
4215              *code++ = OP_OPT;              cd->external_options = newoptions;
4216              *code++ = newoptions & PCRE_IMS;              options = newoptions;
4217              }              }
4218             else
4219                {
4220                if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4221                  {
4222                  *code++ = OP_OPT;
4223                  *code++ = newoptions & PCRE_IMS;
4224                  }
4225    
4226            /* Change options at this level, and pass them back for use              /* Change options at this level, and pass them back for use
4227            in subsequent branches. Reset the greedy defaults and the case              in subsequent branches. Reset the greedy defaults and the case
4228            value for firstbyte and reqbyte. */              value for firstbyte and reqbyte. */
4229    
4230            *optionsptr = options = newoptions;              *optionsptr = options = newoptions;
4231            greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4232            greedy_non_default = greedy_default ^ 1;              greedy_non_default = greedy_default ^ 1;
4233            req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;              req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4234                }
4235    
4236            previous = NULL;       /* This item can't be repeated */            previous = NULL;       /* This item can't be repeated */
4237            continue;              /* It is complete */            continue;              /* It is complete */
# Line 3242  for (;; ptr++) Line 4244  for (;; ptr++)
4244    
4245          bravalue = OP_BRA;          bravalue = OP_BRA;
4246          ptr++;          ptr++;
4247          }          }     /* End of switch for character following (? */
4248        }        }       /* End of (? handling */
4249    
4250      /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become      /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4251      non-capturing and behave like (?:...) brackets */      all unadorned brackets become non-capturing and behave like (?:...)
4252        brackets. */
4253    
4254      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4255        {        {
4256        bravalue = OP_BRA;        bravalue = OP_BRA;
4257        }        }
4258    
4259      /* Else we have a referencing group; adjust the opcode. If the bracket      /* Else we have a capturing group. */
     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and  
     arrange for the true number to follow later, in an OP_BRANUMBER item. */  
4260    
4261      else      else
4262        {        {
4263        NUMBERED_GROUP:        NUMBERED_GROUP:
4264        if (++(*brackets) > EXTRACT_BASIC_MAX)        cd->bracount += 1;
4265          {        PUT2(code, 1+LINK_SIZE, cd->bracount);
4266          bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;        skipbytes = 2;
         code[1+LINK_SIZE] = OP_BRANUMBER;  
         PUT2(code, 2+LINK_SIZE, *brackets);  
         skipbytes = 3;  
         }  
       else bravalue = OP_BRA + *brackets;  
4267        }        }
4268    
4269      /* Process nested bracketed re. Assertions may not be repeated, but other      /* Process nested bracketed regex. Assertions may not be repeated, but
4270      kinds can be. We copy code into a non-register variable in order to be able      other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4271      to pass its address because some compilers complain otherwise. Pass in a      non-register variable in order to be able to pass its address because some
4272      new setting for the ims options if they have changed. */      compilers complain otherwise. Pass in a new setting for the ims options if
4273        they have changed. */
4274    
4275      previous = (bravalue >= OP_ONCE)? code : NULL;      previous = (bravalue >= OP_ONCE)? code : NULL;
4276      *code = bravalue;      *code = bravalue;
4277      tempcode = code;      tempcode = code;
4278      tempreqvary = cd->req_varyopt;     /* Save value before bracket */      tempreqvary = cd->req_varyopt;     /* Save value before bracket */
4279        length_prevgroup = 0;              /* Initialize for pre-compile phase */
4280    
4281      if (!compile_regex(      if (!compile_regex(
4282           newoptions,                   /* The complete new option state */           newoptions,                   /* The complete new option state */
4283           options & PCRE_IMS,           /* The previous ims option state */           options & PCRE_IMS,           /* The previous ims option state */
          brackets,                     /* Extracting bracket count */  
4284           &tempcode,                    /* Where to put code (updated) */           &tempcode,                    /* Where to put code (updated) */
4285           &ptr,                         /* Input pointer (updated) */           &ptr,                         /* Input pointer (updated) */
4286           errorcodeptr,                 /* Where to put an error message */           errorcodeptr,                 /* Where to put an error message */
4287           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
4288            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4289           skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */           skipbytes,                    /* Skip over bracket number */
4290           &subfirstbyte,                /* For possible first char */           &subfirstbyte,                /* For possible first char */
4291           &subreqbyte,                  /* For possible last char */           &subreqbyte,                  /* For possible last char */
4292           bcptr,                        /* Current branch chain */           bcptr,                        /* Current branch chain */
4293           cd))                          /* Tables block */           cd,                           /* Tables block */
4294             (lengthptr == NULL)? NULL :   /* Actual compile phase */
4295               &length_prevgroup           /* Pre-compile phase */
4296             ))
4297        goto FAILED;        goto FAILED;
4298    
4299      /* At the end of compiling, code is still pointing to the start of the      /* At the end of compiling, code is still pointing to the start of the
# Line 3302  for (;; ptr++) Line 4302  for (;; ptr++)
4302      is on the bracket. */      is on the bracket. */
4303    
4304      /* If this is a conditional bracket, check that there are no more than      /* If this is a conditional bracket, check that there are no more than
4305      two branches in the group. */      two branches in the group, or just one if it's a DEFINE group. */
4306    
4307      else if (bravalue == OP_COND)      if (bravalue == OP_COND)
4308        {        {
4309        uschar *tc = code;        uschar *tc = code;
4310        int condcount = 0;        int condcount = 0;
# Line 3315  for (;; ptr++) Line 4315  for (;; ptr++)
4315           }           }
4316        while (*tc != OP_KET);        while (*tc != OP_KET);
4317    
4318        if (condcount > 2)        /* A DEFINE group is never obeyed inline (the "condition" is always
4319          false). It must have only one branch. */
4320    
4321          if (code[LINK_SIZE+1] == OP_DEF)
4322          {          {
4323          *errorcodeptr = ERR27;          if (condcount > 1)
4324          goto FAILED;            {
4325              *errorcodeptr = ERR54;
4326              goto FAILED;
4327              }
4328            bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
4329            }
4330    
4331          /* A "normal" conditional group. If there is just one branch, we must not
4332          make use of its firstbyte or reqbyte, because this is equivalent to an
4333          empty second branch. */
4334    
4335          else
4336            {
4337            if (condcount > 2)
4338              {
4339              *errorcodeptr = ERR27;
4340              goto FAILED;
4341              }
4342            if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4343          }          }
4344          }
4345    
4346        /* Error if hit end of pattern */
4347    
4348        /* If there is just one branch, we must not make use of its firstbyte or      if (*ptr != ')')
4349        reqbyte, because this is equivalent to an empty second branch. */        {
4350          *errorcodeptr = ERR14;
4351          goto FAILED;
4352          }
4353    
4354        if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;      /* In the pre-compile phase, update the length by the length of the nested
4355        group, less the brackets at either end. Then reduce the compiled code to
4356        just the brackets so that it doesn't use much memory if it is duplicated by
4357        a quantifier. */
4358    
4359        if (lengthptr != NULL)
4360          {
4361          *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4362          code++;
4363          PUTINC(code, 0, 1 + LINK_SIZE);
4364          *code++ = OP_KET;
4365          PUTINC(code, 0, 1 + LINK_SIZE);
4366        }        }
4367    
4368      /* Handle updating of the required and first characters. Update for normal      /* Otherwise update the main code pointer to the end of the group. */
4369      brackets of all kinds, and conditions with two branches (see code above).  
4370      If the bracket is followed by a quantifier with zero repeat, we have to      else code = tempcode;
4371      back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the  
4372      main loop so that they can be accessed for the back off. */      /* For a DEFINE group, required and first character settings are not
4373        relevant. */
4374    
4375        if (bravalue == OP_DEF) break;
4376    
4377        /* Handle updating of the required and first characters for other types of
4378        group. Update for normal brackets of all kinds, and conditions with two
4379        branches (see code above). If the bracket is followed by a quantifier with
4380        zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4381        zerofirstbyte outside the main loop so that they can be accessed for the
4382        back off. */
4383    
4384      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
4385      zerofirstbyte = firstbyte;      zerofirstbyte = firstbyte;
4386      groupsetfirstbyte = FALSE;      groupsetfirstbyte = FALSE;
4387    
4388      if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)      if (bravalue >= OP_ONCE)
4389        {        {
4390        /* If we have not yet set a firstbyte in this branch, take it from the        /* If we have not yet set a firstbyte in this branch, take it from the
4391        subpattern, remembering that it was set here so that a repeat of more        subpattern, remembering that it was set here so that a repeat of more
# Line 3378  for (;; ptr++) Line 4426  for (;; ptr++)
4426      firstbyte, looking for an asserted first char. */      firstbyte, looking for an asserted first char. */
4427    
4428      else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;      else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4429        break;     /* End of processing '(' */
4430    
     /* Now update the main code pointer to the end of the group. */  
   
     code = tempcode;  
   
     /* Error if hit end of pattern */  
   
     if (*ptr != ')')  
       {  
       *errorcodeptr = ERR14;  
       goto FAILED;  
       }  
     break;  
   
     /* Check \ for being a real metacharacter; if not, fall through and handle  
     it as a data character at the start of a string. Escape items are checked  
     for validity in the pre-compiling pass. */  
   
     case '\\':  
     tempptr = ptr;  
     c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);  
4431    
4432      /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values      /* ===================================================================*/
4433        /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4434      are arranged to be the negation of the corresponding OP_values. For the      are arranged to be the negation of the corresponding OP_values. For the
4435      back references, the values are ESC_REF plus the reference number. Only      back references, the values are ESC_REF plus the reference number. Only
4436      back references and those types that consume a character may be repeated.      back references and those types that consume a character may be repeated.
4437      We can test for values between ESC_b and ESC_Z for the latter; this may      We can test for values between ESC_b and ESC_Z for the latter; this may
4438      have to change if any new ones are ever created. */      have to change if any new ones are ever created. */
4439    
4440        case '\\':
4441        tempptr = ptr;
4442        c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4443        if (*errorcodeptr != 0) goto FAILED;
4444    
4445      if (c < 0)      if (c < 0)
4446        {        {
4447        if (-c == ESC_Q)            /* Handle start of quoted string */        if (-c == ESC_Q)            /* Handle start of quoted string */
# Line 3416  for (;; ptr++) Line 4451  for (;; ptr++)
4451          continue;          continue;
4452          }          }
4453    
4454          if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
4455    
4456        /* For metasequences that actually match a character, we disable the        /* For metasequences that actually match a character, we disable the
4457        setting of a first character if it hasn't already been set. */        setting of a first character if it hasn't already been set. */
4458    
# Line 3427  for (;; ptr++) Line 4464  for (;; ptr++)
4464        zerofirstbyte = firstbyte;        zerofirstbyte = firstbyte;
4465        zeroreqbyte = reqbyte;        zeroreqbyte = reqbyte;
4466    
4467        /* Back references are handled specially */        /* \k<name> or \k'name' is a back reference by name (Perl syntax) */
4468    
4469          if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\''))
4470            {
4471            is_recurse = FALSE;
4472            terminator = (*(++ptr) == '<')? '>' : '\'';
4473            goto NAMED_REF_OR_RECURSE;
4474            }
4475    
4476          /* Back references are handled specially; must disable firstbyte if
4477          not set to cope with cases like (?=(\w+))\1: which would otherwise set
4478          ':' later. */
4479    
4480        if (-c >= ESC_REF)        if (-c >= ESC_REF)
4481          {          {
4482          int number = -c - ESC_REF;          recno = -c - ESC_REF;
4483    
4484            HANDLE_REFERENCE:    /* Come here from named backref handling */
4485            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4486          previous = code;          previous = code;
4487          *code++ = OP_REF;          *code++ = OP_REF;
4488          PUT2INC(code, 0, number);          PUT2INC(code, 0, recno);
4489            cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4490            if (recno > cd->top_backref) cd->top_backref = recno;
4491          }          }
4492    
4493        /* So are Unicode property matches, if supported. We know that get_ucp        /* So are Unicode property matches, if supported. */
       won't fail because it was tested in the pre-pass. */  
4494    
4495  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4496        else if (-c == ESC_P || -c == ESC_p)        else if (-c == ESC_P || -c == ESC_p)
# Line 3446  for (;; ptr++) Line 4498  for (;; ptr++)
4498          BOOL negated;          BOOL negated;
4499          int pdata;          int pdata;
4500          int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);          int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4501            if (ptype < 0) goto FAILED;
4502          previous = code;          previous = code;
4503          *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;          *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4504          *code++ = ptype;          *code++ = ptype;
4505          *code++ = pdata;          *code++ = pdata;
4506          }          }
4507    #else
4508    
4509          /* If Unicode properties are not supported, \X, \P, and \p are not
4510          allowed. */
4511    
4512          else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4513            {
4514            *errorcodeptr = ERR45;
4515            goto FAILED;
4516            }
4517  #endif  #endif
4518    
4519        /* For the rest, we can obtain the OP value by negating the escape        /* For the rest (including \X when Unicode properties are supported), we
4520        value */        can obtain the OP value by negating the escape value. */
4521    
4522        else        else
4523          {          {
# Line 3478  for (;; ptr++) Line 4541  for (;; ptr++)
4541       mcbuffer[0] = c;       mcbuffer[0] = c;
4542       mclength = 1;       mclength = 1;
4543       }       }
   
4544      goto ONE_CHAR;      goto ONE_CHAR;
4545    
4546    
4547        /* ===================================================================*/
4548      /* Handle a literal character. It is guaranteed not to be whitespace or #      /* Handle a literal character. It is guaranteed not to be whitespace or #
4549      when the extended flag is set. If we are in UTF-8 mode, it may be a      when the extended flag is set. If we are in UTF-8 mode, it may be a
4550      multi-byte literal character. */      multi-byte literal character. */
# Line 3491  for (;; ptr++) Line 4555  for (;; ptr++)
4555      mcbuffer[0] = c;      mcbuffer[0] = c;
4556    
4557  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
4558      if (utf8 && (c & 0xc0) == 0xc0)      if (utf8 && c >= 0xc0)
4559        {        {
4560        while ((ptr[1] & 0xc0) == 0x80)        while ((ptr[1] & 0xc0) == 0x80)
4561          mcbuffer[mclength++] = *(++ptr);          mcbuffer[mclength++] = *(++ptr);
# Line 3542  for (;; ptr++) Line 4606  for (;; ptr++)
4606      }      }
4607    }                   /* end of big loop */    }                   /* end of big loop */
4608    
4609    
4610  /* Control never reaches here by falling through, only by a goto for all the  /* Control never reaches here by falling through, only by a goto for all the
4611  error states. Pass back the position in the pattern so that it can be displayed  error states. Pass back the position in the pattern so that it can be displayed
4612  to the user for diagnosing the error. */  to the user for diagnosing the error. */
# Line 3558  return FALSE; Line 4623  return FALSE;
4623  *     Compile sequence of alternatives           *  *     Compile sequence of alternatives           *
4624  *************************************************/  *************************************************/
4625    
4626  /* On entry, ptr is pointing past the bracket character, but on return  /* On entry, ptr is pointing past the bracket character, but on return it
4627  it points to the closing bracket, or vertical bar, or end of string.  points to the closing bracket, or vertical bar, or end of string. The code
4628  The code variable is pointing at the byte into which the BRA operator has been  variable is pointing at the byte into which the BRA operator has been stored.
4629  stored. If the ims options are changed at the start (for a (?ims: group) or  If the ims options are changed at the start (for a (?ims: group) or during any
4630  during any branch, we need to insert an OP_OPT item at the start of every  branch, we need to insert an OP_OPT item at the start of every following branch
4631  following branch to ensure they get set correctly at run time, and also pass  to ensure they get set correctly at run time, and also pass the new options
4632  the new options into every subsequent branch compile.  into every subsequent branch compile.
4633    
4634    This function is used during the pre-compile phase when we are trying to find
4635    out the amount of memory needed, as well as during the real compile phase. The
4636    value of lengthptr distinguishes the two phases.
4637    
4638  Argument:  Argument:
4639    options        option bits, including any changes for this subpattern    options        option bits, including any changes for this subpattern
4640    oldims         previous settings of ims option bits    oldims         previous settings of ims option bits
   brackets       -> int containing the number of extracting brackets used  
4641    codeptr        -> the address of the current code pointer    codeptr        -> the address of the current code pointer
4642    ptrptr         -> the address of the current pattern pointer    ptrptr         -> the address of the current pattern pointer
4643    errorcodeptr   -> pointer to error code variable    errorcodeptr   -> pointer to error code variable
4644    lookbehind     TRUE if this is a lookbehind assertion    lookbehind     TRUE if this is a lookbehind assertion
4645    skipbytes      skip this many bytes at start (for OP_COND, OP_BRANUMBER)    skipbytes      skip this many bytes at start (for brackets and OP_COND)
4646    firstbyteptr   place to put the first required character, or a negative number    firstbyteptr   place to put the first required character, or a negative number
4647    reqbyteptr     place to put the last required character, or a negative number    reqbyteptr     place to put the last required character, or a negative number
4648    bcptr          pointer to the chain of currently open branches    bcptr          pointer to the chain of currently open branches
4649    cd             points to the data block with tables pointers etc.    cd             points to the data block with tables pointers etc.
4650      lengthptr      NULL during the real compile phase
4651                     points to length accumulator during pre-compile phase
4652    
4653  Returns:      TRUE on success  Returns:         TRUE on success
4654  */  */
4655    
4656  static BOOL  static BOOL
4657  compile_regex(int options, int oldims, int *brackets, uschar **codeptr,  compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
4658    const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,    int *errorcodeptr, BOOL lookbehind, int skipbytes, int *firstbyteptr,
4659    int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)    int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr)
4660  {  {
4661  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
4662  uschar *code = *codeptr;  uschar *code = *codeptr;
# Line 3595  uschar *start_bracket = code; Line 4665  uschar *start_bracket = code;
4665  uschar *reverse_count = NULL;  uschar *reverse_count = NULL;
4666  int firstbyte, reqbyte;  int firstbyte, reqbyte;
4667  int branchfirstbyte, branchreqbyte;  int branchfirstbyte, branchreqbyte;
4668    int length;
4669  branch_chain bc;  branch_chain bc;
4670    
4671  bc.outer = bcptr;  bc.outer = bcptr;
# Line 3602  bc.current = code; Line 4673  bc.current = code;
4673    
4674  firstbyte = reqbyte = REQ_UNSET;  firstbyte = reqbyte = REQ_UNSET;
4675    
4676    /* Accumulate the length for use in the pre-compile phase. Start with the
4677    length of the BRA and KET and any extra bytes that are required at the
4678    beginning. We accumulate in a local variable to save frequent testing of
4679    lengthptr for NULL. We cannot do this by looking at the value of code at the
4680    start and end of each alternative, because compiled items are discarded during
4681    the pre-compile phase so that the work space is not exceeded. */
4682    
4683    length = 2 + 2*LINK_SIZE + skipbytes;
4684    
4685    /* WARNING: If the above line is changed for any reason, you must also change
4686    the code that abstracts option settings at the start of the pattern and makes
4687    them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
4688    pre-compile phase to find out whether anything has yet been compiled or not. */
4689    
4690  /* Offset is set zero to mark that this bracket is still open */  /* Offset is set zero to mark that this bracket is still open */
4691    
4692  PUT(code, 1, 0);  PUT(code, 1, 0);
# Line 3617  for (;;) Line 4702  for (;;)
4702      {      {
4703      *code++ = OP_OPT;      *code++ = OP_OPT;
4704      *code++ = options & PCRE_IMS;      *code++ = options & PCRE_IMS;
4705        length += 2;
4706      }      }
4707    
4708    /* Set up dummy OP_REVERSE if lookbehind assertion */    /* Set up dummy OP_REVERSE if lookbehind assertion */
# Line 3626  for (;;) Line 4712  for (;;)
4712      *code++ = OP_REVERSE;      *code++ = OP_REVERSE;
4713      reverse_count = code;      reverse_count = code;
4714      PUTINC(code, 0, 0);      PUTINC(code, 0, 0);
4715        length += 1 + LINK_SIZE;
4716      }      }
4717    
4718    /* Now compile the branch */    /* Now compile the branch; in the pre-compile phase its length gets added
4719      into the length. */
4720    
4721    if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr,    if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
4722          &branchfirstbyte, &branchreqbyte, &bc, cd))          &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
4723      {      {
4724      *ptrptr = ptr;      *ptrptr = ptr;
4725      return FALSE;      return FALSE;
4726      }      }
4727    
4728    /* If this is the first branch, the firstbyte and reqbyte values for the    /* In the real compile phase, there is some post-processing to be done. */
   branch become the values for the regex. */  
4729    
4730    if (*last_branch != OP_ALT)    if (lengthptr == NULL)
4731      {      {
4732      firstbyte = branchfirstbyte;      /* If this is the first branch, the firstbyte and reqbyte values for the
4733      reqbyte = branchreqbyte;      branch become the values for the regex. */
     }  
4734    
4735    /* If this is not the first branch, the first char and reqbyte have to      if (*last_branch != OP_ALT)
4736    match the values from all the previous branches, except that if the previous        {
4737    value for reqbyte didn't have REQ_VARY set, it can still match, and we set        firstbyte = branchfirstbyte;
4738    REQ_VARY for the regex. */        reqbyte = branchreqbyte;
4739          }
4740    
4741    else      /* If this is not the first branch, the first char and reqbyte have to
4742      {      match the values from all the previous branches, except that if the
4743      /* If we previously had a firstbyte, but it doesn't match the new branch,      previous value for reqbyte didn't have REQ_VARY set, it can still match,
4744      we have to abandon the firstbyte for the regex, but if there was previously      and we set REQ_VARY for the regex. */
     no reqbyte, it takes on the value of the old firstbyte. */  
4745    
4746      if (firstbyte >= 0 && firstbyte != branchfirstbyte)      else
4747        {        {
4748        if (reqbyte < 0) reqbyte = firstbyte;        /* If we previously had a firstbyte, but it doesn't match the new branch,
4749        firstbyte = REQ_NONE;        we have to abandon the firstbyte for the regex, but if there was
4750        }        previously no reqbyte, it takes on the value of the old firstbyte. */
4751    
4752          if (firstbyte >= 0 && firstbyte != branchfirstbyte)
4753            {
4754            if (reqbyte < 0) reqbyte = firstbyte;
4755            firstbyte = REQ_NONE;
4756            }
4757    
4758      /* If we (now or from before) have no firstbyte, a firstbyte from the        /* If we (now or from before) have no firstbyte, a firstbyte from the
4759      branch becomes a reqbyte if there isn't a branch reqbyte. */        branch becomes a reqbyte if there isn't a branch reqbyte. */
4760    
4761      if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)        if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
4762          branchreqbyte = branchfirstbyte;            branchreqbyte = branchfirstbyte;
4763    
4764      /* Now ensure that the reqbytes match */        /* Now ensure that the reqbytes match */
4765    
4766      if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))        if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
4767        reqbyte = REQ_NONE;          reqbyte = REQ_NONE;
4768      else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */        else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
4769      }        }
4770    
4771    /* If lookbehind, check that this branch matches a fixed-length string,      /* If lookbehind, check that this branch matches a fixed-length string, and
4772    and put the length into the OP_REVERSE item. Temporarily mark the end of      put the length into the OP_REVERSE item. Temporarily mark the end of the
4773    the branch with OP_END. */      branch with OP_END. */
4774    
4775    if (lookbehind)      if (lookbehind)
     {  
     int length;  
     *code = OP_END;  
     length = find_fixedlength(last_branch, options);  
     DPRINTF(("fixed length = %d\n", length));  
     if (length < 0)  
4776        {        {
4777        *errorcodeptr = (length == -2)? ERR36 : ERR25;        int fixed_length;
4778        *ptrptr = ptr;        *code = OP_END;
4779        return FALSE;        fixed_length = find_fixedlength(last_branch, options);
4780          DPRINTF(("fixed length = %d\n", fixed_length));
4781          if (fixed_length < 0)
4782            {
4783            *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
4784            *ptrptr = ptr;
4785            return FALSE;
4786            }
4787          PUT(reverse_count, 0, fixed_length);
4788        }        }
     PUT(reverse_count, 0, length);  
4789      }      }
4790    
4791    /* Reached end of expression, either ')' or end of pattern. Go back through    /* Reached end of expression, either ')' or end of pattern. Go back through
# Line 3706  for (;;) Line 4799  for (;;)
4799    
4800    if (*ptr != '|')    if (*ptr != '|')
4801      {      {
4802      int length = code - last_branch;      int branch_length = code - last_branch;
4803      do      do
4804        {        {
4805        int prev_length = GET(last_branch, 1);        int prev_length = GET(last_branch, 1);
4806        PUT(last_branch, 1, length);        PUT(last_branch, 1, branch_length);
4807        length = prev_length;        branch_length = prev_length;
4808        last_branch -= length;        last_branch -= branch_length;
4809        }        }
4810      while (length > 0);      while (branch_length > 0);
4811    
4812      /* Fill in the ket */      /* Fill in the ket */
4813    
# Line 3728  for (;;) Line 4821  for (;;)
4821        {        {
4822        *code++ = OP_OPT;        *code++ = OP_OPT;
4823        *code++ = oldims;        *code++ = oldims;
4824          length += 2;
4825        }        }
4826    
4827      /* Set values to pass back */      /* Set values to pass back */
# Line 3736  for (;;) Line 4830  for (;;)
4830      *ptrptr = ptr;      *ptrptr = ptr;
4831      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
4832      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
4833        if (lengthptr != NULL) *lengthptr += length;
4834      return TRUE;      return TRUE;
4835      }      }
4836    
# Line 3749  for (;;) Line 4844  for (;;)
4844    bc.current = last_branch = code;    bc.current = last_branch = code;
4845    code += 1 + LINK_SIZE;    code += 1 + LINK_SIZE;
4846    ptr++;    ptr++;
4847      length += 1 + LINK_SIZE;
4848    }    }
4849  /* Control never reaches here */  /* Control never reaches here */
4850  }  }
# Line 3799  is_anchored(register const uschar *code, Line 4895  is_anchored(register const uschar *code,
4895    unsigned int backref_map)    unsigned int backref_map)
4896  {  {
4897  do {  do {
4898     const uschar *scode =     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
4899       first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);       options, PCRE_MULTILINE, FALSE);
4900     register int op = *scode;     register int op = *scode;
4901    
4902       /* Non-capturing brackets */
4903    
4904       if (op == OP_BRA)
4905         {
4906         if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4907         }
4908    
4909     /* Capturing brackets */     /* Capturing brackets */
4910    
4911     if (op > OP_BRA)     else if (op == OP_CBRA)
4912       {       {
4913       int new_map;       int n = GET2(scode, 1+LINK_SIZE);
4914       op -= OP_BRA;       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
      if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);  
      new_map = bracket_map | ((op < 32)? (1 << op) : 1);  
4915       if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;       if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
4916       }       }
4917    
4918     /* Other brackets */     /* Other brackets */
4919    
4920     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4921       {       {
4922       if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;       if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4923       }       }
# Line 3824  do { Line 4925  do {
4925     /* .* is not anchored unless DOTALL is set and it isn't in brackets that     /* .* is not anchored unless DOTALL is set and it isn't in brackets that
4926     are or may be referenced. */     are or may be referenced. */
4927    
4928     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
4929                 op == OP_TYPEPOSSTAR) &&
4930              (*options & PCRE_DOTALL) != 0)              (*options & PCRE_DOTALL) != 0)
4931       {       {
4932       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
# Line 3869  is_startline(const uschar *code, unsigne Line 4971  is_startline(const uschar *code, unsigne
4971    unsigned int backref_map)    unsigned int backref_map)
4972  {  {
4973  do {  do {
4974     const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
4975       FALSE);       NULL, 0, FALSE);
4976     register int op = *scode;     register int op = *scode;
4977    
4978       /* Non-capturing brackets */
4979    
4980       if (op == OP_BRA)
4981         {
4982         if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
4983         }
4984    
4985     /* Capturing brackets */     /* Capturing brackets */
4986    
4987     if (op > OP_BRA)     else if (op == OP_CBRA)
4988       {       {
4989       int new_map;       int n = GET2(scode, 1+LINK_SIZE);
4990       op -= OP_BRA;       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
      if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);  
      new_map = bracket_map | ((op < 32)? (1 << op) : 1);  
4991       if (!is_startline(scode, new_map, backref_map)) return FALSE;       if (!is_startline(scode, new_map, backref_map)) return FALSE;
4992       }       }
4993    
4994     /* Other brackets */     /* Other brackets */
4995    
4996     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4997       { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }       { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
4998    
4999     /* .* means "start at start or after \n" if it isn't in brackets that     /* .* means "start at start or after \n" if it isn't in brackets that
5000     may be referenced. */     may be referenced. */
5001    
5002     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5003       {       {
5004       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5005       }       }
# Line 3941  do { Line 5048  do {
5048       first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);       first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5049     register int op = *scode;     register int op = *scode;
5050    
    if (op >= OP_BRA) op = OP_BRA;  
   
5051     switch(op)     switch(op)
5052       {       {
5053       default:       default:
5054       return -1;       return -1;
5055    
5056       case OP_BRA:       case OP_BRA:
5057         case OP_CBRA:
5058       case OP_ASSERT:       case OP_ASSERT:
5059       case OP_ONCE:       case OP_ONCE:
5060       case OP_COND:       case OP_COND:
# Line 3964  do { Line 5070  do {
5070       case OP_CHARNC:       case OP_CHARNC:
5071       case OP_PLUS:       case OP_PLUS:
5072       case OP_MINPLUS:       case OP_MINPLUS:
5073         case OP_POSPLUS:
5074       if (!inassert) return -1;       if (!inassert) return -1;
5075       if (c < 0)       if (c < 0)
5076         {         {
# Line 4004  Returns:        pointer to compiled data Line 5111  Returns:        pointer to compiled data
5111                  with errorptr and erroroffset set                  with errorptr and erroroffset set
5112  */  */
5113    
5114  PCRE_DATA_SCOPE pcre *  PCRE_EXP_DEFN pcre *
5115  pcre_compile(const char *pattern, int options, const char **errorptr,  pcre_compile(const char *pattern, int options, const char **errorptr,
5116    int *erroroffset, const unsigned char *tables)    int *erroroffset, const unsigned char *tables)
5117  {  {
# Line 4012  return pcre_compile2(pattern, options, N Line 5119  return pcre_compile2(pattern, options, N
5119  }  }
5120    
5121    
5122    PCRE_EXP_DEFN pcre *
 PCRE_DATA_SCOPE pcre *  
5123  pcre_compile2(const char *pattern, int options, int *errorcodeptr,  pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5124    const char **errorptr, int *erroroffset, const unsigned char *tables)    const char **errorptr, int *erroroffset, const unsigned char *tables)
5125  {  {
5126  real_pcre *re;  real_pcre *re;
5127  int length = 1 + LINK_SIZE;      /* For initial BRA plus length */  int length = 1;  /* For final END opcode */
5128  int c, firstbyte, reqbyte, newline;  int firstbyte, reqbyte, newline;
 int bracount = 0;  
 int branch_extra = 0;  
 int branch_newextra;  
 int item_count = -1;  
 int name_count = 0;  
 int max_name_size = 0;  
 int lastitemlength = 0;  
5129  int errorcode = 0;  int errorcode = 0;
5130  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
5131  BOOL utf8;  BOOL utf8;
 BOOL class_utf8;  
5132  #endif  #endif
 BOOL inescq = FALSE;  
 BOOL capturing;  
 unsigned int brastackptr = 0;  
5133  size_t size;  size_t size;
5134  uschar *code;  uschar *code;
5135  const uschar *codestart;  const uschar *codestart;
5136  const uschar *ptr;  const uschar *ptr;
5137  compile_data compile_block;  compile_data compile_block;
5138  compile_data *cd = &compile_block;  compile_data *cd = &compile_block;
5139  int brastack[BRASTACK_SIZE];  
5140  uschar bralenstack[BRASTACK_SIZE];  /* This space is used for "compiling" into during the first phase, when we are
5141    computing the amount of memory that is needed. Compiled items are thrown away
5142    as soon as possible, so that a fairly large buffer should be sufficient for
5143    this purpose. The same space is used in the second phase for remembering where
5144    to fill in forward references to subpatterns. */
5145    
5146    uschar cworkspace[COMPILE_WORK_SIZE];
5147    
5148    
5149    /* Set this early so that early errors get offset 0. */
5150    
5151    ptr = (const uschar *)pattern;
5152    
5153  /* We can't pass back an error message if errorptr is NULL; I guess the best we  /* We can't pass back an error message if errorptr is NULL; I guess the best we
5154  can do is just return NULL, but we can set a code value if there is a code  can do is just return NULL, but we can set a code value if there is a code
# Line 4062  if (errorcodeptr != NULL) *errorcodeptr Line 5168  if (errorcodeptr != NULL) *errorcodeptr
5168  if (erroroffset == NULL)  if (erroroffset == NULL)
5169    {    {
5170    errorcode = ERR16;    errorcode = ERR16;
5171    goto PCRE_EARLY_ERROR_RETURN;    goto PCRE_EARLY_ERROR_RETURN2;
5172    }    }
5173    
5174  *erroroffset = 0;  *erroroffset = 0;
# Line 4075  if (utf8 && (options & PCRE_NO_UTF8_CHEC Line 5181  if (utf8 && (options & PCRE_NO_UTF8_CHEC
5181       (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)       (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5182    {    {
5183    errorcode = ERR44;    errorcode = ERR44;
5184    goto PCRE_EARLY_ERROR_RETURN;    goto PCRE_EARLY_ERROR_RETURN2;
5185    }    }
5186  #else  #else
5187  if ((options & PCRE_UTF8) != 0)  if ((options & PCRE_UTF8) != 0)
# Line 4099  cd->fcc = tables + fcc_offset; Line 5205  cd->fcc = tables + fcc_offset;
5205  cd->cbits = tables + cbits_offset;  cd->cbits = tables + cbits_offset;
5206  cd->ctypes = tables + ctypes_offset;  cd->ctypes = tables + ctypes_offset;
5207    
5208  /* Handle different types of newline. The two bits give four cases. The current  /* Handle different types of newline. The three bits give seven cases. The
5209  code allows for one- or two-byte sequences. */  current code allows for fixed one- or two-byte sequences, plus "any" and
5210    "anycrlf". */
5211    
5212  switch (options & PCRE_NEWLINE_CRLF)  switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5213    {    {
5214    default:              newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
5215    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = '\r'; break;
5216    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = '\n'; break;
5217    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
5218         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5219      case PCRE_NEWLINE_ANY: newline = -1; break;
5220      case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5221      default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5222    }    }
5223    
5224  if (newline > 255)  if (newline == -2)
5225      {
5226      cd->nltype = NLTYPE_ANYCRLF;
5227      }
5228    else if (newline < 0)
5229    {    {
5230    cd->nllen = 2;    cd->nltype = NLTYPE_ANY;
   cd->nl[0] = (newline >> 8) & 255;  
   cd->nl[1] = newline & 255;  
5231    }    }
5232  else  else
5233    {    {
5234    cd->nllen = 1;    cd->nltype = NLTYPE_FIXED;
5235    cd->nl[0] = newline;    if (newline > 255)
5236        {
5237        cd->nllen = 2;
5238        cd->nl[0] = (newline >> 8) & 255;
5239        cd->nl[1] = newline & 255;
5240        }
5241      else
5242        {
5243        cd->nllen = 1;
5244        cd->nl[0] = newline;
5245        }
5246    }    }