/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 87 by nigel, Sat Feb 24 21:41:21 2007 UTC revision 166 by ph10, Wed May 9 14:48:28 2007 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2006 University of Cambridge             Copyright (c) 1997-2007 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45    #define NLBLOCK cd             /* Block containing newline information */
46    #define PSSTART start_pattern  /* Field containing processed string start */
47    #define PSEND   end_pattern    /* Field containing processed string end */
48    
49    
50  #include "pcre_internal.h"  #include "pcre_internal.h"
51    
52    
# Line 53  used by pcretest. DEBUG is not defined w Line 58  used by pcretest. DEBUG is not defined w
58  #endif  #endif
59    
60    
   
61  /*************************************************  /*************************************************
62  *      Code parameters and static tables         *  *      Code parameters and static tables         *
63  *************************************************/  *************************************************/
64    
65  /* Maximum number of items on the nested bracket stacks at compile time. This  /* This value specifies the size of stack workspace that is used during the
66  applies to the nesting of all kinds of parentheses. It does not limit  first pre-compile phase that determines how much memory is required. The regex
67  un-nested, non-capturing parentheses. This number can be made bigger if  is partly compiled into this space, but the compiled parts are discarded as
68  necessary - it is used to dimension one int and one unsigned char vector at  soon as they can be, so that hopefully there will never be an overrun. The code
69  compile time. */  does, however, check for an overrun. The largest amount I've seen used is 218,
70    so this number is very generous.
71    
72    The same workspace is used during the second, actual compile phase for
73    remembering forward references to groups so that they can be filled in at the
74    end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
75    is 4 there is plenty of room. */
76    
77  #define BRASTACK_SIZE 200  #define COMPILE_WORK_SIZE (4096)
78    
79    
80  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 72  are simple data values; negative values Line 82  are simple data values; negative values
82  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
83  is invalid. */  is invalid. */
84    
85  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
86  static const short int escapes[] = {  static const short int escapes[] = {
87       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
88       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
89     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
90       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
91  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
92  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
93     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
94       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */       0,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
95  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */
96       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
97  };  };
98    
99  #else         /* This is the "abnormal" table for EBCDIC systems */  #else           /* This is the "abnormal" table for EBCDIC systems */
100  static const short int escapes[] = {  static const short int escapes[] = {
101  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
102  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 97  static const short int escapes[] = { Line 107  static const short int escapes[] = {
107  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
108  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
109  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,
110  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
111  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
112  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,
113  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
# Line 106  static const short int escapes[] = { Line 116  static const short int escapes[] = {
116  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
117  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,
118  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,
119  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
120  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,
121  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
122  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
# Line 155  static const int posix_class_maps[] = { Line 165  static const int posix_class_maps[] = {
165  };  };
166    
167    
168    #define STRING(a)  # a
169    #define XSTRING(s) STRING(s)
170    
171  /* The texts of compile-time error messages. These are "char *" because they  /* The texts of compile-time error messages. These are "char *" because they
172  are passed to the outside world. */  are passed to the outside world. Do not ever re-use any error number, because
173    they are documented. Always add a new error instead. Messages marked DEAD below
174    are no longer used. */
175    
176  static const char *error_texts[] = {  static const char *error_texts[] = {
177    "no error",    "no error",
# Line 171  static const char *error_texts[] = { Line 186  static const char *error_texts[] = {
186    "range out of order in character class",    "range out of order in character class",
187    "nothing to repeat",    "nothing to repeat",
188    /* 10 */    /* 10 */
189    "operand of unlimited repeat could match the empty string",    "operand of unlimited repeat could match the empty string",  /** DEAD **/
190    "internal error: unexpected repeat",    "internal error: unexpected repeat",
191    "unrecognized character after (?",    "unrecognized character after (?",
192    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class",
# Line 181  static const char *error_texts[] = { Line 196  static const char *error_texts[] = {
196    "erroffset passed as NULL",    "erroffset passed as NULL",
197    "unknown option bit(s) set",    "unknown option bit(s) set",
198    "missing ) after comment",    "missing ) after comment",
199    "parentheses nested too deeply",    "parentheses nested too deeply",  /** DEAD **/
200    /* 20 */    /* 20 */
201    "regular expression too large",    "regular expression too large",
202    "failed to get memory",    "failed to get memory",
# Line 190  static const char *error_texts[] = { Line 205  static const char *error_texts[] = {
205    "unrecognized character after (?<",    "unrecognized character after (?<",
206    /* 25 */    /* 25 */
207    "lookbehind assertion is not fixed length",    "lookbehind assertion is not fixed length",
208    "malformed number after (?(",    "malformed number or name after (?(",
209    "conditional group contains more than two branches",    "conditional group contains more than two branches",
210    "assertion expected after (?(",    "assertion expected after (?(",
211    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )",
212    /* 30 */    /* 30 */
213    "unknown POSIX class name",    "unknown POSIX class name",
214    "POSIX collating elements are not supported",    "POSIX collating elements are not supported",
215    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support",
216    "spare error",    "spare error",  /** DEAD **/
217    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large",
218    /* 35 */    /* 35 */
219    "invalid condition (?(0)",    "invalid condition (?(0)",
# Line 209  static const char *error_texts[] = { Line 224  static const char *error_texts[] = {
224    /* 40 */    /* 40 */
225    "recursive call could loop indefinitely",    "recursive call could loop indefinitely",
226    "unrecognized character after (?P",    "unrecognized character after (?P",
227    "syntax error after (?P",    "syntax error in subpattern name (missing terminator)",
228    "two named groups have the same name",    "two named subpatterns have the same name",
229    "invalid UTF-8 string",    "invalid UTF-8 string",
230    /* 45 */    /* 45 */
231    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled",
232    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence",
233    "unknown property name after \\P or \\p"    "unknown property name after \\P or \\p",
234      "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
235      "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
236      /* 50 */
237      "repeated subpattern is too long",
238      "octal value is greater than \\377 (not in UTF-8 mode)",
239      "internal error: overran compiling workspace",
240      "internal error: previously-checked referenced subpattern not found",
241      "DEFINE group contains more than one branch",
242      /* 55 */
243      "repeating a DEFINE group is not allowed",
244      "inconsistent NEWLINE options",
245      "\\g is not followed by an (optionally braced) non-zero number",
246      "(?+ or (?- must be followed by a non-zero number"
247  };  };
248    
249    
# Line 235  For convenience, we use the same bit def Line 263  For convenience, we use the same bit def
263    
264  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
265    
266  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
267  static const unsigned char digitab[] =  static const unsigned char digitab[] =
268    {    {
269    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 271  static const unsigned char digitab[] = Line 299  static const unsigned char digitab[] =
299    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
300    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
301    
302  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else           /* This is the "abnormal" case, for EBCDIC systems */
303  static const unsigned char digitab[] =  static const unsigned char digitab[] =
304    {    {
305    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 285  static const unsigned char digitab[] = Line 313  static const unsigned char digitab[] =
313    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
314    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
315    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
316    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
317    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
318    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
319    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 319  static const unsigned char ebcdic_charta Line 347  static const unsigned char ebcdic_charta
347    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
348    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
349    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
350    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
351    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
352    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
353    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 346  static const unsigned char ebcdic_charta Line 374  static const unsigned char ebcdic_charta
374  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
375    
376  static BOOL  static BOOL
377    compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *,
378      int *, int *, branch_chain *, compile_data *);      int *, branch_chain *, compile_data *, int *);
379    
380    
381    
# Line 357  static BOOL Line 385  static BOOL
385    
386  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
387  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
388  encodes one of the more complicated things such as \d. When UTF-8 is enabled,  encodes one of the more complicated things such as \d. A backreference to group
389  a positive value greater than 255 may be returned. On entry, ptr is pointing at  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
390  the \. On exit, it is on the final character of the escape sequence.  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
391    ptr is pointing at the \. On exit, it is on the final character of the escape
392    sequence.
393    
394  Arguments:  Arguments:
395    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
# Line 392  if (c == 0) *errorcodeptr = ERR1; Line 422  if (c == 0) *errorcodeptr = ERR1;
422  a table. A non-zero result is something that can be returned immediately.  a table. A non-zero result is something that can be returned immediately.
423  Otherwise further processing may be required. */  Otherwise further processing may be required. */
424    
425  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
426  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < '0' || c > 'z') {}                           /* Not alphameric */
427  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - '0']) != 0) c = i;
428    
429  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
430  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
431  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
432  #endif  #endif
# Line 406  else if ((i = escapes[c - 0x48]) != 0) Line 436  else if ((i = escapes[c - 0x48]) != 0)
436  else  else
437    {    {
438    const uschar *oldptr;    const uschar *oldptr;
439      BOOL braced, negated;
440    
441    switch (c)    switch (c)
442      {      {
443      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
# Line 419  else Line 451  else
451      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
452      break;      break;
453    
454        /* \g must be followed by a number, either plain or braced. If positive, it
455        is an absolute backreference. If negative, it is a relative backreference.
456        This is a Perl 5.10 feature. */
457    
458        case 'g':
459        if (ptr[1] == '{')
460          {
461          braced = TRUE;
462          ptr++;
463          }
464        else braced = FALSE;
465    
466        if (ptr[1] == '-')
467          {
468          negated = TRUE;
469          ptr++;
470          }
471        else negated = FALSE;
472    
473        c = 0;
474        while ((digitab[ptr[1]] & ctype_digit) != 0)
475          c = c * 10 + *(++ptr) - '0';
476    
477        if (c == 0 || (braced && *(++ptr) != '}'))
478          {
479          *errorcodeptr = ERR57;
480          return 0;
481          }
482    
483        if (negated)
484          {
485          if (c > bracount)
486            {
487            *errorcodeptr = ERR15;
488            return 0;
489            }
490          c = bracount - (c - 1);
491          }
492    
493        c = -(ESC_REF + c);
494        break;
495    
496      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
497      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
498      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 460  else Line 534  else
534        }        }
535    
536      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
537      larger first octal digit. */      larger first octal digit. The original code used just to take the least
538        significant 8 bits of octal numbers (I think this is what early Perls used
539        to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
540        than 3 octal digits. */
541    
542      case '0':      case '0':
543      c -= '0';      c -= '0';
544      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
545          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
546      c &= 255;     /* Take least significant 8 bits */      if (!utf8 && c > 255) *errorcodeptr = ERR51;
547      break;      break;
548    
549      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \x is complicated. \x{ddd} is a character number which can be greater
# Line 486  else Line 563  else
563          if (c == 0 && cc == '0') continue;     /* Leading zeroes */          if (c == 0 && cc == '0') continue;     /* Leading zeroes */
564          count++;          count++;
565    
566  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
567          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= 'a') cc -= 32;               /* Convert to upper case */
568          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
569  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
570          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
571          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
572  #endif  #endif
# Line 513  else Line 590  else
590        {        {
591        int cc;                               /* Some compilers don't like ++ */        int cc;                               /* Some compilers don't like ++ */
592        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                        /* in initializers */
593  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
594        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
595        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
596  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
597        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= 'z') cc += 64;              /* Convert to upper case */
598        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
599  #endif  #endif
600        }        }
601      break;      break;
602    
603      /* Other special escapes not starting with a digit are straightforward */      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
604        This coding is ASCII-specific, but then the whole concept of \cx is
605        ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
606    
607      case 'c':      case 'c':
608      c = *(++ptr);      c = *(++ptr);
# Line 533  else Line 612  else
612        return 0;        return 0;
613        }        }
614    
615      /* A letter is upper-cased; then the 0x40 bit is flipped. This coding  #ifndef EBCDIC  /* ASCII coding */
     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.  
     (However, an EBCDIC equivalent has now been added.) */  
   
 #if !EBCDIC    /* ASCII coding */  
616      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= 'a' && c <= 'z') c -= 32;
617      c ^= 0x40;      c ^= 0x40;
618  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
619      if (c >= 'a' && c <= 'z') c += 64;      if (c >= 'a' && c <= 'z') c += 64;
620      c ^= 0xC0;      c ^= 0xC0;
621  #endif  #endif
# Line 763  return p; Line 838  return p;
838    
839    
840  /*************************************************  /*************************************************
841    *       Find forward referenced subpattern       *
842    *************************************************/
843    
844    /* This function scans along a pattern's text looking for capturing
845    subpatterns, and counting them. If it finds a named pattern that matches the
846    name it is given, it returns its number. Alternatively, if the name is NULL, it
847    returns when it reaches a given numbered subpattern. This is used for forward
848    references to subpatterns. We know that if (?P< is encountered, the name will
849    be terminated by '>' because that is checked in the first pass.
850    
851    Arguments:
852      ptr          current position in the pattern
853      count        current count of capturing parens so far encountered
854      name         name to seek, or NULL if seeking a numbered subpattern
855      lorn         name length, or subpattern number if name is NULL
856      xmode        TRUE if we are in /x mode
857    
858    Returns:       the number of the named subpattern, or -1 if not found
859    */
860    
861    static int
862    find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
863      BOOL xmode)
864    {
865    const uschar *thisname;
866    
867    for (; *ptr != 0; ptr++)
868      {
869      int term;
870    
871      /* Skip over backslashed characters and also entire \Q...\E */
872    
873      if (*ptr == '\\')
874        {
875        if (*(++ptr) == 0) return -1;
876        if (*ptr == 'Q') for (;;)
877          {
878          while (*(++ptr) != 0 && *ptr != '\\');
879          if (*ptr == 0) return -1;
880          if (*(++ptr) == 'E') break;
881          }
882        continue;
883        }
884    
885      /* Skip over character classes */
886    
887      if (*ptr == '[')
888        {
889        while (*(++ptr) != ']')
890          {
891          if (*ptr == '\\')
892            {
893            if (*(++ptr) == 0) return -1;
894            if (*ptr == 'Q') for (;;)
895              {
896              while (*(++ptr) != 0 && *ptr != '\\');
897              if (*ptr == 0) return -1;
898              if (*(++ptr) == 'E') break;
899              }
900            continue;
901            }
902          }
903        continue;
904        }
905    
906      /* Skip comments in /x mode */
907    
908      if (xmode && *ptr == '#')
909        {
910        while (*(++ptr) != 0 && *ptr != '\n');
911        if (*ptr == 0) return -1;
912        continue;
913        }
914    
915      /* An opening parens must now be a real metacharacter */
916    
917      if (*ptr != '(') continue;
918      if (ptr[1] != '?')
919        {
920        count++;
921        if (name == NULL && count == lorn) return count;
922        continue;
923        }
924    
925      ptr += 2;
926      if (*ptr == 'P') ptr++;                      /* Allow optional P */
927    
928      /* We have to disambiguate (?<! and (?<= from (?<name> */
929    
930      if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
931           *ptr != '\'')
932        continue;
933    
934      count++;
935    
936      if (name == NULL && count == lorn) return count;
937      term = *ptr++;
938      if (term == '<') term = '>';
939      thisname = ptr;
940      while (*ptr != term) ptr++;
941      if (name != NULL && lorn == ptr - thisname &&
942          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
943        return count;
944      }
945    
946    return -1;
947    }
948    
949    
950    
951    /*************************************************
952  *      Find first significant op code            *  *      Find first significant op code            *
953  *************************************************/  *************************************************/
954    
# Line 811  for (;;) Line 997  for (;;)
997    
998      case OP_CALLOUT:      case OP_CALLOUT:
999      case OP_CREF:      case OP_CREF:
1000      case OP_BRANUMBER:      case OP_RREF:
1001        case OP_DEF:
1002      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1003      break;      break;
1004    
# Line 856  for (;;) Line 1043  for (;;)
1043    {    {
1044    int d;    int d;
1045    register int op = *cc;    register int op = *cc;
   if (op >= OP_BRA) op = OP_BRA;  
1046    
1047    switch (op)    switch (op)
1048      {      {
1049        case OP_CBRA:
1050      case OP_BRA:      case OP_BRA:
1051      case OP_ONCE:      case OP_ONCE:
1052      case OP_COND:      case OP_COND:
1053      d = find_fixedlength(cc, options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1054      if (d < 0) return d;      if (d < 0) return d;
1055      branchlength += d;      branchlength += d;
1056      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 898  for (;;) Line 1085  for (;;)
1085      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1086    
1087      case OP_REVERSE:      case OP_REVERSE:
     case OP_BRANUMBER:  
1088      case OP_CREF:      case OP_CREF:
1089        case OP_RREF:
1090        case OP_DEF:
1091      case OP_OPT:      case OP_OPT:
1092      case OP_CALLOUT:      case OP_CALLOUT:
1093      case OP_SOD:      case OP_SOD:
# Line 917  for (;;) Line 1105  for (;;)
1105    
1106      case OP_CHAR:      case OP_CHAR:
1107      case OP_CHARNC:      case OP_CHARNC:
1108        case OP_NOT:
1109      branchlength++;      branchlength++;
1110      cc += 2;      cc += 2;
1111  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 1031  Returns:      pointer to the opcode for Line 1220  Returns:      pointer to the opcode for
1220  static const uschar *  static const uschar *
1221  find_bracket(const uschar *code, BOOL utf8, int number)  find_bracket(const uschar *code, BOOL utf8, int number)
1222  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1223  for (;;)  for (;;)
1224    {    {
1225    register int c = *code;    register int c = *code;
1226    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1227    else if (c > OP_BRA)  
1228      /* XCLASS is used for classes that cannot be represented just by a bit
1229      map. This includes negated single high-valued characters. The length in
1230      the table is zero; the actual length is stored in the compiled code. */
1231    
1232      if (c == OP_XCLASS) code += GET(code, 1);
1233    
1234      /* Handle capturing bracket */
1235    
1236      else if (c == OP_CBRA)
1237      {      {
1238      int n = c - OP_BRA;      int n = GET2(code, 1+LINK_SIZE);
     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  
1239      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1240      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[c];
1241      }      }
1242    
1243      /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1244      a multi-byte character. The length in the table is a minimum, so we have to
1245      arrange to skip the extra bytes. */
1246    
1247    else    else
1248      {      {
1249      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
1250  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1251      if (utf8) switch(c)      if (utf8) switch(c)
1252        {        {
1253        case OP_CHAR:        case OP_CHAR:
# Line 1064  for (;;) Line 1255  for (;;)
1255        case OP_EXACT:        case OP_EXACT:
1256        case OP_UPTO:        case OP_UPTO:
1257        case OP_MINUPTO:        case OP_MINUPTO:
1258          case OP_POSUPTO:
1259        case OP_STAR:        case OP_STAR:
1260        case OP_MINSTAR:        case OP_MINSTAR:
1261          case OP_POSSTAR:
1262        case OP_PLUS:        case OP_PLUS:
1263        case OP_MINPLUS:        case OP_MINPLUS:
1264          case OP_POSPLUS:
1265        case OP_QUERY:        case OP_QUERY:
1266        case OP_MINQUERY:        case OP_MINQUERY:
1267        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1268        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1269        break;        break;
1270        }        }
1271  #endif  #endif
# Line 1105  Returns:      pointer to the opcode for Line 1292  Returns:      pointer to the opcode for
1292  static const uschar *  static const uschar *
1293  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const uschar *code, BOOL utf8)
1294  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1295  for (;;)  for (;;)
1296    {    {
1297    register int c = *code;    register int c = *code;
1298    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1299    else if (c == OP_RECURSE) return code;    if (c == OP_RECURSE) return code;
1300    else if (c > OP_BRA)  
1301      {    /* XCLASS is used for classes that cannot be represented just by a bit
1302      code += _pcre_OP_lengths[OP_BRA];    map. This includes negated single high-valued characters. The length in
1303      }    the table is zero; the actual length is stored in the compiled code. */
1304    
1305      if (c == OP_XCLASS) code += GET(code, 1);
1306    
1307      /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1308      that are followed by a character may be followed by a multi-byte character.
1309      The length in the table is a minimum, so we have to arrange to skip the extra
1310      bytes. */
1311    
1312    else    else
1313      {      {
1314      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
1315  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1316      if (utf8) switch(c)      if (utf8) switch(c)
1317        {        {
1318        case OP_CHAR:        case OP_CHAR:
# Line 1136  for (;;) Line 1320  for (;;)
1320        case OP_EXACT:        case OP_EXACT:
1321        case OP_UPTO:        case OP_UPTO:
1322        case OP_MINUPTO:        case OP_MINUPTO:
1323          case OP_POSUPTO:
1324        case OP_STAR:        case OP_STAR:
1325        case OP_MINSTAR:        case OP_MINSTAR:
1326          case OP_POSSTAR:
1327        case OP_PLUS:        case OP_PLUS:
1328        case OP_MINPLUS:        case OP_MINPLUS:
1329          case OP_POSPLUS:
1330        case OP_QUERY:        case OP_QUERY:
1331        case OP_MINQUERY:        case OP_MINQUERY:
1332        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1333        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1334        break;        break;
1335        }        }
1336  #endif  #endif
# Line 1165  for (;;) Line 1345  for (;;)
1345  *************************************************/  *************************************************/
1346    
1347  /* This function scans through a branch of a compiled pattern to see whether it  /* This function scans through a branch of a compiled pattern to see whether it
1348  can match the empty string or not. It is called only from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1349  below. Note that first_significant_code() skips over assertions. If we hit an  below and from compile_branch() when checking for an unlimited repeat of a
1350  unclosed bracket, we return "empty" - this means we've struck an inner bracket  group that can match nothing. Note that first_significant_code() skips over
1351  whose current branch will already have been scanned.  assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1352    struck an inner bracket whose current branch will already have been scanned.
1353    
1354  Arguments:  Arguments:
1355    code        points to start of search    code        points to start of search
# Line 1182  static BOOL Line 1363  static BOOL
1363  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1364  {  {
1365  register int c;  register int c;
1366  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1367       code < endcode;       code < endcode;
1368       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1369    {    {
# Line 1190  for (code = first_significant_code(code Line 1371  for (code = first_significant_code(code
1371    
1372    c = *code;    c = *code;
1373    
1374    if (c >= OP_BRA)    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1375      {      {
1376      BOOL empty_branch;      BOOL empty_branch;
1377      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1206  for (code = first_significant_code(code Line 1387  for (code = first_significant_code(code
1387        }        }
1388      while (*code == OP_ALT);      while (*code == OP_ALT);
1389      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
1390      code += 1 + LINK_SIZE;  
1391      c = *code;      /* Move past the KET and fudge things so that the increment in the "for"
1392        above has no effect. */
1393    
1394        c = OP_END;
1395        code += 1 + LINK_SIZE - _pcre_OP_lengths[c];
1396        continue;
1397      }      }
1398    
1399    else switch (c)    /* Handle the other opcodes */
1400    
1401      switch (c)
1402      {      {
1403      /* Check for quantifiers after a class */      /* Check for quantifiers after a class */
1404    
# Line 1266  for (code = first_significant_code(code Line 1454  for (code = first_significant_code(code
1454      case OP_NOT:      case OP_NOT:
1455      case OP_PLUS:      case OP_PLUS:
1456      case OP_MINPLUS:      case OP_MINPLUS:
1457        case OP_POSPLUS:
1458      case OP_EXACT:      case OP_EXACT:
1459      case OP_NOTPLUS:      case OP_NOTPLUS:
1460      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
1461        case OP_NOTPOSPLUS:
1462      case OP_NOTEXACT:      case OP_NOTEXACT:
1463      case OP_TYPEPLUS:      case OP_TYPEPLUS:
1464      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
1465        case OP_TYPEPOSPLUS:
1466      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1467      return FALSE;      return FALSE;
1468    
# Line 1283  for (code = first_significant_code(code Line 1474  for (code = first_significant_code(code
1474      case OP_ALT:      case OP_ALT:
1475      return TRUE;      return TRUE;
1476    
1477      /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1478      followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
1479    
1480  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1481      case OP_STAR:      case OP_STAR:
1482      case OP_MINSTAR:      case OP_MINSTAR:
1483        case OP_POSSTAR:
1484      case OP_QUERY:      case OP_QUERY:
1485      case OP_MINQUERY:      case OP_MINQUERY:
1486        case OP_POSQUERY:
1487      case OP_UPTO:      case OP_UPTO:
1488      case OP_MINUPTO:      case OP_MINUPTO:
1489        case OP_POSUPTO:
1490      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1491      break;      break;
1492  #endif  #endif
# Line 1410  earlier groups that are outside the curr Line 1604  earlier groups that are outside the curr
1604  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1605  it, after it has been compiled. This means that any OP_RECURSE items within it  it, after it has been compiled. This means that any OP_RECURSE items within it
1606  that refer to the group itself or any contained groups have to have their  that refer to the group itself or any contained groups have to have their
1607  offsets adjusted. That is the job of this function. Before it is called, the  offsets adjusted. That is one of the jobs of this function. Before it is called,
1608  partially compiled regex must be temporarily terminated with OP_END.  the partially compiled regex must be temporarily terminated with OP_END.
1609    
1610    This function has been extended with the possibility of forward references for
1611    recursions and subroutine calls. It must also check the list of such references
1612    for the group we are dealing with. If it finds that one of the recursions in
1613    the current group is on this list, it adjusts the offset in the list, not the
1614    value in the reference (which is a group number).
1615    
1616  Arguments:  Arguments:
1617    group      points to the start of the group    group      points to the start of the group
1618    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
1619    utf8       TRUE in UTF-8 mode    utf8       TRUE in UTF-8 mode
1620    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
1621      save_hwm   the hwm forward reference pointer at the start of the group
1622    
1623  Returns:     nothing  Returns:     nothing
1624  */  */
1625    
1626  static void  static void
1627  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1628      uschar *save_hwm)
1629  {  {
1630  uschar *ptr = group;  uschar *ptr = group;
1631  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1632    {    {
1633    int offset = GET(ptr, 1);    int offset;
1634    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    uschar *hc;
1635    
1636      /* See if this recursion is on the forward reference list. If so, adjust the
1637      reference. */
1638    
1639      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1640        {
1641        offset = GET(hc, 0);
1642        if (cd->start_code + offset == ptr + 1)
1643          {
1644          PUT(hc, 0, offset + adjust);
1645          break;
1646          }
1647        }
1648    
1649      /* Otherwise, adjust the recursion offset if it's after the start of this
1650      group. */
1651    
1652      if (hc >= cd->hwm)
1653        {
1654        offset = GET(ptr, 1);
1655        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1656        }
1657    
1658    ptr += 1 + LINK_SIZE;    ptr += 1 + LINK_SIZE;
1659    }    }
1660  }  }
# Line 1508  Yield:        TRUE when range returned; Line 1733  Yield:        TRUE when range returned;
1733  */  */
1734    
1735  static BOOL  static BOOL
1736  get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1737      unsigned int *odptr)
1738  {  {
1739  int c, othercase, next;  unsigned int c, othercase, next;
1740    
1741  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
1742    { if ((othercase = _pcre_ucp_othercase(c)) >= 0) break; }    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1743    
1744  if (c > d) return FALSE;  if (c > d) return FALSE;
1745    
# Line 1534  return TRUE; Line 1760  return TRUE;
1760  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1761    
1762    
1763    
1764    /*************************************************
1765    *     Check if auto-possessifying is possible    *
1766    *************************************************/
1767    
1768    /* This function is called for unlimited repeats of certain items, to see
1769    whether the next thing could possibly match the repeated item. If not, it makes
1770    sense to automatically possessify the repeated item.
1771    
1772    Arguments:
1773      op_code       the repeated op code
1774      item          data for this item, depends on the opcode
1775      utf8          TRUE in UTF-8 mode
1776      utf8_char     used for utf8 character bytes, NULL if not relevant
1777      ptr           next character in pattern
1778      options       options bits
1779      cd            contains pointers to tables etc.
1780    
1781    Returns:        TRUE if possessifying is wanted
1782    */
1783    
1784    static BOOL
1785    check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1786      const uschar *ptr, int options, compile_data *cd)
1787    {
1788    int next;
1789    
1790    /* Skip whitespace and comments in extended mode */
1791    
1792    if ((options & PCRE_EXTENDED) != 0)
1793      {
1794      for (;;)
1795        {
1796        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1797        if (*ptr == '#')
1798          {
1799          while (*(++ptr) != 0)
1800            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1801          }
1802        else break;
1803        }
1804      }
1805    
1806    /* If the next item is one that we can handle, get its value. A non-negative
1807    value is a character, a negative value is an escape value. */
1808    
1809    if (*ptr == '\\')
1810      {
1811      int temperrorcode = 0;
1812      next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1813      if (temperrorcode != 0) return FALSE;
1814      ptr++;    /* Point after the escape sequence */
1815      }
1816    
1817    else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1818      {
1819    #ifdef SUPPORT_UTF8
1820      if (utf8) { GETCHARINC(next, ptr); } else
1821    #endif
1822      next = *ptr++;
1823      }
1824    
1825    else return FALSE;
1826    
1827    /* Skip whitespace and comments in extended mode */
1828    
1829    if ((options & PCRE_EXTENDED) != 0)
1830      {
1831      for (;;)
1832        {
1833        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1834        if (*ptr == '#')
1835          {
1836          while (*(++ptr) != 0)
1837            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1838          }
1839        else break;
1840        }
1841      }
1842    
1843    /* If the next thing is itself optional, we have to give up. */
1844    
1845    if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1846      return FALSE;
1847    
1848    /* Now compare the next item with the previous opcode. If the previous is a
1849    positive single character match, "item" either contains the character or, if
1850    "item" is greater than 127 in utf8 mode, the character's bytes are in
1851    utf8_char. */
1852    
1853    
1854    /* Handle cases when the next item is a character. */
1855    
1856    if (next >= 0) switch(op_code)
1857      {
1858      case OP_CHAR:
1859    #ifdef SUPPORT_UTF8
1860      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1861    #endif
1862      return item != next;
1863    
1864      /* For CHARNC (caseless character) we must check the other case. If we have
1865      Unicode property support, we can use it to test the other case of
1866      high-valued characters. */
1867    
1868      case OP_CHARNC:
1869    #ifdef SUPPORT_UTF8
1870      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1871    #endif
1872      if (item == next) return FALSE;
1873    #ifdef SUPPORT_UTF8
1874      if (utf8)
1875        {
1876        unsigned int othercase;
1877        if (next < 128) othercase = cd->fcc[next]; else
1878    #ifdef SUPPORT_UCP
1879        othercase = _pcre_ucp_othercase((unsigned int)next);
1880    #else
1881        othercase = NOTACHAR;
1882    #endif
1883        return (unsigned int)item != othercase;
1884        }
1885      else
1886    #endif  /* SUPPORT_UTF8 */
1887      return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
1888    
1889      /* For OP_NOT, "item" must be a single-byte character. */
1890    
1891      case OP_NOT:
1892      if (next < 0) return FALSE;  /* Not a character */
1893      if (item == next) return TRUE;
1894      if ((options & PCRE_CASELESS) == 0) return FALSE;
1895    #ifdef SUPPORT_UTF8
1896      if (utf8)
1897        {
1898        unsigned int othercase;
1899        if (next < 128) othercase = cd->fcc[next]; else
1900    #ifdef SUPPORT_UCP
1901        othercase = _pcre_ucp_othercase(next);
1902    #else
1903        othercase = NOTACHAR;
1904    #endif
1905        return (unsigned int)item == othercase;
1906        }
1907      else
1908    #endif  /* SUPPORT_UTF8 */
1909      return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
1910    
1911      case OP_DIGIT:
1912      return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1913    
1914      case OP_NOT_DIGIT:
1915      return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1916    
1917      case OP_WHITESPACE:
1918      return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1919    
1920      case OP_NOT_WHITESPACE:
1921      return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1922    
1923      case OP_WORDCHAR:
1924      return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1925    
1926      case OP_NOT_WORDCHAR:
1927      return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1928    
1929      default:
1930      return FALSE;
1931      }
1932    
1933    
1934    /* Handle the case when the next item is \d, \s, etc. */
1935    
1936    switch(op_code)
1937      {
1938      case OP_CHAR:
1939      case OP_CHARNC:
1940    #ifdef SUPPORT_UTF8
1941      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1942    #endif
1943      switch(-next)
1944        {
1945        case ESC_d:
1946        return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
1947    
1948        case ESC_D:
1949        return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
1950    
1951        case ESC_s:
1952        return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
1953    
1954        case ESC_S:
1955        return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
1956    
1957        case ESC_w:
1958        return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
1959    
1960        case ESC_W:
1961        return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
1962    
1963        default:
1964        return FALSE;
1965        }
1966    
1967      case OP_DIGIT:
1968      return next == -ESC_D || next == -ESC_s || next == -ESC_W;
1969    
1970      case OP_NOT_DIGIT:
1971      return next == -ESC_d;
1972    
1973      case OP_WHITESPACE:
1974      return next == -ESC_S || next == -ESC_d || next == -ESC_w;
1975    
1976      case OP_NOT_WHITESPACE:
1977      return next == -ESC_s;
1978    
1979      case OP_WORDCHAR:
1980      return next == -ESC_W || next == -ESC_s;
1981    
1982      case OP_NOT_WORDCHAR:
1983      return next == -ESC_w || next == -ESC_d;
1984    
1985      default:
1986      return FALSE;
1987      }
1988    
1989    /* Control does not reach here */
1990    }
1991    
1992    
1993    
1994  /*************************************************  /*************************************************
1995  *           Compile one branch                   *  *           Compile one branch                   *
1996  *************************************************/  *************************************************/
1997    
1998  /* Scan the pattern, compiling it into the code vector. If the options are  /* Scan the pattern, compiling it into the code vector. If the options are
1999  changed during the branch, the pointer is used to change the external options  changed during the branch, the pointer is used to change the external options
2000  bits.  bits. This function is used during the pre-compile phase when we are trying
2001    to find out the amount of memory needed, as well as during the real compile
2002    phase. The value of lengthptr distinguishes the two phases.
2003    
2004  Arguments:  Arguments:
2005    optionsptr     pointer to the option bits    optionsptr     pointer to the option bits
   brackets       points to number of extracting brackets used  
2006    codeptr        points to the pointer to the current code point    codeptr        points to the pointer to the current code point
2007    ptrptr         points to the current pattern pointer    ptrptr         points to the current pattern pointer
2008    errorcodeptr   points to error code variable    errorcodeptr   points to error code variable
# Line 1552  Arguments: Line 2010  Arguments:
2010    reqbyteptr     set to the last literal character required, else < 0    reqbyteptr     set to the last literal character required, else < 0
2011    bcptr          points to current branch chain    bcptr          points to current branch chain
2012    cd             contains pointers to tables etc.    cd             contains pointers to tables etc.
2013      lengthptr      NULL during the real compile phase
2014                     points to length accumulator during pre-compile phase
2015    
2016  Returns:         TRUE on success  Returns:         TRUE on success
2017                   FALSE, with *errorcodeptr set non-zero on error                   FALSE, with *errorcodeptr set non-zero on error
2018  */  */
2019    
2020  static BOOL  static BOOL
2021  compile_branch(int *optionsptr, int *brackets, uschar **codeptr,  compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2022    const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,    int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2023    int *reqbyteptr, branch_chain *bcptr, compile_data *cd)    compile_data *cd, int *lengthptr)
2024  {  {
2025  int repeat_type, op_type;  int repeat_type, op_type;
2026  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
# Line 1569  int greedy_default, greedy_non_default; Line 2029  int greedy_default, greedy_non_default;
2029  int firstbyte, reqbyte;  int firstbyte, reqbyte;
2030  int zeroreqbyte, zerofirstbyte;  int zeroreqbyte, zerofirstbyte;
2031  int req_caseopt, reqvary, tempreqvary;  int req_caseopt, reqvary, tempreqvary;
 int condcount = 0;  
2032  int options = *optionsptr;  int options = *optionsptr;
2033  int after_manual_callout = 0;  int after_manual_callout = 0;
2034    int length_prevgroup = 0;
2035  register int c;  register int c;
2036  register uschar *code = *codeptr;  register uschar *code = *codeptr;
2037    uschar *last_code = code;
2038    uschar *orig_code = code;
2039  uschar *tempcode;  uschar *tempcode;
2040  BOOL inescq = FALSE;  BOOL inescq = FALSE;
2041  BOOL groupsetfirstbyte = FALSE;  BOOL groupsetfirstbyte = FALSE;
# Line 1581  const uschar *ptr = *ptrptr; Line 2043  const uschar *ptr = *ptrptr;
2043  const uschar *tempptr;  const uschar *tempptr;
2044  uschar *previous = NULL;  uschar *previous = NULL;
2045  uschar *previous_callout = NULL;  uschar *previous_callout = NULL;
2046    uschar *save_hwm = NULL;
2047  uschar classbits[32];  uschar classbits[32];
2048    
2049  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 1590  uschar *class_utf8data; Line 2053  uschar *class_utf8data;
2053  uschar utf8_char[6];  uschar utf8_char[6];
2054  #else  #else
2055  BOOL utf8 = FALSE;  BOOL utf8 = FALSE;
2056    uschar *utf8_char = NULL;
2057    #endif
2058    
2059    #ifdef DEBUG
2060    if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2061  #endif  #endif
2062    
2063  /* Set up the default and non-default settings for greediness */  /* Set up the default and non-default settings for greediness */
# Line 1623  for (;; ptr++) Line 2091  for (;; ptr++)
2091    BOOL negate_class;    BOOL negate_class;
2092    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2093    BOOL is_quantifier;    BOOL is_quantifier;
2094      BOOL is_recurse;
2095    int class_charcount;    int class_charcount;
2096    int class_lastchar;    int class_lastchar;
2097    int newoptions;    int newoptions;
# Line 1630  for (;; ptr++) Line 2099  for (;; ptr++)
2099    int skipbytes;    int skipbytes;
2100    int subreqbyte;    int subreqbyte;
2101    int subfirstbyte;    int subfirstbyte;
2102      int terminator;
2103    int mclength;    int mclength;
2104    uschar mcbuffer[8];    uschar mcbuffer[8];
2105    
2106    /* Next byte in the pattern */    /* Get next byte in the pattern */
2107    
2108    c = *ptr;    c = *ptr;
2109    
2110    /* If in \Q...\E, check for the end; if not, we have a literal */    /* If we are in the pre-compile phase, accumulate the length used for the
2111      previous cycle of this loop. */
2112    
2113    if (inescq && c != 0)    if (lengthptr != NULL)
2114      {      {
2115      if (c == '\\' && ptr[1] == 'E')  #ifdef DEBUG
2116        if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2117    #endif
2118        if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2119        {        {
2120        inescq = FALSE;        *errorcodeptr = ERR52;
2121        ptr++;        goto FAILED;
       continue;  
2122        }        }
2123      else  
2124        /* There is at least one situation where code goes backwards: this is the
2125        case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2126        the class is simply eliminated. However, it is created first, so we have to
2127        allow memory for it. Therefore, don't ever reduce the length at this point.
2128        */
2129    
2130        if (code < last_code) code = last_code;
2131        *lengthptr += code - last_code;
2132        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2133    
2134        /* If "previous" is set and it is not at the start of the work space, move
2135        it back to there, in order to avoid filling up the work space. Otherwise,
2136        if "previous" is NULL, reset the current code pointer to the start. */
2137    
2138        if (previous != NULL)
2139        {        {
2140        if (previous_callout != NULL)        if (previous > orig_code)
2141          {          {
2142          complete_callout(previous_callout, ptr, cd);          memmove(orig_code, previous, code - previous);
2143          previous_callout = NULL;          code -= previous - orig_code;
2144            previous = orig_code;
2145            }
2146          }
2147        else code = orig_code;
2148    
2149        /* Remember where this code item starts so we can pick up the length
2150        next time round. */
2151    
2152        last_code = code;
2153        }
2154    
2155      /* In the real compile phase, just check the workspace used by the forward
2156      reference list. */
2157    
2158      else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2159        {
2160        *errorcodeptr = ERR52;
2161        goto FAILED;
2162        }
2163    
2164      /* If in \Q...\E, check for the end; if not, we have a literal */
2165    
2166      if (inescq && c != 0)
2167        {
2168        if (c == '\\' && ptr[1] == 'E')
2169          {
2170          inescq = FALSE;
2171          ptr++;
2172          continue;
2173          }
2174        else
2175          {
2176          if (previous_callout != NULL)
2177            {
2178            if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2179              complete_callout(previous_callout, ptr, cd);
2180            previous_callout = NULL;
2181          }          }
2182        if ((options & PCRE_AUTO_CALLOUT) != 0)        if ((options & PCRE_AUTO_CALLOUT) != 0)
2183          {          {
# Line 1672  for (;; ptr++) Line 2197  for (;; ptr++)
2197    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2198         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
2199      {      {
2200      complete_callout(previous_callout, ptr, cd);      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2201          complete_callout(previous_callout, ptr, cd);
2202      previous_callout = NULL;      previous_callout = NULL;
2203      }      }
2204    
# Line 1683  for (;; ptr++) Line 2209  for (;; ptr++)
2209      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2210      if (c == '#')      if (c == '#')
2211        {        {
2212        /* The space before the ; is to avoid a warning on a silly compiler        while (*(++ptr) != 0)
2213        on the Macintosh. */          {
2214        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2215        if (c != 0) continue;   /* Else fall through to handle end of string */          }
2216          if (*ptr != 0) continue;
2217    
2218          /* Else fall through to handle end of string */
2219          c = 0;
2220        }        }
2221      }      }
2222    
# Line 1700  for (;; ptr++) Line 2230  for (;; ptr++)
2230    
2231    switch(c)    switch(c)
2232      {      {
2233      /* The branch terminates at end of string, |, or ). */      /* ===================================================================*/
2234        case 0:                        /* The branch terminates at string end */
2235      case 0:      case '|':                      /* or | or ) */
     case '|':  
2236      case ')':      case ')':
2237      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2238      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2239      *codeptr = code;      *codeptr = code;
2240      *ptrptr = ptr;      *ptrptr = ptr;
2241        if (lengthptr != NULL)
2242          {
2243          *lengthptr += code - last_code;   /* To include callout length */
2244          DPRINTF((">> end branch\n"));
2245          }
2246      return TRUE;      return TRUE;
2247    
2248    
2249        /* ===================================================================*/
2250      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2251      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2252    
# Line 1739  for (;; ptr++) Line 2275  for (;; ptr++)
2275      *code++ = OP_ANY;      *code++ = OP_ANY;
2276      break;      break;
2277    
2278    
2279        /* ===================================================================*/
2280      /* Character classes. If the included characters are all < 256, we build a      /* Character classes. If the included characters are all < 256, we build a
2281      32-byte bitmap of the permitted characters, except in the special case      32-byte bitmap of the permitted characters, except in the special case
2282      where there is only one such character. For negated classes, we build the      where there is only one such character. For negated classes, we build the
# Line 1777  for (;; ptr++) Line 2315  for (;; ptr++)
2315        }        }
2316    
2317      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2318      of just a single character (as long as it's < 256). For higher valued UTF-8      of just a single character (as long as it's < 256). However, for higher
2319      characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
2320    
2321      class_charcount = 0;      class_charcount = 0;
2322      class_lastchar = -1;      class_lastchar = -1;
2323    
2324        /* Initialize the 32-char bit map to all zeros. We build the map in a
2325        temporary bit of memory, in case the class contains only 1 character (less
2326        than 256), because in that case the compiled code doesn't use the bit map.
2327        */
2328    
2329        memset(classbits, 0, 32 * sizeof(uschar));
2330    
2331  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2332      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2333      class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2334  #endif  #endif
2335    
     /* Initialize the 32-char bit map to all zeros. We have to build the  
     map in a temporary bit of store, in case the class contains only 1  
     character (< 256), because in that case the compiled code doesn't use the  
     bit map. */  
   
     memset(classbits, 0, 32 * sizeof(uschar));  
   
2336      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
2337      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. At the start of the
2338      through the regex checked the overall syntax, so we don't need to be very      loop, c contains the first byte of the character. */
     strict here. At the start of the loop, c contains the first byte of the  
     character. */  
2339    
2340      do      if (c != 0) do
2341        {        {
2342          const uschar *oldptr;
2343    
2344  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2345        if (utf8 && c > 127)        if (utf8 && c > 127)
2346          {                           /* Braces are required because the */          {                           /* Braces are required because the */
# Line 1814  for (;; ptr++) Line 2352  for (;; ptr++)
2352    
2353        if (inescq)        if (inescq)
2354          {          {
2355          if (c == '\\' && ptr[1] == 'E')          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2356            {            {
2357            inescq = FALSE;            inescq = FALSE;                   /* Reset literal state */
2358            ptr++;            ptr++;                            /* Skip the 'E' */
2359            continue;            continue;                         /* Carry on with next */
2360            }            }
2361          else goto LONE_SINGLE_CHARACTER;          goto CHECK_RANGE;                   /* Could be range if \E follows */
2362          }          }
2363    
2364        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
# Line 1911  for (;; ptr++) Line 2449  for (;; ptr++)
2449          }          }
2450    
2451        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
2452        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. The sequence \b is a special
2453        validity in the pre-compiling pass. The sequence \b is a special case.        case. Inside a class (and only there) it is treated as backspace.
2454        Inside a class (and only there) it is treated as backspace. Elsewhere        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2455        it marks a word boundary. Other escapes have preset maps ready to        to or into the one we are building. We assume they have more than one
       or into the one we are building. We assume they have more than one  
2456        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2457    
2458        if (c == '\\')        if (c == '\\')
2459          {          {
2460          c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2461            if (*errorcodeptr != 0) goto FAILED;
2462    
2463          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2464          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2465            else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2466          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
2467            {            {
2468            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == '\\' && ptr[2] == 'E')
# Line 1938  for (;; ptr++) Line 2477  for (;; ptr++)
2477            {            {
2478            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
2479            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
2480            switch (-c)  
2481              /* Save time by not doing this in the pre-compile phase. */
2482    
2483              if (lengthptr == NULL) switch (-c)
2484              {              {
2485              case ESC_d:              case ESC_d:
2486              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
# Line 1966  for (;; ptr++) Line 2508  for (;; ptr++)
2508              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2509              continue;              continue;
2510    
2511                case ESC_E: /* Perl ignores an orphan \E */
2512                continue;
2513    
2514                default:    /* Not recognized; fall through */
2515                break;      /* Need "default" setting to stop compiler warning. */
2516                }
2517    
2518              /* In the pre-compile phase, just do the recognition. */
2519    
2520              else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2521                       c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2522    
2523              /* We need to deal with \P and \p in both phases. */
2524    
2525  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2526              case ESC_p:            if (-c == ESC_p || -c == ESC_P)
2527              case ESC_P:              {
2528                {              BOOL negated;
2529                BOOL negated;              int pdata;
2530                int pdata;              int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2531                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);              if (ptype < 0) goto FAILED;
2532                if (ptype < 0) goto FAILED;              class_utf8 = TRUE;
2533                class_utf8 = TRUE;              *class_utf8data++ = ((-c == ESC_p) != negated)?
2534                *class_utf8data++ = ((-c == ESC_p) != negated)?                XCL_PROP : XCL_NOTPROP;
2535                  XCL_PROP : XCL_NOTPROP;              *class_utf8data++ = ptype;
2536                *class_utf8data++ = ptype;              *class_utf8data++ = pdata;
2537                *class_utf8data++ = pdata;              class_charcount -= 2;   /* Not a < 256 character */
               class_charcount -= 2;   /* Not a < 256 character */  
               }  
2538              continue;              continue;
2539                }
2540  #endif  #endif
2541              /* Unrecognized escapes are faulted if PCRE is running in its
2542              strict mode. By default, for compatibility with Perl, they are
2543              treated as literals. */
2544    
2545              /* Unrecognized escapes are faulted if PCRE is running in its            if ((options & PCRE_EXTRA) != 0)
2546              strict mode. By default, for compatibility with Perl, they are              {
2547              treated as literals. */              *errorcodeptr = ERR7;
2548                goto FAILED;
             default:  
             if ((options & PCRE_EXTRA) != 0)  
               {  
               *errorcodeptr = ERR7;  
               goto FAILED;  
               }  
             c = *ptr;              /* The final character */  
             class_charcount -= 2;  /* Undo the default count from above */  
2549              }              }
2550    
2551              class_charcount -= 2;  /* Undo the default count from above */
2552              c = *ptr;              /* Get the final character and fall through */
2553            }            }
2554    
2555          /* Fall through if we have a single character (c >= 0). This may be          /* Fall through if we have a single character (c >= 0). This may be
2556          > 256 in UTF-8 mode. */          greater than 256 in UTF-8 mode. */
2557    
2558          }   /* End of backslash handling */          }   /* End of backslash handling */
2559    
2560        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
2561        Perl does not permit ']' to be the end of the range. A '-' character        Perl does not permit ']' to be the end of the range. A '-' character
2562        here is treated as a literal. */        at the end is treated as a literal. Perl ignores orphaned \E sequences
2563          entirely. The code for handling \Q and \E is messy. */
2564    
2565          CHECK_RANGE:
2566          while (ptr[1] == '\\' && ptr[2] == 'E')
2567            {
2568            inescq = FALSE;
2569            ptr += 2;
2570            }
2571    
2572          oldptr = ptr;
2573    
2574        if (ptr[1] == '-' && ptr[2] != ']')        if (!inescq && ptr[1] == '-')
2575          {          {
2576          int d;          int d;
2577          ptr += 2;          ptr += 2;
2578            while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2579    
2580            /* If we hit \Q (not followed by \E) at this point, go into escaped
2581            mode. */
2582    
2583            while (*ptr == '\\' && ptr[1] == 'Q')
2584              {
2585              ptr += 2;
2586              if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2587              inescq = TRUE;
2588              break;
2589              }
2590    
2591            if (*ptr == 0 || (!inescq && *ptr == ']'))
2592              {
2593              ptr = oldptr;
2594              goto LONE_SINGLE_CHARACTER;
2595              }
2596    
2597  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2598          if (utf8)          if (utf8)
# Line 2026  for (;; ptr++) Line 2607  for (;; ptr++)
2607          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2608          in such circumstances. */          in such circumstances. */
2609    
2610          if (d == '\\')          if (!inescq && d == '\\')
2611            {            {
2612            const uschar *oldptr = ptr;            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2613            d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);            if (*errorcodeptr != 0) goto FAILED;
2614    
2615            /* \b is backspace; \X is literal X; any other            /* \b is backspace; \X is literal X; \R is literal R; any other
2616            was literal */            special means the '-' was literal */
2617    
2618            if (d < 0)            if (d < 0)
2619              {              {
2620              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = '\b';
2621              else if (d == -ESC_X) d = 'X'; else              else if (d == -ESC_X) d = 'X';
2622                else if (d == -ESC_R) d = 'R'; else
2623                {                {
2624                ptr = oldptr - 2;                ptr = oldptr;
2625                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2626                }                }
2627              }              }
2628            }            }
2629    
2630          /* The check that the two values are in the correct order happens in          /* Check that the two values are in the correct order. Optimize
2631          the pre-pass. Optimize one-character ranges */          one-character ranges */
2632    
2633            if (d < c)
2634              {
2635              *errorcodeptr = ERR8;
2636              goto FAILED;
2637              }
2638    
2639          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2640    
# Line 2067  for (;; ptr++) Line 2655  for (;; ptr++)
2655  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2656            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
2657              {              {
2658              int occ, ocd;              unsigned int occ, ocd;
2659              int cc = c;              unsigned int cc = c;
2660              int origd = d;              unsigned int origd = d;
2661              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
2662                {                {
2663                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */
# Line 2127  for (;; ptr++) Line 2715  for (;; ptr++)
2715          ranges that lie entirely within 0-127 when there is UCP support; else          ranges that lie entirely within 0-127 when there is UCP support; else
2716          for partial ranges without UCP support. */          for partial ranges without UCP support. */
2717    
2718          for (; c <= d; c++)          class_charcount += d - c + 1;
2719            class_lastchar = d;
2720    
2721            /* We can save a bit of time by skipping this in the pre-compile. */
2722    
2723            if (lengthptr == NULL) for (; c <= d; c++)
2724            {            {
2725            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
2726            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
# Line 2135  for (;; ptr++) Line 2728  for (;; ptr++)
2728              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
2729              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
2730              }              }
           class_charcount++;                /* in case a one-char range */  
           class_lastchar = c;  
2731            }            }
2732    
2733          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
# Line 2160  for (;; ptr++) Line 2751  for (;; ptr++)
2751  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2752          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
2753            {            {
2754            int othercase;            unsigned int othercase;
2755            if ((othercase = _pcre_ucp_othercase(c)) >= 0)            if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
2756              {              {
2757              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
2758              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 2186  for (;; ptr++) Line 2777  for (;; ptr++)
2777          }          }
2778        }        }
2779    
2780      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached. This "while" is the end of the "do" above. */
     loop. This "while" is the end of the "do" above. */  
2781    
2782      while ((c = *(++ptr)) != ']' || inescq);      while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
2783    
2784        if (c == 0)                          /* Missing terminating ']' */
2785          {
2786          *errorcodeptr = ERR6;
2787          goto FAILED;
2788          }
2789    
2790      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
2791      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
# Line 2253  for (;; ptr++) Line 2849  for (;; ptr++)
2849    
2850      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
2851      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode. If there are no characters < 256,
2852      we can omit the bitmap. */      we can omit the bitmap in the actual compiled code. */
2853    
2854  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2855      if (class_utf8)      if (class_utf8)
# Line 2263  for (;; ptr++) Line 2859  for (;; ptr++)
2859        code += LINK_SIZE;        code += LINK_SIZE;
2860        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT : 0;
2861    
2862        /* If the map is required, install it, and move on to the end of        /* If the map is required, move up the extra data to make room for it;
2863        the extra data */        otherwise just move the code pointer to the end of the extra data. */
2864    
2865        if (class_charcount > 0)        if (class_charcount > 0)
2866          {          {
2867          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
2868            memmove(code + 32, code, class_utf8data - code);
2869          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
2870          code = class_utf8data;          code = class_utf8data + 32;
         }  
   
       /* If the map is not required, slide down the extra data. */  
   
       else  
         {  
         int len = class_utf8data - (code + 33);  
         memmove(code + 1, code + 33, len);  
         code += len + 1;  
2871          }          }
2872          else code = class_utf8data;
2873    
2874        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
2875    
# Line 2297  for (;; ptr++) Line 2886  for (;; ptr++)
2886      if (negate_class)      if (negate_class)
2887        {        {
2888        *code++ = OP_NCLASS;        *code++ = OP_NCLASS;
2889        for (c = 0; c < 32; c++) code[c] = ~classbits[c];        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
2890            for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2891        }        }
2892      else      else
2893        {        {
# Line 2307  for (;; ptr++) Line 2897  for (;; ptr++)
2897      code += 32;      code += 32;
2898      break;      break;
2899    
2900    
2901        /* ===================================================================*/
2902      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2903      has been tested above. */      has been tested above. */
2904    
# Line 2374  for (;; ptr++) Line 2966  for (;; ptr++)
2966        }        }
2967      else repeat_type = greedy_default;      else repeat_type = greedy_default;
2968    
     /* If previous was a recursion, we need to wrap it inside brackets so that  
     it can be replicated if necessary. */  
   
     if (*previous == OP_RECURSE)  
       {  
       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);  
       code += 1 + LINK_SIZE;  
       *previous = OP_BRA;  
       PUT(previous, 1, code - previous);  
       *code = OP_KET;  
       PUT(code, 1, code - previous);  
       code += 1 + LINK_SIZE;  
       }  
   
2969      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
2970      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
2971      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
# Line 2421  for (;; ptr++) Line 2999  for (;; ptr++)
2999          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3000          }          }
3001    
3002          /* If the repetition is unlimited, it pays to see if the next thing on
3003          the line is something that cannot possibly match this character. If so,
3004          automatically possessifying this item gains some performance in the case
3005          where the match fails. */
3006    
3007          if (!possessive_quantifier &&
3008              repeat_max < 0 &&
3009              check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3010                options, cd))
3011            {
3012            repeat_type = 0;    /* Force greedy */
3013            possessive_quantifier = TRUE;
3014            }
3015    
3016        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3017        }        }
3018    
3019      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
3020      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
3021      character repeats by setting opt_type to add a suitable offset into      character repeats by setting opt_type to add a suitable offset into
3022      repeat_type. OP_NOT is currently used only for single-byte chars. */      repeat_type. We can also test for auto-possessification. OP_NOT is
3023        currently used only for single-byte chars. */
3024    
3025      else if (*previous == OP_NOT)      else if (*previous == OP_NOT)
3026        {        {
3027        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3028        c = previous[1];        c = previous[1];
3029          if (!possessive_quantifier &&
3030              repeat_max < 0 &&
3031              check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3032            {
3033            repeat_type = 0;    /* Force greedy */
3034            possessive_quantifier = TRUE;
3035            }
3036        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
3037        }        }
3038    
# Line 2450  for (;; ptr++) Line 3050  for (;; ptr++)
3050        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3051        c = *previous;        c = *previous;
3052    
3053          if (!possessive_quantifier &&
3054              repeat_max < 0 &&
3055              check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3056            {
3057            repeat_type = 0;    /* Force greedy */
3058            possessive_quantifier = TRUE;
3059            }
3060    
3061        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
3062        if (*previous == OP_PROP || *previous == OP_NOTPROP)        if (*previous == OP_PROP || *previous == OP_NOTPROP)
3063          {          {
# Line 2490  for (;; ptr++) Line 3098  for (;; ptr++)
3098          }          }
3099    
3100        /* A repeat minimum of 1 is optimized into some special cases. If the        /* A repeat minimum of 1 is optimized into some special cases. If the
3101        maximum is unlimited, we use OP_PLUS. Otherwise, the original item it        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3102        left in place and, if the maximum is greater than 1, we use OP_UPTO with        left in place and, if the maximum is greater than 1, we use OP_UPTO with
3103        one less than the maximum. */        one less than the maximum. */
3104    
# Line 2543  for (;; ptr++) Line 3151  for (;; ptr++)
3151            }            }
3152    
3153          /* Else insert an UPTO if the max is greater than the min, again          /* Else insert an UPTO if the max is greater than the min, again
3154          preceded by the character, for the previously inserted code. */          preceded by the character, for the previously inserted code. If the
3155            UPTO is just for 1 instance, we can use QUERY instead. */
3156    
3157          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
3158            {            {
# Line 2562  for (;; ptr++) Line 3171  for (;; ptr++)
3171              *code++ = prop_value;              *code++ = prop_value;
3172              }              }
3173            repeat_max -= repeat_min;            repeat_max -= repeat_min;
3174            *code++ = OP_UPTO + repeat_type;  
3175            PUT2INC(code, 0, repeat_max);            if (repeat_max == 1)
3176                {
3177                *code++ = OP_QUERY + repeat_type;
3178                }
3179              else
3180                {
3181                *code++ = OP_UPTO + repeat_type;
3182                PUT2INC(code, 0, repeat_max);
3183                }
3184            }            }
3185          }          }
3186    
# Line 2630  for (;; ptr++) Line 3247  for (;; ptr++)
3247      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
3248      cases. */      cases. */
3249    
3250      else if (*previous >= OP_BRA || *previous == OP_ONCE ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3251               *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
3252        {        {
3253        register int i;        register int i;
3254        int ketoffset = 0;        int ketoffset = 0;
3255        int len = code - previous;        int len = code - previous;
3256        uschar *bralink = NULL;        uschar *bralink = NULL;
3257    
3258          /* Repeating a DEFINE group is pointless */
3259    
3260          if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3261            {
3262            *errorcodeptr = ERR55;
3263            goto FAILED;
3264            }
3265    
3266          /* This is a paranoid check to stop integer overflow later on */
3267    
3268          if (len > MAX_DUPLENGTH)
3269            {
3270            *errorcodeptr = ERR50;
3271            goto FAILED;
3272            }
3273    
3274        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3275        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3276        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 2672  for (;; ptr++) Line 3305  for (;; ptr++)
3305          /* If the maximum is 1 or unlimited, we just have to stick in the          /* If the maximum is 1 or unlimited, we just have to stick in the
3306          BRAZERO and do no more at this point. However, we do need to adjust          BRAZERO and do no more at this point. However, we do need to adjust
3307          any OP_RECURSE calls inside the group that refer to the group itself or          any OP_RECURSE calls inside the group that refer to the group itself or
3308          any internal group, because the offset is from the start of the whole          any internal or forward referenced group, because the offset is from
3309          regex. Temporarily terminate the pattern while doing this. */          the start of the whole regex. Temporarily terminate the pattern while
3310            doing this. */
3311    
3312          if (repeat_max <= 1)          if (repeat_max <= 1)
3313            {            {
3314            *code = OP_END;            *code = OP_END;
3315            adjust_recurse(previous, 1, utf8, cd);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3316            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3317            code++;            code++;
3318            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2696  for (;; ptr++) Line 3330  for (;; ptr++)
3330            {            {
3331            int offset;            int offset;
3332            *code = OP_END;            *code = OP_END;
3333            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3334            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
3335            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
3336            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2716  for (;; ptr++) Line 3350  for (;; ptr++)
3350        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
3351        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
3352        copies that we need. If we set a first char from the group, and didn't        copies that we need. If we set a first char from the group, and didn't
3353        set a required char, copy the latter from the former. */        set a required char, copy the latter from the former. If there are any
3354          forward reference subroutine calls in the group, there will be entries on
3355          the workspace list; replicate these with an appropriate increment. */
3356    
3357        else        else
3358          {          {
3359          if (repeat_min > 1)          if (repeat_min > 1)
3360            {            {
3361            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;            /* In the pre-compile phase, we don't actually do the replication. We
3362            for (i = 1; i < repeat_min; i++)            just adjust the length as if we had. */
3363    
3364              if (lengthptr != NULL)
3365                *lengthptr += (repeat_min - 1)*length_prevgroup;
3366    
3367              /* This is compiling for real */
3368    
3369              else
3370              {              {
3371              memcpy(code, previous, len);              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3372              code += len;              for (i = 1; i < repeat_min; i++)
3373                  {
3374                  uschar *hc;
3375                  uschar *this_hwm = cd->hwm;
3376                  memcpy(code, previous, len);
3377                  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3378                    {
3379                    PUT(cd->hwm, 0, GET(hc, 0) + len);
3380                    cd->hwm += LINK_SIZE;
3381                    }
3382                  save_hwm = this_hwm;
3383                  code += len;
3384                  }
3385              }              }
3386            }            }
3387    
3388          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
3389          }          }
3390    
# Line 2736  for (;; ptr++) Line 3392  for (;; ptr++)
3392        the maximum is limited, it replicates the group in a nested fashion,        the maximum is limited, it replicates the group in a nested fashion,
3393        remembering the bracket starts on a stack. In the case of a zero minimum,        remembering the bracket starts on a stack. In the case of a zero minimum,
3394        the first one was set up above. In all cases the repeat_max now specifies        the first one was set up above. In all cases the repeat_max now specifies
3395        the number of additional copies needed. */        the number of additional copies needed. Again, we must remember to
3396          replicate entries on the forward reference list. */
3397    
3398        if (repeat_max >= 0)        if (repeat_max >= 0)
3399          {          {
3400          for (i = repeat_max - 1; i >= 0; i--)          /* In the pre-compile phase, we don't actually do the replication. We
3401            just adjust the length as if we had. For each repetition we must add 1
3402            to the length for BRAZERO and for all but the last repetition we must
3403            add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3404    
3405            if (lengthptr != NULL && repeat_max > 0)
3406              *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3407                2 - 2*LINK_SIZE;  /* Last one doesn't nest */
3408    
3409            /* This is compiling for real */
3410    
3411            else for (i = repeat_max - 1; i >= 0; i--)
3412            {            {
3413              uschar *hc;
3414              uschar *this_hwm = cd->hwm;
3415    
3416            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
3417    
3418            /* All but the final copy start a new nesting, maintaining the            /* All but the final copy start a new nesting, maintaining the
# Line 2757  for (;; ptr++) Line 3428  for (;; ptr++)
3428              }              }
3429    
3430            memcpy(code, previous, len);            memcpy(code, previous, len);
3431              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3432                {
3433                PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3434                cd->hwm += LINK_SIZE;
3435                }
3436              save_hwm = this_hwm;
3437            code += len;            code += len;
3438            }            }
3439    
# Line 2779  for (;; ptr++) Line 3456  for (;; ptr++)
3456        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
3457        can't just offset backwards from the current code point, because we        can't just offset backwards from the current code point, because we
3458        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
3459        correct offset was computed above. */        correct offset was computed above.
3460    
3461        else code[-ketoffset] = OP_KETRMAX + repeat_type;        Then, when we are doing the actual compile phase, check to see whether
3462          this group is a non-atomic one that could match an empty string. If so,
3463          convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3464          that runtime checking can be done. [This check is also applied to
3465          atomic groups at runtime, but in a different way.] */
3466    
3467          else
3468            {
3469            uschar *ketcode = code - ketoffset;
3470            uschar *bracode = ketcode - GET(ketcode, 1);
3471            *ketcode = OP_KETRMAX + repeat_type;
3472            if (lengthptr == NULL && *bracode != OP_ONCE)
3473              {
3474              uschar *scode = bracode;
3475              do
3476                {
3477                if (could_be_empty_branch(scode, ketcode, utf8))
3478                  {
3479                  *bracode += OP_SBRA - OP_BRA;
3480                  break;
3481                  }
3482                scode += GET(scode, 1);
3483                }
3484              while (*scode == OP_ALT);
3485              }
3486            }
3487        }        }
3488    
3489      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 2792  for (;; ptr++) Line 3494  for (;; ptr++)
3494        goto FAILED;        goto FAILED;
3495        }        }
3496    
3497      /* If the character following a repeat is '+', we wrap the entire repeated      /* If the character following a repeat is '+', or if certain optimization
3498      item inside OP_ONCE brackets. This is just syntactic sugar, taken from      tests above succeeded, possessive_quantifier is TRUE. For some of the
3499      Sun's Java package. The repeated item starts at tempcode, not at previous,      simpler opcodes, there is a special alternative opcode for this. For
3500      which might be the first part of a string whose (former) last char we      anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3501      repeated. However, we don't support '+' after a greediness '?'. */      The '+' notation is just syntactic sugar, taken from Sun's Java package,
3502        but the special opcodes can optimize it a bit. The repeated item starts at
3503        tempcode, not at previous, which might be the first part of a string whose
3504        (former) last char we repeated.
3505    
3506        Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3507        an 'upto' may follow. We skip over an 'exact' item, and then test the
3508        length of what remains before proceeding. */
3509    
3510      if (possessive_quantifier)      if (possessive_quantifier)
3511        {        {
3512        int len = code - tempcode;        int len;
3513        memmove(tempcode + 1+LINK_SIZE, tempcode, len);        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3514        code += 1 + LINK_SIZE;            *tempcode == OP_NOTEXACT)
3515        len += 1 + LINK_SIZE;          tempcode += _pcre_OP_lengths[*tempcode];
3516        tempcode[0] = OP_ONCE;        len = code - tempcode;
3517        *code++ = OP_KET;        if (len > 0) switch (*tempcode)
3518        PUTINC(code, 0, len);          {
3519        PUT(tempcode, 1, len);          case OP_STAR:  *tempcode = OP_POSSTAR; break;
3520            case OP_PLUS:  *tempcode = OP_POSPLUS; break;
3521            case OP_QUERY: *tempcode = OP_POSQUERY; break;
3522            case OP_UPTO:  *tempcode = OP_POSUPTO; break;
3523    
3524            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
3525            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
3526            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3527            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
3528    
3529            case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
3530            case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
3531            case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3532            case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
3533    
3534            default:
3535            memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3536            code += 1 + LINK_SIZE;
3537            len += 1 + LINK_SIZE;
3538            tempcode[0] = OP_ONCE;
3539            *code++ = OP_KET;
3540            PUTINC(code, 0, len);
3541            PUT(tempcode, 1, len);
3542            break;
3543            }
3544        }        }
3545    
3546      /* In all case we no longer have a previous item. We also set the      /* In all case we no longer have a previous item. We also set the
# Line 2820  for (;; ptr++) Line 3553  for (;; ptr++)
3553      break;      break;
3554    
3555    
3556      /* Start of nested bracket sub-expression, or comment or lookahead or      /* ===================================================================*/
3557      lookbehind or option setting or condition. First deal with special things      /* Start of nested parenthesized sub-expression, or comment or lookahead or
3558      that can come after a bracket; all are introduced by ?, and the appearance      lookbehind or option setting or condition or all the other extended
3559      of any of them means that this is not a referencing group. They were      parenthesis forms. First deal with the specials; all are introduced by ?,
3560      checked for validity in the first pass over the string, so we don't have to      and the appearance of any of them means that this is not a capturing
3561      check for syntax errors here.  */      group. */
3562    
3563      case '(':      case '(':
3564      newoptions = options;      newoptions = options;
3565      skipbytes = 0;      skipbytes = 0;
3566        bravalue = OP_CBRA;
3567        save_hwm = cd->hwm;
3568    
3569      if (*(++ptr) == '?')      if (*(++ptr) == '?')
3570        {        {
3571        int set, unset;        int i, set, unset, namelen;
3572        int *optset;        int *optset;
3573          const uschar *name;
3574          uschar *slot;
3575    
3576        switch (*(++ptr))        switch (*(++ptr))
3577          {          {
3578          case '#':                 /* Comment; skip to ket */          case '#':                 /* Comment; skip to ket */
3579          ptr++;          ptr++;
3580          while (*ptr != ')') ptr++;          while (*ptr != 0 && *ptr != ')') ptr++;
3581            if (*ptr == 0)
3582              {
3583              *errorcodeptr = ERR18;
3584              goto FAILED;
3585              }
3586          continue;          continue;
3587    
3588          case ':':                 /* Non-extracting bracket */  
3589            /* ------------------------------------------------------------ */
3590            case ':':                 /* Non-capturing bracket */
3591          bravalue = OP_BRA;          bravalue = OP_BRA;
3592          ptr++;          ptr++;
3593          break;          break;
3594    
3595    
3596            /* ------------------------------------------------------------ */
3597          case '(':          case '(':
3598          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
3599    
3600          /* Condition to test for recursion */          /* A condition can be an assertion, a number (referring to a numbered
3601            group), a name (referring to a named group), or 'R', referring to
3602            recursion. R<digits> and R&name are also permitted for recursion tests.
3603    
3604            There are several syntaxes for testing a named group: (?(name)) is used
3605            by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3606    
3607            There are two unfortunate ambiguities, caused by history. (a) 'R' can
3608            be the recursive thing or the name 'R' (and similarly for 'R' followed
3609            by digits), and (b) a number could be a name that consists of digits.
3610            In both cases, we look for a name first; if not found, we try the other
3611            cases. */
3612    
3613            /* For conditions that are assertions, check the syntax, and then exit
3614            the switch. This will take control down to where bracketed groups,
3615            including assertions, are processed. */
3616    
3617            if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3618              break;
3619    
3620            /* Most other conditions use OP_CREF (a couple change to OP_RREF
3621            below), and all need to skip 3 bytes at the start of the group. */
3622    
3623            code[1+LINK_SIZE] = OP_CREF;
3624            skipbytes = 3;
3625    
3626            /* Check for a test for recursion in a named group. */
3627    
3628            if (ptr[1] == 'R' && ptr[2] == '&')
3629              {
3630              terminator = -1;
3631              ptr += 2;
3632              code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
3633              }
3634    
3635            /* Check for a test for a named group's having been set, using the Perl
3636            syntax (?(<name>) or (?('name') */
3637    
3638            else if (ptr[1] == '<')
3639              {
3640              terminator = '>';
3641              ptr++;
3642              }
3643            else if (ptr[1] == '\'')
3644              {
3645              terminator = '\'';
3646              ptr++;
3647              }
3648            else terminator = 0;
3649    
3650            /* We now expect to read a name; anything else is an error */
3651    
3652            if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3653              {
3654              ptr += 1;  /* To get the right offset */
3655              *errorcodeptr = ERR28;
3656              goto FAILED;
3657              }
3658    
3659            /* Read the name, but also get it as a number if it's all digits */
3660    
3661            recno = 0;
3662            name = ++ptr;
3663            while ((cd->ctypes[*ptr] & ctype_word) != 0)
3664              {
3665              if (recno >= 0)
3666                recno = ((digitab[*ptr] & ctype_digit) != 0)?
3667                  recno * 10 + *ptr - '0' : -1;
3668              ptr++;
3669              }
3670            namelen = ptr - name;
3671    
3672            if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3673              {
3674              ptr--;      /* Error offset */
3675              *errorcodeptr = ERR26;
3676              goto FAILED;
3677              }
3678    
3679            /* Do no further checking in the pre-compile phase. */
3680    
3681            if (lengthptr != NULL) break;
3682    
3683            /* In the real compile we do the work of looking for the actual
3684            reference. */
3685    
3686            slot = cd->name_table;
3687            for (i = 0; i < cd->names_found; i++)
3688              {
3689              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3690              slot += cd->name_entry_size;
3691              }
3692    
3693            /* Found a previous named subpattern */
3694    
3695            if (i < cd->names_found)
3696              {
3697              recno = GET2(slot, 0);
3698              PUT2(code, 2+LINK_SIZE, recno);
3699              }
3700    
3701            /* Search the pattern for a forward reference */
3702    
3703            else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3704                            (options & PCRE_EXTENDED) != 0)) > 0)
3705              {
3706              PUT2(code, 2+LINK_SIZE, i);
3707              }
3708    
3709            /* If terminator == 0 it means that the name followed directly after
3710            the opening parenthesis [e.g. (?(abc)...] and in this case there are
3711            some further alternatives to try. For the cases where terminator != 0
3712            [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
3713            now checked all the possibilities, so give an error. */
3714    
3715          if (ptr[1] == 'R')          else if (terminator != 0)
3716            {            {
3717            code[1+LINK_SIZE] = OP_CREF;            *errorcodeptr = ERR15;
3718            PUT2(code, 2+LINK_SIZE, CREF_RECURSE);            goto FAILED;
           skipbytes = 3;  
           ptr += 3;  
3719            }            }
3720    
3721          /* Condition to test for a numbered subpattern match. We know that          /* Check for (?(R) for recursion. Allow digits after R to specify a
3722          if a digit follows ( then there will just be digits until ) because          specific group number. */
         the syntax was checked in the first pass. */  
3723    
3724          else if ((digitab[ptr[1]] && ctype_digit) != 0)          else if (*name == 'R')
3725            {            {
3726            int condref;                 /* Don't amalgamate; some compilers */            recno = 0;
3727            condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */            for (i = 1; i < namelen; i++)
           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';  
           if (condref == 0)  
3728              {              {
3729              *errorcodeptr = ERR35;              if ((digitab[name[i]] & ctype_digit) == 0)
3730              goto FAILED;                {
3731                  *errorcodeptr = ERR15;
3732                  goto FAILED;
3733                  }
3734                recno = recno * 10 + name[i] - '0';
3735              }              }
3736            ptr++;            if (recno == 0) recno = RREF_ANY;
3737            code[1+LINK_SIZE] = OP_CREF;            code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
3738            PUT2(code, 2+LINK_SIZE, condref);            PUT2(code, 2+LINK_SIZE, recno);
3739            skipbytes = 3;            }
3740    
3741            /* Similarly, check for the (?(DEFINE) "condition", which is always
3742            false. */
3743    
3744            else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
3745              {
3746              code[1+LINK_SIZE] = OP_DEF;
3747              skipbytes = 1;
3748              }
3749    
3750            /* Check for the "name" actually being a subpattern number. */
3751    
3752            else if (recno > 0)
3753              {
3754              PUT2(code, 2+LINK_SIZE, recno);
3755              }
3756    
3757            /* Either an unidentified subpattern, or a reference to (?(0) */
3758    
3759            else
3760              {
3761              *errorcodeptr = (recno == 0)? ERR35: ERR15;
3762              goto FAILED;
3763            }            }
         /* For conditions that are assertions, we just fall through, having  
         set bravalue above. */  
3764          break;          break;
3765    
3766    
3767            /* ------------------------------------------------------------ */
3768          case '=':                 /* Positive lookahead */          case '=':                 /* Positive lookahead */
3769          bravalue = OP_ASSERT;          bravalue = OP_ASSERT;
3770          ptr++;          ptr++;
3771          break;          break;
3772    
3773    
3774            /* ------------------------------------------------------------ */
3775          case '!':                 /* Negative lookahead */          case '!':                 /* Negative lookahead */
3776          bravalue = OP_ASSERT_NOT;          bravalue = OP_ASSERT_NOT;
3777          ptr++;          ptr++;
3778          break;          break;
3779    
3780          case '<':                 /* Lookbehinds */  
3781          switch (*(++ptr))          /* ------------------------------------------------------------ */
3782            case '<':                 /* Lookbehind or named define */
3783            switch (ptr[1])
3784            {            {
3785            case '=':               /* Positive lookbehind */            case '=':               /* Positive lookbehind */
3786            bravalue = OP_ASSERTBACK;            bravalue = OP_ASSERTBACK;
3787            ptr++;            ptr += 2;
3788            break;            break;
3789    
3790            case '!':               /* Negative lookbehind */            case '!':               /* Negative lookbehind */
3791            bravalue = OP_ASSERTBACK_NOT;            bravalue = OP_ASSERTBACK_NOT;
3792            ptr++;            ptr += 2;
3793            break;            break;
3794    
3795              default:                /* Could be name define, else bad */
3796              if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
3797              ptr++;                  /* Correct offset for error */
3798              *errorcodeptr = ERR24;
3799              goto FAILED;
3800            }            }
3801          break;          break;
3802    
3803    
3804            /* ------------------------------------------------------------ */
3805          case '>':                 /* One-time brackets */          case '>':                 /* One-time brackets */
3806          bravalue = OP_ONCE;          bravalue = OP_ONCE;
3807          ptr++;          ptr++;
3808          break;          break;
3809    
3810    
3811            /* ------------------------------------------------------------ */
3812          case 'C':                 /* Callout - may be followed by digits; */          case 'C':                 /* Callout - may be followed by digits; */
3813          previous_callout = code;  /* Save for later completion */          previous_callout = code;  /* Save for later completion */
3814          after_manual_callout = 1; /* Skip one item before completing */          after_manual_callout = 1; /* Skip one item before completing */
3815          *code++ = OP_CALLOUT;     /* Already checked that the terminating */          *code++ = OP_CALLOUT;
3816            {                       /* closing parenthesis is present. */            {
3817            int n = 0;            int n = 0;
3818            while ((digitab[*(++ptr)] & ctype_digit) != 0)            while ((digitab[*(++ptr)] & ctype_digit) != 0)
3819              n = n * 10 + *ptr - '0';              n = n * 10 + *ptr - '0';
3820              if (*ptr != ')')
3821                {
3822                *errorcodeptr = ERR39;
3823                goto FAILED;
3824                }
3825            if (n > 255)            if (n > 255)
3826              {              {
3827              *errorcodeptr = ERR38;              *errorcodeptr = ERR38;
# Line 2935  for (;; ptr++) Line 3835  for (;; ptr++)
3835          previous = NULL;          previous = NULL;
3836          continue;          continue;
3837    
3838          case 'P':                 /* Named subpattern handling */  
3839          if (*(++ptr) == '<')      /* Definition */          /* ------------------------------------------------------------ */
3840            case 'P':                 /* Python-style named subpattern handling */
3841            if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
3842              {
3843              is_recurse = *ptr == '>';
3844              terminator = ')';
3845              goto NAMED_REF_OR_RECURSE;
3846              }
3847            else if (*ptr != '<')    /* Test for Python-style definition */
3848            {            {
3849            int i, namelen;            *errorcodeptr = ERR41;
3850            uschar *slot = cd->name_table;            goto FAILED;
3851            const uschar *name;     /* Don't amalgamate; some compilers */            }
3852            name = ++ptr;           /* grumble at autoincrement in declaration */          /* Fall through to handle (?P< as (?< is handled */
3853    
           while (*ptr++ != '>');  
           namelen = ptr - name - 1;  
3854    
3855            for (i = 0; i < cd->names_found; i++)          /* ------------------------------------------------------------ */
3856              {          DEFINE_NAME:    /* Come here from (?< handling */
3857              int crc = memcmp(name, slot+2, namelen);          case '\'':
3858              if (crc == 0)            {
3859                {            terminator = (*ptr == '<')? '>' : '\'';
3860                if (slot[2+namelen] == 0)            name = ++ptr;
3861    
3862              while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3863              namelen = ptr - name;
3864    
3865              /* In the pre-compile phase, just do a syntax check. */
3866    
3867              if (lengthptr != NULL)
3868                {
3869                if (*ptr != terminator)
3870                  {
3871                  *errorcodeptr = ERR42;
3872                  goto FAILED;
3873                  }
3874                if (cd->names_found >= MAX_NAME_COUNT)
3875                  {
3876                  *errorcodeptr = ERR49;
3877                  goto FAILED;
3878                  }
3879                if (namelen + 3 > cd->name_entry_size)
3880                  {
3881                  cd->name_entry_size = namelen + 3;
3882                  if (namelen > MAX_NAME_SIZE)
3883                  {                  {
3884                  *errorcodeptr = ERR43;                  *errorcodeptr = ERR48;
3885                  goto FAILED;                  goto FAILED;
3886                  }                  }
               crc = -1;             /* Current name is substring */  
3887                }                }
3888              if (crc < 0)              }
3889    
3890              /* In the real compile, create the entry in the table */
3891    
3892              else
3893                {
3894                slot = cd->name_table;
3895                for (i = 0; i < cd->names_found; i++)
3896                {                {
3897                memmove(slot + cd->name_entry_size, slot,                int crc = memcmp(name, slot+2, namelen);
3898                  (cd->names_found - i) * cd->name_entry_size);                if (crc == 0)
3899                break;                  {
3900                    if (slot[2+namelen] == 0)
3901                      {
3902                      if ((options & PCRE_DUPNAMES) == 0)
3903                        {
3904                        *errorcodeptr = ERR43;
3905                        goto FAILED;
3906                        }
3907                      }
3908                    else crc = -1;      /* Current name is substring */
3909                    }
3910                  if (crc < 0)
3911                    {
3912                    memmove(slot + cd->name_entry_size, slot,
3913                      (cd->names_found - i) * cd->name_entry_size);
3914                    break;
3915                    }
3916                  slot += cd->name_entry_size;
3917                }                }
             slot += cd->name_entry_size;  
             }  
3918    
3919            PUT2(slot, 0, *brackets + 1);              PUT2(slot, 0, cd->bracount + 1);
3920            memcpy(slot + 2, name, namelen);              memcpy(slot + 2, name, namelen);
3921            slot[2+namelen] = 0;              slot[2+namelen] = 0;
3922            cd->names_found++;              }
           goto NUMBERED_GROUP;  
3923            }            }
3924    
3925          if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */          /* In both cases, count the number of names we've encountered. */
3926    
3927            ptr++;                    /* Move past > or ' */
3928            cd->names_found++;
3929            goto NUMBERED_GROUP;
3930    
3931    
3932            /* ------------------------------------------------------------ */
3933            case '&':                 /* Perl recursion/subroutine syntax */
3934            terminator = ')';
3935            is_recurse = TRUE;
3936            /* Fall through */
3937    
3938            /* We come here from the Python syntax above that handles both
3939            references (?P=name) and recursion (?P>name), as well as falling
3940            through from the Perl recursion syntax (?&name). */
3941    
3942            NAMED_REF_OR_RECURSE:
3943            name = ++ptr;
3944            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3945            namelen = ptr - name;
3946    
3947            /* In the pre-compile phase, do a syntax check and set a dummy
3948            reference number. */
3949    
3950            if (lengthptr != NULL)
3951            {            {
3952            int i, namelen;            if (*ptr != terminator)
3953            int type = *ptr++;              {
3954            const uschar *name = ptr;              *errorcodeptr = ERR42;
3955            uschar *slot = cd->name_table;              goto FAILED;
3956                }
3957              if (namelen > MAX_NAME_SIZE)
3958                {
3959                *errorcodeptr = ERR48;
3960                goto FAILED;
3961                }
3962              recno = 0;
3963              }
3964    
3965            while (*ptr != ')') ptr++;          /* In the real compile, seek the name in the table */
           namelen = ptr - name;  
3966    
3967            else
3968              {
3969              slot = cd->name_table;
3970            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
3971              {              {
3972              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3973              slot += cd->name_entry_size;              slot += cd->name_entry_size;
3974              }              }
3975            if (i >= cd->names_found)  
3976              if (i < cd->names_found)         /* Back reference */
3977                {
3978                recno = GET2(slot, 0);
3979                }
3980              else if ((recno =                /* Forward back reference */
3981                        find_parens(ptr, cd->bracount, name, namelen,
3982                          (options & PCRE_EXTENDED) != 0)) <= 0)
3983              {              {
3984              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
3985              goto FAILED;              goto FAILED;
3986              }              }
3987              }
3988    
3989            recno = GET2(slot, 0);          /* In both phases, we can now go to the code that handles numerical
3990            recursion or backreferences. */
           if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */  
   
           /* Back reference */  
3991    
3992            previous = code;          if (is_recurse) goto HANDLE_RECURSION;
3993            *code++ = OP_REF;            else goto HANDLE_REFERENCE;
           PUT2INC(code, 0, recno);  
           cd->backref_map |= (recno < 32)? (1 << recno) : 1;  
           if (recno > cd->top_backref) cd->top_backref = recno;  
           continue;  
           }  
3994    
         /* Should never happen */  
         break;  
3995    
3996          case 'R':                 /* Pattern recursion */          /* ------------------------------------------------------------ */
3997            case 'R':                 /* Recursion */
3998          ptr++;                    /* Same as (?0)      */          ptr++;                    /* Same as (?0)      */
3999          /* Fall through */          /* Fall through */
4000    
         /* Recursion or "subroutine" call */  
4001    
4002          case '0': case '1': case '2': case '3': case '4':          /* ------------------------------------------------------------ */
4003          case '5': case '6': case '7': case '8': case '9':          case '-': case '+':
4004            case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4005            case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4006            {            {
4007            const uschar *called;            const uschar *called;
4008              int sign = *ptr;
4009    
4010              if (sign == '+') ptr++;
4011              else if (sign == '-')
4012                {
4013                if ((digitab[ptr[1]] & ctype_digit) == 0)
4014                  goto OTHER_CHAR_AFTER_QUERY;
4015                ptr++;
4016                }
4017    
4018            recno = 0;            recno = 0;
4019            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4020              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4021    
4022              if (*ptr != ')')
4023                {
4024                *errorcodeptr = ERR29;
4025                goto FAILED;
4026                }
4027    
4028              if (sign == '-')
4029                {
4030                if (recno == 0)
4031                  {
4032                  *errorcodeptr = ERR58;
4033                  goto FAILED;
4034                  }
4035                recno = cd->bracount - recno + 1;
4036                if (recno <= 0)
4037                  {
4038                  *errorcodeptr = ERR15;
4039                  goto FAILED;
4040                  }
4041                }
4042              else if (sign == '+')
4043                {
4044                if (recno == 0)
4045                  {
4046                  *errorcodeptr = ERR58;
4047                  goto FAILED;
4048                  }
4049                recno += cd->bracount;
4050                }
4051    
4052            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
4053    
4054            HANDLE_RECURSION:            HANDLE_RECURSION:
4055    
4056            previous = code;            previous = code;
4057              called = cd->start_code;
4058    
4059            /* Find the bracket that is being referenced. Temporarily end the            /* When we are actually compiling, find the bracket that is being
4060            regex in case it doesn't exist. */            referenced. Temporarily end the regex in case it doesn't exist before
4061              this point. If we end up with a forward reference, first check that
4062            *code = OP_END;            the bracket does occur later so we can give the error (and position)
4063            called = (recno == 0)?            now. Then remember this forward reference in the workspace so it can
4064              cd->start_code : find_bracket(cd->start_code, utf8, recno);            be filled in at the end. */
4065    
4066            if (called == NULL)            if (lengthptr == NULL)
4067              {              {
4068              *errorcodeptr = ERR15;              *code = OP_END;
4069              goto FAILED;              if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
             }  
4070    
4071            /* If the subpattern is still open, this is a recursive call. We              /* Forward reference */
           check to see if this is a left recursion that could loop for ever,  
           and diagnose that case. */  
4072    
4073            if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))              if (called == NULL)
4074              {                {
4075              *errorcodeptr = ERR40;                if (find_parens(ptr, cd->bracount, NULL, recno,
4076              goto FAILED;                     (options & PCRE_EXTENDED) != 0) < 0)
4077                    {
4078                    *errorcodeptr = ERR15;
4079                    goto FAILED;
4080                    }
4081                  called = cd->start_code + recno;
4082                  PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4083                  }
4084    
4085                /* If not a forward reference, and the subpattern is still open,
4086                this is a recursive call. We check to see if this is a left
4087                recursion that could loop for ever, and diagnose that case. */
4088    
4089                else if (GET(called, 1) == 0 &&
4090                         could_be_empty(called, code, bcptr, utf8))
4091                  {
4092                  *errorcodeptr = ERR40;
4093                  goto FAILED;
4094                  }
4095              }              }
4096    
4097            /* Insert the recursion/subroutine item, automatically wrapped inside            /* Insert the recursion/subroutine item, automatically wrapped inside
4098            "once" brackets. */            "once" brackets. Set up a "previous group" length so that a
4099              subsequent quantifier will work. */
4100    
4101            *code = OP_ONCE;            *code = OP_ONCE;
4102            PUT(code, 1, 2 + 2*LINK_SIZE);            PUT(code, 1, 2 + 2*LINK_SIZE);
# Line 3069  for (;; ptr++) Line 4109  for (;; ptr++)
4109            *code = OP_KET;            *code = OP_KET;
4110            PUT(code, 1, 2 + 2*LINK_SIZE);            PUT(code, 1, 2 + 2*LINK_SIZE);
4111            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
4112    
4113              length_prevgroup = 3 + 3*LINK_SIZE;
4114            }            }
4115    
4116            /* Can't determine a first byte now */
4117    
4118            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4119          continue;          continue;
4120    
         /* Character after (? not specially recognized */  
4121    
4122          default:                  /* Option setting */          /* ------------------------------------------------------------ */
4123            default:              /* Other characters: check option setting */
4124            OTHER_CHAR_AFTER_QUERY:
4125          set = unset = 0;          set = unset = 0;
4126          optset = &set;          optset = &set;
4127    
# Line 3084  for (;; ptr++) Line 4131  for (;; ptr++)
4131              {              {
4132              case '-': optset = &unset; break;              case '-': optset = &unset; break;
4133    
4134                case 'J':    /* Record that it changed in the external options */
4135                *optset |= PCRE_DUPNAMES;
4136                cd->external_options |= PCRE_JCHANGED;
4137                break;
4138    
4139              case 'i': *optset |= PCRE_CASELESS; break;              case 'i': *optset |= PCRE_CASELESS; break;
4140              case 'm': *optset |= PCRE_MULTILINE; break;              case 'm': *optset |= PCRE_MULTILINE; break;
4141              case 's': *optset |= PCRE_DOTALL; break;              case 's': *optset |= PCRE_DOTALL; break;
4142              case 'x': *optset |= PCRE_EXTENDED; break;              case 'x': *optset |= PCRE_EXTENDED; break;
4143              case 'U': *optset |= PCRE_UNGREEDY; break;              case 'U': *optset |= PCRE_UNGREEDY; break;
4144              case 'X': *optset |= PCRE_EXTRA; break;              case 'X': *optset |= PCRE_EXTRA; break;
4145    
4146                default:  *errorcodeptr = ERR12;
4147                          ptr--;    /* Correct the offset */
4148                          goto FAILED;
4149              }              }
4150            }            }
4151    
# Line 3098  for (;; ptr++) Line 4154  for (;; ptr++)
4154          newoptions = (options | set) & (~unset);          newoptions = (options | set) & (~unset);
4155    
4156          /* If the options ended with ')' this is not the start of a nested          /* If the options ended with ')' this is not the start of a nested
4157          group with option changes, so the options change at this level. Compile          group with option changes, so the options change at this level. If this
4158          code to change the ims options if this setting actually changes any of          item is right at the start of the pattern, the options can be
4159          them. We also pass the new setting back so that it can be put at the          abstracted and made external in the pre-compile phase, and ignored in
4160          start of any following branches, and when this group ends (if we are in          the compile phase. This can be helpful when matching -- for instance in
4161          a group), a resetting item can be compiled.          caseless checking of required bytes.
4162    
4163          Note that if this item is right at the start of the pattern, the          If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4164          options will have been abstracted and made global, so there will be no          definitely *not* at the start of the pattern because something has been
4165          change to compile. */          compiled. In the pre-compile phase, however, the code pointer can have
4166            that value after the start, because it gets reset as code is discarded
4167            during the pre-compile. However, this can happen only at top level - if
4168            we are within parentheses, the starting BRA will still be present. At
4169            any parenthesis level, the length value can be used to test if anything
4170            has been compiled at that level. Thus, a test for both these conditions
4171            is necessary to ensure we correctly detect the start of the pattern in
4172            both phases.
4173    
4174            If we are not at the pattern start, compile code to change the ims
4175            options if this setting actually changes any of them. We also pass the
4176            new setting back so that it can be put at the start of any following
4177            branches, and when this group ends (if we are in a group), a resetting
4178            item can be compiled. */
4179    
4180          if (*ptr == ')')          if (*ptr == ')')
4181            {            {
4182            if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))            if (code == cd->start_code + 1 + LINK_SIZE &&
4183                   (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4184              {              {
4185              *code++ = OP_OPT;              cd->external_options = newoptions;
4186              *code++ = newoptions & PCRE_IMS;              options = newoptions;
4187              }              }
4188             else
4189                {
4190                if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4191                  {
4192                  *code++ = OP_OPT;
4193                  *code++ = newoptions & PCRE_IMS;
4194                  }
4195    
4196            /* Change options at this level, and pass them back for use              /* Change options at this level, and pass them back for use
4197            in subsequent branches. Reset the greedy defaults and the case              in subsequent branches. Reset the greedy defaults and the case
4198            value for firstbyte and reqbyte. */              value for firstbyte and reqbyte. */
4199    
4200            *optionsptr = options = newoptions;              *optionsptr = options = newoptions;
4201            greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4202            greedy_non_default = greedy_default ^ 1;              greedy_non_default = greedy_default ^ 1;
4203            req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;              req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4204                }
4205    
4206            previous = NULL;       /* This item can't be repeated */            previous = NULL;       /* This item can't be repeated */
4207            continue;              /* It is complete */            continue;              /* It is complete */
# Line 3136  for (;; ptr++) Line 4214  for (;; ptr++)
4214    
4215          bravalue = OP_BRA;          bravalue = OP_BRA;
4216          ptr++;          ptr++;
4217          }          }     /* End of switch for character following (? */
4218        }        }       /* End of (? handling */
4219    
4220      /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become      /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4221      non-capturing and behave like (?:...) brackets */      all unadorned brackets become non-capturing and behave like (?:...)
4222        brackets. */
4223    
4224      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4225        {        {
4226        bravalue = OP_BRA;        bravalue = OP_BRA;
4227        }        }
4228    
4229      /* Else we have a referencing group; adjust the opcode. If the bracket      /* Else we have a capturing group. */
     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and  
     arrange for the true number to follow later, in an OP_BRANUMBER item. */  
4230    
4231      else      else
4232        {        {
4233        NUMBERED_GROUP:        NUMBERED_GROUP:
4234        if (++(*brackets) > EXTRACT_BASIC_MAX)        cd->bracount += 1;
4235          {        PUT2(code, 1+LINK_SIZE, cd->bracount);
4236          bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;        skipbytes = 2;
         code[1+LINK_SIZE] = OP_BRANUMBER;  
         PUT2(code, 2+LINK_SIZE, *brackets);  
         skipbytes = 3;  
         }  
       else bravalue = OP_BRA + *brackets;  
4237        }        }
4238    
4239      /* Process nested bracketed re. Assertions may not be repeated, but other      /* Process nested bracketed regex. Assertions may not be repeated, but
4240      kinds can be. We copy code into a non-register variable in order to be able      other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4241      to pass its address because some compilers complain otherwise. Pass in a      non-register variable in order to be able to pass its address because some
4242      new setting for the ims options if they have changed. */      compilers complain otherwise. Pass in a new setting for the ims options if
4243        they have changed. */
4244    
4245      previous = (bravalue >= OP_ONCE)? code : NULL;      previous = (bravalue >= OP_ONCE)? code : NULL;
4246      *code = bravalue;      *code = bravalue;
4247      tempcode = code;      tempcode = code;
4248      tempreqvary = cd->req_varyopt;     /* Save value before bracket */      tempreqvary = cd->req_varyopt;     /* Save value before bracket */
4249        length_prevgroup = 0;              /* Initialize for pre-compile phase */
4250    
4251      if (!compile_regex(      if (!compile_regex(
4252           newoptions,                   /* The complete new option state */           newoptions,                   /* The complete new option state */
4253           options & PCRE_IMS,           /* The previous ims option state */           options & PCRE_IMS,           /* The previous ims option state */
          brackets,                     /* Extracting bracket count */  
4254           &tempcode,                    /* Where to put code (updated) */           &tempcode,                    /* Where to put code (updated) */
4255           &ptr,                         /* Input pointer (updated) */           &ptr,                         /* Input pointer (updated) */
4256           errorcodeptr,                 /* Where to put an error message */           errorcodeptr,                 /* Where to put an error message */
4257           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
4258            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4259           skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */           skipbytes,                    /* Skip over bracket number */
4260           &subfirstbyte,                /* For possible first char */           &subfirstbyte,                /* For possible first char */
4261           &subreqbyte,                  /* For possible last char */           &subreqbyte,                  /* For possible last char */
4262           bcptr,                        /* Current branch chain */           bcptr,                        /* Current branch chain */
4263           cd))                          /* Tables block */           cd,                           /* Tables block */
4264             (lengthptr == NULL)? NULL :   /* Actual compile phase */
4265               &length_prevgroup           /* Pre-compile phase */
4266             ))
4267        goto FAILED;        goto FAILED;
4268    
4269      /* At the end of compiling, code is still pointing to the start of the      /* At the end of compiling, code is still pointing to the start of the
# Line 3196  for (;; ptr++) Line 4272  for (;; ptr++)
4272      is on the bracket. */      is on the bracket. */
4273    
4274      /* If this is a conditional bracket, check that there are no more than      /* If this is a conditional bracket, check that there are no more than
4275      two branches in the group. */      two branches in the group, or just one if it's a DEFINE group. */
4276    
4277      else if (bravalue == OP_COND)      if (bravalue == OP_COND)
4278        {        {
4279        uschar *tc = code;        uschar *tc = code;
4280        condcount = 0;        int condcount = 0;
4281    
4282        do {        do {
4283           condcount++;           condcount++;
# Line 3209  for (;; ptr++) Line 4285  for (;; ptr++)
4285           }           }
4286        while (*tc != OP_KET);        while (*tc != OP_KET);
4287    
4288        if (condcount > 2)        /* A DEFINE group is never obeyed inline (the "condition" is always
4289          false). It must have only one branch. */
4290    
4291          if (code[LINK_SIZE+1] == OP_DEF)
4292          {          {
4293          *errorcodeptr = ERR27;          if (condcount > 1)
4294          goto FAILED;            {
4295              *errorcodeptr = ERR54;
4296              goto FAILED;
4297              }
4298            bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
4299            }
4300    
4301          /* A "normal" conditional group. If there is just one branch, we must not
4302          make use of its firstbyte or reqbyte, because this is equivalent to an
4303          empty second branch. */
4304    
4305          else
4306            {
4307            if (condcount > 2)
4308              {
4309              *errorcodeptr = ERR27;
4310              goto FAILED;
4311              }
4312            if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4313          }          }
4314          }
4315    
4316        /* Error if hit end of pattern */
4317    
4318        /* If there is just one branch, we must not make use of its firstbyte or      if (*ptr != ')')
4319        reqbyte, because this is equivalent to an empty second branch. */        {
4320          *errorcodeptr = ERR14;
4321          goto FAILED;
4322          }
4323    
4324        if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;      /* In the pre-compile phase, update the length by the length of the nested
4325        group, less the brackets at either end. Then reduce the compiled code to
4326        just the brackets so that it doesn't use much memory if it is duplicated by
4327        a quantifier. */
4328    
4329        if (lengthptr != NULL)
4330          {
4331          *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4332          code++;
4333          PUTINC(code, 0, 1 + LINK_SIZE);
4334          *code++ = OP_KET;
4335          PUTINC(code, 0, 1 + LINK_SIZE);
4336        }        }
4337    
4338      /* Handle updating of the required and first characters. Update for normal      /* Otherwise update the main code pointer to the end of the group. */
4339      brackets of all kinds, and conditions with two branches (see code above).  
4340      If the bracket is followed by a quantifier with zero repeat, we have to      else code = tempcode;
4341      back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the  
4342      main loop so that they can be accessed for the back off. */      /* For a DEFINE group, required and first character settings are not
4343        relevant. */
4344    
4345        if (bravalue == OP_DEF) break;
4346    
4347        /* Handle updating of the required and first characters for other types of
4348        group. Update for normal brackets of all kinds, and conditions with two
4349        branches (see code above). If the bracket is followed by a quantifier with
4350        zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4351        zerofirstbyte outside the main loop so that they can be accessed for the
4352        back off. */
4353    
4354      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
4355      zerofirstbyte = firstbyte;      zerofirstbyte = firstbyte;
4356      groupsetfirstbyte = FALSE;      groupsetfirstbyte = FALSE;
4357    
4358      if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)      if (bravalue >= OP_ONCE)
4359        {        {
4360        /* If we have not yet set a firstbyte in this branch, take it from the        /* If we have not yet set a firstbyte in this branch, take it from the
4361        subpattern, remembering that it was set here so that a repeat of more        subpattern, remembering that it was set here so that a repeat of more
# Line 3272  for (;; ptr++) Line 4396  for (;; ptr++)
4396      firstbyte, looking for an asserted first char. */      firstbyte, looking for an asserted first char. */
4397    
4398      else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;      else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4399        break;     /* End of processing '(' */
4400    
     /* Now update the main code pointer to the end of the group. */  
   
     code = tempcode;  
   
     /* Error if hit end of pattern */  
   
     if (*ptr != ')')  
       {  
       *errorcodeptr = ERR14;  
       goto FAILED;  
       }  
     break;  
   
     /* Check \ for being a real metacharacter; if not, fall through and handle  
     it as a data character at the start of a string. Escape items are checked  
     for validity in the pre-compiling pass. */  
   
     case '\\':  
     tempptr = ptr;  
     c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);  
4401    
4402      /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values      /* ===================================================================*/
4403        /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4404      are arranged to be the negation of the corresponding OP_values. For the      are arranged to be the negation of the corresponding OP_values. For the
4405      back references, the values are ESC_REF plus the reference number. Only      back references, the values are ESC_REF plus the reference number. Only
4406      back references and those types that consume a character may be repeated.      back references and those types that consume a character may be repeated.
4407      We can test for values between ESC_b and ESC_Z for the latter; this may      We can test for values between ESC_b and ESC_Z for the latter; this may
4408      have to change if any new ones are ever created. */      have to change if any new ones are ever created. */
4409    
4410        case '\\':
4411        tempptr = ptr;
4412        c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4413        if (*errorcodeptr != 0) goto FAILED;
4414    
4415      if (c < 0)      if (c < 0)
4416        {        {
4417        if (-c == ESC_Q)            /* Handle start of quoted string */        if (-c == ESC_Q)            /* Handle start of quoted string */
# Line 3310  for (;; ptr++) Line 4421  for (;; ptr++)
4421          continue;          continue;
4422          }          }
4423    
4424          if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
4425    
4426        /* For metasequences that actually match a character, we disable the        /* For metasequences that actually match a character, we disable the
4427        setting of a first character if it hasn't already been set. */        setting of a first character if it hasn't already been set. */
4428    
# Line 3321  for (;; ptr++) Line 4434  for (;; ptr++)
4434        zerofirstbyte = firstbyte;        zerofirstbyte = firstbyte;
4435        zeroreqbyte = reqbyte;        zeroreqbyte = reqbyte;
4436    
4437        /* Back references are handled specially */        /* \k<name> or \k'name' is a back reference by name (Perl syntax) */
4438    
4439          if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\''))
4440            {
4441            is_recurse = FALSE;
4442            terminator = (*(++ptr) == '<')? '>' : '\'';
4443            goto NAMED_REF_OR_RECURSE;
4444            }
4445    
4446          /* Back references are handled specially; must disable firstbyte if
4447          not set to cope with cases like (?=(\w+))\1: which would otherwise set
4448          ':' later. */
4449    
4450        if (-c >= ESC_REF)        if (-c >= ESC_REF)
4451          {          {
4452          int number = -c - ESC_REF;          recno = -c - ESC_REF;
4453    
4454            HANDLE_REFERENCE:    /* Come here from named backref handling */
4455            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4456          previous = code;          previous = code;
4457          *code++ = OP_REF;          *code++ = OP_REF;
4458          PUT2INC(code, 0, number);          PUT2INC(code, 0, recno);
4459            cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4460            if (recno > cd->top_backref) cd->top_backref = recno;
4461          }          }
4462    
4463        /* So are Unicode property matches, if supported. We know that get_ucp        /* So are Unicode property matches, if supported. */
       won't fail because it was tested in the pre-pass. */  
4464    
4465  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4466        else if (-c == ESC_P || -c == ESC_p)        else if (-c == ESC_P || -c == ESC_p)
# Line 3340  for (;; ptr++) Line 4468  for (;; ptr++)
4468          BOOL negated;          BOOL negated;
4469          int pdata;          int pdata;
4470          int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);          int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4471            if (ptype < 0) goto FAILED;
4472          previous = code;          previous = code;
4473          *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;          *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4474          *code++ = ptype;          *code++ = ptype;
4475          *code++ = pdata;          *code++ = pdata;
4476          }          }
4477    #else
4478    
4479          /* If Unicode properties are not supported, \X, \P, and \p are not
4480          allowed. */
4481    
4482          else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4483            {
4484            *errorcodeptr = ERR45;
4485            goto FAILED;
4486            }
4487  #endif  #endif
4488    
4489        /* For the rest, we can obtain the OP value by negating the escape        /* For the rest (including \X when Unicode properties are supported), we
4490        value */        can obtain the OP value by negating the escape value. */
4491    
4492        else        else
4493          {          {
# Line 3372  for (;; ptr++) Line 4511  for (;; ptr++)
4511       mcbuffer[0] = c;       mcbuffer[0] = c;
4512       mclength = 1;       mclength = 1;
4513       }       }
   
4514      goto ONE_CHAR;      goto ONE_CHAR;
4515    
4516    
4517        /* ===================================================================*/
4518      /* Handle a literal character. It is guaranteed not to be whitespace or #      /* Handle a literal character. It is guaranteed not to be whitespace or #
4519      when the extended flag is set. If we are in UTF-8 mode, it may be a      when the extended flag is set. If we are in UTF-8 mode, it may be a
4520      multi-byte literal character. */      multi-byte literal character. */
# Line 3385  for (;; ptr++) Line 4525  for (;; ptr++)
4525      mcbuffer[0] = c;      mcbuffer[0] = c;
4526    
4527  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
4528      if (utf8 && (c & 0xc0) == 0xc0)      if (utf8 && c >= 0xc0)
4529        {        {
4530        while ((ptr[1] & 0xc0) == 0x80)        while ((ptr[1] & 0xc0) == 0x80)
4531          mcbuffer[mclength++] = *(++ptr);          mcbuffer[mclength++] = *(++ptr);
# Line 3436  for (;; ptr++) Line 4576  for (;; ptr++)
4576      }      }
4577    }                   /* end of big loop */    }                   /* end of big loop */
4578    
4579    
4580  /* Control never reaches here by falling through, only by a goto for all the  /* Control never reaches here by falling through, only by a goto for all the
4581  error states. Pass back the position in the pattern so that it can be displayed  error states. Pass back the position in the pattern so that it can be displayed
4582  to the user for diagnosing the error. */  to the user for diagnosing the error. */
# Line 3452  return FALSE; Line 4593  return FALSE;
4593  *     Compile sequence of alternatives           *  *     Compile sequence of alternatives           *
4594  *************************************************/  *************************************************/
4595    
4596  /* On entry, ptr is pointing past the bracket character, but on return  /* On entry, ptr is pointing past the bracket character, but on return it
4597  it points to the closing bracket, or vertical bar, or end of string.  points to the closing bracket, or vertical bar, or end of string. The code
4598  The code variable is pointing at the byte into which the BRA operator has been  variable is pointing at the byte into which the BRA operator has been stored.
4599  stored. If the ims options are changed at the start (for a (?ims: group) or  If the ims options are changed at the start (for a (?ims: group) or during any
4600  during any branch, we need to insert an OP_OPT item at the start of every  branch, we need to insert an OP_OPT item at the start of every following branch
4601  following branch to ensure they get set correctly at run time, and also pass  to ensure they get set correctly at run time, and also pass the new options
4602  the new options into every subsequent branch compile.  into every subsequent branch compile.
4603    
4604    This function is used during the pre-compile phase when we are trying to find
4605    out the amount of memory needed, as well as during the real compile phase. The
4606    value of lengthptr distinguishes the two phases.
4607    
4608  Arguments:  Arguments:
4609    options        option bits, including any changes for this subpattern    options        option bits, including any changes for this subpattern
4610    oldims         previous settings of ims option bits    oldims         previous settings of ims option bits
   brackets       -> int containing the number of extracting brackets used  
4611    codeptr        -> the address of the current code pointer    codeptr        -> the address of the current code pointer
4612    ptrptr         -> the address of the current pattern pointer    ptrptr         -> the address of the current pattern pointer
4613    errorcodeptr   -> pointer to error code variable    errorcodeptr   -> pointer to error code variable
4614    lookbehind     TRUE if this is a lookbehind assertion    lookbehind     TRUE if this is a lookbehind assertion
4615    skipbytes      skip this many bytes at start (for OP_COND, OP_BRANUMBER)    skipbytes      skip this many bytes at start (for brackets and OP_COND)
4616    firstbyteptr   place to put the first required character, or a negative number    firstbyteptr   place to put the first required character, or a negative number
4617    reqbyteptr     place to put the last required character, or a negative number    reqbyteptr     place to put the last required character, or a negative number
4618    bcptr          pointer to the chain of currently open branches    bcptr          pointer to the chain of currently open branches
4619    cd             points to the data block with tables pointers etc.    cd             points to the data block with tables pointers etc.
4620      lengthptr      NULL during the real compile phase
4621                     points to length accumulator during pre-compile phase
4622    
4623  Returns:      TRUE on success  Returns:         TRUE on success
4624  */  */
4625    
4626  static BOOL  static BOOL
4627  compile_regex(int options, int oldims, int *brackets, uschar **codeptr,  compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
4628    const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,    int *errorcodeptr, BOOL lookbehind, int skipbytes, int *firstbyteptr,
4629    int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)    int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr)
4630  {  {
4631  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
4632  uschar *code = *codeptr;  uschar *code = *codeptr;
# Line 3489  uschar *start_bracket = code; Line 4635  uschar *start_bracket = code;
4635  uschar *reverse_count = NULL;  uschar *reverse_count = NULL;
4636  int firstbyte, reqbyte;  int firstbyte, reqbyte;
4637  int branchfirstbyte, branchreqbyte;  int branchfirstbyte, branchreqbyte;
4638    int length;
4639  branch_chain bc;  branch_chain bc;
4640    
4641  bc.outer = bcptr;  bc.outer = bcptr;
# Line 3496  bc.current = code; Line 4643  bc.current = code;
4643    
4644  firstbyte = reqbyte = REQ_UNSET;  firstbyte = reqbyte = REQ_UNSET;
4645    
4646    /* Accumulate the length for use in the pre-compile phase. Start with the
4647    length of the BRA and KET and any extra bytes that are required at the
4648    beginning. We accumulate in a local variable to save frequent testing of
4649    lengthptr for NULL. We cannot do this by looking at the value of code at the
4650    start and end of each alternative, because compiled items are discarded during
4651    the pre-compile phase so that the work space is not exceeded. */
4652    
4653    length = 2 + 2*LINK_SIZE + skipbytes;
4654    
4655    /* WARNING: If the above line is changed for any reason, you must also change
4656    the code that abstracts option settings at the start of the pattern and makes
4657    them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
4658    pre-compile phase to find out whether anything has yet been compiled or not. */
4659    
4660  /* Offset is set zero to mark that this bracket is still open */  /* Offset is set zero to mark that this bracket is still open */
4661    
4662  PUT(code, 1, 0);  PUT(code, 1, 0);
# Line 3511  for (;;) Line 4672  for (;;)
4672      {      {
4673      *code++ = OP_OPT;      *code++ = OP_OPT;
4674      *code++ = options & PCRE_IMS;      *code++ = options & PCRE_IMS;
4675        length += 2;
4676      }      }
4677    
4678    /* Set up dummy OP_REVERSE if lookbehind assertion */    /* Set up dummy OP_REVERSE if lookbehind assertion */
# Line 3520  for (;;) Line 4682  for (;;)
4682      *code++ = OP_REVERSE;      *code++ = OP_REVERSE;
4683      reverse_count = code;      reverse_count = code;
4684      PUTINC(code, 0, 0);      PUTINC(code, 0, 0);
4685        length += 1 + LINK_SIZE;
4686      }      }
4687    
4688    /* Now compile the branch */    /* Now compile the branch; in the pre-compile phase its length gets added
4689      into the length. */
4690    
4691    if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr,    if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
4692          &branchfirstbyte, &branchreqbyte, &bc, cd))          &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
4693      {      {
4694      *ptrptr = ptr;      *ptrptr = ptr;
4695      return FALSE;      return FALSE;
4696      }      }
4697    
4698    /* If this is the first branch, the firstbyte and reqbyte values for the    /* In the real compile phase, there is some post-processing to be done. */
   branch become the values for the regex. */  
4699    
4700    if (*last_branch != OP_ALT)    if (lengthptr == NULL)
4701      {      {
4702      firstbyte = branchfirstbyte;      /* If this is the first branch, the firstbyte and reqbyte values for the
4703      reqbyte = branchreqbyte;      branch become the values for the regex. */
     }  
4704    
4705    /* If this is not the first branch, the first char and reqbyte have to      if (*last_branch != OP_ALT)
4706    match the values from all the previous branches, except that if the previous        {
4707    value for reqbyte didn't have REQ_VARY set, it can still match, and we set        firstbyte = branchfirstbyte;
4708    REQ_VARY for the regex. */        reqbyte = branchreqbyte;
4709          }
4710    
4711    else      /* If this is not the first branch, the first char and reqbyte have to
4712      {      match the values from all the previous branches, except that if the
4713      /* If we previously had a firstbyte, but it doesn't match the new branch,      previous value for reqbyte didn't have REQ_VARY set, it can still match,
4714      we have to abandon the firstbyte for the regex, but if there was previously      and we set REQ_VARY for the regex. */
     no reqbyte, it takes on the value of the old firstbyte. */  
4715    
4716      if (firstbyte >= 0 && firstbyte != branchfirstbyte)      else
4717        {        {
4718        if (reqbyte < 0) reqbyte = firstbyte;        /* If we previously had a firstbyte, but it doesn't match the new branch,
4719        firstbyte = REQ_NONE;        we have to abandon the firstbyte for the regex, but if there was
4720        }        previously no reqbyte, it takes on the value of the old firstbyte. */
4721    
4722          if (firstbyte >= 0 && firstbyte != branchfirstbyte)
4723            {
4724            if (reqbyte < 0) reqbyte = firstbyte;
4725            firstbyte = REQ_NONE;
4726            }
4727    
4728      /* If we (now or from before) have no firstbyte, a firstbyte from the        /* If we (now or from before) have no firstbyte, a firstbyte from the
4729      branch becomes a reqbyte if there isn't a branch reqbyte. */        branch becomes a reqbyte if there isn't a branch reqbyte. */
4730    
4731      if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)        if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
4732          branchreqbyte = branchfirstbyte;            branchreqbyte = branchfirstbyte;
4733    
4734      /* Now ensure that the reqbytes match */        /* Now ensure that the reqbytes match */
4735    
4736      if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))        if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
4737        reqbyte = REQ_NONE;          reqbyte = REQ_NONE;
4738      else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */        else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
4739      }        }
4740    
4741    /* If lookbehind, check that this branch matches a fixed-length string,      /* If lookbehind, check that this branch matches a fixed-length string, and
4742    and put the length into the OP_REVERSE item. Temporarily mark the end of      put the length into the OP_REVERSE item. Temporarily mark the end of the
4743    the branch with OP_END. */      branch with OP_END. */
4744    
4745    if (lookbehind)      if (lookbehind)
     {  
     int length;  
     *code = OP_END;  
     length = find_fixedlength(last_branch, options);  
     DPRINTF(("fixed length = %d\n", length));  
     if (length < 0)  
4746        {        {
4747        *errorcodeptr = (length == -2)? ERR36 : ERR25;        int fixed_length;
4748        *ptrptr = ptr;        *code = OP_END;
4749        return FALSE;        fixed_length = find_fixedlength(last_branch, options);
4750          DPRINTF(("fixed length = %d\n", fixed_length));
4751          if (fixed_length < 0)
4752            {
4753            *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
4754            *ptrptr = ptr;
4755            return FALSE;
4756            }
4757          PUT(reverse_count, 0, fixed_length);
4758        }        }
     PUT(reverse_count, 0, length);  
4759      }      }
4760    
4761    /* Reached end of expression, either ')' or end of pattern. Go back through    /* Reached end of expression, either ')' or end of pattern. Go back through
# Line 3600  for (;;) Line 4769  for (;;)
4769    
4770    if (*ptr != '|')    if (*ptr != '|')
4771      {      {
4772      int length = code - last_branch;      int branch_length = code - last_branch;
4773      do      do
4774        {        {
4775        int prev_length = GET(last_branch, 1);        int prev_length = GET(last_branch, 1);
4776        PUT(last_branch, 1, length);        PUT(last_branch, 1, branch_length);
4777        length = prev_length;        branch_length = prev_length;
4778        last_branch -= length;        last_branch -= branch_length;
4779        }        }
4780      while (length > 0);      while (branch_length > 0);
4781    
4782      /* Fill in the ket */      /* Fill in the ket */
4783    
# Line 3622  for (;;) Line 4791  for (;;)
4791        {        {
4792        *code++ = OP_OPT;        *code++ = OP_OPT;
4793        *code++ = oldims;        *code++ = oldims;
4794          length += 2;
4795        }        }
4796    
4797      /* Set values to pass back */      /* Set values to pass back */
# Line 3630  for (;;) Line 4800  for (;;)
4800      *ptrptr = ptr;      *ptrptr = ptr;
4801      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
4802      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
4803        if (lengthptr != NULL) *lengthptr += length;
4804      return TRUE;      return TRUE;
4805      }      }
4806    
# Line 3643  for (;;) Line 4814  for (;;)
4814    bc.current = last_branch = code;    bc.current = last_branch = code;
4815    code += 1 + LINK_SIZE;    code += 1 + LINK_SIZE;
4816    ptr++;    ptr++;
4817      length += 1 + LINK_SIZE;
4818    }    }
4819  /* Control never reaches here */  /* Control never reaches here */
4820  }  }
# Line 3693  is_anchored(register const uschar *code, Line 4865  is_anchored(register const uschar *code,
4865    unsigned int backref_map)    unsigned int backref_map)
4866  {  {
4867  do {  do {
4868     const uschar *scode =     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
4869       first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);       options, PCRE_MULTILINE, FALSE);
4870     register int op = *scode;     register int op = *scode;
4871    
4872       /* Non-capturing brackets */
4873    
4874       if (op == OP_BRA)
4875         {
4876         if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4877         }
4878    
4879     /* Capturing brackets */     /* Capturing brackets */
4880    
4881     if (op > OP_BRA)     else if (op == OP_CBRA)
4882       {       {
4883       int new_map;       int n = GET2(scode, 1+LINK_SIZE);
4884       op -= OP_BRA;       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
      if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);  
      new_map = bracket_map | ((op < 32)? (1 << op) : 1);  
4885       if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;       if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
4886       }       }
4887    
4888     /* Other brackets */     /* Other brackets */
4889    
4890     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4891       {       {
4892       if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;       if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4893       }       }
# Line 3718  do { Line 4895  do {
4895     /* .* is not anchored unless DOTALL is set and it isn't in brackets that     /* .* is not anchored unless DOTALL is set and it isn't in brackets that
4896     are or may be referenced. */     are or may be referenced. */
4897    
4898     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
4899                 op == OP_TYPEPOSSTAR) &&
4900              (*options & PCRE_DOTALL) != 0)              (*options & PCRE_DOTALL) != 0)
4901       {       {
4902       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
# Line 3763  is_startline(const uschar *code, unsigne Line 4941  is_startline(const uschar *code, unsigne
4941    unsigned int backref_map)    unsigned int backref_map)
4942  {  {
4943  do {  do {
4944     const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
4945       FALSE);       NULL, 0, FALSE);
4946     register int op = *scode;     register int op = *scode;
4947    
4948       /* Non-capturing brackets */
4949    
4950       if (op == OP_BRA)
4951         {
4952         if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
4953         }
4954    
4955     /* Capturing brackets */     /* Capturing brackets */
4956    
4957     if (op > OP_BRA)     else if (op == OP_CBRA)
4958       {       {
4959       int new_map;       int n = GET2(scode, 1+LINK_SIZE);
4960       op -= OP_BRA;       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
      if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);  
      new_map = bracket_map | ((op < 32)? (1 << op) : 1);  
4961       if (!is_startline(scode, new_map, backref_map)) return FALSE;       if (!is_startline(scode, new_map, backref_map)) return FALSE;
4962       }       }
4963    
4964     /* Other brackets */     /* Other brackets */
4965    
4966     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4967       { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }       { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
4968    
4969     /* .* means "start at start or after \n" if it isn't in brackets that     /* .* means "start at start or after \n" if it isn't in brackets that
4970     may be referenced. */     may be referenced. */
4971    
4972     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
4973       {       {
4974       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4975       }       }
# Line 3835  do { Line 5018  do {
5018       first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);       first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5019     register int op = *scode;     register int op = *scode;
5020    
    if (op >= OP_BRA) op = OP_BRA;  
   
5021     switch(op)     switch(op)
5022       {       {
5023       default:       default:
5024       return -1;       return -1;
5025    
5026       case OP_BRA:       case OP_BRA:
5027         case OP_CBRA:
5028       case OP_ASSERT:       case OP_ASSERT:
5029       case OP_ONCE:       case OP_ONCE:
5030       case OP_COND:       case OP_COND:
# Line 3858  do { Line 5040  do {
5040       case OP_CHARNC:       case OP_CHARNC:
5041       case OP_PLUS:       case OP_PLUS:
5042       case OP_MINPLUS:       case OP_MINPLUS:
5043         case OP_POSPLUS:
5044       if (!inassert) return -1;       if (!inassert) return -1;
5045       if (c < 0)       if (c < 0)
5046         {         {
# Line 3898  Returns:        pointer to compiled data Line 5081  Returns:        pointer to compiled data
5081                  with errorptr and erroroffset set                  with errorptr and erroroffset set
5082  */  */
5083    
5084  PCRE_DATA_SCOPE pcre *  PCRE_EXP_DEFN pcre *
5085  pcre_compile(const char *pattern, int options, const char **errorptr,  pcre_compile(const char *pattern, int options, const char **errorptr,
5086    int *erroroffset, const unsigned char *tables)    int *erroroffset, const unsigned char *tables)
5087  {  {
# Line 3906  return pcre_compile2(pattern, options, N Line 5089  return pcre_compile2(pattern, options, N
5089  }  }
5090    
5091    
5092  PCRE_DATA_SCOPE pcre *  PCRE_EXP_DEFN pcre *
5093  pcre_compile2(const char *pattern, int options, int *errorcodeptr,  pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5094    const char **errorptr, int *erroroffset, const unsigned char *tables)    const char **errorptr, int *erroroffset, const unsigned char *tables)
5095  {  {
5096  real_pcre *re;  real_pcre *re;
5097  int length = 1 + LINK_SIZE;      /* For initial BRA plus length */  int length = 1;  /* For final END opcode */
5098  int c, firstbyte, reqbyte;  int firstbyte, reqbyte, newline;
 int bracount = 0;  
 int branch_extra = 0;  
 int branch_newextra;  
 int item_count = -1;  
 int name_count = 0;  
 int max_name_size = 0;  
 int lastitemlength = 0;  
5099  int errorcode = 0;  int errorcode = 0;
5100  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
5101  BOOL utf8;  BOOL utf8;
 BOOL class_utf8;  
5102  #endif  #endif
 BOOL inescq = FALSE;  
 BOOL capturing;  
 unsigned int brastackptr = 0;  
5103  size_t size;  size_t size;
5104  uschar *code;  uschar *code;
5105  const uschar *codestart;  const uschar *codestart;
5106  const uschar *ptr;  const uschar *ptr;
5107  compile_data compile_block;  compile_data compile_block;
5108  int brastack[BRASTACK_SIZE];  compile_data *cd = &compile_block;
5109  uschar bralenstack[BRASTACK_SIZE];  
5110    /* This space is used for "compiling" into during the first phase, when we are
5111    computing the amount of memory that is needed. Compiled items are thrown away
5112    as soon as possible, so that a fairly large buffer should be sufficient for
5113    this purpose. The same space is used in the second phase for remembering where
5114    to fill in forward references to subpatterns. */
5115    
5116    uschar cworkspace[COMPILE_WORK_SIZE];
5117    
5118    
5119    /* Set this early so that early errors get offset 0. */
5120    
5121    ptr = (const uschar *)pattern;
5122    
5123  /* We can't pass back an error message if errorptr is NULL; I guess the best we  /* We can't pass back an error message if errorptr is NULL; I guess the best we
5124  can do is just return NULL, but we can set a code value if there is a code  can do is just return NULL, but we can set a code value if there is a code
# Line 3954  if (errorcodeptr != NULL) *errorcodeptr Line 5138  if (errorcodeptr != NULL) *errorcodeptr
5138  if (erroroffset == NULL)  if (erroroffset == NULL)
5139    {    {
5140    errorcode = ERR16;    errorcode = ERR16;
5141    goto PCRE_EARLY_ERROR_RETURN;    goto PCRE_EARLY_ERROR_RETURN2;
5142    }    }
5143    
5144  *erroroffset = 0;  *erroroffset = 0;
# Line 3967  if (utf8 && (options & PCRE_NO_UTF8_CHEC Line 5151  if (utf8 && (options & PCRE_NO_UTF8_CHEC
5151       (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)       (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5152    {    {
5153    errorcode = ERR44;    errorcode = ERR44;
5154    goto PCRE_EARLY_ERROR_RETURN;    goto PCRE_EARLY_ERROR_RETURN2;
5155    }    }
5156  #else  #else
5157  if ((options & PCRE_UTF8) != 0)  if ((options & PCRE_UTF8) != 0)
# Line 3986  if ((options & ~PUBLIC_OPTIONS) != 0) Line 5170  if ((options & ~PUBLIC_OPTIONS) != 0)
5170  /* Set up pointers to the individual character tables */  /* Set up pointers to the individual character tables */
5171    
5172  if (tables == NULL) tables = _pcre_default_tables;  if (tables == NULL) tables = _pcre_default_tables;
5173  compile_block.lcc = tables + lcc_offset;  cd->lcc = tables + lcc_offset;
5174  compile_block.fcc = tables + fcc_offset;  cd->fcc = tables + fcc_offset;
5175  compile_block.cbits = tables + cbits_offset;  cd->cbits = tables + cbits_offset;
5176  compile_block.ctypes = tables + ctypes_offset;  cd->ctypes = tables + ctypes_offset;
5177    
5178  /* Maximum back reference and backref bitmap. This is updated for numeric  /* Handle different types of newline. The three bits give seven cases. The
5179  references during the first pass, but for named references during the actual  current code allows for fixed one- or two-byte sequences, plus "any" and
5180  compile pass. The bitmap records up to 31 back references to help in deciding  "anycrlf". */
 whether (.*) can be treated as anchored or not. */  
   
 compile_block.top_backref = 0;  
 compile_block.backref_map = 0;  
5181    
5182  /* Reflect pattern for debugging output */  switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5183