/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 149 by ph10, Mon Apr 16 15:28:08 2007 UTC | revision 1387 by ph10, Sat Nov 2 18:29:05 2013 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2007 University of Cambridge             Copyright (c) 1997-2013 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include "config.h"
47    #endif
48    
49  #define NLBLOCK cd             /* Block containing newline information */  #define NLBLOCK cd             /* Block containing newline information */
50  #define PSSTART start_pattern  /* Field containing processed string start */  #define PSSTART start_pattern  /* Field containing processed string start */
51  #define PSEND   end_pattern    /* Field containing processed string end */  #define PSEND   end_pattern    /* Field containing processed string end */
52    
   
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
56  /* When DEBUG is defined, we need the pcre_printint() function, which is also  /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
57  used by pcretest. DEBUG is not defined when building a production library. */  is also used by pcretest. PCRE_DEBUG is not defined when building a production
58    library. We do not need to select pcre16_printint.c specially, because the
59    COMPILE_PCREx macro will already be appropriately set. */
60    
61  #ifdef DEBUG  #ifdef PCRE_DEBUG
62  #include "pcre_printint.src"  /* pcre_printint.c should not include any headers */
63    #define PCRE_INCLUDED
64    #include "pcre_printint.c"
65    #undef PCRE_INCLUDED
66  #endif  #endif
67    
68    
69    /* Macro for setting individual bits in class bitmaps. */
70    
71    #define SETBIT(a,b) a[(b)/8] |= (1 << ((b)&7))
72    
73    /* Maximum length value to check against when making sure that the integer that
74    holds the compiled pattern length does not overflow. We make it a bit less than
75    INT_MAX to allow for adding in group terminating bytes, so that we don't have
76    to check them every time. */
77    
78    #define OFLOW_MAX (INT_MAX - 20)
79    
80    /* Definitions to allow mutual recursion */
81    
82    static int
83      add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
84        const pcre_uint32 *, unsigned int);
85    
86    static BOOL
87      compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
88        pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
89        compile_data *, int *);
90    
91    
92    
93  /*************************************************  /*************************************************
94  *      Code parameters and static tables         *  *      Code parameters and static tables         *
95  *************************************************/  *************************************************/
# Line 72  so this number is very generous. Line 104  so this number is very generous.
104  The same workspace is used during the second, actual compile phase for  The same workspace is used during the second, actual compile phase for
105  remembering forward references to groups so that they can be filled in at the  remembering forward references to groups so that they can be filled in at the
106  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
107  is 4 there is plenty of room. */  is 4 there is plenty of room for most patterns. However, the memory can get
108    filled up by repetitions of forward references, for example patterns like
109    /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
110    that the workspace is expanded using malloc() in this situation. The value
111    below is therefore a minimum, and we put a maximum on it for safety. The
112    minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
113    kicks in at the same number of forward references in all cases. */
114    
115    #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
116    #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
117    
118    /* This value determines the size of the initial vector that is used for
119    remembering named groups during the pre-compile. It is allocated on the stack,
120    but if it is too small, it is expanded using malloc(), in a similar way to the
121    workspace. The value is the number of slots in the list. */
122    
123  #define COMPILE_WORK_SIZE (4096)  #define NAMED_GROUP_LIST_SIZE  20
124    
125    /* The overrun tests check for a slightly smaller size so that they detect the
126    overrun before it actually does run off the end of the data block. */
127    
128    #define WORK_SIZE_SAFETY_MARGIN (100)
129    
130    /* Private flags added to firstchar and reqchar. */
131    
132    #define REQ_CASELESS    (1 << 0)        /* Indicates caselessness */
133    #define REQ_VARY        (1 << 1)        /* Reqchar followed non-literal item */
134    /* Negative values for the firstchar and reqchar flags */
135    #define REQ_UNSET       (-2)
136    #define REQ_NONE        (-1)
137    
138    /* Repeated character flags. */
139    
140    #define UTF_LENGTH     0x10000000l      /* The char contains its length. */
141    
142  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
143  are simple data values; negative values are for special things like \d and so  are simple data values; negative values are for special things like \d and so
144  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
145  is invalid. */  is invalid. */
146    
147  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC
148    
149    /* This is the "normal" table for ASCII systems or for EBCDIC systems running
150    in UTF-8 mode. */
151    
152  static const short int escapes[] = {  static const short int escapes[] = {
153       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,                       0,
154       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,                       0,
155     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */       0,                       0,
156       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */       0,                       0,
157  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */       0,                       0,
158  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */       CHAR_COLON,              CHAR_SEMICOLON,
159     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */       CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
160       0,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */       CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
161  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */       CHAR_COMMERCIAL_AT,      -ESC_A,
162       0,      0, -ESC_z                                            /* x - z */       -ESC_B,                  -ESC_C,
163         -ESC_D,                  -ESC_E,
164         0,                       -ESC_G,
165         -ESC_H,                  0,
166         0,                       -ESC_K,
167         0,                       0,
168         -ESC_N,                  0,
169         -ESC_P,                  -ESC_Q,
170         -ESC_R,                  -ESC_S,
171         0,                       0,
172         -ESC_V,                  -ESC_W,
173         -ESC_X,                  0,
174         -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
175         CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
176         CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
177         CHAR_GRAVE_ACCENT,       7,
178         -ESC_b,                  0,
179         -ESC_d,                  ESC_e,
180         ESC_f,                   0,
181         -ESC_h,                  0,
182         0,                       -ESC_k,
183         0,                       0,
184         ESC_n,                   0,
185         -ESC_p,                  0,
186         ESC_r,                   -ESC_s,
187         ESC_tee,                 0,
188         -ESC_v,                  -ESC_w,
189         0,                       0,
190         -ESC_z
191  };  };
192    
193  #else           /* This is the "abnormal" table for EBCDIC systems */  #else
194    
195    /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
196    
197  static const short int escapes[] = {  static const short int escapes[] = {
198  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
199  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 106  static const short int escapes[] = { Line 203  static const short int escapes[] = {
203  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
204  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
205  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
206  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
207  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
208  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
209  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
210  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
211  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
212  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
213  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
214  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
215  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
216  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
217  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
218  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
219  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
220  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
# Line 125  static const short int escapes[] = { Line 222  static const short int escapes[] = {
222  #endif  #endif
223    
224    
225  /* Tables of names of POSIX character classes and their lengths. The list is  /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
226  terminated by a zero length entry. The first three must be alpha, lower, upper,  searched linearly. Put all the names into a single string, in order to reduce
227  as this is assumed for handling case independence. */  the number of relocations when a shared library is dynamically linked. The
228    string is built from string macros so that it works in UTF-8 mode on EBCDIC
229  static const char *const posix_names[] = {  platforms. */
230    "alpha", "lower", "upper",  
231    "alnum", "ascii", "blank", "cntrl", "digit", "graph",  typedef struct verbitem {
232    "print", "punct", "space", "word",  "xdigit" };    int   len;                 /* Length of verb name */
233      int   op;                  /* Op when no arg, or -1 if arg mandatory */
234      int   op_arg;              /* Op when arg present, or -1 if not allowed */
235    } verbitem;
236    
237    static const char verbnames[] =
238      "\0"                       /* Empty name is a shorthand for MARK */
239      STRING_MARK0
240      STRING_ACCEPT0
241      STRING_COMMIT0
242      STRING_F0
243      STRING_FAIL0
244      STRING_PRUNE0
245      STRING_SKIP0
246      STRING_THEN;
247    
248    static const verbitem verbs[] = {
249      { 0, -1,        OP_MARK },
250      { 4, -1,        OP_MARK },
251      { 6, OP_ACCEPT, -1 },
252      { 6, OP_COMMIT, -1 },
253      { 1, OP_FAIL,   -1 },
254      { 4, OP_FAIL,   -1 },
255      { 5, OP_PRUNE,  OP_PRUNE_ARG },
256      { 4, OP_SKIP,   OP_SKIP_ARG  },
257      { 4, OP_THEN,   OP_THEN_ARG  }
258    };
259    
260    static const int verbcount = sizeof(verbs)/sizeof(verbitem);
261    
262    
263    /* Tables of names of POSIX character classes and their lengths. The names are
264    now all in a single string, to reduce the number of relocations when a shared
265    library is dynamically loaded. The list of lengths is terminated by a zero
266    length entry. The first three must be alpha, lower, upper, as this is assumed
267    for handling case independence. The indices for graph, print, and punct are
268    needed, so identify them. */
269    
270    static const char posix_names[] =
271      STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
272      STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
273      STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
274      STRING_word0  STRING_xdigit;
275    
276  static const uschar posix_name_lengths[] = {  static const pcre_uint8 posix_name_lengths[] = {
277    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
278    
279    #define PC_GRAPH  8
280    #define PC_PRINT  9
281    #define PC_PUNCT 10
282    
283    
284  /* Table of class bit maps for each POSIX class. Each class is formed from a  /* Table of class bit maps for each POSIX class. Each class is formed from a
285  base map, with an optional addition or removal of another map. Then, for some  base map, with an optional addition or removal of another map. Then, for some
286  classes, there is some additional tweaking: for [:blank:] the vertical space  classes, there is some additional tweaking: for [:blank:] the vertical space
# Line 164  static const int posix_class_maps[] = { Line 308  static const int posix_class_maps[] = {
308    cbit_xdigit,-1,          0              /* xdigit */    cbit_xdigit,-1,          0              /* xdigit */
309  };  };
310    
311    /* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
312    Unicode property escapes. */
313    
314    #ifdef SUPPORT_UCP
315    static const pcre_uchar string_PNd[]  = {
316      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
317      CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
318    static const pcre_uchar string_pNd[]  = {
319      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
320      CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
321    static const pcre_uchar string_PXsp[] = {
322      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
323      CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
324    static const pcre_uchar string_pXsp[] = {
325      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
326      CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
327    static const pcre_uchar string_PXwd[] = {
328      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
329      CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
330    static const pcre_uchar string_pXwd[] = {
331      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
332      CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
333    
334    static const pcre_uchar *substitutes[] = {
335      string_PNd,           /* \D */
336      string_pNd,           /* \d */
337      string_PXsp,          /* \S */   /* Xsp is Perl space, but from 8.34, Perl */
338      string_pXsp,          /* \s */   /* space and POSIX space are the same. */
339      string_PXwd,          /* \W */
340      string_pXwd           /* \w */
341    };
342    
343    /* The POSIX class substitutes must be in the order of the POSIX class names,
344    defined above, and there are both positive and negative cases. NULL means no
345    general substitute of a Unicode property escape (\p or \P). However, for some
346    POSIX classes (e.g. graph, print, punct) a special property code is compiled
347    directly. */
348    
349    static const pcre_uchar string_pL[] =   {
350      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
351      CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
352    static const pcre_uchar string_pLl[] =  {
353      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
354      CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
355    static const pcre_uchar string_pLu[] =  {
356      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
357      CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
358    static const pcre_uchar string_pXan[] = {
359      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
360      CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
361    static const pcre_uchar string_h[] =    {
362      CHAR_BACKSLASH, CHAR_h, '\0' };
363    static const pcre_uchar string_pXps[] = {
364      CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
365      CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
366    static const pcre_uchar string_PL[] =   {
367      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
368      CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
369    static const pcre_uchar string_PLl[] =  {
370      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
371      CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
372    static const pcre_uchar string_PLu[] =  {
373      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
374      CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
375    static const pcre_uchar string_PXan[] = {
376      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
377      CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
378    static const pcre_uchar string_H[] =    {
379      CHAR_BACKSLASH, CHAR_H, '\0' };
380    static const pcre_uchar string_PXps[] = {
381      CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
382      CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
383    
384    static const pcre_uchar *posix_substitutes[] = {
385      string_pL,            /* alpha */
386      string_pLl,           /* lower */
387      string_pLu,           /* upper */
388      string_pXan,          /* alnum */
389      NULL,                 /* ascii */
390      string_h,             /* blank */
391      NULL,                 /* cntrl */
392      string_pNd,           /* digit */
393      NULL,                 /* graph */
394      NULL,                 /* print */
395      NULL,                 /* punct */
396      string_pXps,          /* space */   /* Xps is POSIX space, but from 8.34 */
397      string_pXwd,          /* word  */   /* Perl and POSIX space are the same */
398      NULL,                 /* xdigit */
399      /* Negated cases */
400      string_PL,            /* ^alpha */
401      string_PLl,           /* ^lower */
402      string_PLu,           /* ^upper */
403      string_PXan,          /* ^alnum */
404      NULL,                 /* ^ascii */
405      string_H,             /* ^blank */
406      NULL,                 /* ^cntrl */
407      string_PNd,           /* ^digit */
408      NULL,                 /* ^graph */
409      NULL,                 /* ^print */
410      NULL,                 /* ^punct */
411      string_PXps,          /* ^space */  /* Xps is POSIX space, but from 8.34 */
412      string_PXwd,          /* ^word */   /* Perl and POSIX space are the same */
413      NULL                  /* ^xdigit */
414    };
415    #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
416    #endif
417    
418  #define STRING(a)  # a  #define STRING(a)  # a
419  #define XSTRING(s) STRING(s)  #define XSTRING(s) STRING(s)
# Line 171  static const int posix_class_maps[] = { Line 421  static const int posix_class_maps[] = {
421  /* The texts of compile-time error messages. These are "char *" because they  /* The texts of compile-time error messages. These are "char *" because they
422  are passed to the outside world. Do not ever re-use any error number, because  are passed to the outside world. Do not ever re-use any error number, because
423  they are documented. Always add a new error instead. Messages marked DEAD below  they are documented. Always add a new error instead. Messages marked DEAD below
424  are no longer used. */  are no longer used. This used to be a table of strings, but in order to reduce
425    the number of relocations needed when a shared library is loaded dynamically,
426  static const char *error_texts[] = {  it is now one long string. We cannot use a table of offsets, because the
427    "no error",  lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
428    "\\ at end of pattern",  simply count through to the one we want - this isn't a performance issue
429    "\\c at end of pattern",  because these strings are used only when there is a compilation error.
430    "unrecognized character follows \\",  
431    "numbers out of order in {} quantifier",  Each substring ends with \0 to insert a null character. This includes the final
432    substring, so that the whole string ends with \0\0, which can be detected when
433    counting through. */
434    
435    static const char error_texts[] =
436      "no error\0"
437      "\\ at end of pattern\0"
438      "\\c at end of pattern\0"
439      "unrecognized character follows \\\0"
440      "numbers out of order in {} quantifier\0"
441    /* 5 */    /* 5 */
442    "number too big in {} quantifier",    "number too big in {} quantifier\0"
443    "missing terminating ] for character class",    "missing terminating ] for character class\0"
444    "invalid escape sequence in character class",    "invalid escape sequence in character class\0"
445    "range out of order in character class",    "range out of order in character class\0"
446    "nothing to repeat",    "nothing to repeat\0"
447    /* 10 */    /* 10 */
448    "operand of unlimited repeat could match the empty string",  /** DEAD **/    "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
449    "internal error: unexpected repeat",    "internal error: unexpected repeat\0"
450    "unrecognized character after (?",    "unrecognized character after (? or (?-\0"
451    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class\0"
452    "missing )",    "missing )\0"
453    /* 15 */    /* 15 */
454    "reference to non-existent subpattern",    "reference to non-existent subpattern\0"
455    "erroffset passed as NULL",    "erroffset passed as NULL\0"
456    "unknown option bit(s) set",    "unknown option bit(s) set\0"
457    "missing ) after comment",    "missing ) after comment\0"
458    "parentheses nested too deeply",  /** DEAD **/    "parentheses nested too deeply\0"  /** DEAD **/
459    /* 20 */    /* 20 */
460    "regular expression too large",    "regular expression is too large\0"
461    "failed to get memory",    "failed to get memory\0"
462    "unmatched parentheses",    "unmatched parentheses\0"
463    "internal error: code overflow",    "internal error: code overflow\0"
464    "unrecognized character after (?<",    "unrecognized character after (?<\0"
465    /* 25 */    /* 25 */
466    "lookbehind assertion is not fixed length",    "lookbehind assertion is not fixed length\0"
467    "malformed number or name after (?(",    "malformed number or name after (?(\0"
468    "conditional group contains more than two branches",    "conditional group contains more than two branches\0"
469    "assertion expected after (?(",    "assertion expected after (?(\0"
470    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )\0"
471    /* 30 */    /* 30 */
472    "unknown POSIX class name",    "unknown POSIX class name\0"
473    "POSIX collating elements are not supported",    "POSIX collating elements are not supported\0"
474    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is compiled without UTF support\0"
475    "spare error",  /** DEAD **/    "spare error\0"  /** DEAD **/
476    "character value in \\x{...} sequence is too large",    "character value in \\x{} or \\o{} is too large\0"
477    /* 35 */    /* 35 */
478    "invalid condition (?(0)",    "invalid condition (?(0)\0"
479    "\\C not allowed in lookbehind assertion",    "\\C not allowed in lookbehind assertion\0"
480    "PCRE does not support \\L, \\l, \\N, \\U, or \\u",    "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
481    "number after (?C is > 255",    "number after (?C is > 255\0"
482    "closing ) for (?C expected",    "closing ) for (?C expected\0"
483    /* 40 */    /* 40 */
484    "recursive call could loop indefinitely",    "recursive call could loop indefinitely\0"
485    "unrecognized character after (?P",    "unrecognized character after (?P\0"
486    "syntax error in subpattern name (missing terminator)",    "syntax error in subpattern name (missing terminator)\0"
487    "two named subpatterns have the same name",    "two named subpatterns have the same name\0"
488    "invalid UTF-8 string",    "invalid UTF-8 string\0"
489    /* 45 */    /* 45 */
490    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled\0"
491    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence\0"
492    "unknown property name after \\P or \\p",    "unknown property name after \\P or \\p\0"
493    "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",    "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
494    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
495    /* 50 */    /* 50 */
496    "repeated subpattern is too long",    "repeated subpattern is too long\0"    /** DEAD **/
497    "octal value is greater than \\377 (not in UTF-8 mode)",    "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
498    "internal error: overran compiling workspace",    "internal error: overran compiling workspace\0"
499    "internal error: previously-checked referenced subpattern not found",    "internal error: previously-checked referenced subpattern not found\0"
500    "DEFINE group contains more than one branch",    "DEFINE group contains more than one branch\0"
501    /* 55 */    /* 55 */
502    "repeating a DEFINE group is not allowed",    "repeating a DEFINE group is not allowed\0"  /** DEAD **/
503    "inconsistent NEWLINE options",    "inconsistent NEWLINE options\0"
504    "\\g is not followed by an (optionally braced) non-zero number"    "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
505  };    "a numbered reference must not be zero\0"
506      "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
507      /* 60 */
508      "(*VERB) not recognized or malformed\0"
509      "number is too big\0"
510      "subpattern name expected\0"
511      "digit expected after (?+\0"
512      "] is an invalid data character in JavaScript compatibility mode\0"
513      /* 65 */
514      "different names for subpatterns of the same number are not allowed\0"
515      "(*MARK) must have an argument\0"
516      "this version of PCRE is not compiled with Unicode property support\0"
517      "\\c must be followed by an ASCII character\0"
518      "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
519      /* 70 */
520      "internal error: unknown opcode in find_fixedlength()\0"
521      "\\N is not supported in a class\0"
522      "too many forward references\0"
523      "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
524      "invalid UTF-16 string\0"
525      /* 75 */
526      "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
527      "character value in \\u.... sequence is too large\0"
528      "invalid UTF-32 string\0"
529      "setting UTF is disabled by the application\0"
530      "non-hex character in \\x{} (closing brace missing?)\0"
531      /* 80 */
532      "non-octal character in \\o{} (closing brace missing?)\0"
533      "missing opening brace after \\o\0"
534      ;
535    
536  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
537  patterns. Note that the tables in chartables are dependent on the locale, and  patterns. Note that the tables in chartables are dependent on the locale, and
# Line 262  For convenience, we use the same bit def Line 549  For convenience, we use the same bit def
549    
550  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
551    
552  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */  /* Using a simple comparison for decimal numbers rather than a memory read
553  static const unsigned char digitab[] =  is much faster, and the resulting code is simpler (the compiler turns it
554    into a subtraction and unsigned comparison). */
555    
556    #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
557    
558    #ifndef EBCDIC
559    
560    /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
561    UTF-8 mode. */
562    
563    static const pcre_uint8 digitab[] =
564    {    {
565    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
566    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
# Line 298  static const unsigned char digitab[] = Line 595  static const unsigned char digitab[] =
595    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
596    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
597    
598  #else           /* This is the "abnormal" case, for EBCDIC systems */  #else
599  static const unsigned char digitab[] =  
600    /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
601    
602    static const pcre_uint8 digitab[] =
603    {    {
604    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
605    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
# Line 334  static const unsigned char digitab[] = Line 634  static const unsigned char digitab[] =
634    0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */    0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
635    0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */    0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
636    
637  static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */  static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
638    0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */    0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
639    0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */    0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
640    0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */    0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
# Line 370  static const unsigned char ebcdic_charta Line 670  static const unsigned char ebcdic_charta
670  #endif  #endif
671    
672    
673  /* Definition to allow mutual recursion */  /* This table is used to check whether auto-possessification is possible
674    between adjacent character-type opcodes. The left-hand (repeated) opcode is
675    used to select the row, and the right-hand opcode is used to select the column.
676    A value of 1 means that auto-possessification is OK. For example, the second
677    value in the first row means that \D+\d can be turned into \D++\d.
678    
679    The Unicode property types (\P and \p) have to be present to fill out the table
680    because of what their opcode values are, but the table values should always be
681    zero because property types are handled separately in the code. The last four
682    columns apply to items that cannot be repeated, so there is no need to have
683    rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
684    *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
685    
686    #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)   /* rows: FIRST_AUTOTAB_OP..LAST_AUTOTAB_LEFT_OP inclusive */
687    #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)  /* columns: FIRST_AUTOTAB_OP..LAST_AUTOTAB_RIGHT_OP inclusive */
688    /* As written: 17 rows (left, repeated opcode) by 21 columns (right opcode), in the order labelled below. */
689    static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
690    /* \D \d \S \s \W \w  . .+ \C \P \p \R \H \h \V \v \X \Z \z  $ $M */
691      { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \D */
692      { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \d */
693      { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \S */
694      { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \s */
695      { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \W */
696      { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \w */
697      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .  */
698      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .+ */
699      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \C */
700      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \P */
701      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \p */
702      { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \R */
703      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \H */
704      { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \h */
705      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \V */
706      { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 },  /* \v */
707      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }   /* \X */
708    };
709    
710    
711    /* This table is used to check whether auto-possessification is possible
712    between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
713    left-hand (repeated) opcode is used to select the row, and the right-hand
714    opcode is used to select the column. The values are as follows:
715    
716      0   Always return FALSE (never auto-possessify)
717      1   Character groups are distinct (possessify if both are OP_PROP)
718      2   Check character categories in the same group (general or particular)
719      3   TRUE if the two opcodes are not the same (PROP vs NOTPROP)
720    
721      4   Check left general category vs right particular category
722      5   Check right general category vs left particular category
723    
724      6   Left alphanum vs right general category
725      7   Left space vs right general category
726      8   Left word vs right general category
727    
728      9   Right alphanum vs left general category
729     10   Right space vs left general category
730     11   Right word vs left general category
731    
732     12   Left alphanum vs right particular category
733     13   Left space vs right particular category
734     14   Left word vs right particular category
735    
736     15   Right alphanum vs left particular category
737     16   Right space vs left particular category
738     17   Right word vs left particular category
739    */
740    /* Row = left property type, column = right property type, both in the order labelled below; 11 x 11 as written, so PT_TABSIZE is presumably 11 (defined outside this view - confirm). */
741    static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
742    /* ANY LAMP GC  PC  SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
743      { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_ANY */
744      { 0,  3,  0,  0,  0,    3,    1,      1,   0,    0,   0 },  /* PT_LAMP */
745      { 0,  0,  2,  4,  0,    9,   10,     10,  11,    0,   0 },  /* PT_GC */
746      { 0,  0,  5,  2,  0,   15,   16,     16,  17,    0,   0 },  /* PT_PC */
747      { 0,  0,  0,  0,  2,    0,    0,      0,   0,    0,   0 },  /* PT_SC */
748      { 0,  3,  6, 12,  0,    3,    1,      1,   0,    0,   0 },  /* PT_ALNUM */
749      { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_SPACE */
750      { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_PXSPACE */
751      { 0,  0,  8, 14,  0,    0,    1,      1,   3,    0,   0 },  /* PT_WORD */
752      { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_CLIST */
753      { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   3 }   /* PT_UCNC */
754    };
755    
756    /* This table is used to check whether auto-possessification is possible
757    between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
758    specifies a general category and the other specifies a particular category. The
759    row is selected by the general category and the column by the particular
760    category. The value is 1 if the particular category is not part of the general
761    category. */
762    /* 7 rows (general categories C, L, M, N, P, S, Z) by 30 columns (particular categories, in the order labelled below); a 0 marks a particular category that belongs to the row's general category. */
763    static const pcre_uint8 catposstab[7][30] = {
764    /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
765      { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* C */
766      { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* L */
767      { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* M */
768      { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* N */
769      { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },  /* P */
770      { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 },  /* S */
771      { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }   /* Z */
772    };
773    
774    /* This table is used when checking ALNUM, (PX)SPACE, and WORD against
775    a general or particular category. The properties in each row are those
776    that apply to the character set in question. Duplication means that a little
777    unnecessary work is done when checking, but this keeps things much simpler
778    because they can all use the same code. For more details see the comment where
779    this table is used.
780    
781    Note: SPACE and PXSPACE used to be different because Perl excluded VT from
782    "space", but from Perl 5.18 it's included, so both categories are treated the
783    same here. */
784    /* Three rows of four ucp_xxx values each: row 0 is ALNUM, row 1 is shared by SPACE and PXSPACE, row 2 is WORD. */
785    static const pcre_uint8 posspropstab[3][4] = {
786      { ucp_L, ucp_N, ucp_N, ucp_Nl },  /* ALNUM, 3rd and 4th values redundant */
787      { ucp_Z, ucp_Z, ucp_C, ucp_Cc },  /* SPACE and PXSPACE, 2nd value redundant */
788      { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
789    };
790    
791    /* This table is used when converting repeating opcodes into possessified
792    versions as a result of an explicit possessive quantifier such as ++. A zero
793    value means there is no possessified version - in those cases the item in
794    question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
795    because all relevant opcodes are less than that. */
796    /* Positional table: each entry sits at the slot of the opcode named in its trailing comment (presumably indexed by opcode value - the opcode numbering is defined outside this view). */
797    static const pcre_uint8 opcode_possessify[] = {
798      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
799      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
800    
801      0,                       /* NOTI */
802      OP_POSSTAR, 0,           /* STAR, MINSTAR */
803      OP_POSPLUS, 0,           /* PLUS, MINPLUS */
804      OP_POSQUERY, 0,          /* QUERY, MINQUERY */
805      OP_POSUPTO, 0,           /* UPTO, MINUPTO */
806      0,                       /* EXACT */
807      0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
808    
809      OP_POSSTARI, 0,          /* STARI, MINSTARI */
810      OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
811      OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
812      OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
813      0,                       /* EXACTI */
814      0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
815    
816      OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
817      OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
818      OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
819      OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
820      0,                       /* NOTEXACT */
821      0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
822    
823      OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
824      OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
825      OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
826      OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
827      0,                       /* NOTEXACTI */
828      0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
829    
830      OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
831      OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
832      OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
833      OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
834      0,                       /* TYPEEXACT */
835      0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
836    
837      OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
838      OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
839      OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
840      OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
841      0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
842    
843      0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
844      0, 0,                    /* REF, REFI */
845      0, 0,                    /* DNREF, DNREFI */
846      0, 0                     /* RECURSE, CALLOUT */
847    };
848    
849    
850    
851    /*************************************************
852    *            Find an error text                  *
853    *************************************************/
854    
855    /* The error texts are now all in one long string, to save on relocations. As
856    some of the text is of unknown length, we can't use a table of offsets.
857    Instead, just count through the strings. This is not a performance issue
858    because it happens only when there has been a compilation error.
859    
860    Argument:   the error number
861    Returns:    pointer to the error string
862    */
863    
864    static const char *
865    find_error_text(int n)
866    {
867    const char *s = error_texts;
868    for (; n > 0; n--)
869      {
870      while (*s++ != CHAR_NULL) {};
871      if (*s == CHAR_NULL) return "Error text not found (please report)";
872      }
873    return s;
874    }
875    
876    
877    
878    /*************************************************
879    *           Expand the workspace                 *
880    *************************************************/
881    
882    /* This function is called during the second compiling phase, if the number of
883    forward references fills the existing workspace, which is originally a block on
884    the stack. A larger block is obtained from malloc() unless the ultimate limit
885    has been reached or the increase will be rather small.
886    
887    Argument: pointer to the compile data block
888    Returns:  0 if all went well, else an error number
889    */
890    
891    static int
892    expand_workspace(compile_data *cd)
893    {
894    pcre_uchar *newspace;
895    int newsize = cd->workspace_size * 2;
896    
897    if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
898    if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
899        newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
900     return ERR72;
901    
902    newspace = (PUBL(malloc))(IN_UCHARS(newsize));
903    if (newspace == NULL) return ERR21;
904    memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
905    cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
906    if (cd->workspace_size > COMPILE_WORK_SIZE)
907      (PUBL(free))((void *)cd->start_workspace);
908    cd->start_workspace = newspace;
909    cd->workspace_size = newsize;
910    return 0;
911    }
912    
913    
914    
915    /*************************************************
916    *            Check for counted repeat            *
917    *************************************************/
918    
919    /* This function is called when a '{' is encountered in a place where it might
920    start a quantifier. It looks ahead to see if it really is a quantifier or not.
921    It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
922    where the ddds are digits.
923    
924    Arguments:
925      p         pointer to the first char after '{'
926    
927    Returns:    TRUE or FALSE
928    */
929    
930  static BOOL  static BOOL
931    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *,  is_counted_repeat(const pcre_uchar *p)
932      int *, branch_chain *, compile_data *, int *);  {
933    if (!IS_DIGIT(*p)) return FALSE;
934    p++;
935    while (IS_DIGIT(*p)) p++;
936    if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
937    
938    if (*p++ != CHAR_COMMA) return FALSE;
939    if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
940    
941    if (!IS_DIGIT(*p)) return FALSE;
942    p++;
943    while (IS_DIGIT(*p)) p++;
944    
945    return (*p == CHAR_RIGHT_CURLY_BRACKET);
946    }
947    
948    
949    
# Line 383  static BOOL Line 952  static BOOL
952  *************************************************/  *************************************************/
953    
954  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
955  positive value for a simple escape such as \n, or a negative value which  positive value for a special escape such as \d, or 0 for a data character which
956  encodes one of the more complicated things such as \d. A backreference to group  will be placed in chptr. A backreference to group n is returned as negative n.
957  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When  When UTF-8 is enabled, a positive value greater than 255 may be returned in
958  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,  chptr. On entry, ptr is pointing at the \. On exit, it is on the final
959  ptr is pointing at the \. On exit, it is on the final character of the escape  character of the escape sequence.
 sequence.  
960    
961  Arguments:  Arguments:
962    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
963      chptr          points to a returned data character
964    errorcodeptr   points to the errorcode variable    errorcodeptr   points to the errorcode variable
965    bracount       number of previous extracting brackets    bracount       number of previous extracting brackets
966    options        the options bits    options        the options bits
967    isclass        TRUE if inside a character class    isclass        TRUE if inside a character class
968    
969  Returns:         zero or positive => a data character  Returns:         zero => a data character
970                   negative => a special escape sequence                   positive => a special escape sequence
971                   on error, errorptr is set                   negative => a back reference
972                     on error, errorcodeptr is set
973  */  */
974    
975  static int  static int
976  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,  check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
977    int options, BOOL isclass)    int bracount, int options, BOOL isclass)
978  {  {
979  BOOL utf8 = (options & PCRE_UTF8) != 0;  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
980  const uschar *ptr = *ptrptr + 1;  BOOL utf = (options & PCRE_UTF8) != 0;
981  int c, i;  const pcre_uchar *ptr = *ptrptr + 1;
982    pcre_uint32 c;
983    int escape = 0;
984    int i;
985    
986  GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */  GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
987  ptr--;                            /* Set pointer back to the last byte */  ptr--;                            /* Set pointer back to the last byte */
988    
989  /* If backslash is at the end of the pattern, it's an error. */  /* If backslash is at the end of the pattern, it's an error. */
990    
991  if (c == 0) *errorcodeptr = ERR1;  if (c == CHAR_NULL) *errorcodeptr = ERR1;
992    
993  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in  /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
994  a table. A non-zero result is something that can be returned immediately.  in a table. A non-zero result is something that can be returned immediately.
995  Otherwise further processing may be required. */  Otherwise further processing may be required. */
996    
997  #ifndef EBCDIC  /* ASCII coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
998  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  /* Not alphanumeric */
999  else if ((i = escapes[c - '0']) != 0) c = i;  else if (c < CHAR_0 || c > CHAR_z) {}
1000    else if ((i = escapes[c - CHAR_0]) != 0)
1001      { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1002    
1003  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
1004  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  /* Not alphanumeric */
1005  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
1006    else if ((i = escapes[c - 0x48]) != 0)  { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1007  #endif  #endif
1008    
1009  /* Escapes that need further processing, or are illegal. */  /* Escapes that need further processing, or are illegal. */
1010    
1011  else  else
1012    {    {
1013    const uschar *oldptr;    const pcre_uchar *oldptr;
1014    BOOL braced, negated;    BOOL braced, negated, overflow;
1015      int s;
1016    
1017    switch (c)    switch (c)
1018      {      {
1019      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
1020      error. */      error. */
1021    
1022      case 'l':      case CHAR_l:
1023      case 'L':      case CHAR_L:
     case 'N':  
     case 'u':  
     case 'U':  
1024      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
1025      break;      break;
1026    
1027      /* \g must be followed by a number, either plain or braced. If positive, it      case CHAR_u:
1028      is an absolute backreference. If negative, it is a relative backreference.      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1029      This is a Perl 5.10 feature. */        {
1030          /* In JavaScript, \u must be followed by four hexadecimal numbers.
1031          Otherwise it is a lowercase u letter. */
1032          if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1033            && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
1034            && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
1035            && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
1036            {
1037            c = 0;
1038            for (i = 0; i < 4; ++i)
1039              {
1040              register pcre_uint32 cc = *(++ptr);
1041    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1042              if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1043              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1044    #else           /* EBCDIC coding */
1045              if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1046              c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1047    #endif
1048              }
1049    
1050    #if defined COMPILE_PCRE8
1051            if (c > (utf ? 0x10ffffU : 0xffU))
1052    #elif defined COMPILE_PCRE16
1053            if (c > (utf ? 0x10ffffU : 0xffffU))
1054    #elif defined COMPILE_PCRE32
1055            if (utf && c > 0x10ffffU)
1056    #endif
1057              {
1058              *errorcodeptr = ERR76;
1059              }
1060            else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1061            }
1062          }
1063        else
1064          *errorcodeptr = ERR37;
1065        break;
1066    
1067        case CHAR_U:
1068        /* In JavaScript, \U is an uppercase U letter. */
1069        if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
1070        break;
1071    
1072        /* In a character class, \g is just a literal "g". Outside a character
1073        class, \g must be followed by one of a number of specific things:
1074    
1075        (1) A number, either plain or braced. If positive, it is an absolute
1076        backreference. If negative, it is a relative backreference. This is a Perl
1077        5.10 feature.
1078    
1079        (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1080        is part of Perl's movement towards a unified syntax for back references. As
1081        this is synonymous with \k{name}, we fudge it up by pretending it really
1082        was \k.
1083    
1084        (3) For Oniguruma compatibility we also support \g followed by a name or a
1085        number either in angle brackets or in single quotes. However, these are
1086        (possibly recursive) subroutine calls, _not_ backreferences. Just return
1087        the ESC_g code (cf \k). */
1088    
1089        case CHAR_g:
1090        if (isclass) break;
1091        if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
1092          {
1093          escape = ESC_g;
1094          break;
1095          }
1096    
1097        /* Handle the Perl-compatible cases */
1098    
1099      case 'g':      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
     if (ptr[1] == '{')  
1100        {        {
1101          const pcre_uchar *p;
1102          for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
1103            if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
1104          if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
1105            {
1106            escape = ESC_k;
1107            break;
1108            }
1109        braced = TRUE;        braced = TRUE;
1110        ptr++;        ptr++;
1111        }        }
1112      else braced = FALSE;      else braced = FALSE;
1113    
1114      if (ptr[1] == '-')      if (ptr[1] == CHAR_MINUS)
1115        {        {
1116        negated = TRUE;        negated = TRUE;
1117        ptr++;        ptr++;
1118        }        }
1119      else negated = FALSE;      else negated = FALSE;
1120    
1121      c = 0;      /* The integer range is limited by the machine's int representation. */
1122      while ((digitab[ptr[1]] & ctype_digit) != 0)      s = 0;
1123        c = c * 10 + *(++ptr) - '0';      overflow = FALSE;
1124        while (IS_DIGIT(ptr[1]))
1125          {
1126          if (s > INT_MAX / 10 - 1) /* Integer overflow */
1127            {
1128            overflow = TRUE;
1129            break;
1130            }
1131          s = s * 10 + (int)(*(++ptr) - CHAR_0);
1132          }
1133        if (overflow) /* Integer overflow */
1134          {
1135          while (IS_DIGIT(ptr[1]))
1136            ptr++;
1137          *errorcodeptr = ERR61;
1138          break;
1139          }
1140    
1141      if (c == 0 || (braced && *(++ptr) != '}'))      if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
1142        {        {
1143        *errorcodeptr = ERR57;        *errorcodeptr = ERR57;
1144        return 0;        break;
1145          }
1146    
1147        if (s == 0)
1148          {
1149          *errorcodeptr = ERR58;
1150          break;
1151        }        }
1152    
1153      if (negated)      if (negated)
1154        {        {
1155        if (c > bracount)        if (s > bracount)
1156          {          {
1157          *errorcodeptr = ERR15;          *errorcodeptr = ERR15;
1158          return 0;          break;
1159          }          }
1160        c = bracount - (c - 1);        s = bracount - (s - 1);
1161        }        }
1162    
1163      c = -(ESC_REF + c);      escape = -s;
1164      break;      break;
1165    
1166      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
1167      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. Perl has changed
1168      the way Perl works seems to be as follows:      over the years. Nowadays \g{} for backreferences and \o{} for octal are
1169        recommended to avoid the ambiguities in the old syntax.
1170    
1171      Outside a character class, the digits are read as a decimal number. If the      Outside a character class, the digits are read as a decimal number. If the
1172      number is less than 10, or if there are that many previous extracting      number is less than 8 (used to be 10), or if there are that many previous
1173      left brackets, then it is a back reference. Otherwise, up to three octal      extracting left brackets, then it is a back reference. Otherwise, up to
1174      digits are read to form an escaped byte. Thus \123 is likely to be octal      three octal digits are read to form an escaped byte. Thus \123 is likely to
1175      123 (cf \0123, which is octal 012 followed by the literal 3). If the octal      be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1176      value is greater than 377, the least significant 8 bits are taken. Inside a      the octal value is greater than 377, the least significant 8 bits are
1177      character class, \ followed by a digit is always an octal number. */      taken. \8 and \9 are treated as the literal characters 8 and 9.
1178    
1179      case '1': case '2': case '3': case '4': case '5':      Inside a character class, \ followed by a digit is always either a literal
1180      case '6': case '7': case '8': case '9':      8 or 9 or an octal number. */
1181    
1182        case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1183        case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1184    
1185      if (!isclass)      if (!isclass)
1186        {        {
1187        oldptr = ptr;        oldptr = ptr;
1188        c -= '0';        /* The integer range is limited by the machine's int representation. */
1189        while ((digitab[ptr[1]] & ctype_digit) != 0)        s = (int)(c -CHAR_0);
1190          c = c * 10 + *(++ptr) - '0';        overflow = FALSE;
1191        if (c < 10 || c <= bracount)        while (IS_DIGIT(ptr[1]))
1192            {
1193            if (s > INT_MAX / 10 - 1) /* Integer overflow */
1194              {
1195              overflow = TRUE;
1196              break;
1197              }
1198            s = s * 10 + (int)(*(++ptr) - CHAR_0);
1199            }
1200          if (overflow) /* Integer overflow */
1201            {
1202            while (IS_DIGIT(ptr[1]))
1203              ptr++;
1204            *errorcodeptr = ERR61;
1205            break;
1206            }
1207          if (s < 8 || s <= bracount)  /* Check for back reference */
1208          {          {
1209          c = -(ESC_REF + c);          escape = -s;
1210          break;          break;
1211          }          }
1212        ptr = oldptr;      /* Put the pointer back and fall through */        ptr = oldptr;      /* Put the pointer back and fall through */
1213        }        }
1214    
1215      /* Handle an octal number following \. If the first digit is 8 or 9, Perl      /* Handle a digit following \ when the number is not a back reference. If
1216      generates a binary zero byte and treats the digit as a following literal.      the first digit is 8 or 9, Perl used to generate a binary zero byte and
1217      Thus we have to pull back the pointer by one. */      then treat the digit as a following literal. At least by Perl 5.18 this
1218        changed so as not to insert the binary zero. */
1219    
1220      if ((c = *ptr) >= '8')      if ((c = *ptr) >= CHAR_8) break;
1221        {  
1222        ptr--;      /* Fall through with a digit less than 8 */
       c = 0;  
       break;  
       }  
1223    
1224      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
1225      larger first octal digit. The original code used just to take the least      larger first octal digit. The original code used just to take the least
1226      significant 8 bits of octal numbers (I think this is what early Perls used      significant 8 bits of octal numbers (I think this is what early Perls used
1227      to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more      to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1228      than 3 octal digits. */      but no more than 3 octal digits. */
1229    
1230      case '0':      case CHAR_0:
1231      c -= '0';      c -= CHAR_0;
1232      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1233          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - CHAR_0;
1234      if (!utf8 && c > 255) *errorcodeptr = ERR51;  #ifdef COMPILE_PCRE8
1235        if (!utf && c > 0xff) *errorcodeptr = ERR51;
1236    #endif
1237      break;      break;
1238    
1239      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \o is a relatively new Perl feature, supporting a more general way of
1240      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is      specifying character codes in octal. The only supported form is \o{ddd}. */
     treated as a data character. */  
1241    
1242      case 'x':      case CHAR_o:
1243      if (ptr[1] == '{')      if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1244        {        {
1245        const uschar *pt = ptr + 2;        ptr += 2;
       int count = 0;  
   
1246        c = 0;        c = 0;
1247        while ((digitab[*pt] & ctype_xdigit) != 0)        overflow = FALSE;
1248          while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
1249          {          {
1250          register int cc = *pt++;          register pcre_uint32 cc = *ptr++;
1251          if (c == 0 && cc == '0') continue;     /* Leading zeroes */          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1252          count++;  #ifdef COMPILE_PCRE32
1253            if (c >= 0x20000000l) { overflow = TRUE; break; }
1254  #ifndef EBCDIC  /* ASCII coding */  #endif
1255          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          c = (c << 3) + cc - CHAR_0 ;
1256          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));  #if defined COMPILE_PCRE8
1257  #else           /* EBCDIC coding */          if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1258          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */  #elif defined COMPILE_PCRE16
1259          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));          if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1260    #elif defined COMPILE_PCRE32
1261            if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1262  #endif  #endif
1263          }          }
1264          if (overflow)
       if (*pt == '}')  
1265          {          {
1266          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;          while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1267          ptr = pt;          *errorcodeptr = ERR34;
         break;  
1268          }          }
1269          else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1270        /* If the sequence of hex digits does not end with '}', then we don't          {
1271        recognize this construct; fall through to the normal \x handling. */          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1272            }
1273          else *errorcodeptr = ERR80;
1274        }        }
1275        break;
1276    
1277      /* Read just a single-byte hex-defined char */      /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1278        digits. Otherwise it is treated as a literal lowercase letter x. */
1279    
1280      c = 0;      case CHAR_x:
1281      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1282        {        {
1283        int cc;                               /* Some compilers don't like ++ */        if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1284        cc = *(++ptr);                        /* in initializers */          && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1285  #ifndef EBCDIC  /* ASCII coding */          {
1286        if (cc >= 'a') cc -= 32;              /* Convert to upper case */          c = 0;
1287        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));          for (i = 0; i < 2; ++i)
1288              {
1289              register pcre_uint32 cc = *(++ptr);
1290    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1291              if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1292              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1293  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
1294        if (cc <= 'z') cc += 64;              /* Convert to upper case */            if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1295        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));            c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1296  #endif  #endif
1297        }            }
1298            }
1299          }    /* End JavaScript handling */
1300    
1301        /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1302        greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1303        digits. If not, { used to be treated as a data character. However, Perl
1304        seems to read hex digits up to the first non-such, and ignore the rest, so
1305        that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1306        now gives an error. */
1307    
1308        else
1309          {
1310          if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1311            {
1312            ptr += 2;
1313            c = 0;
1314            overflow = FALSE;
1315            while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1316              {
1317              register pcre_uint32 cc = *ptr++;
1318              if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1319    
1320    #ifdef COMPILE_PCRE32
1321              if (c >= 0x10000000l) { overflow = TRUE; break; }
1322    #endif
1323    
1324    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1325              if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1326              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1327    #else           /* EBCDIC coding */
1328              if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1329              c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1330    #endif
1331    
1332    #if defined COMPILE_PCRE8
1333              if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1334    #elif defined COMPILE_PCRE16
1335              if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1336    #elif defined COMPILE_PCRE32
1337              if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1338    #endif
1339              }
1340    
1341            if (overflow)
1342              {
1343              while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1344              *errorcodeptr = ERR34;
1345              }
1346    
1347            else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1348              {
1349              if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1350              }
1351    
1352            /* If the sequence of hex digits does not end with '}', give an error.
1353            We used just to recognize this construct and fall through to the normal
1354            \x handling, but nowadays Perl gives an error, which seems much more
1355            sensible, so we do too. */
1356    
1357            else *errorcodeptr = ERR79;
1358            }   /* End of \x{} processing */
1359    
1360          /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1361    
1362          else
1363            {
1364            c = 0;
1365            while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1366              {
1367              pcre_uint32 cc;                          /* Some compilers don't like */
1368              cc = *(++ptr);                           /* ++ in initializers */
1369    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1370              if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
1371              c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1372    #else           /* EBCDIC coding */
1373              if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
1374              c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1375    #endif
1376              }
1377            }     /* End of \xdd handling */
1378          }       /* End of Perl-style \x handling */
1379      break;      break;
1380    
1381      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1382      This coding is ASCII-specific, but then the whole concept of \cx is      An error is given if the byte following \c is not an ASCII character. This
1383        coding is ASCII-specific, but then the whole concept of \cx is
1384      ASCII-specific. (However, an EBCDIC equivalent has now been added.) */      ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1385    
1386      case 'c':      case CHAR_c:
1387      c = *(++ptr);      c = *(++ptr);
1388      if (c == 0)      if (c == CHAR_NULL)
1389        {        {
1390        *errorcodeptr = ERR2;        *errorcodeptr = ERR2;
1391        return 0;        break;
1392        }        }
1393    #ifndef EBCDIC    /* ASCII/UTF-8 coding */
1394  #ifndef EBCDIC  /* ASCII coding */      if (c > 127)  /* Excludes all non-ASCII in either mode */
1395      if (c >= 'a' && c <= 'z') c -= 32;        {
1396          *errorcodeptr = ERR68;
1397          break;
1398          }
1399        if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1400      c ^= 0x40;      c ^= 0x40;
1401  #else           /* EBCDIC coding */  #else             /* EBCDIC coding */
1402      if (c >= 'a' && c <= 'z') c += 64;      if (c >= CHAR_a && c <= CHAR_z) c += 64;
1403      c ^= 0xC0;      c ^= 0xC0;
1404  #endif  #endif
1405      break;      break;
1406    
1407      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1408      other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,      other alphanumeric following \ is an error if PCRE_EXTRA was set;
1409      for Perl compatibility, it is a literal. This code looks a bit odd, but      otherwise, for Perl compatibility, it is a literal. This code looks a bit
1410      there used to be some cases other than the default, and there may be again      odd, but there used to be some cases other than the default, and there may
1411      in future, so I haven't "optimized" it. */      be again in future, so I haven't "optimized" it. */
1412    
1413      default:      default:
1414      if ((options & PCRE_EXTRA) != 0) switch(c)      if ((options & PCRE_EXTRA) != 0) switch(c)
# Line 637  else Line 1421  else
1421      }      }
1422    }    }
1423    
1424    /* Perl supports \N{name} for character names, as well as plain \N for "not
1425    newline". PCRE does not support \N{name}. However, it does support
1426    quantification such as \N{2,3}. */
1427    
1428    if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1429         !is_counted_repeat(ptr+2))
1430      *errorcodeptr = ERR37;
1431    
1432    /* If PCRE_UCP is set, we change the values for \d etc. */
1433    
1434    if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
1435      escape += (ESC_DU - ESC_D);
1436    
1437    /* Set the pointer to the final character before returning. */
1438    
1439  *ptrptr = ptr;  *ptrptr = ptr;
1440  return c;  *chptr = c;
1441    return escape;
1442  }  }
1443    
1444    
# Line 656  escape sequence. Line 1456  escape sequence.
1456  Argument:  Argument:
1457    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
1458    negptr         points to a boolean that is set TRUE for negation else FALSE    negptr         points to a boolean that is set TRUE for negation else FALSE
1459    dptr           points to an int that is set to the detailed property value    ptypeptr       points to an unsigned int that is set to the type value
1460      pdataptr       points to an unsigned int that is set to the detailed property value
1461    errorcodeptr   points to the error code variable    errorcodeptr   points to the error code variable
1462    
1463  Returns:         type value from ucp_type_table, or -1 for an invalid type  Returns:         TRUE if the type value was found, or FALSE for an invalid type
1464  */  */
1465    
1466  static int  static BOOL
1467  get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)  get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
1468      unsigned int *pdataptr, int *errorcodeptr)
1469  {  {
1470  int c, i, bot, top;  pcre_uchar c;
1471  const uschar *ptr = *ptrptr;  int i, bot, top;
1472  char name[32];  const pcre_uchar *ptr = *ptrptr;
1473    pcre_uchar name[32];
1474    
1475  c = *(++ptr);  c = *(++ptr);
1476  if (c == 0) goto ERROR_RETURN;  if (c == CHAR_NULL) goto ERROR_RETURN;
1477    
1478  *negptr = FALSE;  *negptr = FALSE;
1479    
1480  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1481  negation. */  negation. */
1482    
1483  if (c == '{')  if (c == CHAR_LEFT_CURLY_BRACKET)
1484    {    {
1485    if (ptr[1] == '^')    if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1486      {      {
1487      *negptr = TRUE;      *negptr = TRUE;
1488      ptr++;      ptr++;
1489      }      }
1490    for (i = 0; i < sizeof(name) - 1; i++)    for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1491      {      {
1492      c = *(++ptr);      c = *(++ptr);
1493      if (c == 0) goto ERROR_RETURN;      if (c == CHAR_NULL) goto ERROR_RETURN;
1494      if (c == '}') break;      if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1495      name[i] = c;      name[i] = c;
1496      }      }
1497    if (c !='}') goto ERROR_RETURN;    if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1498    name[i] = 0;    name[i] = 0;
1499    }    }
1500    
# Line 708  else Line 1511  else
1511  /* Search for a recognized property name using binary chop */  /* Search for a recognized property name using binary chop */
1512    
1513  bot = 0;  bot = 0;
1514  top = _pcre_utt_size;  top = PRIV(utt_size);
1515    
1516  while (bot < top)  while (bot < top)
1517    {    {
1518      int r;
1519    i = (bot + top) >> 1;    i = (bot + top) >> 1;
1520    c = strcmp(name, _pcre_utt[i].name);    r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1521    if (c == 0)    if (r == 0)
1522      {      {
1523      *dptr = _pcre_utt[i].value;      *ptypeptr = PRIV(utt)[i].type;
1524      return _pcre_utt[i].type;      *pdataptr = PRIV(utt)[i].value;
1525        return TRUE;
1526      }      }
1527    if (c > 0) bot = i + 1; else top = i;    if (r > 0) bot = i + 1; else top = i;
1528    }    }
1529    
1530  *errorcodeptr = ERR47;  *errorcodeptr = ERR47;
1531  *ptrptr = ptr;  *ptrptr = ptr;
1532  return -1;  return FALSE;
1533    
1534  ERROR_RETURN:  ERROR_RETURN:
1535  *errorcodeptr = ERR46;  *errorcodeptr = ERR46;
1536  *ptrptr = ptr;  *ptrptr = ptr;
1537  return -1;  return FALSE;
1538  }  }
1539  #endif  #endif
1540    
1541    
1542    
   
 /*************************************************  
 *            Check for counted repeat            *  
 *************************************************/  
   
 /* This function is called when a '{' is encountered in a place where it might  
 start a quantifier. It looks ahead to see if it really is a quantifier or not.  
 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}  
 where the ddds are digits.  
   
 Arguments:  
   p         pointer to the first char after '{'  
   
 Returns:    TRUE or FALSE  
 */  
   
 static BOOL
 is_counted_repeat(const uschar *p)
 {
 /* A quantifier must begin with at least one digit; the post-increment steps
 past that digit, and the loop then skips the remainder of the digit run. */
 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
 while ((digitab[*p] & ctype_digit) != 0) p++;
 if (*p == '}') return TRUE;                   /* Form {ddd} */

 /* Anything other than a comma after the first number rules out a quantifier.
 A '}' straight after the comma is the open-ended form. */
 if (*p++ != ',') return FALSE;
 if (*p == '}') return TRUE;                   /* Form {ddd,} */

 /* Otherwise a second run of one or more digits is required. */
 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
 while ((digitab[*p] & ctype_digit) != 0) p++;

 return (*p == '}');                           /* Form {ddd,ddd} */
 }
   
   
   
1543  /*************************************************  /*************************************************
1544  *         Read repeat counts                     *  *         Read repeat counts                     *
1545  *************************************************/  *************************************************/
# Line 788  Returns:         pointer to '}' on succe Line 1559  Returns:         pointer to '}' on succe
1559                   current ptr on error, with errorcodeptr set non-zero                   current ptr on error, with errorcodeptr set non-zero
1560  */  */
1561    
1562  static const uschar *  static const pcre_uchar *
1563  read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)  read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1564  {  {
1565  int min = 0;  int min = 0;
1566  int max = -1;  int max = -1;
# Line 797  int max = -1; Line 1568  int max = -1;
1568  /* Read the minimum value and do a paranoid check: a negative value indicates  /* Read the minimum value and do a paranoid check: a negative value indicates
1569  an integer overflow. */  an integer overflow. */
1570    
1571  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';  while (IS_DIGIT(*p)) min = min * 10 + (int)(*p++ - CHAR_0);
1572  if (min < 0 || min > 65535)  if (min < 0 || min > 65535)
1573    {    {
1574    *errorcodeptr = ERR5;    *errorcodeptr = ERR5;
# Line 807  if (min < 0 || min > 65535) Line 1578  if (min < 0 || min > 65535)
1578  /* Read the maximum value if there is one, and again do a paranoid check on its size.  /* Read the maximum value if there is one, and again do a paranoid check on its size.
1579  Also, max must not be less than min. */  Also, max must not be less than min. */
1580    
1581  if (*p == '}') max = min; else  if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1582    {    {
1583    if (*(++p) != '}')    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1584      {      {
1585      max = 0;      max = 0;
1586      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';      while(IS_DIGIT(*p)) max = max * 10 + (int)(*p++ - CHAR_0);
1587      if (max < 0 || max > 65535)      if (max < 0 || max > 65535)
1588        {        {
1589        *errorcodeptr = ERR5;        *errorcodeptr = ERR5;
# Line 837  return p; Line 1608  return p;
1608    
1609    
1610  /*************************************************  /*************************************************
1611  *       Find forward referenced subpattern       *  *      Find first significant op code            *
1612  *************************************************/  *************************************************/
1613    
1614  /* This function scans along a pattern's text looking for capturing  /* This is called by several functions that scan a compiled expression looking
1615  subpatterns, and counting them. If it finds a named pattern that matches the  for a fixed first character, or an anchoring op code etc. It skips over things
1616  name it is given, it returns its number. Alternatively, if the name is NULL, it  that do not influence this. For some calls, it makes sense to skip negative
1617  returns when it reaches a given numbered subpattern. This is used for forward  forward and all backward assertions, and also the \b assertion; for others it
1618  references to subpatterns. We know that if (?P< is encountered, the name will  does not.
 be terminated by '>' because that is checked in the first pass.  
1619    
1620  Arguments:  Arguments:
1621    ptr          current position in the pattern    code         pointer to the start of the group
1622    count        current count of capturing parens so far encountered    skipassert   TRUE if certain assertions are to be skipped
   name         name to seek, or NULL if seeking a numbered subpattern  
   lorn         name length, or subpattern number if name is NULL  
   xmode        TRUE if we are in /x mode  
1623    
1624  Returns:       the number of the named subpattern, or -1 if not found  Returns:       pointer to the first significant opcode
1625  */  */
1626    
1627  static int  static const pcre_uchar*
1628  find_parens(const uschar *ptr, int count, const uschar *name, int lorn,  first_significant_code(const pcre_uchar *code, BOOL skipassert)
   BOOL xmode)  
 {  
 const uschar *thisname;  
   
 for (; *ptr != 0; ptr++)  
   {  
   int term;  
   
   /* Skip over backslashed characters and also entire \Q...\E */  
   
   if (*ptr == '\\')  
     {  
     if (*(++ptr) == 0) return -1;  
     if (*ptr == 'Q') for (;;)  
       {  
       while (*(++ptr) != 0 && *ptr != '\\');  
       if (*ptr == 0) return -1;  
       if (*(++ptr) == 'E') break;  
       }  
     continue;  
     }  
   
   /* Skip over character classes */  
   
   if (*ptr == '[')  
     {  
     while (*(++ptr) != ']')  
       {  
       if (*ptr == '\\')  
         {  
         if (*(++ptr) == 0) return -1;  
         if (*ptr == 'Q') for (;;)  
           {  
           while (*(++ptr) != 0 && *ptr != '\\');  
           if (*ptr == 0) return -1;  
           if (*(++ptr) == 'E') break;  
           }  
         continue;  
         }  
       }  
     continue;  
     }  
   
   /* Skip comments in /x mode */  
   
   if (xmode && *ptr == '#')  
     {  
     while (*(++ptr) != 0 && *ptr != '\n');  
     if (*ptr == 0) return -1;  
     continue;  
     }  
   
   /* An opening parens must now be a real metacharacter */  
   
   if (*ptr != '(') continue;  
   if (ptr[1] != '?')  
     {  
     count++;  
     if (name == NULL && count == lorn) return count;  
     continue;  
     }  
   
   ptr += 2;  
   if (*ptr == 'P') ptr++;                      /* Allow optional P */  
   
   /* We have to disambiguate (?<! and (?<= from (?<name> */  
   
   if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&  
        *ptr != '\'')  
     continue;  
   
   count++;  
   
   if (name == NULL && count == lorn) return count;  
   term = *ptr++;  
   if (term == '<') term = '>';  
   thisname = ptr;  
   while (*ptr != term) ptr++;  
   if (name != NULL && lorn == ptr - thisname &&  
       strncmp((const char *)name, (const char *)thisname, lorn) == 0)  
     return count;  
   }  
   
 return -1;  
 }  
   
   
   
 /*************************************************  
 *      Find first significant op code            *  
 *************************************************/  
   
 /* This is called by several functions that scan a compiled expression looking  
 for a fixed first character, or an anchoring op code etc. It skips over things  
 that do not influence this. For some calls, a change of option is important.  
 For some calls, it makes sense to skip negative forward and all backward  
 assertions, and also the \b assertion; for others it does not.  
   
 Arguments:  
   code         pointer to the start of the group  
   options      pointer to external options  
   optbit       the option bit whose changing is significant, or  
                  zero if none are  
   skipassert   TRUE if certain assertions are to be skipped  
   
 Returns:       pointer to the first significant opcode  
 */  
   
 static const uschar*  
 first_significant_code(const uschar *code, int *options, int optbit,  
   BOOL skipassert)  
1629  {  {
1630  for (;;)  for (;;)
1631    {    {
1632    switch ((int)*code)    switch ((int)*code)
1633      {      {
     case OP_OPT:  
     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))  
       *options = (int)code[1];  
     code += 2;  
     break;  
   
1634      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
1635      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1636      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
1637      if (!skipassert) return code;      if (!skipassert) return code;
1638      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
1639      code += _pcre_OP_lengths[*code];      code += PRIV(OP_lengths)[*code];
1640      break;      break;
1641    
1642      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
# Line 996  for (;;) Line 1646  for (;;)
1646    
1647      case OP_CALLOUT:      case OP_CALLOUT:
1648      case OP_CREF:      case OP_CREF:
1649        case OP_DNCREF:
1650      case OP_RREF:      case OP_RREF:
1651        case OP_DNRREF:
1652      case OP_DEF:      case OP_DEF:
1653      code += _pcre_OP_lengths[*code];      code += PRIV(OP_lengths)[*code];
1654      break;      break;
1655    
1656      default:      default:
# Line 1010  for (;;) Line 1662  for (;;)
1662    
1663    
1664    
   
1665  /*************************************************  /*************************************************
1666  *        Find the fixed length of a pattern      *  *        Find the fixed length of a branch       *
1667  *************************************************/  *************************************************/
1668    
1669  /* Scan a pattern and compute the fixed length of subject that will match it,  /* Scan a branch and compute the fixed length of subject that will match it,
1670  if the length is fixed. This is needed for dealing with backward assertions.  if the length is fixed. This is needed for dealing with backward assertions.
1671  In UTF8 mode, the result is in characters rather than bytes.  In UTF8 mode, the result is in characters rather than bytes. The branch is
1672    temporarily terminated with OP_END when this function is called.
1673    
1674    This function is called when a backward assertion is encountered, so that if it
1675    fails, the error message can point to the correct place in the pattern.
1676    However, we cannot do this when the assertion contains subroutine calls,
1677    because they can be forward references. We solve this by remembering this case
1678    and doing the check at the end; a flag specifies which mode we are running in.
1679    
1680  Arguments:  Arguments:
1681    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
1682    options  the compiling options    utf      TRUE in UTF-8 / UTF-16 / UTF-32 mode
1683      atend    TRUE if called when the pattern is complete
1684  Returns:   the fixed length, or -1 if there is no fixed length,    cd       the "compile data" structure
1685               or -2 if \C was encountered  
1686    Returns:   the fixed length,
1687                 or -1 if there is no fixed length,
1688                 or -2 if \C was encountered (in UTF-8 mode only)
1689                 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1690                 or -4 if an unknown opcode was encountered (internal error)
1691  */  */
1692    
1693  static int  static int
1694  find_fixedlength(uschar *code, int options)  find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1695  {  {
1696  int length = -1;  int length = -1;
1697    
1698  register int branchlength = 0;  register int branchlength = 0;
1699  register uschar *cc = code + 1 + LINK_SIZE;  register pcre_uchar *cc = code + 1 + LINK_SIZE;
1700    
1701  /* Scan along the opcodes for this branch. If we get to the end of the  /* Scan along the opcodes for this branch. If we get to the end of the
1702  branch, check the length against that of the other branches. */  branch, check the length against that of the other branches. */
# Line 1041  branch, check the length against that of Line 1704  branch, check the length against that of
1704  for (;;)  for (;;)
1705    {    {
1706    int d;    int d;
1707    register int op = *cc;    pcre_uchar *ce, *cs;
1708      register pcre_uchar op = *cc;
1709    
1710    switch (op)    switch (op)
1711      {      {
1712        /* We only need to continue for OP_CBRA (normal capturing bracket) and
1713        OP_BRA (normal non-capturing bracket) because the other variants of these
1714        opcodes are all concerned with unlimited repeated groups, which of course
1715        are not of fixed length. */
1716    
1717      case OP_CBRA:      case OP_CBRA:
1718      case OP_BRA:      case OP_BRA:
1719      case OP_ONCE:      case OP_ONCE:
1720        case OP_ONCE_NC:
1721      case OP_COND:      case OP_COND:
1722      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);      d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
1723      if (d < 0) return d;      if (d < 0) return d;
1724      branchlength += d;      branchlength += d;
1725      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
1726      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
1727      break;      break;
1728    
1729      /* Reached end of a branch; if it's a ket it is the end of a nested      /* Reached end of a branch; if it's a ket it is the end of a nested call.
1730      call. If it's ALT it is an alternation in a nested call. If it is      If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1731      END it's the end of the outer call. All can be handled by the same code. */      an ALT. If it is END it's the end of the outer call. All can be handled by
1732        the same code. Note that we must not include the OP_KETRxxx opcodes here,
1733        because they all imply an unlimited repeat. */
1734    
1735      case OP_ALT:      case OP_ALT:
1736      case OP_KET:      case OP_KET:
     case OP_KETRMAX:  
     case OP_KETRMIN:  
1737      case OP_END:      case OP_END:
1738        case OP_ACCEPT:
1739        case OP_ASSERT_ACCEPT:
1740      if (length < 0) length = branchlength;      if (length < 0) length = branchlength;
1741        else if (length != branchlength) return -1;        else if (length != branchlength) return -1;
1742      if (*cc != OP_ALT) return length;      if (*cc != OP_ALT) return length;
# Line 1072  for (;;) Line 1744  for (;;)
1744      branchlength = 0;      branchlength = 0;
1745      break;      break;
1746    
1747        /* A true recursion implies not fixed length, but a subroutine call may
1748        be OK. If the subroutine is a forward reference, we can't deal with
1749        it until the end of the pattern, so return -3. */
1750    
1751        case OP_RECURSE:
1752        if (!atend) return -3;
1753        cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1754        do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
1755        if (cc > cs && cc < ce) return -1;                    /* Recursion */
1756        d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
1757        if (d < 0) return d;
1758        branchlength += d;
1759        cc += 1 + LINK_SIZE;
1760        break;
1761    
1762      /* Skip over assertive subpatterns */      /* Skip over assertive subpatterns */
1763    
1764      case OP_ASSERT:      case OP_ASSERT:
# Line 1079  for (;;) Line 1766  for (;;)
1766      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1767      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
1768      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
1769      /* Fall through */      cc += PRIV(OP_lengths)[*cc];
1770        break;
1771    
1772      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1773    
1774      case OP_REVERSE:      case OP_MARK:
1775        case OP_PRUNE_ARG:
1776        case OP_SKIP_ARG:
1777        case OP_THEN_ARG:
1778        cc += cc[1] + PRIV(OP_lengths)[*cc];
1779        break;
1780    
1781        case OP_CALLOUT:
1782        case OP_CIRC:
1783        case OP_CIRCM:
1784        case OP_CLOSE:
1785        case OP_COMMIT:
1786      case OP_CREF:      case OP_CREF:
     case OP_RREF:  
1787      case OP_DEF:      case OP_DEF:
1788      case OP_OPT:      case OP_DNCREF:
1789      case OP_CALLOUT:      case OP_DNRREF:
1790      case OP_SOD:      case OP_DOLL:
1791      case OP_SOM:      case OP_DOLLM:
1792      case OP_EOD:      case OP_EOD:
1793      case OP_EODN:      case OP_EODN:
1794      case OP_CIRC:      case OP_FAIL:
     case OP_DOLL:  
1795      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
1796        case OP_PRUNE:
1797        case OP_REVERSE:
1798        case OP_RREF:
1799        case OP_SET_SOM:
1800        case OP_SKIP:
1801        case OP_SOD:
1802        case OP_SOM:
1803        case OP_THEN:
1804      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
1805      cc += _pcre_OP_lengths[*cc];      cc += PRIV(OP_lengths)[*cc];
1806      break;      break;
1807    
1808      /* Handle literal characters */      /* Handle literal characters */
1809    
1810      case OP_CHAR:      case OP_CHAR:
1811      case OP_CHARNC:      case OP_CHARI:
1812      case OP_NOT:      case OP_NOT:
1813        case OP_NOTI:
1814      branchlength++;      branchlength++;
1815      cc += 2;      cc += 2;
1816  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
1817      if ((options & PCRE_UTF8) != 0)      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
       {  
       while ((*cc & 0xc0) == 0x80) cc++;  
       }  
1818  #endif  #endif
1819      break;      break;
1820    
# Line 1119  for (;;) Line 1822  for (;;)
1822      need to skip over a multibyte character in UTF8 mode.  */      need to skip over a multibyte character in UTF8 mode.  */
1823    
1824      case OP_EXACT:      case OP_EXACT:
1825      branchlength += GET2(cc,1);      case OP_EXACTI:
1826      cc += 4;      case OP_NOTEXACT:
1827  #ifdef SUPPORT_UTF8      case OP_NOTEXACTI:
1828      if ((options & PCRE_UTF8) != 0)      branchlength += (int)GET2(cc,1);
1829        {      cc += 2 + IMM2_SIZE;
1830        while((*cc & 0x80) == 0x80) cc++;  #ifdef SUPPORT_UTF
1831        }      if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1832  #endif  #endif
1833      break;      break;
1834    
1835      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1836      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1837      cc += 4;      if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1838          cc += 2;
1839        cc += 1 + IMM2_SIZE + 1;
1840      break;      break;
1841    
1842      /* Handle single-char matchers */      /* Handle single-char matchers */
# Line 1141  for (;;) Line 1846  for (;;)
1846      cc += 2;      cc += 2;
1847      /* Fall through */      /* Fall through */
1848    
1849        case OP_HSPACE:
1850        case OP_VSPACE:
1851        case OP_NOT_HSPACE:
1852        case OP_NOT_VSPACE:
1853      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
1854      case OP_DIGIT:      case OP_DIGIT:
1855      case OP_NOT_WHITESPACE:      case OP_NOT_WHITESPACE:
# Line 1148  for (;;) Line 1857  for (;;)
1857      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
1858      case OP_WORDCHAR:      case OP_WORDCHAR:
1859      case OP_ANY:      case OP_ANY:
1860        case OP_ALLANY:
1861      branchlength++;      branchlength++;
1862      cc++;      cc++;
1863      break;      break;
1864    
1865      /* The single-byte matcher isn't allowed */      /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1866        otherwise \C is coded as OP_ALLANY. */
1867    
1868      case OP_ANYBYTE:      case OP_ANYBYTE:
1869      return -2;      return -2;
1870    
1871      /* Check a class for variable quantification */      /* Check a class for variable quantification */
1872    
 #ifdef SUPPORT_UTF8  
     case OP_XCLASS:  
     cc += GET(cc, 1) - 33;  
     /* Fall through */  
 #endif  
   
1873      case OP_CLASS:      case OP_CLASS:
1874      case OP_NCLASS:      case OP_NCLASS:
1875      cc += 33;  #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1876        case OP_XCLASS:
1877        /* The original code caused an unsigned overflow in 64 bit systems,
1878        so now we use a conditional statement. */
1879        if (op == OP_XCLASS)
1880          cc += GET(cc, 1);
1881        else
1882          cc += PRIV(OP_lengths)[OP_CLASS];
1883    #else
1884        cc += PRIV(OP_lengths)[OP_CLASS];
1885    #endif
1886    
1887      switch (*cc)      switch (*cc)
1888        {        {
1889        case OP_CRSTAR:        case OP_CRSTAR:
1890        case OP_CRMINSTAR:        case OP_CRMINSTAR:
1891          case OP_CRPLUS:
1892          case OP_CRMINPLUS:
1893        case OP_CRQUERY:        case OP_CRQUERY:
1894        case OP_CRMINQUERY:        case OP_CRMINQUERY:
1895          case OP_CRPOSSTAR:
1896          case OP_CRPOSPLUS:
1897          case OP_CRPOSQUERY:
1898        return -1;        return -1;
1899    
1900        case OP_CRRANGE:        case OP_CRRANGE:
1901        case OP_CRMINRANGE:        case OP_CRMINRANGE:
1902        if (GET2(cc,1) != GET2(cc,3)) return -1;        case OP_CRPOSRANGE:
1903        branchlength += GET2(cc,1);        if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1904        cc += 5;        branchlength += (int)GET2(cc,1);
1905          cc += 1 + 2 * IMM2_SIZE;
1906        break;        break;
1907    
1908        default:        default:
# Line 1191  for (;;) Line 1912  for (;;)
1912    
1913      /* Anything else is variable length */      /* Anything else is variable length */
1914    
1915      default:      case OP_ANYNL:
1916        case OP_BRAMINZERO:
1917        case OP_BRAPOS:
1918        case OP_BRAPOSZERO:
1919        case OP_BRAZERO:
1920        case OP_CBRAPOS:
1921        case OP_EXTUNI:
1922        case OP_KETRMAX:
1923        case OP_KETRMIN:
1924        case OP_KETRPOS:
1925        case OP_MINPLUS:
1926        case OP_MINPLUSI:
1927        case OP_MINQUERY:
1928        case OP_MINQUERYI:
1929        case OP_MINSTAR:
1930        case OP_MINSTARI:
1931        case OP_MINUPTO:
1932        case OP_MINUPTOI:
1933        case OP_NOTMINPLUS:
1934        case OP_NOTMINPLUSI:
1935        case OP_NOTMINQUERY:
1936        case OP_NOTMINQUERYI:
1937        case OP_NOTMINSTAR:
1938        case OP_NOTMINSTARI:
1939        case OP_NOTMINUPTO:
1940        case OP_NOTMINUPTOI:
1941        case OP_NOTPLUS:
1942        case OP_NOTPLUSI:
1943        case OP_NOTPOSPLUS:
1944        case OP_NOTPOSPLUSI:
1945        case OP_NOTPOSQUERY:
1946        case OP_NOTPOSQUERYI:
1947        case OP_NOTPOSSTAR:
1948        case OP_NOTPOSSTARI:
1949        case OP_NOTPOSUPTO:
1950        case OP_NOTPOSUPTOI:
1951        case OP_NOTQUERY:
1952        case OP_NOTQUERYI:
1953        case OP_NOTSTAR:
1954        case OP_NOTSTARI:
1955        case OP_NOTUPTO:
1956        case OP_NOTUPTOI:
1957        case OP_PLUS:
1958        case OP_PLUSI:
1959        case OP_POSPLUS:
1960        case OP_POSPLUSI:
1961        case OP_POSQUERY:
1962        case OP_POSQUERYI:
1963        case OP_POSSTAR:
1964        case OP_POSSTARI:
1965        case OP_POSUPTO:
1966        case OP_POSUPTOI:
1967        case OP_QUERY:
1968        case OP_QUERYI:
1969        case OP_REF:
1970        case OP_REFI:
1971        case OP_DNREF:
1972        case OP_DNREFI:
1973        case OP_SBRA:
1974        case OP_SBRAPOS:
1975        case OP_SCBRA:
1976        case OP_SCBRAPOS:
1977        case OP_SCOND:
1978        case OP_SKIPZERO:
1979        case OP_STAR:
1980        case OP_STARI:
1981        case OP_TYPEMINPLUS:
1982        case OP_TYPEMINQUERY:
1983        case OP_TYPEMINSTAR:
1984        case OP_TYPEMINUPTO:
1985        case OP_TYPEPLUS:
1986        case OP_TYPEPOSPLUS:
1987        case OP_TYPEPOSQUERY:
1988        case OP_TYPEPOSSTAR:
1989        case OP_TYPEPOSUPTO:
1990        case OP_TYPEQUERY:
1991        case OP_TYPESTAR:
1992        case OP_TYPEUPTO:
1993        case OP_UPTO:
1994        case OP_UPTOI:
1995      return -1;      return -1;
1996    
1997        /* Catch unrecognized opcodes so that when new ones are added they
1998        are not forgotten, as has happened in the past. */
1999    
2000        default:
2001        return -4;
2002      }      }
2003    }    }
2004  /* Control never gets here */  /* Control never gets here */
# Line 1200  for (;;) Line 2006  for (;;)
2006    
2007    
2008    
   
2009  /*************************************************  /*************************************************
2010  *    Scan compiled regex for numbered bracket    *  *    Scan compiled regex for specific bracket    *
2011  *************************************************/  *************************************************/
2012    
2013  /* This little function scans through a compiled pattern until it finds a  /* This little function scans through a compiled pattern until it finds a
2014  capturing bracket with the given number.  capturing bracket with the given number, or, if the number is negative, an
2015    instance of OP_REVERSE for a lookbehind. The function is global in the C sense
2016    so that it can be called from pcre_study() when finding the minimum matching
2017    length.
2018    
2019  Arguments:  Arguments:
2020    code        points to start of expression    code        points to start of expression
2021    utf8        TRUE in UTF-8 mode    utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
2022    number      the required bracket number    number      the required bracket number or negative to find a lookbehind
2023    
2024  Returns:      pointer to the opcode for the bracket, or NULL if not found  Returns:      pointer to the opcode for the bracket, or NULL if not found
2025  */  */
2026    
2027  static const uschar *  const pcre_uchar *
2028  find_bracket(const uschar *code, BOOL utf8, int number)  PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2029  {  {
2030  for (;;)  for (;;)
2031    {    {
2032    register int c = *code;    register pcre_uchar c = *code;
2033    
2034    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
2035    
2036    /* XCLASS is used for classes that cannot be represented just by a bit    /* XCLASS is used for classes that cannot be represented just by a bit
# Line 1230  for (;;) Line 2039  for (;;)
2039    
2040    if (c == OP_XCLASS) code += GET(code, 1);    if (c == OP_XCLASS) code += GET(code, 1);
2041    
2042      /* Handle lookbehind: OP_REVERSE is what is wanted when number is negative */
2043    
2044      else if (c == OP_REVERSE)
2045        {
2046        if (number < 0) return (pcre_uchar *)code;
2047        code += PRIV(OP_lengths)[c];
2048        }
2049    
2050    /* Handle capturing bracket */    /* Handle capturing bracket */
2051    
2052    else if (c == OP_CBRA)    else if (c == OP_CBRA || c == OP_SCBRA ||
2053               c == OP_CBRAPOS || c == OP_SCBRAPOS)
2054      {      {
2055      int n = GET2(code, 1+LINK_SIZE);      int n = (int)GET2(code, 1+LINK_SIZE);
2056      if (n == number) return (uschar *)code;      if (n == number) return (pcre_uchar *)code;
2057      code += _pcre_OP_lengths[c];      code += PRIV(OP_lengths)[c];
2058      }      }
2059    
2060      /* Otherwise, we can get the item's length from the table, except that for
2061      repeated character types, we have to test for \p and \P, which have an extra
2062      two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2063      must add in its length. */
2064    
2065      else
2066        {
2067        switch(c)
2068          {
2069          case OP_TYPESTAR:
2070          case OP_TYPEMINSTAR:
2071          case OP_TYPEPLUS:
2072          case OP_TYPEMINPLUS:
2073          case OP_TYPEQUERY:
2074          case OP_TYPEMINQUERY:
2075          case OP_TYPEPOSSTAR:
2076          case OP_TYPEPOSPLUS:
2077          case OP_TYPEPOSQUERY:
2078          if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2079          break;
2080    
2081          case OP_TYPEUPTO:
2082          case OP_TYPEMINUPTO:
2083          case OP_TYPEEXACT:
2084          case OP_TYPEPOSUPTO:
2085          if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2086            code += 2;
2087          break;
2088    
2089          case OP_MARK:
2090          case OP_PRUNE_ARG:
2091          case OP_SKIP_ARG:
2092          case OP_THEN_ARG:
2093          code += code[1];
2094          break;
2095          }
2096    
2097        /* Add in the fixed length from the table */
2098    
2099        code += PRIV(OP_lengths)[c];
2100    
2101    /* In UTF-8 mode, opcodes that are followed by a character may be followed by    /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2102    a multi-byte character. The length in the table is a minimum, so we have to    a multi-byte character. The length in the table is a minimum, so we have to
2103    arrange to skip the extra bytes. */    arrange to skip the extra bytes. */
2104    
2105    else  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2106      {      if (utf) switch(c)
     code += _pcre_OP_lengths[c];  
 #ifdef SUPPORT_UTF8  
     if (utf8) switch(c)  
2107        {        {
2108        case OP_CHAR:        case OP_CHAR:
2109        case OP_CHARNC:        case OP_CHARI:
2110        case OP_EXACT:        case OP_EXACT:
2111          case OP_EXACTI:
2112        case OP_UPTO:        case OP_UPTO:
2113          case OP_UPTOI:
2114        case OP_MINUPTO:        case OP_MINUPTO:
2115          case OP_MINUPTOI:
2116        case OP_POSUPTO:        case OP_POSUPTO:
2117          case OP_POSUPTOI:
2118        case OP_STAR:        case OP_STAR:
2119          case OP_STARI:
2120        case OP_MINSTAR:        case OP_MINSTAR:
2121          case OP_MINSTARI:
2122        case OP_POSSTAR:        case OP_POSSTAR:
2123          case OP_POSSTARI:
2124        case OP_PLUS:        case OP_PLUS:
2125          case OP_PLUSI:
2126        case OP_MINPLUS:        case OP_MINPLUS:
2127          case OP_MINPLUSI:
2128        case OP_POSPLUS:        case OP_POSPLUS:
2129          case OP_POSPLUSI:
2130        case OP_QUERY:        case OP_QUERY:
2131          case OP_QUERYI:
2132        case OP_MINQUERY:        case OP_MINQUERY:
2133          case OP_MINQUERYI:
2134        case OP_POSQUERY:        case OP_POSQUERY:
2135        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        case OP_POSQUERYI:
2136          if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2137        break;        break;
2138        }        }
2139    #else
2140        (void)(utf);  /* Keep compiler happy by referencing function argument */
2141  #endif  #endif
2142      }      }
2143    }    }
# Line 1283  instance of OP_RECURSE. Line 2154  instance of OP_RECURSE.
2154    
2155  Arguments:  Arguments:
2156    code        points to start of expression    code        points to start of expression
2157    utf8        TRUE in UTF-8 mode    utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
2158    
2159  Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found  Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
2160  */  */
2161    
2162  static const uschar *  static const pcre_uchar *
2163  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const pcre_uchar *code, BOOL utf)
2164  {  {
2165  for (;;)  for (;;)
2166    {    {
2167    register int c = *code;    register pcre_uchar c = *code;
2168    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
2169    if (c == OP_RECURSE) return code;    if (c == OP_RECURSE) return code;
2170    
# Line 1303  for (;;) Line 2174  for (;;)
2174    
2175    if (c == OP_XCLASS) code += GET(code, 1);    if (c == OP_XCLASS) code += GET(code, 1);
2176    
2177    /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes    /* Otherwise, we can get the item's length from the table, except that for
2178    that are followed by a character may be followed by a multi-byte character.    repeated character types, we have to test for \p and \P, which have an extra
2179    The length in the table is a minimum, so we have to arrange to skip the extra    two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2180    bytes. */    must add in its length. */
2181    
2182    else    else
2183      {      {
2184      code += _pcre_OP_lengths[c];      switch(c)
2185  #ifdef SUPPORT_UTF8        {
2186      if (utf8) switch(c)        case OP_TYPESTAR:
2187          case OP_TYPEMINSTAR:
2188          case OP_TYPEPLUS:
2189          case OP_TYPEMINPLUS:
2190          case OP_TYPEQUERY:
2191          case OP_TYPEMINQUERY:
2192          case OP_TYPEPOSSTAR:
2193          case OP_TYPEPOSPLUS:
2194          case OP_TYPEPOSQUERY:
2195          if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2196          break;
2197    
2198          case OP_TYPEPOSUPTO:
2199          case OP_TYPEUPTO:
2200          case OP_TYPEMINUPTO:
2201          case OP_TYPEEXACT:
2202          if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2203            code += 2;
2204          break;
2205    
2206          case OP_MARK:
2207          case OP_PRUNE_ARG:
2208          case OP_SKIP_ARG:
2209          case OP_THEN_ARG:
2210          code += code[1];
2211          break;
2212          }
2213    
2214        /* Add in the fixed length from the table */
2215    
2216        code += PRIV(OP_lengths)[c];
2217    
2218        /* In UTF-8 mode, opcodes that are followed by a character may be followed
2219        by a multi-byte character. The length in the table is a minimum, so we have
2220        to arrange to skip the extra bytes. */
2221    
2222    #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2223        if (utf) switch(c)
2224        {        {
2225        case OP_CHAR:        case OP_CHAR:
2226        case OP_CHARNC:        case OP_CHARI:
2227          case OP_NOT:
2228          case OP_NOTI:
2229        case OP_EXACT:        case OP_EXACT:
2230          case OP_EXACTI:
2231          case OP_NOTEXACT:
2232          case OP_NOTEXACTI:
2233        case OP_UPTO:        case OP_UPTO:
2234          case OP_UPTOI:
2235          case OP_NOTUPTO:
2236          case OP_NOTUPTOI:
2237        case OP_MINUPTO:        case OP_MINUPTO:
2238          case OP_MINUPTOI:
2239          case OP_NOTMINUPTO:
2240          case OP_NOTMINUPTOI:
2241        case OP_POSUPTO:        case OP_POSUPTO:
2242          case OP_POSUPTOI:
2243          case OP_NOTPOSUPTO:
2244          case OP_NOTPOSUPTOI:
2245        case OP_STAR:        case OP_STAR:
2246          case OP_STARI:
2247          case OP_NOTSTAR:
2248          case OP_NOTSTARI:
2249        case OP_MINSTAR:        case OP_MINSTAR:
2250          case OP_MINSTARI:
2251          case OP_NOTMINSTAR:
2252          case OP_NOTMINSTARI:
2253        case OP_POSSTAR:        case OP_POSSTAR:
2254          case OP_POSSTARI:
2255          case OP_NOTPOSSTAR:
2256          case OP_NOTPOSSTARI:
2257        case OP_PLUS:        case OP_PLUS:
2258          case OP_PLUSI:
2259          case OP_NOTPLUS:
2260          case OP_NOTPLUSI:
2261        case OP_MINPLUS:        case OP_MINPLUS:
2262          case OP_MINPLUSI:
2263          case OP_NOTMINPLUS:
2264          case OP_NOTMINPLUSI:
2265        case OP_POSPLUS:        case OP_POSPLUS:
2266          case OP_POSPLUSI:
2267          case OP_NOTPOSPLUS:
2268          case OP_NOTPOSPLUSI:
2269        case OP_QUERY:        case OP_QUERY:
2270          case OP_QUERYI:
2271          case OP_NOTQUERY:
2272          case OP_NOTQUERYI:
2273        case OP_MINQUERY:        case OP_MINQUERY:
2274          case OP_MINQUERYI:
2275          case OP_NOTMINQUERY:
2276          case OP_NOTMINQUERYI:
2277        case OP_POSQUERY:        case OP_POSQUERY:
2278        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        case OP_POSQUERYI:
2279          case OP_NOTPOSQUERY:
2280          case OP_NOTPOSQUERYI:
2281          if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2282        break;        break;
2283        }        }
2284    #else
2285        (void)(utf);  /* Keep compiler happy by referencing function argument */
2286  #endif  #endif
2287      }      }
2288    }    }
# Line 1347  for (;;) Line 2298  for (;;)
2298  can match the empty string or not. It is called from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
2299  below and from compile_branch() when checking for an unlimited repeat of a  below and from compile_branch() when checking for an unlimited repeat of a
2300  group that can match nothing. Note that first_significant_code() skips over  group that can match nothing. Note that first_significant_code() skips over
2301  assertions. If we hit an unclosed bracket, we return "empty" - this means we've  backward and negative forward assertions when its final argument is TRUE. If we
2302  struck an inner bracket whose current branch will already have been scanned.  hit an unclosed bracket, we return "empty" - this means we've struck an inner
2303    bracket whose current branch will already have been scanned.
2304    
2305  Arguments:  Arguments:
2306    code        points to start of search    code        points to start of search
2307    endcode     points to where to stop    endcode     points to where to stop
2308    utf8        TRUE if in UTF8 mode    utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2309      cd          contains pointers to tables etc.
2310      recurses    chain of recurse_check to catch mutual recursion
2311    
2312  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2313  */  */
2314    
2315    typedef struct recurse_check {
2316      struct recurse_check *prev;
2317      const pcre_uchar *group;
2318    } recurse_check;
2319    
2320  static BOOL  static BOOL
2321  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2322      BOOL utf, compile_data *cd, recurse_check *recurses)
2323  {  {
2324  register int c;  register pcre_uchar c;
2325  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);  recurse_check this_recurse;
2326    
2327    for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2328       code < endcode;       code < endcode;
2329       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2330    {    {
2331    const uschar *ccode;    const pcre_uchar *ccode;
2332    
2333    c = *code;    c = *code;
2334    
2335    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)    /* Skip over forward assertions; the other assertions are skipped by
2336      first_significant_code() with a TRUE final argument. */
2337    
2338      if (c == OP_ASSERT)
2339        {
2340        do code += GET(code, 1); while (*code == OP_ALT);
2341        c = *code;
2342        continue;
2343        }
2344    
2345      /* For a recursion/subroutine call, if its end has been reached, which
2346      implies a backward reference subroutine call, we can scan it. If it's a
2347      forward reference subroutine call, we can't. To detect forward reference
2348      we have to scan up the list that is kept in the workspace. This function is
2349      called only when doing the real compile, not during the pre-compile that
2350      measures the size of the compiled pattern. */
2351    
2352      if (c == OP_RECURSE)
2353      {      {
2354        const pcre_uchar *scode = cd->start_code + GET(code, 1);
2355      BOOL empty_branch;      BOOL empty_branch;
     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */  
2356    
2357      /* Scan a closed bracket */      /* Test for forward reference or uncompleted reference. This is disabled
2358        when called to scan a completed pattern by setting cd->start_workspace to
2359        NULL. */
2360    
2361        if (cd->start_workspace != NULL)
2362          {
2363          const pcre_uchar *tcode;
2364          for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2365            if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2366          if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2367          }
2368    
2369        /* If we are scanning a completed pattern, there are no forward references
2370        and all groups are complete. We need to detect whether this is a recursive
2371        call, as otherwise there will be an infinite loop. If it is a recursion,
2372        just skip over it. Simple recursions are easily detected. For mutual
2373        recursions we keep a chain on the stack. */
2374    
2375        else
2376          {
2377          recurse_check *r = recurses;
2378          const pcre_uchar *endgroup = scode;
2379    
2380          do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2381          if (code >= scode && code <= endgroup) continue;  /* Simple recursion */
2382    
2383          for (r = recurses; r != NULL; r = r->prev)
2384            if (r->group == scode) break;
2385          if (r != NULL) continue;   /* Mutual recursion */
2386          }
2387    
2388        /* Completed reference; scan the referenced group, remembering it on the
2389        stack chain to detect mutual recursions. */
2390    
2391      empty_branch = FALSE;      empty_branch = FALSE;
2392        this_recurse.prev = recurses;
2393        this_recurse.group = scode;
2394    
2395      do      do
2396        {        {
2397        if (!empty_branch && could_be_empty_branch(code, endcode, utf8))        if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
2398            {
2399          empty_branch = TRUE;          empty_branch = TRUE;
2400        code += GET(code, 1);          break;
2401            }
2402          scode += GET(scode, 1);
2403        }        }
2404      while (*code == OP_ALT);      while (*scode == OP_ALT);
2405      if (!empty_branch) return FALSE;   /* All branches are non-empty */  
2406        if (!empty_branch) return FALSE;  /* All branches are non-empty */
2407        continue;
2408        }
2409    
2410      /* Groups with zero repeats can of course be empty; skip them. */
2411    
2412      if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2413          c == OP_BRAPOSZERO)
2414        {
2415        code += PRIV(OP_lengths)[c];
2416        do code += GET(code, 1); while (*code == OP_ALT);
2417        c = *code;
2418        continue;
2419        }
2420    
2421      /* A nested group that is already marked as "could be empty" can just be
2422      skipped. */
2423    
2424      if (c == OP_SBRA  || c == OP_SBRAPOS ||
2425          c == OP_SCBRA || c == OP_SCBRAPOS)
2426        {
2427        do code += GET(code, 1); while (*code == OP_ALT);
2428        c = *code;
2429        continue;
2430        }
2431    
2432      /* Move past the KET and fudge things so that the increment in the "for"    /* For other groups, scan the branches. */
2433      above has no effect. */  
2434      if (c == OP_BRA  || c == OP_BRAPOS ||
2435          c == OP_CBRA || c == OP_CBRAPOS ||
2436          c == OP_ONCE || c == OP_ONCE_NC ||
2437          c == OP_COND)
2438        {
2439        BOOL empty_branch;
2440        if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
2441    
2442        /* If a conditional group has only one branch, there is a second, implied,
2443        empty branch, so just skip over the conditional, because it could be empty.
2444        Otherwise, scan the individual branches of the group. */
2445    
2446        if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2447          code += GET(code, 1);
2448        else
2449          {
2450          empty_branch = FALSE;
2451          do
2452            {
2453            if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd, NULL))
2454              empty_branch = TRUE;
2455            code += GET(code, 1);
2456            }
2457          while (*code == OP_ALT);
2458          if (!empty_branch) return FALSE;   /* All branches are non-empty */
2459          }
2460    
2461      c = OP_END;      c = *code;
     code += 1 + LINK_SIZE - _pcre_OP_lengths[c];  
2462      continue;      continue;
2463      }      }
2464    
# Line 1399  for (code = first_significant_code(code Line 2466  for (code = first_significant_code(code
2466    
2467    switch (c)    switch (c)
2468      {      {
2469      /* Check for quantifiers after a class */      /* Check for quantifiers after a class. XCLASS is used for classes that
2470        cannot be represented just by a bit map. This includes negated single
2471        high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2472        actual length is stored in the compiled code, so we must update "code"
2473        here. */
2474    
2475  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2476      case OP_XCLASS:      case OP_XCLASS:
2477      ccode = code + GET(code, 1);      ccode = code += GET(code, 1);
2478      goto CHECK_CLASS_REPEAT;      goto CHECK_CLASS_REPEAT;
2479  #endif  #endif
2480    
2481      case OP_CLASS:      case OP_CLASS:
2482      case OP_NCLASS:      case OP_NCLASS:
2483      ccode = code + 33;      ccode = code + PRIV(OP_lengths)[OP_CLASS];
2484    
2485  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2486      CHECK_CLASS_REPEAT:      CHECK_CLASS_REPEAT:
2487  #endif  #endif
2488    
# Line 1421  for (code = first_significant_code(code Line 2492  for (code = first_significant_code(code
2492        case OP_CRMINSTAR:        case OP_CRMINSTAR:
2493        case OP_CRQUERY:        case OP_CRQUERY:
2494        case OP_CRMINQUERY:        case OP_CRMINQUERY:
2495          case OP_CRPOSSTAR:
2496          case OP_CRPOSQUERY:
2497        break;        break;
2498    
2499        default:                   /* Non-repeat => class must match */        default:                   /* Non-repeat => class must match */
2500        case OP_CRPLUS:            /* These repeats aren't empty */        case OP_CRPLUS:            /* These repeats aren't empty */
2501        case OP_CRMINPLUS:        case OP_CRMINPLUS:
2502          case OP_CRPOSPLUS:
2503        return FALSE;        return FALSE;
2504    
2505        case OP_CRRANGE:        case OP_CRRANGE:
2506        case OP_CRMINRANGE:        case OP_CRMINRANGE:
2507          case OP_CRPOSRANGE:
2508        if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */        if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
2509        break;        break;
2510        }        }
# Line 1437  for (code = first_significant_code(code Line 2512  for (code = first_significant_code(code
2512    
2513      /* Opcodes that must match a character */      /* Opcodes that must match a character */
2514    
2515        case OP_ANY:
2516        case OP_ALLANY:
2517        case OP_ANYBYTE:
2518    
2519      case OP_PROP:      case OP_PROP:
2520      case OP_NOTPROP:      case OP_NOTPROP:
2521        case OP_ANYNL:
2522    
2523        case OP_NOT_HSPACE:
2524        case OP_HSPACE:
2525        case OP_NOT_VSPACE:
2526        case OP_VSPACE:
2527      case OP_EXTUNI:      case OP_EXTUNI:
2528    
2529      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
2530      case OP_DIGIT:      case OP_DIGIT:
2531      case OP_NOT_WHITESPACE:      case OP_NOT_WHITESPACE:
2532      case OP_WHITESPACE:      case OP_WHITESPACE:
2533      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
2534      case OP_WORDCHAR:      case OP_WORDCHAR:
2535      case OP_ANY:  
     case OP_ANYBYTE:  
2536      case OP_CHAR:      case OP_CHAR:
2537      case OP_CHARNC:      case OP_CHARI:
2538      case OP_NOT:      case OP_NOT:
2539        case OP_NOTI:
2540    
2541      case OP_PLUS:      case OP_PLUS:
2542        case OP_PLUSI:
2543      case OP_MINPLUS:      case OP_MINPLUS:
2544      case OP_POSPLUS:      case OP_MINPLUSI:
2545      case OP_EXACT:  
2546      case OP_NOTPLUS:      case OP_NOTPLUS:
2547        case OP_NOTPLUSI:
2548      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
2549        case OP_NOTMINPLUSI:
2550    
2551        case OP_POSPLUS:
2552        case OP_POSPLUSI:
2553      case OP_NOTPOSPLUS:      case OP_NOTPOSPLUS:
2554        case OP_NOTPOSPLUSI:
2555    
2556        case OP_EXACT:
2557        case OP_EXACTI:
2558      case OP_NOTEXACT:      case OP_NOTEXACT:
2559        case OP_NOTEXACTI:
2560    
2561      case OP_TYPEPLUS:      case OP_TYPEPLUS:
2562      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
2563      case OP_TYPEPOSPLUS:      case OP_TYPEPOSPLUS:
2564      case OP_TYPEEXACT:      case OP_TYPEEXACT:
2565    
2566      return FALSE;      return FALSE;
2567    
2568        /* These are going to continue, as they may be empty, but we have to
2569        fudge the length for the \p and \P cases. */
2570    
2571        case OP_TYPESTAR:
2572        case OP_TYPEMINSTAR:
2573        case OP_TYPEPOSSTAR:
2574        case OP_TYPEQUERY:
2575        case OP_TYPEMINQUERY:
2576        case OP_TYPEPOSQUERY:
2577        if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2578        break;
2579    
2580        /* Same for these */
2581    
2582        case OP_TYPEUPTO:
2583        case OP_TYPEMINUPTO:
2584        case OP_TYPEPOSUPTO:
2585        if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2586          code += 2;
2587        break;
2588    
2589      /* End of branch */      /* End of branch */
2590    
2591      case OP_KET:      case OP_KET:
2592      case OP_KETRMAX:      case OP_KETRMAX:
2593      case OP_KETRMIN:      case OP_KETRMIN:
2594        case OP_KETRPOS:
2595      case OP_ALT:      case OP_ALT:
2596      return TRUE;      return TRUE;
2597    
2598      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2599      MINUPTO, and POSUPTO may be followed by a multibyte character */      MINUPTO, and POSUPTO and their caseless and negative versions may be
2600        followed by a multibyte character. */
2601    
2602  #ifdef SUPPORT_UTF8  #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2603      case OP_STAR:      case OP_STAR:
2604        case OP_STARI:
2605        case OP_NOTSTAR:
2606        case OP_NOTSTARI:
2607    
2608      case OP_MINSTAR:      case OP_MINSTAR:
2609        case OP_MINSTARI:
2610        case OP_NOTMINSTAR:
2611        case OP_NOTMINSTARI:
2612    
2613      case OP_POSSTAR:      case OP_POSSTAR:
2614        case OP_POSSTARI:
2615        case OP_NOTPOSSTAR:
2616        case OP_NOTPOSSTARI:
2617    
2618      case OP_QUERY:      case OP_QUERY:
2619        case OP_QUERYI:
2620        case OP_NOTQUERY:
2621        case OP_NOTQUERYI:
2622    
2623      case OP_MINQUERY:      case OP_MINQUERY:
2624        case OP_MINQUERYI:
2625        case OP_NOTMINQUERY:
2626        case OP_NOTMINQUERYI:
2627    
2628      case OP_POSQUERY:      case OP_POSQUERY:
2629        case OP_POSQUERYI:
2630        case OP_NOTPOSQUERY:
2631        case OP_NOTPOSQUERYI:
2632    
2633        if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2634        break;
2635    
2636      case OP_UPTO:      case OP_UPTO:
2637        case OP_UPTOI:
2638        case OP_NOTUPTO:
2639        case OP_NOTUPTOI:
2640    
2641      case OP_MINUPTO:      case OP_MINUPTO:
2642        case OP_MINUPTOI:
2643        case OP_NOTMINUPTO:
2644        case OP_NOTMINUPTOI:
2645    
2646      case OP_POSUPTO:      case OP_POSUPTO:
2647      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      case OP_POSUPTOI:
2648        case OP_NOTPOSUPTO:
2649        case OP_NOTPOSUPTOI:
2650    
2651        if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2652      break;      break;
2653  #endif  #endif
2654    
2655        /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2656        string. */
2657    
2658        case OP_MARK:
2659        case OP_PRUNE_ARG:
2660        case OP_SKIP_ARG:
2661        case OP_THEN_ARG:
2662        code += code[1];
2663        break;
2664    
2665        /* None of the remaining opcodes are required to match a character. */
2666    
2667        default:
2668        break;
2669      }      }
2670    }    }
2671    
# Line 1505  return TRUE; Line 2682  return TRUE;
2682  the current branch of the current pattern to see if it could match the empty  the current branch of the current pattern to see if it could match the empty
2683  string. If it could, we must look outwards for branches at other levels,  string. If it could, we must look outwards for branches at other levels,
2684  stopping when we pass beyond the bracket which is the subject of the recursion.  stopping when we pass beyond the bracket which is the subject of the recursion.
2685    This function is called only during the real compile, not during the
2686    pre-compile.
2687    
2688  Arguments:  Arguments:
2689    code        points to start of the recursion    code        points to start of the recursion
2690    endcode     points to where to stop (current RECURSE item)    endcode     points to where to stop (current RECURSE item)
2691    bcptr       points to the chain of current (unclosed) branch starts    bcptr       points to the chain of current (unclosed) branch starts
2692    utf8        TRUE if in UTF-8 mode    utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2693      cd          pointers to tables etc
2694    
2695  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2696  */  */
2697    
2698  static BOOL  static BOOL
2699  could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,  could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2700    BOOL utf8)    branch_chain *bcptr, BOOL utf, compile_data *cd)
2701  {  {
2702  while (bcptr != NULL && bcptr->current >= code)  while (bcptr != NULL && bcptr->current_branch >= code)
2703    {    {
2704    if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;    if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
2705        return FALSE;
2706    bcptr = bcptr->outer;    bcptr = bcptr->outer;
2707    }    }
2708  return TRUE;  return TRUE;
2709    }
2710    
2711    
2712    
2713    /*************************************************
2714    *        Base opcode of repeated opcodes         *
2715    *************************************************/
2716    
2717    /* Returns the base opcode for repeated single character type opcodes. If the
2718    opcode is not a repeated character type, it returns with the original value.
2719    
2720    Arguments:  c opcode
2721    Returns:    base opcode for the type
2722    */
2723    
2724    static pcre_uchar
2725    get_repeat_base(pcre_uchar c)
2726    {
2727    return (c > OP_TYPEPOSUPTO)? c :
2728           (c >= OP_TYPESTAR)?   OP_TYPESTAR :
2729           (c >= OP_NOTSTARI)?   OP_NOTSTARI :
2730           (c >= OP_NOTSTAR)?    OP_NOTSTAR :
2731           (c >= OP_STARI)?      OP_STARI :
2732                                 OP_STAR;
2733    }
2734    
2735    
2736    
2737    #ifdef SUPPORT_UCP
2738    /*************************************************
2739    *        Check a character and a property        *
2740    *************************************************/
2741    
2742    /* This function is called by check_auto_possessive() when a property item
2743    is adjacent to a fixed character.
2744    
2745    Arguments:
2746      c            the character
2747      ptype        the property type
2748      pdata        the data for the type
2749      negated      TRUE if it's a negated property (\P or \p{^)
2750    
2751    Returns:       TRUE if auto-possessifying is OK
2752    */
2753    
2754    static BOOL
2755    check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
2756      BOOL negated)
2757    {
2758    const pcre_uint32 *p;
2759    const ucd_record *prop = GET_UCD(c);
2760    
2761    switch(ptype)
2762      {
2763      case PT_LAMP:
2764      return (prop->chartype == ucp_Lu ||
2765              prop->chartype == ucp_Ll ||
2766              prop->chartype == ucp_Lt) == negated;
2767    
2768      case PT_GC:
2769      return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2770    
2771      case PT_PC:
2772      return (pdata == prop->chartype) == negated;
2773    
2774      case PT_SC:
2775      return (pdata == prop->script) == negated;
2776    
2777      /* These are specials */
2778    
2779      case PT_ALNUM:
2780      return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2781              PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2782    
2783      /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
2784      means that Perl space and POSIX space are now identical. PCRE was changed
2785      at release 8.34. */
2786    
2787      case PT_SPACE:    /* Perl space */
2788      case PT_PXSPACE:  /* POSIX space */
2789      switch(c)
2790        {
2791        HSPACE_CASES:
2792        VSPACE_CASES:
2793        return negated;
2794    
2795        default:
2796        return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
2797        }
2798      break;  /* Control never reaches here */
2799    
2800      case PT_WORD:
2801      return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2802              PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2803              c == CHAR_UNDERSCORE) == negated;
2804    
2805      case PT_CLIST:
2806      p = PRIV(ucd_caseless_sets) + prop->caseset;
2807      for (;;)
2808        {
2809        if (c < *p) return !negated;
2810        if (c == *p++) return negated;
2811        }
2812      break;  /* Control never reaches here */
2813      }
2814    
2815    return FALSE;
2816    }
2817    #endif  /* SUPPORT_UCP */
2818    
2819    
2820    
2821    /*************************************************
2822    *        Fill the character property list        *
2823    *************************************************/
2824    
2825    /* Checks whether the code points to an opcode that can take part in auto-
2826    possessification, and if so, fills a list with its properties.
2827    
2828    Arguments:
2829      code        points to start of expression
2830      utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2831      fcc         points to case-flipping table
2832      list        points to output list
2833                  list[0] will be filled with the opcode
2834                  list[1] will be non-zero if this opcode
2835                    can match an empty character string
2836                  list[2..7] depends on the opcode
2837    
2838    Returns:      points to the start of the next opcode if *code is accepted
2839                  NULL if *code is not accepted
2840    */
2841    
2842    static const pcre_uchar *
2843    get_chr_property_list(const pcre_uchar *code, BOOL utf,
2844      const pcre_uint8 *fcc, pcre_uint32 *list)
2845    {
2846    pcre_uchar c = *code;
2847    const pcre_uchar *end;
2848    const pcre_uint32 *clist_src;
2849    pcre_uint32 *clist_dest;
2850    pcre_uint32 chr;
2851    pcre_uchar base;
2852    
2853    list[0] = c;
2854    list[1] = FALSE;
2855    code++;
2856    
2857    if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
2858      {
2859      base = get_repeat_base(c);
2860      c -= (base - OP_STAR);
2861    
2862      if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
2863        code += IMM2_SIZE;
2864    
2865      list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
2866    
2867      switch(base)
2868        {
2869        case OP_STAR:
2870        list[0] = OP_CHAR;
2871        break;
2872    
2873        case OP_STARI:
2874        list[0] = OP_CHARI;
2875        break;
2876    
2877        case OP_NOTSTAR:
2878        list[0] = OP_NOT;
2879        break;
2880    
2881        case OP_NOTSTARI:
2882        list[0] = OP_NOTI;
2883        break;
2884    
2885        case OP_TYPESTAR:
2886        list[0] = *code;
2887        code++;
2888        break;
2889        }
2890      c = list[0];
2891      }
2892    
2893    switch(c)
2894      {
2895      case OP_NOT_DIGIT:
2896      case OP_DIGIT:
2897      case OP_NOT_WHITESPACE:
2898      case OP_WHITESPACE:
2899      case OP_NOT_WORDCHAR:
2900      case OP_WORDCHAR:
2901      case OP_ANY:
2902      case OP_ALLANY:
2903      case OP_ANYNL:
2904      case OP_NOT_HSPACE:
2905      case OP_HSPACE:
2906      case OP_NOT_VSPACE:
2907      case OP_VSPACE:
2908      case OP_EXTUNI:
2909      case OP_EODN:
2910      case OP_EOD:
2911      case OP_DOLL:
2912      case OP_DOLLM:
2913      return code;
2914    
2915      case OP_CHAR:
2916      case OP_NOT:
2917      GETCHARINCTEST(chr, code);
2918      list[2] = chr;
2919      list[3] = NOTACHAR;
2920      return code;
2921    
2922      case OP_CHARI:
2923      case OP_NOTI:
2924      list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
2925      GETCHARINCTEST(chr, code);
2926      list[2] = chr;
2927    
2928    #ifdef SUPPORT_UCP
2929      if (chr < 128 || (chr < 256 && !utf))
2930        list[3] = fcc[chr];
2931      else
2932        list[3] = UCD_OTHERCASE(chr);
2933    #elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
2934      list[3] = (chr < 256) ? fcc[chr] : chr;
2935    #else
2936      list[3] = fcc[chr];
2937    #endif
2938    
2939      /* The othercase might be the same value. */
2940    
2941      if (chr == list[3])
2942        list[3] = NOTACHAR;
2943      else
2944        list[4] = NOTACHAR;
2945      return code;
2946    
2947    #ifdef SUPPORT_UCP
2948      case OP_PROP:
2949      case OP_NOTPROP:
2950      if (code[0] != PT_CLIST)
2951        {
2952        list[2] = code[0];
2953        list[3] = code[1];
2954        return code + 2;
2955        }
2956    
2957      /* Convert only if we have enough space. */
2958    
2959      clist_src = PRIV(ucd_caseless_sets) + code[1];
2960      clist_dest = list + 2;
2961      code += 2;
2962    
2963      do {
2964         if (clist_dest >= list + 8)
2965           {
2966           /* Early return if there is not enough space. This should never
2967           happen, since all clists are shorter than 5 character now. */
2968           list[2] = code[0];
2969           list[3] = code[1];
2970           return code;
2971           }
2972         *clist_dest++ = *clist_src;
2973         }
2974      while(*clist_src++ != NOTACHAR);
2975    
2976      /* All characters are stored. The terminating NOTACHAR
2977      is copied form the clist itself. */
2978    
2979      list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
2980      return code;
2981    #endif
2982    
2983      case OP_NCLASS:
2984      case OP_CLASS:
2985    #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2986      case OP_XCLASS:
2987      if (c == OP_XCLASS)
2988        end = code + GET(code, 0) - 1;
2989      else
2990    #endif
2991        end = code + 32 / sizeof(pcre_uchar);
2992    
2993      switch(*end)
2994        {
2995        case OP_CRSTAR:
2996        case OP_CRMINSTAR:
2997        case OP_CRQUERY:
2998        case OP_CRMINQUERY:
2999        case OP_CRPOSSTAR:
3000        case OP_CRPOSQUERY:
3001        list[1] = TRUE;
3002        end++;
3003        break;
3004    
3005        case OP_CRPLUS:
3006        case OP_CRMINPLUS:
3007        case OP_CRPOSPLUS:
3008        end++;
3009        break;
3010    
3011        case OP_CRRANGE:
3012        case OP_CRMINRANGE:
3013        case OP_CRPOSRANGE:
3014        list[1] = (GET2(end, 1) == 0);
3015        end += 1 + 2 * IMM2_SIZE;
3016        break;
3017        }
3018      list[2] = end - code;
3019      return end;
3020      }
3021    return NULL;    /* Opcode not accepted */
3022    }
3023    
3024    
3025    
3026    /*************************************************
3027    *    Scan further character sets for match       *
3028    *************************************************/
3029    
3030    /* Checks whether the base and the current opcode have a common character, in
3031    which case the base cannot be possessified.
3032    
3033    Arguments:
3034      code        points to the byte code
3035      utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
3036      cd          static compile data
3037      base_list   the data list of the base opcode
3038    
3039    Returns:      TRUE if the auto-possessification is possible
3040    */
3041    
3042    static BOOL
3043    compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
3044      const pcre_uint32 *base_list, const pcre_uchar *base_end)
3045    {
3046    pcre_uchar c;
3047    pcre_uint32 list[8];
3048    const pcre_uint32 *chr_ptr;
3049    const pcre_uint32 *ochr_ptr;
3050    const pcre_uint32 *list_ptr;
3051    const pcre_uchar *next_code;
3052    const pcre_uint8 *class_bitset;
3053    const pcre_uint32 *set1, *set2, *set_end;
3054    pcre_uint32 chr;
3055    BOOL accepted, invert_bits;
3056    
3057    /* Note: the base_list[1] contains whether the current opcode has greedy
3058    (represented by a non-zero value) quantifier. This is a different from
3059    other character type lists, which stores here that the character iterator
3060    matches to an empty string (also represented by a non-zero value). */
3061    
3062    for(;;)
3063      {
3064      /* All operations move the code pointer forward.
3065      Therefore infinite recursions are not possible. */
3066    
3067      c = *code;
3068    
3069      /* Skip over callouts */
3070    
3071      if (c == OP_CALLOUT)
3072        {
3073        code += PRIV(OP_lengths)[c];
3074        continue;
3075        }
3076    
3077      if (c == OP_ALT)
3078        {
3079        do code += GET(code, 1); while (*code == OP_ALT);
3080        c = *code;
3081        }
3082    
3083      switch(c)
3084        {
3085        case OP_END:
3086        case OP_KETRPOS:
3087        /* TRUE only in greedy case. The non-greedy case could be replaced by
3088        an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3089        uses more memory, which we cannot get at this stage.) */
3090    
3091        return base_list[1] != 0;
3092    
3093        case OP_KET:
3094        /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3095        it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3096        cannot be converted to a possessive form. */
3097    
3098        if (base_list[1] == 0) return FALSE;
3099    
3100        switch(*(code - GET(code, 1)))
3101          {
3102          case OP_ASSERT:
3103          case OP_ASSERT_NOT:
3104          case OP_ASSERTBACK:
3105          case OP_ASSERTBACK_NOT:
3106          case OP_ONCE:
3107          case OP_ONCE_NC:
3108          /* Atomic sub-patterns and assertions can always auto-possessify their
3109          last iterator. */
3110          return TRUE;
3111          }
3112    
3113        code += PRIV(OP_lengths)[c];
3114        continue;
3115    
3116        case OP_ONCE:
3117        case OP_ONCE_NC:
3118        case OP_BRA:
3119        case OP_CBRA:
3120        next_code = code + GET(code, 1);
3121        code += PRIV(OP_lengths)[c];
3122    
3123        while (*next_code == OP_ALT)
3124          {
3125          if (!compare_opcodes(code, utf, cd, base_list, base_end)) return FALSE;
3126          code = next_code + 1 + LINK_SIZE;
3127          next_code += GET(next_code, 1);
3128          }
3129        continue;
3130    
3131        case OP_BRAZERO:
3132        case OP_BRAMINZERO:
3133    
3134        next_code = code + 1;
3135        if (*next_code != OP_BRA && *next_code != OP_CBRA
3136            && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
3137    
3138        do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3139    
3140        /* The bracket content will be checked by the
3141        OP_BRA/OP_CBRA case above. */
3142        next_code += 1 + LINK_SIZE;
3143        if (!compare_opcodes(next_code, utf, cd, base_list, base_end))
3144          return FALSE;
3145    
3146        code += PRIV(OP_lengths)[c];
3147        continue;
3148        }
3149    
3150      /* Check for a supported opcode, and load its properties. */
3151    
3152      code = get_chr_property_list(code, utf, cd->fcc, list);
3153      if (code == NULL) return FALSE;    /* Unsupported */
3154    
3155      /* If either opcode is a small character list, set pointers for comparing
3156      characters from that list with another list, or with a property. */
3157    
3158      if (base_list[0] == OP_CHAR)
3159        {
3160        chr_ptr = base_list + 2;
3161        list_ptr = list;
3162        }
3163      else if (list[0] == OP_CHAR)
3164        {
3165        chr_ptr = list + 2;
3166        list_ptr = base_list;
3167        }
3168    
3169      /* Character bitsets can also be compared to certain opcodes. */
3170    
3171      else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
3172    #ifdef COMPILE_PCRE8
3173          /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
3174          || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
3175    #endif
3176          )
3177        {
3178    #ifdef COMPILE_PCRE8
3179        if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
3180    #else
3181        if (base_list[0] == OP_CLASS)
3182    #endif
3183          {
3184          set1 = (pcre_uint32 *)(base_end - base_list[2]);
3185          list_ptr = list;
3186          }
3187        else
3188          {
3189          set1 = (pcre_uint32 *)(code - list[2]);
3190          list_ptr = base_list;
3191          }
3192    
3193        invert_bits = FALSE;
3194        switch(list_ptr[0])
3195          {
3196          case OP_CLASS:
3197          case OP_NCLASS:
3198          set2 = (pcre_uint32 *)
3199            ((list_ptr == list ? code : base_end) - list_ptr[2]);
3200          break;
3201    
3202          /* OP_XCLASS cannot be supported here, because its bitset
3203          is not necessarily complete. E.g: [a-\0x{200}] is stored
3204          as a character range, and the appropriate bits are not set. */
3205    
3206          case OP_NOT_DIGIT:
3207            invert_bits = TRUE;
3208            /* Fall through */
3209          case OP_DIGIT:
3210            set2 = (pcre_uint32 *)(cd->cbits + cbit_digit);
3211            break;
3212    
3213          case OP_NOT_WHITESPACE:
3214            invert_bits = TRUE;
3215            /* Fall through */
3216          case OP_WHITESPACE:
3217            set2 = (pcre_uint32 *)(cd->cbits + cbit_space);
3218            break;
3219    
3220          case OP_NOT_WORDCHAR:
3221            invert_bits = TRUE;
3222            /* Fall through */
3223          case OP_WORDCHAR:
3224            set2 = (pcre_uint32 *)(cd->cbits + cbit_word);
3225            break;
3226    
3227          default:
3228          return FALSE;
3229          }
3230    
3231        /* Compare 4 bytes to improve speed. */
3232        set_end = set1 + (32 / 4);
3233        if (invert_bits)
3234          {
3235          do
3236            {
3237            if ((*set1++ & ~(*set2++)) != 0) return FALSE;
3238            }
3239          while (set1 < set_end);
3240          }
3241        else
3242          {
3243          do
3244            {
3245            if ((*set1++ & *set2++) != 0) return FALSE;
3246            }
3247          while (set1 < set_end);
3248          }
3249    
3250        if (list[1] == 0) return TRUE;
3251        /* Might be an empty repeat. */
3252        continue;
3253        }
3254    
3255      /* Some property combinations also acceptable. Unicode property opcodes are
3256      processed specially; the rest can be handled with a lookup table. */
3257    
3258      else
3259        {
3260        pcre_uint32 leftop, rightop;
3261    
3262        leftop = base_list[0];
3263        rightop = list[0];
3264    
3265    #ifdef SUPPORT_UCP
3266        accepted = FALSE; /* Always set in non-unicode case. */
3267        if (leftop == OP_PROP || leftop == OP_NOTPROP)
3268          {
3269          if (rightop == OP_EOD)
3270            accepted = TRUE;
3271          else if (rightop == OP_PROP || rightop == OP_NOTPROP)
3272            {
3273            int n;
3274            const pcre_uint8 *p;
3275            BOOL same = leftop == rightop;
3276            BOOL lisprop = leftop == OP_PROP;
3277            BOOL risprop = rightop == OP_PROP;
3278            BOOL bothprop = lisprop && risprop;
3279    
3280            /* There's a table that specifies how each combination is to be
3281            processed:
3282              0   Always return FALSE (never auto-possessify)
3283              1   Character groups are distinct (possessify if both are OP_PROP)
3284              2   Check character categories in the same group (general or particular)
3285              3   Return TRUE if the two opcodes are not the same
3286              ... see comments below
3287            */
3288    
3289            n = propposstab[base_list[2]][list[2]];
3290            switch(n)
3291              {
3292              case 0: break;
3293              case 1: accepted = bothprop; break;
3294              case 2: accepted = (base_list[3] == list[3]) != same; break;
3295              case 3: accepted = !same; break;
3296    
3297              case 4:  /* Left general category, right particular category */
3298              accepted = risprop && catposstab[base_list[3]][list[3]] == same;
3299              break;
3300    
3301              case 5:  /* Right general category, left particular category */
3302              accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
3303              break;
3304    
3305              /* This code is logically tricky. Think hard before fiddling with it.
3306              The posspropstab table has four entries per row. Each row relates to
3307              one of PCRE's special properties such as ALNUM or SPACE or WORD.
3308              Only WORD actually needs all four entries, but using repeats for the
3309              others means they can all use the same code below.
3310    
3311              The first two entries in each row are Unicode general categories, and
3312              apply always, because all the characters they include are part of the
3313              PCRE character set. The third and fourth entries are a general and a
3314              particular category, respectively, that include one or more relevant
3315              characters. One or the other is used, depending on whether the check
3316              is for a general or a particular category. However, in both cases the
3317              category contains more characters than the specials that are defined
3318              for the property being tested against. Therefore, it cannot be used
3319              in a NOTPROP case.
3320    
3321              Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3322              Underscore is covered by ucp_P or ucp_Po. */
3323    
3324              case 6:  /* Left alphanum vs right general category */
3325              case 7:  /* Left space vs right general category */
3326              case 8:  /* Left word vs right general category */
3327              p = posspropstab[n-6];
3328              accepted = risprop && lisprop ==
3329                (list[3] != p[0] &&
3330                 list[3] != p[1] &&
3331                (list[3] != p[2] || !lisprop));
3332              break;
3333    
3334              case 9:   /* Right alphanum vs left general category */
3335              case 10:  /* Right space vs left general category */
3336              case 11:  /* Right word vs left general category */
3337              p = posspropstab[n-9];
3338              accepted = lisprop && risprop ==
3339                (base_list[3] != p[0] &&
3340                 base_list[3] != p[1] &&
3341                (base_list[3] != p[2] || !risprop));
3342              break;
3343    
3344              case 12:  /* Left alphanum vs right particular category */
3345              case 13:  /* Left space vs right particular category */
3346              case 14:  /* Left word vs right particular category */
3347              p = posspropstab[n-12];
3348              accepted = risprop && lisprop ==
3349                (catposstab[p[0]][list[3]] &&
3350                 catposstab[p[1]][list[3]] &&
3351                (list[3] != p[3] || !lisprop));
3352              break;
3353    
3354              case 15:  /* Right alphanum vs left particular category */
3355              case 16:  /* Right space vs left particular category */
3356              case 17:  /* Right word vs left particular category */
3357              p = posspropstab[n-15];
3358              accepted = lisprop && risprop ==
3359                (catposstab[p[0]][base_list[3]] &&
3360                 catposstab[p[1]][base_list[3]] &&
3361                (base_list[3] != p[3] || !risprop));
3362              break;
3363              }
3364            }
3365          }
3366    
3367        else
3368    #endif  /* SUPPORT_UCP */
3369    
3370        accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3371               rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3372               autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3373    
3374        if (!accepted)
3375          return FALSE;
3376    
3377        if (list[1] == 0) return TRUE;
3378        /* Might be an empty repeat. */
3379        continue;
3380        }
3381    
3382      /* Control reaches here only if one of the items is a small character list.
3383      All characters are checked against the other side. */
3384    
3385      do
3386        {
3387        chr = *chr_ptr;
3388    
3389        switch(list_ptr[0])
3390          {
3391          case OP_CHAR:
3392          ochr_ptr = list_ptr + 2;
3393          do
3394            {
3395            if (chr == *ochr_ptr) return FALSE;
3396            ochr_ptr++;
3397            }
3398          while(*ochr_ptr != NOTACHAR);
3399          break;
3400    
3401          case OP_NOT:
3402          ochr_ptr = list_ptr + 2;
3403          do
3404            {
3405            if (chr == *ochr_ptr)
3406              break;
3407            ochr_ptr++;
3408            }
3409          while(*ochr_ptr != NOTACHAR);
3410          if (*ochr_ptr == NOTACHAR) return FALSE;   /* Not found */
3411          break;
3412    
3413          /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
3414          set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3415    
3416          case OP_DIGIT:
3417          if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
3418          break;
3419    
3420          case OP_NOT_DIGIT:
3421          if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
3422          break;
3423    
3424          case OP_WHITESPACE:
3425          if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
3426          break;
3427    
3428          case OP_NOT_WHITESPACE:
3429          if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
3430          break;
3431    
3432          case OP_WORDCHAR:
3433          if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
3434          break;
3435    
3436          case OP_NOT_WORDCHAR:
3437          if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
3438          break;
3439    
3440          case OP_HSPACE:
3441          switch(chr)
3442            {
3443            HSPACE_CASES: return FALSE;
3444            default: break;
3445            }
3446          break;
3447    
3448          case OP_NOT_HSPACE:
3449          switch(chr)
3450            {
3451            HSPACE_CASES: break;
3452            default: return FALSE;
3453            }
3454          break;
3455    
3456          case OP_ANYNL:
3457          case OP_VSPACE:
3458          switch(chr)
3459            {
3460            VSPACE_CASES: return FALSE;
3461            default: break;
3462            }
3463          break;
3464    
3465          case OP_NOT_VSPACE:
3466          switch(chr)
3467            {
3468            VSPACE_CASES: break;
3469            default: return FALSE;
3470            }
3471          break;
3472    
3473          case OP_DOLL:
3474          case OP_EODN:
3475          switch (chr)
3476            {
3477            case CHAR_CR:
3478            case CHAR_LF:
3479            case CHAR_VT:
3480            case CHAR_FF:
3481            case CHAR_NEL:
3482    #ifndef EBCDIC
3483            case 0x2028:
3484            case 0x2029:
3485    #endif  /* Not EBCDIC */
3486            return FALSE;
3487            }
3488          break;
3489    
3490          case OP_EOD:    /* Can always possessify before \z */
3491          break;
3492    
3493   &