/[pcre2]/code/trunk/src/pcre2_jit_compile.c
ViewVC logotype

Contents of /code/trunk/src/pcre2_jit_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1092 - (show annotations)
Mon May 13 16:38:18 2019 UTC (2 years, 5 months ago) by ph10
File MIME type: text/plain
File size: 436056 byte(s)
Forgot this file in previous commit. Fixes JIT non-UTF bug.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2018 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41 #ifdef HAVE_CONFIG_H
42 #include "config.h"
43 #endif
44
45 #include "pcre2_internal.h"
46
47 #ifdef SUPPORT_JIT
48
49 /* All-in-one: Since we use the JIT compiler only from here,
50 we just include it. This way we don't need to touch the build
51 system files. */
52
53 #define SLJIT_CONFIG_AUTO 1
54 #define SLJIT_CONFIG_STATIC 1
55 #define SLJIT_VERBOSE 0
56
57 #ifdef PCRE2_DEBUG
58 #define SLJIT_DEBUG 1
59 #else
60 #define SLJIT_DEBUG 0
61 #endif
62
63 #define SLJIT_MALLOC(size, allocator_data) pcre2_jit_malloc(size, allocator_data)
64 #define SLJIT_FREE(ptr, allocator_data) pcre2_jit_free(ptr, allocator_data)
65
66 static void * pcre2_jit_malloc(size_t size, void *allocator_data)
67 {
68 pcre2_memctl *allocator = ((pcre2_memctl*)allocator_data);
69 return allocator->malloc(size, allocator->memory_data);
70 }
71
72 static void pcre2_jit_free(void *ptr, void *allocator_data)
73 {
74 pcre2_memctl *allocator = ((pcre2_memctl*)allocator_data);
75 allocator->free(ptr, allocator->memory_data);
76 }
77
78 #include "sljit/sljitLir.c"
79
80 #if defined SLJIT_CONFIG_UNSUPPORTED && SLJIT_CONFIG_UNSUPPORTED
81 #error Unsupported architecture
82 #endif
83
84 /* Defines for debugging purposes. */
85
86 /* 1 - Use unoptimized capturing brackets.
87 2 - Enable capture_last_ptr (includes option 1). */
88 /* #define DEBUG_FORCE_UNOPTIMIZED_CBRAS 2 */
89
90 /* 1 - Always have a control head. */
91 /* #define DEBUG_FORCE_CONTROL_HEAD 1 */
92
93 /* Allocate memory for the regex stack on the real machine stack.
94 Fast, but limited size. */
95 #define MACHINE_STACK_SIZE 32768
96
97 /* Growth rate for stack allocated by the OS. Should be a multiple
98 of the page size. */
99 #define STACK_GROWTH_RATE 8192
100
101 /* Enable to check that the allocation could destroy temporaries. */
102 #if defined SLJIT_DEBUG && SLJIT_DEBUG
103 #define DESTROY_REGISTERS 1
104 #endif
105
106 /*
107 Short summary about the backtracking mechanism employed by the jit code generator:
108
109 The code generator follows the recursive nature of the PERL compatible regular
110 expressions. The basic blocks of regular expressions are condition checkers
111 which execute different commands depending on the result of the condition check.
112 The relationship between the operators can be horizontal (concatenation) and
113 vertical (sub-expression) (See struct backtrack_common for more details).
114
115 'ab' - 'a' and 'b' regexps are concatenated
116 'a+' - 'a' is the sub-expression of the '+' operator
117
118 The condition checkers are boolean (true/false) checkers. Machine code is generated
119 for the checker itself and for the actions depending on the result of the checker.
120 The 'true' case is called as the matching path (expected path), and the other is called as
121 the 'backtrack' path. Branch instructions are expensive for all CPUs, so we avoid taken
122 branches on the matching path.
123
124 Greedy star operator (*) :
125 Matching path: match happens.
126 Backtrack path: match failed.
127 Non-greedy star operator (*?) :
128 Matching path: no need to perform a match.
129 Backtrack path: match is required.
130
131 The following example shows how the code is generated for a capturing bracket
132 with two alternatives. Let A, B, C, D be arbitrary regular expressions, and
133 we have the following regular expression:
134
135 A(B|C)D
136
137 The generated code will be the following:
138
139 A matching path
140 '(' matching path (pushing arguments to the stack)
141 B matching path
142 ')' matching path (pushing arguments to the stack)
143 D matching path
144 return with successful match
145
146 D backtrack path
147 ')' backtrack path (If we arrived from "C" jump to the backtrack of "C")
148 B backtrack path
149 C expected path
150 jump to D matching path
151 C backtrack path
152 A backtrack path
153
154 Notice that the order of the backtrack code paths is the opposite of the fast
155 code paths. In this way the topmost value on the stack always belongs
156 to the current backtrack code path. The backtrack path must check
157 whether there is a next alternative. If so, it needs to jump back to
158 the matching path eventually. Otherwise it needs to clear out its own stack
159 frame and continue the execution on the backtrack code paths.
160 */
161
162 /*
163 Saved stack frames:
164
165 Atomic blocks and asserts require reloading the values of private data
166 when the backtrack mechanism is performed. Because of OP_RECURSE, the data
167 are not necessarily known at compile time, thus we need a dynamic restore
168 mechanism.
169
170 The stack frames are stored in a chain list, and have the following format:
171 ([ capturing bracket offset ][ start value ][ end value ])+ ... [ 0 ] [ previous head ]
172
173 Thus we can restore the private data to a particular point in the stack.
174 */
175
176 typedef struct jit_arguments {
177 /* Pointers first. */
178 struct sljit_stack *stack;
179 PCRE2_SPTR str;
180 PCRE2_SPTR begin;
181 PCRE2_SPTR end;
182 pcre2_match_data *match_data;
183 PCRE2_SPTR startchar_ptr;
184 PCRE2_UCHAR *mark_ptr;
185 int (*callout)(pcre2_callout_block *, void *);
186 void *callout_data;
187 /* Everything else after. */
188 sljit_uw offset_limit;
189 sljit_u32 limit_match;
190 sljit_u32 oveccount;
191 sljit_u32 options;
192 } jit_arguments;
193
194 #define JIT_NUMBER_OF_COMPILE_MODES 3
195
196 typedef struct executable_functions {
197 void *executable_funcs[JIT_NUMBER_OF_COMPILE_MODES];
198 void *read_only_data_heads[JIT_NUMBER_OF_COMPILE_MODES];
199 sljit_uw executable_sizes[JIT_NUMBER_OF_COMPILE_MODES];
200 sljit_u32 top_bracket;
201 sljit_u32 limit_match;
202 } executable_functions;
203
204 typedef struct jump_list {
205 struct sljit_jump *jump;
206 struct jump_list *next;
207 } jump_list;
208
209 typedef struct stub_list {
210 struct sljit_jump *start;
211 struct sljit_label *quit;
212 struct stub_list *next;
213 } stub_list;
214
215 typedef struct label_addr_list {
216 struct sljit_label *label;
217 sljit_uw *update_addr;
218 struct label_addr_list *next;
219 } label_addr_list;
220
221 enum frame_types {
222 no_frame = -1,
223 no_stack = -2
224 };
225
226 enum control_types {
227 type_mark = 0,
228 type_then_trap = 1
229 };
230
231 typedef int (SLJIT_FUNC *jit_function)(jit_arguments *args);
232
233 /* The following structure is the key data type for the recursive
234 code generator. It is allocated by compile_matchingpath, and contains
235 the arguments for compile_backtrackingpath. Must be the first member
236 of its descendants. */
237 typedef struct backtrack_common {
238 /* Concatenation stack. */
239 struct backtrack_common *prev;
240 jump_list *nextbacktracks;
241 /* Internal stack (for component operators). */
242 struct backtrack_common *top;
243 jump_list *topbacktracks;
244 /* Opcode pointer. */
245 PCRE2_SPTR cc;
246 } backtrack_common;
247
248 typedef struct assert_backtrack {
249 backtrack_common common;
250 jump_list *condfailed;
251 /* Less than 0 if a frame is not needed. */
252 int framesize;
253 /* Points to our private memory word on the stack. */
254 int private_data_ptr;
255 /* For iterators. */
256 struct sljit_label *matchingpath;
257 } assert_backtrack;
258
259 typedef struct bracket_backtrack {
260 backtrack_common common;
261 /* Where to continue if an alternative is successfully matched. */
262 struct sljit_label *alternative_matchingpath;
263 /* For rmin and rmax iterators. */
264 struct sljit_label *recursive_matchingpath;
265 /* For greedy ? operator. */
266 struct sljit_label *zero_matchingpath;
267 /* Contains the branches of a failed condition. */
268 union {
269 /* Both for OP_COND, OP_SCOND. */
270 jump_list *condfailed;
271 assert_backtrack *assert;
272 /* For OP_ONCE. Less than 0 if not needed. */
273 int framesize;
274 } u;
275 /* Points to our private memory word on the stack. */
276 int private_data_ptr;
277 } bracket_backtrack;
278
279 typedef struct bracketpos_backtrack {
280 backtrack_common common;
281 /* Points to our private memory word on the stack. */
282 int private_data_ptr;
283 /* Reverting stack is needed. */
284 int framesize;
285 /* Allocated stack size. */
286 int stacksize;
287 } bracketpos_backtrack;
288
289 typedef struct braminzero_backtrack {
290 backtrack_common common;
291 struct sljit_label *matchingpath;
292 } braminzero_backtrack;
293
294 typedef struct char_iterator_backtrack {
295 backtrack_common common;
296 /* Next iteration. */
297 struct sljit_label *matchingpath;
298 union {
299 jump_list *backtracks;
300 struct {
301 unsigned int othercasebit;
302 PCRE2_UCHAR chr;
303 BOOL enabled;
304 } charpos;
305 } u;
306 } char_iterator_backtrack;
307
308 typedef struct ref_iterator_backtrack {
309 backtrack_common common;
310 /* Next iteration. */
311 struct sljit_label *matchingpath;
312 } ref_iterator_backtrack;
313
314 typedef struct recurse_entry {
315 struct recurse_entry *next;
316 /* Contains the function entry label. */
317 struct sljit_label *entry_label;
318 /* Contains the function backtrack label. */
319 struct sljit_label *backtrack_label;
320 /* Collects the entry calls until the function is not created. */
321 jump_list *entry_calls;
322 /* Collects the backtrack calls until the function is not created. */
323 jump_list *backtrack_calls;
324 /* Points to the starting opcode. */
325 sljit_sw start;
326 } recurse_entry;
327
328 typedef struct recurse_backtrack {
329 backtrack_common common;
330 /* Return to the matching path. */
331 struct sljit_label *matchingpath;
332 /* Recursive pattern. */
333 recurse_entry *entry;
334 /* Pattern is inlined. */
335 BOOL inlined_pattern;
336 } recurse_backtrack;
337
338 #define OP_THEN_TRAP OP_TABLE_LENGTH
339
340 typedef struct then_trap_backtrack {
341 backtrack_common common;
342 /* If then_trap is not NULL, this structure contains the real
343 then_trap for the backtracking path. */
344 struct then_trap_backtrack *then_trap;
345 /* Points to the starting opcode. */
346 sljit_sw start;
347 /* Exit point for the then opcodes of this alternative. */
348 jump_list *quit;
349 /* Frame size of the current alternative. */
350 int framesize;
351 } then_trap_backtrack;
352
353 #define MAX_N_CHARS 12
354 #define MAX_DIFF_CHARS 5
355
356 typedef struct fast_forward_char_data {
357 /* Number of characters in the chars array, 255 for any character. */
358 sljit_u8 count;
359 /* Number of last UTF-8 characters in the chars array. */
360 sljit_u8 last_count;
361 /* Available characters in the current position. */
362 PCRE2_UCHAR chars[MAX_DIFF_CHARS];
363 } fast_forward_char_data;
364
365 #define MAX_CLASS_RANGE_SIZE 4
366 #define MAX_CLASS_CHARS_SIZE 3
367
368 typedef struct compiler_common {
369 /* The sljit generic compiler. */
370 struct sljit_compiler *compiler;
371 /* Compiled regular expression. */
372 pcre2_real_code *re;
373 /* First byte code. */
374 PCRE2_SPTR start;
375 /* Maps private data offset to each opcode. */
376 sljit_s32 *private_data_ptrs;
377 /* Chain list of read-only data ptrs. */
378 void *read_only_data_head;
379 /* Tells whether the capturing bracket is optimized. */
380 sljit_u8 *optimized_cbracket;
381 /* Tells whether the starting offset is a target of then. */
382 sljit_u8 *then_offsets;
383 /* Current position where a THEN must jump. */
384 then_trap_backtrack *then_trap;
385 /* Starting offset of private data for capturing brackets. */
386 sljit_s32 cbra_ptr;
387 /* Output vector starting point. Must be divisible by 2. */
388 sljit_s32 ovector_start;
389 /* Points to the starting character of the current match. */
390 sljit_s32 start_ptr;
391 /* Last known position of the requested byte. */
392 sljit_s32 req_char_ptr;
393 /* Head of the last recursion. */
394 sljit_s32 recursive_head_ptr;
395 /* First inspected character for partial matching.
396 (Needed for avoiding zero length partial matches.) */
397 sljit_s32 start_used_ptr;
398 /* Starting pointer for partial soft matches. */
399 sljit_s32 hit_start;
400 /* Pointer of the match end position. */
401 sljit_s32 match_end_ptr;
402 /* Points to the marked string. */
403 sljit_s32 mark_ptr;
404 /* Recursive control verb management chain. */
405 sljit_s32 control_head_ptr;
406 /* Points to the last matched capture block index. */
407 sljit_s32 capture_last_ptr;
408 /* Fast forward skipping byte code pointer. */
409 PCRE2_SPTR fast_forward_bc_ptr;
410 /* Locals used by fast fail optimization. */
411 sljit_s32 fast_fail_start_ptr;
412 sljit_s32 fast_fail_end_ptr;
413
414 /* Flipped and lower case tables. */
415 const sljit_u8 *fcc;
416 sljit_sw lcc;
417 /* Mode can be PCRE2_JIT_COMPLETE and others. */
418 int mode;
419 /* TRUE, when the match might be empty (also set when \K is present). */
420 BOOL might_be_empty;
421 /* \K is found in the pattern. */
422 BOOL has_set_som;
423 /* (*SKIP:arg) is found in the pattern. */
424 BOOL has_skip_arg;
425 /* (*THEN) is found in the pattern. */
426 BOOL has_then;
427 /* (*SKIP) or (*SKIP:arg) is found in lookbehind assertion. */
428 BOOL has_skip_in_assert_back;
429 /* Quit is redirected by recurse, negative assertion, or positive assertion in conditional block. */
430 BOOL local_quit_available;
431 /* Currently in a positive assertion. */
432 BOOL in_positive_assertion;
433 /* Newline control. */
434 int nltype;
435 sljit_u32 nlmax;
436 sljit_u32 nlmin;
437 int newline;
438 int bsr_nltype;
439 sljit_u32 bsr_nlmax;
440 sljit_u32 bsr_nlmin;
441 /* Dollar endonly. */
442 int endonly;
443 /* Tables. */
444 sljit_sw ctypes;
445 /* Named capturing brackets. */
446 PCRE2_SPTR name_table;
447 sljit_sw name_count;
448 sljit_sw name_entry_size;
449
450 /* Labels and jump lists. */
451 struct sljit_label *partialmatchlabel;
452 struct sljit_label *quit_label;
453 struct sljit_label *abort_label;
454 struct sljit_label *accept_label;
455 struct sljit_label *ff_newline_shortcut;
456 stub_list *stubs;
457 label_addr_list *label_addrs;
458 recurse_entry *entries;
459 recurse_entry *currententry;
460 jump_list *partialmatch;
461 jump_list *quit;
462 jump_list *positive_assertion_quit;
463 jump_list *abort;
464 jump_list *failed_match;
465 jump_list *accept;
466 jump_list *calllimit;
467 jump_list *stackalloc;
468 jump_list *revertframes;
469 jump_list *wordboundary;
470 jump_list *anynewline;
471 jump_list *hspace;
472 jump_list *vspace;
473 jump_list *casefulcmp;
474 jump_list *caselesscmp;
475 jump_list *reset_match;
476 BOOL unset_backref;
477 BOOL alt_circumflex;
478 #ifdef SUPPORT_UNICODE
479 BOOL utf;
480 BOOL invalid_utf;
481 BOOL use_ucp;
482 /* Points to saving area for iref. */
483 sljit_s32 iref_ptr;
484 jump_list *getucd;
485 jump_list *getucdtype;
486 #if PCRE2_CODE_UNIT_WIDTH == 8
487 jump_list *utfreadchar;
488 jump_list *utfreadtype8;
489 jump_list *utfpeakcharback;
490 #endif
491 #if PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16
492 jump_list *utfreadchar_invalid;
493 jump_list *utfreadnewline_invalid;
494 jump_list *utfmoveback_invalid;
495 jump_list *utfpeakcharback_invalid;
496 #endif
497 #endif /* SUPPORT_UNICODE */
498 } compiler_common;
499
500 /* For byte_sequence_compare. */
501
502 typedef struct compare_context {
503 int length;
504 int sourcereg;
505 #if defined SLJIT_UNALIGNED && SLJIT_UNALIGNED
506 int ucharptr;
507 union {
508 sljit_s32 asint;
509 sljit_u16 asushort;
510 #if PCRE2_CODE_UNIT_WIDTH == 8
511 sljit_u8 asbyte;
512 sljit_u8 asuchars[4];
513 #elif PCRE2_CODE_UNIT_WIDTH == 16
514 sljit_u16 asuchars[2];
515 #elif PCRE2_CODE_UNIT_WIDTH == 32
516 sljit_u32 asuchars[1];
517 #endif
518 } c;
519 union {
520 sljit_s32 asint;
521 sljit_u16 asushort;
522 #if PCRE2_CODE_UNIT_WIDTH == 8
523 sljit_u8 asbyte;
524 sljit_u8 asuchars[4];
525 #elif PCRE2_CODE_UNIT_WIDTH == 16
526 sljit_u16 asuchars[2];
527 #elif PCRE2_CODE_UNIT_WIDTH == 32
528 sljit_u32 asuchars[1];
529 #endif
530 } oc;
531 #endif
532 } compare_context;
533
534 /* Undefine sljit macros. */
535 #undef CMP
536
537 /* Used for accessing the elements of the stack. */
538 #define STACK(i) ((i) * (int)sizeof(sljit_sw))
539
540 #ifdef SLJIT_PREF_SHIFT_REG
541 #if SLJIT_PREF_SHIFT_REG == SLJIT_R2
542 /* Nothing. */
543 #elif SLJIT_PREF_SHIFT_REG == SLJIT_R3
544 #define SHIFT_REG_IS_R3
545 #else
546 #error "Unsupported shift register"
547 #endif
548 #endif
549
550 #define TMP1 SLJIT_R0
551 #ifdef SHIFT_REG_IS_R3
552 #define TMP2 SLJIT_R3
553 #define TMP3 SLJIT_R2
554 #else
555 #define TMP2 SLJIT_R2
556 #define TMP3 SLJIT_R3
557 #endif
558 #define STR_PTR SLJIT_R1
559 #define STR_END SLJIT_S0
560 #define STACK_TOP SLJIT_S1
561 #define STACK_LIMIT SLJIT_S2
562 #define COUNT_MATCH SLJIT_S3
563 #define ARGUMENTS SLJIT_S4
564 #define RETURN_ADDR SLJIT_R4
565
566 /* Local space layout. */
567 /* These two locals can be used by the current opcode. */
568 #define LOCALS0 (0 * sizeof(sljit_sw))
569 #define LOCALS1 (1 * sizeof(sljit_sw))
570 /* Two local variables for possessive quantifiers (char1 cannot use them). */
571 #define POSSESSIVE0 (2 * sizeof(sljit_sw))
572 #define POSSESSIVE1 (3 * sizeof(sljit_sw))
573 /* Max limit of recursions. */
574 #define LIMIT_MATCH (4 * sizeof(sljit_sw))
575 /* The output vector is stored on the stack, and contains pointers
576 to characters. The vector data is divided into two groups: the first
577 group contains the start / end character pointers, and the second is
578 the start pointers when the end of the capturing group has not yet reached. */
579 #define OVECTOR_START (common->ovector_start)
580 #define OVECTOR(i) (OVECTOR_START + (i) * (sljit_sw)sizeof(sljit_sw))
581 #define OVECTOR_PRIV(i) (common->cbra_ptr + (i) * (sljit_sw)sizeof(sljit_sw))
582 #define PRIVATE_DATA(cc) (common->private_data_ptrs[(cc) - common->start])
583
584 #if PCRE2_CODE_UNIT_WIDTH == 8
585 #define MOV_UCHAR SLJIT_MOV_U8
586 #define IN_UCHARS(x) (x)
587 #elif PCRE2_CODE_UNIT_WIDTH == 16
588 #define MOV_UCHAR SLJIT_MOV_U16
589 #define UCHAR_SHIFT (1)
590 #define IN_UCHARS(x) ((x) * 2)
591 #elif PCRE2_CODE_UNIT_WIDTH == 32
592 #define MOV_UCHAR SLJIT_MOV_U32
593 #define UCHAR_SHIFT (2)
594 #define IN_UCHARS(x) ((x) * 4)
595 #else
596 #error Unsupported compiling mode
597 #endif
598
599 /* Shortcuts. */
600 #define DEFINE_COMPILER \
601 struct sljit_compiler *compiler = common->compiler
602 #define OP1(op, dst, dstw, src, srcw) \
603 sljit_emit_op1(compiler, (op), (dst), (dstw), (src), (srcw))
604 #define OP2(op, dst, dstw, src1, src1w, src2, src2w) \
605 sljit_emit_op2(compiler, (op), (dst), (dstw), (src1), (src1w), (src2), (src2w))
606 #define LABEL() \
607 sljit_emit_label(compiler)
608 #define JUMP(type) \
609 sljit_emit_jump(compiler, (type))
610 #define JUMPTO(type, label) \
611 sljit_set_label(sljit_emit_jump(compiler, (type)), (label))
612 #define JUMPHERE(jump) \
613 sljit_set_label((jump), sljit_emit_label(compiler))
614 #define SET_LABEL(jump, label) \
615 sljit_set_label((jump), (label))
616 #define CMP(type, src1, src1w, src2, src2w) \
617 sljit_emit_cmp(compiler, (type), (src1), (src1w), (src2), (src2w))
618 #define CMPTO(type, src1, src1w, src2, src2w, label) \
619 sljit_set_label(sljit_emit_cmp(compiler, (type), (src1), (src1w), (src2), (src2w)), (label))
620 #define OP_FLAGS(op, dst, dstw, type) \
621 sljit_emit_op_flags(compiler, (op), (dst), (dstw), (type))
622 #define CMOV(type, dst_reg, src, srcw) \
623 sljit_emit_cmov(compiler, (type), (dst_reg), (src), (srcw))
624 #define GET_LOCAL_BASE(dst, dstw, offset) \
625 sljit_get_local_base(compiler, (dst), (dstw), (offset))
626
627 #define READ_CHAR_MAX 0x7fffffff
628
629 #define INVALID_UTF_CHAR -1
630 #define UNASSIGNED_UTF_CHAR 888
631
632 #if defined SUPPORT_UNICODE
633 #if PCRE2_CODE_UNIT_WIDTH == 8
634
/* Validating forward read of one UTF-8 character (8-bit code units).

c               lvalue that receives the decoded code point
ptr             lvalue pointer to the first byte; advanced past the character
                when a complete sequence has been consumed
end             one past the last readable byte; only the continuation bytes
                are bounds-checked, ptr[0] is read unconditionally
invalid_action  statement executed when the sequence is not valid UTF-8

A sequence is rejected when the lead byte is not in 0x00-0x7f, 0xc2-0xdf,
0xe0-0xef or 0xf0-0xf4, when a required continuation byte (0x80-0xbf) is
missing or lies at or beyond end, when a 3-byte form encodes a value below
0x800 or a surrogate (0xd800-0xdfff), or when a 4-byte form encodes a value
below 0x10000 or above 0x10ffff.

Note that for the overlong / surrogate / out-of-range rejections ptr has
already been advanced when invalid_action runs; for all other rejections ptr
is unchanged. c may hold a partially assembled value after a rejection. */
#define GETCHARINC_INVALID(c, ptr, end, invalid_action) \
  { \
  if (ptr[0] <= 0x7f) \
    c = *ptr++; \
  else if (ptr + 1 < end && ptr[1] >= 0x80 && ptr[1] < 0xc0) \
    { \
    c = ptr[1] - 0x80; \
    \
    if (ptr[0] >= 0xc2 && ptr[0] <= 0xdf) \
      { \
      c |= (ptr[0] - 0xc0) << 6; \
      ptr += 2; \
      } \
    else if (ptr + 2 < end && ptr[2] >= 0x80 && ptr[2] < 0xc0) \
      { \
      c = c << 6 | (ptr[2] - 0x80); \
      \
      if (ptr[0] >= 0xe0 && ptr[0] <= 0xef) \
        { \
        c |= (ptr[0] - 0xe0) << 12; \
        ptr += 3; \
        \
        if (c < 0x800 || (c >= 0xd800 && c < 0xe000)) \
          { \
          invalid_action; \
          } \
        } \
      else if (ptr + 3 < end && ptr[3] >= 0x80 && ptr[3] < 0xc0) \
        { \
        c = c << 6 | (ptr[3] - 0x80); \
        \
        if (ptr[0] >= 0xf0 && ptr[0] <= 0xf4) \
          { \
          c |= (ptr[0] - 0xf0) << 18; \
          ptr += 4; \
          \
          if (c >= 0x110000 || c < 0x10000) \
            { \
            invalid_action; \
            } \
          } \
        else \
          { \
          invalid_action; \
          } \
        } \
      else \
        { \
        invalid_action; \
        } \
      } \
    else \
      { \
      invalid_action; \
      } \
    } \
  else \
    { \
    invalid_action; \
    } \
  }
696
/* Validating backward read of one UTF-8 character (8-bit code units).

c               lvalue that receives the decoded code point
ptr             lvalue pointer just past the character; moved back to the
                first byte of the character when a complete sequence is found
start           lowest readable address; bytes before it are never read
                (ptr[-1] itself is read unconditionally)
invalid_action  statement executed when the bytes before ptr do not end in a
                valid UTF-8 character

The bytes are visited from last to first, so every earlier byte contributes
MORE significant bits than what has been collected so far: the last byte
supplies bits 0-5, the one before it bits 6-11, then bits 12-17, and the lead
byte supplies the top bits. Each step therefore ORs the new bits in at the
matching shift instead of shifting the accumulated value (shifting would
assemble the code point with its 6-bit groups in reversed order).

The validity rules match the forward reader: lead bytes 0xc2-0xdf, 0xe0-0xef,
0xf0-0xf4; continuation bytes 0x80-0xbf; 3-byte values below 0x800 and
surrogates are rejected; 4-byte values must lie in 0x10000-0x10ffff. For
those value-range rejections ptr has already been moved back when
invalid_action runs; for all other rejections ptr is unchanged. */
#define GETCHARBACK_INVALID(c, ptr, start, invalid_action) \
  { \
  c = ptr[-1]; \
  if (c <= 0x7f) \
    ptr--; \
  else if (ptr - 1 > start && ptr[-1] >= 0x80 && ptr[-1] < 0xc0) \
    { \
    c -= 0x80; \
    \
    if (ptr[-2] >= 0xc2 && ptr[-2] <= 0xdf) \
      { \
      c |= (ptr[-2] - 0xc0) << 6; \
      ptr -= 2; \
      } \
    else if (ptr - 2 > start && ptr[-2] >= 0x80 && ptr[-2] < 0xc0) \
      { \
      c |= (ptr[-2] - 0x80) << 6; \
      \
      if (ptr[-3] >= 0xe0 && ptr[-3] <= 0xef) \
        { \
        c |= (ptr[-3] - 0xe0) << 12; \
        ptr -= 3; \
        \
        if (c < 0x800 || (c >= 0xd800 && c < 0xe000)) \
          { \
          invalid_action; \
          } \
        } \
      else if (ptr - 3 > start && ptr[-3] >= 0x80 && ptr[-3] < 0xc0) \
        { \
        c |= (ptr[-3] - 0x80) << 12; \
        \
        if (ptr[-4] >= 0xf0 && ptr[-4] <= 0xf4) \
          { \
          c |= (ptr[-4] - 0xf0) << 18; \
          ptr -= 4; \
          \
          if (c >= 0x110000 || c < 0x10000) \
            { \
            invalid_action; \
            } \
          } \
        else \
          { \
          invalid_action; \
          } \
        } \
      else \
        { \
        invalid_action; \
        } \
      } \
    else \
      { \
      invalid_action; \
      } \
    } \
  else \
    { \
    invalid_action; \
    } \
  }
759
760 #elif PCRE2_CODE_UNIT_WIDTH == 16
761
/* Validating forward read of one UTF-16 character (16-bit code units).
A unit outside 0xd800-0xdfff is taken as is. A high surrogate (0xd800-0xdbff)
is combined with the following unit only if that unit lies before end and is
a low surrogate (0xdc00-0xdfff). In both cases ptr is advanced past the
character. Anything else (lone low surrogate, unpaired or truncated high
surrogate) executes invalid_action with c and ptr unchanged. ptr[0] is read
without a bounds check. */
#define GETCHARINC_INVALID(c, ptr, end, invalid_action) \
  { \
  if (ptr[0] < 0xd800 || ptr[0] >= 0xe000) \
    c = *ptr++; \
  else if (ptr[0] < 0xdc00 && ptr + 1 < end && ptr[1] >= 0xdc00 && ptr[1] < 0xe000) \
    { \
    c = (((ptr[0] - 0xd800) << 10) | (ptr[1] - 0xdc00)) + 0x10000; \
    ptr += 2; \
    } \
  else \
    { \
    invalid_action; \
    } \
  }
776
/* Validating backward read of one UTF-16 character (16-bit code units).
c is first loaded with ptr[-1] (read without a bounds check). A unit outside
0xd800-0xdfff is accepted and ptr moves back by one. A low surrogate is
combined with the unit before it only if that unit is not below start and is
a high surrogate; ptr then moves back by two. Otherwise invalid_action is
executed with ptr unchanged and c still holding the raw unit ptr[-1]. */
#define GETCHARBACK_INVALID(c, ptr, start, invalid_action) \
  { \
  c = ptr[-1]; \
  if (c < 0xd800 || c >= 0xe000) \
    ptr--; \
  else if (c >= 0xdc00 && ptr - 1 > start && ptr[-2] >= 0xd800 && ptr[-2] < 0xdc00) \
    { \
    c = (((ptr[-2] - 0xd800) << 10) | (c - 0xdc00)) + 0x10000; \
    ptr -= 2; \
    } \
  else \
    { \
    invalid_action; \
    } \
  }
792
793
794 #elif PCRE2_CODE_UNIT_WIDTH == 32
795
/* Validating forward read of one UTF-32 character (32-bit code units).
The unit at ptr is accepted, stored in c and skipped when it is below 0xd800
or in the range 0xe000-0x10ffff; surrogates and values of 0x110000 or above
execute invalid_action with c and ptr unchanged. The end argument is not
used by this variant. */
#define GETCHARINC_INVALID(c, ptr, end, invalid_action) \
  { \
  if (ptr[0] < 0xd800 || (ptr[0] >= 0xe000 && ptr[0] < 0x110000)) \
    c = *ptr++; \
  else \
    { \
    invalid_action; \
    } \
  }
805
/* Validating backward read of one UTF-32 character (32-bit code units).
c always receives ptr[-1]. ptr moves back by one when that unit is below
0xd800 or in the range 0xe000-0x10ffff; otherwise invalid_action is executed
with ptr unchanged (c still holds the rejected unit). The start argument is
not used by this variant. */
#define GETCHARBACK_INVALID(c, ptr, start, invalid_action) \
  { \
  c = ptr[-1]; \
  if (ptr[-1] < 0xd800 || (ptr[-1] >= 0xe000 && ptr[-1] < 0x110000)) \
    ptr--; \
  else \
    { \
    invalid_action; \
    } \
  }
816
817 #endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
818 #endif /* SUPPORT_UNICODE */
819
820 static PCRE2_SPTR bracketend(PCRE2_SPTR cc)
821 {
822 SLJIT_ASSERT((*cc >= OP_ASSERT && *cc <= OP_ASSERTBACK_NOT) || (*cc >= OP_ONCE && *cc <= OP_SCOND));
823 do cc += GET(cc, 1); while (*cc == OP_ALT);
824 SLJIT_ASSERT(*cc >= OP_KET && *cc <= OP_KETRPOS);
825 cc += 1 + LINK_SIZE;
826 return cc;
827 }
828
829 static int no_alternatives(PCRE2_SPTR cc)
830 {
831 int count = 0;
832 SLJIT_ASSERT((*cc >= OP_ASSERT && *cc <= OP_ASSERTBACK_NOT) || (*cc >= OP_ONCE && *cc <= OP_SCOND));
833 do
834 {
835 cc += GET(cc, 1);
836 count++;
837 }
838 while (*cc == OP_ALT);
839 SLJIT_ASSERT(*cc >= OP_KET && *cc <= OP_KETRPOS);
840 return count;
841 }
842
843 /* Functions which might need modification for all new supported opcodes:
844 next_opcode
845 check_opcode_types
846 set_private_data_ptrs
847 get_framesize
848 init_frame
849 get_recurse_data_length
850 copy_recurse_data
851 compile_matchingpath
852 compile_backtrackingpath
853 */
854
/* Returns a pointer to the opcode that follows the one at cc, or NULL when
the opcode cannot be handled (OP_ANYBYTE in UTF mode, or an opcode that is
not listed here).

common  only its utf flag is consulted (and only with SUPPORT_UNICODE)
cc      points to the opcode to be skipped

Opcodes are grouped by how their length is obtained:
 - fixed length, taken from the OP_lengths table;
 - single character opcodes, where the table length is extended by the extra
   code units of a multi-unit UTF character;
 - OP_TYPE* repeats, which advance by one unit less than the table length;
 - opcodes that store their own length or a name length in the byte code. */
static PCRE2_SPTR next_opcode(compiler_common *common, PCRE2_SPTR cc)
{
SLJIT_UNUSED_ARG(common);
switch(*cc)
  {
  /* Fixed length opcodes. */
  case OP_SOD:
  case OP_SOM:
  case OP_SET_SOM:
  case OP_NOT_WORD_BOUNDARY:
  case OP_WORD_BOUNDARY:
  case OP_NOT_DIGIT:
  case OP_DIGIT:
  case OP_NOT_WHITESPACE:
  case OP_WHITESPACE:
  case OP_NOT_WORDCHAR:
  case OP_WORDCHAR:
  case OP_ANY:
  case OP_ALLANY:
  case OP_NOTPROP:
  case OP_PROP:
  case OP_ANYNL:
  case OP_NOT_HSPACE:
  case OP_HSPACE:
  case OP_NOT_VSPACE:
  case OP_VSPACE:
  case OP_EXTUNI:
  case OP_EODN:
  case OP_EOD:
  case OP_CIRC:
  case OP_CIRCM:
  case OP_DOLL:
  case OP_DOLLM:
  case OP_CRSTAR:
  case OP_CRMINSTAR:
  case OP_CRPLUS:
  case OP_CRMINPLUS:
  case OP_CRQUERY:
  case OP_CRMINQUERY:
  case OP_CRRANGE:
  case OP_CRMINRANGE:
  case OP_CRPOSSTAR:
  case OP_CRPOSPLUS:
  case OP_CRPOSQUERY:
  case OP_CRPOSRANGE:
  case OP_CLASS:
  case OP_NCLASS:
  case OP_REF:
  case OP_REFI:
  case OP_DNREF:
  case OP_DNREFI:
  case OP_RECURSE:
  case OP_CALLOUT:
  case OP_ALT:
  case OP_KET:
  case OP_KETRMAX:
  case OP_KETRMIN:
  case OP_KETRPOS:
  case OP_REVERSE:
  case OP_ASSERT:
  case OP_ASSERT_NOT:
  case OP_ASSERTBACK:
  case OP_ASSERTBACK_NOT:
  case OP_ONCE:
  case OP_SCRIPT_RUN:
  case OP_BRA:
  case OP_BRAPOS:
  case OP_CBRA:
  case OP_CBRAPOS:
  case OP_COND:
  case OP_SBRA:
  case OP_SBRAPOS:
  case OP_SCBRA:
  case OP_SCBRAPOS:
  case OP_SCOND:
  case OP_CREF:
  case OP_DNCREF:
  case OP_RREF:
  case OP_DNRREF:
  case OP_FALSE:
  case OP_TRUE:
  case OP_BRAZERO:
  case OP_BRAMINZERO:
  case OP_BRAPOSZERO:
  case OP_PRUNE:
  case OP_SKIP:
  case OP_THEN:
  case OP_COMMIT:
  case OP_FAIL:
  case OP_ACCEPT:
  case OP_ASSERT_ACCEPT:
  case OP_CLOSE:
  case OP_SKIPZERO:
  return cc + PRIV(OP_lengths)[*cc];

  /* Opcodes whose last operand is a character. */
  case OP_CHAR:
  case OP_CHARI:
  case OP_NOT:
  case OP_NOTI:
  case OP_STAR:
  case OP_MINSTAR:
  case OP_PLUS:
  case OP_MINPLUS:
  case OP_QUERY:
  case OP_MINQUERY:
  case OP_UPTO:
  case OP_MINUPTO:
  case OP_EXACT:
  case OP_POSSTAR:
  case OP_POSPLUS:
  case OP_POSQUERY:
  case OP_POSUPTO:
  case OP_STARI:
  case OP_MINSTARI:
  case OP_PLUSI:
  case OP_MINPLUSI:
  case OP_QUERYI:
  case OP_MINQUERYI:
  case OP_UPTOI:
  case OP_MINUPTOI:
  case OP_EXACTI:
  case OP_POSSTARI:
  case OP_POSPLUSI:
  case OP_POSQUERYI:
  case OP_POSUPTOI:
  case OP_NOTSTAR:
  case OP_NOTMINSTAR:
  case OP_NOTPLUS:
  case OP_NOTMINPLUS:
  case OP_NOTQUERY:
  case OP_NOTMINQUERY:
  case OP_NOTUPTO:
  case OP_NOTMINUPTO:
  case OP_NOTEXACT:
  case OP_NOTPOSSTAR:
  case OP_NOTPOSPLUS:
  case OP_NOTPOSQUERY:
  case OP_NOTPOSUPTO:
  case OP_NOTSTARI:
  case OP_NOTMINSTARI:
  case OP_NOTPLUSI:
  case OP_NOTMINPLUSI:
  case OP_NOTQUERYI:
  case OP_NOTMINQUERYI:
  case OP_NOTUPTOI:
  case OP_NOTMINUPTOI:
  case OP_NOTEXACTI:
  case OP_NOTPOSSTARI:
  case OP_NOTPOSPLUSI:
  case OP_NOTPOSQUERYI:
  case OP_NOTPOSUPTOI:
  cc += PRIV(OP_lengths)[*cc];
#ifdef SUPPORT_UNICODE
  /* cc[-1] is the first code unit of the character operand: when it starts
  a multi-unit UTF sequence, skip the remaining units as well. */
  if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
  return cc;

  /* Special cases. */
  /* NOTE(review): stopping one unit short presumably leaves cc on the
  character-type operand of the repeat so that it is visited as an opcode of
  its own - confirm against the OP_lengths table. */
  case OP_TYPESTAR:
  case OP_TYPEMINSTAR:
  case OP_TYPEPLUS:
  case OP_TYPEMINPLUS:
  case OP_TYPEQUERY:
  case OP_TYPEMINQUERY:
  case OP_TYPEUPTO:
  case OP_TYPEMINUPTO:
  case OP_TYPEEXACT:
  case OP_TYPEPOSSTAR:
  case OP_TYPEPOSPLUS:
  case OP_TYPEPOSQUERY:
  case OP_TYPEPOSUPTO:
  return cc + PRIV(OP_lengths)[*cc] - 1;

  case OP_ANYBYTE:
#ifdef SUPPORT_UNICODE
  /* Not handled in UTF mode: report it to the caller with NULL. */
  if (common->utf) return NULL;
#endif
  return cc + 1;

  /* The total length is stored inside the opcode, after two link-sized fields. */
  case OP_CALLOUT_STR:
  return cc + GET(cc, 1 + 2*LINK_SIZE);

#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
  /* The total length is stored right after the opcode. */
  case OP_XCLASS:
  return cc + GET(cc, 1);
#endif

  /* cc[1] is added to three fixed units (presumably the opcode, the length
  unit itself and a terminator of the name - TODO confirm). */
  case OP_MARK:
  case OP_COMMIT_ARG:
  case OP_PRUNE_ARG:
  case OP_SKIP_ARG:
  case OP_THEN_ARG:
  return cc + 1 + 2 + cc[1];

  default:
  /* All opcodes are supported now! */
  SLJIT_UNREACHABLE();
  return NULL;
  }
}
1054
/* Pre-scan of the compiled pattern in the range [cc, ccend).

For the opcodes handled explicitly below it records facts in 'common':
  - flags (has_set_som, might_be_empty, has_then, has_skip_arg,
    has_skip_in_assert_back),
  - optimized_cbracket[] entries cleared for capture groups that are
    referenced or are possessive capturing brackets,
  - extra words reserved by advancing common->ovector_start (iref_ptr,
    recursive_head_ptr, capture_last_ptr, mark_ptr), each reserved only once.
Every other opcode is stepped over with next_opcode().

Returns FALSE when next_opcode() returns NULL, or when a conditional group
begins with a callout; returns TRUE otherwise. */
1055 static BOOL check_opcode_types(compiler_common *common, PCRE2_SPTR cc, PCRE2_SPTR ccend)
1056 {
1057 int count;
1058 PCRE2_SPTR slot;
/* Furthest end of any OP_ASSERTBACK seen so far. It starts below cc, so
the "cc < assert_back_end" tests are false until a lookbehind is found. */
1059 PCRE2_SPTR assert_back_end = cc - 1;
1060
1061 /* Calculate important variables (like stack size) and checks whether all opcodes are supported. */
1062 while (cc < ccend)
1063 {
1064 switch(*cc)
1065 {
1066 case OP_SET_SOM:
1067 common->has_set_som = TRUE;
1068 common->might_be_empty = TRUE;
1069 cc += 1;
1070 break;
1071
1072 case OP_REFI:
1073 #ifdef SUPPORT_UNICODE
/* Reserve three words once for caseless back references. */
1074 if (common->iref_ptr == 0)
1075 {
1076 common->iref_ptr = common->ovector_start;
1077 common->ovector_start += 3 * sizeof(sljit_sw);
1078 }
1079 #endif /* SUPPORT_UNICODE */
1080 /* Fall through. */
1081 case OP_REF:
/* A referenced capture group loses its "optimized" status. */
1082 common->optimized_cbracket[GET2(cc, 1)] = 0;
1083 cc += 1 + IMM2_SIZE;
1084 break;
1085
1086 case OP_CBRAPOS:
1087 case OP_SCBRAPOS:
1088 common->optimized_cbracket[GET2(cc, 1 + LINK_SIZE)] = 0;
1089 cc += 1 + LINK_SIZE + IMM2_SIZE;
1090 break;
1091
1092 case OP_COND:
1093 case OP_SCOND:
1094 /* Only AUTO_CALLOUT can insert this opcode. We do
1095 not intend to support this case. */
1096 if (cc[1 + LINK_SIZE] == OP_CALLOUT || cc[1 + LINK_SIZE] == OP_CALLOUT_STR)
1097 return FALSE;
1098 cc += 1 + LINK_SIZE;
1099 break;
1100
1101 case OP_CREF:
1102 common->optimized_cbracket[GET2(cc, 1)] = 0;
1103 cc += 1 + IMM2_SIZE;
1104 break;
1105
1106 case OP_DNREF:
1107 case OP_DNREFI:
1108 case OP_DNCREF:
/* Reference by name: walk 'count' consecutive name table entries, starting
at the entry index stored in the opcode, and clear each group number. */
1109 count = GET2(cc, 1 + IMM2_SIZE);
1110 slot = common->name_table + GET2(cc, 1) * common->name_entry_size;
1111 while (count-- > 0)
1112 {
1113 common->optimized_cbracket[GET2(slot, 0)] = 0;
1114 slot += common->name_entry_size;
1115 }
1116 cc += 1 + 2 * IMM2_SIZE;
1117 break;
1118
1119 case OP_RECURSE:
1120 /* Set its value only once. */
1121 if (common->recursive_head_ptr == 0)
1122 {
1123 common->recursive_head_ptr = common->ovector_start;
1124 common->ovector_start += sizeof(sljit_sw);
1125 }
1126 cc += 1 + LINK_SIZE;
1127 break;
1128
1129 case OP_CALLOUT:
1130 case OP_CALLOUT_STR:
/* Callouts need one word for the last capture, reserved once. */
1131 if (common->capture_last_ptr == 0)
1132 {
1133 common->capture_last_ptr = common->ovector_start;
1134 common->ovector_start += sizeof(sljit_sw);
1135 }
/* OP_CALLOUT has a fixed length; OP_CALLOUT_STR stores its own length. */
1136 cc += (*cc == OP_CALLOUT) ? PRIV(OP_lengths)[OP_CALLOUT] : GET(cc, 1 + 2*LINK_SIZE);
1137 break;
1138
1139 case OP_ASSERTBACK:
/* Remember how far the lookbehind extends; only the opcode header is
skipped, so the body is still scanned by this loop. */
1140 slot = bracketend(cc);
1141 if (slot > assert_back_end)
1142 assert_back_end = slot;
1143 cc += 1 + LINK_SIZE;
1144 break;
1145
1146 case OP_THEN_ARG:
1147 common->has_then = TRUE;
/* Any non-zero value marks that a control head is needed; other functions
only test this field against 0.
NOTE(review): presumably replaced by a real offset later - confirm. */
1148 common->control_head_ptr = 1;
1149 /* Fall through. */
1150
1151 case OP_COMMIT_ARG:
1152 case OP_PRUNE_ARG:
1153 case OP_MARK:
/* Verbs with a name argument need one word for the mark, reserved once. */
1154 if (common->mark_ptr == 0)
1155 {
1156 common->mark_ptr = common->ovector_start;
1157 common->ovector_start += sizeof(sljit_sw);
1158 }
/* Opcode + length unit + name of cc[1] units + one more unit. */
1159 cc += 1 + 2 + cc[1];
1160 break;
1161
1162 case OP_THEN:
1163 common->has_then = TRUE;
1164 common->control_head_ptr = 1;
1165 cc += 1;
1166 break;
1167
1168 case OP_SKIP:
1169 if (cc < assert_back_end)
1170 common->has_skip_in_assert_back = TRUE;
1171 cc += 1;
1172 break;
1173
1174 case OP_SKIP_ARG:
1175 common->control_head_ptr = 1;
1176 common->has_skip_arg = TRUE;
1177 if (cc < assert_back_end)
1178 common->has_skip_in_assert_back = TRUE;
1179 cc += 1 + 2 + cc[1];
1180 break;
1181
1182 default:
/* Everything else carries no information needed here. A NULL result from
next_opcode() makes the whole check fail. */
1183 cc = next_opcode(common, cc);
1184 if (cc == NULL)
1185 return FALSE;
1186 break;
1187 }
1188 }
1189 return TRUE;
1190 }
1191
1192 static BOOL is_accelerated_repeat(PCRE2_SPTR cc)
1193 {
1194 switch(*cc)
1195 {
1196 case OP_TYPESTAR:
1197 case OP_TYPEMINSTAR:
1198 case OP_TYPEPLUS:
1199 case OP_TYPEMINPLUS:
1200 case OP_TYPEPOSSTAR:
1201 case OP_TYPEPOSPLUS:
1202 return (cc[1] != OP_ANYNL && cc[1] != OP_EXTUNI);
1203
1204 case OP_STAR:
1205 case OP_MINSTAR:
1206 case OP_PLUS:
1207 case OP_MINPLUS:
1208 case OP_POSSTAR:
1209 case OP_POSPLUS:
1210
1211 case OP_STARI:
1212 case OP_MINSTARI:
1213 case OP_PLUSI:
1214 case OP_MINPLUSI:
1215 case OP_POSSTARI:
1216 case OP_POSPLUSI:
1217
1218 case OP_NOTSTAR:
1219 case OP_NOTMINSTAR:
1220 case OP_NOTPLUS:
1221 case OP_NOTMINPLUS:
1222 case OP_NOTPOSSTAR:
1223 case OP_NOTPOSPLUS:
1224
1225 case OP_NOTSTARI:
1226 case OP_NOTMINSTARI:
1227 case OP_NOTPLUSI:
1228 case OP_NOTMINPLUSI:
1229 case OP_NOTPOSSTARI:
1230 case OP_NOTPOSPLUSI:
1231 return TRUE;
1232
1233 case OP_CLASS:
1234 case OP_NCLASS:
1235 #if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
1236 case OP_XCLASS:
1237 cc += (*cc == OP_XCLASS) ? GET(cc, 1) : (int)(1 + (32 / sizeof(PCRE2_UCHAR)));
1238 #else
1239 cc += (1 + (32 / sizeof(PCRE2_UCHAR)));
1240 #endif
1241
1242 switch(*cc)
1243 {
1244 case OP_CRSTAR:
1245 case OP_CRMINSTAR:
1246 case OP_CRPLUS:
1247 case OP_CRMINPLUS:
1248 case OP_CRPOSSTAR:
1249 case OP_CRPOSPLUS:
1250 return TRUE;
1251 }
1252 break;
1253 }
1254 return FALSE;
1255 }
1256
1257 static SLJIT_INLINE BOOL detect_fast_forward_skip(compiler_common *common, int *private_data_start)
1258 {
1259 PCRE2_SPTR cc = common->start;
1260 PCRE2_SPTR end;
1261
1262 /* Skip not repeated brackets. */
1263 while (TRUE)
1264 {
1265 switch(*cc)
1266 {
1267 case OP_SOD:
1268 case OP_SOM:
1269 case OP_SET_SOM:
1270 case OP_NOT_WORD_BOUNDARY:
1271 case OP_WORD_BOUNDARY:
1272 case OP_EODN:
1273 case OP_EOD:
1274 case OP_CIRC:
1275 case OP_CIRCM:
1276 case OP_DOLL:
1277 case OP_DOLLM:
1278 /* Zero width assertions. */
1279 cc++;
1280 continue;
1281 }
1282
1283 if (*cc != OP_BRA && *cc != OP_CBRA)
1284 break;
1285
1286 end = cc + GET(cc, 1);
1287 if (*end != OP_KET || PRIVATE_DATA(end) != 0)
1288 return FALSE;
1289 if (*cc == OP_CBRA)
1290 {
1291 if (common->optimized_cbracket[GET2(cc, 1 + LINK_SIZE)] == 0)
1292 return FALSE;
1293 cc += IMM2_SIZE;
1294 }
1295 cc += 1 + LINK_SIZE;
1296 }
1297
1298 if (is_accelerated_repeat(cc))
1299 {
1300 common->fast_forward_bc_ptr = cc;
1301 common->private_data_ptrs[(cc + 1) - common->start] = *private_data_start;
1302 *private_data_start += sizeof(sljit_sw);
1303 return TRUE;
1304 }
1305 return FALSE;
1306 }
1307
/* Reserves "fast fail" slots for accelerated repeats that start an
alternative of the bracket at cc (which must be OP_BRA or OP_CBRA).

For every alternative of the bracket: leading zero width assertions are
skipped; if depth > 0 and a nested OP_BRA / OP_CBRA follows, the function
recurses into it with depth - 1; if the alternative starts with an item
accepted by is_accelerated_repeat(), one sljit_sw slot is taken from
*private_data_start and stored in private_data_ptrs at the index of the
code unit after the repeat opcode. common->fast_fail_start_ptr is set to
the first slot reserved and common->fast_fail_end_ptr to the offset after
the last one.

Nothing is done when a capturing bracket has its optimized_cbracket entry
cleared, or when the bracket does not end with an OP_KET that has no
private data. The walk stops as soon as *private_data_start exceeds
SLJIT_MAX_LOCAL_SIZE. */
1308 static SLJIT_INLINE void detect_fast_fail(compiler_common *common, PCRE2_SPTR cc, int *private_data_start, sljit_s32 depth)
1309 {
1310 PCRE2_SPTR next_alt;
1311
1312 SLJIT_ASSERT(*cc == OP_BRA || *cc == OP_CBRA);
1313
1314 if (*cc == OP_CBRA && common->optimized_cbracket[GET2(cc, 1 + LINK_SIZE)] == 0)
1315 return;
1316
/* next_alt temporarily points to the closing opcode of the whole bracket. */
1317 next_alt = bracketend(cc) - (1 + LINK_SIZE);
1318 if (*next_alt != OP_KET || PRIVATE_DATA(next_alt) != 0)
1319 return;
1320
1321 do
1322 {
/* Remember where the next alternative (or the closing KET) starts
before cc is moved into the body of the current alternative. */
1323 next_alt = cc + GET(cc, 1);
1324
/* Skip the header: OP_ALT and OP_BRA have only a link, OP_CBRA also
has a group number. */
1325 cc += 1 + LINK_SIZE + ((*cc == OP_CBRA) ? IMM2_SIZE : 0);
1326
1327 while (TRUE)
1328 {
1329 switch(*cc)
1330 {
1331 case OP_SOD:
1332 case OP_SOM:
1333 case OP_SET_SOM:
1334 case OP_NOT_WORD_BOUNDARY:
1335 case OP_WORD_BOUNDARY:
1336 case OP_EODN:
1337 case OP_EOD:
1338 case OP_CIRC:
1339 case OP_CIRCM:
1340 case OP_DOLL:
1341 case OP_DOLLM:
1342 /* Zero width assertions. */
1343 cc++;
1344 continue;
1345 }
1346 break;
1347 }
1348
1349 if (depth > 0 && (*cc == OP_BRA || *cc == OP_CBRA))
1350 detect_fast_fail(common, cc, private_data_start, depth - 1);
1351
1352 if (is_accelerated_repeat(cc))
1353 {
1354 common->private_data_ptrs[(cc + 1) - common->start] = *private_data_start;
1355
1356 if (common->fast_fail_start_ptr == 0)
1357 common->fast_fail_start_ptr = *private_data_start;
1358
1359 *private_data_start += sizeof(sljit_sw);
1360 common->fast_fail_end_ptr = *private_data_start;
1361
1362 if (*private_data_start > SLJIT_MAX_LOCAL_SIZE)
1363 return;
1364 }
1365
1366 cc = next_alt;
1367 }
1368 while (*cc == OP_ALT);
1369 }
1370
1371 static int get_class_iterator_size(PCRE2_SPTR cc)
1372 {
1373 sljit_u32 min;
1374 sljit_u32 max;
1375 switch(*cc)
1376 {
1377 case OP_CRSTAR:
1378 case OP_CRPLUS:
1379 return 2;
1380
1381 case OP_CRMINSTAR:
1382 case OP_CRMINPLUS:
1383 case OP_CRQUERY:
1384 case OP_CRMINQUERY:
1385 return 1;
1386
1387 case OP_CRRANGE:
1388 case OP_CRMINRANGE:
1389 min = GET2(cc, 1);
1390 max = GET2(cc, 1 + IMM2_SIZE);
1391 if (max == 0)
1392 return (*cc == OP_CRRANGE) ? 2 : 1;
1393 max -= min;
1394 if (max > 2)
1395 max = 2;
1396 return max;
1397
1398 default:
1399 return 0;
1400 }
1401 }
1402
/* Detects a bracket which is followed by identical copies of itself and
records the sequence as a single counted repeat.

'begin' points to the opening opcode of the bracket. Only brackets closed
by a plain OP_KET are considered. The result is stored in the three
private_data_ptrs entries that follow the index of that closing OP_KET:
  [+0] number of code units to skip to get past the copies,
  [+1] repeat type: OP_EXACT, OP_UPTO or OP_MINUPTO,
  [+2] repeat count.
For a "minimum copies followed by optional copies" sequence, the optional
part is recorded at the OP_KET of the last mandatory copy.

Returns TRUE when a repeat was recorded (now or by an earlier call),
FALSE otherwise.
NOTE(review): the copies are presumably produced when the pattern compiler
expands a counted repeat - confirm against the compiler. */
1403 static BOOL detect_repeat(compiler_common *common, PCRE2_SPTR begin)
1404 {
1405 PCRE2_SPTR end = bracketend(begin);
1406 PCRE2_SPTR next;
1407 PCRE2_SPTR next_end;
1408 PCRE2_SPTR max_end;
1409 PCRE2_UCHAR type;
1410 sljit_sw length = end - begin;
1411 sljit_s32 min, max, i;
1412
1413 /* Detect fixed iterations first. */
1414 if (end[-(1 + LINK_SIZE)] != OP_KET)
1415 return FALSE;
1416
1417 /* Already detected repeat. */
1418 if (common->private_data_ptrs[end - common->start - LINK_SIZE] != 0)
1419 return TRUE;
1420
/* Count how many byte-for-byte identical brackets follow each other. */
1421 next = end;
1422 min = 1;
1423 while (1)
1424 {
1425 if (*next != *begin)
1426 break;
1427 next_end = bracketend(next);
1428 if (next_end - next != length || memcmp(begin, next, IN_UCHARS(length)) != 0)
1429 break;
1430 next = next_end;
1431 min++;
1432 }
1433
/* Exactly two copies are not converted. */
1434 if (min == 2)
1435 return FALSE;
1436
1437 max = 0;
1438 max_end = next;
1439 if (*next == OP_BRAZERO || *next == OP_BRAMINZERO)
1440 {
1441 type = *next;
/* Optional copies: each is the zero opcode, an OP_BRA wrapper and a copy
of the bracket. Only the opening parts are consumed here; the closing
OP_KETs of the wrappers are verified below. */
1442 while (1)
1443 {
1444 if (next[0] != type || next[1] != OP_BRA || next[2 + LINK_SIZE] != *begin)
1445 break;
1446 next_end = bracketend(next + 2 + LINK_SIZE);
1447 if (next_end - next != (length + 2 + LINK_SIZE) || memcmp(begin, next + 2 + LINK_SIZE, IN_UCHARS(length)) != 0)
1448 break;
1449 next = next_end;
1450 max++;
1451 }
1452
/* The innermost optional copy has no OP_BRA wrapper. */
1453 if (next[0] == type && next[1] == *begin && max >= 1)
1454 {
1455 next_end = bracketend(next + 1);
1456 if (next_end - next == (length + 1) && memcmp(begin, next + 1, IN_UCHARS(length)) == 0)
1457 {
/* One OP_KET must follow for every wrapper opened above. */
1458 for (i = 0; i < max; i++, next_end += 1 + LINK_SIZE)
1459 if (*next_end != OP_KET)
1460 break;
1461
1462 if (i == max)
1463 {
1464 common->private_data_ptrs[max_end - common->start - LINK_SIZE] = next_end - max_end;
1465 common->private_data_ptrs[max_end - common->start - LINK_SIZE + 1] = (type == OP_BRAZERO) ? OP_UPTO : OP_MINUPTO;
1466 /* +2 the original and the last. */
1467 common->private_data_ptrs[max_end - common->start - LINK_SIZE + 2] = max + 2;
1468 if (min == 1)
1469 return TRUE;
/* The last mandatory copy now belongs to the optional repeat, so the
exact part is one shorter and ends where that copy starts. */
1470 min--;
1471 max_end -= (1 + LINK_SIZE) + GET(max_end, -LINK_SIZE);
1472 }
1473 }
1474 }
1475 }
1476
1477 if (min >= 3)
1478 {
1479 common->private_data_ptrs[end - common->start - LINK_SIZE] = max_end - end;
1480 common->private_data_ptrs[end - common->start - LINK_SIZE + 1] = OP_EXACT;
1481 common->private_data_ptrs[end - common->start - LINK_SIZE + 2] = min;
1482 return TRUE;
1483 }
1484
1485 return FALSE;
1486 }
1487
/* Groups of case labels shared by the functions which assign and count
private data. Each macro expands to a list of "case OP_xxx:" labels and
is used in place of those labels inside a switch statement. The number in
the name is the number of private words set_private_data_ptrs() reserves
for the group; the A / B suffix separates opcodes with and without a
repeat count operand. */

/* Single character iterators with one private word. */
1488 #define CASE_ITERATOR_PRIVATE_DATA_1 \
1489 case OP_MINSTAR: \
1490 case OP_MINPLUS: \
1491 case OP_QUERY: \
1492 case OP_MINQUERY: \
1493 case OP_MINSTARI: \
1494 case OP_MINPLUSI: \
1495 case OP_QUERYI: \
1496 case OP_MINQUERYI: \
1497 case OP_NOTMINSTAR: \
1498 case OP_NOTMINPLUS: \
1499 case OP_NOTQUERY: \
1500 case OP_NOTMINQUERY: \
1501 case OP_NOTMINSTARI: \
1502 case OP_NOTMINPLUSI: \
1503 case OP_NOTQUERYI: \
1504 case OP_NOTMINQUERYI:
1505
/* Single character iterators with two private words, no count operand. */
1506 #define CASE_ITERATOR_PRIVATE_DATA_2A \
1507 case OP_STAR: \
1508 case OP_PLUS: \
1509 case OP_STARI: \
1510 case OP_PLUSI: \
1511 case OP_NOTSTAR: \
1512 case OP_NOTPLUS: \
1513 case OP_NOTSTARI: \
1514 case OP_NOTPLUSI:
1515
/* Single character iterators with two private words and a count operand
of IMM2_SIZE code units. */
1516 #define CASE_ITERATOR_PRIVATE_DATA_2B \
1517 case OP_UPTO: \
1518 case OP_MINUPTO: \
1519 case OP_UPTOI: \
1520 case OP_MINUPTOI: \
1521 case OP_NOTUPTO: \
1522 case OP_NOTMINUPTO: \
1523 case OP_NOTUPTOI: \
1524 case OP_NOTMINUPTOI:
1525
/* Character type iterators with one private word. */
1526 #define CASE_ITERATOR_TYPE_PRIVATE_DATA_1 \
1527 case OP_TYPEMINSTAR: \
1528 case OP_TYPEMINPLUS: \
1529 case OP_TYPEQUERY: \
1530 case OP_TYPEMINQUERY:
1531
/* Character type iterators with two private words, no count operand. */
1532 #define CASE_ITERATOR_TYPE_PRIVATE_DATA_2A \
1533 case OP_TYPESTAR: \
1534 case OP_TYPEPLUS:
1535
/* Character type iterators with two private words and a count operand. */
1536 #define CASE_ITERATOR_TYPE_PRIVATE_DATA_2B \
1537 case OP_TYPEUPTO: \
1538 case OP_TYPEMINUPTO:
1539
1540 static void set_private_data_ptrs(compiler_common *common, int *private_data_start, PCRE2_SPTR ccend)
1541 {
1542 PCRE2_SPTR cc = common->start;
1543 PCRE2_SPTR alternative;
1544 PCRE2_SPTR end = NULL;
1545 int private_data_ptr = *private_data_start;
1546 int space, size, bracketlen;
1547 BOOL repeat_check = TRUE;
1548
1549 while (cc < ccend)
1550 {
1551 space = 0;
1552 size = 0;
1553 bracketlen = 0;
1554 if (private_data_ptr > SLJIT_MAX_LOCAL_SIZE)
1555 break;
1556
1557 if (repeat_check && (*cc == OP_ONCE || *cc == OP_BRA || *cc == OP_CBRA || *cc == OP_COND))
1558 {
1559 if (detect_repeat(common, cc))
1560 {
1561 /* These brackets are converted to repeats, so no global
1562 based single character repeat is allowed. */
1563 if (cc >= end)
1564 end = bracketend(cc);
1565 }
1566 }
1567 repeat_check = TRUE;
1568
1569 switch(*cc)
1570 {
1571 case OP_KET:
1572 if (common->private_data_ptrs[cc + 1 - common->start] != 0)
1573 {
1574 common->private_data_ptrs[cc - common->start] = private_data_ptr;
1575 private_data_ptr += sizeof(sljit_sw);
1576 cc += common->private_data_ptrs[cc + 1 - common->start];
1577 }
1578 cc += 1 + LINK_SIZE;
1579 break;
1580
1581 case OP_ASSERT:
1582 case OP_ASSERT_NOT:
1583 case OP_ASSERTBACK:
1584 case OP_ASSERTBACK_NOT:
1585 case OP_ONCE:
1586 case OP_SCRIPT_RUN:
1587 case OP_BRAPOS:
1588 case OP_SBRA:
1589 case OP_SBRAPOS:
1590 case OP_SCOND:
1591 common->private_data_ptrs[cc - common->start] = private_data_ptr;
1592 private_data_ptr += sizeof(sljit_sw);
1593 bracketlen = 1 + LINK_SIZE;
1594 break;
1595
1596 case OP_CBRAPOS:
1597 case OP_SCBRAPOS:
1598 common->private_data_ptrs[cc - common->start] = private_data_ptr;
1599 private_data_ptr += sizeof(sljit_sw);
1600 bracketlen = 1 + LINK_SIZE + IMM2_SIZE;
1601 break;
1602
1603 case OP_COND:
1604 /* Might be a hidden SCOND. */
1605 alternative = cc + GET(cc, 1);
1606 if (*alternative == OP_KETRMAX || *alternative == OP_KETRMIN)
1607 {
1608 common->private_data_ptrs[cc - common->start] = private_data_ptr;
1609 private_data_ptr += sizeof(sljit_sw);
1610 }
1611 bracketlen = 1 + LINK_SIZE;
1612 break;
1613
1614 case OP_BRA:
1615 bracketlen = 1 + LINK_SIZE;
1616 break;
1617
1618 case OP_CBRA:
1619 case OP_SCBRA:
1620 bracketlen = 1 + LINK_SIZE + IMM2_SIZE;
1621 break;
1622
1623 case OP_BRAZERO:
1624 case OP_BRAMINZERO:
1625 case OP_BRAPOSZERO:
1626 repeat_check = FALSE;
1627 size = 1;
1628 break;
1629
1630 CASE_ITERATOR_PRIVATE_DATA_1
1631 space = 1;
1632 size = -2;
1633 break;
1634
1635 CASE_ITERATOR_PRIVATE_DATA_2A
1636 space = 2;
1637 size = -2;
1638 break;
1639
1640 CASE_ITERATOR_PRIVATE_DATA_2B
1641 space = 2;
1642 size = -(2 + IMM2_SIZE);
1643 break;
1644
1645 CASE_ITERATOR_TYPE_PRIVATE_DATA_1
1646 space = 1;
1647 size = 1;
1648 break;
1649
1650 CASE_ITERATOR_TYPE_PRIVATE_DATA_2A
1651 if (cc[1] != OP_ANYNL && cc[1] != OP_EXTUNI)
1652 space = 2;
1653 size = 1;
1654 break;
1655
1656 case OP_TYPEUPTO:
1657 if (cc[1 + IMM2_SIZE] != OP_ANYNL && cc[1 + IMM2_SIZE] != OP_EXTUNI)
1658 space = 2;
1659 size = 1 + IMM2_SIZE;
1660 break;
1661
1662 case OP_TYPEMINUPTO:
1663 space = 2;
1664 size = 1 + IMM2_SIZE;
1665 break;
1666
1667 case OP_CLASS:
1668 case OP_NCLASS:
1669 space = get_class_iterator_size(cc + size);
1670 size = 1 + 32 / sizeof(PCRE2_UCHAR);
1671 break;
1672
1673 #if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
1674 case OP_XCLASS:
1675 space = get_class_iterator_size(cc + size);
1676 size = GET(cc, 1);
1677 break;
1678 #endif
1679
1680 default:
1681 cc = next_opcode(common, cc);
1682 SLJIT_ASSERT(cc != NULL);
1683 break;
1684 }
1685
1686 /* Character iterators, which are not inside a repeated bracket,
1687 gets a private slot instead of allocating it on the stack. */
1688 if (space > 0 && cc >= end)
1689 {
1690 common->private_data_ptrs[cc - common->start] = private_data_ptr;
1691 private_data_ptr += sizeof(sljit_sw) * space;
1692 }
1693
1694 if (size != 0)
1695 {
1696 if (size < 0)
1697 {
1698 cc += -size;
1699 #ifdef SUPPORT_UNICODE
1700 if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1701 #endif
1702 }
1703 else
1704 cc += size;
1705 }
1706
1707 if (bracketlen > 0)
1708 {
1709 if (cc >= end)
1710 {
1711 end = bracketend(cc);
1712 if (end[-1 - LINK_SIZE] == OP_KET)
1713 end = NULL;
1714 }
1715 cc += bracketlen;
1716 }
1717 }
1718 *private_data_start = private_data_ptr;
1719 }
1720
/* Computes how many stack words a frame for the code in [cc, ccend) needs.

When ccend is NULL, cc points to a bracket: the range becomes the body of
that bracket, and for a non-recursive possessive capturing bracket
(OP_CBRAPOS / OP_SCBRAPOS) the words for that bracket itself are counted
in advance (3, or 5 when the last capture is tracked).

Counting rules, visible below:
  - the first OP_SET_SOM, the first mark-like verb and the first use of
    the last capture add 2 words each (skipped for som / mark when
    'recursive' is TRUE),
  - OP_RECURSE adds whichever of those three are in use and not counted,
  - every capturing bracket adds 3 words.

*needs_control_head is set to TRUE when a verb with an argument or
OP_THEN is found while common->control_head_ptr is non-zero.

Returns length + 1 when anything has to be saved. Otherwise returns
no_frame if an opcode which sets stack_restore was seen, else no_stack. */
1721 /* Returns with a frame_types (always < 0) if no need for frame. */
1722 static int get_framesize(compiler_common *common, PCRE2_SPTR cc, PCRE2_SPTR ccend, BOOL recursive, BOOL *needs_control_head)
1723 {
1724 int length = 0;
1725 int possessive = 0;
1726 BOOL stack_restore = FALSE;
1727 BOOL setsom_found = recursive;
1728 BOOL setmark_found = recursive;
1729 /* The last capture is a local variable even for recursions. */
1730 BOOL capture_last_found = FALSE;
1731
1732 #if defined DEBUG_FORCE_CONTROL_HEAD && DEBUG_FORCE_CONTROL_HEAD
1733 SLJIT_ASSERT(common->control_head_ptr != 0);
1734 *needs_control_head = TRUE;
1735 #else
1736 *needs_control_head = FALSE;
1737 #endif
1738
1739 if (ccend == NULL)
1740 {
1741 ccend = bracketend(cc) - (1 + LINK_SIZE);
1742 if (!recursive && (*cc == OP_CBRAPOS || *cc == OP_SCBRAPOS))
1743 {
1744 possessive = length = (common->capture_last_ptr != 0) ? 5 : 3;
1745 /* This is correct regardless of common->capture_last_ptr. */
1746 capture_last_found = TRUE;
1747 }
1748 cc = next_opcode(common, cc);
1749 }
1750
1751 SLJIT_ASSERT(cc != NULL);
1752 while (cc < ccend)
1753 switch(*cc)
1754 {
1755 case OP_SET_SOM:
1756 SLJIT_ASSERT(common->has_set_som);
1757 stack_restore = TRUE;
1758 if (!setsom_found)
1759 {
1760 length += 2;
1761 setsom_found = TRUE;
1762 }
1763 cc += 1;
1764 break;
1765
1766 case OP_MARK:
1767 case OP_COMMIT_ARG:
1768 case OP_PRUNE_ARG:
1769 case OP_THEN_ARG:
1770 SLJIT_ASSERT(common->mark_ptr != 0);
1771 stack_restore = TRUE;
1772 if (!setmark_found)
1773 {
1774 length += 2;
1775 setmark_found = TRUE;
1776 }
1777 if (common->control_head_ptr != 0)
1778 *needs_control_head = TRUE;
1779 cc += 1 + 2 + cc[1];
1780 break;
1781
1782 case OP_RECURSE:
1783 stack_restore = TRUE;
1784 if (common->has_set_som && !setsom_found)
1785 {
1786 length += 2;
1787 setsom_found = TRUE;
1788 }
1789 if (common->mark_ptr != 0 && !setmark_found)
1790 {
1791 length += 2;
1792 setmark_found = TRUE;
1793 }
1794 if (common->capture_last_ptr != 0 && !capture_last_found)
1795 {
1796 length += 2;
1797 capture_last_found = TRUE;
1798 }
1799 cc += 1 + LINK_SIZE;
1800 break;
1801
1802 case OP_CBRA:
1803 case OP_CBRAPOS:
1804 case OP_SCBRA:
1805 case OP_SCBRAPOS:
1806 stack_restore = TRUE;
1807 if (common->capture_last_ptr != 0 && !capture_last_found)
1808 {
1809 length += 2;
1810 capture_last_found = TRUE;
1811 }
1812 length += 3;
1813 cc += 1 + LINK_SIZE + IMM2_SIZE;
1814 break;
1815
1816 case OP_THEN:
1817 stack_restore = TRUE;
1818 if (common->control_head_ptr != 0)
1819 *needs_control_head = TRUE;
1820 cc ++;
1821 break;
1822
1823 default:
/* Any opcode which is not listed in this switch sets stack_restore. */
1824 stack_restore = TRUE;
1825 /* Fall through. */
1826
/* The opcodes listed below are stepped over without setting
stack_restore. */
1827 case OP_NOT_WORD_BOUNDARY:
1828 case OP_WORD_BOUNDARY:
1829 case OP_NOT_DIGIT:
1830 case OP_DIGIT:
1831 case OP_NOT_WHITESPACE:
1832 case OP_WHITESPACE:
1833 case OP_NOT_WORDCHAR:
1834 case OP_WORDCHAR:
1835 case OP_ANY:
1836 case OP_ALLANY:
1837 case OP_ANYBYTE:
1838 case OP_NOTPROP:
1839 case OP_PROP:
1840 case OP_ANYNL:
1841 case OP_NOT_HSPACE:
1842 case OP_HSPACE:
1843 case OP_NOT_VSPACE:
1844 case OP_VSPACE:
1845 case OP_EXTUNI:
1846 case OP_EODN:
1847 case OP_EOD:
1848 case OP_CIRC:
1849 case OP_CIRCM:
1850 case OP_DOLL:
1851 case OP_DOLLM:
1852 case OP_CHAR:
1853 case OP_CHARI:
1854 case OP_NOT:
1855 case OP_NOTI:
1856
1857 case OP_EXACT:
1858 case OP_POSSTAR:
1859 case OP_POSPLUS:
1860 case OP_POSQUERY:
1861 case OP_POSUPTO:
1862
1863 case OP_EXACTI:
1864 case OP_POSSTARI:
1865 case OP_POSPLUSI:
1866 case OP_POSQUERYI:
1867 case OP_POSUPTOI:
1868
1869 case OP_NOTEXACT:
1870 case OP_NOTPOSSTAR:
1871 case OP_NOTPOSPLUS:
1872 case OP_NOTPOSQUERY:
1873 case OP_NOTPOSUPTO:
1874
1875 case OP_NOTEXACTI:
1876 case OP_NOTPOSSTARI:
1877 case OP_NOTPOSPLUSI:
1878 case OP_NOTPOSQUERYI:
1879 case OP_NOTPOSUPTOI:
1880
1881 case OP_TYPEEXACT:
1882 case OP_TYPEPOSSTAR:
1883 case OP_TYPEPOSPLUS:
1884 case OP_TYPEPOSQUERY:
1885 case OP_TYPEPOSUPTO:
1886
1887 case OP_CLASS:
1888 case OP_NCLASS:
1889 case OP_XCLASS:
1890
1891 case OP_CALLOUT:
1892 case OP_CALLOUT_STR:
1893
1894 cc = next_opcode(common, cc);
1895 SLJIT_ASSERT(cc != NULL);
1896 break;
1897 }
1898
1899 /* Possessive quantifiers can use a special case. */
/* Nothing was added beyond the words counted in advance for the
possessive bracket (this is also true when both values are 0). */
1900 if (SLJIT_UNLIKELY(possessive == length))
1901 return stack_restore ? no_frame : no_stack;
1902
/* One extra word is counted on top of the saved items. */
1903 if (length > 0)
1904 return length + 1;
1905 return stack_restore ? no_frame : no_stack;
1906 }
1907
/* Emits the machine code which fills a frame for the code in [cc, ccend).

The frame is written to memory addressed relative to STACK_TOP, starting
at STACK(stackpos) and moving down by one sljit_sw per store. The items
written are:
  - for the first OP_SET_SOM, the first mark-like verb and the first use
    of the last capture: a negated offset identifying the variable,
    followed by its current value (2 words),
  - for each capturing bracket: the OVECTOR offset of the group followed
    by both of its current values (3 words),
  - a final 0 word.
These word counts correspond to the amounts added in get_framesize(); the
closing assertion checks that exactly the expected space was used.

When ccend is NULL, cc points to a bracket whose body is processed. A
possessive capturing bracket (OP_CBRAPOS / OP_SCBRAPOS) is not skipped,
so its own group is saved as well. */
1908 static void init_frame(compiler_common *common, PCRE2_SPTR cc, PCRE2_SPTR ccend, int stackpos, int stacktop)
1909 {
1910 DEFINE_COMPILER;
1911 BOOL setsom_found = FALSE;
1912 BOOL setmark_found = FALSE;
1913 /* The last capture is a local variable even for recursions. */
1914 BOOL capture_last_found = FALSE;
1915 int offset;
1916
1917 /* >= 1 + shortest item size (2) */
1918 SLJIT_UNUSED_ARG(stacktop);
1919 SLJIT_ASSERT(stackpos >= stacktop + 2);
1920
/* From here on stackpos is the STACK() offset, not a word index. */
1921 stackpos = STACK(stackpos);
1922 if (ccend == NULL)
1923 {
1924 ccend = bracketend(cc) - (1 + LINK_SIZE);
1925 if (*cc != OP_CBRAPOS && *cc != OP_SCBRAPOS)
1926 cc = next_opcode(common, cc);
1927 }
1928
1929 SLJIT_ASSERT(cc != NULL);
1930 while (cc < ccend)
1931 switch(*cc)
1932 {
1933 case OP_SET_SOM:
1934 SLJIT_ASSERT(common->has_set_som);
1935 if (!setsom_found)
1936 {
/* Identifier word (negated offset), then the saved value. */
1937 OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(0));
1938 OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, SLJIT_IMM, -OVECTOR(0));
1939 stackpos -= (int)sizeof(sljit_sw);
1940 OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, TMP1, 0);
1941 stackpos -= (int)sizeof(sljit_sw);
1942 setsom_found = TRUE;
1943 }
1944 cc += 1;
1945 break;
1946
1947 case OP_MARK:
1948 case OP_COMMIT_ARG:
1949 case OP_PRUNE_ARG:
1950 case OP_THEN_ARG:
1951 SLJIT_ASSERT(common->mark_ptr != 0);
1952 if (!setmark_found)
1953 {
1954 OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->mark_ptr);
1955 OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, SLJIT_IMM, -common->mark_ptr);
1956 stackpos -= (int)sizeof(sljit_sw);
1957 OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, TMP1, 0);
1958 stackpos -= (int)sizeof(sljit_sw);
1959 setmark_found = TRUE;
1960 }
1961 cc += 1 + 2 + cc[1];
1962 break;
1963
1964 case OP_RECURSE:
/* A recursion saves every variable of this kind that is in use and has
not been saved yet. */
1965 if (common->has_set_som && !setsom_found)
1966 {
1967 OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(0));
1968 OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, SLJIT_IMM, -OVECTOR(0));
1969 stackpos -= (int)sizeof(sljit_sw);
1970 OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, TMP1, 0);
1971 stackpos -= (int)sizeof(sljit_sw);
1972 setsom_found = TRUE;
1973 }
1974 if (common->mark_ptr != 0 && !setmark_found)
1975 {
1976 OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->mark_ptr);
1977 OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, SLJIT_IMM, -common->mark_ptr);
1978 stackpos -= (int)sizeof(sljit_sw);
1979 OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, TMP1, 0);
1980 stackpos -= (int)sizeof(sljit_sw);
1981 setmark_found = TRUE;
1982 }
1983 if (common->capture_last_ptr != 0 && !capture_last_found)
1984 {
1985 OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->capture_last_ptr);
1986 OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, SLJIT_IMM, -common->capture_last_ptr);
1987 stackpos -= (int)sizeof(sljit_sw);
1988 OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, TMP1, 0);
1989 stackpos -= (int)sizeof(sljit_sw);
1990 capture_last_found = TRUE;
1991 }
1992 cc += 1 + LINK_SIZE;
1993 break;
1994
1995 case OP_CBRA:
1996 case OP_CBRAPOS:
1997 case OP_SCBRA:
1998 case OP_SCBRAPOS:
1999 if (common->capture_last_ptr != 0 && !capture_last_found)
2000 {
2001 OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->capture_last_ptr);
2002 OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, SLJIT_IMM, -common->capture_last_ptr);
2003 stackpos -= (int)sizeof(sljit_sw);
2004 OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, TMP1, 0);
2005 stackpos -= (int)sizeof(sljit_sw);
2006 capture_last_found = TRUE;
2007 }
/* Each group owns two consecutive OVECTOR entries, hence the shift. The
identifier word is stored here without negation. */
2008 offset = (GET2(cc, 1 + LINK_SIZE)) << 1;
2009 OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, SLJIT_IMM, OVECTOR(offset));
2010 stackpos -= (int)sizeof(sljit_sw);
2011 OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset));
2012 OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset + 1));
2013 OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, TMP1, 0);
2014 stackpos -= (int)sizeof(sljit_sw);
2015 OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, TMP2, 0);
2016 stackpos -= (int)sizeof(sljit_sw);
2017
2018 cc += 1 + LINK_SIZE + IMM2_SIZE;
2019 break;
2020
2021 default:
2022 cc = next_opcode(common, cc);
2023 SLJIT_ASSERT(cc != NULL);
2024 break;
2025 }
2026
/* A zero word closes the frame. */
2027 OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, SLJIT_IMM, 0);
2028 SLJIT_ASSERT(stackpos == STACK(stacktop));
2029 }
2030
/* Number of temporary registers rotated by the delayed memory copy
helpers below. */
2031 #define RECURSE_TMP_REG_COUNT 3
2032
/* State of a sequence of memory to memory copies in which each store is
delayed: a value is loaded into one of the temporary registers and is
written to its destination only when that register is needed again, or
when the sequence is finished. All arrays are indexed by register slot. */
2033 typedef struct delayed_mem_copy_status {
/* Compiler used to emit the instructions. */
2034 struct sljit_compiler *compiler;
/* Base register of the pending store of each slot, -1 when none. */
2035 int store_bases[RECURSE_TMP_REG_COUNT];
/* Offset of the pending store of each slot. */
2036 int store_offsets[RECURSE_TMP_REG_COUNT];
/* Register which holds the loaded value of each slot. */
2037 int tmp_regs[RECURSE_TMP_REG_COUNT];
/* Where the original content of tmp_regs[i] is preserved while the slot is
in use; this is done only when sljit_get_register_index() reports a
negative index for it, otherwise it must be equal to tmp_regs[i]. */
2038 int saved_tmp_regs[RECURSE_TMP_REG_COUNT];
/* Slot to be used by the next copy; advances in round-robin order. */
2039 int next_tmp_reg;
2040 } delayed_mem_copy_status;
2041
2042 static void delayed_mem_copy_init(delayed_mem_copy_status *status, compiler_common *common)
2043 {
2044 int i;
2045
2046 for (i = 0; i < RECURSE_TMP_REG_COUNT; i++)
2047 {
2048 SLJIT_ASSERT(status->tmp_regs[i] >= 0);
2049 SLJIT_ASSERT(sljit_get_register_index(status->saved_tmp_regs[i]) < 0 || status->tmp_regs[i] == status->saved_tmp_regs[i]);
2050
2051 status->store_bases[i] = -1;
2052 }
2053 status->next_tmp_reg = 0;
2054 status->compiler = common->compiler;
2055 }
2056
2057 static void delayed_mem_copy_move(delayed_mem_copy_status *status, int load_base, sljit_sw load_offset,
2058 int store_base, sljit_sw store_offset)
2059 {
2060 struct sljit_compiler *compiler = status->compiler;
2061 int next_tmp_reg = status->next_tmp_reg;
2062 int tmp_reg = status->tmp_regs[next_tmp_reg];
2063
2064 SLJIT_ASSERT(load_base > 0 && store_base > 0);
2065
2066 if (status->store_bases[next_tmp_reg] == -1)
2067 {
2068 /* Preserve virtual registers. */
2069 if (sljit_get_register_index(status->saved_tmp_regs[next_tmp_reg]) < 0)
2070 OP1(SLJIT_MOV, status->saved_tmp_regs[next_tmp_reg], 0, tmp_reg, 0);
2071 }
2072 else
2073 OP1(SLJIT_MOV, SLJIT_MEM1(status->store_bases[next_tmp_reg]), status->store_offsets[next_tmp_reg], tmp_reg, 0);
2074
2075 OP1(SLJIT_MOV, tmp_reg, 0, SLJIT_MEM1(load_base), load_offset);
2076 status->store_bases[next_tmp_reg] = store_base;
2077 status->store_offsets[next_tmp_reg] = store_offset;
2078
2079 status->next_tmp_reg = (next_tmp_reg + 1) % RECURSE_TMP_REG_COUNT;
2080 }
2081
2082 static void delayed_mem_copy_finish(delayed_mem_copy_status *status)
2083 {
2084 struct sljit_compiler *compiler = status->compiler;
2085 int next_tmp_reg = status->next_tmp_reg;
2086 int tmp_reg, saved_tmp_reg, i;
2087
2088 for (i = 0; i < RECURSE_TMP_REG_COUNT; i++)
2089 {
2090 if (status->store_bases[next_tmp_reg] != -1)
2091 {
2092 tmp_reg = status->tmp_regs[next_tmp_reg];
2093 saved_tmp_reg = status->saved_tmp_regs[next_tmp_reg];
2094
2095 OP1(SLJIT_MOV, SLJIT_MEM1(status->store_bases[next_tmp_reg]), status->store_offsets[next_tmp_reg], tmp_reg, 0);
2096
2097 /* Restore virtual registers. */
2098 if (sljit_get_register_index(saved_tmp_reg) < 0)
2099 OP1(SLJIT_MOV, tmp_reg, 0, saved_tmp_reg, 0);
2100 }
2101
2102 next_tmp_reg = (next_tmp_reg + 1) % RECURSE_TMP_REG_COUNT;
2103 }
2104 }
2105
2106 #undef RECURSE_TMP_REG_COUNT
2107
/* Counts the machine words which have to be saved around a recursion
into the code in [cc, ccend), and reports which control features occur.

The count starts at 1 and grows by:
  - 1 for every bracket type which owns a private data slot and for every
    OP_KET marked as a repeat (the copies behind it are skipped),
  - 2 for a capturing bracket, plus 1 when its optimized_cbracket entry
    is 0; 4 for a possessive capturing bracket,
  - the private word count of every iterator which has private data,
  - 1 each for the control head and the last capture when used,
  - 1 each for the start of match and the mark, but only when a verb that
    sets quit_found is present.

Outputs: *needs_control_head, *has_quit and *has_accept.
Returns the total number of words. */
2108 static int get_recurse_data_length(compiler_common *common, PCRE2_SPTR cc, PCRE2_SPTR ccend,
2109 BOOL *needs_control_head, BOOL *has_quit, BOOL *has_accept)
2110 {
2111 int length = 1;
2112 int size;
2113 PCRE2_SPTR alternative;
2114 BOOL quit_found = FALSE;
2115 BOOL accept_found = FALSE;
2116 BOOL setsom_found = FALSE;
2117 BOOL setmark_found = FALSE;
2118 BOOL capture_last_found = FALSE;
2119 BOOL control_head_found = FALSE;
2120
2121 #if defined DEBUG_FORCE_CONTROL_HEAD && DEBUG_FORCE_CONTROL_HEAD
2122 SLJIT_ASSERT(common->control_head_ptr != 0);
2123 control_head_found = TRUE;
2124 #endif
2125
2126 /* Calculate the sum of the private machine words. */
2127 while (cc < ccend)
2128 {
2129 size = 0;
2130 switch(*cc)
2131 {
2132 case OP_SET_SOM:
2133 SLJIT_ASSERT(common->has_set_som);
2134 setsom_found = TRUE;
2135 cc += 1;
2136 break;
2137
2138 case OP_RECURSE:
/* A nested recursion may touch every shared variable that is in use. */
2139 if (common->has_set_som)
2140 setsom_found = TRUE;
2141 if (common->mark_ptr != 0)
2142 setmark_found = TRUE;
2143 if (common->capture_last_ptr != 0)
2144 capture_last_found = TRUE;
2145 cc += 1 + LINK_SIZE;
2146 break;
2147
2148 case OP_KET:
/* Bracket converted to a repeat: count its slot and skip the copies. */
2149 if (PRIVATE_DATA(cc) != 0)
2150 {
2151 length++;
2152 SLJIT_ASSERT(PRIVATE_DATA(cc + 1) != 0);
2153 cc += PRIVATE_DATA(cc + 1);
2154 }
2155 cc += 1 + LINK_SIZE;
2156 break;
2157
2158 case OP_ASSERT:
2159 case OP_ASSERT_NOT:
2160 case OP_ASSERTBACK:
2161 case OP_ASSERTBACK_NOT:
2162 case OP_ONCE:
2163 case OP_SCRIPT_RUN:
2164 case OP_BRAPOS:
2165 case OP_SBRA:
2166 case OP_SBRAPOS:
2167 case OP_SCOND:
2168 length++;
2169 SLJIT_ASSERT(PRIVATE_DATA(cc) != 0);
2170 cc += 1 + LINK_SIZE;
2171 break;
2172
2173 case OP_CBRA:
2174 case OP_SCBRA:
2175 length += 2;
2176 if (common->capture_last_ptr != 0)
2177 capture_last_found = TRUE;
2178 if (common->optimized_cbracket[GET2(cc, 1 + LINK_SIZE)] == 0)
2179 length++;
2180 cc += 1 + LINK_SIZE + IMM2_SIZE;
2181 break;
2182
2183 case OP_CBRAPOS:
2184 case OP_SCBRAPOS:
2185 length += 2 + 2;
2186 if (common->capture_last_ptr != 0)
2187 capture_last_found = TRUE;
2188 cc += 1 + LINK_SIZE + IMM2_SIZE;
2189 break;
2190
2191 case OP_COND:
2192 /* Might be a hidden SCOND. */
2193 alternative = cc + GET(cc, 1);
2194 if (*alternative == OP_KETRMAX || *alternative == OP_KETRMIN)
2195 length++;
2196 cc += 1 + LINK_SIZE;
2197 break;
2198
/* Iterators: counted only when a private slot was assigned to them. The
character forms also skip the extra code units of a UTF character. */
2199 CASE_ITERATOR_PRIVATE_DATA_1
2200 if (PRIVATE_DATA(cc) != 0)
2201 length++;
2202 cc += 2;
2203 #ifdef SUPPORT_UNICODE
2204 if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
2205 #endif
2206 break;
2207
2208 CASE_ITERATOR_PRIVATE_DATA_2A
2209 if (PRIVATE_DATA(cc) != 0)
2210 length += 2;
2211 cc += 2;
2212 #ifdef SUPPORT_UNICODE
2213 if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
2214 #endif
2215 break;
2216
2217 CASE_ITERATOR_PRIVATE_DATA_2B
2218 if (PRIVATE_DATA(cc) != 0)
2219 length += 2;
2220 cc += 2 + IMM2_SIZE;
2221 #ifdef SUPPORT_UNICODE
2222 if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
2223 #endif
2224 break;
2225
2226 CASE_ITERATOR_TYPE_PRIVATE_DATA_1
2227 if (PRIVATE_DATA(cc) != 0)
2228 length++;
2229 cc += 1;
2230 break;
2231
2232 CASE_ITERATOR_TYPE_PRIVATE_DATA_2A
2233 if (PRIVATE_DATA(cc) != 0)
2234 length += 2;
2235 cc += 1;
2236 break;
2237
2238 CASE_ITERATOR_TYPE_PRIVATE_DATA_2B
2239 if (PRIVATE_DATA(cc) != 0)
2240 length += 2;
2241 cc += 1 + IMM2_SIZE;
2242 break;
2243
2244 case OP_CLASS:
2245 case OP_NCLASS:
2246 #if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
2247 case OP_XCLASS:
2248 size = (*cc == OP_XCLASS) ? GET(cc, 1) : 1 + 32 / (int)sizeof(PCRE2_UCHAR);
2249 #else
2250 size = 1 + 32 / (int)sizeof(PCRE2_UCHAR);
2251 #endif
/* The repeat opcode which determines the word count follows the class. */
2252 if (PRIVATE_DATA(cc) != 0)
2253 length += get_class_iterator_size(cc + size);
2254 cc += size;
2255 break;
2256
2257 case OP_MARK:
2258 case OP_COMMIT_ARG:
2259 case OP_PRUNE_ARG:
2260 case OP_THEN_ARG:
2261 SLJIT_ASSERT(common->mark_ptr != 0);
2262 if (!setmark_found)
2263 setmark_found = TRUE;
2264 if (common->control_head_ptr != 0)
2265 control_head_found = TRUE;
/* Every verb in this group except a plain mark sets quit_found. */
2266 if (*cc != OP_MARK)
2267 quit_found = TRUE;
2268
2269 cc += 1 + 2 + cc[1];
2270 break;
2271
2272 case OP_PRUNE:
2273 case OP_SKIP:
2274 case OP_COMMIT:
2275 quit_found = TRUE;
2276 cc++;
2277 break;
2278
2279 case OP_SKIP_ARG:
2280 quit_found = TRUE;
2281 cc += 1 + 2 + cc[1];
2282 break;
2283
2284 case OP_THEN:
2285 SLJIT_ASSERT(common->control_head_ptr != 0);
2286 quit_found = TRUE;
2287 if (!control_head_found)
2288 control_head_found = TRUE;
2289 cc++;
2290 break;
2291
2292 case OP_ACCEPT:
2293 case OP_ASSERT_ACCEPT:
2294 accept_found = TRUE;
2295 cc++;
2296 break;
2297
2298 default:
2299 cc = next_opcode(common, cc);
2300 SLJIT_ASSERT(cc != NULL);
2301 break;
2302 }
2303 }
2304 SLJIT_ASSERT(cc == ccend);
2305
2306 if (control_head_found)
2307 length++;
2308 if (capture_last_found)
2309 length++;
/* The start of match and the mark are counted only when quit_found was
set by one of the verbs above. */
2310 if (quit_found)
2311 {
2312 if (setsom_found)
2313 length++;
2314 if (setmark_found)
2315 length++;
2316 }
2317
2318 *needs_control_head = control_head_found;
2319 *has_quit = quit_found;
2320 *has_accept = accept_found;
2321 return length;
2322 }
2323
/* Operation selector for copy_recurse_data(). The notes below list which of
the slot groups gathered by that function are processed for each value. */
enum copy_recurse_data_types {
  /* Private, shared and kept-shared slots are all processed. */
  recurse_copy_from_global = 0,
  /* Only the private slots are processed. */
  recurse_copy_private_to_global = 1,
  /* Shared and kept-shared slots are processed. */
  recurse_copy_shared_to_global = 2,
  /* Only the kept-shared slots are processed. */
  recurse_copy_kept_shared_to_global = 3,
  /* Private and shared slots are processed, with a move queued in both
  directions for every slot. */
  recurse_swap_global = 4
};
2331
/* Walks the compiled pattern in [cc, ccend) and queues, through the
delayed_mem_copy_* helpers, one move per saved slot between the
SLJIT_SP-relative locals and the area addressed by base_reg + stackptr.

type      One of enum copy_recurse_data_types. It decides which slot groups
          (private / shared / kept_shared) are moved. Groups that are not
          moved still advance stackptr, so every type walks the same layout.
stackptr  First slot index; converted to a byte offset with STACK().
stacktop  Slot index at which the walk must end (checked by the final assert).
has_quit  When FALSE, no kept_shared slot is collected for OVECTOR(0) or
          mark_ptr.

NOTE(review): delayed_mem_copy_move() is defined outside this view; its
arguments look like (status, load base, load offset, store base, store
offset) - confirm against its definition. */
2332 static void copy_recurse_data(compiler_common *common, PCRE2_SPTR cc, PCRE2_SPTR ccend,
2333 int type, int stackptr, int stacktop, BOOL has_quit)
2334 {
2335 delayed_mem_copy_status status;
2336 PCRE2_SPTR alternative;
/* Per-opcode scratch lists: at most 2 private, 3 shared and 2 kept-shared slots. */
2337 sljit_sw private_srcw[2];
2338 sljit_sw shared_srcw[3];
2339 sljit_sw kept_shared_srcw[2];
2340 int private_count, shared_count, kept_shared_count;
2341 int from_sp, base_reg, offset, i;
/* Each of these globals gets a single slot, claimed by the first opcode that needs it. */
2342 BOOL setsom_found = FALSE;
2343 BOOL setmark_found = FALSE;
2344 BOOL capture_last_found = FALSE;
2345 BOOL control_head_found = FALSE;
2346
2347 #if defined DEBUG_FORCE_CONTROL_HEAD && DEBUG_FORCE_CONTROL_HEAD
2348 SLJIT_ASSERT(common->control_head_ptr != 0);
2349 control_head_found = TRUE;
2350 #endif
2351
/* from_sp picks the move direction; only recurse_swap_global addresses the
save area through TMP2 instead of STACK_TOP. */
2352 switch (type)
2353 {
2354 case recurse_copy_from_global:
2355 from_sp = TRUE;
2356 base_reg = STACK_TOP;
2357 break;
2358
2359 case recurse_copy_private_to_global:
2360 case recurse_copy_shared_to_global:
2361 case recurse_copy_kept_shared_to_global:
2362 from_sp = FALSE;
2363 base_reg = STACK_TOP;
2364 break;
2365
2366 default:
2367 SLJIT_ASSERT(type == recurse_swap_global);
2368 from_sp = FALSE;
2369 base_reg = TMP2;
2370 break;
2371 }
2372
2373 stackptr = STACK(stackptr);
2374 stacktop = STACK(stacktop);
2375
2376 status.tmp_regs[0] = TMP1;
2377 status.saved_tmp_regs[0] = TMP1;
2378
/* TMP2 cannot double as a scratch register when it is the base register;
RETURN_ADDR is recorded instead, with STR_PTR standing in when
sljit_get_register_index() reports -1 for RETURN_ADDR. */
2379 if (base_reg != TMP2)
2380 {
2381 status.tmp_regs[1] = TMP2;
2382 status.saved_tmp_regs[1] = TMP2;
2383 }
2384 else
2385 {
2386 status.saved_tmp_regs[1] = RETURN_ADDR;
2387 if (sljit_get_register_index(RETURN_ADDR) == -1)
2388 status.tmp_regs[1] = STR_PTR;
2389 else
2390 status.tmp_regs[1] = RETURN_ADDR;
2391 }
2392
/* Same substitution for the third scratch register: STR_END replaces TMP3
when sljit_get_register_index() reports -1 for TMP3. */
2393 status.saved_tmp_regs[2] = TMP3;
2394 if (sljit_get_register_index(TMP3) == -1)
2395 status.tmp_regs[2] = STR_END;
2396 else
2397 status.tmp_regs[2] = TMP3;
2398
2399 delayed_mem_copy_init(&status, common);
2400
/* The first slot belongs to recursive_head_ptr; it is moved under the same
condition as the private group. */
2401 if (type != recurse_copy_shared_to_global && type != recurse_copy_kept_shared_to_global)
2402 {
2403 SLJIT_ASSERT(type == recurse_copy_from_global || type == recurse_copy_private_to_global || type == recurse_swap_global);
2404
2405 if (!from_sp)
2406 delayed_mem_copy_move(&status, base_reg, stackptr, SLJIT_SP, common->recursive_head_ptr);
2407
2408 if (from_sp || type == recurse_swap_global)
2409 delayed_mem_copy_move(&status, SLJIT_SP, common->recursive_head_ptr, base_reg, stackptr);
2410 }
2411
2412 stackptr += sizeof(sljit_sw);
2413
2414 #if defined DEBUG_FORCE_CONTROL_HEAD && DEBUG_FORCE_CONTROL_HEAD
2415 if (type != recurse_copy_shared_to_global)
2416 {
2417 if (!from_sp)
2418 delayed_mem_copy_move(&status, base_reg, stackptr, SLJIT_SP, common->control_head_ptr);
2419
2420 if (from_sp || type == recurse_swap_global)
2421 delayed_mem_copy_move(&status, SLJIT_SP, common->control_head_ptr, base_reg, stackptr);
2422 }
2423
2424 stackptr += sizeof(sljit_sw);
2425 #endif
2426
2427 while (cc < ccend)
2428 {
/* Collect the slots contributed by the opcode at cc, then advance cc past it. */
2429 private_count = 0;
2430 shared_count = 0;
2431 kept_shared_count = 0;
2432
2433 switch(*cc)
2434 {
2435 case OP_SET_SOM:
2436 SLJIT_ASSERT(common->has_set_som);
2437 if (has_quit && !setsom_found)
2438 {
2439 kept_shared_srcw[0] = OVECTOR(0);
2440 kept_shared_count = 1;
2441 setsom_found = TRUE;
2442 }
2443 cc += 1;
2444 break;
2445
2446 case OP_RECURSE:
2447 if (has_quit)
2448 {
2449 if (common->has_set_som && !setsom_found)
2450 {
2451 kept_shared_srcw[0] = OVECTOR(0);
2452 kept_shared_count = 1;
2453 setsom_found = TRUE;
2454 }
2455 if (common->mark_ptr != 0 && !setmark_found)
2456 {
2457 kept_shared_srcw[kept_shared_count] = common->mark_ptr;
2458 kept_shared_count++;
2459 setmark_found = TRUE;
2460 }
2461 }
2462 if (common->capture_last_ptr != 0 && !capture_last_found)
2463 {
2464 shared_srcw[0] = common->capture_last_ptr;
2465 shared_count = 1;
2466 capture_last_found = TRUE;
2467 }
2468 cc += 1 + LINK_SIZE;
2469 break;
2470
2471 case OP_KET:
2472 if (PRIVATE_DATA(cc) != 0)
2473 {
2474 private_count = 1;
2475 private_srcw[0] = PRIVATE_DATA(cc);
2476 SLJIT_ASSERT(PRIVATE_DATA(cc + 1) != 0);
2477 cc += PRIVATE_DATA(cc + 1);
2478 }
2479 cc += 1 + LINK_SIZE;
2480 break;
2481
2482 case OP_ASSERT:
2483 case OP_ASSERT_NOT:
2484 case OP_ASSERTBACK:
2485 case OP_ASSERTBACK_NOT:
2486 case OP_ONCE:
2487 case OP_SCRIPT_RUN:
2488 case OP_BRAPOS:
2489 case OP_SBRA:
2490 case OP_SBRAPOS:
2491 case OP_SCOND:
2492 private_count = 1;
2493 private_srcw[0] = PRIVATE_DATA(cc);
2494 cc += 1 + LINK_SIZE;
2495 break;
2496
2497 case OP_CBRA:
2498 case OP_SCBRA:
/* The two OVECTOR entries of the capture are shared slots; OVECTOR_PRIV is
added as a private slot only when optimized_cbracket[] is 0 for this group. */
2499 offset = (GET2(cc, 1 + LINK_SIZE)) << 1;
2500 shared_srcw[0] = OVECTOR(offset);
2501 shared_srcw[1] = OVECTOR(offset + 1);
2502 shared_count = 2;
2503
2504 if (common->capture_last_ptr != 0 && !capture_last_found)
2505 {
2506 shared_srcw[2] = common->capture_last_ptr;
2507 shared_count = 3;
2508 capture_last_found = TRUE;
2509 }
2510
2511 if (common->optimized_cbracket[GET2(cc, 1 + LINK_SIZE)] == 0)
2512 {
2513 private_count = 1;
2514 private_srcw[0] = OVECTOR_PRIV(GET2(cc, 1 + LINK_SIZE));
2515 }
2516 cc += 1 + LINK_SIZE + IMM2_SIZE;
2517 break;
2518
2519 case OP_CBRAPOS:
2520 case OP_SCBRAPOS:
2521 offset = (GET2(cc, 1 + LINK_SIZE)) << 1;
2522 shared_srcw[0] = OVECTOR(offset);
2523 shared_srcw[1] = OVECTOR(offset + 1);
2524 shared_count = 2;
2525
2526 if (common->capture_last_ptr != 0 && !capture_last_found)
2527 {
2528 shared_srcw[2] = common->capture_last_ptr;
2529 shared_count = 3;
2530 capture_last_found = TRUE;
2531 }
2532
2533 private_count = 2;
2534 private_srcw[0] = PRIVATE_DATA(cc);
2535 private_srcw[1] = OVECTOR_PRIV(GET2(cc, 1 + LINK_SIZE));
2536 cc += 1 + LINK_SIZE + IMM2_SIZE;
2537 break;
2538
2539 case OP_COND:
2540 /* Might be a hidden SCOND. */
2541 alternative = cc + GET(cc, 1);
2542 if (*alternative == OP_KETRMAX || *alternative == OP_KETRMIN)
2543 {
2544 private_count = 1;
2545 private_srcw[0] = PRIVATE_DATA(cc);
2546 }
2547 cc += 1 + LINK_SIZE;
2548 break;
2549
2550 CASE_ITERATOR_PRIVATE_DATA_1
2551 if (PRIVATE_DATA(cc))
2552 {
2553 private_count = 1;
2554 private_srcw[0] = PRIVATE_DATA(cc);
2555 }
2556 cc += 2;
2557 #ifdef SUPPORT_UNICODE
2558 if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
2559 #endif
2560 break;
2561
2562 CASE_ITERATOR_PRIVATE_DATA_2A
/* Two-word iterators occupy two consecutive locals. */
2563 if (PRIVATE_DATA(cc))
2564 {
2565 private_count = 2;
2566 private_srcw[0] = PRIVATE_DATA(cc);
2567 private_srcw[1] = PRIVATE_DATA(cc) + sizeof(sljit_sw);
2568 }
2569 cc += 2;
2570 #ifdef SUPPORT_UNICODE
2571 if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
2572 #endif
2573 break;
2574
2575 CASE_ITERATOR_PRIVATE_DATA_2B
2576 if (PRIVATE_DATA(cc))
2577 {
2578 private_count = 2;
2579 private_srcw[0] = PRIVATE_DATA(cc);
2580 private_srcw[1] = PRIVATE_DATA(cc) + sizeof(sljit_sw);
2581 }
2582 cc += 2 + IMM2_SIZE;
2583 #ifdef SUPPORT_UNICODE
2584 if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
2585 #endif
2586 break;
2587
2588 CASE_ITERATOR_TYPE_PRIVATE_DATA_1
2589 if (PRIVATE_DATA(cc))
2590 {
2591 private_count = 1;
2592 private_srcw[0] = PRIVATE_DATA(cc);
2593 }
2594 cc += 1;
2595 break;
2596
2597 CASE_ITERATOR_TYPE_PRIVATE_DATA_2A
2598 if (PRIVATE_DATA(cc))
2599 {
2600 private_count = 2;
2601 private_srcw[0] = PRIVATE_DATA(cc);
2602 private_srcw[1] = private_srcw[0] + sizeof(sljit_sw);
2603 }
2604 cc += 1;
2605 break;
2606
2607 CASE_ITERATOR_TYPE_PRIVATE_DATA_2B
2608 if (PRIVATE_DATA(cc))
2609 {
2610 private_count = 2;
2611 private_srcw[0] = PRIVATE_DATA(cc);
2612 private_srcw[1] = private_srcw[0] + sizeof(sljit_sw);
2613 }
2614 cc += 1 + IMM2_SIZE;
2615 break;
2616
2617 case OP_CLASS:
2618 case OP_NCLASS:
2619 #if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
2620 case OP_XCLASS:
2621 i = (*cc == OP_XCLASS) ? GET(cc, 1) : 1 + 32 / (int)sizeof(PCRE2_UCHAR);
2622 #else
2623 i = 1 + 32 / (int)sizeof(PCRE2_UCHAR);
2624 #endif
/* i is the length of the class opcode; the repeat opcode that follows it
decides whether one or two locals are saved. */
2625 if (PRIVATE_DATA(cc) != 0)
2626 switch(get_class_iterator_size(cc + i))
2627 {
2628 case 1:
2629 private_count = 1;
2630 private_srcw[0] = PRIVATE_DATA(cc);
2631 break;
2632
2633 case 2:
2634 private_count = 2;
2635 private_srcw[0] = PRIVATE_DATA(cc);
2636 private_srcw[1] = private_srcw[0] + sizeof(sljit_sw);
2637 break;
2638
2639 default:
2640 SLJIT_UNREACHABLE();
2641 break;
2642 }
2643 cc += i;
2644 break;
2645
2646 case OP_MARK:
2647 case OP_COMMIT_ARG:
2648 case OP_PRUNE_ARG:
2649 case OP_THEN_ARG:
2650 SLJIT_ASSERT(common->mark_ptr != 0);
2651 if (has_quit && !setmark_found)
2652 {
2653 kept_shared_srcw[0] = common->mark_ptr;
2654 kept_shared_count = 1;
2655 setmark_found = TRUE;
2656 }
2657 if (common->control_head_ptr != 0 && !control_head_found)
2658 {
2659 shared_srcw[0] = common->control_head_ptr;
2660 shared_count = 1;
2661 control_head_found = TRUE;
2662 }
2663 cc += 1 + 2 + cc[1];
2664 break;
2665
2666 case OP_THEN:
2667 SLJIT_ASSERT(common->control_head_ptr != 0);
2668 if (!control_head_found)
2669 {
2670 shared_srcw[0] = common->control_head_ptr;
2671 shared_count = 1;
2672 control_head_found = TRUE;
2673 }
2674 cc++;
2675 break;
2676
2677 default:
2678 cc = next_opcode(common, cc);
2679 SLJIT_ASSERT(cc != NULL);
2680 break;
2681 }
2682
/* Private slots: moved for from_global, private_to_global and swap_global;
otherwise only skipped over. */
2683 if (type != recurse_copy_shared_to_global && type != recurse_copy_kept_shared_to_global)
2684 {
2685 SLJIT_ASSERT(type == recurse_copy_from_global || type == recurse_copy_private_to_global || type == recurse_swap_global);
2686
2687 for (i = 0; i < private_count; i++)
2688 {
2689 SLJIT_ASSERT(private_srcw[i] != 0);
2690
2691 if (!from_sp)
2692 delayed_mem_copy_move(&status, base_reg, stackptr, SLJIT_SP, private_srcw[i]);
2693
2694 if (from_sp || type == recurse_swap_global)
2695 delayed_mem_copy_move(&status, SLJIT_SP, private_srcw[i], base_reg, stackptr);
2696
2697 stackptr += sizeof(sljit_sw);
2698 }
2699 }
2700 else
2701 stackptr += sizeof(sljit_sw) * private_count;
2702
/* Shared slots: moved for from_global, shared_to_global and swap_global. */
2703 if (type != recurse_copy_private_to_global && type != recurse_copy_kept_shared_to_global)
2704 {
2705 SLJIT_ASSERT(type == recurse_copy_from_global || type == recurse_copy_shared_to_global || type == recurse_swap_global);
2706
2707 for (i = 0; i < shared_count; i++)
2708 {
2709 SLJIT_ASSERT(shared_srcw[i] != 0);
2710
2711 if (!from_sp)
2712 delayed_mem_copy_move(&status, base_reg, stackptr, SLJIT_SP, shared_srcw[i]);
2713
2714 if (from_sp || type == recurse_swap_global)
2715 delayed_mem_copy_move(&status, SLJIT_SP, shared_srcw[i], base_reg, stackptr);
2716
2717 stackptr += sizeof(sljit_sw);
2718 }
2719 }
2720 else
2721 stackptr += sizeof(sljit_sw) * shared_count;
2722
/* Kept-shared slots: moved for from_global, shared_to_global and
kept_shared_to_global. */
2723 if (type != recurse_copy_private_to_global && type != recurse_swap_global)
2724 {
2725 SLJIT_ASSERT(type == recurse_copy_from_global || type == recurse_copy_shared_to_global || type == recurse_copy_kept_shared_to_global);
2726
2727 for (i = 0; i < kept_shared_count; i++)
2728 {
2729 SLJIT_ASSERT(kept_shared_srcw[i] != 0);
2730
2731 if (!from_sp)
2732 delayed_mem_copy_move(&status, base_reg, stackptr, SLJIT_SP, kept_shared_srcw[i]);
2733
2734 if (from_sp || type == recurse_swap_global)
2735 delayed_mem_copy_move(&status, SLJIT_SP, kept_shared_srcw[i], base_reg, stackptr);
2736
2737 stackptr += sizeof(sljit_sw);
2738 }
2739 }
2740 else
2741 stackptr += sizeof(sljit_sw) * kept_shared_count;
2742 }
2743
/* The walk must consume exactly the pattern range and exactly the slots
between the converted stackptr and stacktop. */
2744 SLJIT_ASSERT(cc == ccend && stackptr == stacktop);
2745
2746 delayed_mem_copy_finish(&status);
2747 }
2748
2749 static SLJIT_INLINE PCRE2_SPTR set_then_offsets(compiler_common *common, PCRE2_SPTR cc, sljit_u8 *current_offset)
2750 {
2751 PCRE2_SPTR end = bracketend(cc);
2752 BOOL has_alternatives = cc[GET(cc, 1)] == OP_ALT;
2753
2754 /* Assert captures then. */
2755 if (*cc >= OP_ASSERT && *cc <= OP_ASSERTBACK_NOT)
2756 current_offset = NULL;
2757 /* Conditional block does not. */
2758 if (*cc == OP_COND || *cc == OP_SCOND)
2759 has_alternatives = FALSE;
2760
2761 cc = next_opcode(common, cc);
2762 if (has_alternatives)
2763 current_offset = common->then_offsets + (cc - common->start);
2764
2765 while (cc < end)
2766 {
2767 if ((*cc >= OP_ASSERT && *cc <= OP_ASSERTBACK_NOT) || (*cc >= OP_ONCE && *cc <= OP_SCOND))
2768 cc = set_then_offsets(common, cc, current_offset);
2769 else
2770 {
2771 if (*cc == OP_ALT && has_alternatives)
2772 current_offset = common->then_offsets + (cc + 1 + LINK_SIZE - common->start);
2773 if (*cc >= OP_THEN && *cc <= OP_THEN_ARG && current_offset != NULL)
2774 *current_offset = 1;
2775 cc = next_opcode(common, cc);
2776 }
2777 }
2778
2779 return end;
2780 }
2781
/* The opcode-group case-label macros are not available past this point. */
2782 #undef CASE_ITERATOR_PRIVATE_DATA_1
2783 #undef CASE_ITERATOR_PRIVATE_DATA_2A
2784 #undef CASE_ITERATOR_PRIVATE_DATA_2B
2785 #undef CASE_ITERATOR_TYPE_PRIVATE_DATA_1
2786 #undef CASE_ITERATOR_TYPE_PRIVATE_DATA_2A
2787 #undef CASE_ITERATOR_TYPE_PRIVATE_DATA_2B
2788
2789 static SLJIT_INLINE BOOL is_powerof2(unsigned int value)
2790 {
2791 return (value & (value - 1)) == 0;
2792 }
2793
2794 static SLJIT_INLINE void set_jumps(jump_list *list, struct sljit_label *label)
2795 {
2796 while (list)
2797 {
2798 /* sljit_set_label is clever enough to do nothing
2799 if either the jump or the label is NULL. */
2800 SET_LABEL(list->jump, label);
2801 list = list->next;
2802 }
2803 }
2804
2805 static SLJIT_INLINE void add_jump(struct sljit_compiler *compiler, jump_list **list, struct sljit_jump *jump)
2806 {
2807 jump_list *list_item = sljit_alloc_memory(compiler, sizeof(jump_list));
2808 if (list_item)
2809 {
2810 list_item->next = *list;
2811 list_item->jump = jump;
2812 *list = list_item;
2813 }
2814 }
2815
2816 static void add_stub(compiler_common *common, struct sljit_jump *start)
2817 {
2818 DEFINE_COMPILER;
2819 stub_list *list_item = sljit_alloc_memory(compiler, sizeof(stub_list));
2820
2821 if (list_item)
2822 {
2823 list_item->start = start;
2824 list_item->quit = LABEL();
2825 list_item->next = common->stubs;
2826 common->stubs = list_item;
2827 }
2828 }
2829
2830 static void flush_stubs(compiler_common *common)
2831 {
2832 DEFINE_COMPILER;
2833 stub_list *list_item = common->stubs;
2834
2835 while (list_item)
2836 {
2837 JUMPHERE(list_item->start);
2838 add_jump(compiler, &common->stackalloc, JUMP(SLJIT_FAST_CALL));
2839 JUMPTO(SLJIT_JUMP, list_item->quit);
2840 list_item = list_item->next;
2841 }
2842 common->stubs = NULL;
2843 }
2844
2845 static void add_label_addr(compiler_common *common, sljit_uw *update_addr)
2846 {
2847 DEFINE_COMPILER;
2848 label_addr_list *label_addr;
2849
2850 label_addr = sljit_alloc_memory(compiler, sizeof(label_addr_list));
2851 if (label_addr == NULL)
2852 return;
2853 label_addr->label = LABEL();
2854 label_addr->update_addr = update_addr;
2855 label_addr->next = common->label_addrs;
2856 common->label_addrs = label_addr;
2857 }
2858
2859 static SLJIT_INLINE void count_match(compiler_common *common)
2860 {
2861 DEFINE_COMPILER;
2862
2863 OP2(SLJIT_SUB | SLJIT_SET_Z, COUNT_MATCH, 0, COUNT_MATCH, 0, SLJIT_IMM, 1);
2864 add_jump(compiler, &common->calllimit, JUMP(SLJIT_ZERO));
2865 }
2866
2867 static SLJIT_INLINE void allocate_stack(compiler_common *common, int size)
2868 {
2869 /* May destroy all locals and registers except TMP2. */
2870 DEFINE_COMPILER;
2871
2872 SLJIT_ASSERT(size > 0);
2873 OP2(SLJIT_SUB, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, size * sizeof(sljit_sw));
2874 #ifdef DESTROY_REGISTERS
2875 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 12345);
2876 OP1(SLJIT_MOV, TMP3, 0, TMP1, 0);
2877 OP1(SLJIT_MOV, RETURN_ADDR, 0, TMP1, 0);
2878 OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, TMP1, 0);
2879 OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, TMP1, 0);
2880 #endif
2881 add_stub(common, CMP(SLJIT_LESS, STACK_TOP, 0, STACK_LIMIT, 0));
2882 }
2883
2884 static SLJIT_INLINE void free_stack(compiler_common *common, int size)
2885 {
2886 DEFINE_COMPILER;
2887
2888 SLJIT_ASSERT(size > 0);
2889 OP2(SLJIT_ADD, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, size * sizeof(sljit_sw));
2890 }
2891
2892 static sljit_uw * allocate_read_only_data(compiler_common *common, sljit_uw size)
2893 {
2894 DEFINE_COMPILER;
2895 sljit_uw *result;
2896
2897 if (SLJIT_UNLIKELY(sljit_get_compiler_error(compiler)))
2898 return NULL;
2899
2900 result = (sljit_uw *)SLJIT_MALLOC(size + sizeof(sljit_uw), compiler->allocator_data);
2901 if (SLJIT_UNLIKELY(result == NULL))
2902 {
2903 sljit_set_compiler_memory_error(compiler);
2904 return NULL;
2905 }
2906
2907 *(void**)result = common->read_only_data_head;
2908 common->read_only_data_head = (void *)result;
2909 return result + 1;
2910 }
2911
2912 static SLJIT_INLINE void reset_ovector(compiler_common *common, int length)
2913 {
2914 DEFINE_COMPILER;
2915 struct sljit_label *loop;
2916 sljit_s32 i;
2917
2918 /* At this point we can freely use all temporary registers. */
2919 SLJIT_ASSERT(length > 1);
2920 /* TMP1 returns with begin - 1. */
2921 OP2(SLJIT_SUB, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_S0), SLJIT_OFFSETOF(jit_arguments, begin), SLJIT_IMM, IN_UCHARS(1));
2922 if (length < 8)
2923 {
2924 for (i = 1; i < length; i++)
2925 OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), OVECTOR(i), SLJIT_R0, 0);
2926 }
2927 else
2928 {
2929 if (sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_SUPP | SLJIT_MEM_STORE | SLJIT_MEM_PRE, SLJIT_R0, SLJIT_MEM1(SLJIT_R1), sizeof(sljit_sw)) == SLJIT_SUCCESS)
2930 {
2931 GET_LOCAL_BASE(SLJIT_R1, 0, OVECTOR_START);
2932 OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, length - 1);
2933 loop = LABEL();
2934 sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_STORE | SLJIT_MEM_PRE, SLJIT_R0, SLJIT_MEM1(SLJIT_R1), sizeof(sljit_sw));
2935 OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_R2, 0, SLJIT_R2, 0, SLJIT_IMM, 1);
2936 JUMPTO(SLJIT_NOT_ZERO, loop);
2937 }
2938 else
2939 {
2940 GET_LOCAL_BASE(SLJIT_R1, 0, OVECTOR_START + sizeof(sljit_sw));
2941 OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, length - 1);
2942 loop = LABEL();
2943 OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), 0, SLJIT_R0, 0);
2944 OP2(SLJIT_ADD, SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_IMM, sizeof(sljit_sw));
2945 OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_R2, 0, SLJIT_R2, 0, SLJIT_IMM, 1);
2946 JUMPTO(SLJIT_NOT_ZERO, loop);
2947 }
2948 }
2949 }
2950
2951 static SLJIT_INLINE void reset_fast_fail(compiler_common *common)
2952 {
2953 DEFINE_COMPILER;
2954 sljit_s32 i;
2955
2956 SLJIT_ASSERT(common->fast_fail_start_ptr < common->fast_fail_end_ptr);
2957
2958 OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
2959 for (i = common->fast_fail_start_ptr; i < common->fast_fail_end_ptr; i += sizeof(sljit_sw))
2960 OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), i, TMP1, 0);
2961 }
2962
2963 static SLJIT_INLINE void do_reset_match(compiler_common *common, int length)
2964 {
2965 DEFINE_COMPILER;
2966 struct sljit_label *loop;
2967 int i;
2968
2969 SLJIT_ASSERT(length > 1);
2970 /* OVECTOR(1) contains the "string begin - 1" constant. */
2971 if (length > 2)
2972 OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(1));
2973 if (length < 8)
2974 {
2975 for (i = 2; i < length; i++)
2976 OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), OVECTOR(i), TMP1, 0);
2977 }
2978 else
2979 {
2980 if (sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_SUPP | SLJIT_MEM_STORE | SLJIT_MEM_PRE, TMP1, SLJIT_MEM1(TMP2), sizeof(sljit_sw)) == SLJIT_SUCCESS)
2981 {
2982 GET_LOCAL_BASE(TMP2, 0, OVECTOR_START + sizeof(sljit_sw));
2983 OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_IMM, length - 2);
2984 loop = LABEL();
2985 sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_STORE | SLJIT_MEM_PRE, TMP1, SLJIT_MEM1(TMP2), sizeof(sljit_sw));
2986 OP2(SLJIT_SUB | SLJIT_SET_Z, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, 1);
2987 JUMPTO(SLJIT_NOT_ZERO, loop);
2988 }
2989 else
2990 {
2991 GET_LOCAL_BASE(TMP2, 0, OVECTOR_START + 2 * sizeof(sljit_sw));
2992 OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_IMM, length - 2);
2993 loop = LABEL();
2994 OP1(SLJIT_MOV, SLJIT_MEM1(TMP2), 0, TMP1, 0);
2995 OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, sizeof(sljit_sw));
2996 OP2(SLJIT_SUB | SLJIT_SET_Z, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, 1);
2997 JUMPTO(SLJIT_NOT_ZERO, loop);
2998 }
2999 }
3000
3001 OP1(SLJIT_MOV, STACK_TOP, 0, ARGUMENTS, 0);
3002 if (common->mark_ptr != 0)
3003 OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->mark_ptr, SLJIT_IMM, 0);
3004 if (common->control_head_ptr != 0)
3005 OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr, SLJIT_IMM, 0);
3006 OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(STACK_TOP), SLJIT_OFFSETOF(jit_arguments, stack));
3007 OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->start_ptr);
3008 OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(STACK_TOP), SLJIT_OFFSETOF(struct sljit_stack, end));
3009 }
3010
3011 static sljit_sw SLJIT_FUNC do_search_mark(sljit_sw *current, PCRE2_SPTR skip_arg)
3012 {
3013 while (current != NULL)
3014 {
3015 switch (current[1])
3016 {
3017 case type_then_trap:
3018 break;
3019
3020 case type_mark:
3021 if (PRIV(strcmp)(skip_arg, (PCRE2_SPTR)current[2]) == 0)
3022 return current[3];
3023 break;
3024
3025 default:
3026 SLJIT_UNREACHABLE();
3027 break;
3028 }
3029 SLJIT_ASSERT(current[0] == 0 || current < (sljit_sw*)current[0]);
3030 current = (sljit_sw*)current[0];
3031 }
3032 return 0;
3033 }
3034
/* Emits the code that publishes a successful match:
  - STR_PTR is stored into OVECTOR(1); the previous OVECTOR(1) value is kept
    in SLJIT_S2 for the return-value scan below;
  - startchar_ptr (from the start_ptr local) and, when used, mark_ptr are
    written into the jit_arguments;
  - 'oveccount' saved pointers are read from the OVECTOR locals, turned into
    offsets relative to 'begin' (shifted by UCHAR_SHIFT for 16/32-bit code
    units) and stored into the match_data ovector;
  - the return register receives 1 when topbracket <= 1, otherwise a count
    obtained by scanning downwards from topbracket while the inspected
    OVECTOR entry still equals the value saved in SLJIT_S2. */
3035 static SLJIT_INLINE void copy_ovector(compiler_common *common, int topbracket)
3036 {
3037 DEFINE_COMPILER;
3038 struct sljit_label *loop;
3039 BOOL has_pre;
3040
3041 /* At this point we can freely use all registers. */
3042 OP1(SLJIT_MOV, SLJIT_S2, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(1));
3043 OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), OVECTOR(1), STR_PTR, 0);
3044
3045 OP1(SLJIT_MOV, SLJIT_R0, 0, ARGUMENTS, 0);
3046 OP1(SLJIT_MOV, SLJIT_S0, 0, SLJIT_MEM1(SLJIT_SP), common->start_ptr);
3047 if (common->mark_ptr != 0)
3048 OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_MEM1(SLJIT_SP), common->mark_ptr);
3049 OP1(SLJIT_MOV_U32, SLJIT_R1, 0, SLJIT_MEM1(SLJIT_R0), SLJIT_OFFSETOF(jit_arguments, oveccount));
3050 OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_R0), SLJIT_OFFSETOF(jit_arguments, startchar_ptr), SLJIT_S0, 0);
3051 if (common->mark_ptr != 0)
3052 OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_R0), SLJIT_OFFSETOF(jit_arguments, mark_ptr), SLJIT_R2, 0);
/* R2 starts one PCRE2_SIZE before the output ovector because the loop
advances it before each store. */
3053 OP2(SLJIT_ADD, SLJIT_R2, 0, SLJIT_MEM1(SLJIT_R0), SLJIT_OFFSETOF(jit_arguments, match_data),
3054 SLJIT_IMM, SLJIT_OFFSETOF(pcre2_match_data, ovector) - sizeof(PCRE2_SIZE));
3055
/* SLJIT_MEM_SUPP: only asks whether the pre-update load form is available. */
3056 has_pre = sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_SUPP | SLJIT_MEM_PRE, SLJIT_S1, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw)) == SLJIT_SUCCESS;
3057
3058 GET_LOCAL_BASE(SLJIT_S0, 0, OVECTOR_START - (has_pre ? sizeof(sljit_sw) : 0));
3059 OP1(SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_R0), SLJIT_OFFSETOF(jit_arguments, begin));
3060
/* Copy loop: S0 walks the OVECTOR locals, R2 the output ovector, R1 counts
the remaining entries and R0 holds 'begin'. */
3061 loop = LABEL();
3062
3063 if (has_pre)
3064 sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_PRE, SLJIT_S1, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw));
3065 else
3066 {
3067 OP1(SLJIT_MOV, SLJIT_S1, 0, SLJIT_MEM1(SLJIT_S0), 0);
3068 OP2(SLJIT_ADD, SLJIT_S0, 0, SLJIT_S0, 0, SLJIT_IMM, sizeof(sljit_sw));
3069 }
3070
3071 OP2(SLJIT_ADD, SLJIT_R2, 0, SLJIT_R2, 0, SLJIT_IMM, sizeof(PCRE2_SIZE));
3072 OP2(SLJIT_SUB, SLJIT_S1, 0, SLJIT_S1, 0, SLJIT_R0, 0);
3073 /* Copy the integer value to the output buffer */
3074 #if PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32
3075 OP2(SLJIT_ASHR, SLJIT_S1, 0, SLJIT_S1, 0, SLJIT_IMM, UCHAR_SHIFT);
3076 #endif
3077
3078 SLJIT_ASSERT(sizeof(PCRE2_SIZE) == 4 || sizeof(PCRE2_SIZE) == 8);
3079 OP1(((sizeof(PCRE2_SIZE) == 4) ? SLJIT_MOV_U32 : SLJIT_MOV), SLJIT_MEM1(SLJIT_R2), 0, SLJIT_S1, 0);
3080
3081 OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_IMM, 1);
3082 JUMPTO(SLJIT_NOT_ZERO, loop);
3083
3084 /* Calculate the return value, which is the maximum ovector value. */
3085 if (topbracket > 1)
3086 {
/* Both variants step R0 down by two words per iteration and decrement R1
(initially topbracket + 1) until the loaded entry differs from S2. */
3087 if (sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_SUPP | SLJIT_MEM_PRE, SLJIT_R2, SLJIT_MEM1(SLJIT_R0), -(2 * (sljit_sw)sizeof(sljit_sw))) == SLJIT_SUCCESS)
3088 {
3089 GET_LOCAL_BASE(SLJIT_R0, 0, OVECTOR_START + topbracket * 2 * sizeof(sljit_sw));
3090 OP1(SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, topbracket + 1);
3091
3092 /* OVECTOR(0) is never equal to SLJIT_S2. */
3093 loop = LABEL();
3094 sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_PRE, SLJIT_R2, SLJIT_MEM1(SLJIT_R0), -(2 * (sljit_sw)sizeof(sljit_sw)));
3095 OP2(SLJIT_SUB, SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_IMM, 1);
3096 CMPTO(SLJIT_EQUAL, SLJIT_R2, 0, SLJIT_S2, 0, loop);
3097 OP1(SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_R1, 0);
3098 }
3099 else
3100 {
3101 GET_LOCAL_BASE(SLJIT_R0, 0, OVECTOR_START + (topbracket - 1) * 2 * sizeof(sljit_sw));
3102 OP1(SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, topbracket + 1);
3103
3104 /* OVECTOR(0) is never equal to SLJIT_S2. */
3105 loop = LABEL();
3106 OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_MEM1(SLJIT_R0), 0);
3107 OP2(SLJIT_SUB, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 2 * (sljit_sw)sizeof(sljit_sw));
3108 OP2(SLJIT_SUB, SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_IMM, 1);
3109 CMPTO(SLJIT_EQUAL, SLJIT_R2, 0, SLJIT_S2, 0, loop);
3110 OP1(SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_R1, 0);
3111 }
3112 }
3113 else
3114 OP1(SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_IMM, 1);
3115 }
3116
/* Emits the exit sequence for a partial match and jumps to 'quit':
  - the return register is set to PCRE2_ERROR_PARTIAL;
  - the start pointer (the hit_start local in PCRE2_JIT_PARTIAL_SOFT mode,
    the start_ptr local otherwise) is stored as startchar_ptr in the
    jit_arguments;
  - the first two ovector entries of match_data receive the start pointer
    and STR_END as offsets relative to 'begin' (shifted by UCHAR_SHIFT for
    16/32-bit code units).
SLJIT_R1, SLJIT_R2, SLJIT_S1 and STR_END are overwritten. */
3117 static SLJIT_INLINE void return_with_partial_match(compiler_common *common, struct sljit_label *quit)
3118 {
3119 DEFINE_COMPILER;
3120 sljit_s32 mov_opcode;
3121
3122 SLJIT_COMPILE_ASSERT(STR_END == SLJIT_S0, str_end_must_be_saved_reg0);
3123 SLJIT_ASSERT(common->start_used_ptr != 0 && common->start_ptr != 0
3124 && (common->mode == PCRE2_JIT_PARTIAL_SOFT ? common->hit_start != 0 : common->hit_start == 0));
3125
3126 OP1(SLJIT_MOV, SLJIT_R1, 0, ARGUMENTS, 0);
3127 OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_MEM1(SLJIT_SP),
3128 common->mode == PCRE2_JIT_PARTIAL_SOFT ? common->hit_start : common->start_ptr);
3129 OP1(SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_IMM, PCRE2_ERROR_PARTIAL);
3130
3131 /* Store match begin and end. */
3132 OP1(SLJIT_MOV, SLJIT_S1, 0, SLJIT_MEM1(SLJIT_R1), SLJIT_OFFSETOF(jit_arguments, begin));
3133 OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), SLJIT_OFFSETOF(jit_arguments, startchar_ptr), SLJIT_R2, 0);
/* From here on R1 addresses the match_data block instead of the arguments. */
3134 OP1(SLJIT_MOV, SLJIT_R1, 0, SLJIT_MEM1(SLJIT_R1), SLJIT_OFFSETOF(jit_arguments, match_data));
3135
/* The store width follows sizeof(PCRE2_SIZE). */
3136 mov_opcode = (sizeof(PCRE2_SIZE) == 4) ? SLJIT_MOV_U32 : SLJIT_MOV;
3137
3138 OP2(SLJIT_SUB, SLJIT_R2, 0, SLJIT_R2, 0, SLJIT_S1, 0);
3139 #if PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32
3140 OP2(SLJIT_ASHR, SLJIT_R2, 0, SLJIT_R2, 0, SLJIT_IMM, UCHAR_SHIFT);
3141 #endif
3142 OP1(mov_opcode, SLJIT_MEM1(SLJIT_R1), SLJIT_OFFSETOF(pcre2_match_data, ovector), SLJIT_R2, 0);
3143
3144 OP2(SLJIT_SUB, STR_END, 0, STR_END, 0, SLJIT_S1, 0);
3145 #if PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32
3146 OP2(SLJIT_ASHR, STR_END, 0, STR_END, 0, SLJIT_IMM, UCHAR_SHIFT);
3147 #endif
3148 OP1(mov_opcode, SLJIT_MEM1(SLJIT_R1), SLJIT_OFFSETOF(pcre2_match_data, ovector) + sizeof(PCRE2_SIZE), STR_END, 0);
3149
3150 JUMPTO(SLJIT_JUMP, quit);
3151 }
3152
3153 static SLJIT_INLINE void check_start_used_ptr(compiler_common *common)
3154 {
3155 /* May destroy TMP1. */
3156 DEFINE_COMPILER;
3157 struct sljit_jump *jump;
3158
3159 if (common->mode == PCRE2_JIT_PARTIAL_SOFT)
3160 {
3161 /* The value of -1 must be kept for start_used_ptr! */
3162 OP2(SLJIT_ADD, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->start_used_ptr, SLJIT_IMM, 1);
3163 /* Jumps if start_used_ptr < STR_PTR, or start_used_ptr == -1. Although overwriting
3164 is not necessary if start_used_ptr == STR_PTR, it does not hurt as well. */
3165 jump = CMP(SLJIT_LESS_EQUAL, TMP1, 0, STR_PTR, 0);
3166 OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->start_used_ptr, STR_PTR, 0);
3167 JUMPHERE(jump);
3168 }
3169 else if (common->mode == PCRE2_JIT_PARTIAL_HARD)
3170 {
3171 jump = CMP(SLJIT_LESS_EQUAL, SLJIT_MEM1(SLJIT_SP), common->start_used_ptr, STR_PTR, 0);
3172 OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->start_used_ptr, STR_PTR, 0);
3173 JUMPHERE(jump);
3174 }
3175 }
3176
3177 static SLJIT_INLINE BOOL char_has_othercase(compiler_common *common, PCRE2_SPTR cc)
3178 {
3179 /* Detects if the character has an othercase. */
3180 unsigned int c;
3181
3182 #ifdef SUPPORT_UNICODE
3183 if (common->utf)
3184 {
3185 GETCHAR(c, cc);
3186 if (c > 127)
3187 {
3188 return c != UCD_OTHERCASE(c);
3189 }
3190 #if PCRE2_CODE_UNIT_WIDTH != 8
3191 return common->fcc[c] != c;
3192 #endif
3193 }
3194 else
3195 #endif
3196 c = *cc;
3197 return MAX_255(c) ? common->fcc[c] != c : FALSE;
3198 }
3199
3200 static SLJIT_INLINE unsigned int char_othercase(compiler_common *common, unsigned int c)
3201 {
3202 /* Returns with the othercase. */
3203 #ifdef SUPPORT_UNICODE
3204 if (common->utf && c > 127)
3205 {
3206 return UCD_OTHERCASE(c);
3207 }
3208 #endif
3209 return TABLE_GET(c, common->fcc, c);
3210 }
3211
/* Inspects the character at cc and its other case (they must differ, see the
assert below). Returns 0 when the two differ in more than one bit. Otherwise
returns (index << 8) | mask, where mask is the differing bit moved into the
low 8 bits.

NOTE(review): 'index' presumably tells the caller which byte of the encoded
character the mask applies to - confirm against the callers. */
3212 static unsigned int char_get_othercase_bit(compiler_common *common, PCRE2_SPTR cc)
3213 {
3214 /* Detects if the character and its othercase has only 1 bit difference. */
3215 unsigned int c, oc, bit;
3216 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
3217 int n;
3218 #endif
3219
3220 #ifdef SUPPORT_UNICODE
3221 if (common->utf)
3222 {
3223 GETCHAR(c, cc);
3224 if (c <= 127)
3225 oc = common->fcc[c];
3226 else
3227 {
3228 oc = UCD_OTHERCASE(c);
3229 }
3230 }
3231 else
3232 {
3233 c = *cc;
3234 oc = TABLE_GET(c, common->fcc, c);
3235 }
3236 #else
3237 c = *cc;
3238 oc = TABLE_GET(c, common->fcc, c);
3239 #endif
3240
3241 SLJIT_ASSERT(c != oc);
3242
/* The set bits of c ^ oc are exactly the positions where the two differ. */
3243 bit = c ^ oc;
3244 /* Optimized for English alphabet. */
3245 if (c <= 127 && bit == 0x20)
3246 return (0 << 8) | 0x20;
3247
3248 /* Since c != oc, they must have at least 1 bit difference. */
3249 if (!is_powerof2(bit))
3250 return 0;
3251
3252 #if PCRE2_CODE_UNIT_WIDTH == 8
3253
3254 #ifdef SUPPORT_UNICODE
3255 if (common->utf && c > 127)
3256 {
/* n starts at GET_EXTRALEN(*cc) and drops by one for every 6 bits the mask is
shifted down, until the set bit falls inside the low 6 bits. */
3257 n = GET_EXTRALEN(*cc);
3258 while ((bit & 0x3f) == 0)
3259 {
3260 n--;
3261 bit >>= 6;
3262 }
3263 return (n << 8) | bit;
3264 }
3265 #endif /* SUPPORT_UNICODE */
3266 return (0 << 8) | bit;
3267
3268 #elif PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32
3269
3270 #ifdef SUPPORT_UNICODE
3271 if (common->utf && c > 65535)
3272 {
/* A difference at bit 10 or above is shifted down by 10 and then handled by
the common return below (index 0 or 1); a difference in the low 10 bits is
returned with index 2 or 3. NOTE(review): this split looks like it targets
the two code units of a surrogate pair - confirm. */
3273 if (bit >= (1u << 10))
3274 bit >>= 10;
3275 else
3276 return (bit < 256) ? ((2 << 8) | bit) : ((3 << 8) | (bit >> 8));
3277 }
3278 #endif /* SUPPORT_UNICODE */
3279 return (bit < 256) ? ((0u << 8) | bit) : ((1u << 8) | (bit >> 8));
3280
3281 #endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
3282 }
3283
3284 static void check_partial(compiler_common *common, BOOL force)
3285 {
3286 /* Checks whether a partial matching is occurred. Does not modify registers. */
3287 DEFINE_COMPILER;
3288 struct sljit_jump *jump = NULL;
3289
3290 SLJIT_ASSERT(!force || common->mode != PCRE2_JIT_COMPLETE);
3291
3292 if (common->mode == PCRE2_JIT_COMPLETE)
3293 return;
3294
3295 if (!force)
3296 jump = CMP(SLJIT_GREATER_EQUAL, SLJIT_MEM1(SLJIT_SP), common->start_used_ptr, STR_PTR, 0);
3297 else if (common->mode == PCRE2_JIT_PARTIAL_SOFT)
3298 jump = CMP(SLJIT_EQUAL, SLJIT_MEM1(SLJIT_SP), common->start_used_ptr, SLJIT_IMM, -1);
3299
3300 if (common->mode == PCRE2_JIT_PARTIAL_SOFT)
3301 OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->hit_start, SLJIT_IMM, 0);
3302 else
3303 {
3304 if (common->partialmatchlabel != NULL)
3305 JUMPTO(SLJIT_JUMP, common->partialmatchlabel);
3306 else
3307 add_jump(compiler, &common->partialmatch, JUMP(SLJIT_JUMP));
3308 }
3309
3310 if (jump != NULL)
3311 JUMPHERE(jump);
3312 }
3313
3314 static void check_str_end(compiler_common *common, jump_list **end_reached)
3315 {
3316 /* Emits an end-of-subject test. Does not affect registers. Usually used in a
tight spot. Jumps queued on end_reached are taken when STR_PTR has reached
STR_END; in the partial modes the partial-match bookkeeping is emitted on that
path as well. */
3317 DEFINE_COMPILER;
3318 struct sljit_jump *jump;
3319
/* Complete mode: a single compare, STR_PTR >= STR_END goes to end_reached. */
3320 if (common->mode == PCRE2_JIT_COMPLETE)
3321 {
3322 add_jump(compiler, end_reached, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
3323 return;
3324 }
3325
/* Partial modes: the whole sequence below is skipped while STR_PTR < STR_END. */
3326 jump = CMP(SLJIT_LESS, STR_PTR, 0, STR_END, 0);
3327 if (common->mode == PCRE2_JIT_PARTIAL_SOFT)
3328 {
/* Soft: when start_used_ptr >= STR_PTR go to end_reached directly, otherwise
store 0 into hit_start first and then go to end_reached. */
3329 add_jump(compiler, end_reached, CMP(SLJIT_GREATER_EQUAL, SLJIT_MEM1(SLJIT_SP), common->start_used_ptr, STR_PTR, 0));
3330 OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->hit_start, SLJIT_IMM, 0);
3331 add_jump(compiler, end_reached, JUMP(SLJIT_JUMP));
3332 }
3333 else
3334 {
/* Otherwise: when start_used_ptr >= STR_PTR go to end_reached, else leave
through the partial match exit. */
3335 add_jump(compiler, end_reached, CMP(SLJIT_GREATER_EQUAL, SLJIT_MEM1(SLJIT_SP), common->start_used_ptr, STR_PTR, 0));
3336 if (common->partialmatchlabel != NULL)
3337 JUMPTO(SLJIT_JUMP, common->partialmatchlabel);
3338 else
3339 add_jump(compiler, &common->partialmatch, JUMP(SLJIT_JUMP));
3340 }
3341 JUMPHERE(jump);
3342 }
3343
3344 static void detect_partial_match(compiler_common *common, jump_list **backtracks)
3345 {
/* Emits an end-of-subject test whose failure path is queued on backtracks.
In complete mode this is one compare (STR_PTR >= STR_END). In the partial modes
the sequence is skipped while STR_PTR < STR_END; otherwise it backtracks when
start_used_ptr >= STR_PTR, and if not, either records hit_start = 0 and
backtracks (soft mode) or leaves through the partial match exit. */
3346 DEFINE_COMPILER;
3347 struct sljit_jump *jump;
3348
3349 if (common->mode == PCRE2_JIT_COMPLETE)
3350 {
3351 add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
3352 return;
3353 }
3354
3355 /* Partial matching mode. */
3356 jump = CMP(SLJIT_LESS, STR_PTR, 0, STR_END, 0);
3357 add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, SLJIT_MEM1(SLJIT_SP), common->start_used_ptr, STR_PTR, 0));
3358 if (common->mode == PCRE2_JIT_PARTIAL_SOFT)
3359 {
/* Soft mode: store 0 into the hit_start slot, then backtrack. */
3360 OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->hit_start, SLJIT_IMM, 0);
3361 add_jump(compiler, backtracks, JUMP(SLJIT_JUMP));
3362 }
3363 else
3364 {
/* Jump to the partial match exit: the label if it already exists, otherwise
a jump queued on common->partialmatch. */
3365 if (common->partialmatchlabel != NULL)
3366 JUMPTO(SLJIT_JUMP, common->partialmatchlabel);
3367 else
3368 add_jump(compiler, &common->partialmatch, JUMP(SLJIT_JUMP));
3369 }
3370 JUMPHERE(jump);
3371 }
3372
3373 static void peek_char(compiler_common *common, sljit_u32 max, sljit_s32 dst, sljit_sw dstw, jump_list **backtracks)
3374 {
3375 /* Reads the character into TMP1, keeps STR_PTR.
3376 Does not check STR_END. TMP2, dst, RETURN_ADDR Destroyed.
max is the largest character value the caller cares about: when it is below the
first multi-unit value (128 for 8-bit UTF, 0xd800 for 16-bit UTF) only the
single code unit load is emitted. dst/dstw name the operand used to save
STR_PTR around the calls to the decoding helpers. When backtracks is not NULL
and invalid UTF checking is enabled, an invalid character jumps to backtracks;
in the 32-bit case with backtracks == NULL, TMP1 is set to INVALID_UTF_CHAR
instead. */
3377 DEFINE_COMPILER;
3378 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3379 struct sljit_jump *jump;
3380 #endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 */
3381
3382 SLJIT_UNUSED_ARG(max);
3383 SLJIT_UNUSED_ARG(dst);
3384 SLJIT_UNUSED_ARG(dstw);
3385 SLJIT_UNUSED_ARG(backtracks);
3386
3387 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
3388
3389 #ifdef SUPPORT_UNICODE
3390 #if PCRE2_CODE_UNIT_WIDTH == 8
3391 if (common->utf)
3392 {
3393 if (max < 128) return;
3394
/* Bytes below 0x80 are complete characters. Otherwise save STR_PTR in dst, step
past the first byte as the helper expects, call the helper, and restore
STR_PTR afterwards. */
3395 jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x80);
3396 OP1(SLJIT_MOV, dst, dstw, STR_PTR, 0);
3397 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
3398 add_jump(compiler, common->invalid_utf ? &common->utfreadchar_invalid : &common->utfreadchar, JUMP(SLJIT_FAST_CALL));
3399 OP1(SLJIT_MOV, STR_PTR, 0, dst, dstw);
3400 if (backtracks && common->invalid_utf)
3401 add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));
3402 JUMPHERE(jump);
3403 }
3404 #elif PCRE2_CODE_UNIT_WIDTH == 16
3405 if (common->utf)
3406 {
3407 if (max < 0xd800) return;
3408
3409 OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
3410
3411 if (common->invalid_utf)
3412 {
/* Units outside 0xd800..0xdfff are complete characters; surrogates are handed
to the validating helper with STR_PTR saved and restored around the call. */
3413 jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800);
3414 OP1(SLJIT_MOV, dst, dstw, STR_PTR, 0);
3415 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
3416 add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL));
3417 OP1(SLJIT_MOV, STR_PTR, 0, dst, dstw);
3418 if (backtracks && common->invalid_utf)
3419 add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));
3420 }
3421 else
3422 {
3423 /* TMP2 contains the high surrogate. */
/* Inline decode: the second unit is read from offset 1 because STR_PTR is not
moved; result is (high - 0xd800) << 10 plus (low - 0xdc00) plus 0x10000. */
3424 jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800);
3425 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
3426 OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 10);
3427 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000 - 0xdc00);
3428 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
3429 }
3430
3431 JUMPHERE(jump);
3432 }
3433 #elif PCRE2_CODE_UNIT_WIDTH == 32
3434 if (common->invalid_utf)
3435 {
3436 if (max < 0xd800) return;
3437
/* A value is invalid when it is >= 0x110000 or inside 0xd800..0xdfff. With a
backtrack list the two tests jump there; without one they replace TMP1 by
INVALID_UTF_CHAR using conditional moves. */
3438 if (backtracks != NULL)
3439 {
3440 OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
3441 add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000));
3442 add_jump(compiler, backtracks, CMP(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800));
3443 }
3444 else
3445 {
3446 OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
3447 OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x110000);
3448 CMOV(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, INVALID_UTF_CHAR);
3449 OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800);
3450 CMOV(SLJIT_LESS, TMP1, SLJIT_IMM, INVALID_UTF_CHAR);
3451 }
3452 }
3453 #endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
3454 #endif /* SUPPORT_UNICODE */
3455 }
3456
3457 static void peek_char_back(compiler_common *common, sljit_u32 max, jump_list **backtracks)
3458 {
3459 /* Reads one character back without moving STR_PTR. TMP2 must
3460 contain the start of the subject buffer. Affects TMP1, TMP2, and RETURN_ADDR.
The character is returned in TMP1. max is the largest character value the
caller cares about: in the 8-bit and 16-bit UTF cases a small max means only
the code unit just before STR_PTR is loaded. In invalid UTF mode an invalid
character jumps to backtracks. */
3461 DEFINE_COMPILER;
3462
3463 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3464 struct sljit_jump *jump;
3465 #endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 */
3466
3467 SLJIT_UNUSED_ARG(max);
3468 SLJIT_UNUSED_ARG(backtracks);
3469
3470 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
3471
3472 #ifdef SUPPORT_UNICODE
3473 #if PCRE2_CODE_UNIT_WIDTH == 8
3474 if (common->utf)
3475 {
3476 if (max < 128) return;
3477
/* Bytes below 0x80 are complete characters; anything else is decoded by the
backward-peeking helper (validating variant in invalid UTF mode). */
3478 jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x80);
3479 if (common->invalid_utf)
3480 {
3481 add_jump(compiler, &common->utfpeakcharback_invalid, JUMP(SLJIT_FAST_CALL));
3482 if (backtracks != NULL)
3483 add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));
3484 }
3485 else
3486 add_jump(compiler, &common->utfpeakcharback, JUMP(SLJIT_FAST_CALL));
3487 JUMPHERE(jump);
3488 }
3489 #elif PCRE2_CODE_UNIT_WIDTH == 16
3490 if (common->utf)
3491 {
3492 if (max < 0xd800) return;
3493
3494 if (common->invalid_utf)
3495 {
/* Units below 0xd800 skip the validating helper. */
3496 jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xd800);
3497 add_jump(compiler, &common->utfpeakcharback_invalid, JUMP(SLJIT_FAST_CALL));
3498 if (backtracks != NULL)
3499 add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));
3500 }
3501 else
3502 {
/* Inline decode: if the unit at -1 is a low surrogate (0xdc00..0xdfff), the
high surrogate is read from -2 and the result is
((high - 0xd800) << 10) + (low - 0xdc00) + 0x10000. */
3503 OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xdc00);
3504 jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xdc00);
3505 /* TMP2 contains the low surrogate. */
3506 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
3507 OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x10000);
3508 OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800);
3509 OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 10);
3510 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
3511 }
3512 JUMPHERE(jump);
3513 }
3514 #elif PCRE2_CODE_UNIT_WIDTH == 32
3515 if (common->invalid_utf)
3516 {
/* Values >= 0x110000 or inside 0xd800..0xdfff jump to backtracks.
NOTE(review): unlike the 8-bit and 16-bit branches above, backtracks is passed
to add_jump here without a NULL test, and max is not consulted - confirm that
callers never pass NULL in 32-bit invalid UTF mode. */
3517 OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
3518 add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000));
3519 add_jump(compiler, backtracks, CMP(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800));
3520 }
3521 #endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
3522 #endif /* SUPPORT_UNICODE */
3523 }
3524
/* Option bits accepted by the options argument of read_char(). */
/* STR_PTR must end up after the whole character, even when the character
value itself is not decoded because it is outside the requested range. */
3525 #define READ_CHAR_UPDATE_STR_PTR 0x1
/* In invalid UTF mode, call the utfreadnewline_invalid helper instead of
utfreadchar_invalid. */
3526 #define READ_CHAR_UTF8_NEWLINE 0x2
/* Both of the bits above. */
3527 #define READ_CHAR_NEWLINE (READ_CHAR_UPDATE_STR_PTR | READ_CHAR_UTF8_NEWLINE)
/* Use the non-validating decoding paths in 8-bit and 16-bit UTF mode even when
invalid UTF checking is enabled. */
3528 #define READ_CHAR_VALID_UTF 0x4
3529
3530 static void read_char(compiler_common *common, sljit_u32 min, sljit_u32 max,
3531 jump_list **backtracks, sljit_u32 options)
3532 {
3533 /* Reads the precise value of a character into TMP1, if the character is
3534 between min and max (c >= min && c <= max). Otherwise it returns with a value
3535 outside the range. Does not check STR_END.
STR_PTR is always advanced past the first code unit. options is a combination
of the READ_CHAR_* bits. In invalid UTF mode an invalid character jumps to
backtracks when that list is not NULL; TMP2 (and RETURN_ADDR on some paths) is
used as scratch. */
3536 DEFINE_COMPILER;
3537 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3538 struct sljit_jump *jump;
3539 #endif
3540 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
3541 struct sljit_jump *jump2;
3542 #endif
3543
3544 SLJIT_UNUSED_ARG(min);
3545 SLJIT_UNUSED_ARG(max);
3546 SLJIT_UNUSED_ARG(backtracks);
3547 SLJIT_UNUSED_ARG(options);
3548 SLJIT_ASSERT(min <= max);
3549
3550 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
3551 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
3552
3553 #ifdef SUPPORT_UNICODE
3554 #if PCRE2_CODE_UNIT_WIDTH == 8
3555 if (common->utf)
3556 {
/* Only single byte values are of interest and STR_PTR need not be exact. */
3557 if (max < 128 && !(options & READ_CHAR_UPDATE_STR_PTR)) return;
3558
3559 if (common->invalid_utf && !(options & READ_CHAR_VALID_UTF))
3560 {
/* Validating path: bytes >= 0x80 are always decoded by a helper. */
3561 jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x80);
3562
3563 if (options & READ_CHAR_UTF8_NEWLINE)
3564 add_jump(compiler, &common->utfreadnewline_invalid, JUMP(SLJIT_FAST_CALL));
3565 else
3566 add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL));
3567
3568 if (backtracks != NULL)
3569 add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));
3570 JUMPHERE(jump);
3571 return;
3572 }
3573
/* Non-validating path: bytes below 0xc0 are returned as they are. The branches
below pick the cheapest decoder for the requested [min, max] range. Where
READ_CHAR_UPDATE_STR_PTR is set, the value loaded from utf8_table4 (indexed by
first byte - 0xc0) is added to STR_PTR instead of a constant. */
3574 jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
3575 if (min >= 0x10000)
3576 {
/* Only four byte sequences can be in range: decode inline when the first
byte is 0xf0..0xf7, otherwise leave the value undecoded. */
3577 OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xf0);
3578 if (options & READ_CHAR_UPDATE_STR_PTR)
3579 OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0);
3580 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
3581 jump2 = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0x7);
3582 OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
3583 OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3f);
3584 OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
3585 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
3586 OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
3587 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
3588 OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
3589 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(2));
3590 if (!(options & READ_CHAR_UPDATE_STR_PTR))
3591 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3));
3592 OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
3593 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
3594 OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
3595 JUMPHERE(jump2);
3596 if (options & READ_CHAR_UPDATE_STR_PTR)
3597 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, RETURN_ADDR, 0);
3598 }
3599 else if (min >= 0x800 && max <= 0xffff)
3600 {
/* Only three byte sequences can be in range: decode inline when the first
byte is 0xe0..0xef. */
3601 OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xe0);
3602 if (options & READ_CHAR_UPDATE_STR_PTR)
3603 OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0);
3604 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
3605 jump2 = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0xf);
3606 OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
3607 OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3f);
3608 OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
3609 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
3610 if (!(options & READ_CHAR_UPDATE_STR_PTR))
3611 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
3612 OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
3613 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
3614 OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
3615 JUMPHERE(jump2);
3616 if (options & READ_CHAR_UPDATE_STR_PTR)
3617 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, RETURN_ADDR, 0);
3618 }
3619 else if (max >= 0x800)
3620 {
/* The range spans several sequence lengths: use the generic helper. */
3621 add_jump(compiler, &common->utfreadchar, JUMP(SLJIT_FAST_CALL));
3622 }
3623 else if (max < 128)
3624 {
/* Reached only with READ_CHAR_UPDATE_STR_PTR set (see the early return above):
nothing is decoded, STR_PTR is just moved by the utf8_table4 value. */
3625 OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0);
3626 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
3627 }
3628 else
3629 {
/* max is below 0x800 here: decode as a two byte sequence. */
3630 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
3631 if (!(options & READ_CHAR_UPDATE_STR_PTR))
3632 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
3633 else
3634 OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0);
3635 OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3f);
3636 OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
3637 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
3638 OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
3639 if (options & READ_CHAR_UPDATE_STR_PTR)
3640 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, RETURN_ADDR, 0);
3641 }
3642 JUMPHERE(jump);
3643 }
3644 #elif PCRE2_CODE_UNIT_WIDTH == 16
3645 if (common->utf)
3646 {
3647 if (max < 0xd800 && !(options & READ_CHAR_UPDATE_STR_PTR)) return;
3648
3649 if (common->invalid_utf && !(options & READ_CHAR_VALID_UTF))
3650 {
/* Validating path: only units in 0xd800..0xdfff are passed to a helper. */
3651 OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
3652 jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800);
3653
3654 if (options & READ_CHAR_UTF8_NEWLINE)
3655 add_jump(compiler, &common->utfreadnewline_invalid, JUMP(SLJIT_FAST_CALL));
3656 else
3657 add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL));
3658
3659 if (backtracks != NULL)
3660 add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));
3661 JUMPHERE(jump);
3662 return;
3663 }
3664
3665 if (max >= 0x10000)
3666 {
/* Characters above the BMP are of interest: decode the surrogate pair inline
and step over the second unit. */
3667 OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
3668 jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800);
3669 /* TMP2 contains the high surrogate. */
3670 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
3671 OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 10);
3672 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
3673 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000 - 0xdc00);
3674 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
3675 JUMPHERE(jump);
3676 return;
3677 }
3678
3679 /* Skip low surrogate if necessary. */
/* When the unit just read is a high surrogate (TMP2 < 0x400): STR_PTR moves one
more unit if READ_CHAR_UPDATE_STR_PTR is set, and TMP1 is replaced by 0x10000
(a value above max) if max >= 0xd800. Done with conditional moves when the CPU
supports them and RETURN_ADDR is a real register, otherwise with a branch. */
3680 OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
3681
3682 if (sljit_has_cpu_feature(SLJIT_HAS_CMOV) && sljit_get_register_index(RETURN_ADDR) >= 0)
3683 {
3684 if (options & READ_CHAR_UPDATE_STR_PTR)
3685 OP2(SLJIT_ADD, RETURN_ADDR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
3686 OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0x400);
3687 if (options & READ_CHAR_UPDATE_STR_PTR)
3688 CMOV(SLJIT_LESS, STR_PTR, RETURN_ADDR, 0);
3689 if (max >= 0xd800)
3690 CMOV(SLJIT_LESS, TMP1, SLJIT_IMM, 0x10000);
3691 }
3692 else
3693 {
3694 jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x400);
3695 if (options & READ_CHAR_UPDATE_STR_PTR)
3696 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
3697 if (max >= 0xd800)
3698 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0x10000);
3699 JUMPHERE(jump);
3700 }
3701 }
3702 #elif PCRE2_CODE_UNIT_WIDTH == 32
3703 if (common->invalid_utf)
3704 {
/* A value is invalid when it is >= 0x110000 or inside 0xd800..0xdfff. With a
backtrack list the tests jump there; otherwise TMP1 becomes INVALID_UTF_CHAR.
READ_CHAR_VALID_UTF is not consulted in this branch. */
3705 if (backtracks != NULL)
3706 {
3707 OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
3708 add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000));
3709 add_jump(compiler, backtracks, CMP(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800));
3710 }
3711 else
3712 {
3713 OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
3714 OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x110000);
3715 CMOV(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, INVALID_UTF_CHAR);
3716 OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800);
3717 CMOV(SLJIT_LESS, TMP1, SLJIT_IMM, INVALID_UTF_CHAR);
3718 }
3719 }
3720 #endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
3721 #endif /* SUPPORT_UNICODE */
3722 }
3723
3724 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
3725
3726 static BOOL is_char7_bitset(const sljit_u8 *bitset, BOOL nclass)
3727 {
3728 /* Returns TRUE when bytes 16..31 of the 32 byte class bitmap all hold the
3729 same fill value (0xff for nclass, 0 otherwise), FALSE at the first mismatch. */
3730 const sljit_u8 fill = nclass ? 0xff : 0;
3731 const sljit_u8 *cursor = bitset + 16;
3732 const sljit_u8 *limit = bitset + 32;
3733
3734 for (; cursor < limit; cursor++)
3735 {
3736 if (*cursor != fill)
3737 return FALSE;
3738 }
3739
3740 return TRUE;
3741 }
3742
3743 static void read_char7_type(compiler_common *common, jump_list **backtracks, BOOL negated)
3744 {
3745 /* Reads the precise character type of a character into TMP1, if the character
3746 is less than 128. Otherwise it returns with zero. Does not check STR_END. When
3747 negated is set, the remaining code units of a multi-byte character are
consumed as well, and in invalid UTF mode an invalid sequence jumps to
backtracks. When negated is not set, STR_PTR only moves past the first byte.
TMP2 is used as scratch. Only valid in UTF mode (asserted below). */
3748 DEFINE_COMPILER;
3749 struct sljit_jump *jump;
3750
3751 SLJIT_ASSERT(common->utf);
3752
3753 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0);
3754 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
3755
3756 /* All values > 127 are zero in ctypes. */
3757 OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
3758
3759 if (negated)
3760 {
3761 jump = CMP(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0x80);
3762
3763 if (common->invalid_utf)
3764 {
/* The validating decoder takes the first byte in TMP1, but TMP1 holds the
ctypes lookup result at this point; the byte itself is in TMP2. Without this
copy every non-ASCII character would be reported as invalid. */
OP1(SLJIT_MOV, TMP1, 0, TMP2, 0);
3765 add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL));
3766 add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));
/* A valid character >= 128 has type zero. */
3767 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
3768 }
3769 else
3770 {
/* Valid UTF: move STR_PTR by the utf8_table4 value for this first byte. TMP1
keeps the zero loaded from ctypes. */
3771 OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(utf8_table4) - 0xc0);
3772 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
3773 }
3774 JUMPHERE(jump);
3775 }
3776 }
3777
3778 #endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */
3779
3780 static void read_char8_type(compiler_common *common, jump_list **backtracks, BOOL negated)
3781 {
3782 /* Reads the character type into TMP1, updates STR_PTR. Does not check STR_END.
Characters above 255 have type zero. When negated is set, STR_PTR is moved past
the whole character in UTF mode; when it is not set, at most a two byte UTF-8
sequence is consumed. In invalid UTF mode an invalid sequence jumps to
backtracks. TMP2 is used as scratch. */
3783 DEFINE_COMPILER;
3784 #if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
3785 struct sljit_jump *jump;
3786 #endif
3787 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
3788 struct sljit_jump *jump2;
3789 #endif
3790
3791 SLJIT_UNUSED_ARG(backtracks);
3792 SLJIT_UNUSED_ARG(negated);
3793
3794 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0);
3795 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
3796
3797 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
3798 if (common->utf)
3799 {
3800 /* The result of this read may be unused, but saves an "else" part. */
3801 OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
3802 jump = CMP(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0x80);
3803
3804 if (!negated)
3805 {
/* Only two byte sequences can encode a character below 256, so exactly one
more byte is read. In invalid UTF mode that byte must exist, the first byte
must be 0xc2..0xdf and the second must be a continuation byte. */
3806 if (common->invalid_utf)
3807 add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
3808
3809 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
3810 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
3811 OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc2);
3812 if (common->invalid_utf)
3813 add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe0 - 0xc2));
3814
/* TMP2 = ((first - 0xc2) << 6) + second, which is the code point when second
is in 0x80..0xbf. TMP1 = second - 0x80 is the value that must be below 0x40. */
3815 OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
3816 OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0);
3817 OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x80);
3818 if (common->invalid_utf)
3819 add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x40));
3820
3821 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
3822 jump2 = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 255);
3823 OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
3824 JUMPHERE(jump2);
3825 }
3826 else if (common->invalid_utf)
3827 {
/* The validating decoder takes the first byte in TMP1, but TMP1 holds the
ctypes lookup result at this point; the byte itself is in TMP2. */
OP1(SLJIT_MOV, TMP1, 0, TMP2, 0);
3828 add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL));
3829 OP1(SLJIT_MOV, TMP2, 0, TMP1, 0);
3830 add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));
3831
/* TMP2 now holds the decoded character; look up its type if it is below 256. */
3832 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
3833 jump2 = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 255);
3834 OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
3835 JUMPHERE(jump2);
3836 }
3837 else
3838 add_jump(compiler, &common->utfreadtype8, JUMP(SLJIT_FAST_CALL));
3839
3840 JUMPHERE(jump);
3841 return;
3842 }
3843 #endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */
3844
3845 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 32
3846 if (common->invalid_utf && negated)
3847 add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x110000));
3848 #endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 32 */
3849
3850 #if PCRE2_CODE_UNIT_WIDTH != 8
3851 /* The ctypes array contains only 256 values. */
3852 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
3853 jump = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 255);
3854 #endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
3855 OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
3856 #if PCRE2_CODE_UNIT_WIDTH != 8
3857 JUMPHERE(jump);
3858 #endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
3859
3860 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 16
3861 if (common->utf && negated)
3862 {
3863 /* Skip low surrogate if necessary. */
3864 if (!common->invalid_utf)
3865 {
/* Valid UTF-16: a high surrogate (TMP2 - 0xd800 < 0x400) is always followed
by a low surrogate, so STR_PTR moves one more unit in that case. */
3866 OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xd800);
3867
3868 if (sljit_has_cpu_feature(SLJIT_HAS_CMOV) && sljit_get_register_index(RETURN_ADDR) >= 0)
3869 {
3870 OP2(SLJIT_ADD, RETURN_ADDR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
3871 OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0x400);
3872 CMOV(SLJIT_LESS, STR_PTR, RETURN_ADDR, 0);
3873 }
3874 else
3875 {
3876 jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x400);
3877 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
3878 JUMPHERE(jump);
3879 }
3880 return;
3881 }
3882
/* Invalid UTF mode: non-surrogates need nothing more. A leading low surrogate,
a high surrogate at the end of the subject, or a high surrogate that is not
followed by a low surrogate all jump to backtracks. */
3883 OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xd800);
3884 jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800);
3885 add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x400));
3886 add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
3887
3888 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
3889 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
3890 OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xdc00);
3891 add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x400));
3892
3893 JUMPHERE(jump);
3894 return;
3895 }
3896 #endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 16 */
3897 }
3898
3899 static void move_back(compiler_common *common, jump_list **backtracks, BOOL must_be_valid)
3900 {
3901 /* Goes one character back. TMP2 must contain the start of
3902 the subject buffer. Affects STR_PTR and TMP1. Does not modify
3903 STR_PTR for invalid character sequences.
The validating paths are used only when invalid UTF checking is enabled and
must_be_valid is not set; on those paths an invalid sequence jumps to
backtracks when that list is not NULL.
NOTE(review): the SLJIT_UNUSED_ARG statements below come before further
declarations, which relies on mixed declarations and code being accepted by
the compiler - confirm this is acceptable for all supported toolchains. */
3904 DEFINE_COMPILER;
3905
3906 SLJIT_UNUSED_ARG(backtracks);
3907 SLJIT_UNUSED_ARG(must_be_valid);
3908
3909 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3910 struct sljit_jump *jump;
3911 #endif
3912
3913 #ifdef SUPPORT_UNICODE
3914 #if PCRE2_CODE_UNIT_WIDTH == 8
3915 struct sljit_label *label;
3916
3917 if (common->utf)
3918 {
3919 if (!must_be_valid && common->invalid_utf)
3920 {
/* Step back one byte; bytes below 0x80 are done, anything else is handled by
the helper, which leaves zero in TMP1 for an invalid sequence. */
3921 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -IN_UCHARS(1));
3922 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
3923 jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x80);
3924 add_jump(compiler, &common->utfmoveback_invalid, JUMP(SLJIT_FAST_CALL));
3925 if (backtracks != NULL)
3926 add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, 0));
3927 JUMPHERE(jump);
3928 return;
3929 }
3930
/* Valid UTF-8: keep stepping back while the byte is a continuation byte
(top two bits are 10). */
3931 label = LABEL();
3932 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -IN_UCHARS(1));
3933 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
3934 OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc0);
3935 CMPTO(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, 0x80, label);
3936 return;
3937 }
3938 #elif PCRE2_CODE_UNIT_WIDTH == 16
3939 if (common->utf)
3940 {
3941 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -IN_UCHARS(1));
3942 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
3943
3944 if (!must_be_valid && common->invalid_utf)
3945 {
/* Units outside 0xd800..0xdfff are done; surrogates go through the helper,
which leaves zero in TMP1 for an invalid sequence. */
3946 OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800);
3947 jump = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xe000 - 0xd800);
3948 add_jump(compiler, &common->utfmoveback_invalid, JUMP(SLJIT_FAST_CALL));
3949 if (backtracks != NULL)
3950 add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, 0));
3951 JUMPHERE(jump);
3952 return;
3953 }
3954
3955 /* Skip low surrogate if necessary. */
/* Branch free: TMP1 becomes 1 when the unit is in 0xdc00..0xdfff, is scaled to
a byte offset, and is subtracted from STR_PTR. */
3956 OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00);
3957 OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xdc00);
3958 OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_EQUAL);
3959 OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, UCHAR_SHIFT);
3960 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
3961 return;
3962 }
3963 #elif PCRE2_CODE_UNIT_WIDTH == 32
3964 if (common->invalid_utf && !must_be_valid)
3965 {
/* Only values >= 0x110000 are rejected here. With a backtrack list they jump
there before STR_PTR is changed; without one STR_PTR is moved back only when
the value is below 0x110000. */
3966 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -IN_UCHARS(1));
3967 if (backtracks != NULL)
3968 {
3969 add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000));
3970 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
3971 return;
3972 }
3973
3974 OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x110000);
3975 OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_LESS);
3976 OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, UCHAR_SHIFT);
3977 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
3978 return;
3979 }
3980 #endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
3981 #endif /* SUPPORT_UNICODE */
/* Default: step back exactly one code unit. */
3982 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
3983 }
3984
3985 static void check_newlinechar(compiler_common *common, int nltype, jump_list **backtracks, BOOL jumpifmatch)
3986 {
3987 /* Character comes in TMP1. Checks if it is a newline. TMP2 may be destroyed.
The jumps queued on backtracks are taken when the character is a newline if
jumpifmatch is set, and when it is not a newline otherwise. nltype selects
the newline convention (NLTYPE_ANY, NLTYPE_ANYCRLF or NLTYPE_FIXED). */
3988 DEFINE_COMPILER;
3989 struct sljit_jump *jump;
3990
3991 if (nltype == NLTYPE_ANY)
3992 {
/* The shared anynewline routine reports its result in the zero flag. */
3993 add_jump(compiler, &common->anynewline, JUMP(SLJIT_FAST_CALL));
3994 sljit_set_current_flags(compiler, SLJIT_SET_Z);
3995 add_jump(compiler, backtracks, JUMP(jumpifmatch ? SLJIT_NOT_ZERO : SLJIT_ZERO));
3996 }
3997 else if (nltype == NLTYPE_ANYCRLF)
3998 {
3999 if (jumpifmatch)
4000 {
4001 add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR));
4002 add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_NL));
4003 }
4004 else
4005 {
/* CR skips the second test; anything that is not NL either is not a newline. */
4006 jump = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR);
4007 add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_NL));
4008 JUMPHERE(jump);
4009 }
4010 }
4011 else
4012 {
/* Fixed newline: a single compare against common->newline, which must be a
single code unit value below 256 (asserted). */
4013 SLJIT_ASSERT(nltype == NLTYPE_FIXED && common->newline < 256);
4014 add_jump(compiler, backtracks, CMP(jumpifmatch ? SLJIT_EQUAL : SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, common->newline));
4015 }
4016 }
4017
4018 #ifdef SUPPORT_UNICODE
4019
4020 #if PCRE2_CODE_UNIT_WIDTH == 8
4021 static void do_utfreadchar(compiler_common *common)
4022 {
4023 /* Fast decoding a UTF-8 character. TMP1 contains the first byte
4024 of the character (>= 0xc0). Return char value in TMP1.
This emits a routine entered with a fast call. On entry STR_PTR addresses the
second byte (it is read from offset 0); on return STR_PTR has been moved past
the last byte of the character. TMP2 is used as scratch. No validity checks
are made. */
4025 DEFINE_COMPILER;
4026 struct sljit_jump *jump;
4027
4028 sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
/* Merge the second byte: TMP1 = (first << 6) | (second & 0x3f). */
4029 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
4030 OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
4031 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
4032 OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
4033
4034 /* Searching for the first zero. */
/* 0x800 is bit 5 of the first byte after the shift: clear means 110xxxxx. */
4035 OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800);
4036 jump = JUMP(SLJIT_NOT_ZERO);
4037 /* Two byte sequence. */
/* 0x3000 is the 0xc0 prefix shifted left by 6; the XOR removes it. */
4038 OP2(SLJIT_XOR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3000);
4039 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
4040 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4041
4042 JUMPHERE(jump);
/* Merge the third byte. */
4043 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
4044 OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
4045 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
4046 OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
4047
/* 0x10000 is bit 4 of the first byte after two shifts: clear means 1110xxxx. */
4048 OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x10000);
4049 jump = JUMP(SLJIT_NOT_ZERO);
4050 /* Three byte sequence. */
/* 0xe0000 is the 0xe0 prefix shifted left by 12. */
4051 OP2(SLJIT_XOR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xe0000);
4052 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
4053 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4054
4055 /* Four byte sequence. */
/* The 0xf0 prefix (0xf0000 at this point) is removed before the last shift,
then the fourth byte is merged. */
4056 JUMPHERE(jump);
4057 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(2));
4058 OP2(SLJIT_XOR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xf0000);
4059 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3));
4060 OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
4061 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
4062 OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
4063 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4064 }
4065
4066 static void do_utfreadtype8(compiler_common *common)
4067 {
4068 /* Fast decoding a UTF-8 character type. TMP2 contains the first byte
4069 of the character (>= 0xc0). Return value in TMP1.
This emits a routine entered with a fast call. The returned type is the ctypes
entry for characters below 256 and zero for everything else. STR_PTR is moved
past the remaining bytes of the character. No validity checks are made. */
4070 DEFINE_COMPILER;
4071 struct sljit_jump *jump;
4072 struct sljit_jump *compare;
4073
4074 sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
4075
/* Bit 0x20 set in the first byte means the sequence is longer than two bytes. */
4076 OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0x20);
4077 jump = JUMP(SLJIT_NOT_ZERO);
4078 /* Two byte sequence. */
4079 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
4080 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
4081 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x1f);
4082 /* The upper 5 bits are known at this point. */
/* More than 3 in those bits puts the character above 255. */
4083 compare = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0x3);
4084 OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
4085 OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3f);
4086 OP2(SLJIT_OR, TMP2, 0, TMP2, 0, TMP1, 0);
4087 OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
4088 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4089
/* Two byte character above 255: type zero. */
4090 JUMPHERE(compare);
4091 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
4092 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4093
4094 /* We only have types for characters less than 256. */
/* Longer sequences: move STR_PTR by the utf8_table4 value for the first byte
and return type zero. */
4095 JUMPHERE(jump);
4096 OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(utf8_table4) - 0xc0);
4097 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
4098 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
4099 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4100 }
4101
4102 static void do_utfreadchar_invalid(compiler_common *common)
4103 {
4104 /* Slow decoding a UTF-8 character. TMP1 contains the first byte
4105 of the character (>= 0xc0). Return char value in TMP1. STR_PTR is
4106 undefined for invalid characters. */
4107 DEFINE_COMPILER;
4108 sljit_s32 i;
4109 sljit_s32 has_cmov = sljit_has_cpu_feature(SLJIT_HAS_CMOV);
4110 struct sljit_jump *jump;
4111 struct sljit_jump *buffer_end_close;
4112 struct sljit_label *three_byte_entry;
4113 struct sljit_label *exit_invalid_label;
4114 struct sljit_jump *exit_invalid[11];
4115
4116 sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
4117
4118 OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc2);
4119
4120 /* Usually more than 3 characters remained in the subject buffer. */
4121 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3));
4122
4123 /* Not a valid start of a multi-byte sequence, no more bytes read. */
4124 exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xf5 - 0xc2);
4125
4126 buffer_end_close = CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0);
4127
4128 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-3));
4129 OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
4130 /* If TMP2 is in 0x80-0xbf range, TMP1 is also increased by (0x2 << 6). */
4131 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
4132 OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
4133 exit_invalid[1] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
4134
4135 OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800);
4136 jump = JUMP(SLJIT_NOT_ZERO);
4137
4138 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
4139 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4140
4141 JUMPHERE(jump);
4142
4143 /* Three-byte sequence. */
4144 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
4145 OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
4146 OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
4147 OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
4148 if (has_cmov)
4149 {
4150 OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0x40);
4151 CMOV(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, 0x20000);
4152 exit_invalid[2] = NULL;
4153 }
4154 else
4155 exit_invalid[2] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
4156
4157 OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x10000);
4158 jump = JUMP(SLJIT_NOT_ZERO);
4159
4160 three_byte_entry = LABEL();
4161
4162 OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x2d800);
4163 if (has_cmov)
4164 {
4165 OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800);
4166 CMOV(SLJIT_LESS, TMP1, SLJIT_IMM, INVALID_UTF_CHAR - 0xd800);
4167 exit_invalid[3] = NULL;
4168 }
4169 else
4170 exit_invalid[3] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800);
4171 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800);
4172 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
4173
4174 if (has_cmov)
4175 {
4176 OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800);
4177 CMOV(SLJIT_LESS, TMP1, SLJIT_IMM, INVALID_UTF_CHAR);
4178 exit_invalid[4] = NULL;
4179 }
4180 else
4181 exit_invalid[4] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800);
4182 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4183
4184 JUMPHERE(jump);
4185
4186 /* Four-byte sequence. */
4187 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
4188 OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
4189 OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
4190 OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
4191 if (has_cmov)
4192 {
4193 OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0x40);
4194 CMOV(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, 0);
4195 exit_invalid[5] = NULL;
4196 }
4197 else
4198 exit_invalid[5] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
4199
4200 OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc10000);
4201 if (has_cmov)
4202 {
4203 OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x100000);
4204 CMOV(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, INVALID_UTF_CHAR - 0x10000);
4205 exit_invalid[6] = NULL;
4206 }
4207 else
4208 exit_invalid[6] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x100000);
4209
4210 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000);
4211 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4212
4213 JUMPHERE(buffer_end_close);
4214 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
4215 exit_invalid[7] = CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0);
4216
4217 /* Two-byte sequence. */
4218 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
4219 OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
4220 /* If TMP2 is in 0x80-0xbf range, TMP1 is also increased by (0x2 << 6). */
4221 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
4222 OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
4223 exit_invalid[8] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
4224
4225 OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800);
4226 jump = JUMP(SLJIT_NOT_ZERO);
4227
4228 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4229
4230 /* Three-byte sequence. */
4231 JUMPHERE(jump);
4232 exit_invalid[9] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
4233
4234 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
4235 OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
4236 OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
4237 OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
4238 if (has_cmov)
4239 {
4240 OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0x40);
4241 CMOV(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, INVALID_UTF_CHAR);
4242 exit_invalid[10] = NULL;
4243 }
4244 else
4245 exit_invalid[10] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
4246
4247 /* One will be subtracted from STR_PTR later. */
4248 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
4249
4250 /* Four byte sequences are not possible. */
4251 CMPTO(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x30000, three_byte_entry);
4252
4253 exit_invalid_label = LABEL();
4254 for (i = 0; i < 11; i++)
4255 sljit_set_label(exit_invalid[i], exit_invalid_label);
4256
4257 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR);
4258 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4259 }
4260
4261 static void do_utfreadnewline_invalid(compiler_common *common)
4262 {
4263 /* Slow decoding a UTF-8 character, specialized for newlines.
4264 TMP1 contains the first byte of the character (>= 0xc0). Return
4265 char value in TMP1. */
4266 DEFINE_COMPILER;
4267 struct sljit_label *loop;
4268 struct sljit_label *skip_start;
4269 struct sljit_label *three_byte_exit;
4270 struct sljit_jump *jump[5];
4271
4272 sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
4273
4274 if (common->nltype != NLTYPE_ANY)
4275 {
4276 SLJIT_ASSERT(common->nltype != NLTYPE_FIXED || common->newline < 128);
4277
4278 /* All newlines are ascii, just skip intermediate octets. */
4279 jump[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
4280 loop = LABEL();
4281 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
4282 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc0);
4283 CMPTO(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, 0x80, loop);
4284 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
4285
4286 JUMPHERE(jump[0]);
4287
4288 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR);
4289 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4290 return;
4291 }
4292
4293 jump[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
4294 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
4295 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
4296
4297 jump[1] = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, 0xc2);
4298 jump[2] = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, 0xe2);
4299
4300 skip_start = LABEL();
4301 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc0);
4302 jump[3] = CMP(SLJIT_NOT_EQUAL, TMP2, 0, SLJIT_IMM, 0x80);
4303
4304 /* Skip intermediate octets. */
4305 loop = LABEL();
4306 jump[4] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
4307 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
4308 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
4309 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc0);
4310 CMPTO(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, 0x80, loop);
4311
4312 JUMPHERE(jump[3]);
4313 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
4314
4315 three_byte_exit = LABEL();
4316 JUMPHERE(jump[0]);
4317 JUMPHERE(jump[4]);
4318
4319 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR);
4320 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4321
4322 /* Two byte long newline: 0x85. */
4323 JUMPHERE(jump[1]);
4324 CMPTO(SLJIT_NOT_EQUAL, TMP2, 0, SLJIT_IMM, 0x85, skip_start);
4325
4326 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0x85);
4327 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4328
4329 /* Three byte long newlines: 0x2028 and 0x2029. */
4330 JUMPHERE(jump[2]);
4331 CMPTO(SLJIT_NOT_EQUAL, TMP2, 0, SLJIT_IMM, 0x80, skip_start);
4332 CMPTO(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0, three_byte_exit);
4333
4334 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
4335 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
4336
4337 OP2(SLJIT_SUB, TMP1, 0, TMP2, 0, SLJIT_IMM, 0x80);
4338 CMPTO(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x40, skip_start);
4339
4340 OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 0x2000);
4341 OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
4342 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4343 }
4344
4345 static void do_utfmoveback_invalid(compiler_common *common)
4346 {
4347 /* Goes one character back (UTF-8 variant that tolerates invalid input).
The emitted routine looks at up to three code units below the entry value
of STR_PTR for the lead byte of a 2-, 3- or 4-byte sequence.
Found: TMP1 is set to 1 and STR_PTR is left on the lead byte.
Not found: TMP1 is set to 0 and STR_PTR is left one code unit above its
entry value.
NOTE(review): TMP1 presumably holds the code unit the caller has already
read and TMP2 the start of the subject (it is only used here as the lowest
address that may be read) -- confirm against the caller. */
4348 DEFINE_COMPILER;
4349 sljit_s32 i;
4350 struct sljit_jump *jump;
4351 struct sljit_jump *buffer_start_close;
4352 struct sljit_label *exit_ok_label;
4353 struct sljit_label *exit_invalid_label;
4354 struct sljit_jump *exit_invalid[7];
4355
4356 sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
4357
/* Bias STR_PTR by -3 so the candidate lead bytes sit at offsets 2, 1 and 0. */
4358 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3));
/* The entry value of TMP1 must be below 0xc0, otherwise give up. */
4359 exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xc0);
4360
4361 /* Two-byte sequence. */
/* The biased STR_PTR is below TMP2: not all three offsets may be read, so
use the path further down that re-checks the bound before every load. */
4362 buffer_start_close = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0);
4363
4364 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(2));
4365
/* Lead bytes 0xc0-0xdf map to 0x00-0x1f. */
4366 OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc0);
4367 jump = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x20);
4368
4369 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 1);
4370 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
4371 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4372
4373 /* Three-byte sequence. */
4374 JUMPHERE(jump);
/* TMP1 is (byte - 0xc0) here. Only -0x40..-1, that is a continuation byte
0x80-0xbf, may continue. NOTE(review): this relies on SLJIT_LESS being an
unsigned comparison -- see sljitLir.h. */
4375 exit_invalid[1] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, -0x40);
4376
4377 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
4378
/* Lead bytes 0xe0-0xef map to 0x00-0x0f. */
4379 OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xe0);
4380 jump = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x10);
4381
4382 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 1);
4383 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
4384 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4385
4386 /* Four-byte sequence. */
4387 JUMPHERE(jump);
/* Rebase TMP1 to (byte - 0x80): the byte at offset 1 has to be a
continuation byte (0x80-0xbf). */
4388 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xe0 - 0x80);
4389 exit_invalid[2] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x40);
4390
/* The byte at offset 0 has to be a four-byte lead in the range 0xf0-0xf4. */
4391 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
4392 OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xf0);
4393 exit_invalid[3] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x05);
4394
/* Shared success exit: STR_PTR already points to the lead byte. */
4395 exit_ok_label = LABEL();
4396 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 1);
4397 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4398
4399 /* Two-byte sequence. */
/* Bounds-limited path: STR_PTR is moved step by step and compared with
TMP2 before each load. */
4400 JUMPHERE(buffer_start_close);
4401 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
4402
4403 exit_invalid[4] = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0);
4404
4405 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
4406
4407 OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc0);
4408 CMPTO(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x20, exit_ok_label);
4409
4410 /* Three-byte sequence. */
4411 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
/* Same continuation byte test as in the unrestricted path above. */
4412 exit_invalid[5] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, -0x40);
4413 exit_invalid[6] = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0);
4414
4415 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
4416
4417 OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xe0);
4418 CMPTO(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x10, exit_ok_label);
4419
4420 /* Four-byte sequences are not possible. */
4421
/* Failure exit for the bounds-limited path: STR_PTR is two code units below
its entry value here, so adding three leaves it one above the entry value. */
4422 exit_invalid_label = LABEL();
4423 sljit_set_label(exit_invalid[5], exit_invalid_label);
4424 sljit_set_label(exit_invalid[6], exit_invalid_label);
4425 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
4426 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3));
4427 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4428
4429 JUMPHERE(exit_invalid[4]);
4430 /* -2 + 4 = 2 */
4431 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
4432
/* Failure exit for the unrestricted path: STR_PTR still carries the -3
bias, so adding four leaves it one above the entry value. */
4433 exit_invalid_label = LABEL();
4434 for (i = 0; i < 4; i++)
4435 sljit_set_label(exit_invalid[i], exit_invalid_label);
4436 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
4437 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(4));
4438 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4439 }
4440
4441 static void do_utfpeakcharback(compiler_common *common)
4442 {
4443 /* Peek a character back (UTF-8, no validation is emitted).
Decodes the 2-, 3- or 4-byte character that ends just before STR_PTR into
TMP1. TMP2 is used as scratch and STR_PTR is only read.
NOTE(review): the caller presumably handles the single-byte case and
guarantees valid UTF-8, because no bounds or validity checks are made
here -- confirm against the caller. */
4444 DEFINE_COMPILER;
4445 struct sljit_jump *jump[2];
4446
4447 sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
4448
/* Is the code unit at -2 a two-byte lead (0xc0-0xdf)? If so TMP1 already
holds its payload bits. */
4449 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
4450 OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc0);
4451 jump[0] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x20);
4452
/* Is the code unit at -3 a three-byte lead (0xe0-0xef)? */
4453 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-3));
4454 OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xe0);
4455 jump[1] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x10);
4456
/* Otherwise a four-byte character: rebase TMP1 to (byte at -3) - 0x80 and
put the payload of the lead byte at -4 above it. */
4457 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-4));
4458 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xe0 - 0x80);
4459 OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf0);
4460 OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
4461 OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
4462
/* Append the six payload bits of the code unit at -2. The four-byte case
falls through to here. */
4463 JUMPHERE(jump[1]);
4464 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
4465 OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
4466 OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
4467 OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
4468
/* Append the six payload bits of the last code unit at -1. All cases end
here. */
4469 JUMPHERE(jump[0]);
4470 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
4471 OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
4472 OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
4473 OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
4474
4475 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4476 }
4477
4478 static void do_utfpeakcharback_invalid(compiler_common *common)
4479 {
4480 /* Peek a character back (UTF-8 variant that tolerates invalid input).
Decodes the multi-byte character that ends just before STR_PTR and returns
it in TMP1; TMP1 is set to INVALID_UTF_CHAR when one of the checks emitted
here fails. STR_PTR is only read. TMP2 is used as the lowest readable
address on entry and as scratch afterwards.
NOTE(review): TMP1 presumably holds the last code unit (already read by
the caller) and TMP2 the start of the subject -- confirm against the
caller. */
4481 DEFINE_COMPILER;
4482 sljit_s32 i;
4483 sljit_s32 has_cmov = sljit_has_cpu_feature(SLJIT_HAS_CMOV);
4484 struct sljit_jump *jump[2];
4485 struct sljit_label *two_byte_entry;
4486 struct sljit_label *three_byte_entry;
4487 struct sljit_label *exit_invalid_label;
4488 struct sljit_jump *exit_invalid[8];
4489
4490 sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
4491
/* jump[0] is taken when TMP2 + 3 >= STR_PTR, that is when fewer than four
code units lie between TMP2 and STR_PTR; the bounds-limited paths near the
end of the function are used in that case. */
4492 OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(3));
/* The entry value of TMP1 must be below 0xc0, otherwise give up. */
4493 exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xc0);
4494 jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, STR_PTR, 0);
4495
4496 /* Two-byte sequence. */
/* Accepted lead bytes are 0xc2-0xdf (they map to 0x00-0x1d). */
4497 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
4498 OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc2);
4499 jump[1] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x1e);
4500
/* Also entered from the bounds-limited paths with TMP2 = lead - 0xc2. */
4501 two_byte_entry = LABEL();
4502 OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
4503 /* If TMP1 is in 0x80-0xbf range, TMP1 is also increased by (0x2 << 6). */
4504 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
4505 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4506
4507 JUMPHERE(jump[1]);
/* Not a two-byte lead: the code unit at -2 must then be a continuation
byte. Rebase TMP2 to (byte - 0x80) and strip the 0x80 marker from TMP1. */
4508 OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc2 - 0x80);
4509 OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x80);
4510 exit_invalid[1] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
4511 OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
4512 OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
4513
4514 /* Three-byte sequence. */
4515 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-3));
4516 OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xe0);
4517 jump[1] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x10);
4518
/* Also entered from the bounds-limited path with TMP2 = lead - 0xe0. */
4519 three_byte_entry = LABEL();
4520 OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 12);
4521 OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
4522
/* Reject 0xd800-0xdfff. With CMOV the failure is folded into the value:
-0xd800 becomes 0 after the ADD below and is caught by the next test. In
that case exit_invalid[n] stays NULL. */
4523 OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800);
4524 if (has_cmov)
4525 {
4526 OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800);
4527 CMOV(SLJIT_LESS, TMP1, SLJIT_IMM, -0xd800);
4528 exit_invalid[2] = NULL;
4529 }
4530 else
4531 exit_invalid[2] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800);
4532
/* Reject values below 0x800, which do not need three bytes. */
4533 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800);
4534 if (has_cmov)
4535 {
4536 OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800);
4537 CMOV(SLJIT_LESS, TMP1, SLJIT_IMM, INVALID_UTF_CHAR);
4538 exit_invalid[3] = NULL;
4539 }
4540 else
4541 exit_invalid[3] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800);
4542
4543 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4544
4545 JUMPHERE(jump[1]);
/* Not a three-byte lead: the code unit at -3 must be a continuation byte. */
4546 OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xe0 - 0x80);
4547 exit_invalid[4] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
4548 OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 12);
4549 OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
4550
4551 /* Four-byte sequence. */
4552 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-4));
4553 OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000);
4554 OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf0);
4555 OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 18);
4556 /* ADD is used instead of OR because of the SUB 0x10000 above. */
4557 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
4558
/* TMP1 is (value - 0x10000) here and must be below 0x100000. */
4559 if (has_cmov)
4560 {
4561 OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x100000);
4562 CMOV(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, INVALID_UTF_CHAR - 0x10000);
4563 exit_invalid[5] = NULL;
4564 }
4565 else
4566 exit_invalid[5] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x100000);
4567
4568 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000);
4569 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4570
/* Bounds-limited path. TMP2 still holds its entry value plus 3; after the
SUB it holds the entry value plus 2. The second jump[0] is taken when at
most two code units lie between the entry TMP2 and STR_PTR. */
4571 JUMPHERE(jump[0]);
4572 OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(1));
4573 jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, STR_PTR, 0);
4574
4575 /* Two-byte sequence. */
4576 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
4577 OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc2);
4578 CMPTO(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0x1e, two_byte_entry);
4579
4580 OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc2 - 0x80);
4581 OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x80);
4582 exit_invalid[6] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
4583 OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
4584 OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
4585
4586 /* Three-byte sequence. */
4587 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-3));
4588 OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xe0);
4589 CMPTO(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0x10, three_byte_entry);
4590
/* Only three code units are readable, so a four-byte sequence cannot end
here. */
4591 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR);
4592 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4593
4594 JUMPHERE(jump[0]);
/* TMP2 (entry value plus 2) above STR_PTR: fewer than two code units are
readable, nothing can be decoded. */
4595 exit_invalid[7] = CMP(SLJIT_GREATER, TMP2, 0, STR_PTR, 0);
4596
4597 /* Two-byte sequence. */
4598 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
4599 OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc2);
4600 CMPTO(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0x1e, two_byte_entry);
4601
/* Common failure exit. Entries left NULL by the CMOV variants are passed
to sljit_set_label as well. NOTE(review): presumably sljit_set_label
ignores a NULL jump -- confirm in sljitLir.c. */
4602 exit_invalid_label = LABEL();
4603 for (i = 0; i < 8; i++)
4604 sljit_set_label(exit_invalid[i], exit_invalid_label);
4605
4606 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR);
4607 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4608 }
4609
4610 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
4611
4612 #if PCRE2_CODE_UNIT_WIDTH == 16
4613
4614 static void do_utfreadchar_invalid(compiler_common *common)
4615 {
4616 /* Slow decoding a UTF-16 character. TMP1 contains the first half
4617 of the character (>= 0xd800). Return char value in TMP1. STR_PTR is
4618 undefined for invalid characters. INVALID_UTF_CHAR is returned when the
first half is 0xdc00 or above, when no code unit is left before STR_END,
or when the second half is outside 0xdc00-0xdfff. */
4619 DEFINE_COMPILER;
4620 struct sljit_jump *exit_invalid[3];
4621
4622 sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
4623
4624 /* TMP2 contains the high surrogate. */
/* NOTE(review): TMP2 is shifted left by 10 and 0x10000 is added below, so
it presumably arrives with 0xd800 already subtracted -- confirm against
the caller. */
4625 exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xdc00);
4626 exit_invalid[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
4627
/* Read the second half and step over it. */
4628 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
4629 OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 10);
4630 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
4631
/* The second half must be in 0xdc00-0xdfff; TMP1 keeps its low ten bits. */
4632 OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xdc00);
4633 OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x10000);
4634 exit_invalid[2] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x400);
4635
4636 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
4637 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4638
4639 JUMPHERE(exit_invalid[0]);
4640 JUMPHERE(exit_invalid[1]);
4641 JUMPHERE(exit_invalid[2]);
4642 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR);
4643 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4644 }
4645
4646 static void do_utfreadnewline_invalid(compiler_common *common)
4647 {
4648 /* Slow decoding a UTF-16 character, specialized for newlines.
4649 TMP1 contains the first half of the character (>= 0xd800). Return
4650 char value in TMP1. The value is not decoded: the routine returns the
constant 0x10000 when another code unit is available and the first half
is below 0xdc00, and INVALID_UTF_CHAR otherwise. STR_PTR is advanced by one
code unit only when the next code unit is in 0xdc00-0xdfff.
NOTE(review): presumably any non-newline value is sufficient for the
callers of this routine -- confirm. */
4651
4652 DEFINE_COMPILER;
4653 struct sljit_jump *exit_invalid[2];
4654
4655 sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
4656
4657 /* TMP2 contains the high surrogate. */
/* TMP2 is overwritten by the load below before it is read. */
4658 exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
4659
4660 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
4661 exit_invalid[1] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xdc00);
4662
/* TMP2 becomes 1 when the next code unit is in 0xdc00-0xdfff and 0
otherwise; scaled to a byte count it is added to STR_PTR without a branch. */
4663 OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xdc00);
4664 OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0x400);
4665 OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS);
4666 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0x10000);
4667 OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, UCHAR_SHIFT);
4668 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
4669
4670 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4671
4672 JUMPHERE(exit_invalid[0]);
4673 JUMPHERE(exit_invalid[1]);
4674 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR);
4675 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4676 }
4677
4678 static void do_utfmoveback_invalid(compiler_common *common)
4679 {
4680 /* Goes one character back (UTF-16 variant that tolerates invalid input).
Success: the code unit just below STR_PTR is in 0xd800-0xdbff; STR_PTR is
moved back over it and TMP1 is set to 1.
Failure: STR_PTR is moved forward by one code unit and TMP1 is set to 0.
NOTE(review): on entry TMP1 is presumably the code unit already stepped
over by the caller with 0xd800 subtracted (values below 0x400 are rejected
here), and TMP2 the start of the subject -- confirm against the caller. */
4681 DEFINE_COMPILER;
4682 struct sljit_jump *exit_invalid[3];
4683
4684 sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
4685
4686 exit_invalid[0] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x400);
/* Nothing can be read below STR_PTR when it is not above TMP2. */
4687 exit_invalid[1] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, STR_PTR, 0);
4688
/* The previous code unit has to be in 0xd800-0xdbff. */
4689 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
4690 OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800);
4691 exit_invalid[2] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x400);
4692
4693 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
4694 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 1);
4695 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4696
4697 JUMPHERE(exit_invalid[0]);
4698 JUMPHERE(exit_invalid[1]);
4699 JUMPHERE(exit_invalid[2]);
4700
4701 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
4702 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
4703 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4704 }
4705
4706 static void do_utfpeakcharback_invalid(compiler_common *common)
4707 {
4708 /* Peek a character back (UTF-16 variant that tolerates invalid input).
TMP1 is returned unchanged when it is 0xe000 or above. Otherwise it has to
be in 0xdc00-0xdfff and the code unit at offset -2 has to be in
0xd800-0xdbff; the combined character is then returned in TMP1. In every
other case TMP1 is set to INVALID_UTF_CHAR. STR_PTR is only read and TMP2
is clobbered.
NOTE(review): TMP1 presumably holds the code unit just below STR_PTR
(already read by the caller) and TMP2 the start of the subject --
confirm against the caller. */
4709 DEFINE_COMPILER;
4710 struct sljit_jump *jump;
4711 struct sljit_jump *exit_invalid[3];
4712
4713 sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
4714
4715 jump = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xe000);
/* Two code units are needed below STR_PTR: fail unless TMP2 plus one code
unit is still below STR_PTR. */
4716 OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(1));
4717 exit_invalid[0] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xdc00);
4718 exit_invalid[1] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, STR_PTR, 0);
4719
/* TMP1 becomes 0x10000 plus the low ten bits; the other half contributes
its low ten bits shifted left by 10. */
4720 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
4721 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000 - 0xdc00);
4722 OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xd800);
4723 exit_invalid[2] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x400);
4724 OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 10);
4725 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
4726
4727 JUMPHERE(jump);
4728 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4729
4730 JUMPHERE(exit_invalid[0]);
4731 JUMPHERE(exit_invalid[1]);
4732 JUMPHERE(exit_invalid[2]);
4733
4734 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR);
4735 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4736 }
4737
4738 #endif /* PCRE2_CODE_UNIT_WIDTH == 16 */
4739
4740 /* UCD_BLOCK_SIZE must be 128 (see the assert below). */
/* UCD_BLOCK_MASK is 128 - 1 and UCD_BLOCK_SHIFT is log2(128). The lookup
code below shifts a character right by UCD_BLOCK_SHIFT to index ucd_stage1
and masks it with UCD_BLOCK_MASK to get the position inside the block. */
4741 #define UCD_BLOCK_MASK 127
4742 #define UCD_BLOCK_SHIFT 7
4743
4744 static void do_getucd(compiler_common *common)
4745 {
4746 /* Search the UCD record for the character comes in TMP1.
4747 Returns the 16 bit value read from ucd_stage2 (the index of the UCD
record) in TMP2. No chartype is loaded by this routine: on return TMP1
holds the position that was used for the ucd_stage2 lookup. */
4748 DEFINE_COMPILER;
4749 #if PCRE2_CODE_UNIT_WIDTH == 32
4750 struct sljit_jump *jump;
4751 #endif
4752
4753 #if defined SLJIT_DEBUG && SLJIT_DEBUG
4754 /* dummy_ucd_record */
4755 const ucd_record *record = GET_UCD(UNASSIGNED_UTF_CHAR);
4756 SLJIT_ASSERT(record->script == ucp_Unknown && record->chartype == ucp_Cn && record->gbprop == ucp_gbOther);
4757 SLJIT_ASSERT(record->caseset == 0 && record->other_case == 0);
4758 #endif
4759
4760 SLJIT_ASSERT(UCD_BLOCK_SIZE == 128 && sizeof(ucd_record) == 12);
4761
4762 sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
4763
4764 #if PCRE2_CODE_UNIT_WIDTH == 32
4765 if (!common->utf)
4766 {
/* Values above MAX_UTF_CODE_POINT are replaced by UNASSIGNED_UTF_CHAR
before the table lookup. */
4767 jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, MAX_UTF_CODE_POINT + 1);
4768 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, UNASSIGNED_UTF_CHAR);
4769 JUMPHERE(jump);
4770 }
4771 #endif
4772
/* Stage 1: TMP2 = ucd_stage1[TMP1 >> UCD_BLOCK_SHIFT]. The shift by one
turns the block number into a byte offset for the 16 bit load. */
4773 OP2(SLJIT_LSHR, TMP2, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_SHIFT);
4774 OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 1);
4775 OP1(SLJIT_MOV_U16, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_stage1));
/* Stage 2 position: (stage 1 value << UCD_BLOCK_SHIFT) + (TMP1 & UCD_BLOCK_MASK). */
4776 OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_MASK);
4777 OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, UCD_BLOCK_SHIFT);
4778 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
/* NOTE(review): the final argument 1 of SLJIT_MEM2 is presumably the shift
applied to TMP1, giving a 16 bit element access -- see sljitLir.h. */
4779 OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_stage2));
4780 OP1(SLJIT_MOV_U16, TMP2, 0, SLJIT_MEM2(TMP2, TMP1), 1);
4781 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4782 }
4783
4784 static void do_getucdtype(compiler_common *common)
4785 {
4786 /* Search the UCD record for the character comes in TMP1.
4787 Returns chartype in TMP1 and UCD offset in TMP2. */
4788 DEFINE_COMPILER;
4789 #if PCRE2_CODE_UNIT_WIDTH == 32
4790 struct sljit_jump *jump;
4791 #endif
4792
4793 #if defined SLJIT_DEBUG && SLJIT_DEBUG
4794 /* dummy_ucd_record */
4795 const ucd_record *record = GET_UCD(UNASSIGNED_UTF_CHAR);
4796 SLJIT_ASSERT(record->script == ucp_Unknown && record->chartype == ucp_Cn && record->gbprop == ucp_gbOther);
4797 SLJIT_ASSERT(record->caseset == 0 && record->other_case == 0);
4798 #endif
4799
4800 SLJIT_ASSERT(UCD_BLOCK_SIZE == 128 && sizeof(ucd_record) == 12);
4801
4802 sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
4803
4804 #if PCRE2_CODE_UNIT_WIDTH == 32
4805 if (!common->utf)
4806 {
4807 jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, MAX_UTF_CODE_POINT + 1);
4808 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, UNASSIGNED_UTF_CHAR);
4809 JUMPHERE(jump);
4810 }
4811 #endif
4812
4813 OP2(SLJIT_LSHR, TMP2, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_SHIFT);
4814 OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 1);
4815 OP1(SLJIT_MOV_U16, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_stage1));
4816 OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_MASK);
4817 OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, UCD_BLOCK_SHIFT);
4818 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
4819 OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_stage2));
4820 OP1(SLJIT_MOV_U16, TMP2, 0, SLJIT_MEM2(TMP2, TMP1), 1);
4821
4822 // PH hacking
4823 //fprintf(stderr, "~~A\n");
4824 OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 2);
4825 OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3);
4826 OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0);
4827 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype));
4828
4829 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype));
4830
4831 OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 0);
4832
4833 // OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3);
4834 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
4835 }
4836
4837 #endif /* SUPPORT_UNICODE */
4838
4839 static SLJIT_INLINE struct sljit_label *mainloop_entry(compiler_common *common)
4840 {
/* Emits the code that runs before the first match attempt, followed by the
code that moves STR_PTR to the next starting position. The returned label
(mainloop) marks the code that advances STR_PTR by one character; the very
first attempt jumps over that code (see the start jump below).
Depending on the options an end position is also stored into the stack
slot common->match_end_ptr:
- PCRE2_FIRSTLINE: the end of the first line;
- PCRE2_USE_OFFSET_LIMIT: begin + offset_limit capped at STR_END, or
  STR_END when the limit is PCRE2_UNSET. */
4841 DEFINE_COMPILER;
4842 struct sljit_label *mainloop;
4843 struct sljit_label *newlinelabel = NULL;
4844 struct sljit_jump *start;
4845 struct sljit_jump *end = NULL;
4846 struct sljit_jump *end2 = NULL;
4847 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
4848 struct sljit_label *loop;
4849 struct sljit_jump *jump;
4850 #endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 */
4851 jump_list *newline = NULL;
4852 sljit_u32 overall_options = common->re->overall_options;
4853 BOOL hascrorlf = (common->re->flags & PCRE2_HASCRORLF) != 0;
4854 BOOL newlinecheck = FALSE;
4855 BOOL readuchar = FALSE;
4856
/* The extra newline handling in the main loop is only emitted when neither
PCRE2_HASCRORLF nor PCRE2_FIRSTLINE is set and the newline type is ANY,
ANYCRLF or a fixed value above 255. */
4857 if (!(hascrorlf || (overall_options & PCRE2_FIRSTLINE) != 0)
4858 && (common->nltype == NLTYPE_ANY || common->nltype == NLTYPE_ANYCRLF || common->newline > 255))
4859 newlinecheck = TRUE;
4860
4861 SLJIT_ASSERT(common->abort_label == NULL);
4862
4863 if ((overall_options & PCRE2_FIRSTLINE) != 0)
4864 {
4865 /* Search for the end of the first line. */
4866 SLJIT_ASSERT(common->match_end_ptr != 0);
/* STR_PTR is used for the scan; TMP3 keeps its value for the restore below. */
4867 OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);
4868
4869 if (common->nltype == NLTYPE_FIXED && common->newline > 255)
4870 {
/* Fixed newline above 255: compare two neighbouring code units with the
high and low byte of common->newline. The stored position is STR_PTR minus
one code unit. */
4871 mainloop = LABEL();
4872 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
4873 end = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
4874 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
4875 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
4876 CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff, mainloop);
4877 CMPTO(SLJIT_NOT_EQUAL, TMP2, 0, SLJIT_IMM, common->newline & 0xff, mainloop);
4878 JUMPHERE(end);
4879 OP2(SLJIT_SUB, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
4880 }
4881 else
4882 {
4883 end = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
4884 mainloop = LABEL();
4885 /* Continual stores does not cause data dependency. */
4886 OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr, STR_PTR, 0);
4887 read_char(common, common->nlmin, common->nlmax, NULL, READ_CHAR_NEWLINE);
4888 check_newlinechar(common, common->nltype, &newline, TRUE);
4889 CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, mainloop);
4890 JUMPHERE(end);
/* No newline was found before STR_END: store the current STR_PTR. The jumps
collected in the newline list land after this store, so the position stored
inside the loop is kept for them. */
4891 OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr, STR_PTR, 0);
4892 set_jumps(newline, LABEL());
4893 }
4894
4895 OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);
4896 }
4897 else if ((overall_options & PCRE2_USE_OFFSET_LIMIT) != 0)
4898 {
4899 /* Check whether offset limit is set and valid. */
4900 SLJIT_ASSERT(common->match_end_ptr != 0);
4901
/* TMP1 = offset_limit. TMP2 is preset to STR_END, which is the value stored
when the limit is PCRE2_UNSET. */
4902 OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0);
4903 OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, offset_limit));
4904 OP1(SLJIT_MOV, TMP2, 0, STR_END, 0);
4905 end = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, (sljit_sw) PCRE2_UNSET);
4906 OP1(SLJIT_MOV, TMP2, 0, ARGUMENTS, 0);
4907 #if PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32
4908 OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, UCHAR_SHIFT);
4909 #endif /* PCRE2_CODE_UNIT_WIDTH == [16|32] */
/* TMP2 = begin + limit, replaced by STR_END when it lies beyond STR_END. */
4910 OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, begin));
4911 OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0);
4912 end2 = CMP(SLJIT_LESS_EQUAL, TMP2, 0, STR_END, 0);
4913 OP1(SLJIT_MOV, TMP2, 0, STR_END, 0);
4914 JUMPHERE(end2);
/* A limit below the current STR_PTR aborts with PCRE2_ERROR_NOMATCH. */
4915 OP1(SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_IMM, PCRE2_ERROR_NOMATCH);
4916 add_jump(compiler, &common->abort, CMP(SLJIT_LESS, TMP2, 0, STR_PTR, 0));
4917 JUMPHERE(end);
4918 OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr, TMP2, 0);
4919 }
4920
/* The first attempt skips the advancing code emitted below; this jump is
bound at the end of the function. */
4921 start = JUMP(SLJIT_JUMP);
4922
4923 if (newlinecheck)
4924 {
/* Reached from the main loop when the current code unit equals the high
byte of common->newline: step over it and, when the next code unit equals
the low byte, step over that one as well (branch free, using the flag value
0 or 1 scaled to a byte count). */
4925 newlinelabel = LABEL();
4926 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
4927 end = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
4928 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
4929 OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, common->newline & 0xff);
4930 OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_EQUAL);
4931 #if PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32
4932 OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, UCHAR_SHIFT);
4933 #endif /* PCRE2_CODE_UNIT_WIDTH == [16|32] */
4934 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
4935 end2 = JUMP(SLJIT_JUMP);
4936 }
4937
/* This is the label returned to the caller. */
4938 mainloop = LABEL();
4939
4940 /* Increasing the STR_PTR here requires one less jump in the most common case. */
/* The current code unit is loaded into TMP1 only when a later step reads
it: valid UTF in 8 or 16 bit mode, or the newline check. */
4941 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
4942 if (common->utf && !common->invalid_utf) readuchar = TRUE;
4943 #endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 */
4944 if (newlinecheck) readuchar = TRUE;
4945
4946 if (readuchar)
4947 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
4948
4949 if (newlinecheck)
4950 CMPTO(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff, newlinelabel);
4951
4952 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
4953 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
4954 #if PCRE2_CODE_UNIT_WIDTH == 8
4955 if (common->invalid_utf)
4956 {
4957 /* Skip continuation code units. */
/* Code units in 0x80-0xbf are skipped until another value or STR_END is
reached; the first other code unit is given back by the final SUB. */
4958 loop = LABEL();
4959 jump = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
4960 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
4961 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
4962 OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x80);
4963 CMPTO(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x40, loop);
4964 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
4965 JUMPHERE(jump);
4966 }
4967 else if (common->utf)
4968 {
/* For a first byte of 0xc0 or above the value read from utf8_table4 at
index (byte - 0xc0) is added to STR_PTR. NOTE(review): presumably the
number of additional code units of the character -- confirm. */
4969 jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
4970 OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0);
4971 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
4972 JUMPHERE(jump);
4973 }
4974 #elif PCRE2_CODE_UNIT_WIDTH == 16
4975 if (common->invalid_utf)
4976 {
4977 /* Skip continuation code units. */
/* Same scheme as the 8 bit loop, for code units in 0xdc00-0xdfff. */
4978 loop = LABEL();
4979 jump = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
4980 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
4981 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
4982 OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xdc00);
4983 CMPTO(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x400, loop);
4984 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
4985 JUMPHERE(jump);
4986 }
4987 else if (common->utf)
4988 {
/* A first code unit in 0xd800-0xdbff is followed by one more code unit,
which is skipped too; both variants below are branch free. */
4989 OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800);
4990
4991 if (sljit_has_cpu_feature(SLJIT_HAS_CMOV))
4992 {
4993 OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
4994 OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x400);
4995 CMOV(SLJIT_LESS, STR_PTR, TMP2, 0);
4996 }
4997 else
4998 {
4999 OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x400);
5000 OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_LESS);
5001 OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, UCHAR_SHIFT);
5002 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
5003 }
5004 }
5005 #endif /* PCRE2_CODE_UNIT_WIDTH == [8|16] */
5006 #endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 */
/* Target of the initial jump: the first attempt starts here without any
advance of STR_PTR. */
5007 JUMPHERE(start);
5008
5009 if (newlinecheck)
5010 {
5011 JUMPHERE(end);
5012 JUMPHERE(end2);
5013 }
5014
5015 return mainloop;
5016 }
5017
5018
5019 static SLJIT_INLINE void add_prefix_char(PCRE2_UCHAR chr, fast_forward_char_data *chars, BOOL last)
5020 {
5021 sljit_u32 i, count = chars->count;
5022
5023 if (count == 255)
5024 return;
5025
5026 if (count == 0)
5027 {
5028 chars->count = 1;
5029 chars->chars[0] = chr;
5030
5031 if (last)
5032 chars->last_count = 1;
5033 return;
5034 }
5035
5036 for (i = 0; i < count; i++)
5037 if (chars->chars[i] == chr)
5038 return;
5039
5040 if (count >= MAX_DIFF_CHARS)
5041 {
5042 chars->count = 255;
5043 return;
5044 }
5045
5046 chars->chars[count] = chr;
5047 chars->count = count + 1;
5048
5049 if (last)
5050 chars->last_count++;
5051 }
5052
/* Scans compiled pattern opcodes starting at cc and fills successive entries
   of the chars array with the code units that may appear at each position.

   common     compiler state (utf flag, ctypes and fcc tables are read here)
   cc         current opcode position
   chars      entry for the current position; advanced as positions are consumed
   max_chars  number of positions that may still be filled
   rec_count  budget shared by all recursive invocations; decremented once per
              loop iteration

   Returns the number of positions consumed by this invocation, or 0 when the
   rec_count budget is exhausted. Recursive callers store the returned value
   back into max_chars, so a sub-scan that stops early also limits how far the
   calling scan may continue. An entry whose count is 255 carries no usable
   set of code units. */
5053 static int scan_prefix(compiler_common *common, PCRE2_SPTR cc, fast_forward_char_data *chars, int max_chars, sljit_u32 *rec_count)
5054 {
5055 /* Recursive function, which scans prefix literals. */
5056 BOOL last, any, class, caseless;
5057 int len, repeat, len_save, consumed = 0;
5058 sljit_u32 chr; /* Any unicode character. */
5059 sljit_u8 *bytes, *bytes_end, byte;
5060 PCRE2_SPTR alternative, cc_save, oc;
/* othercase holds the other-case form of one character as code units. */
5061 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
5062 PCRE2_UCHAR othercase[4];
5063 #elif defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 16
5064 PCRE2_UCHAR othercase[2];
5065 #else
5066 PCRE2_UCHAR othercase[1];
5067 #endif
5068
5069 repeat = 1;
5070 while (TRUE)
5071 {
/* Stop, reporting nothing, once the shared budget has run out. */
5072 if (*rec_count == 0)
5073 return 0;
5074 (*rec_count)--;
5075
/* Per-opcode flags: last ends the scan after this item, any marks an item
   without a usable character set, class selects the bitmap path. */
5076 last = TRUE;
5077 any = FALSE;
5078 class = FALSE;
5079 caseless = FALSE;
5080
5081 switch (*cc)
5082 {
5083 case OP_CHARI:
5084 caseless = TRUE;
5085 /* Fall through */
5086 case OP_CHAR:
5087 last = FALSE;
5088 cc++;
5089 break;
5090
5091 case OP_SOD:
5092 case OP_SOM:
5093 case OP_SET_SOM:
5094 case OP_NOT_WORD_BOUNDARY:
5095 case OP_WORD_BOUNDARY:
5096 case OP_EODN:
5097 case OP_EOD:
5098 case OP_CIRC:
5099 case OP_CIRCM:
5100 case OP_DOLL:
5101 case OP_DOLLM:
5102 /* Zero width assertions. */
5103 cc++;
5104 continue;
5105
/* Assertion groups are skipped as a whole. */
5106 case OP_ASSERT:
5107 case OP_ASSERT_NOT:
5108 case OP_ASSERTBACK:
5109 case OP_ASSERTBACK_NOT:
5110 cc = bracketend(cc);
5111 continue;
5112
/* One-or-more repeats: the character is recorded once and, because last
   stays TRUE, the scan ends after it. */
5113 case OP_PLUSI:
5114 case OP_MINPLUSI:
5115 case OP_POSPLUSI:
5116 caseless = TRUE;
5117 /* Fall through */
5118 case OP_PLUS:
5119 case OP_MINPLUS:
5120 case OP_POSPLUS:
5121 cc++;
5122 break;
5123
5124 case OP_EXACTI:
5125 caseless = TRUE;
5126 /* Fall through */
5127 case OP_EXACT:
5128 repeat = GET2(cc, 1);
5129 last = FALSE;
5130 cc += 1 + IMM2_SIZE;
5131 break;
5132
/* Optional character: what follows the character is scanned first into the
   same positions, then the character itself is recorded below. */
5133 case OP_QUERYI:
5134 case OP_MINQUERYI:
5135 case OP_POSQUERYI:
5136 caseless = TRUE;
5137 /* Fall through */
5138 case OP_QUERY:
5139 case OP_MINQUERY:
5140 case OP_POSQUERY:
5141 len = 1;
5142 cc++;
5143 #ifdef SUPPORT_UNICODE
5144 if (common->utf && HAS_EXTRALEN(*cc)) len += GET_EXTRALEN(*cc);
5145 #endif
5146 max_chars = scan_prefix(common, cc + len, chars, max_chars, rec_count);
5147 if (max_chars == 0)
5148 return consumed;
5149 last = FALSE;
5150 break;
5151
5152 case OP_KET:
5153 cc += 1 + LINK_SIZE;
5154 continue;
5155
5156 case OP_ALT:
5157 cc += GET(cc, 1);
5158 continue;
5159
/* Groups: every alternative introduced by OP_ALT is scanned recursively into
   the same positions; the loop then continues with the first alternative. */
5160 case OP_ONCE:
5161 case OP_BRA:
5162 case OP_BRAPOS:
5163 case OP_CBRA:
5164 case OP_CBRAPOS:
5165 alternative = cc + GET(cc, 1);
5166 while (*alternative == OP_ALT)
5167 {
5168 max_chars = scan_prefix(common, alternative + 1 + LINK_SIZE, chars, max_chars, rec_count);
5169 if (max_chars == 0)
5170 return consumed;
5171 alternative += GET(alternative, 1);
5172 }
5173
5174 if (*cc == OP_CBRA || *cc == OP_CBRAPOS)
5175 cc += IMM2_SIZE;
5176 cc += 1 + LINK_SIZE;
5177 continue;
5178
5179 case OP_CLASS:
5180 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
5181 if (common->utf && !is_char7_bitset((const sljit_u8 *)(cc + 1), FALSE))
5182 return consumed;
5183 #endif
5184 class = TRUE;
5185 break;
5186
5187 case OP_NCLASS:
5188 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
5189 if (common->utf) return consumed;
5190 #endif
5191 class = TRUE;
5192 break;
5193
5194 #if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
5195 case OP_XCLASS:
5196 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
5197 if (common->utf) return consumed;
5198 #endif
5199 any = TRUE;
5200 cc += GET(cc, 1);
5201 break;
5202 #endif
5203
5204 case OP_DIGIT:
5205 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
5206 if (common->utf && !is_char7_bitset((const sljit_u8 *)common->ctypes - cbit_length + cbit_digit, FALSE))
5207 return consumed;
5208 #endif
5209 any = TRUE;
5210 cc++;
5211 break;
5212
5213 case OP_WHITESPACE:
5214 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
5215 if (common->utf && !is_char7_bitset((const sljit_u8 *)common->ctypes - cbit_length + cbit_space, FALSE))
5216 return consumed;
5217 #endif
5218 any = TRUE;
5219 cc++;
5220 break;
5221
5222 case OP_WORDCHAR:
5223 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
5224 if (common->utf && !is_char7_bitset((const sljit_u8 *)common->ctypes - cbit_length + cbit_word, FALSE))
5225 return consumed;
5226 #endif
5227 any = TRUE;
5228 cc++;
5229 break;
5230
/* OP_NOT and OP_NOTI advance cc twice in total (once here, once after the
   fall through); the other opcodes in this group advance it once. */
5231 case OP_NOT:
5232 case OP_NOTI:
5233 cc++;
5234 /* Fall through. */
5235 case OP_NOT_DIGIT:
5236 case OP_NOT_WHITESPACE:
5237 case OP_NOT_WORDCHAR:
5238 case OP_ANY:
5239 case OP_ALLANY:
5240 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
5241 if (common->utf) return consumed;
5242 #endif
5243 any = TRUE;
5244 cc++;
5245 break;
5246
5247 #ifdef SUPPORT_UNICODE
5248 case OP_NOTPROP:
5249 case OP_PROP:
5250 #if PCRE2_CODE_UNIT_WIDTH != 32
5251 if (common->utf) return consumed;
5252 #endif
5253 any = TRUE;
5254 cc += 1 + 2;
5255 break;
5256 #endif
5257
/* Only the repeat count is taken here; the opcode that follows is handled by
   the next loop iteration and uses that count. */
5258 case OP_TYPEEXACT:
5259 repeat = GET2(cc, 1);
5260 cc += 1 + IMM2_SIZE;
5261 continue;
5262
5263 case OP_NOTEXACT:
5264 case OP_NOTEXACTI:
5265 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
5266 if (common->utf) return consumed;
5267 #endif
5268 any = TRUE;
5269 repeat = GET2(cc, 1);
5270 cc += 1 + IMM2_SIZE + 1;
5271 break;
5272
5273 default:
5274 return consumed;
5275 }
5276
/* Item without a usable character set: mark repeat positions as saturated. */
5277 if (any)
5278 {
5279 do
5280 {
5281 chars->count = 255;
5282
5283 consumed++;
5284 if (--max_chars == 0)
5285 return consumed;
5286 chars++;
5287 }
5288 while (--repeat > 0);
5289
5290 repeat = 1;
5291 continue;
5292 }
5293
/* Character class: bytes points at the 32-byte bitmap that follows the
   opcode, and cc is moved past it to a possible repeat opcode. */
5294 if (class)
5295 {
5296 bytes = (sljit_u8*) (cc + 1);
5297 cc += 1 + 32 / sizeof(PCRE2_UCHAR);
5298
5299 switch (*cc)
5300 {
/* The class may match zero times, so what follows it is scanned into the
   same positions as well. */
5301 case OP_CRSTAR:
5302 case OP_CRMINSTAR:
5303 case OP_CRPOSSTAR:
5304 case OP_CRQUERY:
5305 case OP_CRMINQUERY:
5306 case OP_CRPOSQUERY:
5307 max_chars = scan_prefix(common, cc + 1, chars, max_chars, rec_count);
5308 if (max_chars == 0)
5309 return consumed;
5310 break;
5311
5312 default:
5313 case OP_CRPLUS:
5314 case OP_CRMINPLUS:
5315 case OP_CRPOSPLUS:
5316 break;
5317
5318 case OP_CRRANGE:
5319 case OP_CRMINRANGE:
5320 case OP_CRPOSRANGE:
5321 repeat = GET2(cc, 1);
5322 if (repeat <= 0)
5323 return consumed;
5324 break;
5325 }
5326
5327 do
5328 {
/* The top bit of the bitmap (character 255) being set saturates the entry
   without listing members. NOTE(review): presumably because such a class may
   also match code units above 255 - confirm. */
5329 if (bytes[31] & 0x80)
5330 chars->count = 255;
5331 else if (chars->count != 255)
5332 {
/* Walk the bitmap one byte (eight characters) at a time and add every
   character whose bit is set; stop early if the entry saturates. */
5333 bytes_end = bytes + 32;
5334 chr = 0;
5335 do
5336 {
5337 byte = *bytes++;
5338 SLJIT_ASSERT((chr & 0x7) == 0);
5339 if (byte == 0)
5340 chr += 8;
5341 else
5342 {
5343 do
5344 {
5345 if ((byte & 0x1) != 0)
5346 add_prefix_char(chr, chars, TRUE);
5347 byte >>= 1;
5348 chr++;
5349 }
5350 while (byte != 0);
/* Round chr up to the first character of the next bitmap byte. */
5351 chr = (chr + 7) & ~7;
5352 }
5353 }
5354 while (chars->count != 255 && bytes < bytes_end);
/* Rewind to the start of the bitmap for the next repetition. */
5355 bytes = bytes_end - 32;
5356 }
5357
5358 consumed++;
5359 if (--max_chars == 0)
5360 return consumed;
5361 chars++;
5362 }
5363 while (--repeat > 0);
5364
/* Decide whether scanning can continue behind the class. */
5365 switch (*cc)
5366 {
5367 case OP_CRSTAR:
5368 case OP_CRMINSTAR:
5369 case OP_CRPOSSTAR:
5370 return consumed;
5371
5372 case OP_CRQUERY:
5373 case OP_CRMINQUERY:
5374 case OP_CRPOSQUERY:
5375 cc++;
5376 break;
5377
/* Continue only when the minimum and maximum repeat counts are equal. */
5378 case OP_CRRANGE:
5379 case OP_CRMINRANGE:
5380 case OP_CRPOSRANGE:
5381 if (GET2(cc, 1) != GET2(cc, 1 + IMM2_SIZE))
5382 return consumed;
5383 cc += 1 + 2 * IMM2_SIZE;
5384 break;
5385 }
5386
5387 repeat = 1;
5388 continue;
5389 }
5390
/* Literal character: len is its length in code units. */
5391 len = 1;
5392 #ifdef SUPPORT_UNICODE
5393 if (common->utf && HAS_EXTRALEN(*cc)) len += GET_EXTRALEN(*cc);
5394 #endif
5395
5396 if (caseless && char_has_othercase(common, cc))
5397 {
5398 #ifdef SUPPORT_UNICODE
5399 if (common->utf)
5400 {
/* The other case is usable only if it encodes to the same number of code
   units as the character itself. */
5401 GETCHAR(chr, cc);
5402 if ((int)PRIV(ord2utf)(char_othercase(common, chr), othercase) != len)
5403 return consumed;
5404 }
5405 else
5406 #endif
5407 {
5408 chr = *cc;
5409 othercase[0] = TABLE_GET(chr, common->fcc, chr);
5410 }
5411 }
5412 else
5413 {
5414 caseless = FALSE;
5415 othercase[0] = 0; /* Stops compiler warning - PH */
5416 }
5417
/* Record the code units of the character (and of its other case) repeat
   times, one entry per code unit; cc and len are restored before each
   further repetition. */
5418 len_save = len;
5419 cc_save = cc;
5420 while (TRUE)
5421 {
5422 oc = othercase;
5423 do
5424 {
5425 len--;
5426 consumed++;
5427
5428 chr = *cc;
5429 add_prefix_char(*cc, chars, len == 0);
5430
5431 if (caseless)
5432 add_prefix_char(*oc, chars, len == 0);
5433
5434 if (--max_chars == 0)
5435 return consumed;
5436 chars++;
5437 cc++;
5438 oc++;
5439 }
5440 while (len > 0);
5441
5442 if (--repeat == 0)
5443 break;
5444
5445 len = len_save;
5446 cc = cc_save;
5447 }
5448
5449 repeat = 1;
5450 if (last)
5451 return consumed;
5452 }
5453 }
5454
5455 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
/* Emits code that jumps to label when the code unit held in reg is not the
   first unit of a UTF character: a UTF-8 continuation byte (10xxxxxx) in
   8-bit mode, or a value in 0xdc00-0xdfff in 16-bit mode. The value in reg is
   overwritten by the masking operation. */
5456 static void jumpto_if_not_utf_char_start(struct sljit_compiler *compiler, sljit_s32 reg, struct sljit_label *label)
5457 {
5458 #if PCRE2_CODE_UNIT_WIDTH == 8
/* Keep the top two bits; 0x80 identifies a continuation byte. */
5459 OP2(SLJIT_AND, reg, 0, reg, 0, SLJIT_IMM, 0xc0);
5460 CMPTO(SLJIT_EQUAL, reg, 0, SLJIT_IMM, 0x80, label);
5461 #elif PCRE2_CODE_UNIT_WIDTH == 16
/* Keep the top six bits; 0xdc00 identifies the second unit of a surrogate pair. */
5462 OP2(SLJIT_AND, reg, 0, reg, 0, SLJIT_IMM, 0xfc00);
5463 CMPTO(SLJIT_EQUAL, reg, 0, SLJIT_IMM, 0xdc00, label);
5464 #else
5465 #error "Unknown code width"
5466 #endif
5467 }
5468 #endif
5469
5470 #if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) && !(defined SUPPORT_VALGRIND)
5471
5472 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
/* Emits code that tests whether the code unit held in reg is the first unit
   of a UTF character and returns the jump that is taken when it is. The
   caller must bind the returned jump to a target. The value in reg is
   overwritten by the masking operation. */
5473 static struct sljit_jump *jump_if_utf_char_start(struct sljit_compiler *compiler, sljit_s32 reg)
5474 {
5475 #if PCRE2_CODE_UNIT_WIDTH == 8
/* Anything other than 10xxxxxx starts a character. */
5476 OP2(SLJIT_AND, reg, 0, reg, 0, SLJIT_IMM, 0xc0);
5477 return CMP(SLJIT_NOT_EQUAL, reg, 0, SLJIT_IMM, 0x80);
5478 #elif PCRE2_CODE_UNIT_WIDTH == 16
/* Anything outside 0xdc00-0xdfff starts a character. */
5479 OP2(SLJIT_AND, reg, 0, reg, 0, SLJIT_IMM, 0xfc00);
5480 return CMP(SLJIT_NOT_EQUAL, reg, 0, SLJIT_IMM, 0xdc00);
5481 #else
5482 #error "Unknown code width"
5483 #endif
5484 }
5485 #endif
5486
5487 static sljit_s32 character_to_int32(PCRE2_UCHAR chr)
5488 {
5489 sljit_u32 value = chr;
5490 #if PCRE2_CODE_UNIT_WIDTH == 8
5491 #define SSE2_COMPARE_TYPE_INDEX 0
5492 return (sljit_s32)((value << 24) | (value << 16) | (value << 8) | value);
5493 #elif PCRE2_CODE_UNIT_WIDTH == 16
5494 #define SSE2_COMPARE_TYPE_INDEX 1
5495 return (sljit_s32)((value << 16) | value);
5496 #elif PCRE2_CODE_UNIT_WIDTH == 32
5497 #define SSE2_COMPARE_TYPE_INDEX 2
5498 return (sljit_s32)(value);
5499 #else
5500 #error "Unsupported unit width"
5501 #endif
5502 }
5503
5504 static void load_from_mem_sse2(struct sljit_compiler *compiler, sljit_s32 dst_xmm_reg, sljit_s32 src_general_reg)
5505 {
5506 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
5507 sljit_u8 instruction[5];
5508 #else
5509 sljit_u8 instruction[4];
5510 #endif
5511
5512 SLJIT_ASSERT(dst_xmm_reg < 8);
5513
5514 /* MOVDQA xmm1, xmm2/m128 */
5515 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
5516 if (src_general_reg < 8)
5517 {
5518 instruction[0] = 0x66;
5519 instruction[1] = 0x0f;
5520 instruction[2] = 0x6f;
5521 instruction[3] = (dst_xmm_reg << 3) | src_general_reg;
5522 sljit_emit_op_custom(compiler, instruction, 4);
5523 }
5524 else
5525 {
5526 instruction[0] = 0x66;
5527 instruction[1] = 0x41;
5528 instruction[2] = 0x0f;
5529 instruction[3] = 0x6f;
5530 instruction[4] = (dst_xmm_reg << 3) | (src_general_reg & 0x7);
5531 sljit_emit_op_custom(compiler, instruction, 4);
5532 }
5533 #else
5534 instruction[0] = 0x66;
5535 instruction[1] = 0x0f;
5536 instruction[2] = 0x6f;
5537 instruction[3] = (dst_xmm_reg << 3) | src_general_reg;
5538 sljit_emit_op_custom(compiler, instruction, 4);
5539 #endif
5540 }
5541
/* Emits SSE2 instructions that replace the data in XMM register dst_ind with
   a per-element mask of the positions equal to char1 or char2.

   char1, char2  the two accepted code units (they may be equal)
   bit           non-zero when the caller has chosen the single-bit variant
   dst_ind       XMM register holding the data; receives the result
   cmp1_ind      XMM register holding the first comparison pattern
   cmp2_ind      XMM register holding the second pattern (or the bit pattern)
   tmp_ind       scratch XMM register, used only by the two-compare variant

   Three code shapes are produced:
   - char1 == char2: one PCMPEQ against cmp1.
   - bit != 0: the data is ORed with cmp2 and then compared against cmp1. The
     callers in this file load cmp1 with char | bit and cmp2 with bit.
   - otherwise: the data is copied to tmp, both copies are compared (against
     cmp1 and cmp2 respectively) and the two results are ORed into dst. */
5542 static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler, PCRE2_UCHAR char1, PCRE2_UCHAR char2,
5543 sljit_u32 bit, sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
5544 {
5545 sljit_u8 instruction[4];
/* Every instruction emitted here shares the 0x66 0x0f prefix. */
5546 instruction[0] = 0x66;
5547 instruction[1] = 0x0f;
5548
5549 if (char1 == char2 || bit != 0)
5550 {
5551 if (bit != 0)
5552 {
5553 /* POR xmm1, xmm2/m128 */
5554 /* instruction[0] = 0x66; */
5555 /* instruction[1] = 0x0f; */
5556 instruction[2] = 0xeb;
5557 instruction[3] = 0xc0 | (dst_ind << 3) | cmp2_ind;
5558 sljit_emit_op_custom(compiler, instruction, 4);
5559 }
5560
5561 /* PCMPEQB/W/D xmm1, xmm2/m128 */
5562 /* instruction[0] = 0x66; */
5563 /* instruction[1] = 0x0f; */
5564 instruction[2] = 0x74 + SSE2_COMPARE_TYPE_INDEX;
5565 instruction[3] = 0xc0 | (dst_ind << 3) | cmp1_ind;
5566 sljit_emit_op_custom(compiler, instruction, 4);
5567 }
5568 else
5569 {
5570 /* MOVDQA xmm1, xmm2/m128 */
5571 /* instruction[0] = 0x66; */
5572 /* instruction[1] = 0x0f; */
5573 instruction[2] = 0x6f;
5574 instruction[3] = 0xc0 | (tmp_ind << 3) | dst_ind;
5575 sljit_emit_op_custom(compiler, instruction, 4);
5576
5577 /* PCMPEQB/W/D xmm1, xmm2/m128 */
5578 /* instruction[0] = 0x66; */
5579 /* instruction[1] = 0x0f; */
5580 instruction[2] = 0x74 + SSE2_COMPARE_TYPE_INDEX;
5581 instruction[3] = 0xc0 | (dst_ind << 3) | cmp1_ind;
5582 sljit_emit_op_custom(compiler, instruction, 4);
5583
/* Same compare opcode again, this time tmp against cmp2. */
5584 instruction[3] = 0xc0 | (tmp_ind << 3) | cmp2_ind;
5585 sljit_emit_op_custom(compiler, instruction, 4);
5586
5587 /* POR xmm1, xmm2/m128 */
5588 /* instruction[0] = 0x66; */
5589 /* instruction[1] = 0x0f; */
5590 instruction[2] = 0xeb;
5591 instruction[3] = 0xc0 | (dst_ind << 3) | tmp_ind;
5592 sljit_emit_op_custom(compiler, instruction, 4);
5593 }
5594 }
5595
/* Emits JIT code that advances STR_PTR to the next code unit equal to char1
   or char2, examining 16 bytes of the subject per step with SSE2.

   common        compiler state (mode, utf flag, failed_match jump list)
   char1, char2  the accepted code units; they may be equal
   offset        only read in UTF builds: when non-zero, the code unit offset
                 units before the hit must start a character, otherwise the
                 search restarts one code unit further on

   In PCRE2_JIT_COMPLETE mode reaching STR_END is added to failed_match; in
   the other modes STR_PTR is clamped to STR_END instead. The generated code
   uses TMP1, TMP2 and XMM registers 0-3. */
5596 static void fast_forward_first_char2_sse2(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
5597 {
5598 DEFINE_COMPILER;
5599 struct sljit_label *start;
5600 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
5601 struct sljit_label *restart;
5602 #endif
5603 struct sljit_jump *quit;
5604 struct sljit_jump *partial_quit[2];
5605 sljit_u8 instruction[8];
5606 sljit_s32 tmp1_ind = sljit_get_register_index(TMP1);
5607 sljit_s32 str_ptr_ind = sljit_get_register_index(STR_PTR);
5608 sljit_s32 data_ind = 0;
5609 sljit_s32 tmp_ind = 1;
5610 sljit_s32 cmp1_ind = 2;
5611 sljit_s32 cmp2_ind = 3;
5612 sljit_u32 bit = 0;
5613
5614 SLJIT_UNUSED_ARG(offset);
5615
/* When the two code units differ in exactly one bit, a single compare after
   ORing that bit into the data is enough; otherwise bit stays 0. */
5616 if (char1 != char2)
5617 {
5618 bit = char1 ^ char2;
5619 if (!is_powerof2(bit))
5620 bit = 0;
5621 }
5622
5623 partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
5624 if (common->mode == PCRE2_JIT_COMPLETE)
5625 add_jump(compiler, &common->failed_match, partial_quit[0]);
5626
5627 /* First part (unaligned start) */
5628
5629 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1 | bit));
5630
5631 SLJIT_ASSERT(tmp1_ind < 8);
5632
5633 /* MOVD xmm, r/m32 */
5634 instruction[0] = 0x66;
5635 instruction[1] = 0x0f;
5636 instruction[2] = 0x6e;
5637 instruction[3] = 0xc0 | (cmp1_ind << 3) | tmp1_ind;
5638 sljit_emit_op_custom(compiler, instruction, 4);
5639
5640 if (char1 != char2)
5641 {
5642 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(bit != 0 ? bit : char2));
5643
5644 /* MOVD xmm, r/m32 */
5645 instruction[3] = 0xc0 | (cmp2_ind << 3) | tmp1_ind;
5646 sljit_emit_op_custom(compiler, instruction, 4);
5647 }
5648
/* TMP2 keeps the unaligned start address while STR_PTR is rounded down. */
5649 OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
5650
/* Broadcast the low 32 bits of each pattern to the whole register. The
   literal 2 and 3 in the ModRM bytes are the values of cmp1_ind and
   cmp2_ind. */
5651 /* PSHUFD xmm1, xmm2/m128, imm8 */
5652 /* instruction[0] = 0x66; */
5653 /* instruction[1] = 0x0f; */
5654 instruction[2] = 0x70;
5655 instruction[3] = 0xc0 | (cmp1_ind << 3) | 2;
5656 instruction[4] = 0;
5657 sljit_emit_op_custom(compiler, instruction, 5);
5658
5659 if (char1 != char2)
5660 {
5661 /* PSHUFD xmm1, xmm2/m128, imm8 */
5662 instruction[3] = 0xc0 | (cmp2_ind << 3) | 3;
5663 sljit_emit_op_custom(compiler, instruction, 5);
5664 }
5665
5666 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
5667 restart = LABEL();
5668 #endif
/* Split the address into a 16-byte aligned base (STR_PTR) and the byte
   offset inside that block (TMP2). */
5669 OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~0xf);
5670 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
5671
5672 load_from_mem_sse2(compiler, data_ind, str_ptr_ind);
5673 fast_forward_char_pair_sse2_compare(compiler, char1, char2, bit, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
5674
5675 /* PMOVMSKB reg, xmm */
5676 /* instruction[0] = 0x66; */
5677 /* instruction[1] = 0x0f; */
5678 instruction[2] = 0xd7;
5679 instruction[3] = 0xc0 | (tmp1_ind << 3) | 0;
5680 sljit_emit_op_custom(compiler, instruction, 4);
5681
/* Restore the unaligned address and shift out the mask bits that belong to
   bytes before it. */
5682 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
5683 OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
5684
5685 /* BSF r32, r/m32 */
5686 instruction[0] = 0x0f;
5687 instruction[1] = 0xbc;
5688 instruction[2] = 0xc0 | (tmp1_ind << 3) | tmp1_ind;
5689 sljit_emit_op_custom(compiler, instruction, 3);
5690 sljit_set_current_flags(compiler, SLJIT_SET_Z);
5691
/* A set bit was found: TMP1 is its index, which is added to STR_PTR at quit. */
5692 quit = JUMP(SLJIT_NOT_ZERO);
5693
/* No hit in the first block: return to the aligned base for the main loop. */
5694 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
5695
5696 start = LABEL();
5697 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
5698
5699 partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
5700 if (common->mode == PCRE2_JIT_COMPLETE)
5701 add_jump(compiler, &common->failed_match, partial_quit[1]);
5702
5703 /* Second part (aligned) */
5704
/* The literal 0 passed here is the value of data_ind. */
5705 load_from_mem_sse2(compiler, 0, str_ptr_ind);
5706 fast_forward_char_pair_sse2_compare(compiler, char1, char2, bit, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
5707
/* instruction[0..1] were overwritten by the BSF encoding above, so the
   prefix bytes are set again. */
5708 /* PMOVMSKB reg, xmm */
5709 instruction[0] = 0x66;
5710 instruction[1] = 0x0f;
5711 instruction[2] = 0xd7;
5712 instruction[3] = 0xc0 | (tmp1_ind << 3) | 0;
5713 sljit_emit_op_custom(compiler, instruction, 4);
5714
5715 /* BSF r32, r/m32 */
5716 instruction[0] = 0x0f;
5717 instruction[1] = 0xbc;
5718 instruction[2] = 0xc0 | (tmp1_ind << 3) | tmp1_ind;
5719 sljit_emit_op_custom(compiler, instruction, 3);
5720 sljit_set_current_flags(compiler, SLJIT_SET_Z);
5721
5722 JUMPTO(SLJIT_ZERO, start);
5723
5724 JUMPHERE(quit);
5725 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
5726
/* The hit may lie beyond STR_END inside the last block: clamp STR_PTR in the
   partial modes, fail the match in complete mode. */
5727 if (common->mode != PCRE2_JIT_COMPLETE)
5728 {
5729 JUMPHERE(partial_quit[0]);
5730 JUMPHERE(partial_quit[1]);
5731 OP2(SLJIT_SUB | SLJIT_SET_GREATER, SLJIT_UNUSED, 0, STR_PTR, 0, STR_END, 0);
5732 CMOV(SLJIT_GREATER, STR_PTR, STR_END, 0);
5733 }
5734 else
5735 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
5736
5737 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
5738 if (common->utf && offset > 0)
5739 {
5740 SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);
5741
/* Accept the hit only if the code unit offset units earlier starts a
   character; otherwise step one code unit forward and search again. */
5742 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));
5743
5744 quit = jump_if_utf_char_start(compiler, TMP1);
5745
5746 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
5747 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
5748 OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
5749 JUMPTO(SLJIT_JUMP, restart);
5750
5751 JUMPHERE(quit);
5752 }
5753 #endif
5754 }
5755
5756 #ifndef _WIN64
5757
5758 static SLJIT_INLINE sljit_u32 max_fast_forward_char_pair_sse2_offset(void)
5759 {
5760 #if PCRE2_CODE_UNIT_WIDTH == 8
5761 return 15;
5762 #elif PCRE2_CODE_UNIT_WIDTH == 16
5763 return 7;
5764 #elif PCRE2_CODE_UNIT_WIDTH == 32
5765 return 3;
5766 #else
5767 #error "Unsupported unit width"
5768 #endif
5769 }
5770
/* Emits JIT code that advances STR_PTR to the next start position at which
   the code unit at offset offs1 is char1a or char1b and, at the same time,
   the code unit at offset offs2 is char2a or char2b.

   common          compiler state; must be in PCRE2_JIT_COMPLETE mode
   offs1, offs2    code unit offsets of the two positions, offs1 > offs2
   char1a, char1b  accepted code units at offs1 (they may be equal)
   char2a, char2b  accepted code units at offs2 (they may be equal)

   The search works on 16-byte blocks. The data for the offs2 position is
   built by shifting the current block up by diff bytes and filling the gap
   from the previous block, so both comparison masks line up and can be ANDed.
   Running past STR_END is added to failed_match. The generated code uses
   TMP1, TMP2, TMP3 (only when match_end_ptr is set) and XMM registers 0-6. */
5771 static void fast_forward_char_pair_sse2(compiler_common *common, sljit_s32 offs1,
5772 PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
5773 {
5774 DEFINE_COMPILER;
5775 sljit_u32 bit1 = 0;
5776 sljit_u32 bit2 = 0;
/* Distance between the two positions in bytes. */
5777 sljit_u32 diff = IN_UCHARS(offs1 - offs2);
5778 sljit_s32 tmp1_ind = sljit_get_register_index(TMP1);
5779 sljit_s32 tmp2_ind = sljit_get_register_index(TMP2);
5780 sljit_s32 str_ptr_ind = sljit_get_register_index(STR_PTR);
5781 sljit_s32 data1_ind = 0;
5782 sljit_s32 data2_ind = 1;
5783 sljit_s32 tmp_ind = 2;
5784 sljit_s32 cmp1a_ind = 3;
5785 sljit_s32 cmp1b_ind = 4;
5786 sljit_s32 cmp2a_ind = 5;
5787 sljit_s32 cmp2b_ind = 6;
5788 struct sljit_label *start;
5789 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
5790 struct sljit_label *restart;
5791 #endif
5792 struct sljit_jump *jump[2];
5793
5794 sljit_u8 instruction[8];
5795
5796 SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);
5797 SLJIT_ASSERT(diff <= IN_UCHARS(max_fast_forward_char_pair_sse2_offset()));
5798 SLJIT_ASSERT(tmp1_ind < 8 && tmp2_ind == 1);
5799
5800 /* Initialize. */
/* With a match end limit, STR_END is temporarily lowered to
   min(STR_END, limit + offs1 + 1); the original value is kept in TMP3 and
   restored at the end of the function. */
5801 if (common->match_end_ptr != 0)
5802 {
5803 OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
5804 OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
5805 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));
5806
5807 OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP1, 0, STR_END, 0);
5808 CMOV(SLJIT_LESS, STR_END, TMP1, 0);
5809 }
5810
/* From here on STR_PTR addresses the offs1 position; offs1 is subtracted
   again before the function finishes. */
5811 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
5812 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
5813
5814 /* MOVD xmm, r/m32 */
5815 instruction[0] = 0x66;
5816 instruction[1] = 0x0f;
5817 instruction[2] = 0x6e;
5818
/* Patterns for the offs1 position: TMP1 feeds cmp1a, TMP2 feeds cmp1b. When
   the two code units differ in a single bit, cmp1a gets char | bit and cmp1b
   gets the bit itself. */
5819 if (char1a == char1b)
5820 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a));
5821 else
5822 {
5823 bit1 = char1a ^ char1b;
5824 if (is_powerof2(bit1))
5825 {
5826 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a | bit1));
5827 OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(bit1));
5828 }
5829 else
5830 {
5831 bit1 = 0;
5832 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a));
5833 OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(char1b));
5834 }
5835 }
5836
5837 instruction[3] = 0xc0 | (cmp1a_ind << 3) | tmp1_ind;
5838 sljit_emit_op_custom(compiler, instruction, 4);
5839
5840 if (char1a != char1b)
5841 {
5842 instruction[3] = 0xc0 | (cmp1b_ind << 3) | tmp2_ind;
5843 sljit_emit_op_custom(compiler, instruction, 4);
5844 }
5845
/* Patterns for the offs2 position, built the same way into cmp2a and cmp2b. */
5846 if (char2a == char2b)
5847 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a));
5848 else
5849 {
5850 bit2 = char2a ^ char2b;
5851 if (is_powerof2(bit2))
5852 {
5853 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a | bit2));
5854 OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(bit2));
5855 }
5856 else
5857 {
5858 bit2 = 0;
5859 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a));
5860 OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(char2b));
5861 }
5862 }
5863
5864 instruction[3] = 0xc0 | (cmp2a_ind << 3) | tmp1_ind;
5865 sljit_emit_op_custom(compiler, instruction, 4);
5866
5867 if (char2a != char2b)
5868 {
5869 instruction[3] = 0xc0 | (cmp2b_ind << 3) | tmp2_ind;
5870 sljit_emit_op_custom(compiler, instruction, 4);
5871 }
5872
/* Broadcast the low 32 bits of every pattern register to all four lanes. */
5873 /* PSHUFD xmm1, xmm2/m128, imm8 */
5874 /* instruction[0] = 0x66; */
5875 /* instruction[1] = 0x0f; */
5876 instruction[2] = 0x70;
5877 instruction[4] = 0;
5878
5879 instruction[3] = 0xc0 | (cmp1a_ind << 3) | cmp1a_ind;
5880 sljit_emit_op_custom(compiler, instruction, 5);
5881
5882 if (char1a != char1b)
5883 {
5884 instruction[3] = 0xc0 | (cmp1b_ind << 3) | cmp1b_ind;
5885 sljit_emit_op_custom(compiler, instruction, 5);
5886 }
5887
5888 instruction[3] = 0xc0 | (cmp2a_ind << 3) | cmp2a_ind;
5889 sljit_emit_op_custom(compiler, instruction, 5);
5890
5891 if (char2a != char2b)
5892 {
5893 instruction[3] = 0xc0 | (cmp2b_ind << 3) | cmp2b_ind;
5894 sljit_emit_op_custom(compiler, instruction, 5);
5895 }
5896
5897 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
5898 restart = LABEL();
5899 #endif
5900
/* First (unaligned) block. TMP1 is the aligned block of the offs2 position,
   STR_PTR the aligned block of the offs1 position, TMP2 the unaligned
   address of the offs1 position. */
5901 OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1 - offs2));
5902 OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
5903 OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~0xf);
5904 OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, ~0xf);
5905
5906 load_from_mem_sse2(compiler, data1_ind, str_ptr_ind);
5907
/* If both positions lie in the same block, the previous block is not read. */
5908 jump[0] = CMP(SLJIT_EQUAL, STR_PTR, 0, TMP1, 0);
5909
5910 load_from_mem_sse2(compiler, data2_ind, tmp1_ind);
5911
/* data2 = (data1 shifted up by diff bytes) | (previous block shifted down by
   16 - diff bytes). */
5912 /* MOVDQA xmm1, xmm2/m128 */
5913 /* instruction[0] = 0x66; */
5914 /* instruction[1] = 0x0f; */
5915 instruction[2] = 0x6f;
5916 instruction[3] = 0xc0 | (tmp_ind << 3) | data1_ind;
5917 sljit_emit_op_custom(compiler, instruction, 4);
5918
5919 /* PSLLDQ xmm1, xmm2/m128, imm8 */
5920 /* instruction[0] = 0x66; */
5921 /* instruction[1] = 0x0f; */
5922 instruction[2] = 0x73;
5923 instruction[3] = 0xc0 | (7 << 3) | tmp_ind;
5924 instruction[4] = diff;
5925 sljit_emit_op_custom(compiler, instruction, 5);
5926
5927 /* PSRLDQ xmm1, xmm2/m128, imm8 */
5928 /* instruction[0] = 0x66; */
5929 /* instruction[1] = 0x0f; */
5930 /* instruction[2] = 0x73; */
5931 instruction[3] = 0xc0 | (3 << 3) | data2_ind;
5932 instruction[4] = 16 - diff;
5933 sljit_emit_op_custom(compiler, instruction, 5);
5934
5935 /* POR xmm1, xmm2/m128 */
5936 /* instruction[0] = 0x66; */
5937 /* instruction[1] = 0x0f; */
5938 instruction[2] = 0xeb;
5939 instruction[3] = 0xc0 | (data2_ind << 3) | tmp_ind;
5940 sljit_emit_op_custom(compiler, instruction, 4);
5941
5942 jump[1] = JUMP(SLJIT_JUMP);
5943
5944 JUMPHERE(jump[0]);
5945
/* Same-block case: data2 is simply data1 shifted up by diff bytes. */
5946 /* MOVDQA xmm1, xmm2/m128 */
5947 /* instruction[0] = 0x66; */
5948 /* instruction[1] = 0x0f; */
5949 instruction[2] = 0x6f;
5950 instruction[3] = 0xc0 | (data2_ind << 3) | data1_ind;
5951 sljit_emit_op_custom(compiler, instruction, 4);
5952
5953 /* PSLLDQ xmm1, xmm2/m128, imm8 */
5954 /* instruction[0] = 0x66; */
5955 /* instruction[1] = 0x0f; */
5956 instruction[2] = 0x73;
5957 instruction[3] = 0xc0 | (7 << 3) | data2_ind;
5958 instruction[4] = diff;
5959 sljit_emit_op_custom(compiler, instruction, 5);
5960
5961 JUMPHERE(jump[1]);
5962
/* TMP2 becomes the byte offset of the start address inside its block. */
5963 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
5964
5965 fast_forward_char_pair_sse2_compare(compiler, char2a, char2b, bit2, data2_ind, cmp2a_ind, cmp2b_ind, tmp_ind);
5966 fast_forward_char_pair_sse2_compare(compiler, char1a, char1b, bit1, data1_ind, cmp1a_ind, cmp1b_ind, tmp_ind);
5967
/* A position qualifies only if both comparisons matched there. */
5968 /* PAND xmm1, xmm2/m128 */
5969 /* instruction[0] = 0x66; */
5970 /* instruction[1] = 0x0f; */
5971 instruction[2] = 0xdb;
5972 instruction[3] = 0xc0 | (data1_ind << 3) | data2_ind;
5973 sljit_emit_op_custom(compiler, instruction, 4);
5974
5975 /* PMOVMSKB reg, xmm */
5976 /* instruction[0] = 0x66; */
5977 /* instruction[1] = 0x0f; */
5978 instruction[2] = 0xd7;
5979 instruction[3] = 0xc0 | (tmp1_ind << 3) | 0;
5980 sljit_emit_op_custom(compiler, instruction, 4);
5981
5982 /* Ignore matches before the first STR_PTR. */
5983 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
5984 OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
5985
5986 /* BSF r32, r/m32 */
5987 instruction[0] = 0x0f;
5988 instruction[1] = 0xbc;
5989 instruction[2] = 0xc0 | (tmp1_ind << 3) | tmp1_ind;
5990 sljit_emit_op_custom(compiler, instruction, 3);
5991 sljit_set_current_flags(compiler, SLJIT_SET_Z);
5992
/* Hit in the first block: TMP1 holds its index relative to STR_PTR. */
5993 jump[0] = JUMP(SLJIT_NOT_ZERO);
5994
5995 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
5996
5997 /* Main loop. */
/* The BSF encoding overwrote instruction[0..1]; restore the SSE2 prefix. */
5998 instruction[0] = 0x66;
5999 instruction[1] = 0x0f;
6000
6001 start = LABEL();
6002
/* data2 first receives the block that is about to become the previous one,
   then STR_PTR moves to the next block, which is loaded into data1. */
6003 load_from_mem_sse2(compiler, data2_ind, str_ptr_ind);
6004
6005 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
6006 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
6007
6008 load_from_mem_sse2(compiler, data1_ind, str_ptr_ind);
6009
6010 /* PSRLDQ xmm1, xmm2/m128, imm8 */
6011 /* instruction[0] = 0x66; */
6012 /* instruction[1] = 0x0f; */
6013 instruction[2] = 0x73;
6014 instruction[3] = 0xc0 | (3 << 3) | data2_ind;
6015 instruction[4] = 16 - diff;
6016 sljit_emit_op_custom(compiler, instruction, 5);
6017
6018 /* MOVDQA xmm1, xmm2/m128 */
6019 /* instruction[0] = 0x66; */
6020 /* instruction[1] = 0x0f; */
6021 instruction[2] = 0x6f;
6022 instruction[3] = 0xc0 | (tmp_ind << 3) | data1_ind;
6023 sljit_emit_op_custom(compiler, instruction, 4);
6024
6025 /* PSLLDQ xmm1, xmm2/m128, imm8 */
6026 /* instruction[0] = 0x66; */
6027 /* instruction[1] = 0x0f; */
6028 instruction[2] = 0x73;
6029 instruction[3] = 0xc0 | (7 << 3) | tmp_ind;
6030 instruction[4] = diff;
6031 sljit_emit_op_custom(compiler, instruction, 5);
6032
6033 /* POR xmm1, xmm2/m128 */
6034 /* instruction[0] = 0x66; */
6035 /* instruction[1] = 0x0f; */
6036 instruction[2] = 0xeb;
6037 instruction[3] = 0xc0 | (data2_ind << 3) | tmp_ind;
6038 sljit_emit_op_custom(compiler, instruction, 4);
6039
6040 fast_forward_char_pair_sse2_compare(compiler, char1a, char1b, bit1, data1_ind, cmp1a_ind, cmp1b_ind, tmp_ind);
6041 fast_forward_char_pair_sse2_compare(compiler, char2a, char2b, bit2, data2_ind, cmp2a_ind, cmp2b_ind, tmp_ind);
6042
6043 /* PAND xmm1, xmm2/m128 */
6044 /* instruction[0] = 0x66; */
6045 /* instruction[1] = 0x0f; */
6046 instruction[2] = 0xdb;
6047 instruction[3] = 0xc0 | (data1_ind << 3) | data2_ind;
6048 sljit_emit_op_custom(compiler, instruction, 4);
6049
6050 /* PMOVMSKB reg, xmm */
6051 /* instruction[0] = 0x66; */
6052 /* instruction[1] = 0x0f; */
6053 instruction[2] = 0xd7;
6054 instruction[3] = 0xc0 | (tmp1_ind << 3) | 0;
6055 sljit_emit_op_custom(compiler, instruction, 4);
6056
6057 /* BSF r32, r/m32 */
6058 instruction[0] = 0x0f;
6059 instruction[1] = 0xbc;
6060 instruction[2] = 0xc0 | (tmp1_ind << 3) | tmp1_ind;
6061 sljit_emit_op_custom(compiler, instruction, 3);
6062 sljit_set_current_flags(compiler, SLJIT_SET_Z);
6063
6064 JUMPTO(SLJIT_ZERO, start);
6065
6066 JUMPHERE(jump[0]);
6067
/* Move STR_PTR onto the hit; a hit at or beyond STR_END fails the match. */
6068 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
6069
6070 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
6071
6072 if (common->match_end_ptr != 0)
6073 OP1(SLJIT_MOV, STR_END, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
6074
6075 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
6076 if (common->utf)
6077 {
/* The candidate start (offs1 code units back) must begin a character;
   otherwise step one code unit forward and restart the search. */
6078 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));
6079
6080 jump[0] = jump_if_utf_char_start(compiler, TMP1);
6081
6082 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
6083 CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, restart);
6084
6085 add_jump(compiler, &common->failed_match, JUMP(SLJIT_JUMP));
6086
6087 JUMPHERE(jump[0]);
6088 }
6089 #endif
6090
/* Undo the initial offs1 adjustment so STR_PTR is the start position again. */
6091 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
6092
6093 if (common->match_end_ptr != 0)
6094 OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
6095 }
6096
6097 static BOOL check_fast_forward_char_pair_sse2(compiler_common *common, fast_forward_char_data *chars, int max)
6098 {
6099 sljit_s32 i, j, priority, count;
6100 sljit_u32 priorities;
6101 PCRE2_UCHAR a1, a2, b1, b2;
6102
6103 priorities = 0;
6104
6105 count = 0;
6106 for (i = 0; i < max; i++)
6107 {
6108 if (chars[i].last_count > 2)
6109 {
6110 SLJIT_ASSERT(chars[i].last_count <= 7);
6111
6112 priorities |= (1 << chars[i].last_count);
6113 count++;
6114 }
6115 }
6116
6117 if (count < 2)
6118 return FALSE;
6119
6120 for (priority = 7; priority > 2; priority--)
6121 {
6122 if ((priorities & (1 << priority)) == 0)
6123 continue;
6124
6125 for (i = max - 1; i >= 1; i--)
6126 if (chars[i].last_count >= priority)
6127 {
6128 SLJIT_ASSERT(chars[i].count <= 2 && chars[i].count >= 1);
6129
6130 a1 = chars[i].chars[0];
6131 a2 = chars[i].chars[1];
6132
6133 j = i - max_fast_forward_char_pair_sse2_offset();
6134 if (j < 0)
6135 j = 0;
6136
6137 while (j < i)
6138 {
6139 if (chars[j].last_count >= priority)
6140 {
6141 b1 = chars[j].chars[0];
6142 b2 = chars[j].chars[1];
6143
6144 if (a1 != b1 && a1 != b2 && a2 != b1 && a2 != b2)
6145 {
6146 fast_forward_char_pair_sse2(common, i, a1, a2, j, b1, b2);
6147 return TRUE;
6148 }
6149 }
6150 j++;
6151 }
6152 }
6153 }
6154
6155 return FALSE;
6156 }
6157
6158 #endif
6159
6160 #undef SSE2_COMPARE_TYPE_INDEX
6161
6162 #endif
6163
6164 static void fast_forward_first_char2(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
6165 {
6166 DEFINE_COMPILER;
6167 struct sljit_label *start;
6168 struct sljit_jump *match;
6169 struct sljit_jump *partial_quit;
6170 PCRE2_UCHAR mask;
6171 BOOL has_match_end = (common->match_end_ptr != 0);
6172
6173 SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE || offset == 0);
6174
6175 if (has_match_end)
6176 OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
6177
6178 if (offset > 0)
6179 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offset));
6180
6181 if (has_match_end)
6182 {
6183 OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
6184
6185 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offset + 1));
6186 OP2(SLJIT_SUB | SLJIT_SET_GREATER, SLJIT_UNUSED, 0, STR_END, 0, TMP1, 0);
6187 CMOV(SLJIT_GREATER, STR_END, TMP1, 0);
6188 }
6189
6190 #if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) && !(defined SUPPORT_VALGRIND)
6191
6192 /* SSE2 accelerated first character search. */
6193
6194 if (sljit_has_cpu_feature(SLJIT_HAS_SSE2))
6195 {
6196 fast_forward_first_char2_sse2(common, char1, char2, offset);
6197
6198 if (offset > 0)
6199 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offset));
6200
6201 if (has_match_end)
6202 OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
6203 return;
6204 }
6205
6206 #endif
6207
6208 start = LABEL();
6209
6210 partial_quit = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
6211 if (common->mode == PCRE2_JIT_COMPLETE)
6212 add_jump(compiler, &common->failed_match, partial_quit);
6213
6214 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
6215 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
6216
6217 if (char1 == char2)
6218 CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, char1, start);
6219 else
6220 {
6221 mask = char1 ^ char2;
6222 if (is_powerof2(mask))
6223 {
6224 OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, mask);
6225 CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, char1 | mask, start);
6226 }
6227 else
6228 {
6229 match = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, char1);
6230 CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, char2, start);
6231 JUMPHERE(match);
6232 }
6233 }
6234
6235 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
6236 if (common->utf && offset > 0)
6237 {
6238 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-(offset + 1)));
6239 jumpto_if_not_utf_char_start(compiler, TMP1, start);
6240 }
6241 #endif
6242
6243 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offset + 1));
6244
6245 if (common->mode != PCRE2_JIT_COMPLETE)
6246 JUMPHERE(partial_quit);
6247
6248 if (has_match_end)
6249 OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
6250 }
6251
6252 static SLJIT_INLINE BOOL fast_forward_first_n_chars(compiler_common *common)
6253 {
6254 DEFINE_COMPILER;
6255 struct sljit_label *start;
6256 struct sljit_jump *match;
6257 fast_forward_char_data chars[MAX_N_CHARS];
6258 sljit_s32 offset;
6259 PCRE2_UCHAR mask;
6260 PCRE2_UCHAR *char_set, *char_set_end;
6261 int i, max, from;
6262 int range_right = -1, range_len;
6263 sljit_u8 *update_table = NULL;
6264 BOOL in_range;
6265 sljit_u32 rec_count;
6266
6267 for (i = 0; i < MAX_N_CHARS; i++)
6268 {
6269 chars[i].count = 0;
6270 chars[i].last_count = 0;
6271 }
6272
6273 rec_count = 10000;
6274 max = scan_prefix(common, common->start, chars, MAX_N_CHARS, &rec_count);
6275
6276 if (max < 1)
6277 return FALSE;
6278
6279 /* Convert last_count to priority. */
6280 for (i = 0; i < max; i++)
6281 {
6282 SLJIT_ASSERT(chars[i].count > 0 && chars[i].last_count <= chars[i].count);
6283
6284 if (chars[i].count == 1)
6285 {
6286 chars[i].last_count = (chars[i].last_count == 1) ? 7 : 5;
6287 /* Simplifies algorithms later. */
6288 chars[i].chars[1] = chars[i].chars[0];
6289 }
6290 else if (chars[i].count == 2)
6291 {
6292 SLJIT_ASSERT(chars[i].chars[0] != chars[i].chars[1]);
6293
6294 if (is_powerof2(chars[i].chars[0] ^ chars[i].chars[1]))
6295 chars[i].last_count = (chars[i].last_count == 2) ? 6 : 4;
6296 else
6297 chars[i].last_count = (chars[i].last_count == 2) ? 3 : 2;
6298 }
6299 else
6300 chars[i].last_count = (chars[i].count == 255) ? 0 : 1;
6301 }
6302
6303 #if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) && !(defined SUPPORT_VALGRIND) && !(defined _WIN64)
6304 if (sljit_has_cpu_feature(SLJIT_HAS_SSE2) && check_fast_forward_char_pair_sse2(common, chars, max))
6305 return TRUE;
6306 #endif
6307
6308 in_range = FALSE;
6309 /* Prevent compiler "uninitialized" warning */
6310 from = 0;
6311 range_len = 4 /* minimum length */ - 1;
6312 for (i = 0; i <= max; i++)
6313 {
6314 if (in_range && (i - from) > range_len && (chars[i - 1].count < 255))
6315 {
6316 range_len = i - from;
6317 range_right = i - 1;
6318 }
6319
6320 if (i < max && chars[i].count < 255)
6321 {
6322 SLJIT_ASSERT(chars[i].count > 0);
6323 if (!in_range)
6324 {
6325 in_range = TRUE;
6326 from = i;
6327 }
6328 }
6329 else
6330 in_range = FALSE;
6331 }
6332
6333 if (range_right >= 0)
6334 {
6335 update_table = (sljit_u8 *)allocate_read_only_data(common, 256);
6336 if (update_table == NULL)
6337 return TRUE;
6338 memset(update_table, IN_UCHARS(range_len), 256);
6339
6340 for (i = 0; i < range_len; i++)
6341 {
6342 SLJIT_ASSERT(chars[range_right - i].count > 0 && chars[range_right - i].count < 255);
6343
6344 char_set = chars[range_right - i].chars;
6345 char_set_end = char_set + chars[range_right - i].count;
6346 do
6347 {
6348 if (update_table[(*char_set) & 0xff] > IN_UCHARS(i))
6349 update_table[(*char_set) & 0xff] = IN_UCHARS(i);
6350 char_set++;
6351 }
6352 while (char_set < char_set_end);
6353 }
6354 }
6355
6356 offset = -1;
6357 /* Scan forward. */
6358 for (i = 0; i < max; i++)
6359 {
6360 if (range_right == i)
6361 continue;
6362
6363 if (offset == -1)
6364 {
6365 if (chars[i].last_count >= 2)
6366 offset = i;
6367 }
6368 else if (chars[offset].last_count < chars[i].last_count)
6369 offset = i;
6370 }
6371
6372 SLJIT_ASSERT(offset == -1 || (chars[offset].count >= 1 && chars[offset].count <= 2));
6373
6374 if (range_right < 0)
6375 {
6376 if (offset < 0)
6377 return FALSE;
6378 /* Works regardless the value is 1 or 2. */
6379 fast_forward_first_char2(common, chars[offset].chars[0], chars[offset].chars[1], offset);
6380 return TRUE;
6381 }
6382
6383 SLJIT_ASSERT(range_right != offset);
6384
6385 if (common->match_end_ptr != 0)
6386 {
6387 OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
6388 OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
6389 OP2(SLJIT_SUB, STR_END, 0, STR_END, 0, SLJIT_IMM, IN_UCHARS(max));
6390 OP2(SLJIT_SUB | SLJIT_SET_GREATER, SLJIT_UNUSED, 0, STR_END, 0, TMP1, 0);
6391 CMOV(SLJIT_GREATER, STR_END, TMP1, 0);
6392 }
6393 else
6394 OP2(SLJIT_SUB, STR_END, 0, STR_END, 0, SLJIT_IMM, IN_UCHARS(max));
6395
6396 SLJIT_ASSERT(range_right >= 0);
6397
6398 #if !(defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
6399 OP1(SLJIT_MOV, RETURN_ADDR, 0, SLJIT_IMM, (sljit_sw)update_table);
6400 #endif
6401
6402 start = LABEL();
6403 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0));
6404
6405 #if PCRE2_CODE_UNIT_WIDTH == 8 || (defined SLJIT_LITTLE_ENDIAN && SLJIT_LITTLE_ENDIAN)
6406 OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(range_right));
6407 #else
6408 OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(range_right + 1) - 1);
6409 #endif
6410
6411 #if !(defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
6412 OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(RETURN_ADDR, TMP1), 0);
6413 #else
6414 OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)update_table);
6415 #endif
6416 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
6417 CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, 0, start);
6418
6419 if (offset >= 0)
6420 {
6421 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(offset));
6422 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
6423
6424 if (chars[offset].count == 1)
6425 CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, chars[offset].chars[0], start);
6426 else
6427 {
6428 mask = chars[offset].chars[0] ^ chars[offset].chars[1];
6429 if (is_powerof2(mask))
6430 {
6431 OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, mask);
6432 CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, chars[offset].chars[0] | mask, start);
6433 }
6434 else
6435 {
6436 match = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, chars[offset].chars[0]);
6437 CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, chars[offset].chars[1], start);
6438 JUMPHERE(match);
6439 }
6440 }
6441 }
6442
6443 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
6444 if (common->utf && offset != 0)
6445 {
6446 if (offset < 0)
6447 {
6448 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
6449 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
6450 }
6451 else
6452 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
6453
6454 jumpto_if_not_utf_char_start(compiler, TMP1, start);
6455
6456 if (offset < 0)
6457 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
6458 }
6459 #endif
6460
6461 if (offset >= 0)
6462 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
6463
6464 if (common->match_end_ptr != 0)
6465 OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
6466 else
6467 OP2(SLJIT_ADD, STR_END, 0, STR_END, 0, SLJIT_IMM, IN_UCHARS(max));
6468 return TRUE;
6469 }
6470
6471 static SLJIT_INLINE void fast_forward_first_char(compiler_common *common)
6472 {
6473 PCRE2_UCHAR first_char = (PCRE2_UCHAR)(common->re->first_codeunit);
6474 PCRE2_UCHAR oc;
6475
6476 oc = first_char;
6477 if ((common->re->flags & PCRE2_FIRSTCASELESS) != 0)
6478 {
6479 oc = TABLE_GET(first_char, common->fcc, first_char);
6480 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
6481 if (first_char > 127 && common->utf)
6482 oc = UCD_OTHERCASE(first_char);
6483 #endif
6484 }
6485
6486 fast_forward_first_char2(common, first_char, oc, 0);
6487 }
6488
6489 static SLJIT_INLINE void fast_forward_newline(compiler_common *common)
6490 {
6491 DEFINE_COMPILER;
6492 struct sljit_label *loop;
6493 struct sljit_jump *lastchar;
6494 struct sljit_jump *firstchar;
6495 struct sljit_jump *quit;
6496 struct sljit_jump *foundcr = NULL;
6497 struct sljit_jump *notfoundnl;
6498 jump_list *newline = NULL;
6499
6500 if (common->match_end_ptr != 0)
6501 {
6502 OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
6503 OP1(SLJIT_MOV, STR_END, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
6504 }
6505
6506 if (common->nltype == NLTYPE_FIXED && common->newline > 255)
6507 {
6508 lastchar = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
6509 OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0);
6510 OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, str));
6511 OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, begin));
6512 firstchar = CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, TMP2, 0);
6513
6514 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(2));
6515 OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, STR_PTR, 0, TMP1, 0);
6516 OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_GREATER_EQUAL);
6517 #if PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32
6518 OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, UCHAR_SHIFT);
6519 #endif
6520 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
6521
6522 loop = LABEL();
6523 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
6524 quit = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
6525 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
6526 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
6527 CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff, loop);
6528 CMPTO(SLJIT_NOT_EQUAL, TMP2, 0, SLJIT_IMM, common->newline & 0xff, loop);
6529
6530 JUMPHERE(quit);
6531 JUMPHERE(firstchar);
6532 JUMPHERE(lastchar);
6533
6534 if (common->match_end_ptr != 0)
6535 OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
6536 return;
6537 }
6538
6539 OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0);
6540 /* Example: match /^/ to \r\n from offset 1. */
6541 OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, str));
6542 firstchar = CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, TMP2, 0);
6543 move_back(common, NULL, FALSE);
6544
6545 loop = LABEL();
6546 common->ff_newline_shortcut = loop;
6547
6548 read_char(common, common->nlmin, common->nlmax, NULL, READ_CHAR_NEWLINE);
6549 lastchar = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
6550 if (common->nltype == NLTYPE_ANY || common->nltype == NLTYPE_ANYCRLF)
6551 foundcr = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR);
6552 check_newlinechar(common, common->nltype, &newline, FALSE);
6553 set_jumps(newline, loop);
6554
6555 if (common->nltype == NLTYPE_ANY || common->nltype == NLTYPE_ANYCRLF)
6556 {
6557 quit = JUMP(SLJIT_JUMP);
6558 JUMPHERE(foundcr);
6559 notfoundnl = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
6560 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
6561 OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, CHAR_NL);
6562 OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_EQUAL);
6563 #if PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32
6564 OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, UCHAR_SHIFT);
6565 #endif
6566 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
6567 JUMPHERE(notfoundnl);
6568 JUMPHERE(quit);
6569 }
6570 JUMPHERE(lastchar);
6571 JUMPHERE(firstchar);
6572
6573 if (common->match_end_ptr != 0)
6574 OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
6575 }
6576
6577 static BOOL optimize_class(compiler_common *common, const sljit_u8 *bits, BOOL nclass, BOOL invert, jump_list **backtracks);
6578
6579 static SLJIT_INLINE void fast_forward_start_bits(compiler_common *common)
6580 {
6581 DEFINE_COMPILER;
6582 const sljit_u8 *start_bits = common->re->start_bitmap;
6583 struct sljit_label *start;
6584 struct sljit_jump *partial_quit;
6585 #if PCRE2_CODE_UNIT_WIDTH != 8
6586 struct sljit_jump *found = NULL;
6587 #endif
6588 jump_list *matches = NULL;
6589
6590 if (common->match_end_ptr != 0)
6591 {
6592 OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
6593 OP1(SLJIT_MOV, RETURN_ADDR, 0, STR_END, 0);
6594 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(1));
6595 OP2(SLJIT_SUB | SLJIT_SET_GREATER, SLJIT_UNUSED, 0, STR_END, 0, TMP1, 0);
6596 CMOV(SLJIT_GREATER, STR_END, TMP1, 0);
6597 }
6598
6599 start = LABEL();
6600
6601 partial_quit = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
6602 if (common->mode == PCRE2_JIT_COMPLETE)
6603 add_jump(compiler, &common->failed_match, partial_quit);
6604
6605 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
6606 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
6607
6608 if (!optimize_class(common, start_bits, (start_bits[31] & 0x80) != 0, FALSE, &matches))
6609 {
6610 #if PCRE2_CODE_UNIT_WIDTH != 8
6611 if ((start_bits[31] & 0x80) != 0)
6612 found = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 255);
6613 else
6614 CMPTO(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 255, start);
6615 #elif defined SUPPORT_UNICODE
6616 if (common->utf && is_char7_bitset(start_bits, FALSE))
6617 CMPTO(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 127, start);
6618 #endif
6619 OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7);
6620 OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3);
6621 OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)start_bits);
6622 if (sljit_get_register_index(TMP3) >= 0)
6623 {
6624 OP2(SLJIT_SHL, TMP3, 0, SLJIT_IMM, 1, TMP2, 0);
6625 OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, TMP3, 0);
6626 }
6627 else
6628 {
6629 OP2(SLJIT_SHL, TMP2, 0, SLJIT_IMM, 1, TMP2, 0);
6630 OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, TMP2, 0);
6631 }
6632 JUMPTO(SLJIT_ZERO, start);
6633 }
6634 else
6635 set_jumps(matches, start);
6636
6637 #if PCRE2_CODE_UNIT_WIDTH != 8
6638 if (found != NULL)
6639 JUMPHERE(found);
6640 #endif
6641
6642 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
6643
6644 if (common->mode != PCRE2_JIT_COMPLETE)
6645 JUMPHERE(partial_quit);
6646
6647 if (common->match_end_ptr != 0)
6648 OP1(SLJIT_MOV, STR_END, 0, RETURN_ADDR, 0);
6649 }
6650
6651 static SLJIT_INLINE struct sljit_jump *search_requested_char(compiler_common *common, PCRE2_UCHAR req_char, BOOL caseless, BOOL has_firstchar)
6652 {
6653 DEFINE_COMPILER;
6654 struct sljit_label *loop;
6655 struct sljit_jump *toolong;
6656 struct sljit_jump *alreadyfound;
6657 struct sljit_jump *found;
6658 struct sljit_jump *foundoc = NULL;
6659 struct sljit_jump *notfound;
6660 sljit_u32 oc, bit;
6661
6662 SLJIT_ASSERT(common->req_char_ptr != 0);
6663 OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), common->req_char_ptr);
6664 OP2(SLJIT_ADD, TMP1, 0, STR_PTR, 0, SLJIT_IMM, REQ_CU_MAX);
6665 toolong = CMP(SLJIT_LESS, TMP1, 0, STR_END, 0);
6666 alreadyfound = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0);
6667
6668 if (has_firstchar)
6669 OP2(SLJIT_ADD, TMP1, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
6670 else
6671 OP1(SLJIT_MOV, TMP1, 0, STR_PTR, 0);
6672
6673 loop = LABEL();
6674 notfound = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0);
6675
6676 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(TMP1), 0);
6677 oc = req_char;
6678 if (caseless)
6679 {
6680 oc = TABLE_GET(req_char, common->fcc, req_char);
6681 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
6682 if (req_char > 127 && common->utf)
6683 oc = UCD_OTHERCASE(req_char);
6684 #endif
6685 }
6686 if (req_char == oc)
6687 found = CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, req_char);
6688 else
6689 {
6690 bit = req_char ^ oc;
6691 if (is_powerof2(bit))
6692 {
6693 OP2(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_IMM, bit);
6694 found = CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, req_char | bit);
6695 }
6696 else
6697 {
6698 found = CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, req_char);
6699 foundoc = CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, oc);
6700 }
6701 }
6702 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(1));
6703 JUMPTO(SLJIT_JUMP, loop);
6704
6705 JUMPHERE(found);
6706 if (foundoc)
6707 JUMPHERE(foundoc);
6708 OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->req_char_ptr, TMP1, 0);
6709 JUMPHERE(alreadyfound);
6710 JUMPHERE(toolong);
6711 return notfound;
6712 }
6713
6714 static void do_revertframes(compiler_common *common)
6715 {
6716 DEFINE_COMPILER;
6717 struct sljit_jump *jump;
6718 struct sljit_label *mainloop;
6719
6720 sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
6721 GET_LOCAL_BASE(TMP1, 0, 0);
6722
6723 /* Drop frames until we reach STACK_TOP. */
6724 mainloop = LABEL();
6725 OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(STACK_TOP), -sizeof(sljit_sw));
6726 jump = CMP(SLJIT_SIG_LESS_EQUAL, TMP2, 0, SLJIT_IMM, 0);
6727
6728 OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0);
6729 if (sljit_get_register_index(TMP3) < 0)
6730 {
6731 OP1(SLJIT_MOV, SLJIT_MEM1(TMP2), 0, SLJIT_MEM1(STACK_TOP), -(2 * sizeof(sljit_sw)));
6732 OP1(SLJIT_MOV, SLJIT_MEM1(TMP2), sizeof(sljit_sw), SLJIT_MEM1(STACK_TOP), -(3 * sizeof(sljit_sw)));
6733 OP2(SLJIT_SUB, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, 3 * sizeof(sljit_sw));
6734 }
6735 else
6736 {
6737 OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), -(2 * sizeof(sljit_sw)));
6738 OP1(SLJIT_MOV, TMP3, 0, SLJIT_MEM1(STACK_TOP), -(3 * sizeof(sljit_sw)));
6739 OP2(SLJIT_SUB, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, 3 * sizeof(sljit_sw));
6740 OP1(SLJIT_MOV, SLJIT_MEM1(TMP2), 0, TMP1, 0);
6741 GET_LOCAL_BASE(TMP1, 0, 0);
6742 OP1(SLJIT_MOV, SLJIT_MEM1(TMP2), sizeof(sljit_sw), TMP3, 0);
6743 }
6744 JUMPTO(SLJIT_JUMP, mainloop);
6745
6746 JUMPHERE(jump);
6747 jump = CMP(SLJIT_NOT_ZERO /* SIG_LESS */, TMP2, 0, SLJIT_IMM, 0);
6748 /* End of reverting values. */
6749 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
6750
6751 JUMPHERE(jump);
6752 OP1(SLJIT_NEG, TMP2, 0, TMP2, 0);
6753 OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0);
6754 if (sljit_get_register_index(TMP3) < 0)
6755 {
6756 OP1(SLJIT_MOV, SLJIT_MEM1(TMP2), 0, SLJIT_MEM1(STACK_TOP), -(2 * sizeof(sljit_sw)));
6757 OP2(SLJIT_SUB, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, 2 * sizeof(sljit_sw));
6758 }
6759 else
6760 {
6761 OP1(SLJIT_MOV, TMP3, 0, SLJIT_MEM1(STACK_TOP), -(2 * sizeof(sljit_sw)));
6762 OP2(SLJIT_SUB, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, 2 * sizeof(sljit_sw));
6763 OP1(SLJIT_MOV, SLJIT_MEM1(TMP2), 0, TMP3, 0);
6764 }
6765 JUMPTO(SLJIT_JUMP, mainloop);
6766 }
6767
6768 static void check_wordboundary(compiler_common *common)
6769 {
6770 DEFINE_COMPILER;
6771 struct sljit_jump *skipread;
6772 jump_list *skipread_list = NULL;
6773 jump_list *invalid_utf = NULL;
6774 #if PCRE2_CODE_UNIT_WIDTH != 8 || defined SUPPORT_UNICODE
6775 struct sljit_jump *jump;
6776 #endif /* PCRE2_CODE_UNIT_WIDTH != 8 || SUPPORT_UNICODE */
6777
6778 SLJIT_COMPILE_ASSERT(ctype_word == 0x10, ctype_word_must_be_16);
6779
6780 sljit_emit_fast_enter(compiler, SLJIT_MEM1(SLJIT_SP), LOCALS0);
6781 /* Get type of the previous char, and put it to TMP3. */
6782 OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0);
6783 OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, begin));
6784 OP1(SLJIT_MOV, TMP3, 0, SLJIT_IMM, 0);
6785 skipread = CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, TMP2, 0);
6786
6787 if (common->mode == PCRE2_JIT_COMPLETE)
6788 peek_char_back(common, READ_CHAR_MAX, &invalid_utf);
6789 else
6790 {
6791 move_back(common, &invalid_utf, FALSE);
6792 check_start_used_ptr(common);
6793 /* No need precise read since match fails anyway. */
6794 read_char(common, 0, READ_CHAR_MAX, &invalid_utf, READ_CHAR_UPDATE_STR_PTR);
6795 }
6796
6797 /* Testing char type. */
6798 #ifdef SUPPORT_UNICODE
6799 if (common->use_ucp)
6800 {
6801 OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 1);
6802 jump = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_UNDERSCORE);
6803 add_jump(compiler, &common->getucdtype, JUMP(SLJIT_FAST_CALL));
6804 OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ucp_Ll);
6805 OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, ucp_Lu - ucp_Ll);
6806 OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);
6807 OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ucp_Nd - ucp_Ll);
6808 OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, ucp_No - ucp_Nd);
6809 OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);
6810 JUMPHERE(jump);
6811 OP1(SLJIT_MOV, TMP3, 0, TMP2, 0);
6812 }
6813 else
6814 #endif /* SUPPORT_UNICODE */
6815 {
6816 #if PCRE2_CODE_UNIT_WIDTH != 8
6817 jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255);
6818 #elif defined SUPPORT_UNICODE
6819 /* Here TMP3 has already been zeroed. */
6820 jump = NULL;
6821 if (common->utf)
6822 jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255);
6823 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
6824 OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), common->ctypes);
6825 OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 4 /* ctype_word */);
6826 OP2(SLJIT_AND, TMP3, 0, TMP1, 0, SLJIT_IMM, 1);
6827 #if PCRE2_CODE_UNIT_WIDTH != 8
6828 JUMPHERE(jum