--- code/trunk/study.c 2007/02/24 21:38:21 13 +++ code/trunk/study.c 2007/02/24 21:39:42 53 @@ -9,7 +9,7 @@ Written by: Philip Hazel - Copyright (c) 1997 University of Cambridge + Copyright (c) 1997-2001 University of Cambridge ----------------------------------------------------------------------------- Permission is granted to anyone to use this software for any purpose on any @@ -25,6 +25,10 @@ 3. Altered versions must be plainly marked as such, and must not be misrepresented as being the original software. + +4. If PCRE is embedded in any software that is released under the GNU + General Purpose Licence (GPL), then the terms of that licence shall + supersede any condition above with which it is incompatible. ----------------------------------------------------------------------------- */ @@ -37,6 +41,32 @@ /************************************************* +* Set a bit and maybe its alternate case * +*************************************************/ + +/* Given a character, set its bit in the table, and also the bit for the other +version of a letter if we are caseless. + +Arguments: + start_bits points to the bit map + c is the character + caseless the caseless flag + cd the block with char table pointers + +Returns: nothing +*/ + +static void +set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd) +{ +start_bits[c/8] |= (1 << (c&7)); +if (caseless && (cd->ctypes[c] & ctype_letter) != 0) + start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7)); +} + + + +/************************************************* * Create bitmap of starting chars * *************************************************/ @@ -47,15 +77,26 @@ Arguments: code points to an expression start_bits points to a 32-byte table, initialized to 0 + caseless the current state of the caseless flag + cd the block with char table pointers Returns: TRUE if table built, FALSE otherwise */ static BOOL -set_start_bits(const uschar *code, uschar *start_bits) +set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless, + compile_data *cd) { register int c; +/* This next statement and the later reference to dummy are here in order to +trick the optimizer of the IBM C compiler for OS/2 into generating correct +code. Apparently IBM isn't going to fix the problem, and we would rather not +disable optimization (in this module it actually makes a big difference, and +the pcre module can use all the optimization it can get). */ + +volatile int dummy; + do { const uschar *tcode = code + 3; @@ -63,11 +104,14 @@ while (try_next) { - try_next = FALSE; + /* If a branch starts with a bracket or a positive lookahead assertion, + recurse to set bits from within them. That's all for this branch. */ if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT) { - if (!set_start_bits(tcode, start_bits)) return FALSE; + if (!set_start_bits(tcode, start_bits, caseless, cd)) + return FALSE; + try_next = FALSE; } else switch(*tcode) @@ -75,14 +119,37 @@ default: return FALSE; + /* Skip over extended extraction bracket number */ + + case OP_BRANUMBER: + tcode += 3; + break; + + /* Skip over lookbehind and negative lookahead assertions */ + + case OP_ASSERT_NOT: + case OP_ASSERTBACK: + case OP_ASSERTBACK_NOT: + do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT); + tcode += 3; + break; + + /* Skip over an option setting, changing the caseless flag */ + + case OP_OPT: + caseless = (tcode[1] & PCRE_CASELESS) != 0; + tcode += 2; + break; + /* BRAZERO does the bracket, but carries on. */ case OP_BRAZERO: case OP_BRAMINZERO: - if (!set_start_bits(++tcode, start_bits)) return FALSE; + if (!set_start_bits(++tcode, start_bits, caseless, cd)) + return FALSE; + dummy = 1; do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT); tcode += 3; - try_next = TRUE; break; /* Single-char * or ? sets the bit and tries the next item */ @@ -91,18 +158,16 @@ case OP_MINSTAR: case OP_QUERY: case OP_MINQUERY: - start_bits[tcode[1]/8] |= (1 << (tcode[1]&7)); + set_bit(start_bits, tcode[1], caseless, cd); tcode += 2; - try_next = TRUE; break; /* Single-char upto sets the bit and tries the next */ case OP_UPTO: case OP_MINUPTO: - start_bits[tcode[3]/8] |= (1 << (tcode[3]&7)); + set_bit(start_bits, tcode[3], caseless, cd); tcode += 4; - try_next = TRUE; break; /* At least one single char sets the bit and stops */ @@ -115,35 +180,46 @@ case OP_PLUS: case OP_MINPLUS: - start_bits[tcode[1]/8] |= (1 << (tcode[1]&7)); + set_bit(start_bits, tcode[1], caseless, cd); + try_next = FALSE; break; /* Single character type sets the bits and stops */ case OP_NOT_DIGIT: - for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit]; + for (c = 0; c < 32; c++) + start_bits[c] |= ~cd->cbits[c+cbit_digit]; + try_next = FALSE; break; case OP_DIGIT: - for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit]; + for (c = 0; c < 32; c++) + start_bits[c] |= cd->cbits[c+cbit_digit]; + try_next = FALSE; break; case OP_NOT_WHITESPACE: - for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space]; + for (c = 0; c < 32; c++) + start_bits[c] |= ~cd->cbits[c+cbit_space]; + try_next = FALSE; break; case OP_WHITESPACE: - for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space]; + for (c = 0; c < 32; c++) + start_bits[c] |= cd->cbits[c+cbit_space]; + try_next = FALSE; break; case OP_NOT_WORDCHAR: for (c = 0; c < 32; c++) - start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]); + start_bits[c] |= ~cd->cbits[c+cbit_word]; + try_next = FALSE; break; case OP_WORDCHAR: for (c = 0; c < 32; c++) - start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]); + start_bits[c] |= cd->cbits[c+cbit_word]; + try_next = FALSE; break; /* One or more character type fudges the pointer and restarts, knowing @@ -152,12 +228,10 @@ case OP_TYPEPLUS: case OP_TYPEMINPLUS: tcode++; - try_next = TRUE; break; case OP_TYPEEXACT: tcode += 3; - try_next = TRUE; break; /* Zero or more repeats of character types set the bits and then @@ -174,41 +248,43 @@ switch(tcode[1]) { case OP_NOT_DIGIT: - for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit]; + for (c = 0; c < 32; c++) + start_bits[c] |= ~cd->cbits[c+cbit_digit]; break; case OP_DIGIT: - for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit]; + for (c = 0; c < 32; c++) + start_bits[c] |= cd->cbits[c+cbit_digit]; break; case OP_NOT_WHITESPACE: - for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space]; + for (c = 0; c < 32; c++) + start_bits[c] |= ~cd->cbits[c+cbit_space]; break; case OP_WHITESPACE: - for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space]; + for (c = 0; c < 32; c++) + start_bits[c] |= cd->cbits[c+cbit_space]; break; case OP_NOT_WORDCHAR: for (c = 0; c < 32; c++) - start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]); + start_bits[c] |= ~cd->cbits[c+cbit_word]; break; case OP_WORDCHAR: for (c = 0; c < 32; c++) - start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]); + start_bits[c] |= cd->cbits[c+cbit_word]; break; } tcode += 2; - try_next = TRUE; break; /* Character class: set the bits and either carry on or not, according to the repeat count. */ case OP_CLASS: - case OP_NEGCLASS: { tcode++; for (c = 0; c < 32; c++) start_bits[c] |= tcode[c]; @@ -220,16 +296,16 @@ case OP_CRQUERY: case OP_CRMINQUERY: tcode++; - try_next = TRUE; break; case OP_CRRANGE: case OP_CRMINRANGE: - if (((tcode[1] << 8) + tcode[2]) == 0) - { - tcode += 5; - try_next = TRUE; - } + if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5; + else try_next = FALSE; + break; + + default: + try_next = FALSE; break; } } @@ -267,10 +343,10 @@ pcre_extra * pcre_study(const pcre *external_re, int options, const char **errorptr) { -BOOL caseless; uschar start_bits[32]; real_pcre_extra *extra; const real_pcre *re = (const real_pcre *)external_re; +compile_data compile_block; *errorptr = NULL; @@ -286,10 +362,6 @@ return NULL; } -/* Caseless can either be from the compiled regex or from options. */ - -caseless = ((re->options | options) & PCRE_CASELESS) != 0; - /* For an anchored pattern, or an unchored pattern that has a first char, or a multiline pattern that matches only at "line starts", no further processing at present. */ @@ -297,27 +369,18 @@ if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0) return NULL; -/* See if we can find a fixed set of initial characters for the pattern. */ +/* Set the character tables in the block which is passed around */ -memset(start_bits, 0, 32 * sizeof(uschar)); -if (!set_start_bits(re->code, start_bits)) return NULL; +compile_block.lcc = re->tables + lcc_offset; +compile_block.fcc = re->tables + fcc_offset; +compile_block.cbits = re->tables + cbits_offset; +compile_block.ctypes = re->tables + ctypes_offset; -/* If this studying is caseless, scan the created bit map and duplicate the -bits for any letters. */ +/* See if we can find a fixed set of initial characters for the pattern. */ -if (caseless) - { - register int c; - for (c = 0; c < 256; c++) - { - if ((start_bits[c/8] & (1 << (c&7))) != 0 && - (pcre_ctypes[c] & ctype_letter) != 0) - { - int d = pcre_fcc[c]; - start_bits[d/8] |= (1 << (d&7)); - } - } - } +memset(start_bits, 0, 32 * sizeof(uschar)); +if (!set_start_bits(re->code, start_bits, (re->options & PCRE_CASELESS) != 0, + &compile_block)) return NULL; /* Get an "extra" block and put the information therein. */ @@ -329,7 +392,7 @@ return NULL; } -extra->options = PCRE_STUDY_MAPPED | (caseless? PCRE_STUDY_CASELESS : 0); +extra->options = PCRE_STUDY_MAPPED; memcpy(extra->start_bits, start_bits, sizeof(start_bits)); return (pcre_extra *)extra;