--- code/trunk/study.c 2007/02/24 21:38:09 7 +++ code/trunk/study.c 2007/02/24 21:38:45 25 @@ -9,7 +9,7 @@ Written by: Philip Hazel - Copyright (c) 1997 University of Cambridge + Copyright (c) 1998 University of Cambridge ----------------------------------------------------------------------------- Permission is granted to anyone to use this software for any purpose on any @@ -37,6 +37,32 @@ /************************************************* +* Set a bit and maybe its alternate case * +*************************************************/ + +/* Given a character, set its bit in the table, and also the bit for the other +version of a letter if we are caseless. + +Arguments: + start_bits points to the bit map + c is the character + caseless the caseless flag + cd the block with char table pointers + +Returns: nothing +*/ + +static void +set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd) +{ +start_bits[c/8] |= (1 << (c&7)); +if (caseless && (cd->ctypes[c] & ctype_letter) != 0) + start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7)); +} + + + +/************************************************* * Create bitmap of starting chars * *************************************************/ @@ -47,12 +73,15 @@ Arguments: code points to an expression start_bits points to a 32-byte table, initialized to 0 + caseless the current state of the caseless flag + cd the block with char table pointers Returns: TRUE if table built, FALSE otherwise */ static BOOL -set_start_bits(const uschar *code, uschar *start_bits) +set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless, + compile_data *cd) { register int c; @@ -65,9 +94,13 @@ { try_next = FALSE; + /* If a branch starts with a bracket or a positive lookahead assertion, + recurse to set bits from within them. That's all for this branch. */ + if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT) { - if (!set_start_bits(tcode, start_bits)) return FALSE; + if (!set_start_bits(tcode, start_bits, caseless, cd)) + return FALSE; } else switch(*tcode) @@ -75,11 +108,30 @@ default: return FALSE; + /* Skip over lookbehind and negative lookahead assertions */ + + case OP_ASSERT_NOT: + case OP_ASSERTBACK: + case OP_ASSERTBACK_NOT: + try_next = TRUE; + do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT); + tcode += 3; + break; + + /* Skip over an option setting, changing the caseless flag */ + + case OP_OPT: + caseless = (tcode[1] & PCRE_CASELESS) != 0; + tcode += 2; + try_next = TRUE; + break; + /* BRAZERO does the bracket, but carries on. */ case OP_BRAZERO: case OP_BRAMINZERO: - if (!set_start_bits(++tcode, start_bits)) return FALSE; + if (!set_start_bits(++tcode, start_bits, caseless, cd)) + return FALSE; do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT); tcode += 3; try_next = TRUE; @@ -91,7 +143,7 @@ case OP_MINSTAR: case OP_QUERY: case OP_MINQUERY: - start_bits[tcode[1]/8] |= (1 << (tcode[1]&7)); + set_bit(start_bits, tcode[1], caseless, cd); tcode += 2; try_next = TRUE; break; @@ -100,7 +152,7 @@ case OP_UPTO: case OP_MINUPTO: - start_bits[tcode[3]/8] |= (1 << (tcode[3]&7)); + set_bit(start_bits, tcode[3], caseless, cd); tcode += 4; try_next = TRUE; break; @@ -115,35 +167,39 @@ case OP_PLUS: case OP_MINPLUS: - start_bits[tcode[1]/8] |= (1 << (tcode[1]&7)); + set_bit(start_bits, tcode[1], caseless, cd); break; /* Single character type sets the bits and stops */ case OP_NOT_DIGIT: - for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit]; + for (c = 0; c < 32; c++) + start_bits[c] |= ~cd->cbits[c+cbit_digit]; break; case OP_DIGIT: - for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit]; + for (c = 0; c < 32; c++) + start_bits[c] |= cd->cbits[c+cbit_digit]; break; case OP_NOT_WHITESPACE: - for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space]; + for (c = 0; c < 32; c++) + start_bits[c] |= ~cd->cbits[c+cbit_space]; break; case OP_WHITESPACE: - for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space]; + for (c = 0; c < 32; c++) + start_bits[c] |= cd->cbits[c+cbit_space]; break; case OP_NOT_WORDCHAR: for (c = 0; c < 32; c++) - start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]); + start_bits[c] |= ~(cd->cbits[c] | cd->cbits[c+cbit_word]); break; case OP_WORDCHAR: for (c = 0; c < 32; c++) - start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]); + start_bits[c] |= (cd->cbits[c] | cd->cbits[c+cbit_word]); break; /* One or more character type fudges the pointer and restarts, knowing @@ -174,29 +230,33 @@ switch(tcode[1]) { case OP_NOT_DIGIT: - for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit]; + for (c = 0; c < 32; c++) + start_bits[c] |= ~cd->cbits[c+cbit_digit]; break; case OP_DIGIT: - for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit]; + for (c = 0; c < 32; c++) + start_bits[c] |= cd->cbits[c+cbit_digit]; break; case OP_NOT_WHITESPACE: - for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space]; + for (c = 0; c < 32; c++) + start_bits[c] |= ~cd->cbits[c+cbit_space]; break; case OP_WHITESPACE: - for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space]; + for (c = 0; c < 32; c++) + start_bits[c] |= cd->cbits[c+cbit_space]; break; case OP_NOT_WORDCHAR: for (c = 0; c < 32; c++) - start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]); + start_bits[c] |= ~(cd->cbits[c] | cd->cbits[c+cbit_word]); break; case OP_WORDCHAR: for (c = 0; c < 32; c++) - start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]); + start_bits[c] |= (cd->cbits[c] | cd->cbits[c+cbit_word]); break; } @@ -266,10 +326,10 @@ pcre_extra * pcre_study(const pcre *external_re, int options, const char **errorptr) { -BOOL caseless; uschar start_bits[32]; real_pcre_extra *extra; const real_pcre *re = (const real_pcre *)external_re; +compile_data compile_block; *errorptr = NULL; @@ -285,10 +345,6 @@ return NULL; } -/* Caseless can either be from the compiled regex or from options. */ - -caseless = ((re->options | options) & PCRE_CASELESS) != 0; - /* For an anchored pattern, or an unchored pattern that has a first char, or a multiline pattern that matches only at "line starts", no further processing at present. */ @@ -296,27 +352,18 @@ if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0) return NULL; -/* See if we can find a fixed set of initial characters for the pattern. */ +/* Set the character tables in the block which is passed around */ -memset(start_bits, 0, 32 * sizeof(uschar)); -if (!set_start_bits(re->code, start_bits)) return NULL; +compile_block.lcc = re->tables + lcc_offset; +compile_block.fcc = re->tables + fcc_offset; +compile_block.cbits = re->tables + cbits_offset; +compile_block.ctypes = re->tables + ctypes_offset; -/* If this studying is caseless, scan the created bit map and duplicate the -bits for any letters. */ +/* See if we can find a fixed set of initial characters for the pattern. */ -if (caseless) - { - register int c; - for (c = 0; c < 256; c++) - { - if ((start_bits[c/8] & (1 << (c&7))) != 0 && - (pcre_ctypes[c] & ctype_letter) != 0) - { - int d = pcre_fcc[c]; - start_bits[d/8] |= (1 << (d&7)); - } - } - } +memset(start_bits, 0, 32 * sizeof(uschar)); +if (!set_start_bits(re->code, start_bits, (re->options & PCRE_CASELESS) != 0, + &compile_block)) return NULL; /* Get an "extra" block and put the information therein. */ @@ -328,7 +375,7 @@ return NULL; } -extra->options = PCRE_STUDY_MAPPED | (caseless? PCRE_STUDY_CASELESS : 0); +extra->options = PCRE_STUDY_MAPPED; memcpy(extra->start_bits, start_bits, sizeof(start_bits)); return (pcre_extra *)extra;