--- code/trunk/study.c 2007/02/24 21:38:25 15 +++ code/trunk/study.c 2007/02/24 21:40:03 63 @@ -9,7 +9,7 @@ Written by: Philip Hazel - Copyright (c) 1998 University of Cambridge + Copyright (c) 1997-2002 University of Cambridge ----------------------------------------------------------------------------- Permission is granted to anyone to use this software for any purpose on any @@ -25,6 +25,10 @@ 3. Altered versions must be plainly marked as such, and must not be misrepresented as being the original software. + +4. If PCRE is embedded in any software that is released under the GNU + General Purpose Licence (GPL), then the terms of that licence shall + supersede any condition above with which it is incompatible. ----------------------------------------------------------------------------- */ @@ -37,6 +41,32 @@ /************************************************* +* Set a bit and maybe its alternate case * +*************************************************/ + +/* Given a character, set its bit in the table, and also the bit for the other +version of a letter if we are caseless. + +Arguments: + start_bits points to the bit map + c is the character + caseless the caseless flag + cd the block with char table pointers + +Returns: nothing +*/ + +static void +set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd) +{ +start_bits[c/8] |= (1 << (c&7)); +if (caseless && (cd->ctypes[c] & ctype_letter) != 0) + start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7)); +} + + + +/************************************************* * Create bitmap of starting chars * *************************************************/ @@ -47,27 +77,42 @@ Arguments: code points to an expression start_bits points to a 32-byte table, initialized to 0 + caseless the current state of the caseless flag + utf8 TRUE if in UTF-8 mode + cd the block with char table pointers Returns: TRUE if table built, FALSE otherwise */ static BOOL -set_start_bits(const uschar *code, uschar *start_bits) +set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless, + BOOL utf8, compile_data *cd) { register int c; +/* This next statement and the later reference to dummy are here in order to +trick the optimizer of the IBM C compiler for OS/2 into generating correct +code. Apparently IBM isn't going to fix the problem, and we would rather not +disable optimization (in this module it actually makes a big difference, and +the pcre module can use all the optimization it can get). */ + +volatile int dummy; + do { - const uschar *tcode = code + 3; + const uschar *tcode = code + 1 + LINK_SIZE; BOOL try_next = TRUE; while (try_next) { - try_next = FALSE; + /* If a branch starts with a bracket or a positive lookahead assertion, + recurse to set bits from within them. That's all for this branch. */ if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT) { - if (!set_start_bits(tcode, start_bits)) return FALSE; + if (!set_start_bits(tcode, start_bits, caseless, utf8, cd)) + return FALSE; + try_next = FALSE; } else switch(*tcode) @@ -75,14 +120,43 @@ default: return FALSE; + /* Skip over callout */ + + case OP_CALLOUT: + tcode += 2; + break; + + /* Skip over extended extraction bracket number */ + + case OP_BRANUMBER: + tcode += 3; + break; + + /* Skip over lookbehind and negative lookahead assertions */ + + case OP_ASSERT_NOT: + case OP_ASSERTBACK: + case OP_ASSERTBACK_NOT: + do tcode += GET(tcode, 1); while (*tcode == OP_ALT); + tcode += 1+LINK_SIZE; + break; + + /* Skip over an option setting, changing the caseless flag */ + + case OP_OPT: + caseless = (tcode[1] & PCRE_CASELESS) != 0; + tcode += 2; + break; + /* BRAZERO does the bracket, but carries on. */ case OP_BRAZERO: case OP_BRAMINZERO: - if (!set_start_bits(++tcode, start_bits)) return FALSE; - do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT); - tcode += 3; - try_next = TRUE; + if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd)) + return FALSE; + dummy = 1; + do tcode += GET(tcode,1); while (*tcode == OP_ALT); + tcode += 1+LINK_SIZE; break; /* Single-char * or ? sets the bit and tries the next item */ @@ -91,18 +165,22 @@ case OP_MINSTAR: case OP_QUERY: case OP_MINQUERY: - start_bits[tcode[1]/8] |= (1 << (tcode[1]&7)); + set_bit(start_bits, tcode[1], caseless, cd); tcode += 2; - try_next = TRUE; +#ifdef SUPPORT_UTF8 + if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++; +#endif break; /* Single-char upto sets the bit and tries the next */ case OP_UPTO: case OP_MINUPTO: - start_bits[tcode[3]/8] |= (1 << (tcode[3]&7)); + set_bit(start_bits, tcode[3], caseless, cd); tcode += 4; - try_next = TRUE; +#ifdef SUPPORT_UTF8 + if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++; +#endif break; /* At least one single char sets the bit and stops */ @@ -115,35 +193,46 @@ case OP_PLUS: case OP_MINPLUS: - start_bits[tcode[1]/8] |= (1 << (tcode[1]&7)); + set_bit(start_bits, tcode[1], caseless, cd); + try_next = FALSE; break; /* Single character type sets the bits and stops */ case OP_NOT_DIGIT: - for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit]; + for (c = 0; c < 32; c++) + start_bits[c] |= ~cd->cbits[c+cbit_digit]; + try_next = FALSE; break; case OP_DIGIT: - for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit]; + for (c = 0; c < 32; c++) + start_bits[c] |= cd->cbits[c+cbit_digit]; + try_next = FALSE; break; case OP_NOT_WHITESPACE: - for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space]; + for (c = 0; c < 32; c++) + start_bits[c] |= ~cd->cbits[c+cbit_space]; + try_next = FALSE; break; case OP_WHITESPACE: - for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space]; + for (c = 0; c < 32; c++) + start_bits[c] |= cd->cbits[c+cbit_space]; + try_next = FALSE; break; case OP_NOT_WORDCHAR: for (c = 0; c < 32; c++) - start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]); + start_bits[c] |= ~cd->cbits[c+cbit_word]; + try_next = FALSE; break; case OP_WORDCHAR: for (c = 0; c < 32; c++) - start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]); + start_bits[c] |= cd->cbits[c+cbit_word]; + try_next = FALSE; break; /* One or more character type fudges the pointer and restarts, knowing @@ -152,12 +241,10 @@ case OP_TYPEPLUS: case OP_TYPEMINPLUS: tcode++; - try_next = TRUE; break; case OP_TYPEEXACT: tcode += 3; - try_next = TRUE; break; /* Zero or more repeats of character types set the bits and then @@ -174,41 +261,52 @@ switch(tcode[1]) { case OP_NOT_DIGIT: - for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit]; + for (c = 0; c < 32; c++) + start_bits[c] |= ~cd->cbits[c+cbit_digit]; break; case OP_DIGIT: - for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit]; + for (c = 0; c < 32; c++) + start_bits[c] |= cd->cbits[c+cbit_digit]; break; case OP_NOT_WHITESPACE: - for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space]; + for (c = 0; c < 32; c++) + start_bits[c] |= ~cd->cbits[c+cbit_space]; break; case OP_WHITESPACE: - for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space]; + for (c = 0; c < 32; c++) + start_bits[c] |= cd->cbits[c+cbit_space]; break; case OP_NOT_WORDCHAR: for (c = 0; c < 32; c++) - start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]); + start_bits[c] |= ~cd->cbits[c+cbit_word]; break; case OP_WORDCHAR: for (c = 0; c < 32; c++) - start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]); + start_bits[c] |= cd->cbits[c+cbit_word]; break; } tcode += 2; - try_next = TRUE; break; - /* Character class: set the bits and either carry on or not, - according to the repeat count. */ + /* Character class where all the information is in a bit map: set the + bits and either carry on or not, according to the repeat count. If it was + a negative class, and we are operating with UTF-8 characters, any byte + with the top-bit set is a potentially valid starter because it may start + a character with a value > 255. (This is sub-optimal in that the + character may be in the range 128-255, and those characters might be + unwanted, but that's as far as we go for the moment.) */ + + case OP_NCLASS: + if (utf8) memset(start_bits+16, 0xff, 16); + /* Fall through */ case OP_CLASS: - case OP_NEGCLASS: { tcode++; for (c = 0; c < 32; c++) start_bits[c] |= tcode[c]; @@ -220,25 +318,25 @@ case OP_CRQUERY: case OP_CRMINQUERY: tcode++; - try_next = TRUE; break; case OP_CRRANGE: case OP_CRMINRANGE: - if (((tcode[1] << 8) + tcode[2]) == 0) - { - tcode += 5; - try_next = TRUE; - } + if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5; + else try_next = FALSE; + break; + + default: + try_next = FALSE; break; } } - break; /* End of class handling */ + break; /* End of bitmap class handling */ } /* End of switch */ } /* End of try_next loop */ - code += (code[1] << 8) + code[2]; /* Advance to next branch */ + code += GET(code, 1); /* Advance to next branch */ } while (*code == OP_ALT); return TRUE; @@ -260,17 +358,21 @@ errorptr points to where to place error messages; set NULL unless error -Returns: pointer to a pcre_extra block, +Returns: pointer to a pcre_extra block, with study_data filled in and the + appropriate flag set; NULL on error or if no optimization possible */ pcre_extra * pcre_study(const pcre *external_re, int options, const char **errorptr) { -BOOL caseless; uschar start_bits[32]; -real_pcre_extra *extra; +pcre_extra *extra; +pcre_study_data *study; const real_pcre *re = (const real_pcre *)external_re; +uschar *code = (uschar *)re + sizeof(real_pcre) + + (re->name_count * re->name_entry_size); +compile_data compile_block; *errorptr = NULL; @@ -286,42 +388,35 @@ return NULL; } -/* Caseless can either be from the compiled regex or from options. */ - -caseless = ((re->options | options) & PCRE_CASELESS) != 0; - -/* For an anchored pattern, or an unchored pattern that has a first char, or a -multiline pattern that matches only at "line starts", no further processing at -present. */ +/* For an anchored pattern, or an unanchored pattern that has a first char, or +a multiline pattern that matches only at "line starts", no further processing +at present. */ if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0) return NULL; -/* See if we can find a fixed set of initial characters for the pattern. */ +/* Set the character tables in the block which is passed around */ -memset(start_bits, 0, 32 * sizeof(uschar)); -if (!set_start_bits(re->code, start_bits)) return NULL; +compile_block.lcc = re->tables + lcc_offset; +compile_block.fcc = re->tables + fcc_offset; +compile_block.cbits = re->tables + cbits_offset; +compile_block.ctypes = re->tables + ctypes_offset; -/* If this studying is caseless, scan the created bit map and duplicate the -bits for any letters. */ +/* See if we can find a fixed set of initial characters for the pattern. */ -if (caseless) - { - register int c; - for (c = 0; c < 256; c++) - { - if ((start_bits[c/8] & (1 << (c&7))) != 0 && - (pcre_ctypes[c] & ctype_letter) != 0) - { - int d = pcre_fcc[c]; - start_bits[d/8] |= (1 << (d&7)); - } - } - } +memset(start_bits, 0, 32 * sizeof(uschar)); +if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0, + (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL; -/* Get an "extra" block and put the information therein. */ +/* Get a pcre_extra block and a pcre_study_data block. The study data is put in +the latter, which is pointed to by the former, which may also get additional +data set later by the calling program. At the moment, the size of +pcre_study_data is fixed. We nevertheless save it in a field for returning via +the pcre_fullinfo() function so that if it becomes variable in the future, we +don't have to change that code. */ -extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra)); +extra = (pcre_extra *)(pcre_malloc) + (sizeof(pcre_extra) + sizeof(pcre_study_data)); if (extra == NULL) { @@ -329,10 +424,15 @@ return NULL; } -extra->options = PCRE_STUDY_MAPPED | (caseless? PCRE_STUDY_CASELESS : 0); -memcpy(extra->start_bits, start_bits, sizeof(start_bits)); +study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra)); +extra->flags = PCRE_EXTRA_STUDY_DATA; +extra->study_data = study; + +study->size = sizeof(pcre_study_data); +study->options = PCRE_STUDY_MAPPED; +memcpy(study->start_bits, start_bits, sizeof(start_bits)); -return (pcre_extra *)extra; +return extra; } /* End of study.c */