/[pcre]/code/branches/pcre16/pcre_compile.c
ViewVC logotype

Diff of /code/branches/pcre16/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 803 by ph10, Tue Dec 13 09:52:20 2011 UTC revision 804 by zherczeg, Wed Dec 14 11:18:01 2011 UTC
# Line 681  if (cd->workspace_size >= COMPILE_WORK_S Line 681  if (cd->workspace_size >= COMPILE_WORK_S
681      newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)      newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
682   return ERR72;   return ERR72;
683    
684  newspace = (pcre_malloc)(newsize);  newspace = (PUBL(malloc))(newsize);
685  if (newspace == NULL) return ERR21;  if (newspace == NULL) return ERR21;
686    
687  memcpy(newspace, cd->start_workspace, cd->workspace_size);  memcpy(newspace, cd->start_workspace, cd->workspace_size);
688  cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);  cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
689  if (cd->workspace_size > COMPILE_WORK_SIZE)  if (cd->workspace_size > COMPILE_WORK_SIZE)
690    (pcre_free)((void *)cd->start_workspace);    (PUBL(free))((void *)cd->start_workspace);
691  cd->start_workspace = newspace;  cd->start_workspace = newspace;
692  cd->workspace_size = newsize;  cd->workspace_size = newsize;
693  return 0;  return 0;
# Line 2956  if ((options & PCRE_EXTENDED) != 0) Line 2956  if ((options & PCRE_EXTENDED) != 0)
2956    {    {
2957    for (;;)    for (;;)
2958      {      {
2959      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2960      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
2961        {        {
2962        ptr++;        ptr++;
# Line 2998  if ((options & PCRE_EXTENDED) != 0) Line 2998  if ((options & PCRE_EXTENDED) != 0)
2998    {    {
2999    for (;;)    for (;;)
3000      {      {
3001      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
3002      if (*ptr == CHAR_NUMBER_SIGN)      if (*ptr == CHAR_NUMBER_SIGN)
3003        {        {
3004        ptr++;        ptr++;
# Line 3462  for (;; ptr++) Line 3462  for (;; ptr++)
3462    BOOL reset_bracount;    BOOL reset_bracount;
3463    int class_has_8bitchar;    int class_has_8bitchar;
3464    int class_single_char;    int class_single_char;
   int class_lastchar;  
3465    int newoptions;    int newoptions;
3466    int recno;    int recno;
3467    int refsign;    int refsign;
# Line 3600  for (;; ptr++) Line 3599  for (;; ptr++)
3599    
3600    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
3601      {      {
3602      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;
3603      if (c == CHAR_NUMBER_SIGN)      if (c == CHAR_NUMBER_SIGN)
3604        {        {
3605        ptr++;        ptr++;
# Line 3767  for (;; ptr++) Line 3766  for (;; ptr++)
3766    
3767      class_has_8bitchar = 0;      class_has_8bitchar = 0;
3768      class_single_char = 0;      class_single_char = 0;
     class_lastchar = -1;  
3769    
3770      /* Initialize the 32-char bit map to all zeros. We build the map in a      /* Initialize the 32-char bit map to all zeros. We build the map in a
3771      temporary bit of memory, in case the class contains only 1 character (less      temporary bit of memory, in case the class contains only 1 character (less
# Line 4417  for (;; ptr++) Line 4415  for (;; ptr++)
4415    
4416        /* Only the value of 1 matters for class_single_char. */        /* Only the value of 1 matters for class_single_char. */
4417        if (class_single_char < 2) class_single_char++;        if (class_single_char < 2) class_single_char++;
       class_lastchar = c;  
4418    
4419        /* Handle a character that cannot go in the bit map */        /* If class_single_char is 1, we saw precisely one character. As long as
4420          there were no negated characters >= 128 and there was no use of \p or \P,
4421          in other words, no use of any XCLASS features, we can optimize.
4422    
4423          In UTF-8 mode, we can optimize the negative case only if there were no
4424          characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
4425          operate on single-byte characters only. This is an historical hangover.
4426          Maybe one day we can tidy these opcodes to handle multi-byte characters.
4427    
4428          The optimization throws away the bit map. We turn the item into a
4429          1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
4430          Note that OP_NOT[I] does not support multibyte characters. In the positive
4431          case, it can cause firstchar to be set. Otherwise, there can be no first
4432          char if this item is first, whatever repeat count may follow. In the case
4433          of reqchar, save the previous value for reinstating. */
4434    
4435    #ifdef SUPPORT_UTF
4436          if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET
4437            && (!utf || !negate_class || c < (MAX_VALUE_FOR_SINGLE_CHAR + 1)))
4438    #else
4439          if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
4440    #endif
4441            {
4442            ptr++;
4443            zeroreqchar = reqchar;
4444    
4445            /* The OP_NOT[I] opcodes work on single characters only. */
4446    
4447            if (negate_class)
4448              {
4449              if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4450              zerofirstchar = firstchar;
4451              *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4452              *code++ = c;
4453              goto NOT_CHAR;
4454              }
4455    
4456            /* For a single, positive character, get the value into mcbuffer, and
4457            then we can handle this with the normal one-character code. */
4458    
4459    #ifdef SUPPORT_UTF
4460            if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4461              mclength = PRIV(ord2utf)(c, mcbuffer);
4462            else
4463    #endif
4464              {
4465              mcbuffer[0] = c;
4466              mclength = 1;
4467              }
4468            goto ONE_CHAR;
4469            }       /* End of 1-char optimization */
4470    
4471          /* Handle a character that cannot go in the bit map. */
4472    
4473  #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)  #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4474        if ((c > 255) || (utf && ((options & PCRE_CASELESS) != 0 && c > 127)))        if ((c > 255) || (utf && ((options & PCRE_CASELESS) != 0 && c > 127)))
4475  #elif defined SUPPORT_UTF  #elif defined SUPPORT_UTF
# Line 4458  for (;; ptr++) Line 4507  for (;; ptr++)
4507              {              {
4508              *class_uchardata++ = XCL_SINGLE;              *class_uchardata++ = XCL_SINGLE;
4509              class_uchardata += PRIV(ord2utf)(othercase, class_uchardata);              class_uchardata += PRIV(ord2utf)(othercase, class_uchardata);
   
             /* In the first pass, we must accumulate the space used here for  
             the following reason: If this ends up as the only character in the  
             class, it will later be optimized down to a single character.  
             However, that uses less memory, and so if this happens to be at the  
             end of the regex, there will not be enough memory in the real  
             compile for this temporary storage. */  
   
             if (lengthptr != NULL)  
               {  
               *lengthptr += class_uchardata - class_uchardata_base;  
               class_uchardata = class_uchardata_base;  
               }  
4510              }              }
4511            }            }
4512  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
# Line 4508  for (;; ptr++) Line 4544  for (;; ptr++)
4544        goto FAILED;        goto FAILED;
4545        }        }
4546    
4547      /* If class_charcount is 1, we saw precisely one character. As long as      /* If this is the first thing in the branch, there can be no first char
4548      there were no negated characters >= 128 and there was no use of \p or \P,      setting, whatever the repeat count. Any reqchar setting must remain
4549      in other words, no use of any XCLASS features, we can optimize.      unchanged after any kind of repeat. */
   
     In UTF-8 mode, we can optimize the negative case only if there were no  
     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR  
     operate on single-byte characters only. This is an historical hangover.  
     Maybe one day we can tidy these opcodes to handle multi-byte characters.  
   
     The optimization throws away the bit map. We turn the item into a  
     1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.  
     Note that OP_NOT[I] does not support multibyte characters. In the positive  
     case, it can cause firstchar to be set. Otherwise, there can be no first  
     char if this item is first, whatever repeat count may follow. In the case  
     of reqchar, save the previous value for reinstating. */  
   
 #ifdef SUPPORT_UTF  
     if (class_single_char == 1 && (!utf || !negate_class  
       || class_lastchar < (MAX_VALUE_FOR_SINGLE_CHAR + 1)))  
 #else  
     if (class_single_char == 1)  
 #endif  
       {  
       zeroreqchar = reqchar;  
   
       /* The OP_NOT[I] opcodes work on single characters only. */  
   
       if (negate_class)  
         {  
         if (firstchar == REQ_UNSET) firstchar = REQ_NONE;  
         zerofirstchar = firstchar;  
         *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;  
         *code++ = class_lastchar;  
         break;  
         }  
   
       /* For a single, positive character, get the value into mcbuffer, and  
       then we can handle this with the normal one-character code. */  
   
 #ifdef SUPPORT_UTF  
       if (utf && class_lastchar > MAX_VALUE_FOR_SINGLE_CHAR)  
         mclength = PRIV(ord2utf)(class_lastchar, mcbuffer);  
       else  
 #endif  
         {  
         mcbuffer[0] = class_lastchar;  
         mclength = 1;  
         }  
       goto ONE_CHAR;  
       }       /* End of 1-char optimization */  
   
     /* The general case - not the one-char optimization. If this is the first  
     thing in the branch, there can be no first char setting, whatever the  
     repeat count. Any reqchar setting must remain unchanged after any kind of  
     repeat. */  
4550    
4551      if (firstchar == REQ_UNSET) firstchar = REQ_NONE;      if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4552      zerofirstchar = firstchar;      zerofirstchar = firstchar;
# Line 4623  for (;; ptr++) Line 4607  for (;; ptr++)
4607        memcpy(code, classbits, 32);        memcpy(code, classbits, 32);
4608        }        }
4609      code += 32 / sizeof(pcre_uchar);      code += 32 / sizeof(pcre_uchar);
4610        NOT_CHAR:
4611      break;      break;
4612    
4613    
# Line 5510  for (;; ptr++) Line 5495  for (;; ptr++)
5495    
5496      /* First deal with various "verbs" that can be introduced by '*'. */      /* First deal with various "verbs" that can be introduced by '*'. */
5497    
5498      if (*(++ptr) == CHAR_ASTERISK &&      ptr++;
5499           ((cd->ctypes[ptr[1]] & ctype_letter) != 0 || ptr[1] == ':'))      if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
5500             || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
5501        {        {
5502        int i, namelen;        int i, namelen;
5503        int arglen = 0;        int arglen = 0;
# Line 5519  for (;; ptr++) Line 5505  for (;; ptr++)
5505        const pcre_uchar *name = ptr + 1;        const pcre_uchar *name = ptr + 1;
5506        const pcre_uchar *arg = NULL;        const pcre_uchar *arg = NULL;
5507        previous = NULL;        previous = NULL;
5508        while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};        ptr++;
5509          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
5510        namelen = (int)(ptr - name);        namelen = (int)(ptr - name);
5511    
5512        /* It appears that Perl allows any characters whatsoever, other than        /* It appears that Perl allows any characters whatsoever, other than
# Line 5705  for (;; ptr++) Line 5692  for (;; ptr++)
5692    
5693          /* We now expect to read a name; anything else is an error */          /* We now expect to read a name; anything else is an error */
5694    
5695          if ((cd->ctypes[ptr[1]] & ctype_word) == 0)          if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
5696            {            {
5697            ptr += 1;  /* To get the right offset */            ptr += 1;  /* To get the right offset */
5698            *errorcodeptr = ERR28;            *errorcodeptr = ERR28;
# Line 5716  for (;; ptr++) Line 5703  for (;; ptr++)
5703    
5704          recno = 0;          recno = 0;
5705          name = ++ptr;          name = ++ptr;
5706          while ((cd->ctypes[*ptr] & ctype_word) != 0)          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
5707            {            {
5708            if (recno >= 0)            if (recno >= 0)
5709              recno = (IS_DIGIT(*ptr))? recno * 10 + *ptr - CHAR_0 : -1;              recno = (IS_DIGIT(*ptr))? recno * 10 + *ptr - CHAR_0 : -1;
# Line 5887  for (;; ptr++) Line 5874  for (;; ptr++)
5874            break;            break;
5875    
5876            default:                /* Could be name define, else bad */            default:                /* Could be name define, else bad */
5877            if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;            if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
5878                goto DEFINE_NAME;
5879            ptr++;                  /* Correct offset for error */            ptr++;                  /* Correct offset for error */
5880            *errorcodeptr = ERR24;            *errorcodeptr = ERR24;
5881            goto FAILED;            goto FAILED;
# Line 5956  for (;; ptr++) Line 5944  for (;; ptr++)
5944              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5945            name = ++ptr;            name = ++ptr;
5946    
5947            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;            while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
5948            namelen = (int)(ptr - name);            namelen = (int)(ptr - name);
5949    
5950            /* In the pre-compile phase, just do a syntax check. */            /* In the pre-compile phase, just do a syntax check. */
# Line 6086  for (;; ptr++) Line 6074  for (;; ptr++)
6074    
6075          NAMED_REF_OR_RECURSE:          NAMED_REF_OR_RECURSE:
6076          name = ++ptr;          name = ++ptr;
6077          while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6078          namelen = (int)(ptr - name);          namelen = (int)(ptr - name);
6079    
6080          /* In the pre-compile phase, do a syntax check. We used to just set          /* In the pre-compile phase, do a syntax check. We used to just set
# Line 6672  for (;; ptr++) Line 6660  for (;; ptr++)
6660            BOOL isnumber = TRUE;            BOOL isnumber = TRUE;
6661            for (p = ptr + 1; *p != 0 && *p != terminator; p++)            for (p = ptr + 1; *p != 0 && *p != terminator; p++)
6662              {              {
6663                if (!MAX_255(*p)) { isnumber = FALSE; break; }
6664              if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;              if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
6665              if ((cd->ctypes[*p] & ctype_word) == 0) break;              if ((cd->ctypes[*p] & ctype_word) == 0) break;
6666              }              }
# Line 7788  because nowadays we limit the maximum va Line 7777  because nowadays we limit the maximum va
7777  cd->name_entry_size. */  cd->name_entry_size. */
7778    
7779  size = sizeof(real_pcre) + (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);  size = sizeof(real_pcre) + (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
7780  re = (real_pcre *)(pcre_malloc)(size);  re = (real_pcre *)(PUBL(malloc))(size);
7781    
7782  if (re == NULL)  if (re == NULL)
7783    {    {
# Line 7890  if (cd->hwm > cd->start_workspace) Line 7879  if (cd->hwm > cd->start_workspace)
7879  /* If the workspace had to be expanded, free the new memory. */  /* If the workspace had to be expanded, free the new memory. */
7880    
7881  if (cd->workspace_size > COMPILE_WORK_SIZE)  if (cd->workspace_size > COMPILE_WORK_SIZE)
7882    (pcre_free)((void *)cd->start_workspace);    (PUBL(free))((void *)cd->start_workspace);
7883    
7884  /* Give an error if there's a back reference to a non-existent capturing  /* Give an error if there's a back reference to a non-existent capturing
7885  subpattern. */  subpattern. */
# Line 7944  if (cd->check_lookbehind) Line 7933  if (cd->check_lookbehind)
7933    
7934  if (errorcode != 0)  if (errorcode != 0)
7935    {    {
7936    (pcre_free)(re);    (PUBL(free))(re);
7937    PCRE_EARLY_ERROR_RETURN:    PCRE_EARLY_ERROR_RETURN:
7938    *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);    *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);
7939    PCRE_EARLY_ERROR_RETURN2:    PCRE_EARLY_ERROR_RETURN2:
# Line 8079  was compiled can be seen. */ Line 8068  was compiled can be seen. */
8068    
8069  if (code - codestart > length)  if (code - codestart > length)
8070    {    {
8071    (pcre_free)(re);    (PUBL(free))(re);
8072    *errorptr = find_error_text(ERR23);    *errorptr = find_error_text(ERR23);
8073    *erroroffset = ptr - (pcre_uchar *)pattern;    *erroroffset = ptr - (pcre_uchar *)pattern;
8074    if (errorcodeptr != NULL) *errorcodeptr = ERR23;    if (errorcodeptr != NULL) *errorcodeptr = ERR23;

Legend:
Removed from v.803  
changed lines
  Added in v.804

  ViewVC Help
Powered by ViewVC 1.1.5