/[pcre]/code/trunk/pcre_tables.c
ViewVC logotype

Diff of /code/trunk/pcre_tables.c

Parent Directory | Revision Log | View Patch

revision 1011 by ph10, Sat Aug 25 11:36:15 2012 UTC | revision 1015 by ph10, Sun Aug 26 16:07:14 2012 UTC
# Line 109  const int PRIV(ucp_gentype)[] = { Line 109  const int PRIV(ucp_gentype)[] = {
109    ucp_Z, ucp_Z, ucp_Z                 /* Zl, Zp, Zs */    ucp_Z, ucp_Z, ucp_Z                 /* Zl, Zp, Zs */
110  };  };
111    
112  /* This byte table encodes the rules for finding the end of an extended  /* This table encodes the rules for finding the end of an extended grapheme
113  grapheme cluster. It could be done with bits instead of bytes, but the saving  cluster. Every code point has a grapheme break property which is one of the
114  in memory would be small and there would be more computation at runtime.  ucp_gbXX values defined in ucp.h. The 2-dimensional table is indexed by the
115    properties of two adjacent code points. The left property selects a word from
116  Every code point has a grapheme break property which is one of the ucp_gbXX  the table, and the right property selects a bit from that word like this:
117  values defined in ucp.h. The number of such properties is ucp_gbCount. The  
118  2-dimensional table is indexed by the properties of two adjacent code points.    ucp_gbtable[left-property] & (1 << right-property)
119    
120  The value is non-zero if a grapheme break is NOT permitted between the relevant  The value is non-zero if a grapheme break is NOT permitted between the relevant
121  two code points. The breaking rules are as follows:  two code points. The breaking rules are as follows:
122    
123  1. Break at the start and end of text (pretty obviously).  1. Break at the start and end of text (pretty obviously).
124    
125  2. Do not break between a CR and LF: (0,1) is set; otherwise, break before and  2. Do not break between a CR and LF; otherwise, break before and after
126     after controls: (x,0), (x,1), (x,2), (0,x), (1,x), and (2,x) are not set,     controls.
127     except for (0,1).  
128    3. Do not break Hangul syllable sequences, the rules for which are:
 3. Do not break Hangul syllable sequences: (6,6), (6,7), (6,9), (6,10),  
    (7,7), (7,8), (8,8), (9,7), (9,8), and (10,8) are set. The rules for Hangul  
    sequences are:  
129    
130      L may be followed by L, V, LV or LVT      L may be followed by L, V, LV or LVT
131      LV or V may be followed by V or T      LV or V may be followed by V or T
132      LVT or T may be followed by T      LVT or T may be followed by T
133    
134  4. Do not break before extending characters: (x,3) is set except for (0,3),  4. Do not break before extending characters.
    (1,3), and (2,3).  
135    
136  The next two rules are only for extended grapheme clusters (but that's what we  The next two rules are only for extended grapheme clusters (but that's what we
137  are implementing).  are implementing).
138    
139  5. Do not break before SpacingMarks: (x,5) is set except for (0,5), (1,5),  5. Do not break before SpacingMarks.
    and (2,5).  
140    
141  6. Do not break after Prepend characters: (4,x) is set except for (4,0), (4,1),  6. Do not break after Prepend characters.
    and (4,2).  
142    
143  8. Otherwise, break everywhere.  7. Otherwise, break everywhere.
144  */  */
145    
146  const pcre_uint8 PRIV(ucp_gbtable[]) = {  const pcre_uint32 PRIV(ucp_gbtable[]) = {
147  /* 0  1  2  3  4  5  6  7  8  9 10 11 */     (1<<ucp_gbLF),                                           /*  0 CR */
148     0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /*  0 CR */     0,                                                       /*  1 LF */
149     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /*  1 LF */     0,                                                       /*  2 Control */
150     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /*  2 Control */     (1<<ucp_gbExtend)|(1<<ucp_gbSpacingMark),                /*  3 Extend */
151     0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,     /*  3 Extend */     (1<<ucp_gbExtend)|(1<<ucp_gbPrepend)|                    /*  4 Prepend */
152     0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /*  4 Prepend */       (1<<ucp_gbSpacingMark)|(1<<ucp_gbL)|
153     0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,     /*  5 SpacingMark */       (1<<ucp_gbV)|(1<<ucp_gbT)|(1<<ucp_gbLV)|
154     0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0,     /*  6 L */       (1<<ucp_gbLVT)|(1<<ucp_gbOther),
155     0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,     /*  7 V */  
156     0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0,     /*  8 T */     (1<<ucp_gbExtend)|(1<<ucp_gbSpacingMark),                /*  5 SpacingMark */
157     0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,     /*  9 LV */     (1<<ucp_gbExtend)|(1<<ucp_gbSpacingMark)|(1<<ucp_gbL)|   /*  6 L */
158     0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0,     /* 10 LVT */       (1<<ucp_gbL)|(1<<ucp_gbV)|(1<<ucp_gbLV)|(1<<ucp_gbLVT),
159     0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0      /* 11 Other */  
160  };     (1<<ucp_gbExtend)|(1<<ucp_gbSpacingMark)|(1<<ucp_gbV)|   /*  7 V */
161         (1<<ucp_gbT),
162    
163       (1<<ucp_gbExtend)|(1<<ucp_gbSpacingMark)|(1<<ucp_gbT),   /*  8 T */
164       (1<<ucp_gbExtend)|(1<<ucp_gbSpacingMark)|(1<<ucp_gbV)|   /*  9 LV */
165         (1<<ucp_gbT),
166    
167       (1<<ucp_gbExtend)|(1<<ucp_gbSpacingMark)|(1<<ucp_gbT),   /* 10 LVT */
168       (1<<ucp_gbExtend)|(1<<ucp_gbSpacingMark)                 /* 11 Other */
169    };
170    
171  #ifdef SUPPORT_JIT  #ifdef SUPPORT_JIT
172  /* This table reverses PRIV(ucp_gentype). We can save the cost  /* This table reverses PRIV(ucp_gentype). We can save the cost
173  of a memory load. */  of a memory load. */

Legend:
Removed from v.1011  
changed lines
  Added in v.1015

  ViewVC Help
Powered by ViewVC 1.1.5