5151    'Cc' : ['C' ], 'Cf' : ['C' ], 'Cs' : ['C' ], 'Co' : ['C' ], 'Cn' : ['C' ],
5252}
5353
54+ 
55+ # Grapheme cluster data 
56+ # taken from UAX29, http://www.unicode.org/reports/tr29/ 
57+ # these code points are excluded from the Control category 
58+ # NOTE: CR and LF are also technically excluded, but for 
59+ # the sake of convenience we leave them in the Control group 
60+ # and manually check them in the appropriate place. This is 
61+ # still compliant with the implementation requirements. 
# U+200C ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER; this set is
# subtracted from the Control set when the grapheme categories are built.
62+ grapheme_control_exceptions  =  set ([0x200c , 0x200d ])
63+ 
64+ # the Regional_Indicator category 
65+ grapheme_regional_indicator  =  [(0x1f1e6 , 0x1f1ff )]
66+ 
67+ # "The following ... are specifically excluded" from the SpacingMark category 
68+ # http://www.unicode.org/reports/tr29/#SpacingMark 
# Each entry is a (lo, hi) code point range; single-point entries such as
# (0x1038, 0x1038) show that both bounds are inclusive. The list is expanded
# with ungroup_cat() before being subtracted from the SpacingMark set.
69+ grapheme_spacingmark_exceptions  =  [(0x102b , 0x102c ), (0x1038 , 0x1038 ),
70+     (0x1062 , 0x1064 ), (0x1067 , 0x106d ), (0x1083 , 0x1083 ), (0x1087 , 0x108c ),
71+     (0x108f , 0x108f ), (0x109a , 0x109c ), (0x19b0 , 0x19b4 ), (0x19b8 , 0x19b9 ),
72+     (0x19bb , 0x19c0 ), (0x19c8 , 0x19c9 ), (0x1a61 , 0x1a61 ), (0x1a63 , 0x1a64 ),
73+     (0xaa7b , 0xaa7b ), (0xaa7d , 0xaa7d )]
74+ 
75+ # these are included in the SpacingMark category 
# U+0E33 THAI CHARACTER SARA AM and U+0EB3 LAO VOWEL SIGN AM, which UAX #29
# lists explicitly as SpacingMark; they are unioned into the SpacingMark set.
76+ grapheme_spacingmark_extra  =  set ([0xe33 , 0xeb3 ])
77+ 
5478def  fetch (f ):
5579    if  not  os .path .exists (f ):
5680        os .system ("curl -O http://www.unicode.org/Public/UNIDATA/%s" 
@@ -109,7 +133,7 @@ def load_unicode_data(f):
109133                canon_decomp [code ] =  seq 
110134
111135        # place letter in categories as appropriate 
112-         for  cat  in  [gencat ] +  expanded_categories .get (gencat , []):
136+         for  cat  in  [gencat ,  "Assigned" ] +  expanded_categories .get (gencat , []):
113137            if  cat  not  in gencats :
114138                gencats [cat ] =  []
115139            gencats [cat ].append (code )
@@ -120,6 +144,12 @@ def load_unicode_data(f):
120144                combines [combine ] =  []
121145            combines [combine ].append (code )
122146
147+     # generate Not_Assigned from Assigned 
148+     gencats ["Cn" ] =  gen_unassigned (gencats ["Assigned" ])
149+     # Assigned is not a real category 
150+     del (gencats ["Assigned" ])
151+     # Other contains Not_Assigned 
152+     gencats ["C" ].extend (gencats ["Cn" ])
123153    gencats  =  group_cats (gencats )
124154    combines  =  to_combines (group_cats (combines ))
125155
@@ -155,6 +185,11 @@ def ungroup_cat(cat):
155185            lo  +=  1 
156186    return  cat_out 
157187
# Return the code points that do not appear in `assigned`.
#   assigned -- iterable of integer code points; it is copied into a set so the
#               membership tests below do not rescan a list.
# The result is a list in ascending order, built from two scans:
# 0x0000..0xd7ff and 0xe000..0x10ffff. The gap 0xd800..0xdfff (the surrogate
# range) is never scanned, so those values are never reported as unassigned.
188+ def  gen_unassigned (assigned ):
189+     assigned  =  set (assigned )
190+     return  ([i  for  i  in  range (0 , 0xd800 ) if  i  not  in assigned ] + 
191+             [i  for  i  in  range (0xe000 , 0x110000 ) if  i  not  in assigned ])
192+ 
158193def  to_combines (combs ):
159194    combs_out  =  []
160195    for  comb  in  combs :
@@ -350,6 +385,45 @@ def emit_conversions_module(f, lowerupper, upperlower):
350385        sorted (lowerupper .iteritems (), key = operator .itemgetter (0 )), is_pub = False )
351386    f .write ("}\n \n " )
352387
# Write the generated Rust `grapheme` module to `f`.
#   f              -- object with a write() method that receives the generated text
#   grapheme_table -- sequence of (lo, hi, category-name) entries; each one is
#                     rendered as "(lo,hi,GC_<name>)" with lo and hi passed
#                     through escape_char
#   grapheme_cats  -- list of category names; it is concatenated with ["Any"],
#                     so it has to be a real list
# Emitted pieces, in order: the GraphemeCat enum (one GC_<name> variant per
# entry of grapheme_cats, in that order, followed by GC_Any), a non-pub
# bsearch_range_value_table() that yields GC_Any when no range contains the
# char, the pub grapheme_category() lookup, and grapheme_cat_table written
# through emit_table with is_pub=False, followed by the module's closing brace.
388+ def  emit_grapheme_module (f , grapheme_table , grapheme_cats ):
389+     f .write ("""pub mod grapheme { 
390+     use core::option::{Some, None}; 
391+     use core::slice::ImmutableVector; 
392+ 
393+     #[allow(non_camel_case_types)] 
394+     #[deriving(Clone)] 
395+     pub enum GraphemeCat { 
396+ """ )
397+     for  cat  in  grapheme_cats  +  ["Any" ]:
398+         f .write ("        GC_"  +  cat  +  ",\n " )
399+     f .write ("""    } 
400+ 
401+     fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> GraphemeCat { 
402+         use core::cmp::{Equal, Less, Greater}; 
403+         match r.bsearch(|&(lo, hi, _)| { 
404+             if lo <= c && c <= hi { Equal } 
405+             else if hi < c { Less } 
406+             else { Greater } 
407+         }) { 
408+             Some(idx) => { 
409+                 let (_, _, cat) = r[idx]; 
410+                 cat 
411+             } 
412+             None => GC_Any 
413+         } 
414+     } 
415+ 
416+     pub fn grapheme_category(c: char) -> GraphemeCat { 
417+         bsearch_range_value_table(c, grapheme_cat_table) 
418+     } 
419+ 
420+ """ )
421+ 
422+     emit_table (f , "grapheme_cat_table" , grapheme_table , "&'static [(char, char, GraphemeCat)]" ,
423+         pfun = lambda  x : "(%s,%s,GC_%s)"  %  (escape_char (x [0 ]), escape_char (x [1 ]), x [2 ]),
424+         is_pub = False )
425+     f .write ("}\n " )
426+ 
353427def  emit_charwidth_module (f , width_table ):
354428    f .write ("pub mod charwidth {\n " )
355429    f .write ("    use core::option::{Option, Some, None};\n " )
@@ -388,7 +462,7 @@ def emit_charwidth_module(f, width_table):
388462    f .write ("    //     http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c\n " )
389463    emit_table (f , "charwidth_table" , width_table , "&'static [(char, char, u8, u8)]" , is_pub = False ,
390464            pfun = lambda  x : "(%s,%s,%s,%s)"  %  (escape_char (x [0 ]), escape_char (x [1 ]), x [2 ], x [3 ]))
391-     f .write ("}\n " )
465+     f .write ("}\n \n " )
392466
393467def  emit_norm_module (f , canon , compat , combine ):
394468    canon_keys  =  canon .keys ()
@@ -473,6 +547,8 @@ def remove_from_wtable(wtable, val):
473547        wtable_out .extend (wtable )
474548    return  wtable_out 
475549
550+ 
551+ 
476552def  optimize_width_table (wtable ):
477553    wtable_out  =  []
478554    w_this  =  wtable .pop (0 )
@@ -487,7 +563,7 @@ def optimize_width_table(wtable):
487563    return  wtable_out 
488564
489565if  __name__  ==  "__main__" :
490-     r  =  "unicode .rs" 
566+     r  =  "tables .rs" 
491567    if  os .path .exists (r ):
492568        os .remove (r )
493569    with  open (r , "w" ) as  rf :
@@ -498,12 +574,18 @@ def optimize_width_table(wtable):
498574        (canon_decomp , compat_decomp , gencats , combines ,
499575                lowerupper , upperlower ) =  load_unicode_data ("UnicodeData.txt" )
500576        want_derived  =  ["XID_Start" , "XID_Continue" , "Alphabetic" , "Lowercase" , "Uppercase" ]
501-         other_derived  =  ["Default_Ignorable_Code_Point" ]
577+         other_derived  =  ["Default_Ignorable_Code_Point" ,  "Grapheme_Extend" ]
502578        derived  =  load_properties ("DerivedCoreProperties.txt" , want_derived  +  other_derived )
503579        scripts  =  load_properties ("Scripts.txt" , [])
504580        props  =  load_properties ("PropList.txt" ,
505581                ["White_Space" , "Join_Control" , "Noncharacter_Code_Point" ])
506582
583+         # grapheme cluster category from DerivedCoreProperties 
584+         # the rest are defined below 
585+         grapheme_cats  =  {}
586+         grapheme_cats ["Extend" ] =  derived ["Grapheme_Extend" ]
587+         del (derived ["Grapheme_Extend" ])
588+ 
507589        # bsearch_range_table is used in all the property modules below 
508590        emit_bsearch_range_table (rf )
509591
@@ -533,7 +615,7 @@ def optimize_width_table(wtable):
533615        emit_norm_module (rf , canon_decomp , compat_decomp , combines )
534616        emit_conversions_module (rf , lowerupper , upperlower )
535617
536-         # character width module 
618+         ###  character width module 
537619        width_table  =  []
538620        for  zwcat  in  ["Me" , "Mn" , "Cf" ]:
539621            width_table .extend (map (lambda  (lo , hi ): (lo , hi , 0 , 0 ), gencats [zwcat ]))
@@ -555,3 +637,40 @@ def optimize_width_table(wtable):
555637        # optimize the width table by collapsing adjacent entities when possible 
556638        width_table  =  optimize_width_table (width_table )
557639        emit_charwidth_module (rf , width_table )
640+ 
641+         ### grapheme cluster module 
642+         # from http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values 
643+         # Hangul syllable categories 
644+         want_hangul  =  ["L" , "V" , "T" , "LV" , "LVT" ]
645+         grapheme_cats .update (load_properties ("HangulSyllableType.txt" , want_hangul ))
646+ 
647+         # Control 
648+         # This category also includes Cs (surrogate codepoints), but Rust's `char`s are 
649+         # Unicode Scalar Values only, and surrogates are thus invalid `char`s. 
650+         grapheme_cats ["Control" ] =  set ()
651+         for  cat  in  ["Zl" , "Zp" , "Cc" , "Cf" ]:
652+             grapheme_cats ["Control" ] |=  set (ungroup_cat (gencats [cat ]))
653+         grapheme_cats ["Control" ] =  group_cat (list (
654+             grapheme_cats ["Control" ]
655+             -  grapheme_control_exceptions 
656+             |  (set (ungroup_cat (gencats ["Cn" ]))
657+                &  set (ungroup_cat (derived ["Default_Ignorable_Code_Point" ])))))
658+ 
659+         # Regional Indicator 
660+         grapheme_cats ["RegionalIndicator" ] =  grapheme_regional_indicator 
661+ 
662+         # Prepend - "Currently there are no characters with this value" 
663+         # (from UAX#29, Unicode 7.0) 
664+ 
665+         # SpacingMark 
666+         grapheme_cats ["SpacingMark" ] =  group_cat (list (
667+             set (ungroup_cat (gencats ["Mc" ]))
668+             -  set (ungroup_cat (grapheme_cats ["Extend" ]))
669+             |  grapheme_spacingmark_extra 
670+             -  set (ungroup_cat (grapheme_spacingmark_exceptions ))))
671+ 
672+         grapheme_table  =  []
673+         for  cat  in  grapheme_cats :
674+             grapheme_table .extend ([(x , y , cat ) for  (x , y ) in  grapheme_cats [cat ]])
675+         grapheme_table .sort (key = lambda  w : w [0 ])
676+         emit_grapheme_module (rf , grapheme_table , grapheme_cats .keys ())
0 commit comments