@@ -81,6 +81,10 @@ export
8181
8282 // Misc
8383 is_utf8,
84+ is_utf16,
85+ to_utf16,
86+ from_utf16,
87+ utf16_chars,
8488 count_chars, count_bytes,
8589 utf8_char_width,
8690 char_range_at,
@@ -1060,6 +1064,83 @@ fn is_utf8(v: [u8]) -> bool {
10601064 ret true;
10611065}
10621066
1067+
1068+ fn is_utf16 ( v : [ u16 ] ) -> bool {
1069+ let len = v. len ( ) ;
1070+ let i = 0 u;
1071+ while ( i < len) {
1072+ let u = v[ i] ;
1073+
1074+ if u <= 0xD7FF_u16 || u >= 0xE000_u16 {
1075+ i += 1 u;
1076+
1077+ } else {
1078+ if i+1 u < len { ret false ; }
1079+ let u2 = v[ i+1 u] ;
1080+ if u < 0xD7FF_u16 || u > 0xDBFF_u16 { ret false ; }
1081+ if u2 < 0xDC00_u16 || u2 > 0xDFFF_u16 { ret false ; }
1082+ i += 2 u;
1083+ }
1084+ }
1085+ ret true;
1086+ }
1087+
1088+
1089+ fn to_utf16 ( s : str ) -> [ u16 ] {
1090+ let u = [ ] ;
1091+ chars_iter ( s) { |cch|
1092+ // Arithmetic with u32 literals is easier on the eyes than chars.
1093+ let ch = cch as u32 ;
1094+
1095+ if ( ch & 0xFFFF_u32 ) == ch {
1096+ // The BMP falls through (assuming non-surrogate, as it should)
1097+ assert ch <= 0xD7FF_u32 || ch >= 0xE000_u32 ;
1098+ u += [ ch as u16 ]
1099+ } else {
1100+ // Supplementary planes break into surrogates.
1101+ assert ch >= 0x1_0000_u32 && ch <= 0x10_FFFF_u32 ;
1102+ ch -= 0x1_0000_u32 ;
1103+ let w1 = 0xD800_u16 | ( ( ch >> 10 ) as u16 ) ;
1104+ let w2 = 0xDC00_u16 | ( ( ch as u16 ) & 0x3FF_u16 ) ;
1105+ u += [ w1, w2]
1106+ }
1107+ }
1108+ ret u;
1109+ }
1110+
1111+ fn utf16_chars ( v : [ u16 ] , f : fn ( char ) ) {
1112+ let len = v. len ( ) ;
1113+ let i = 0 u;
1114+ while ( i < len) {
1115+ let u = v[ i] ;
1116+
1117+ if u <= 0xD7FF_u16 || u >= 0xE000_u16 {
1118+ f ( u as char ) ;
1119+ i += 1 u;
1120+
1121+ } else {
1122+ let u2 = v[ i+1 u] ;
1123+ assert u >= 0xD800_u16 && u <= 0xDBFF_u16 ;
1124+ assert u2 >= 0xDC00_u16 && u2 <= 0xDFFF_u16 ;
1125+ let c = ( u - 0xD800_u16 ) as char ;
1126+ c = c << 10 ;
1127+ c |= ( u2 - 0xDC00_u16 ) as char ;
1128+ c |= 0x1_0000_u32 as char ;
1129+ f ( c) ;
1130+ i += 2 u;
1131+ }
1132+ }
1133+ }
1134+
1135+
1136+ fn from_utf16 ( v : [ u16 ] ) -> str {
1137+ let buf = "" ;
1138+ reserve ( buf, v. len ( ) ) ;
1139+ utf16_chars ( v) { |ch| push_char ( buf, ch) ; }
1140+ ret buf;
1141+ }
1142+
1143+
10631144/*
10641145Function: count_chars
10651146
@@ -2223,4 +2304,51 @@ mod tests {
22232304 assert [ 'ศ' , 'ไ' , 'ท' , 'ย' , '中' , '华' , 'V' , 'i' , 'ệ' , 't' , ' ' , 'N' , 'a' , 'm' ]
22242305 == chars ( ss) ;
22252306 }
2307+
2308+ #[ test]
2309+ fn test_utf16 ( ) {
2310+ let pairs =
2311+ [ ( "𐍅𐌿𐌻𐍆𐌹𐌻𐌰\n " ,
2312+ [ 0xd800_u16 , 0xdf45_u16 , 0xd800_u16 , 0xdf3f_u16 ,
2313+ 0xd800_u16 , 0xdf3b_u16 , 0xd800_u16 , 0xdf46_u16 ,
2314+ 0xd800_u16 , 0xdf39_u16 , 0xd800_u16 , 0xdf3b_u16 ,
2315+ 0xd800_u16 , 0xdf30_u16 , 0x000a_u16 ] ) ,
2316+
2317+ ( "𐐒𐑉𐐮𐑀𐐲𐑋 𐐏𐐲𐑍\n " ,
2318+ [ 0xd801_u16 , 0xdc12_u16 , 0xd801_u16 ,
2319+ 0xdc49_u16 , 0xd801_u16 , 0xdc2e_u16 , 0xd801_u16 ,
2320+ 0xdc40_u16 , 0xd801_u16 , 0xdc32_u16 , 0xd801_u16 ,
2321+ 0xdc4b_u16 , 0x0020_u16 , 0xd801_u16 , 0xdc0f_u16 ,
2322+ 0xd801_u16 , 0xdc32_u16 , 0xd801_u16 , 0xdc4d_u16 ,
2323+ 0x000a_u16 ] ) ,
2324+
2325+ ( "𐌀𐌖𐌋𐌄𐌑𐌉·𐌌𐌄𐌕𐌄𐌋𐌉𐌑\n " ,
2326+ [ 0xd800_u16 , 0xdf00_u16 , 0xd800_u16 , 0xdf16_u16 ,
2327+ 0xd800_u16 , 0xdf0b_u16 , 0xd800_u16 , 0xdf04_u16 ,
2328+ 0xd800_u16 , 0xdf11_u16 , 0xd800_u16 , 0xdf09_u16 ,
2329+ 0x00b7_u16 , 0xd800_u16 , 0xdf0c_u16 , 0xd800_u16 ,
2330+ 0xdf04_u16 , 0xd800_u16 , 0xdf15_u16 , 0xd800_u16 ,
2331+ 0xdf04_u16 , 0xd800_u16 , 0xdf0b_u16 , 0xd800_u16 ,
2332+ 0xdf09_u16 , 0xd800_u16 , 0xdf11_u16 , 0x000a_u16 ] ) ,
2333+
2334+ ( "𐒋𐒘𐒈𐒑𐒛𐒒 𐒕𐒓 𐒈𐒚𐒍 𐒏𐒜𐒒𐒖𐒆 𐒕𐒆\n " ,
2335+ [ 0xd801_u16 , 0xdc8b_u16 , 0xd801_u16 , 0xdc98_u16 ,
2336+ 0xd801_u16 , 0xdc88_u16 , 0xd801_u16 , 0xdc91_u16 ,
2337+ 0xd801_u16 , 0xdc9b_u16 , 0xd801_u16 , 0xdc92_u16 ,
2338+ 0x0020_u16 , 0xd801_u16 , 0xdc95_u16 , 0xd801_u16 ,
2339+ 0xdc93_u16 , 0x0020_u16 , 0xd801_u16 , 0xdc88_u16 ,
2340+ 0xd801_u16 , 0xdc9a_u16 , 0xd801_u16 , 0xdc8d_u16 ,
2341+ 0x0020_u16 , 0xd801_u16 , 0xdc8f_u16 , 0xd801_u16 ,
2342+ 0xdc9c_u16 , 0xd801_u16 , 0xdc92_u16 , 0xd801_u16 ,
2343+ 0xdc96_u16 , 0xd801_u16 , 0xdc86_u16 , 0x0020_u16 ,
2344+ 0xd801_u16 , 0xdc95_u16 , 0xd801_u16 , 0xdc86_u16 ,
2345+ 0x000a_u16 ] ) ] ;
2346+
2347+ for ( s, u) in pairs {
2348+ assert to_utf16 ( s) == u;
2349+ assert from_utf16 ( u) == s;
2350+ assert from_utf16 ( to_utf16 ( s) ) == s;
2351+ assert to_utf16 ( from_utf16 ( u) ) == u;
2352+ }
2353+ }
22262354}
0 commit comments