@@ -97,47 +97,121 @@ impl<'a> CharEq for &'a [char] {
9797Section: Iterators
9898*/
9999
100- /// External iterator for a string's characters.
101- /// Use with the `std::iter` module.
100+ /// Iterator over the `char`s (*Unicode Scalar Values*) of a string slice.
101+ ///
102+ /// Created with the method `.chars()`; it can be advanced from either end.
102103#[ deriving( Clone ) ]
103104pub struct Chars < ' a > {
104- /// The slice remaining to be iterated
105- string : & ' a str ,
105+ iter : slice:: Items < ' a , u8 > // byte iterator over the string's data; `.chars()` builds it from `as_bytes().iter()`
106+ }
107+
108+ // Return the initial code point accumulator for the first byte of a sequence.
109+ // The first byte is special: only its bottom 5 bits carry data for width 2,
110+ // 4 bits for width 3, and 3 bits for width 4 (the mask is `0x7F >> $width`).
111+ macro_rules! utf8_first_byte(
112+ ( $byte: expr, $width: expr) => ( ( $byte & ( 0x7F >> $width) ) as u32 )
113+ )
114+
115+ // Return the value of $ch updated with continuation byte $byte: $ch is shifted left by 6 bits and the low 6 bits of $byte are OR-ed in.
116+ macro_rules! utf8_acc_cont_byte(
117+ ( $ch: expr, $byte: expr) => ( ( $ch << 6 ) | ( $byte & 63u8 ) as u32 )
118+ )
119+
120+ macro_rules! utf8_is_cont_byte( // True when the top two bits of $byte are `10` (mask 192, expected value 128).
121+ ( $byte: expr) => ( ( $byte & 192u8 ) == 128 )
122+ )
123+
124+ #[ inline]
125+ fn unwrap_or_0 ( opt : Option < & u8 > ) -> u8 { // Return the byte `opt` refers to, or 0 when `opt` is `None`.
126+ match opt {
127+ Some ( & byte) => byte,
128+ None => 0 , // an exhausted byte iterator is read as a zero byte by the decoders
129+ }
106130}
107131
108132impl < ' a > Iterator < char > for Chars < ' a > {
109133 #[ inline]
110134 fn next ( & mut self ) -> Option < char > {
111- // Decode the next codepoint, then update
112- // the slice to be just the remaining part
113- if self . string . len ( ) != 0 {
114- let CharRange { ch, next} = self . string . char_range_at ( 0 ) ;
135+ // Decode UTF-8, relying on the invariant that the underlying bytes are valid UTF-8
136+ #[ inline]
137+ fn decode_multibyte < ' a > ( x : u8 , it : & mut slice:: Items < ' a , u8 > ) -> char {
138+ // NOTE: Performance is very sensitive to the exact formulation here
139+ // Decode from a byte combination out of: [[[x y] z] w]
140+ let cont_mask = 0x3F ; // continuation byte mask
141+ let init = utf8_first_byte ! ( x, 2 ) ; // low 5 bits of the lead byte `x`
142+ let y = unwrap_or_0 ( it. next ( ) ) ;
143+ let mut ch = utf8_acc_cont_byte ! ( init, y) ; // result for the 2-byte case; overwritten below for longer sequences
144+ if x >= 0xE0 { // lead byte of a 3- or 4-byte sequence
145+ /* [[x y z] w] case */
146+ let z = unwrap_or_0 ( it. next ( ) ) ;
147+
148+ let y_z = ( ( ( y & cont_mask) as u32 ) << 6 ) | ( z & cont_mask) as u32 ;
149+ ch = init << 12 | y_z; // 3-byte result: lead bits above the 12 bits taken from y and z
150+ if x >= 0xF0 { // lead byte of a 4-byte sequence
151+ /* [x y z w] case */
152+ let w = unwrap_or_0 ( it. next ( ) ) ;
153+ ch = ( init & 7 ) << 18 | y_z << 6 | ( w & cont_mask) as u32 ; // only the low 3 bits of the lead byte are kept
154+ }
155+ }
115156 unsafe {
116- self . string = raw:: slice_unchecked ( self . string , next, self . string . len ( ) ) ;
157+ mem:: transmute ( ch) // reinterprets the u32 as a char with no validity check; relies on the valid-UTF-8 invariant noted above
158+ }
159+ }
160+
161+ match self . iter . next ( ) {
162+ None => None ,
163+ Some ( & next_byte) => {
164+ if next_byte < 128 { // ASCII: the single byte is the whole char
165+ Some ( next_byte as char )
166+ } else {
167+ Some ( decode_multibyte ( next_byte, & mut self . iter ) )
168+ }
117169 }
118- Some ( ch)
119- } else {
120- None
121170 }
122171 }
123172
124173 #[ inline]
125174 fn size_hint ( & self ) -> ( uint , Option < uint > ) {
126- ( self . string . len ( ) . saturating_add ( 3 ) /4 , Some ( self . string . len ( ) ) )
175+ let ( len, _) = self . iter . size_hint ( ) ; // lower-bound hint of the byte iterator, used as the remaining byte count
176+ ( len. saturating_add ( 3 ) / 4 , Some ( len) ) // a char decodes from 1 to 4 bytes: at least len/4 (rounded up), at most len chars
127177 }
128178}
129179
130180impl < ' a > DoubleEndedIterator < char > for Chars < ' a > {
131181 #[ inline]
132182 fn next_back ( & mut self ) -> Option < char > {
133- if self . string . len ( ) != 0 {
134- let CharRange { ch, next} = self . string . char_range_at_reverse ( self . string . len ( ) ) ;
183+ #[ inline]
184+ fn decode_multibyte_back < ' a > ( w : u8 , it : & mut slice:: Items < ' a , u8 > ) -> char {
185+ // Decode from a byte combination out of: [x [y [z w]]]; `w` is the last byte and earlier bytes are read backwards until one is not a continuation byte
186+ let mut ch;
187+ let z = unwrap_or_0 ( it. next_back ( ) ) ;
188+ ch = utf8_first_byte ! ( z, 2 ) ; // treat z as the lead byte of a 2-byte sequence
189+ if utf8_is_cont_byte ! ( z) { // z is a continuation byte, so the sequence has at least 3 bytes
190+ let y = unwrap_or_0 ( it. next_back ( ) ) ;
191+ ch = utf8_first_byte ! ( y, 3 ) ; // treat y as the lead byte of a 3-byte sequence
192+ if utf8_is_cont_byte ! ( y) { // y is a continuation byte too: 4-byte sequence
193+ let x = unwrap_or_0 ( it. next_back ( ) ) ;
194+ ch = utf8_first_byte ! ( x, 4 ) ; // x is taken as the lead byte
195+ ch = utf8_acc_cont_byte ! ( ch, y) ;
196+ }
197+ ch = utf8_acc_cont_byte ! ( ch, z) ;
198+ }
199+ ch = utf8_acc_cont_byte ! ( ch, w) ;
200+
135201 unsafe {
136- self . string = raw:: slice_unchecked ( self . string , 0 , next) ;
202+ mem:: transmute ( ch) // reinterprets the u32 as a char with no validity check; presumably relies on the bytes being valid UTF-8
203+ }
204+ }
205+
206+ match self . iter . next_back ( ) {
207+ None => None ,
208+ Some ( & back_byte) => {
209+ if back_byte < 128 { // ASCII: the single byte is the whole char
210+ Some ( back_byte as char )
211+ } else {
212+ Some ( decode_multibyte_back ( back_byte, & mut self . iter ) )
213+ }
137214 }
138- Some ( ch)
139- } else {
140- None
141215 }
142216 }
143217}
@@ -146,18 +220,23 @@ impl<'a> DoubleEndedIterator<char> for Chars<'a> {
146220/// Use with the `std::iter` module.
147221#[ deriving( Clone ) ]
148222pub struct CharOffsets < ' a > {
149- /// The original string to be iterated
150- string : & ' a str ,
223+ front : uint , // byte offset reported for the next char taken from the front
224+ back : uint , // byte offset just past the last char not yet taken from the back
150225 iter : Chars < ' a > ,
151226}
153227
154228impl < ' a > Iterator < ( uint , char ) > for CharOffsets < ' a > {
155229 #[ inline]
156230 fn next ( & mut self ) -> Option < ( uint , char ) > {
157- // Compute the byte offset by using the pointer offset between
158- // the original string slice and the iterator's remaining part
159- let offset = self . iter . string . as_ptr ( ) as uint - self . string . as_ptr ( ) as uint ;
160- self . iter . next ( ) . map ( |ch| ( offset, ch) )
231+ match self . iter . next ( ) {
232+ None => None ,
233+ Some ( ch) => {
234+ let index = self . front ; // offset of the char that was just decoded
235+ let ( len, _) = self . iter . iter . size_hint ( ) ; // lower-bound hint of the byte iterator, used as the bytes still unread
236+ self . front += self . back - self . front - len; // `back - front` bytes were unread before the call, so the difference is the width of `ch`
237+ Some ( ( index, ch) )
238+ }
239+ }
161240 }
162241
163242 #[ inline]
@@ -169,11 +248,14 @@ impl<'a> Iterator<(uint, char)> for CharOffsets<'a> {
169248impl < ' a > DoubleEndedIterator < ( uint , char ) > for CharOffsets < ' a > {
170249 #[ inline]
171250 fn next_back ( & mut self ) -> Option < ( uint , char ) > {
172- self . iter . next_back ( ) . map ( |ch| {
173- let offset = self . iter . string . len ( ) +
174- self . iter . string . as_ptr ( ) as uint - self . string . as_ptr ( ) as uint ;
175- ( offset, ch)
176- } )
251+ match self . iter . next_back ( ) {
252+ None => None ,
253+ Some ( ch) => {
254+ let ( len, _) = self . iter . iter . size_hint ( ) ; // lower-bound hint of the byte iterator, used as the bytes still unread
255+ self . back -= self . back - self . front - len; // move `back` down by the width of `ch`, so it becomes the offset where `ch` starts
256+ Some ( ( self . back , ch) )
257+ }
258+ }
177259 }
178260}
179261
@@ -880,18 +962,6 @@ pub struct CharRange {
880962 pub next : uint ,
881963}
882964
883- // Return the initial codepoint accumulator for the first byte.
884- // The first byte is special, only want bottom 5 bits for width 2, 4 bits
885- // for width 3, and 3 bits for width 4
886- macro_rules! utf8_first_byte(
887- ( $byte: expr, $width: expr) => ( ( $byte & ( 0x7F >> $width) ) as u32 )
888- )
889-
890- // return the value of $ch updated with continuation byte $byte
891- macro_rules! utf8_acc_cont_byte(
892- ( $ch: expr, $byte: expr) => ( ( $ch << 6 ) | ( $byte & 63u8 ) as u32 )
893- )
894-
895965static TAG_CONT_U8 : u8 = 128u8 ; // 0b1000_0000; presumably the tag bits of a UTF-8 continuation byte (usage is outside this view, confirm)
896966
897967/// Unsafe operations
@@ -1608,7 +1678,7 @@ impl<'a> StrSlice<'a> for &'a str {
16081678
16091679 #[ inline]
16101680 fn chars ( & self ) -> Chars < ' a > {
1611- Chars { string : * self }
1681+ Chars { iter : self . as_bytes ( ) . iter ( ) } // iterate over the raw bytes; `Chars` decodes them into chars on demand
16121682 }
16131683
16141684 #[ inline]
@@ -1618,7 +1688,7 @@ impl<'a> StrSlice<'a> for &'a str {
16181688
16191689 #[ inline]
16201690 fn char_indices ( & self ) -> CharOffsets < ' a > {
1621- CharOffsets { string : * self , iter : self . chars ( ) }
1691+ CharOffsets { front : 0 , back : self . len ( ) , iter : self . chars ( ) } // offsets start at both ends of the string: 0 and its byte length
16221692 }
16231693
16241694 #[ inline]
0 commit comments