@@ -29,6 +29,7 @@ pub(crate) fn analyze_source_file(src: &str) -> (Vec<RelativeBytePos>, Vec<Multi
2929    ( lines,  multi_byte_chars) 
3030} 
3131
32+ #[ cfg( bootstrap) ]  
3233cfg_match !  { 
3334    cfg( any( target_arch = "x86" ,  target_arch = "x86_64" ) )  => { 
3435        fn  analyze_source_file_dispatch( 
@@ -185,6 +186,165 @@ cfg_match! {
185186        } 
186187    } 
187188} 
189+ 
190+ #[ cfg( not( bootstrap) ) ]  
191+ cfg_match !  { 
192+     any( target_arch = "x86" ,  target_arch = "x86_64" )  => { 
193+         fn  analyze_source_file_dispatch( 
194+             src:  & str , 
195+             lines:  & mut  Vec <RelativeBytePos >, 
196+             multi_byte_chars:  & mut  Vec <MultiByteChar >, 
197+         )  { 
198+             if  is_x86_feature_detected!( "sse2" )  { 
199+                 unsafe  { 
200+                     analyze_source_file_sse2( src,  lines,  multi_byte_chars) ; 
201+                 } 
202+             }  else { 
203+                 analyze_source_file_generic( 
204+                     src, 
205+                     src. len( ) , 
206+                     RelativeBytePos :: from_u32( 0 ) , 
207+                     lines, 
208+                     multi_byte_chars, 
209+                 ) ; 
210+             } 
211+         } 
212+ 
213+         /// Checks 16 byte chunks of text at a time. If the chunk contains 
214+ /// something other than printable ASCII characters and newlines, the 
215+ /// function falls back to the generic implementation. Otherwise it uses 
216+ /// SSE2 intrinsics to quickly find all newlines. 
217+ [ target_feature( enable = "sse2" ) ] 
218+         unsafe  fn  analyze_source_file_sse2( 
219+             src:  & str , 
220+             lines:  & mut  Vec <RelativeBytePos >, 
221+             multi_byte_chars:  & mut  Vec <MultiByteChar >, 
222+         )  { 
223+             #[ cfg( target_arch = "x86" ) ] 
224+             use  std:: arch:: x86:: * ; 
225+             #[ cfg( target_arch = "x86_64" ) ] 
226+             use  std:: arch:: x86_64:: * ; 
227+ 
228+             const  CHUNK_SIZE :  usize  = 16 ; 
229+ 
230+             let  src_bytes = src. as_bytes( ) ; 
231+ 
232+             let  chunk_count = src. len( )  / CHUNK_SIZE ; 
233+ 
234+             // This variable keeps track of where we should start decoding a 
235+             // chunk. If a multi-byte character spans across chunk boundaries, 
236+             // we need to skip that part in the next chunk because we already 
237+             // handled it. 
238+             let  mut  intra_chunk_offset = 0 ; 
239+ 
240+             for  chunk_index in 0 ..chunk_count { 
241+                 let  ptr = src_bytes. as_ptr( )  as  * const  __m128i; 
242+                 // We don't know if the pointer is aligned to 16 bytes, so we 
243+                 // use `loadu`, which supports unaligned loading. 
244+                 let  chunk = unsafe  {  _mm_loadu_si128( ptr. add( chunk_index) )  } ; 
245+ 
246+                 // For character in the chunk, see if its byte value is < 0, which 
247+                 // indicates that it's part of a UTF-8 char. 
248+                 let  multibyte_test = unsafe  {  _mm_cmplt_epi8( chunk,  _mm_set1_epi8( 0 ) )  } ; 
249+                 // Create a bit mask from the comparison results. 
250+                 let  multibyte_mask = unsafe  {  _mm_movemask_epi8( multibyte_test)  } ; 
251+ 
252+                 // If the bit mask is all zero, we only have ASCII chars here: 
253+                 if  multibyte_mask == 0  { 
254+                     assert!( intra_chunk_offset == 0 ) ; 
255+ 
256+                     // Check if there are any control characters in the chunk. All 
257+                     // control characters that we can encounter at this point have a 
258+                     // byte value less than 32 or ... 
259+                     let  control_char_test0 = unsafe  {  _mm_cmplt_epi8( chunk,  _mm_set1_epi8( 32 ) )  } ; 
260+                     let  control_char_mask0 = unsafe  {  _mm_movemask_epi8( control_char_test0)  } ; 
261+ 
262+                     // ... it's the ASCII 'DEL' character with a value of 127. 
263+                     let  control_char_test1 = unsafe  {  _mm_cmpeq_epi8( chunk,  _mm_set1_epi8( 127 ) )  } ; 
264+                     let  control_char_mask1 = unsafe  {  _mm_movemask_epi8( control_char_test1)  } ; 
265+ 
266+                     let  control_char_mask = control_char_mask0 | control_char_mask1; 
267+ 
268+                     if  control_char_mask != 0  { 
269+                         // Check for newlines in the chunk 
270+                         let  newlines_test = unsafe  {  _mm_cmpeq_epi8( chunk,  _mm_set1_epi8( b'\n'  as  i8 ) )  } ; 
271+                         let  newlines_mask = unsafe  {  _mm_movemask_epi8( newlines_test)  } ; 
272+ 
273+                         if  control_char_mask == newlines_mask { 
274+                             // All control characters are newlines, record them 
275+                             let  mut  newlines_mask = 0xFFFF0000  | newlines_mask as  u32 ; 
276+                             let  output_offset = RelativeBytePos :: from_usize( chunk_index *  CHUNK_SIZE  + 1 ) ; 
277+ 
278+                             loop  { 
279+                                 let  index = newlines_mask. trailing_zeros( ) ; 
280+ 
281+                                 if  index >= CHUNK_SIZE  as  u32  { 
282+                                     // We have arrived at the end of the chunk. 
283+                                     break ; 
284+                                 } 
285+ 
286+                                 lines. push( RelativeBytePos ( index)  + output_offset) ; 
287+ 
288+                                 // Clear the bit, so we can find the next one. 
289+                                 newlines_mask &= ( !1 )  << index; 
290+                             } 
291+ 
292+                             // We are done for this chunk. All control characters were 
293+                             // newlines and we took care of those. 
294+                             continue ; 
295+                         }  else { 
296+                             // Some of the control characters are not newlines, 
297+                             // fall through to the slow path below. 
298+                         } 
299+                     }  else { 
300+                         // No control characters, nothing to record for this chunk 
301+                         continue ; 
302+                     } 
303+                 } 
304+ 
305+                 // The slow path. 
306+                 // There are control chars in here, fallback to generic decoding. 
307+                 let  scan_start = chunk_index *  CHUNK_SIZE  + intra_chunk_offset; 
308+                 intra_chunk_offset = analyze_source_file_generic( 
309+                     & src[ scan_start..] , 
310+                     CHUNK_SIZE  - intra_chunk_offset, 
311+                     RelativeBytePos :: from_usize( scan_start) , 
312+                     lines, 
313+                     multi_byte_chars, 
314+                 ) ; 
315+             } 
316+ 
317+             // There might still be a tail left to analyze 
318+             let  tail_start = chunk_count *  CHUNK_SIZE  + intra_chunk_offset; 
319+             if  tail_start < src. len( )  { 
320+                 analyze_source_file_generic( 
321+                     & src[ tail_start..] , 
322+                     src. len( )  - tail_start, 
323+                     RelativeBytePos :: from_usize( tail_start) , 
324+                     lines, 
325+                     multi_byte_chars, 
326+                 ) ; 
327+             } 
328+         } 
329+     } 
330+     _ => { 
331+         // The target (or compiler version) does not support SSE2 ... 
332+         fn  analyze_source_file_dispatch( 
333+             src:  & str , 
334+             lines:  & mut  Vec <RelativeBytePos >, 
335+             multi_byte_chars:  & mut  Vec <MultiByteChar >, 
336+         )  { 
337+             analyze_source_file_generic( 
338+                 src, 
339+                 src. len( ) , 
340+                 RelativeBytePos :: from_u32( 0 ) , 
341+                 lines, 
342+                 multi_byte_chars, 
343+             ) ; 
344+         } 
345+     } 
346+ } 
347+ 
// `scan_len` determines the number of bytes in `src` to scan. Note that the
// function can read past `scan_len` if a multi-byte character starts within the
// range but extends past it. The overflow is returned by the function.
0 commit comments