@@ -29,6 +29,7 @@ pub(crate) fn analyze_source_file(src: &str) -> (Vec<RelativeBytePos>, Vec<Multi
2929 ( lines, multi_byte_chars)
3030}
3131
32+ #[ cfg( bootstrap) ]
3233cfg_match ! {
3334 cfg( any( target_arch = "x86" , target_arch = "x86_64" ) ) => {
3435 fn analyze_source_file_dispatch(
@@ -185,6 +186,165 @@ cfg_match! {
185186 }
186187 }
187188}
189+
190+ #[ cfg( not( bootstrap) ) ]
191+ cfg_match ! {
192+ any( target_arch = "x86" , target_arch = "x86_64" ) => {
193+ fn analyze_source_file_dispatch(
194+ src: & str ,
195+ lines: & mut Vec <RelativeBytePos >,
196+ multi_byte_chars: & mut Vec <MultiByteChar >,
197+ ) {
198+ if is_x86_feature_detected!( "sse2" ) {
199+ unsafe {
200+ analyze_source_file_sse2( src, lines, multi_byte_chars) ;
201+ }
202+ } else {
203+ analyze_source_file_generic(
204+ src,
205+ src. len( ) ,
206+ RelativeBytePos :: from_u32( 0 ) ,
207+ lines,
208+ multi_byte_chars,
209+ ) ;
210+ }
211+ }
212+
213+ /// Checks 16 byte chunks of text at a time. If the chunk contains
214+ /// something other than printable ASCII characters and newlines, the
215+ /// function falls back to the generic implementation. Otherwise it uses
216+ /// SSE2 intrinsics to quickly find all newlines.
217+ #[ target_feature( enable = "sse2" ) ]
218+ unsafe fn analyze_source_file_sse2(
219+ src: & str ,
220+ lines: & mut Vec <RelativeBytePos >,
221+ multi_byte_chars: & mut Vec <MultiByteChar >,
222+ ) {
223+ #[ cfg( target_arch = "x86" ) ]
224+ use std:: arch:: x86:: * ;
225+ #[ cfg( target_arch = "x86_64" ) ]
226+ use std:: arch:: x86_64:: * ;
227+
228+ const CHUNK_SIZE : usize = 16 ;
229+
230+ let src_bytes = src. as_bytes( ) ;
231+
232+ let chunk_count = src. len( ) / CHUNK_SIZE ;
233+
234+ // This variable keeps track of where we should start decoding a
235+ // chunk. If a multi-byte character spans across chunk boundaries,
236+ // we need to skip that part in the next chunk because we already
237+ // handled it.
238+ let mut intra_chunk_offset = 0 ;
239+
240+ for chunk_index in 0 ..chunk_count {
241+ let ptr = src_bytes. as_ptr( ) as * const __m128i;
242+ // We don't know if the pointer is aligned to 16 bytes, so we
243+ // use `loadu`, which supports unaligned loading.
244+ let chunk = unsafe { _mm_loadu_si128( ptr. add( chunk_index) ) } ;
245+
246+ // For character in the chunk, see if its byte value is < 0, which
247+ // indicates that it's part of a UTF-8 char.
248+ let multibyte_test = unsafe { _mm_cmplt_epi8( chunk, _mm_set1_epi8( 0 ) ) } ;
249+ // Create a bit mask from the comparison results.
250+ let multibyte_mask = unsafe { _mm_movemask_epi8( multibyte_test) } ;
251+
252+ // If the bit mask is all zero, we only have ASCII chars here:
253+ if multibyte_mask == 0 {
254+ assert!( intra_chunk_offset == 0 ) ;
255+
256+ // Check if there are any control characters in the chunk. All
257+ // control characters that we can encounter at this point have a
258+ // byte value less than 32 or ...
259+ let control_char_test0 = unsafe { _mm_cmplt_epi8( chunk, _mm_set1_epi8( 32 ) ) } ;
260+ let control_char_mask0 = unsafe { _mm_movemask_epi8( control_char_test0) } ;
261+
262+ // ... it's the ASCII 'DEL' character with a value of 127.
263+ let control_char_test1 = unsafe { _mm_cmpeq_epi8( chunk, _mm_set1_epi8( 127 ) ) } ;
264+ let control_char_mask1 = unsafe { _mm_movemask_epi8( control_char_test1) } ;
265+
266+ let control_char_mask = control_char_mask0 | control_char_mask1;
267+
268+ if control_char_mask != 0 {
269+ // Check for newlines in the chunk
270+ let newlines_test = unsafe { _mm_cmpeq_epi8( chunk, _mm_set1_epi8( b'\n' as i8 ) ) } ;
271+ let newlines_mask = unsafe { _mm_movemask_epi8( newlines_test) } ;
272+
273+ if control_char_mask == newlines_mask {
274+ // All control characters are newlines, record them
275+ let mut newlines_mask = 0xFFFF0000 | newlines_mask as u32 ;
276+ let output_offset = RelativeBytePos :: from_usize( chunk_index * CHUNK_SIZE + 1 ) ;
277+
278+ loop {
279+ let index = newlines_mask. trailing_zeros( ) ;
280+
281+ if index >= CHUNK_SIZE as u32 {
282+ // We have arrived at the end of the chunk.
283+ break ;
284+ }
285+
286+ lines. push( RelativeBytePos ( index) + output_offset) ;
287+
288+ // Clear the bit, so we can find the next one.
289+ newlines_mask &= ( !1 ) << index;
290+ }
291+
292+ // We are done for this chunk. All control characters were
293+ // newlines and we took care of those.
294+ continue ;
295+ } else {
296+ // Some of the control characters are not newlines,
297+ // fall through to the slow path below.
298+ }
299+ } else {
300+ // No control characters, nothing to record for this chunk
301+ continue ;
302+ }
303+ }
304+
305+ // The slow path.
306+ // There are control chars in here, fallback to generic decoding.
307+ let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
308+ intra_chunk_offset = analyze_source_file_generic(
309+ & src[ scan_start..] ,
310+ CHUNK_SIZE - intra_chunk_offset,
311+ RelativeBytePos :: from_usize( scan_start) ,
312+ lines,
313+ multi_byte_chars,
314+ ) ;
315+ }
316+
317+ // There might still be a tail left to analyze
318+ let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset;
319+ if tail_start < src. len( ) {
320+ analyze_source_file_generic(
321+ & src[ tail_start..] ,
322+ src. len( ) - tail_start,
323+ RelativeBytePos :: from_usize( tail_start) ,
324+ lines,
325+ multi_byte_chars,
326+ ) ;
327+ }
328+ }
329+ }
330+ _ => {
331+ // The target (or compiler version) does not support SSE2 ...
332+ fn analyze_source_file_dispatch(
333+ src: & str ,
334+ lines: & mut Vec <RelativeBytePos >,
335+ multi_byte_chars: & mut Vec <MultiByteChar >,
336+ ) {
337+ analyze_source_file_generic(
338+ src,
339+ src. len( ) ,
340+ RelativeBytePos :: from_u32( 0 ) ,
341+ lines,
342+ multi_byte_chars,
343+ ) ;
344+ }
345+ }
346+ }
347+
// `scan_len` determines the number of bytes in `src` to scan. Note that the
// function can read past `scan_len` if a multi-byte character starts within the
// range but extends past it. The overflow is returned by the function.
0 commit comments