@@ -17,14 +17,14 @@ pub(crate) fn decode_suffix(
17
17
decode_allow_trailing_bits : bool ,
18
18
padding_mode : DecodePaddingMode ,
19
19
) -> Result < DecodeMetadata , DecodeError > {
20
- // Decode any leftovers that aren't a complete input block of 8 bytes.
20
+ // Decode any leftovers that might not be a complete input chunk of 8 bytes.
21
21
// Use a u64 as a stack-resident 8 byte buffer.
22
- let mut leftover_bits: u64 = 0 ;
23
22
let mut morsels_in_leftover = 0 ;
24
23
let mut padding_bytes = 0 ;
25
24
let mut first_padding_index: usize = 0 ;
26
25
let mut last_symbol = 0_u8 ;
27
26
let start_of_leftovers = input_index;
27
+ let mut morsels = [ 0_u8 ; 8 ] ;
28
28
29
29
for ( i, & b) in input[ start_of_leftovers..] . iter ( ) . enumerate ( ) {
30
30
// '=' padding
@@ -83,13 +83,12 @@ pub(crate) fn decode_suffix(
83
83
84
84
// can use up to 8 * 6 = 48 bits of the u64, if last chunk has no padding.
85
85
// Pack the leftovers from left to right.
86
- let shift = 64 - ( morsels_in_leftover + 1 ) * 6 ;
87
86
let morsel = decode_table[ b as usize ] ;
88
87
if morsel == INVALID_VALUE {
89
88
return Err ( DecodeError :: InvalidByte ( start_of_leftovers + i, b) ) ;
90
89
}
91
90
92
- leftover_bits |= ( morsel as u64 ) << shift ;
91
+ morsels [ morsels_in_leftover ] = morsel;
93
92
morsels_in_leftover += 1 ;
94
93
}
95
94
@@ -121,23 +120,23 @@ pub(crate) fn decode_suffix(
121
120
// useless since there are no more symbols to provide the necessary 4 additional bits
122
121
// to finish the second original byte.
123
122
124
- let leftover_bits_ready_to_append = match morsels_in_leftover {
125
- 0 => 0 ,
126
- 2 => 8 ,
127
- 3 => 16 ,
128
- 4 => 24 ,
129
- 6 => 32 ,
130
- 7 => 40 ,
131
- 8 => 48 ,
132
- // can also be detected as case #2 bad padding above
133
- _ => unreachable ! (
134
- "Impossible: must only have 0 to 8 input bytes in last chunk, with no invalid lengths"
135
- ) ,
136
- } ;
123
+ // TODO how do we know this?
124
+ debug_assert ! ( morsels_in_leftover != 1 && morsels_in_leftover != 5 ) ;
125
+ let leftover_bytes_to_append = morsels_in_leftover * 6 / 8 ;
126
+ let leftover_bits_to_append = leftover_bytes_to_append * 8 ;
127
+ // A couple percent speedup from nudging these ORs to use more ILP with a two-way split
128
+ let leftover_bits = ( ( u64 :: from ( morsels [ 0 ] ) << 58 )
129
+ | ( u64 :: from ( morsels [ 1 ] ) << 52 )
130
+ | ( u64 :: from ( morsels [ 2 ] ) << 46 )
131
+ | ( u64 :: from ( morsels [ 3 ] ) << 40 ) )
132
+ | ( ( u64 :: from ( morsels [ 4 ] ) << 34 )
133
+ | ( u64 :: from ( morsels [ 5 ] ) << 28 )
134
+ | ( u64 :: from ( morsels [ 6 ] ) << 22 )
135
+ | ( u64 :: from ( morsels [ 7 ] ) << 16 ) ) ;
137
136
138
137
// if there are bits set outside the bits we care about, last symbol encodes trailing bits that
139
138
// will not be included in the output
140
- let mask = !0 >> leftover_bits_ready_to_append ;
139
+ let mask = !0 >> leftover_bits_to_append ;
141
140
if !decode_allow_trailing_bits && ( leftover_bits & mask) != 0 {
142
141
// last morsel is at `morsels_in_leftover` - 1
143
142
return Err ( DecodeError :: InvalidLastSymbol (
@@ -148,7 +147,7 @@ pub(crate) fn decode_suffix(
148
147
149
148
// TODO benchmark simply converting to big endian bytes
150
149
let mut leftover_bits_appended_to_buf = 0 ;
151
- while leftover_bits_appended_to_buf < leftover_bits_ready_to_append {
150
+ while leftover_bits_appended_to_buf < leftover_bits_to_append {
152
151
// `as` simply truncates the higher bits, which is what we want here
153
152
let selected_bits = ( leftover_bits >> ( 56 - leftover_bits_appended_to_buf) ) as u8 ;
154
153
output[ output_index] = selected_bits;
0 commit comments