Skip to content

Commit

Permalink
Auto merge of #89611 - eduardosm:next_code_point, r=Mark-Simulacrum
Browse files Browse the repository at this point in the history
libcore: assume the input of `next_code_point` and `next_code_point_reverse` is UTF-8-like

The functions are now `unsafe` and they use `Option::unwrap_unchecked` instead of `unwrap_or_0`

`unwrap_or_0` was added in 42357d7. I guess `unwrap_unchecked` was not available back then.

Given this example:

```rust
pub fn first_char(s: &str) -> Option<char> {
    s.chars().next()
}
```

Previously, the following assembly was produced:

```asm
_ZN7example10first_char17ha056ddea6bafad1cE:
	.cfi_startproc
	test	rsi, rsi
	je	.LBB0_1
	movzx	edx, byte ptr [rdi]
	test	dl, dl
	js	.LBB0_3
	mov	eax, edx
	ret
.LBB0_1:
	mov	eax, 1114112
	ret
.LBB0_3:
	lea	r8, [rdi + rsi]
	xor	eax, eax
	mov	r9, r8
	cmp	rsi, 1
	je	.LBB0_5
	movzx	eax, byte ptr [rdi + 1]
	add	rdi, 2
	and	eax, 63
	mov	r9, rdi
.LBB0_5:
	mov	ecx, edx
	and	ecx, 31
	cmp	dl, -33
	jbe	.LBB0_6
	cmp	r9, r8
	je	.LBB0_9
	movzx	esi, byte ptr [r9]
	add	r9, 1
	and	esi, 63
	shl	eax, 6
	or	eax, esi
	cmp	dl, -16
	jb	.LBB0_12
.LBB0_13:
	cmp	r9, r8
	je	.LBB0_14
	movzx	edx, byte ptr [r9]
	and	edx, 63
	jmp	.LBB0_16
.LBB0_6:
	shl	ecx, 6
	or	eax, ecx
	ret
.LBB0_9:
	xor	esi, esi
	mov	r9, r8
	shl	eax, 6
	or	eax, esi
	cmp	dl, -16
	jae	.LBB0_13
.LBB0_12:
	shl	ecx, 12
	or	eax, ecx
	ret
.LBB0_14:
	xor	edx, edx
.LBB0_16:
	and	ecx, 7
	shl	ecx, 18
	shl	eax, 6
	or	eax, ecx
	or	eax, edx
	ret
```

After this change, the assembly is reduced to:

```asm
_ZN7example10first_char17h4318683472f884ccE:
	.cfi_startproc
	test	rsi, rsi
	je	.LBB0_1
	movzx	ecx, byte ptr [rdi]
	test	cl, cl
	js	.LBB0_3
	mov	eax, ecx
	ret
.LBB0_1:
	mov	eax, 1114112
	ret
.LBB0_3:
	mov	eax, ecx
	and	eax, 31
	movzx	esi, byte ptr [rdi + 1]
	and	esi, 63
	cmp	cl, -33
	jbe	.LBB0_4
	movzx	edx, byte ptr [rdi + 2]
	shl	esi, 6
	and	edx, 63
	or	edx, esi
	cmp	cl, -16
	jb	.LBB0_7
	movzx	ecx, byte ptr [rdi + 3]
	and	eax, 7
	shl	eax, 18
	shl	edx, 6
	and	ecx, 63
	or	ecx, edx
	or	eax, ecx
	ret
.LBB0_4:
	shl	eax, 6
	or	eax, esi
	ret
.LBB0_7:
	shl	eax, 12
	or	eax, edx
	ret
```
  • Loading branch information
bors committed Nov 21, 2021
2 parents 49d4232 + 23637e2 commit 65f3f8b
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 25 deletions.
14 changes: 6 additions & 8 deletions library/core/src/str/iter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,9 @@ impl<'a> Iterator for Chars<'a> {

#[inline]
fn next(&mut self) -> Option<char> {
next_code_point(&mut self.iter).map(|ch| {
// SAFETY: `str` invariant says `ch` is a valid Unicode Scalar Value.
unsafe { char::from_u32_unchecked(ch) }
})
// SAFETY: `str` invariant says `self.iter` is a valid UTF-8 string and
// the resulting `ch` is a valid Unicode Scalar Value.
unsafe { next_code_point(&mut self.iter).map(|ch| char::from_u32_unchecked(ch)) }
}

#[inline]
Expand Down Expand Up @@ -81,10 +80,9 @@ impl fmt::Debug for Chars<'_> {
impl<'a> DoubleEndedIterator for Chars<'a> {
#[inline]
fn next_back(&mut self) -> Option<char> {
next_code_point_reverse(&mut self.iter).map(|ch| {
// SAFETY: `str` invariant says `ch` is a valid Unicode Scalar Value.
unsafe { char::from_u32_unchecked(ch) }
})
// SAFETY: `str` invariant says `self.iter` is a valid UTF-8 string and
// the resulting `ch` is a valid Unicode Scalar Value.
unsafe { next_code_point_reverse(&mut self.iter).map(|ch| char::from_u32_unchecked(ch)) }
}
}

Expand Down
44 changes: 28 additions & 16 deletions library/core/src/str/validations.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,19 +25,15 @@ pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool {
(byte as i8) < -64
}

#[inline]
const fn unwrap_or_0(opt: Option<&u8>) -> u8 {
match opt {
Some(&byte) => byte,
None => 0,
}
}

/// Reads the next code point out of a byte iterator (assuming a
/// UTF-8-like encoding).
///
/// # Safety
///
/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
#[unstable(feature = "str_internals", issue = "none")]
#[inline]
pub fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
// Decode UTF-8
let x = *bytes.next()?;
if x < 128 {
Expand All @@ -48,18 +44,24 @@ pub fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<
// Decode from a byte combination out of: [[[x y] z] w]
// NOTE: Performance is sensitive to the exact formulation here
let init = utf8_first_byte(x, 2);
let y = unwrap_or_0(bytes.next());
// SAFETY: `bytes` produces an UTF-8-like string,
// so the iterator must produce a value here.
let y = unsafe { *bytes.next().unwrap_unchecked() };
let mut ch = utf8_acc_cont_byte(init, y);
if x >= 0xE0 {
// [[x y z] w] case
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
let z = unwrap_or_0(bytes.next());
// SAFETY: `bytes` produces an UTF-8-like string,
// so the iterator must produce a value here.
let z = unsafe { *bytes.next().unwrap_unchecked() };
let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
ch = init << 12 | y_z;
if x >= 0xF0 {
// [x y z w] case
// use only the lower 3 bits of `init`
let w = unwrap_or_0(bytes.next());
// SAFETY: `bytes` produces an UTF-8-like string,
// so the iterator must produce a value here.
let w = unsafe { *bytes.next().unwrap_unchecked() };
ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
}
}
Expand All @@ -69,8 +71,12 @@ pub fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<

/// Reads the last code point out of a byte iterator (assuming a
/// UTF-8-like encoding).
///
/// # Safety
///
/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
#[inline]
pub(super) fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
pub(super) unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
where
I: DoubleEndedIterator<Item = &'a u8>,
{
Expand All @@ -83,13 +89,19 @@ where
// Multibyte case follows
// Decode from a byte combination out of: [x [y [z w]]]
let mut ch;
let z = unwrap_or_0(bytes.next_back());
// SAFETY: `bytes` produces an UTF-8-like string,
// so the iterator must produce a value here.
let z = unsafe { *bytes.next_back().unwrap_unchecked() };
ch = utf8_first_byte(z, 2);
if utf8_is_cont_byte(z) {
let y = unwrap_or_0(bytes.next_back());
// SAFETY: `bytes` produces an UTF-8-like string,
// so the iterator must produce a value here.
let y = unsafe { *bytes.next_back().unwrap_unchecked() };
ch = utf8_first_byte(y, 3);
if utf8_is_cont_byte(y) {
let x = unwrap_or_0(bytes.next_back());
// SAFETY: `bytes` produces an UTF-8-like string,
// so the iterator must produce a value here.
let x = unsafe { *bytes.next_back().unwrap_unchecked() };
ch = utf8_first_byte(x, 4);
ch = utf8_acc_cont_byte(ch, y);
}
Expand Down
3 changes: 2 additions & 1 deletion library/std/src/sys_common/wtf8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -809,7 +809,8 @@ impl<'a> Iterator for Wtf8CodePoints<'a> {

#[inline]
fn next(&mut self) -> Option<CodePoint> {
next_code_point(&mut self.bytes).map(|c| CodePoint { value: c })
// SAFETY: `self.bytes` has been created from a WTF-8 string
unsafe { next_code_point(&mut self.bytes).map(|c| CodePoint { value: c }) }
}

#[inline]
Expand Down

0 comments on commit 65f3f8b

Please sign in to comment.