Skip to content

Commit

Permalink
Fix issue helix-editor#6645: emoji causing crashes
Browse files Browse the repository at this point in the history
- Added functions `is_unicode_boundary_byte`, `prev_unicode_boundary`
  and `next_unicode_boundary`
- Added unicode boundary checks before calling `GraphemeCursor`
  functions
  • Loading branch information
KMikeeU committed Apr 8, 2023
1 parent e856906 commit 2078122
Showing 1 changed file with 55 additions and 0 deletions.
55 changes: 55 additions & 0 deletions helix-core/src/graphemes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,14 @@ pub fn nth_next_grapheme_boundary_byte(slice: RopeSlice, mut byte_idx: usize, n:
// Find the nth next grapheme cluster boundary.
for _ in 0..n {
loop {
match is_unicode_boundary_byte(slice, gc.cur_cursor()) {
Some(false) => match next_unicode_boundary(slice, gc.cur_cursor()) {
Some(n) => gc.set_cursor(n),
None => gc.set_cursor(prev_unicode_boundary(slice, gc.cur_cursor()).unwrap()),
},
_ => {}
}

match gc.next_boundary(chunk, chunk_byte_idx) {
Ok(None) => return slice.len_bytes(),
Ok(Some(n)) => {
Expand Down Expand Up @@ -304,6 +312,11 @@ pub fn is_grapheme_boundary(slice: RopeSlice, char_idx: usize) -> bool {
// We work with bytes for this, so convert.
let byte_idx = slice.char_to_byte(char_idx);

match is_unicode_boundary_byte(slice, byte_idx) {
Some(false) => return false,
_ => {}
}

// Get the chunk with our byte index in it.
let (chunk, chunk_byte_idx, _, _) = slice.chunk_at_byte(byte_idx);

Expand All @@ -329,6 +342,11 @@ pub fn is_grapheme_boundary_byte(slice: RopeSlice, byte_idx: usize) -> bool {
// Bounds check
debug_assert!(byte_idx <= slice.len_bytes());

match is_unicode_boundary_byte(slice, byte_idx) {
Some(false) => return false,
_ => {}
}

// Get the chunk with our byte index in it.
let (chunk, chunk_byte_idx, _, _) = slice.chunk_at_byte(byte_idx);

Expand All @@ -348,6 +366,43 @@ pub fn is_grapheme_boundary_byte(slice: RopeSlice, byte_idx: usize) -> bool {
}
}

/// Reports whether the byte at `byte_idx` can begin a UTF-8 encoded character,
/// i.e. it is either a single-byte value or the lead byte of a multi-byte
/// sequence.
///
/// Returns `None` when `slice.get_byte(byte_idx)` yields no byte (presumably
/// because the index lies outside the slice), and `Some(is_boundary)` otherwise.
pub fn is_unicode_boundary_byte(slice: RopeSlice, byte_idx: usize) -> Option<bool> {
    // UTF-8 continuation bytes (those in the middle of a multi-byte sequence)
    // always carry the bit pattern `10xxxxxx`. Any byte that does not match
    // that pattern therefore starts a new character.
    slice
        .get_byte(byte_idx)
        .map(|byte| byte & 0b1100_0000 != 0b1000_0000)
}

/// Returns `Some(index)` of the closest unicode boundary at or to the left of
/// `byte_idx`, or `None` if no byte in `0..=byte_idx` is a boundary.
///
/// The search includes `byte_idx` itself, mirroring `next_unicode_boundary`,
/// which also starts its scan at the given position.
pub fn prev_unicode_boundary(slice: RopeSlice, byte_idx: usize) -> Option<usize> {
    // A range such as `byte_idx..0` is empty for every `usize` start, so a
    // backwards scan has to be expressed as a reversed ascending range.
    // Clamping to the slice length avoids walking over indices that can never
    // hold a byte when the caller passes an oversized position.
    let start = byte_idx.min(slice.len_bytes());

    (0..=start)
        .rev()
        .find(|&i| matches!(is_unicode_boundary_byte(slice, i), Some(true)))
}

/// Returns `Some(index)` of the first unicode boundary found when scanning to
/// the right, starting at `byte_idx` itself and stopping before
/// `slice.len_bytes()`.
///
/// Returns `None` when no byte in that range is a boundary (including when the
/// range is empty).
pub fn next_unicode_boundary(slice: RopeSlice, byte_idx: usize) -> Option<usize> {
    let end = slice.len_bytes();

    (byte_idx..end).find(|&pos| matches!(is_unicode_boundary_byte(slice, pos), Some(true)))
}

/// An iterator over the graphemes of a `RopeSlice`.
#[derive(Clone)]
pub struct RopeGraphemes<'a> {
Expand Down

0 comments on commit 2078122

Please sign in to comment.