From 2078122001ae3a58885c6504ab5a4ffbe09d183a Mon Sep 17 00:00:00 2001
From: Michael <20937441+KMikeeU@users.noreply.github.com>
Date: Sat, 8 Apr 2023 15:38:54 +0200
Subject: [PATCH] Fix issue #6645: emoji causing crashes

- Added functions `is_unicode_boundary_byte`, `prev_unicode_boundary`
  and `next_unicode_boundary`
- Added unicode boundary checks before calling `GraphemeCursor`
  functions
---
 helix-core/src/graphemes.rs | 55 +++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)
diff --git a/helix-core/src/graphemes.rs b/helix-core/src/graphemes.rs
index 15ef3eb043e8..6d8c5005fd59 100644
--- a/helix-core/src/graphemes.rs
+++ b/helix-core/src/graphemes.rs
@@ -217,6 +217,14 @@ pub fn nth_next_grapheme_boundary_byte(slice: RopeSlice, mut byte_idx: usize, n:
     // Find the nth next grapheme cluster boundary.
     for _ in 0..n {
         loop {
+            match is_unicode_boundary_byte(slice, gc.cur_cursor()) {
+                Some(false) => match next_unicode_boundary(slice, gc.cur_cursor()) {
+                    Some(n) => gc.set_cursor(n),
+                    None => gc.set_cursor(prev_unicode_boundary(slice, gc.cur_cursor()).unwrap()),
+                },
+                _ => {}
+            }
+
             match gc.next_boundary(chunk, chunk_byte_idx) {
                 Ok(None) => return slice.len_bytes(),
                 Ok(Some(n)) => {
@@ -304,6 +312,11 @@ pub fn is_grapheme_boundary(slice: RopeSlice, char_idx: usize) -> bool {
     // We work with bytes for this, so convert.
     let byte_idx = slice.char_to_byte(char_idx);
 
+    match is_unicode_boundary_byte(slice, byte_idx) {
+        Some(false) => return false,
+        _ => {}
+    }
+
     // Get the chunk with our byte index in it.
     let (chunk, chunk_byte_idx, _, _) = slice.chunk_at_byte(byte_idx);
 
@@ -329,6 +342,11 @@ pub fn is_grapheme_boundary_byte(slice: RopeSlice, byte_idx: usize) -> bool {
     // Bounds check
     debug_assert!(byte_idx <= slice.len_bytes());
 
+    match is_unicode_boundary_byte(slice, byte_idx) {
+        Some(false) => return false,
+        _ => {}
+    }
+
     // Get the chunk with our byte index in it.
     let (chunk, chunk_byte_idx, _, _) = slice.chunk_at_byte(byte_idx);
 
@@ -348,6 +366,43 @@ pub fn is_grapheme_boundary_byte(slice: RopeSlice, byte_idx: usize) -> bool {
     }
 }
 
+/// Reports whether `byte_idx` is a UTF-8 code point boundary in `slice`.
+///
+/// Yields `Some(true)` when the byte at `byte_idx` is a single-byte value or
+/// the first byte of a multi-byte sequence, `Some(false)` when it is a
+/// continuation byte, and `None` when `slice.get_byte` has no byte there.
+pub fn is_unicode_boundary_byte(slice: RopeSlice, byte_idx: usize) -> Option<bool> {
+    // UTF-8 continuation bytes all share the bit pattern `10xxxxxx`; any
+    // other leading bits mean a new code point starts at this byte.
+    const CONTINUATION_MASK: u8 = 0b1100_0000;
+    const CONTINUATION_TAG: u8 = 0b1000_0000;
+    slice.get_byte(byte_idx).map(|b| (b & CONTINUATION_MASK) != CONTINUATION_TAG)
+}
+
+/// Returns `Some(index)` of the nearest unicode boundary at or to the left
+/// of `byte_idx`, or `None` when no byte in `0..=byte_idx` is a boundary.
+///
+/// The scan runs backwards from `byte_idx` to `0`; a plain `byte_idx..0`
+/// range is empty and would never yield a result. Indices for which
+/// `is_unicode_boundary_byte` returns `None` are skipped.
+pub fn prev_unicode_boundary(slice: RopeSlice, byte_idx: usize) -> Option<usize> {
+    (0..=byte_idx)
+        .rev()
+        .find(|&i| is_unicode_boundary_byte(slice, i) == Some(true))
+}
+
+/// Returns `Some(index)` of the nearest unicode boundary at or to the right
+/// of `byte_idx`, or `None` when no such byte exists before the slice ends.
+///
+/// The search is a forward scan over `byte_idx..slice.len_bytes()`, so the
+/// byte at `byte_idx` itself is the first candidate.
+pub fn next_unicode_boundary(slice: RopeSlice, byte_idx: usize) -> Option<usize> {
+    let total_bytes = slice.len_bytes();
+    let mut candidates = byte_idx..total_bytes;
+    // `matches!` treats both `Some(false)` and `None` as "keep scanning".
+    candidates.find(|&pos| matches!(is_unicode_boundary_byte(slice, pos), Some(true)))
+}
+
 /// An iterator over the graphemes of a `RopeSlice`.
 #[derive(Clone)]
 pub struct RopeGraphemes<'a> {