diff --git a/Cargo.lock b/Cargo.lock index b5f30db..9e7b8e2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -263,6 +263,7 @@ dependencies = [ "textwrap", "thiserror", "trybuild", + "unicode-segmentation", "unicode-width", ] @@ -584,6 +585,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b09c83c3c29d37506a3e260c08c03743a6bb66a9cd432c6934ab501a190571f" +[[package]] +name = "unicode-segmentation" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" + [[package]] name = "unicode-width" version = "0.2.1" diff --git a/Cargo.toml b/Cargo.toml index c7bed56..9c7096e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,6 +43,7 @@ owo-colors = { version = "4", optional = true } cfg-if = "1" unicode-width = "0.2.0" +unicode-segmentation = "1.12.0" textwrap = { version = "0.16.2", optional = true } supports-hyperlinks = { version = "3.1.0", optional = true }
diff --git a/src/handlers/graphical.rs b/src/handlers/graphical.rs
index 3b12887..3b2ba3d 100644
--- a/src/handlers/graphical.rs
+++ b/src/handlers/graphical.rs
@@ -4,7 +4,8 @@ use std::{
 };
 
 use owo_colors::{OwoColorize, Style};
-use unicode_width::UnicodeWidthChar;
+use unicode_segmentation::UnicodeSegmentation;
+use unicode_width::{UnicodeWidthChar, UnicodeWidthStr};
 
 use crate::{
     Diagnostic, GraphicalTheme, LabeledSpan, ReportHandler, Severity, SourceCode, SourceSpan,
@@ -930,31 +931,103 @@ impl GraphicalReportHandler {
         &self,
         text: &'a str,
     ) -> impl Iterator<Item = usize> + 'a + use<'a> {
-        let mut column = 0;
-        let mut escaped = false;
-        let tab_width = self.tab_width;
-        text.chars().map(move |c| {
-            let width = match (escaped, c) {
-                // Round up to the next multiple of tab_width
-                (false, '\t') => tab_width - column % tab_width,
-                // start of ANSI escape
-                (false, '\x1b') => {
-                    escaped = true;
-                    0
-                }
-                // use Unicode width for all other characters
-                (false, c) => c.width().unwrap_or(0),
-                // end of ANSI escape
-                (true, 'm') => {
-                    escaped = false;
-                    0
-                }
-                // characters are zero width within escape sequence
-                (true, _) => 0,
-            };
-            column += width;
-            width
-        })
+        /// Yields the visual width of each `char` of a line, one item per `char`.
+        ///
+        /// Non-ASCII text is measured per grapheme cluster: the first `char` of a
+        /// cluster yields the width of the whole cluster and the remaining `char`s
+        /// yield 0, so multi-codepoint emoji (ZWJ sequences, skin-tone modifiers)
+        /// are counted once.
+        struct CharWidthIterator<'a> {
+            chars: std::str::CharIndices<'a>,
+            /// `(byte_pos, width)` per grapheme cluster, in text order; `None` for
+            /// ASCII-only text, which takes the fast path.
+            grapheme_boundaries: Option<Vec<(usize, usize)>>,
+            /// Index of the next entry of `grapheme_boundaries` to match.
+            current_grapheme_idx: usize,
+            /// Visual column reached so far; needed to expand tabs.
+            column: usize,
+            /// `true` while inside an ANSI escape sequence.
+            escaped: bool,
+            tab_width: usize,
+        }
+
+        impl<'a> Iterator for CharWidthIterator<'a> {
+            type Item = usize;
+
+            fn next(&mut self) -> Option<Self::Item> {
+                let (byte_pos, c) = self.chars.next()?;
+
+                let width = match (self.escaped, c) {
+                    // Round up to the next multiple of tab_width
+                    (false, '\t') => self.tab_width - self.column % self.tab_width,
+                    // start of ANSI escape
+                    (false, '\x1b') => {
+                        self.escaped = true;
+                        0
+                    }
+                    // Other control characters are zero width, as they were with the
+                    // previous `c.width().unwrap_or(0)`.
+                    (false, c) if c.is_control() => 0,
+                    (false, _) => match self.grapheme_boundaries.as_deref() {
+                        Some(boundaries) => {
+                            // Tabs, control characters and escape sequences are
+                            // handled above without consuming their boundary, so
+                            // first drop every boundary that is already behind us;
+                            // otherwise the index never catches up and every later
+                            // character is reported as width 0.
+                            while boundaries
+                                .get(self.current_grapheme_idx)
+                                .is_some_and(|&(pos, _)| pos < byte_pos)
+                            {
+                                self.current_grapheme_idx += 1;
+                            }
+                            match boundaries.get(self.current_grapheme_idx) {
+                                // First char of a cluster carries the cluster width
+                                Some(&(pos, width)) if pos == byte_pos => {
+                                    self.current_grapheme_idx += 1;
+                                    width
+                                }
+                                // Not at a grapheme boundary
+                                _ => 0,
+                            }
+                        }
+                        // ASCII path: all non-control chars are width 1
+                        None => 1,
+                    },
+                    // end of ANSI escape
+                    (true, 'm') => {
+                        self.escaped = false;
+                        0
+                    }
+                    // characters are zero width within escape sequence
+                    (true, _) => 0,
+                };
+
+                self.column += width;
+                Some(width)
+            }
+        }
+
+        // Only compute grapheme boundaries for non-ASCII text
+        let grapheme_boundaries = if text.is_ascii() {
+            None
+        } else {
+            // Collect grapheme boundaries with their widths
+            Some(
+                text.grapheme_indices(true)
+                    .map(|(pos, grapheme)| (pos, grapheme.width()))
+                    .collect(),
+            )
+        };
+
+        CharWidthIterator {
+            chars: text.char_indices(),
+            grapheme_boundaries,
+            current_grapheme_idx: 0,
+            column: 0,
+            escaped: false,
+            tab_width: self.tab_width,
+        }
     }
 
     /// Returns the visual column position of a byte offset on a specific line.
diff --git a/tests/test_emoji_underline.rs b/tests/test_emoji_underline.rs
new file mode 100644
index 0000000..cd34de2
--- /dev/null
+++ b/tests/test_emoji_underline.rs
@@ -0,0 +1,65 @@
+#![cfg(feature = "fancy-no-backtrace")]
+
+use miette::{Diagnostic, GraphicalReportHandler, NamedSource, SourceSpan};
+use thiserror::Error;
+
+#[derive(Error, Debug, Diagnostic)]
+#[error("emoji test")]
+struct TestError {
+    #[source_code]
+    src: NamedSource<String>,
+    #[label("here")]
+    span: SourceSpan,
+}
+
+/// Renders `before <highlighted> after` with a label on `highlighted` and
+/// returns the report line printed directly below the source line, i.e. the
+/// underline.
+fn underline_for(highlighted: &str) -> String {
+    let src = format!("before {} after", highlighted);
+    let err = TestError {
+        src: NamedSource::new("test.txt", src),
+        // "before " is 7 bytes long, so the label starts at byte offset 7.
+        span: (7, highlighted.len()).into(),
+    };
+
+    let mut output = String::new();
+    GraphicalReportHandler::new()
+        .render_report(&mut output, &err)
+        .unwrap();
+    println!("Output for {:?}:\n{}", highlighted, output);
+
+    output
+        .lines()
+        .skip_while(|line| !line.contains("before "))
+        .nth(1)
+        .expect("an underline follows the source line")
+        .to_owned()
+}
+
+#[test]
+fn test_emoji_sequence_underline() {
+    // Every emoji below is one grapheme cluster that is two columns wide, so
+    // its underline must be identical to the one drawn under two ASCII chars.
+    let two_columns = underline_for("ab");
+
+    // ZWJ emoji sequence (family: man, woman, girl, boy)
+    let family_emoji = "\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466}";
+    assert_eq!(underline_for(family_emoji), two_columns);
+
+    // Flag emoji (also uses ZWJ): white flag, VS16, ZWJ, rainbow
+    let flag_emoji = "\u{1F3F3}\u{FE0F}\u{200D}\u{1F308}";
+    assert_eq!(underline_for(flag_emoji), two_columns);
+
+    // Skin tone modifier: waving hand + medium skin tone
+    let skin_tone_emoji = "\u{1F44B}\u{1F3FD}";
+    assert_eq!(underline_for(skin_tone_emoji), two_columns);
+
+    // ASCII fast path: the underline grows by one column per character
+    let ascii_text = "hello world";
+    let ascii_underline = underline_for(ascii_text);
+    assert_eq!(
+        ascii_underline.chars().count(),
+        two_columns.chars().count() + ascii_text.len() - 2
+    );
+}