diff --git a/Cargo.lock b/Cargo.lock index 122efc3c0a88a..2bebeacbc9784 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2881,6 +2881,7 @@ dependencies = [ "tracing", "tracing-subscriber", "ty_static", + "unicode-width 0.2.1", "web-time", "zip", ] diff --git a/crates/ruff_db/Cargo.toml b/crates/ruff_db/Cargo.toml index ccc9edf4804b3..2954357e9682c 100644 --- a/crates/ruff_db/Cargo.toml +++ b/crates/ruff_db/Cargo.toml @@ -42,6 +42,7 @@ serde_json = { workspace = true, optional = true } thiserror = { workspace = true } tracing = { workspace = true } tracing-subscriber = { workspace = true, optional = true } +unicode-width = { workspace = true } zip = { workspace = true } [target.'cfg(target_arch="wasm32")'.dependencies] diff --git a/crates/ruff_db/src/diagnostic/mod.rs b/crates/ruff_db/src/diagnostic/mod.rs index c5eb007b880db..7adfe2fc9febf 100644 --- a/crates/ruff_db/src/diagnostic/mod.rs +++ b/crates/ruff_db/src/diagnostic/mod.rs @@ -6,7 +6,9 @@ use ruff_source_file::{LineColumn, SourceCode, SourceFile}; use ruff_annotate_snippets::Level as AnnotateLevel; use ruff_text_size::{Ranged, TextRange, TextSize}; -pub use self::render::{DisplayDiagnostic, DisplayDiagnostics, FileResolver, Input}; +pub use self::render::{ + DisplayDiagnostic, DisplayDiagnostics, FileResolver, Input, ceil_char_boundary, +}; use crate::{Db, files::File}; mod render; diff --git a/crates/ruff_db/src/diagnostic/render.rs b/crates/ruff_db/src/diagnostic/render.rs index 393272f39c041..bfc2ca3ce7fd9 100644 --- a/crates/ruff_db/src/diagnostic/render.rs +++ b/crates/ruff_db/src/diagnostic/render.rs @@ -1,3 +1,4 @@ +use std::borrow::Cow; use std::collections::BTreeMap; use std::path::Path; @@ -7,7 +8,7 @@ use ruff_annotate_snippets::{ }; use ruff_notebook::{Notebook, NotebookIndex}; use ruff_source_file::{LineIndex, OneIndexed, SourceCode}; -use ruff_text_size::{TextRange, TextSize}; +use ruff_text_size::{TextLen, TextRange, TextSize}; use crate::diagnostic::stylesheet::DiagnosticStylesheet; use crate::{ @@ -520,7 +521,7 @@ impl<'r> RenderableSnippets<'r> { #[derive(Debug)] struct RenderableSnippet<'r> { /// The actual snippet text. - snippet: &'r str, + snippet: Cow<'r, str>, /// The absolute line number corresponding to where this /// snippet begins. line_start: OneIndexed, @@ -580,6 +581,13 @@ impl<'r> RenderableSnippet<'r> { .iter() .map(|ann| RenderableAnnotation::new(snippet_start, ann)) .collect(); + + let EscapedSourceCode { + text: snippet, + annotations, + } = replace_whitespace_and_unprintable(snippet, annotations) + .fix_up_empty_spans_after_line_terminator(); + RenderableSnippet { snippet, line_start, @@ -590,7 +598,7 @@ impl<'r> RenderableSnippet<'r> { /// Convert this to an "annotate" snippet. fn to_annotate<'a>(&'a self, path: &'a str) -> AnnotateSnippet<'a> { - AnnotateSnippet::source(self.snippet) + AnnotateSnippet::source(&self.snippet) .origin(path) .line_start(self.line_start.get()) .annotations( @@ -820,6 +828,230 @@ fn relativize_path<'p>(cwd: &SystemPath, path: &'p str) -> &'p str { path } +/// Given some source code and annotation ranges, this routine replaces tabs +/// with ASCII whitespace, and unprintable characters with printable +/// representations of them. +/// +/// The source code and annotations returned are updated to reflect changes made +/// to the source code (if any). +fn replace_whitespace_and_unprintable<'r>( + source: &'r str, + mut annotations: Vec>, +) -> EscapedSourceCode<'r> { + // Updates the annotation ranges given by the caller whenever a single byte (at `index` in + // `source`) is replaced with `len` bytes. + // + // When the index occurs before the start of the range, the range is + // offset by `len`. When the range occurs after or at the start but before + // the end, then the end of the range only is offset by `len`. + let mut update_ranges = |index: usize, len: u32| { + for ann in &mut annotations { + if index < usize::from(ann.range.start()) { + ann.range += TextSize::new(len - 1); + } else if index < usize::from(ann.range.end()) { + ann.range = ann.range.add_end(TextSize::new(len - 1)); + } + } + }; + + // If `c` is an unprintable character, then this returns a printable + // representation of it (using a fancier Unicode codepoint). + let unprintable_replacement = |c: char| -> Option { + match c { + '\x07' => Some('␇'), + '\x08' => Some('␈'), + '\x1b' => Some('␛'), + '\x7f' => Some('␡'), + _ => None, + } + }; + + const TAB_SIZE: usize = 4; + let mut width = 0; + let mut column = 0; + let mut last_end = 0; + let mut result = String::new(); + for (index, c) in source.char_indices() { + let old_width = width; + match c { + '\n' | '\r' => { + width = 0; + column = 0; + } + '\t' => { + let tab_offset = TAB_SIZE - (column % TAB_SIZE); + width += tab_offset; + column += tab_offset; + + let tab_width = + u32::try_from(width - old_width).expect("small width because of tab size"); + result.push_str(&source[last_end..index]); + + update_ranges(result.text_len().to_usize(), tab_width); + + for _ in 0..tab_width { + result.push(' '); + } + last_end = index + 1; + } + _ => { + width += unicode_width::UnicodeWidthChar::width(c).unwrap_or(0); + column += 1; + + if let Some(printable) = unprintable_replacement(c) { + result.push_str(&source[last_end..index]); + + let len = printable.text_len().to_u32(); + update_ranges(result.text_len().to_usize(), len); + + result.push(printable); + last_end = index + 1; + } + } + } + } + + // No tabs or unprintable chars + if result.is_empty() { + EscapedSourceCode { + annotations, + text: Cow::Borrowed(source), + } + } else { + result.push_str(&source[last_end..]); + EscapedSourceCode { + annotations, + text: Cow::Owned(result), + } + } +} + +struct EscapedSourceCode<'r> { + text: Cow<'r, str>, + annotations: Vec>, +} + +impl<'r> EscapedSourceCode<'r> { + // This attempts to "fix up" the spans on each annotation in the case where + // it's an empty span immediately following a line terminator. + // + // At present, `annotate-snippets` (both upstream and our vendored copy) + // will render annotations of such spans to point to the space immediately + // following the previous line. But ideally, this should point to the space + // immediately preceding the next line. + // + // After attempting to fix `annotate-snippets` and giving up after a couple + // hours, this routine takes a different tact: it adjusts the span to be + // non-empty and it will cover the first codepoint of the following line. + // This forces `annotate-snippets` to point to the right place. + // + // See also: and + // `ruff_linter::message::text::SourceCode::fix_up_empty_spans_after_line_terminator`, + // from which this was adapted. + fn fix_up_empty_spans_after_line_terminator(mut self) -> EscapedSourceCode<'r> { + for ann in &mut self.annotations { + let range = ann.range; + if !range.is_empty() + || range.start() == TextSize::from(0) + || range.start() >= self.text.text_len() + { + continue; + } + if !matches!( + self.text.as_bytes()[range.start().to_usize() - 1], + b'\n' | b'\r' + ) { + continue; + } + let start = range.start(); + let end = ceil_char_boundary(&self.text, start + TextSize::from(1)); + ann.range = TextRange::new(start, end); + } + + self + } +} + +/// Finds the closest [`TextSize`] not less than the offset given for which +/// `is_char_boundary` is `true`. Unless the offset given is greater than +/// the length of the underlying contents, in which case, the length of the +/// contents is returned. +/// +/// Can be replaced with `str::ceil_char_boundary` once it's stable. +/// +/// # Examples +/// +/// From `std`: +/// +/// ``` +/// use ruff_db::diagnostic::ceil_char_boundary; +/// use ruff_text_size::{Ranged, TextLen, TextSize}; +/// +/// let source = "❤️🧡💛💚💙💜"; +/// assert_eq!(source.text_len(), TextSize::from(26)); +/// assert!(!source.is_char_boundary(13)); +/// +/// let closest = ceil_char_boundary(source, TextSize::from(13)); +/// assert_eq!(closest, TextSize::from(14)); +/// assert_eq!(&source[..closest.to_usize()], "❤️🧡💛"); +/// ``` +/// +/// Additional examples: +/// +/// ``` +/// use ruff_db::diagnostic::ceil_char_boundary; +/// use ruff_text_size::{Ranged, TextRange, TextSize}; +/// +/// let source = "Hello"; +/// +/// assert_eq!( +/// ceil_char_boundary(source, TextSize::from(0)), +/// TextSize::from(0) +/// ); +/// +/// assert_eq!( +/// ceil_char_boundary(source, TextSize::from(5)), +/// TextSize::from(5) +/// ); +/// +/// assert_eq!( +/// ceil_char_boundary(source, TextSize::from(6)), +/// TextSize::from(5) +/// ); +/// +/// let source = "α"; +/// +/// assert_eq!( +/// ceil_char_boundary(source, TextSize::from(0)), +/// TextSize::from(0) +/// ); +/// +/// assert_eq!( +/// ceil_char_boundary(source, TextSize::from(1)), +/// TextSize::from(2) +/// ); +/// +/// assert_eq!( +/// ceil_char_boundary(source, TextSize::from(2)), +/// TextSize::from(2) +/// ); +/// +/// assert_eq!( +/// ceil_char_boundary(source, TextSize::from(3)), +/// TextSize::from(2) +/// ); +/// ``` +pub fn ceil_char_boundary(text: &str, offset: TextSize) -> TextSize { + let upper_bound = offset + .to_u32() + .saturating_add(4) + .min(text.text_len().to_u32()); + (offset.to_u32()..upper_bound) + .map(TextSize::from) + .find(|offset| text.is_char_boundary(offset.to_usize())) + .unwrap_or_else(|| TextSize::from(upper_bound)) +} + #[cfg(test)] mod tests { @@ -2359,7 +2591,7 @@ watermelon } /// Returns a builder for tersely constructing diagnostics. - fn builder( + pub(super) fn builder( &mut self, identifier: &'static str, severity: Severity, @@ -2426,7 +2658,7 @@ watermelon /// /// See the docs on `TestEnvironment::span` for the meaning of /// `path`, `line_offset_start` and `line_offset_end`. - fn primary( + pub(super) fn primary( mut self, path: &str, line_offset_start: &str, diff --git a/crates/ruff_db/src/diagnostic/render/full.rs b/crates/ruff_db/src/diagnostic/render/full.rs index 35f5e3e5a1829..9200fd8d2f702 100644 --- a/crates/ruff_db/src/diagnostic/render/full.rs +++ b/crates/ruff_db/src/diagnostic/render/full.rs @@ -1,8 +1,8 @@ #[cfg(test)] mod tests { use crate::diagnostic::{ - DiagnosticFormat, - render::tests::{create_diagnostics, create_syntax_error_diagnostics}, + DiagnosticFormat, Severity, + render::tests::{TestEnvironment, create_diagnostics, create_syntax_error_diagnostics}, }; #[test] @@ -63,4 +63,118 @@ mod tests { | "); } + + /// Check that the new `full` rendering code in `ruff_db` handles cases fixed by commit c9b99e4. + /// + /// For example, without the fix, we get diagnostics like this: + /// + /// ``` + /// error[no-indented-block]: Expected an indented block + /// --> example.py:3:1 + /// | + /// 2 | if False: + /// | ^ + /// 3 | print() + /// | + /// ``` + /// + /// where the caret points to the end of the previous line instead of the start of the next. + #[test] + fn empty_span_after_line_terminator() { + let mut env = TestEnvironment::new(); + env.add( + "example.py", + r#" +if False: +print() +"#, + ); + env.format(DiagnosticFormat::Full); + + let diagnostic = env + .builder( + "no-indented-block", + Severity::Error, + "Expected an indented block", + ) + .primary("example.py", "3:0", "3:0", "") + .build(); + + insta::assert_snapshot!(env.render(&diagnostic), @r" + error[no-indented-block]: Expected an indented block + --> example.py:3:1 + | + 2 | if False: + 3 | print() + | ^ + | + "); + } + + /// Check that the new `full` rendering code in `ruff_db` handles cases fixed by commit 2922490. + /// + /// For example, without the fix, we get diagnostics like this: + /// + /// ``` + /// error[invalid-character-sub]: Invalid unescaped character SUB, use "\x1A" instead + /// --> example.py:1:25 + /// | + /// 1 | nested_fstrings = f'␈{f'{f'␛'}'}' + /// | ^ + /// | + /// ``` + /// + /// where the caret points to the `f` in the f-string instead of the start of the invalid + /// character (`^Z`). + #[test] + fn unprintable_characters() { + let mut env = TestEnvironment::new(); + env.add("example.py", "nested_fstrings = f'{f'{f''}'}'"); + env.format(DiagnosticFormat::Full); + + let diagnostic = env + .builder( + "invalid-character-sub", + Severity::Error, + r#"Invalid unescaped character SUB, use "\x1A" instead"#, + ) + .primary("example.py", "1:24", "1:24", "") + .build(); + + insta::assert_snapshot!(env.render(&diagnostic), @r#" + error[invalid-character-sub]: Invalid unescaped character SUB, use "\x1A" instead + --> example.py:1:25 + | + 1 | nested_fstrings = f'␈{f'{f'␛'}'}' + | ^ + | + "#); + } + + #[test] + fn multiple_unprintable_characters() -> std::io::Result<()> { + let mut env = TestEnvironment::new(); + env.add("example.py", ""); + env.format(DiagnosticFormat::Full); + + let diagnostic = env + .builder( + "invalid-character-sub", + Severity::Error, + r#"Invalid unescaped character SUB, use "\x1A" instead"#, + ) + .primary("example.py", "1:1", "1:1", "") + .build(); + + insta::assert_snapshot!(env.render(&diagnostic), @r#" + error[invalid-character-sub]: Invalid unescaped character SUB, use "\x1A" instead + --> example.py:1:2 + | + 1 | ␈␛ + | ^ + | + "#); + + Ok(()) + } } diff --git a/crates/ruff_linter/src/locator.rs b/crates/ruff_linter/src/locator.rs index afc212e6a68ca..8fd60b9351c4b 100644 --- a/crates/ruff_linter/src/locator.rs +++ b/crates/ruff_linter/src/locator.rs @@ -118,86 +118,6 @@ impl<'a> Locator<'a> { } } - /// Finds the closest [`TextSize`] not less than the offset given for which - /// `is_char_boundary` is `true`. Unless the offset given is greater than - /// the length of the underlying contents, in which case, the length of the - /// contents is returned. - /// - /// Can be replaced with `str::ceil_char_boundary` once it's stable. - /// - /// # Examples - /// - /// From `std`: - /// - /// ``` - /// use ruff_text_size::{Ranged, TextSize}; - /// use ruff_linter::Locator; - /// - /// let locator = Locator::new("❤️🧡💛💚💙💜"); - /// assert_eq!(locator.text_len(), TextSize::from(26)); - /// assert!(!locator.contents().is_char_boundary(13)); - /// - /// let closest = locator.ceil_char_boundary(TextSize::from(13)); - /// assert_eq!(closest, TextSize::from(14)); - /// assert_eq!(&locator.contents()[..closest.to_usize()], "❤️🧡💛"); - /// ``` - /// - /// Additional examples: - /// - /// ``` - /// use ruff_text_size::{Ranged, TextRange, TextSize}; - /// use ruff_linter::Locator; - /// - /// let locator = Locator::new("Hello"); - /// - /// assert_eq!( - /// locator.ceil_char_boundary(TextSize::from(0)), - /// TextSize::from(0) - /// ); - /// - /// assert_eq!( - /// locator.ceil_char_boundary(TextSize::from(5)), - /// TextSize::from(5) - /// ); - /// - /// assert_eq!( - /// locator.ceil_char_boundary(TextSize::from(6)), - /// TextSize::from(5) - /// ); - /// - /// let locator = Locator::new("α"); - /// - /// assert_eq!( - /// locator.ceil_char_boundary(TextSize::from(0)), - /// TextSize::from(0) - /// ); - /// - /// assert_eq!( - /// locator.ceil_char_boundary(TextSize::from(1)), - /// TextSize::from(2) - /// ); - /// - /// assert_eq!( - /// locator.ceil_char_boundary(TextSize::from(2)), - /// TextSize::from(2) - /// ); - /// - /// assert_eq!( - /// locator.ceil_char_boundary(TextSize::from(3)), - /// TextSize::from(2) - /// ); - /// ``` - pub fn ceil_char_boundary(&self, offset: TextSize) -> TextSize { - let upper_bound = offset - .to_u32() - .saturating_add(4) - .min(self.text_len().to_u32()); - (offset.to_u32()..upper_bound) - .map(TextSize::from) - .find(|offset| self.contents.is_char_boundary(offset.to_usize())) - .unwrap_or_else(|| TextSize::from(upper_bound)) - } - /// Take the source code between the given [`TextRange`]. #[inline] pub fn slice(&self, ranged: T) -> &'a str { diff --git a/crates/ruff_linter/src/message/text.rs b/crates/ruff_linter/src/message/text.rs index 5659306d81ea3..9eceb2fb1d926 100644 --- a/crates/ruff_linter/src/message/text.rs +++ b/crates/ruff_linter/src/message/text.rs @@ -6,12 +6,13 @@ use bitflags::bitflags; use colored::Colorize; use ruff_annotate_snippets::{Level, Renderer, Snippet}; -use ruff_db::diagnostic::{Diagnostic, DiagnosticFormat, DisplayDiagnosticConfig, SecondaryCode}; +use ruff_db::diagnostic::{ + Diagnostic, DiagnosticFormat, DisplayDiagnosticConfig, SecondaryCode, ceil_char_boundary, +}; use ruff_notebook::NotebookIndex; use ruff_source_file::OneIndexed; use ruff_text_size::{TextLen, TextRange, TextSize}; -use crate::Locator; use crate::line_width::{IndentWidth, LineWidthBuilder}; use crate::message::diff::Diff; use crate::message::{Emitter, EmitterContext}; @@ -370,9 +371,8 @@ impl<'a> SourceCode<'a> { if self.text.as_bytes()[self.annotation_range.start().to_usize() - 1] != b'\n' { return self; } - let locator = Locator::new(&self.text); let start = self.annotation_range.start(); - let end = locator.ceil_char_boundary(start + TextSize::from(1)); + let end = ceil_char_boundary(&self.text, start + TextSize::from(1)); SourceCode { annotation_range: TextRange::new(start, end), ..self