diff --git a/Cargo.lock b/Cargo.lock
index 1b4862769d710..833417340ef39 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3275,6 +3275,8 @@ dependencies = [
  "ruff_python_ast",
  "ruff_python_formatter",
  "ruff_python_trivia",
+ "ruff_source_file",
+ "ruff_text_size",
  "ruff_workspace",
 ]
 
diff --git a/crates/ruff_markdown/Cargo.toml b/crates/ruff_markdown/Cargo.toml
index 19fd48c8d6454..c611deeecd0f4 100644
--- a/crates/ruff_markdown/Cargo.toml
+++ b/crates/ruff_markdown/Cargo.toml
@@ -13,6 +13,8 @@ license = { workspace = true }
 ruff_python_ast = { workspace = true }
 ruff_python_formatter = { workspace = true }
 ruff_python_trivia = { workspace = true }
+ruff_source_file = { workspace = true }
+ruff_text_size = { workspace = true }
 ruff_workspace = { workspace = true }
 
 insta = { workspace = true }
diff --git a/crates/ruff_markdown/src/lib.rs b/crates/ruff_markdown/src/lib.rs
index 79d2b0a62ea95..7277f0c7bf63e 100644
--- a/crates/ruff_markdown/src/lib.rs
+++ b/crates/ruff_markdown/src/lib.rs
@@ -4,6 +4,8 @@ use regex::Regex;
 use ruff_python_ast::PySourceType;
 use ruff_python_formatter::format_module_source;
 use ruff_python_trivia::textwrap::{dedent, indent};
+use ruff_source_file::{Line, UniversalNewlines};
+use ruff_text_size::{TextRange, TextSize};
 use ruff_workspace::FormatterSettings;
 
 #[derive(Debug, PartialEq, Eq)]
@@ -12,67 +14,142 @@ pub enum MarkdownResult {
     Unchanged,
 }
 
-// TODO: account for ~~~ and arbitrary length code fences
 // TODO: support code blocks nested inside block quotes, etc
-static MARKDOWN_CODE_BLOCK: LazyLock<Regex> = LazyLock::new(|| {
-    // adapted from blacken-docs
-    // https://github.com/adamchainz/blacken-docs/blob/fb107c1dce25f9206e29297aaa1ed7afc2980a5a/src/blacken_docs/__init__.py#L17
+/// Matches a single code fence line, opening or closing. The capture groups are, in
+/// order: leading indentation, the fence itself (three or more backticks or tildes),
+/// an optional language word, and the remainder of the info string.
+// NOTE(review): the group names are a reconstruction; callers only rely on their order.
+static MARKDOWN_CODE_FENCE: LazyLock<Regex> = LazyLock::new(|| {
     Regex::new(
-        r"(?imsx)
-            (?<before>
-                ^(?<indent>\ *)```[^\S\r\n]*
-                (?<language>(?:python|py|python3|py3|pyi)?)
-                (?:\ .*?)?\n
-            )
-            (?<code>.*?)
-            (?<after>
-                ^\ *```[^\S\r\n]*$
-            )
-        ",
+        r"(?ix)
+            ^
+            (?<indent>\s*)
+            (?<fence>(?:```+|~~~+))\s*
+            (?<language>(?:\w+)?)\s*
+            (?<info>(?:.*))\s*
+            $
+        ",
     )
     .unwrap()
 });
 
+/// Matches a directive comment that switches code block formatting off or on; the
+/// single capture group is the requested action.
+// NOTE(review): the directive syntax is a reconstruction based on the tests and on the
+// blacken-docs convention; confirm against the upstream change.
+static OFF_ON_DIRECTIVES: LazyLock<Regex> = LazyLock::new(|| {
+    Regex::new(
+        r"(?imx)
+            ^
+            \s*<!--\s*(?:blacken-docs|fmt)\s*:\s*(?<action>off|on)\s*-->
+        ",
+    )
+    .unwrap()
+});
+
+/// Whether code blocks encountered at the current position should be formatted.
+#[derive(Debug, Default, PartialEq, Eq)]
+enum MarkdownState {
+    #[default]
+    On,
+    Off,
+}
+
+/// Formats the fenced Python code blocks in the Markdown document `source`.
+///
+/// A block is formatted when its language is empty or one of `python`, `py`, `python3`,
+/// `py3`, `pyi` (case-insensitive) and formatting has not been switched off by a
+/// directive comment. Blocks that fail to format are left untouched.
+///
+/// Returns [`MarkdownResult::Formatted`] with the rewritten document if any block
+/// changed, and [`MarkdownResult::Unchanged`] otherwise.
 pub fn format_code_blocks(
     source: &str,
     path: Option<&Path>,
     settings: &FormatterSettings,
 ) -> MarkdownResult {
+    let mut state = MarkdownState::On;
     let mut changed = false;
     let mut formatted = String::with_capacity(source.len());
-    let mut last_match = 0;
+    // Offset in `source` up to which text has already been copied into `formatted`
+    let mut last_match = TextSize::new(0);
 
-    for capture in MARKDOWN_CODE_BLOCK.captures_iter(source) {
-        let (_, [before, code_indent, language, code, after]) = capture.extract();
+    let mut lines = source.universal_newlines().peekable();
+    while let Some(line) = lines.next() {
+        // Toggle code block formatting off/on
+        if let Some(capture) = OFF_ON_DIRECTIVES.captures(&line) {
+            let (_, [action]) = capture.extract();
+            // The pattern is case-insensitive, so normalize before comparing
+            state = match action.to_ascii_lowercase().as_str() {
+                "off" => MarkdownState::Off,
+                "on" => MarkdownState::On,
+                _ => state,
+            };
+        // Process code blocks
+        } else if let Some(opening_capture) = MARKDOWN_CODE_FENCE.captures(&line) {
+            let (_, [code_indent, opening_fence, language, _info]) = opening_capture.extract();
+            // The code starts on the line after the opening fence
+            let start = lines.peek().map(Line::start).unwrap_or_default();
 
-        let py_source_type = PySourceType::from_extension(language);
-        let unformatted_code = dedent(code);
-        let options = settings.to_format_options(py_source_type, &unformatted_code, path);
+            // Consume lines until reaching the matching/ending code fence
+            for code_line in lines.by_ref() {
+                let Some((_, [_, closing_fence, closing_language, closing_info])) =
+                    MARKDOWN_CODE_FENCE
+                        .captures(&code_line)
+                        .map(|cap| cap.extract())
+                else {
+                    continue;
+                };
 
-        // Using `Printed::into_code` requires adding `ruff_formatter` as a direct dependency, and I suspect that Rust can optimize the closure away regardless.
-        #[expect(clippy::redundant_closure_for_method_calls)]
-        let formatted_code =
-            format_module_source(&unformatted_code, options).map(|formatted| formatted.into_code());
+                // Found the matching end of the code block: per CommonMark, the closing
+                // fence uses the same character as the opening fence, is at least as long,
+                // and carries no info string.
+                let closes_block = closing_fence.len() >= opening_fence.len()
+                    && closing_fence.as_bytes().first() == opening_fence.as_bytes().first()
+                    && closing_language.is_empty()
+                    && closing_info.is_empty();
+                if closes_block {
+                    let language = language.to_ascii_lowercase();
+                    if state == MarkdownState::On
+                        && matches!(
+                            language.as_str(),
+                            "python" | "py" | "python3" | "py3" | "pyi" | ""
+                        )
+                    {
+                        // Maybe python, try formatting it
+                        let end = code_line.start();
+                        let unformatted_code = dedent(&source[TextRange::new(start, end)]);
 
-        if let Ok(formatted_code) = formatted_code {
-            if formatted_code.len() != unformatted_code.len() || formatted_code != *unformatted_code
-            {
-                let m = capture.get_match();
-                formatted.push_str(&source[last_match..m.start()]);
+                        let py_source_type = PySourceType::from_extension(&language);
+                        let options =
+                            settings.to_format_options(py_source_type, &unformatted_code, path);
 
-                let indented_code = indent(&formatted_code, code_indent);
-                // otherwise I need to deal with a result from write!
-                #[expect(clippy::format_push_string)]
-                formatted.push_str(&format!("{before}{indented_code}{after}"));
+                        // Using `Printed::into_code` requires adding `ruff_formatter` as a direct
+                        // dependency, and I suspect that Rust can optimize the closure away regardless.
+                        #[expect(clippy::redundant_closure_for_method_calls)]
+                        let formatted_code = format_module_source(&unformatted_code, options)
+                            .map(|formatted| formatted.into_code());
 
-                last_match = m.end();
-                changed = true;
+                        // Formatting produced changes
+                        if let Ok(formatted_code) = formatted_code
+                            && (formatted_code.len() != unformatted_code.len()
+                                || formatted_code != *unformatted_code)
+                        {
+                            formatted.push_str(&source[TextRange::new(last_match, start)]);
+                            let formatted_code = indent(&formatted_code, code_indent);
+                            formatted.push_str(&formatted_code);
+                            last_match = end;
+                            changed = true;
+                        }
+                    }
+                    break;
+                }
             }
         }
     }
 
     if changed {
-        formatted.push_str(&source[last_match..]);
+        formatted.push_str(&source[last_match.to_usize()..]);
         MarkdownResult::Formatted(formatted)
     } else {
         MarkdownResult::Unchanged
@@ -187,4 +264,179 @@ fn (foo: &str) -> &str {
             format_code_blocks(code, None, &FormatterSettings::default()),
             @"Unchanged");
     }
+
+    #[test]
+    fn format_code_blocks_tildes() {
+        let code = r#"
+~~~py
+print( 'hello' )
+~~~
+        "#;
+        assert_snapshot!(
+            format_code_blocks(code, None, &FormatterSettings::default()),
+            @r#"
+        ~~~py
+        print("hello")
+        ~~~
+        "#);
+    }
+
+    #[test]
+    fn format_code_blocks_long_fence() {
+        let code = r#"
+````py
+print( 'hello' )
+````
+~~~~~py
+print( 'hello' )
+~~~~~
+        "#;
+        assert_snapshot!(
+            format_code_blocks(code, None, &FormatterSettings::default()),
+            @r#"
+        ````py
+        print("hello")
+        ````
+        ~~~~~py
+        print("hello")
+        ~~~~~
+        "#);
+    }
+
+    #[test]
+    fn format_code_blocks_longer_closing_fence() {
+        // A closing fence may be longer than the opening fence
+        let code = r#"
+```py
+print( 'hello' )
+`````
+
+```py
+print( 'hello' )
+```
+        "#;
+        assert_snapshot!(
+            format_code_blocks(code, None, &FormatterSettings::default()),
+            @r#"
+        ```py
+        print("hello")
+        `````
+
+        ```py
+        print("hello")
+        ```
+        "#);
+    }
+
+    #[test]
+    fn format_code_blocks_nested() {
+        let code = r#"
+````markdown
+```py
+print( 'hello' )
+```
+````
+        "#;
+        assert_snapshot!(
+            format_code_blocks(code, None, &FormatterSettings::default()),
+            @"Unchanged");
+    }
+
+    #[test]
+    fn format_code_blocks_ignore_blackendocs_off() {
+        // NOTE(review): the directive comment lines are a reconstruction; confirm upstream
+        let code = r#"
+```py
+print( 'hello' )
+```
+
+<!-- blacken-docs:off -->
+```py
+print( 'hello' )
+```
+
+<!-- blacken-docs:on -->
+```py
+print( 'hello' )
+```
+        "#;
+        assert_snapshot!(format_code_blocks(
+            code,
+            None,
+            &FormatterSettings::default()
+        ), @r#"
+        ```py
+        print("hello")
+        ```
+
+        <!-- blacken-docs:off -->
+        ```py
+        print( 'hello' )
+        ```
+
+        <!-- blacken-docs:on -->
+        ```py
+        print("hello")
+        ```
+        "#);
+    }
+
+    #[test]
+    fn format_code_blocks_ignore_ruff_off() {
+        // NOTE(review): the directive comment lines are a reconstruction; confirm upstream
+        let code = r#"
+```py
+print( 'hello' )
+```
+
+<!-- fmt:off -->
+```py
+print( 'hello' )
+```
+
+<!-- fmt:on -->
+```py
+print( 'hello' )
+```
+        "#;
+        assert_snapshot!(format_code_blocks(
+            code,
+            None,
+            &FormatterSettings::default()
+        ), @r#"
+        ```py
+        print("hello")
+        ```
+
+        <!-- fmt:off -->
+        ```py
+        print( 'hello' )
+        ```
+
+        <!-- fmt:on -->
+        ```py
+        print("hello")
+        ```
+        "#);
+    }
+
+    #[test]
+    fn format_code_blocks_ignore_to_end() {
+        // NOTE(review): the directive comment line is a reconstruction; confirm upstream
+        let code = r#"
+<!-- fmt:off -->
+```py
+print( 'hello' )
+```
+
+```py
+print( 'hello' )
+```
+        "#;
+        assert_snapshot!(format_code_blocks(
+            code,
+            None,
+            &FormatterSettings::default()
+        ), @"Unchanged");
+    }
 }