diff --git a/Cargo.lock b/Cargo.lock index 1225ef1fcd1e1..211a57bee7610 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2074,6 +2074,7 @@ dependencies = [ "oxc_ast 0.96.0", "oxc_data_structures 0.96.0", "oxc_parser 0.96.0", + "oxc_semantic 0.96.0", "oxc_span 0.96.0", "oxc_syntax 0.96.0", "pico-args", diff --git a/crates/oxc_formatter/Cargo.toml b/crates/oxc_formatter/Cargo.toml index 25f75c2c48698..75581446045e5 100644 --- a/crates/oxc_formatter/Cargo.toml +++ b/crates/oxc_formatter/Cargo.toml @@ -24,6 +24,7 @@ oxc_allocator = { workspace = true } oxc_ast = { workspace = true } oxc_data_structures = { workspace = true, features = ["stack"] } oxc_parser = { workspace = true } +oxc_semantic = { workspace = true, optional = true } oxc_span = { workspace = true } oxc_syntax = { workspace = true } @@ -43,3 +44,7 @@ serde_json = { workspace = true } [build-dependencies] oxc_span = { workspace = true } + +[features] +default = [] +detect_code_removal = ["dep:oxc_semantic"] diff --git a/crates/oxc_formatter/src/detect_code_removal/mod.rs b/crates/oxc_formatter/src/detect_code_removal/mod.rs new file mode 100644 index 0000000000000..f83c78e6aa723 --- /dev/null +++ b/crates/oxc_formatter/src/detect_code_removal/mod.rs @@ -0,0 +1,320 @@ +use rustc_hash::{FxHashMap, FxHashSet}; + +use oxc_allocator::Allocator; +use oxc_ast::{AstKind, ast}; +use oxc_parser::{ParseOptions, Parser, ParserReturn}; +use oxc_semantic::{AstNode, Semantic, SemanticBuilder}; +use oxc_span::SourceType; + +use crate::get_parse_options; + +/// Collect statistics from source code. +pub fn collect(code: &str, source_type: SourceType) -> StatsCollector { + let allocator = Allocator::default(); + let parser = Parser::new(&allocator, code, source_type).with_options(get_parse_options()); + let ParserReturn { program, errors, .. } = parser.parse(); + + let mut collector = StatsCollector::default(); + + // If there are parse errors, skip further analysis. + // This will be reported in `diff()` later. + if !errors.is_empty() { + collector.has_parse_error = true; + return collector; + } + + // Using semantic analysis here only to get parent node + let semantic_ret = SemanticBuilder::new().build(&program); + collector.collect(&program, &semantic_ret.semantic) +} + +/// Check if there's a difference (= code removal) between before and after formatting. +pub fn diff(before: &StatsCollector, after: &StatsCollector) -> Option { + // Simply counts differences in node counts. + // `debug_name()` which contains node type and its details is used as the key. + fn diff_counts(before: &Counter, after: &Counter) -> Option> { + let mut errors = Vec::new(); + + let mut all_key_names: FxHashSet<_> = before.keys().collect(); + all_key_names.extend(after.keys()); + + for key in all_key_names { + let before_count = before.get(key).copied().unwrap_or(0); + let after_count = after.get(key).copied().unwrap_or(0); + + if before_count != after_count { + errors + .push(format!("Count mismatch for '{key}': {before_count} -> {after_count}",)); + } + } + + (!errors.is_empty()).then_some(errors) + } + + // Compare block comments by their count. + // We should check content but it is not straightforward, + // leading whitespaces inside block comment also removed and `*` is aligned. + fn diff_block_comments(before: &[String], after: &[String]) -> Option { + let before_count = before.len(); + let after_count = after.len(); + (before_count != after_count) + .then_some(format!("BlockComment count mismatch: {before_count} -> {after_count}")) + } + + // Compare line comments by their content. + // Count is not enough since line comments can be merged. + fn diff_line_comments(before: &[String], after: &[String]) -> Option { + // Happy path! + if before == after { + return None; + } + + // Sometimes, formatter trims trailing whitespaces. + // e.g. + // ``` + // // if extra whitespaces here -> + // ``` + // (rustfmt also removes...) + // -> `if extra whitespaces here ->` + // + // Sometimes, line comments are merged. + // e.g. + // ``` + // for (x + // in //a + // y); //b + // ``` + // -> `for (x in y); //a //b` = Comment: 2 -> 1 + for bf in before { + if after.iter().any(|af| bf.trim_end() == af || af.contains(bf)) { + continue; + } + return Some(format!("LineComment mismatch: `{bf}` not found")); + } + + None + } + + if before.has_parse_error { + unreachable!("Do not expect to be called with code having parse error"); + } + if after.has_parse_error { + return Some("Parse error found after formatting".to_string()); + } + + let mut errors = Vec::new(); + errors.extend(diff_block_comments(&before.block_comments, &after.block_comments)); + errors.extend(diff_line_comments(&before.line_comments, &after.line_comments)); + errors.extend(diff_counts(&before.node_counts, &after.node_counts).unwrap_or_default()); + + (!errors.is_empty()).then_some(errors.join("\n")) +} + +type Counter = FxHashMap; + +#[derive(Debug, Default)] +pub struct StatsCollector { + has_parse_error: bool, + block_comments: Vec, + line_comments: Vec, + node_counts: Counter, +} + +impl StatsCollector { + #[must_use] + fn collect<'a>(mut self, program: &ast::Program<'a>, semantic: &Semantic<'a>) -> Self { + for comment in &program.comments { + // See `diff()` logic above for why content is needed + let content = comment.span.source_text(program.source_text).to_string(); + if comment.is_block() { + self.block_comments.push(content); + } else { + self.line_comments.push(content); + } + } + + for node in semantic.nodes().iter() { + self.handle_node(node, semantic); + } + + self + } + + fn handle_node<'a>(&mut self, node: &AstNode<'a>, semantic: &Semantic<'a>) { + let kind = node.kind(); + + // `;` can be safely removed. + // e.g. `;[];` -> `[];` = EmptyStatement: 1 -> 0 + if matches!(kind, AstKind::EmptyStatement(_)) { + return; + } + + // `TSUnionType` can have unnecessary leading `|`. + // If there is such leading `|` with only one type, they can be removed. + // e.g. `type T = | A` -> `type T = A` = TSUnionType: 1 -> 0 + if matches!(kind, AstKind::TSUnionType(u) if u.types.len() == 1) { + return; + } + // `TSIntersectionType` can have unnecessary leading `&` too. + // e.g. `type T = & A` -> `type T = A` = TSIntersectionType: 1 -> 0 + if matches!(kind, AstKind::TSIntersectionType(i) if i.types.len() == 1) { + return; + } + + // Useless parentheses can be safely removed. + // e.g. `(a)` -> `a` + // e.g. `type T = (A | B)` -> `type T = A | B` + // + // However, at the same time, some nodes are affected by parentheses removal. + // e.g. `(a, (b, c))` -> `(a, b, c)` = SequenceExpression: 2 -> 1 + // e.g. `(a?.b)?.c` -> `a?.b?.c` = ChainExpression: 2 -> 1 + // e.g. `(a)()` -> `a()` = CallExpression() -> CallExpression(a) + // + // Track of such nodes is not straightforward. + // To detect whether only truly unnecessary parentheses were removed, + // we need to consider semantics, scope, etc... + // + // NOTE: By this reason, at least for now, do not count parentheses and the affected nodes. + // For the first place, there should be no parentheses by `preserve_parens: false`. + if matches!( + kind, + AstKind::ParenthesizedExpression(_) + | AstKind::TSParenthesizedType(_) + | AstKind::ChainExpression(_) + | AstKind::SequenceExpression(_) + ) { + return; + } + + // Count by `debug_name` which contains the node type and its details. + // If this is too strict, we can relax it later. + let node_name = kind.debug_name().to_string(); + let parent_kind = semantic.nodes().parent_kind(node.id()); + + // Object-like keys can be formatted differently based on quote options. + // e.g. `{ "key": value }` -> `{ key: value }` + // Therefore, we should count their value instead of their node type. + if matches!( + parent_kind, + AstKind::ObjectProperty(_) + | AstKind::MethodDefinition(_) + | AstKind::PropertyDefinition(_) + | AstKind::ImportAttribute(_) + | AstKind::TSPropertySignature(_) + | AstKind::TSLiteralType(_), + ) && matches!( + kind, + AstKind::IdentifierName(_) | AstKind::StringLiteral(_) | AstKind::NumericLiteral(_) + ) { + for prefix in ["StringLiteral(", "IdentifierName(", "NumericLiteral("] { + if let Some(rest) = node_name.strip_prefix(prefix) + && let Some(value) = rest.strip_suffix(')') + { + *self.node_counts.entry(format!("OBJECT_LIKE_KEYS({value})")).or_insert(0) += 1; + return; + } + } + } + + // `JSXText` with only whitespace can be safely removed. + // e.g. + // ``` + // return ( + //
+ // {children} + //
+ // ) + // ``` + // -> `return (
{children}
)` = JSXText(\n): 2 -> 0 + // + // NOTE: In addition, there are many cases related to `JSXText` where whitespaces are changed. + // Just skip counting `JSXText` with only whitespaces. + if matches!(kind, AstKind::JSXText(t) if t.value.trim().is_empty()) { + return; + } + + // `JSXExpressionContainer` with a single whitespace `StringLiteral` will be removed. + // e.g. `
{" "}
` -> `
` + // = JSXExpressionContainer: 1 -> 0, StringLiteral: 1 -> 0, JSXText(" "): 0 -> 1 + // e.g. `
{" "}{' '}
` -> `
` + // = JSXExpressionContainer: 2 -> 0, StringLiteral: 1 -> 0, JSXText(" "): 0 -> 1 + if matches!(kind, AstKind::JSXExpressionContainer(c) + if matches!(&c.expression, ast::JSXExpression::StringLiteral(s) if s.value == " ")) + { + // Skip `JSXExpressionContainer` containing only `StringLiteral(" ")` + return; + } + if matches!(parent_kind, AstKind::JSXExpressionContainer(_)) + && matches!(kind, AstKind::StringLiteral(s) if s.value == " ") + { + // Skip `StringLiteral(" ")` inside `JSXExpressionContainer` + return; + } + + let count_key = match kind { + // `JSXText` may contain redundant whitespaces. + // e.g. `

World

` -> `

World

` + // Redundant whitespaces can be truncated even if they are inside. + // e.g. `

abc : def

` -> `

abc : def

` + AstKind::JSXText(t) if t.value != " " => { + format!("JSX_TEXT({})", t.value.split_whitespace().collect::>().join(" ")) + } + // When ParenthesizedExpression is removed, CallExpression(calee name) may be affected. + // e.g. `(a?.b)?.()` -> `a?.b?.()` + // CallExpression() -> CallExpression(b) + AstKind::CallExpression(_) => "CALL_EXPRESSION".to_string(), + _ => node_name, + }; + *self.node_counts.entry(count_key).or_insert(0) += 1; + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_detect_code_removal() { + for (code1, code2) in [ + ("const a = 1;", "const b = 1;"), + ("const a = 1; // variable a", "const a = 1;"), + ("/* block comment */ const b = 2;", "const b = 2;"), + ] { + let source_type = SourceType::default().with_typescript(true).with_jsx(true); + + let stats1 = collect(code1, source_type); + let stats2 = collect(code2, source_type); + + assert_ne!(diff(&stats1, &stats2), None, "Expected diff not found:\n{code1}"); + } + } + + #[test] + fn test_no_diff_for_same_code() { + for (code1, code2) in [ + ("const a = 1;", "const a = 1;"), + ("for ((let.a) of foo);", "for ((let).a of foo);"), + ("for ((let[a]) of foo);", "for ((let)[a] of foo);"), + ("for ((let.a) in foo);", "for (let.a in foo);"), + ("
{' '}
", "
"), + ("type T = | A;", "type T = A;"), + ("type T = & A;", "type T = A;"), + ("(a);", "a;"), + ("(a)();", "a();"), + ("(a, (b, c));", "(a, b, c);"), + ("(a?.b)?.c;", "a?.b?.c;"), + ("(a.b)?.().c;", "a.b?.().c;"), + ("(a?.b)?.().c", "a?.b?.().c"), + ("for ((let)[a] in foo);", "for ((let)[a] in foo);"), + ] { + let source_type = SourceType::default().with_typescript(true).with_jsx(true); + + let stats1 = collect(code1, source_type); + let stats2 = collect(code2, source_type); + + if let Some(diff) = diff(&stats1, &stats2) { + panic!("Unexpected diff found:\n{diff}"); + } + } + } +} diff --git a/crates/oxc_formatter/src/lib.rs b/crates/oxc_formatter/src/lib.rs index f4b998c0e168f..309bf2ba135ef 100644 --- a/crates/oxc_formatter/src/lib.rs +++ b/crates/oxc_formatter/src/lib.rs @@ -10,6 +10,8 @@ )] // FIXME: all these needs to be fixed. mod ast_nodes; +#[cfg(feature = "detect_code_removal")] +mod detect_code_removal; mod formatter; mod ir_transform; mod options; @@ -87,6 +89,30 @@ impl<'a> Formatter<'a> { formatted.apply_transform(|doc| sort_imports.transform(doc)); } + // If the feature is enabled, perform extra checks to detect code removal. + #[cfg(feature = "detect_code_removal")] + { + // Use the same source type + let source_type = program.source_type; + + let before_text = program.source_text; + let before_stats = detect_code_removal::collect(before_text, source_type); + + let after_text = formatted.print().unwrap().into_code(); + let after_stats = detect_code_removal::collect(&after_text, source_type); + + if let Some(diff) = detect_code_removal::diff(&before_stats, &after_stats) { + unreachable!( + r"🚨 Code removal detected during formatting! +Difference found ========================== +{diff} +Original code ============================= +{before_text} +===========================================" + ); + } + } + formatted } }