From 142a1be9b6e182efb57a966f22752752e90952ed Mon Sep 17 00:00:00 2001 From: Boshen Date: Fri, 6 Feb 2026 10:26:56 +0000 Subject: [PATCH] feat(parser): detect binary files with TS1490 error (#19047) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary - Port TypeScript's binary file detection (`TS1490: File appears to be binary.`) - When the scanner encounters U+FFFD (replacement character) as a standalone token, emit the error and stop parsing - U+FFFD inside strings, comments, and templates is unaffected - Reference: https://github.com/microsoft/TypeScript/blob/main/src/compiler/scanner.ts 🤖 Generated with [Claude Code](https://claude.com/claude-code) --- crates/oxc_parser/src/diagnostics.rs | 5 +++++ crates/oxc_parser/src/lexer/unicode.rs | 11 +++++++++++ crates/oxc_parser/src/lib.rs | 17 +++++++++++++++++ tasks/coverage/snapshots/parser_typescript.snap | 15 +++++++++------ tasks/coverage/src/typescript/constants.rs | 3 --- 5 files changed, 42 insertions(+), 9 deletions(-) diff --git a/crates/oxc_parser/src/diagnostics.rs b/crates/oxc_parser/src/diagnostics.rs index 4e67f4a2c8869..c2a9f0458b98b 100644 --- a/crates/oxc_parser/src/diagnostics.rs +++ b/crates/oxc_parser/src/diagnostics.rs @@ -48,6 +48,11 @@ pub fn overlong_source() -> OxcDiagnostic { OxcDiagnostic::error("Source length exceeds 4 GiB limit") } +#[cold] +pub fn file_appears_to_be_binary() -> OxcDiagnostic { + ts_error("1490", "File appears to be binary.") +} + #[cold] pub fn flow(span: Span) -> OxcDiagnostic { OxcDiagnostic::error("Flow is not supported").with_label(span) diff --git a/crates/oxc_parser/src/lexer/unicode.rs b/crates/oxc_parser/src/lexer/unicode.rs index 6e2dbf7e4fd82..15894b5f290dc 100644 --- a/crates/oxc_parser/src/lexer/unicode.rs +++ b/crates/oxc_parser/src/lexer/unicode.rs @@ -33,6 +33,10 @@ impl<'a> Lexer<'a> { pub(super) fn unicode_char_handler(&mut self) -> Kind { let c = self.peek_char().unwrap(); match c { + // U+FFFD (replacement character) appears when a binary file is decoded as UTF-8. + // This is likely a binary file that cannot be parsed. + // + '\u{FFFD}' => self.handle_binary_file(), c if is_identifier_start_unicode(c) => { let start_pos = self.source.position(); self.consume_char(); @@ -45,6 +49,13 @@ impl<'a> Lexer<'a> { } } + #[cold] + fn handle_binary_file(&mut self) -> Kind { + self.error(diagnostics::file_appears_to_be_binary()); + self.source.advance_to_end(); + Kind::Eof + } + #[cold] fn handle_irregular_whitespace(&mut self, _c: char) -> Kind { self.consume_char(); diff --git a/crates/oxc_parser/src/lib.rs b/crates/oxc_parser/src/lib.rs index ee802ea09a456..52cd03c2539a2 100644 --- a/crates/oxc_parser/src/lib.rs +++ b/crates/oxc_parser/src/lib.rs @@ -808,6 +808,23 @@ mod test { } } + #[test] + fn binary_file() { + let allocator = Allocator::default(); + let source_type = SourceType::default(); + + // U+FFFD as a standalone token — file appears to be binary + let ret = Parser::new(&allocator, "\u{FFFD}", source_type).parse(); + assert!(ret.program.is_empty()); + assert_eq!(ret.errors.len(), 1); + assert_eq!(ret.errors[0].to_string(), "File appears to be binary."); + + // U+FFFD inside string literals — should parse fine + let ret = Parser::new(&allocator, "\"oops \u{FFFD} oops\";", source_type).parse(); + assert!(!ret.program.is_empty()); + assert!(ret.errors.is_empty()); + } + #[test] fn memory_leak() { let allocator = Allocator::default(); diff --git a/tasks/coverage/snapshots/parser_typescript.snap b/tasks/coverage/snapshots/parser_typescript.snap index aa1d449fd519d..211862373ed51 100644 --- a/tasks/coverage/snapshots/parser_typescript.snap +++ b/tasks/coverage/snapshots/parser_typescript.snap @@ -3,7 +3,7 @@ commit: 95e3aaa9 parser_typescript Summary: AST Parsed : 9841/9841 (100.00%) Positive Passed: 9841/9841 (100.00%) -Negative Passed: 1529/2557 (59.80%) +Negative Passed: 1530/2558 (59.81%) Expect Syntax Error: tasks/coverage/typescript/tests/cases/compiler/FunctionDeclaration3.ts Expect Syntax Error: tasks/coverage/typescript/tests/cases/compiler/FunctionDeclaration4.ts @@ -2312,6 +2312,13 @@ Expect Syntax Error: tasks/coverage/typescript/tests/cases/conformance/types/wit 5 │ } ╰──── + × Expected a semicolon or an implicit semicolon after a statement, but found none + ╭─[typescript/tests/cases/compiler/TransportStream.ts:1:2] + 1 │ G@�G@�G@� + · ▲ + ╰──── + help: Try inserting a semicolon here + × Getters and setters must have an implementation. ╭─[typescript/tests/cases/compiler/abstractPropertyNegative.ts:16:9] 15 │ abstract notAllowed: string; @@ -4586,11 +4593,7 @@ Expect Syntax Error: tasks/coverage/typescript/tests/cases/conformance/types/wit 293 │ class interface { } ╰──── - × Invalid Character `�` - ╭─[typescript/tests/cases/compiler/corrupted.ts:1:1] - 1 │ ��� - · ─ - ╰──── + × TS(1490): File appears to be binary. × TS(2391): Function implementation is missing or not immediately following the declaration. ╭─[typescript/tests/cases/compiler/crashOnMethodSignatures.ts:2:5] diff --git a/tasks/coverage/src/typescript/constants.rs b/tasks/coverage/src/typescript/constants.rs index ec29729ab3d4f..20b1e8a6f0dde 100644 --- a/tasks/coverage/src/typescript/constants.rs +++ b/tasks/coverage/src/typescript/constants.rs @@ -92,9 +92,6 @@ pub static NOT_SUPPORTED_TEST_PATHS: phf::Set<&'static str> = phf::phf_set![ // TSC: Parse without error, they support BOM // OXC: We do not ignore or exclude BOM, will be invalid character error "bom-utf16be.ts", - // TSC: This is just a binary file, but their test project skips reading - // OXC: Try to parse, and fail - "TransportStream.ts", // TSC: Allows `catch({x}) { var x; }` (destructured catch param + var redeclaration) // OXC: Reports redeclaration error per Annex B §B.3.4 (only simple identifiers are exempt) "asyncWithVarShadowing_es6.ts",