Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ jobs:
save-cache: ${{ github.ref_name == 'main' }}
cache-key: warm
- run: cargo ck
- run: cargo test
- run: cargo test --all-features
- run: git diff --exit-code # Must commit everything

test-windows:
Expand Down Expand Up @@ -96,7 +96,7 @@ jobs:
shell: bash
run: |
# cargo ck # no need to check because it's already checked in linux
cargo test --workspace
cargo test --all-features

test-wasm32-wasip1-threads:
name: Test wasm32-wasip1-threads
Expand Down
2 changes: 2 additions & 0 deletions crates/oxc_ast/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ mod ast_impl;
mod ast_kind_impl;
pub mod precedence;
mod trivia;
#[cfg(feature = "serialize")]
pub mod utf8_to_utf16;

mod generated {
#![allow(missing_docs)]
Expand Down
248 changes: 248 additions & 0 deletions crates/oxc_ast/src/utf8_to_utf16.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,248 @@
//! Convert UTF-8 span offsets to UTF-16.

use oxc_span::Span;

use crate::{ast::Program, visit::VisitMut};

/// Convert UTF-8 span offsets to UTF-16.
pub struct Utf8ToUtf16 {
    // Lookup table, ordered by ascending `utf8_offset`.
    // Always starts with a sentinel entry `{ utf8_offset: 0, utf16_difference: 0 }`,
    // so lookups never see an empty table and ASCII-only sources keep exactly one entry.
    translations: Vec<Translation>,
}

/// One entry of the UTF-8 → UTF-16 offset translation table.
#[derive(Clone, Copy)]
#[repr(align(8))]
struct Translation {
    /// UTF-8 byte offset (the byte *after* the start of a multi-byte character).
    utf8_offset: u32,
    /// Number to subtract from UTF-8 byte offset to get UTF-16 char offset
    /// for offsets *after* `utf8_offset`.
    utf16_difference: u32,
}

impl Utf8ToUtf16 {
    /// Create new `Utf8ToUtf16` converter.
    #[expect(clippy::new_without_default)]
    pub fn new() -> Self {
        // Seed the table with a sentinel entry so that offsets before the first
        // multi-byte character translate unchanged, and lookups never have to
        // handle an empty table.
        let mut translations = Vec::with_capacity(16);
        translations.push(Translation { utf8_offset: 0, utf16_difference: 0 });
        Self { translations }
    }

    /// Convert all spans in the AST to UTF-16.
    pub fn convert(mut self, program: &mut Program<'_>) {
        self.build_table(program.source_text);
        // Only the sentinel present means the source contained no multi-byte
        // characters — every UTF-8 offset already equals its UTF-16 offset.
        if self.translations.len() == 1 {
            return;
        }
        self.visit_program(program);
        // Comments live outside the AST proper, so the visitor does not reach
        // them — convert their spans explicitly.
        for comment in &mut program.comments {
            self.convert_span(&mut comment.span);
        }
    }

    #[allow(clippy::cast_possible_truncation)]
    fn build_table(&mut self, source_text: &str) {
        // How UTF-8 sequence length maps to UTF-16 length:
        //
        // * 1-byte sequence (lead byte 0x00 - 0x7F)
        //   -> 1 x UTF-16 unit. UTF-16 len = UTF-8 len.
        // * 2-byte sequence (lead byte 0xC2 - 0xDF, continuations 0x80 - 0xBF)
        //   -> 1 x UTF-16 unit. UTF-16 len = UTF-8 len - 1.
        // * 3-byte sequence (lead byte 0xE0 - 0xEF, continuations 0x80 - 0xBF)
        //   -> 1 x UTF-16 unit. UTF-16 len = UTF-8 len - 2.
        // * 4-byte sequence (lead byte 0xF0 - 0xF4, continuations 0x80 - 0xBF)
        //   -> 2 x UTF-16 units. UTF-16 len = UTF-8 len - 2.
        //
        // Therefore:
        // UTF-16 offset = UTF-8 offset - count of bytes >= 0xC0 - count of bytes >= 0xE0.
        // Continuation bytes (0x80 - 0xBF) never pass the `>= 0xC0` test.
        let mut utf16_difference = 0u32;
        for (utf8_offset, &byte) in source_text.as_bytes().iter().enumerate() {
            if byte < 0xC0 {
                continue;
            }
            utf16_difference += if byte >= 0xE0 { 2 } else { 1 };
            // Record `utf8_offset + 1`, not `utf8_offset`: only offsets *after*
            // this Unicode character's first byte need to be shifted.
            self.translations.push(Translation {
                utf8_offset: utf8_offset as u32 + 1,
                utf16_difference,
            });
        }
    }

    /// Rewrite both ends of a span from UTF-8 to UTF-16 offsets.
    fn convert_span(&self, span: &mut Span) {
        span.start = self.convert_offset(span.start);
        span.end = self.convert_offset(span.end);
    }

    /// Translate a single UTF-8 byte offset to the equivalent UTF-16 offset.
    fn convert_offset(&self, utf8_offset: u32) -> u32 {
        // `partition_point` returns the index of the first entry whose
        // `utf8_offset` is strictly greater than the query (or `len` if none).
        // The difference to subtract lives in the entry just before that.
        let next_index = self
            .translations
            .partition_point(|translation| translation.utf8_offset <= utf8_offset);
        // The sentinel `{0, 0}` entry satisfies the predicate for any query,
        // so `next_index >= 1`; and `next_index <= len`. Therefore
        // `next_index - 1` cannot wrap, and is always in bounds.
        utf8_offset - self.translations[next_index - 1].utf16_difference
    }
}

// Walk the whole AST, rewriting every span in place via `convert_span`.
impl VisitMut<'_> for Utf8ToUtf16 {
    fn visit_span(&mut self, span: &mut Span) {
        self.convert_span(span);
    }
}

#[cfg(test)]
mod test {
    use oxc_allocator::Allocator;
    use oxc_span::{GetSpan, SourceType, Span};

    use crate::{
        ast::{Expression, Statement},
        AstBuilder, Comment, CommentKind,
    };

    use super::Utf8ToUtf16;

    // Build a small AST by hand for the source `;'🤨' // 🤨` and check that
    // every span — program, statement, nested expression, and comment — is
    // rewritten from UTF-8 byte offsets to UTF-16 offsets.
    // '🤨' is 4 bytes in UTF-8 but 2 units in UTF-16, so offsets after each
    // emoji shrink by 2.
    #[test]
    fn translate_ast() {
        let allocator = Allocator::new();
        let ast = AstBuilder::new(&allocator);

        let mut program = ast.program(
            Span::new(0, 15),
            SourceType::default(),
            ";'🤨' // 🤨",
            ast.vec1(Comment::new(8, 15, CommentKind::Line)),
            None,
            ast.vec(),
            ast.vec_from_array([
                ast.statement_empty(Span::new(0, 1)),
                ast.statement_expression(
                    Span::new(1, 7),
                    ast.expression_string_literal(Span::new(1, 7), "🤨", None),
                ),
            ]),
        );

        Utf8ToUtf16::new().convert(&mut program);
        assert_eq!(program.span, Span::new(0, 11));
        assert_eq!(program.body[1].span(), Span::new(1, 5));
        let Statement::ExpressionStatement(expr_stmt) = &program.body[1] else { unreachable!() };
        let Expression::StringLiteral(s) = &expr_stmt.expression else { unreachable!() };
        assert_eq!(s.span, Span::new(1, 5));
        assert_eq!(program.comments[0].span, Span::new(6, 11));
    }

    // Check `convert_offset` against hand-computed
    // `(utf8_offset, expected_utf16_offset)` pairs, covering 1-, 2-, 3- and
    // 4-byte characters at the start, middle, end, and adjacent positions.
    #[test]
    fn translate_offsets() {
        // Sanity-check the UTF-8/UTF-16 widths of the characters used below.
        assert_eq!('_'.len_utf8(), 1);
        assert_eq!('_'.len_utf16(), 1);
        assert_eq!('£'.len_utf8(), 2);
        assert_eq!('£'.len_utf16(), 1);
        assert_eq!('ऊ'.len_utf8(), 3);
        assert_eq!('ऊ'.len_utf16(), 1);
        assert_eq!('🤨'.len_utf8(), 4);
        assert_eq!('🤨'.len_utf16(), 2);

        let cases: &[(&str, &[(u32, u32)])] = &[
            // 1-byte
            ("_", &[(0, 0), (1, 1)]),
            // 2-byte
            ("£", &[(0, 0), (2, 1)]),
            ("£_", &[(0, 0), (2, 1), (3, 2)]),
            ("_£", &[(0, 0), (1, 1), (3, 2)]),
            ("_£_", &[(0, 0), (1, 1), (3, 2), (4, 3)]),
            ("_££_", &[(0, 0), (1, 1), (3, 2), (5, 3), (6, 4)]),
            ("_£_£_", &[(0, 0), (1, 1), (3, 2), (4, 3), (6, 4), (7, 5)]),
            // 3-byte
            ("ऊ", &[(0, 0), (3, 1)]),
            ("ऊ_", &[(0, 0), (3, 1), (4, 2)]),
            ("_ऊ", &[(0, 0), (1, 1), (4, 2)]),
            ("_ऊ_", &[(0, 0), (1, 1), (4, 2), (5, 3)]),
            ("_ऊऊ_", &[(0, 0), (1, 1), (4, 2), (7, 3), (8, 4)]),
            ("_ऊ_ऊ_", &[(0, 0), (1, 1), (4, 2), (5, 3), (8, 4), (9, 5)]),
            // 4-byte
            ("🤨", &[(0, 0), (4, 2)]),
            ("🤨_", &[(0, 0), (4, 2), (5, 3)]),
            ("_🤨", &[(0, 0), (1, 1), (5, 3)]),
            ("_🤨_", &[(0, 0), (1, 1), (5, 3), (6, 4)]),
            ("_🤨🤨_", &[(0, 0), (1, 1), (5, 3), (9, 5), (10, 6)]),
            ("_🤨_🤨_", &[(0, 0), (1, 1), (5, 3), (6, 4), (10, 6), (11, 7)]),
        ];

        for (text, translations) in cases {
            let mut converter = Utf8ToUtf16::new();
            converter.build_table(text);
            for &(utf8_offset, expected_utf16_offset) in *translations {
                assert_eq!(converter.convert_offset(utf8_offset), expected_utf16_offset);
            }
        }
    }

    // Check assumptions about how many UTF-16 chars result from different UTF-8 character sequences,
    // which are relied on by `build_table`
    #[test]
    fn char_lengths() {
        // Assert a char encodes to exactly the given UTF-8 byte sequence.
        macro_rules! assert_utf8_bytes_eq {
            ($c:expr, $bytes:expr) => {{
                let mut buffer = [0; 4];
                let bytes = $c.encode_utf8(&mut buffer).as_bytes();
                assert!($bytes == bytes);
            }};
        }

        // All 1-byte UTF-8 character sequences = 1 x UTF-16 character.
        // First byte is 0x00 - 0x7F.
        let min_1_byte_char = char::from_u32(0).unwrap();
        assert_eq!(min_1_byte_char.len_utf8(), 1);
        assert_eq!(min_1_byte_char.len_utf16(), 1);
        assert_utf8_bytes_eq!(min_1_byte_char, [0x00]);
        let max_1_byte_char = char::from_u32(0x7F).unwrap();
        assert_eq!(max_1_byte_char.len_utf8(), 1);
        assert_eq!(max_1_byte_char.len_utf16(), 1);
        assert_utf8_bytes_eq!(max_1_byte_char, [0x7F]);

        // All 2-byte UTF-8 character sequences = 1 x UTF-16 character
        // First byte is 0xC2 - 0xDF.
        let min_2_byte_char = char::from_u32(0x80).unwrap();
        assert_eq!(min_2_byte_char.len_utf8(), 2);
        assert_eq!(min_2_byte_char.len_utf16(), 1);
        assert_utf8_bytes_eq!(min_2_byte_char, [0xC2, 0x80]);
        let max_2_byte_char = char::from_u32(0x7FF).unwrap();
        assert_eq!(max_2_byte_char.len_utf8(), 2);
        assert_eq!(max_2_byte_char.len_utf16(), 1);
        assert_utf8_bytes_eq!(max_2_byte_char, [0xDF, 0xBF]);

        // All 3-byte UTF-8 character sequences = 1 x UTF-16 character
        // First byte is 0xE0 - 0xEF.
        let min_3_byte_char = char::from_u32(0x800).unwrap();
        assert_eq!(min_3_byte_char.len_utf8(), 3);
        assert_eq!(min_3_byte_char.len_utf16(), 1);
        assert_utf8_bytes_eq!(min_3_byte_char, [0xE0, 0xA0, 0x80]);
        let max_3_byte_char = char::from_u32(0xFFFF).unwrap();
        assert_eq!(max_3_byte_char.len_utf8(), 3);
        assert_eq!(max_3_byte_char.len_utf16(), 1);
        assert_utf8_bytes_eq!(max_3_byte_char, [0xEF, 0xBF, 0xBF]);

        // All 4-byte UTF-8 character sequences = 2 x UTF-16 characters
        // First byte is 0xF0 - 0xF4.
        let min_4_byte_char = char::from_u32(0x10000).unwrap();
        assert_eq!(min_4_byte_char.len_utf8(), 4);
        assert_eq!(min_4_byte_char.len_utf16(), 2);
        assert_utf8_bytes_eq!(min_4_byte_char, [0xF0, 0x90, 0x80, 0x80]);
        let max_4_byte_char = char::MAX;
        assert_eq!(max_4_byte_char.len_utf8(), 4);
        assert_eq!(max_4_byte_char.len_utf16(), 2);
        assert_utf8_bytes_eq!(max_4_byte_char, [0xF4, 0x8F, 0xBF, 0xBF]);
    }
}
20 changes: 13 additions & 7 deletions crates/oxc_parser/examples/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,21 @@
use std::{fs, path::Path};

use oxc_allocator::Allocator;
use oxc_ast::utf8_to_utf16::Utf8ToUtf16;
use oxc_parser::{ParseOptions, Parser};
use oxc_span::SourceType;
use pico_args::Arguments;

// Instruction:
// create a `test.js`,
// run `cargo run -p oxc_parser --example parser`
// or `cargo watch -x "run -p oxc_parser --example parser"`
// or `just watch "cargo run -p oxc_parser --example parser"`

fn main() -> Result<(), String> {
let mut args = Arguments::from_env();

let show_ast = args.contains("--ast");
let show_estree = args.contains("--estree");
let show_comments = args.contains("--comments");
let name = args.free_from_str().unwrap_or_else(|_| "test.js".to_string());

Expand All @@ -26,20 +28,24 @@ fn main() -> Result<(), String> {
let ret = Parser::new(&allocator, &source_text, source_type)
.with_options(ParseOptions { parse_regular_expression: true, ..ParseOptions::default() })
.parse();

if show_ast {
println!("AST:");
println!("{}", serde_json::to_string_pretty(&ret.program).unwrap());
}
let mut program = ret.program;

if show_comments {
println!("Comments:");
for comment in ret.program.comments {
for comment in &program.comments {
let s = comment.content_span().source_text(&source_text);
println!("{s}");
}
}

if show_ast || show_estree {
println!("AST:");
if show_estree {
Utf8ToUtf16::new().convert(&mut program);
}
println!("{}", serde_json::to_string_pretty(&program).unwrap());
}

if ret.errors.is_empty() {
println!("Parsed Successfully.");
} else {
Expand Down
4 changes: 2 additions & 2 deletions tasks/benchmark/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ bench = false
# with only the crates it needs, to speed up the builds
[dependencies]
oxc_allocator = { workspace = true, optional = true }
oxc_ast = { workspace = true, optional = true }
oxc_ast = { workspace = true, optional = true, features = ["serialize"] }
oxc_codegen = { workspace = true, optional = true }
oxc_isolated_declarations = { workspace = true, optional = true }
oxc_linter = { workspace = true, optional = true }
Expand Down Expand Up @@ -106,7 +106,7 @@ codspeed_napi = ["criterion2/codspeed", "dep:serde", "dep:serde_json"]
# Features for running each benchmark separately with minimum dependencies that benchmark needs.
# e.g. `cargo build --release -p oxc_benchmark --bench parser --no-default-features --features parser`
lexer = ["dep:oxc_allocator", "dep:oxc_ast", "dep:oxc_parser", "dep:oxc_span", "dep:oxc_tasks_common"]
parser = ["dep:oxc_allocator", "dep:oxc_parser", "dep:oxc_span", "dep:oxc_tasks_common"]
parser = ["dep:oxc_allocator", "dep:oxc_ast", "dep:oxc_parser", "dep:oxc_span", "dep:oxc_tasks_common"]
transformer = [
"dep:oxc_allocator",
"dep:oxc_parser",
Expand Down
31 changes: 30 additions & 1 deletion tasks/benchmark/benches/parser.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use oxc_allocator::Allocator;
use oxc_ast::utf8_to_utf16::Utf8ToUtf16;
use oxc_benchmark::{criterion_group, criterion_main, BenchmarkId, Criterion};
use oxc_parser::{ParseOptions, Parser};
use oxc_span::SourceType;
Expand Down Expand Up @@ -29,5 +30,33 @@ fn bench_parser(criterion: &mut Criterion) {
group.finish();
}

criterion_group!(parser, bench_parser);
// Benchmark ESTree output: UTF-8 → UTF-16 span conversion plus JSON
// serialization of the parsed AST. Parsing runs in the setup phase of
// `iter_with_setup_wrapper`, so only conversion + serialization are measured.
fn bench_estree(criterion: &mut Criterion) {
    let mut group = criterion.benchmark_group("estree");
    // Only the first fixture is benchmarked.
    // NOTE(review): presumably to limit benchmark run time — confirm.
    for file in TestFiles::complicated().files().iter().take(1) {
        let id = BenchmarkId::from_parameter(&file.file_name);
        let source_text = file.source_text.as_str();
        let source_type = SourceType::from_path(&file.file_name).unwrap();
        let mut allocator = Allocator::default();
        group.bench_function(id, |b| {
            b.iter_with_setup_wrapper(|runner| {
                // Setup (untimed): reuse the allocator's memory and re-parse,
                // because `convert` mutates spans in place and so a fresh AST
                // is needed for every iteration.
                allocator.reset();
                let mut program = Parser::new(&allocator, source_text, source_type)
                    .with_options(ParseOptions {
                        parse_regular_expression: true,
                        ..ParseOptions::default()
                    })
                    .parse()
                    .program;
                runner.run(|| {
                    // Measured work: offset conversion + JSON serialization.
                    Utf8ToUtf16::new().convert(&mut program);
                    program.to_json();
                    // Return the program out of the timed closure.
                    // NOTE(review): assumes the runner drops the returned value
                    // outside the timed section so drop cost isn't measured — confirm.
                    program
                });
            });
        });
    }
    group.finish();
}

criterion_group!(parser, bench_parser, bench_estree);
criterion_main!(parser);