ext/web: add a UTF-8 single-pass fast path to TextDecoder.decode()

Summary of what this patch does:
- TextDecoder.decode() no longer wraps every input in a fresh Uint8Array
  view; only inputs backed by a SharedArrayBuffer are copied into a
  non-shared buffer before being handed to an op.
- The shared-buffer test looks at `input.buffer` for views and at `input`
  itself for raw buffers, and the clone branch performs a real copy (a
  TypedArray constructed from a TypedArray copies; one constructed from a
  buffer is only a view).
- Non-fatal "utf-8" decoders that are not streaming call the new
  `op_encoding_decode_utf8` op, which strips a leading BOM unless
  `ignoreBOM` is set and builds the V8 string directly from the bytes.
- The `&[u8]` op-argument codegen reads the byte length before the data
  pointer / byte offset.

Note: the `index` blob ids below are carried over unchanged and are
informational only.

diff --git a/ext/web/08_text_encoding.js b/ext/web/08_text_encoding.js
index 4477d9b9ee2cee..bf4b338082c4e0 100644
--- a/ext/web/08_text_encoding.js
+++ b/ext/web/08_text_encoding.js
@@ -16,14 +16,14 @@
   const ops = core.ops;
   const webidl = window.__bootstrap.webidl;
   const {
-    ArrayBufferIsView,
-    ObjectPrototypeIsPrototypeOf,
     PromiseReject,
     PromiseResolve,
     StringPrototypeCharCodeAt,
     StringPrototypeSlice,
     TypedArrayPrototypeSubarray,
     Uint8Array,
+    ObjectPrototypeIsPrototypeOf,
+    ArrayBufferIsView,
     Uint32Array,
   } = window.__bootstrap.primordials;
 
@@ -34,6 +34,8 @@
     #fatal;
     /** @type {boolean} */
     #ignoreBOM;
+    /** @type {boolean} */
+    #utf8SinglePass;
 
     /** @type {number | null} */
     #rid = null;
@@ -56,6 +58,7 @@
       this.#encoding = encoding;
       this.#fatal = options.fatal;
       this.#ignoreBOM = options.ignoreBOM;
+      this.#utf8SinglePass = encoding === "utf-8" && !options.fatal;
       this[webidl.brand] = webidl.brand;
     }
 
@@ -81,7 +84,7 @@
      * @param {BufferSource} [input]
      * @param {TextDecodeOptions} options
      */
-    decode(input = new Uint8Array(), options = {}) {
+    decode(input = new Uint8Array(), options = undefined) {
       webidl.assertBranded(this, TextDecoderPrototype);
       const prefix = "Failed to execute 'decode' on 'TextDecoder'";
       if (input !== undefined) {
@@ -91,40 +94,47 @@
           allowShared: true,
         });
       }
-      options = webidl.converters.TextDecodeOptions(options, {
-        prefix,
-        context: "Argument 2",
-      });
+      let stream = false;
+      if (options !== undefined) {
+        options = webidl.converters.TextDecodeOptions(options, {
+          prefix,
+          context: "Argument 2",
+        });
+        stream = options.stream;
+      }
 
       try {
-        try {
-          if (ArrayBufferIsView(input)) {
-            input = new Uint8Array(
-              input.buffer,
-              input.byteOffset,
-              input.byteLength,
-            );
-          } else {
-            input = new Uint8Array(input);
-          }
-        } catch {
-          // If the buffer is detached, just create a new empty Uint8Array.
-          input = new Uint8Array();
-        }
+        // Note from spec: implementations are strongly encouraged to use an implementation strategy that avoids this copy.
+        // When doing so they will have to make sure that changes to input do not affect future calls to decode().
         if (
           ObjectPrototypeIsPrototypeOf(
             SharedArrayBuffer.prototype,
-            input.buffer,
+            ArrayBufferIsView(input) ? input.buffer : input,
           )
         ) {
           // We clone the data into a non-shared ArrayBuffer so we can pass it
           // to Rust.
-          // `input` is now a Uint8Array, and calling the TypedArray constructor
-          // with a TypedArray argument copies the data.
+          if (ArrayBufferIsView(input)) {
+            input = new Uint8Array(
+              input.buffer,
+              input.byteOffset,
+              input.byteLength,
+            );
+          } else {
+            input = new Uint8Array(input);
+          }
+          // `input` is now a Uint8Array, and calling the TypedArray constructor
+          // with a TypedArray argument copies the data.
           input = new Uint8Array(input);
         }
 
-        if (!options.stream && this.#rid === null) {
+        // Fast path for single pass decoding.
+        if (!stream && this.#rid === null) {
+          // Fast path for utf8 single pass decoding.
+          if (this.#utf8SinglePass) {
+            return ops.op_encoding_decode_utf8(input, this.#ignoreBOM);
+          }
+
           return ops.op_encoding_decode_single(
             input,
             this.#encoding,
@@ -140,9 +150,9 @@
             this.#ignoreBOM,
           );
         }
-        return ops.op_encoding_decode(input, this.#rid, options.stream);
+        return ops.op_encoding_decode(input, this.#rid, stream);
       } finally {
-        if (!options.stream && this.#rid !== null) {
+        if (!stream && this.#rid !== null) {
           core.close(this.#rid);
           this.#rid = null;
         }
diff --git a/ext/web/lib.rs b/ext/web/lib.rs
index 588a3adfd18d94..f799f02e746c83 100644
--- a/ext/web/lib.rs
+++ b/ext/web/lib.rs
@@ -91,6 +91,7 @@ pub fn init(
       op_base64_btoa::decl(),
       op_encoding_normalize_label::decl(),
       op_encoding_decode_single::decl(),
+      op_encoding_decode_utf8::decl(),
       op_encoding_new_decoder::decl(),
       op_encoding_decode::decl(),
       op_encoding_encode_into::decl(),
@@ -179,6 +180,39 @@ fn op_encoding_normalize_label(label: String) -> Result<String, AnyError> {
   Ok(encoding.name().to_lowercase())
 }
 
+#[op(v8)]
+fn op_encoding_decode_utf8<'a>(
+  scope: &mut v8::HandleScope<'a>,
+  zero_copy: &[u8],
+  ignore_bom: bool,
+) -> Result<serde_v8::Value<'a>, AnyError> {
+  let buf = &zero_copy;
+
+  let buf = if !ignore_bom
+    && buf.len() >= 3
+    && buf[0] == 0xef
+    && buf[1] == 0xbb
+    && buf[2] == 0xbf
+  {
+    &buf[3..]
+  } else {
+    buf
+  };
+
+  // If `String::new_from_utf8()` returns `None`, this means that the
+  // length of the decoded string would be longer than what V8 can
+  // handle. In this case we return an error built with `type_error()`.
+  //
+  // For more details see:
+  // - https://encoding.spec.whatwg.org/#dom-textdecoder-decode
+  // - https://github.com/denoland/deno/issues/6649
+  // - https://github.com/v8/v8/blob/d68fb4733e39525f9ff0a9222107c02c28096e2a/include/v8.h#L3277-L3278
+  match v8::String::new_from_utf8(scope, buf, v8::NewStringType::Normal) {
+    Some(text) => Ok(serde_v8::from_v8(scope, text.into())?),
+    None => Err(type_error("buffer exceeds maximum length")),
+  }
+}
+
 #[op]
 fn op_encoding_decode_single(
   data: &[u8],
diff --git a/ops/lib.rs b/ops/lib.rs
index 44f783280394c3..f2e9545dbd39cf 100644
--- a/ops/lib.rs
+++ b/ops/lib.rs
@@ -910,13 +910,16 @@ fn codegen_u8_slice(core: &TokenStream2, idx: usize) -> TokenStream2 {
     let value = args.get(#idx as i32);
     match #core::v8::Local::<#core::v8::ArrayBuffer>::try_from(value) {
       Ok(b) => {
+        // Handles detached buffers.
+        let byte_length = b.byte_length();
         let store = b.data() as *mut u8;
         // SAFETY: rust guarantees that lifetime of slice is no longer than the call.
-        unsafe { ::std::slice::from_raw_parts_mut(store, b.byte_length()) }
+        unsafe { ::std::slice::from_raw_parts_mut(store, byte_length) }
       },
       Err(_) => {
         if let Ok(view) = #core::v8::Local::<#core::v8::ArrayBufferView>::try_from(value) {
-          let (offset, len) = (view.byte_offset(), view.byte_length());
+          let len = view.byte_length();
+          let offset = view.byte_offset();
           let buffer = match view.buffer(scope) {
             Some(v) => v,
             None => {