diff --git a/src/bun_core/string/StringJoiner.rs b/src/bun_core/string/StringJoiner.rs index 8af2dcc7c3c..2af7b9b2a65 100644 --- a/src/bun_core/string/StringJoiner.rs +++ b/src/bun_core/string/StringJoiner.rs @@ -82,6 +82,12 @@ pub struct Watcher { } impl StringJoiner { + /// Pre-allocate room for `additional` more pushed slices, so a join with a + /// known piece count does a single nodes allocation instead of log₂N grows. + pub fn reserve(&mut self, additional: usize) { + self.nodes.reserve(additional); + } + /// `data` is expected to live until `.done` is called pub fn push_static(&mut self, data: &[u8]) { self.push(data); diff --git a/src/runtime/webcore/Blob.rs b/src/runtime/webcore/Blob.rs index d67b19cd570..28999a0a51a 100644 --- a/src/runtime/webcore/Blob.rs +++ b/src/runtime/webcore/Blob.rs @@ -858,23 +858,16 @@ impl BlobExt for Blob { } fn from_dom_form_data(global_this: &JSGlobalObject, form_data: &mut jsc::DOMFormData) -> Blob { - // PERF(port): was arena bulk-free + stack-fallback alloc — profile if it shows up on a hot path. - - let mut hex_buf = [0u8; 70]; - let boundary = { + // "----WebKitFormBoundary" (22 bytes) + 32 lowercase-hex chars of a fresh UUID. + const BOUNDARY_PREFIX: &[u8; 22] = b"----WebKitFormBoundary"; + let mut boundary_buf = [0u8; BOUNDARY_PREFIX.len() + 32]; + let boundary: &[u8] = { // SAFETY: bun_vm() never returns null for a Bun-owned global. let random = global_this.bun_vm().as_mut().rare_data().next_uuid().bytes; - use std::io::Write; - let mut cursor = &mut hex_buf[..]; + boundary_buf[..BOUNDARY_PREFIX.len()].copy_from_slice(BOUNDARY_PREFIX); // Zig `{x}` on `[16]u8` emits 32 contiguous lowercase-hex chars. - // Rust's `{:x?}` on `[u8;16]` is Debug formatting (`[a1, b2, …]`), - // so write the prefix then encode bytes one at a time. - cursor.write_all(b"----WebKitFormBoundary").unwrap(); - for b in random { - write!(&mut cursor, "{b:02x}").expect("infallible: in-memory write"); - } - let written = 70 - cursor.len(); - &hex_buf[..written] + bun_core::fmt::bytes_to_hex_lower(&random, &mut boundary_buf[BOUNDARY_PREFIX.len()..]); + &boundary_buf }; let mut context = FormDataContext { @@ -883,6 +876,9 @@ impl BlobExt for Blob { failed: false, global_this, }; + // Size the node list up front: a file entry pushes at most 13 slices, a + // string entry 8, plus 3 for the closing boundary. + context.joiner.reserve(form_data.count() * 13 + 3); // PORT NOTE (layering): `bun_jsc::DOMFormData::for_each` yields the // lower-tier `bun_jsc::dom_form_data::FormDataEntry`, whose `blob` @@ -945,11 +941,9 @@ impl BlobExt for Blob { for_each_thunk, ); if context.failed { - // The joiner's Node structs are owned by the (former) arena, but each - // node's data carries its own owner allocator — heap for non-ASCII - // name/value slices and the NodeFS read_file result buffer. Drop the - // joiner (Drop runs StringJoiner::deinit) so every heap-owned slice - // already pushed is freed. + // Drop the joiner (Drop runs StringJoiner::deinit) so every + // heap-owned slice already pushed — escaped names, non-ASCII + // conversions, NodeFS read_file result buffers — is freed. drop(context.joiner); return Blob::init_empty(global_this); } @@ -968,16 +962,10 @@ impl BlobExt for Blob { let blob = Blob::init_with_store(store, global_this); // Always allocate content_type with the default allocator so deinit() can // free it unconditionally. - let mut ct = Vec::new(); - { - use std::io::Write; - write!( - &mut ct, - "multipart/form-data; boundary={}", - bstr::BStr::new(boundary) - ) - .unwrap(); - } + const CONTENT_TYPE_PREFIX: &[u8] = b"multipart/form-data; boundary="; + let mut ct = Vec::with_capacity(CONTENT_TYPE_PREFIX.len() + boundary.len()); + ct.extend_from_slice(CONTENT_TYPE_PREFIX); + ct.extend_from_slice(boundary); blob.content_type .set(bun_core::heap::into_raw(ct.into_boxed_slice())); blob.content_type_allocated.set(true); @@ -3942,13 +3930,14 @@ struct FormDataContext<'a> { /// WHATWG HTML "multipart/form-data encoding algorithm": percent-encode `"`, /// CR and LF in field names and filenames so they cannot terminate the -/// quoted-string or inject part headers. -fn escape_form_data_name(bytes: Vec) -> Box<[u8]> { +/// quoted-string or inject part headers. Returns `None` when nothing needs +/// escaping so the caller can keep using the original bytes without a copy. +fn escape_form_data_name(bytes: &[u8]) -> Option> { if !bytes.iter().any(|&b| matches!(b, b'"' | b'\r' | b'\n')) { - return bytes.into_boxed_slice(); + return None; } let mut out = Vec::with_capacity(bytes.len() + 6); - for &b in &bytes { + for &b in bytes { match b { b'"' => out.extend_from_slice(b"%22"), b'\r' => out.extend_from_slice(b"%0D"), @@ -3956,10 +3945,35 @@ fn escape_form_data_name(bytes: Vec) -> Box<[u8]> { _ => out.push(b), } } - out.into_boxed_slice() + Some(out.into_boxed_slice()) } impl FormDataContext<'_> { + /// Append the UTF-8 view of a form-data string without copying it: the + /// borrowed case points into a WTF string owned by the `DOMFormData` being + /// serialized, which outlives `joiner.done()` in `from_dom_form_data` + /// (Zig: `joiner.push(slice, slice.allocator.get())`); an owned slice + /// (UTF-16 / non-ASCII Latin-1 conversion) transfers its allocation to the + /// joiner. When `escape` is set, `"`/CR/LF are percent-encoded into a copy. + fn push_string_slice(joiner: &mut StringJoiner, slice: ZigStringSlice, escape: bool) { + if escape { + if let Some(escaped) = escape_form_data_name(slice.slice()) { + joiner.push_owned(escaped); + return; + } + } + if matches!(slice, ZigStringSlice::Owned(_)) { + // `into_vec` moves the buffer out of an `Owned` slice without copying. + joiner.push_owned(slice.into_vec().into_boxed_slice()); + } else if matches!(slice, ZigStringSlice::Static(..)) { + joiner.push_static(slice.slice()); + } else { + // WTF-backed slices release their pin on drop — copy rather than + // borrow past it. (`ZigString::to_slice` never produces these.) + joiner.push_cloned(slice.slice()); + } + } + pub(crate) fn on_entry(&mut self, name: ZigString, entry: FormDataEntry<'_>) { if self.failed { return; @@ -3975,22 +3989,20 @@ impl FormDataContext<'_> { joiner.push_static(b"\r\n"); joiner.push_static(b"Content-Disposition: form-data; name=\""); - // PORT NOTE: Zig `joiner.push(slice, allocator?)` encoded ownership in - // the optional allocator. `StringJoiner::push_owned` is the Rust - // equivalent; `ZigStringSlice::into_vec` moves out the buffer if owned - // or copies if borrowed (matching Zig's `null`-allocator borrow case). - joiner.push_owned(escape_form_data_name(name.to_slice().into_vec())); + Self::push_string_slice(joiner, name.to_slice(), true); match entry { FormDataEntry::String(value) => { joiner.push_static(b"\"\r\n\r\n"); - joiner.push_owned(value.to_slice().into_vec().into_boxed_slice()); + Self::push_string_slice(joiner, value.to_slice(), false); } FormDataEntry::File { blob, filename } => { joiner.push_static(b"\"; filename=\""); - joiner.push_owned(escape_form_data_name(filename.to_slice().into_vec())); + Self::push_string_slice(joiner, filename.to_slice(), true); joiner.push_static(b"\"\r\n"); + // Borrowed from the blob, which the `DOMFormData` keeps alive + // past `joiner.done()` (Zig used `pushStatic` here too). let blob_ct = blob.content_type_slice(); let content_type: &[u8] = if !blob_ct.is_empty() && !blob_ct.iter().any(|&b| matches!(b, b'\r' | b'\n')) @@ -4000,7 +4012,7 @@ impl FormDataContext<'_> { b"application/octet-stream" }; joiner.push_static(b"Content-Type: "); - joiner.push_cloned(content_type); + joiner.push_static(content_type); joiner.push_static(b"\r\n\r\n"); if blob.store.get().is_some() { @@ -4052,7 +4064,10 @@ impl FormDataContext<'_> { } } store::Data::Bytes(_) => { - joiner.push_cloned(blob.shared_view()); + // Borrowed: the blob's store is kept alive by the + // `DOMFormData` entry until after `joiner.done()` + // (Zig used `pushStatic` here too). + joiner.push_static(blob.shared_view()); } } } diff --git a/test/js/web/html/FormData-multipart-serialization.test.ts b/test/js/web/html/FormData-multipart-serialization.test.ts new file mode 100644 index 00000000000..23ef3267a88 --- /dev/null +++ b/test/js/web/html/FormData-multipart-serialization.test.ts @@ -0,0 +1,158 @@ +import { describe, expect, test } from "bun:test"; +import { bunEnv, bunExe, isLinux } from "harness"; + +// Serializing a FormData body (`new Response(formData)` / `new Request(..., { body: formData })`) +// goes through Blob::from_dom_form_data, which joins the multipart parts out of +// borrowed views of the FormData's strings and blob bytes. These tests lock in +// the exact wire format (boundary shape, part layout, name/filename escaping, +// content-type fallback), verify the serialize → parse round-trip (including +// non-ASCII strings and large binary payloads), and check that serialization +// does not duplicate in-memory blob contents. +describe("multipart serialization (new Response(formData))", () => { + test("serializes string fields, blobs, unicode and escaped names exactly", async () => { + const formData = new FormData(); + formData.append("simple", "value"); + formData.append("empty", ""); + formData.append("unicode name ☺", "ünïcode välue 😊"); + formData.append('quote"name', "v1"); + formData.append("crlf\r\nname", "v2"); + formData.append("untyped-blob", new Blob(["blob-bytes"])); + formData.append("named-blob", new Blob(["named-bytes"]), "file.bin"); + formData.append("typed-file", new File(["

hi

"], 'weird"file\r\nname.html', { type: "text/html" })); + + const response = new Response(formData); + const contentType = response.headers.get("Content-Type")!; + expect(contentType).toMatch(/^multipart\/form-data; boundary=----WebKitFormBoundary[0-9a-f]{32}$/); + const boundary = contentType.slice(contentType.indexOf("boundary=") + "boundary=".length); + + const text = await response.text(); + expect(text).toBe( + [ + `--${boundary}\r\n`, + `Content-Disposition: form-data; name="simple"\r\n\r\nvalue\r\n`, + `--${boundary}\r\n`, + `Content-Disposition: form-data; name="empty"\r\n\r\n\r\n`, + `--${boundary}\r\n`, + `Content-Disposition: form-data; name="unicode name ☺"\r\n\r\nünïcode välue 😊\r\n`, + `--${boundary}\r\n`, + // '"', CR and LF in names/filenames are percent-encoded so they can't + // break out of the quoted-string or inject part headers. + `Content-Disposition: form-data; name="quote%22name"\r\n\r\nv1\r\n`, + `--${boundary}\r\n`, + `Content-Disposition: form-data; name="crlf%0D%0Aname"\r\n\r\nv2\r\n`, + `--${boundary}\r\n`, + `Content-Disposition: form-data; name="untyped-blob"; filename=""\r\n`, + `Content-Type: application/octet-stream\r\n\r\nblob-bytes\r\n`, + `--${boundary}\r\n`, + `Content-Disposition: form-data; name="named-blob"; filename="file.bin"\r\n`, + `Content-Type: application/octet-stream\r\n\r\nnamed-bytes\r\n`, + `--${boundary}\r\n`, + `Content-Disposition: form-data; name="typed-file"; filename="weird%22file%0D%0Aname.html"\r\n`, + `Content-Type: text/html;charset=utf-8\r\n\r\n

hi

\r\n`, + `--${boundary}--\r\n`, + ].join(""), + ); + }); + + test("round-trips every entry kind through Response.formData()", async () => { + const formData = new FormData(); + formData.append("simple", "value"); + formData.append("dup", "first"); + formData.append("dup", "second"); + formData.append("unicode name ☺", "ünïcode välue 😊"); + formData.append("blob", new Blob(["blob-bytes"])); + formData.append("file", new File(["

hi

"], "日本語ファイル名.html", { type: "text/html" })); + + const parsed = await new Response(formData).formData(); + const entries = await Promise.all( + [...parsed.entries()].map(async ([name, value]) => + value instanceof Blob + ? [ + name, + { file: value instanceof File, name: (value as File).name, type: value.type, text: await value.text() }, + ] + : [name, value], + ), + ); + + expect(entries).toEqual([ + ["simple", "value"], + ["dup", "first"], + ["dup", "second"], + ["unicode name ☺", "ünïcode välue 😊"], + ["blob", { file: true, name: undefined, type: "", text: "blob-bytes" }], + ["file", { file: true, name: "日本語ファイル名.html", type: "text/html;charset=utf-8", text: "

hi

" }], + ]); + }); + + test("round-trips large binary blob contents intact", async () => { + const bytes = new Uint8Array(64 * 1024); + for (let i = 0; i < bytes.length; i++) { + bytes[i] = (i * 31 + 7) & 0xff; + } + + const formData = new FormData(); + formData.append("payload", new Blob([bytes]), "payload.bin"); + formData.append("after", "still-parses"); + + const parsed = await new Response(formData).formData(); + const payload = parsed.get("payload") as File; + expect(payload.size).toBe(bytes.length); + expect(new Uint8Array(await payload.arrayBuffer())).toEqual(bytes); + expect(parsed.get("after")).toBe("still-parses"); + }); + + // Serializing an in-memory blob entry borrows its bytes; the only blob-sized + // allocation made by `new Response(formData)` is the joined multipart body + // itself, so peak memory grows by ~1x the blob size (measured ~1.0x on both + // release and ASAN debug builds). Copying the blob's bytes into the joiner + // (an extra full copy alive while the output is assembled) pushes the peak to + // ~2x, which the 1.5x threshold catches with a wide margin on both sides. + // Linux-only: peak RSS is read from /proc/self/status (VmHWM), which has no + // portable equivalent. + test.skipIf(!isLinux)("does not duplicate in-memory blob bytes while serializing", async () => { + const blobSizeMB = 128; + const script = ` + const blobSizeMB = ${blobSizeMB}; + const bytes = Buffer.alloc(blobSizeMB * 1024 * 1024, 0x61); + // Two parts force the Blob to materialize its own store now, so the + // baseline below already includes one copy of the payload and the only + // thing measured across the Response constructor is serialization. + const blob = new Blob([bytes, "x"]); + const formData = new FormData(); + formData.append("payload", blob, "payload.bin"); + formData.append("field", "value"); + + function peakRssMB() { + const status = require("fs").readFileSync("/proc/self/status", "utf8"); + const start = status.indexOf("VmHWM:"); + return parseInt(status.slice(start + "VmHWM:".length), 10) / 1024; + } + + const before = peakRssMB(); + const response = new Response(formData); + const after = peakRssMB(); + console.log(JSON.stringify({ deltaMB: after - before, alive: response instanceof Response })); + `; + + await using proc = Bun.spawn({ + cmd: [bunExe(), "--smol", "-e", script], + env: bunEnv, + stdout: "pipe", + stderr: "pipe", + }); + const [stdout, stderr, exitCode] = await Promise.all([proc.stdout.text(), proc.stderr.text(), proc.exited]); + if (exitCode !== 0) { + // Surface the crashed child's own error output in the assertion diff. + expect(stderr).toBe(""); + } + + const { deltaMB, alive } = JSON.parse(stdout.trim()); + expect(alive).toBe(true); + // Sanity: the serialized body was actually materialized during the sample. + expect(deltaMB).toBeGreaterThan(blobSizeMB * 0.5); + // An extra copy of the blob bytes would put this at ~2x the blob size. + expect(deltaMB).toBeLessThan(blobSizeMB * 1.5); + expect(exitCode).toBe(0); + }); +});