url: add fileURLToPathBuffer API

jasnell · jasnell · commit 755e985a87a1 · 2025-06-14T07:22:53.000-07:00
The existing `fileURLToPath()` does not handle the case
where the input URL contains percent-encoded characters
that are not valid UTF-8 sequences. This can lead to
issues, for instance, when the URL is constructed
using file names in non-Unicode encodings (like
Shift-JIS). This commit introduces a new API,
`fileURLToPathBuffer()`, which returns a `Buffer`
representing the path, allowing for accurate
conversion of file URLs to paths without attempting
to decode the percent-encoded bytes into characters.
diff --git a/doc/api/url.md b/doc/api/url.md
@@ -1358,6 +1358,26 @@ new URL('file:///hello world').pathname;   // Incorrect: /hello%20world
 fileURLToPath('file:///hello world');      // Correct:   /hello world (POSIX)
 ```
 
+### `url.fileURLToPathBuffer(url[, options])`
+
+<!-- YAML
+added: REPLACEME
+-->
+
+* `url` {URL | string} The file URL string or URL object to convert to a path.
+* `options` {Object}
+  * `windows` {boolean|undefined} `true` if the `path` should be
+    returned as a windows filepath, `false` for posix, and
+    `undefined` for the system default.
+    **Default:** `undefined`.
+* Returns: {Buffer} The fully-resolved platform-specific Node.js file path
+  as a {Buffer}.
+
+Like `url.fileURLToPath(...)` except that instead of returning a string
+representation of the path, a `Buffer` is returned. This conversion is
+helpful when the input URL contains percent-encoded segments that are
+not valid UTF-8 / Unicode sequences.
+
 ### `url.format(URL[, options])`
 
 <!-- YAML
diff --git a/lib/internal/data_url.js b/lib/internal/data_url.js
@@ -349,4 +349,5 @@ function isomorphicDecode(input) {
 
 module.exports = {
   dataURLProcessor,
+  percentDecode,
 };
diff --git a/lib/internal/url.js b/lib/internal/url.js
@@ -29,6 +29,9 @@ const {
   Symbol,
   SymbolIterator,
   SymbolToStringTag,
+  TypedArrayPrototypeGetBuffer,
+  TypedArrayPrototypeGetByteLength,
+  TypedArrayPrototypeGetByteOffset,
   decodeURIComponent,
 } = primordials;
 
@@ -81,13 +84,17 @@ const {
   CHAR_LOWERCASE_Z,
   CHAR_PERCENT,
   CHAR_PLUS,
+  CHAR_COLON,
 } = require('internal/constants');
 const path = require('path');
+const { Buffer } = require('buffer');
 
 const {
   validateFunction,
 } = require('internal/validators');
 
+const { percentDecode } = require('internal/data_url');
+
 const querystring = require('querystring');
 
 const bindingUrl = internalBinding('url');
@@ -1482,6 +1489,76 @@ function getPathFromURLWin32(url) {
   return StringPrototypeSlice(pathname, 1);
 }
 
+function getPathBufferFromURLWin32(url) {
+  const hostname = url.hostname;
+  let pathname = url.pathname;
+  // In the getPathFromURLWin32 variant, we scan the input for backslash (\)
+  // and forward slash (/) characters, specifically looking for the ASCII/UTF8
+  // encoding of these and forbidding their use. This is a bit tricky
+  // because these may conflict with non-UTF8 encodings. For instance,
+  // in shift-jis, %5C identifies the symbol for the Japanese Yen and not the
+  // backslash. If we have a url like file:///foo/%5c/bar, then we really have
+  // no way of knowing if that %5c is meant to be a backslash \ or a yen sign.
+  // Passing in an encoding option does not help since our Buffer encoding only
+  // knows about certain specific text encodings and a single file path might
+  // actually contain segments that use multiple encodings. It's tricky! So,
+  // for this variation where we are producing a buffer, we won't scan for the
+  // slashes at all, and instead will decode the bytes literally into the
+  // returned Buffer. That said, that can also be tricky because, on windows,
+  // the file path separator *is* the ASCII backslash. This is a known issue
+  // on windows specific to the Shift-JIS encoding that we're not really going
+  // to solve here. Instead, we're going to do the best we can and just
+  // interpret the input url as a sequence of bytes.

+  // Because we are converting to a Windows file path here, we need to replace
+  // the explicit forward slash separators with backslashes. Note that this
+  // intentionally disregards any percent-encoded forward slashes in the path.
+  pathname = SideEffectFreeRegExpPrototypeSymbolReplace(FORWARD_SLASH, pathname, '\\');

+  // Now, let's start to build our Buffer. We will initially start with a
+  // Buffer allocated to fit in the entire string. Worst case there are no
+  // percent encoded characters and we take the string as is. Any invalid
+  // percent encodings, e.g. `%ZZ` are ignored and are passed through
+  // literally.
+  const decodedu8 = percentDecode(Buffer.from(pathname, 'utf8'));
+  const decodedPathname = Buffer.from(TypedArrayPrototypeGetBuffer(decodedu8),
+                                      TypedArrayPrototypeGetByteOffset(decodedu8),
+                                      TypedArrayPrototypeGetByteLength(decodedu8));
+  if (hostname !== '') {
+    // If hostname is set, then we have a UNC path
+    // Pass the hostname through domainToUnicode just in case
+    // it is an IDN using punycode encoding. We do not need to worry
+    // about percent encoding because the URL parser will have
+    // already taken care of that for us. Note that this only
+    // causes IDNs with an appropriate `xn--` prefix to be decoded.

+    // This is a bit tricky because of the need to convert to a Buffer
+    // followed by concatenation of the results.
+    const prefix = Buffer.from('\\\\', 'ascii');
+    const domain = Buffer.from(domainToUnicode(hostname), 'utf8');

+    return Buffer.concat([prefix, domain, decodedPathname]);
+  }
+  // Otherwise, it's a local path that requires a drive letter.
+  // In this case we're only going to pay attention to the second and
+  // third bytes in the decodedPathname (the first byte is dropped
+  // below). The second byte must be an ASCII letter, 'A'..'Z' or
+  // 'a'..'z', and the third byte must be an ASCII `:`, or the
+  // operation will fail.

+  const letter = decodedPathname[1] | 0x20;  // Setting bit 0x20 folds ASCII upper case to lower case
+  const sep = decodedPathname[2];

+  if (letter < CHAR_LOWERCASE_A || letter > CHAR_LOWERCASE_Z ||   // a..z A..Z
+      (sep !== CHAR_COLON)) {
+    throw new ERR_INVALID_FILE_URL_PATH('must be absolute');
+  }

+  // Now, we'll just return everything except the first byte of
+  // decodedPathname
+  return decodedPathname.subarray(1);
+}
+
 function getPathFromURLPosix(url) {
   if (url.hostname !== '') {
     throw new ERR_INVALID_FILE_URL_HOST(platform);
@@ -1500,6 +1577,28 @@ function getPathFromURLPosix(url) {
   return decodeURIComponent(pathname);
 }
 
+function getPathBufferFromURLPosix(url) {
+  if (url.hostname !== '') {
+    throw new ERR_INVALID_FILE_URL_HOST(platform);
+  }
+  const pathname = url.pathname;

+  // In the getPathFromURLPosix variant, we scan the input for forward slash
+  // (/) characters, specifically looking for the ASCII/UTF8 encoding and
+  // forbidding its use. This is tricky because it may conflict with non-UTF8
+  // encodings. Passing in an encoding option does not help since our Buffer
+  // encoding only knows about certain specific text encodings and a single
+  // file path might actually contain segments that use multiple encodings.
+  // It's tricky! So, for this variation where we are producing a buffer, we
+  // won't scan for the slashes at all, and instead will decode the bytes
+  // literally into the returned Buffer. We're going to do the best we can and
+  // just interpret the input url as a sequence of bytes.
+  const u8 = percentDecode(Buffer.from(pathname, 'utf8'));
+  return Buffer.from(TypedArrayPrototypeGetBuffer(u8),
+                     TypedArrayPrototypeGetByteOffset(u8),
+                     TypedArrayPrototypeGetByteLength(u8));
+}
+
 function fileURLToPath(path, options = kEmptyObject) {
   const windows = options?.windows;
   if (typeof path === 'string')
@@ -1511,6 +1610,24 @@ function fileURLToPath(path, options = kEmptyObject) {
   return (windows ?? isWindows) ? getPathFromURLWin32(path) : getPathFromURLPosix(path);
 }
 
+// An alternative to fileURLToPath that returns the path as a Buffer
+// instead of a string. fileURLToPath does not handle percent-encoded
+// bytes that are not valid UTF-8, so a Buffer is needed when the
+// string conversion would fail. Accepts a string or a URL; throws
+// for any other argument type and for URLs that are not `file:`.
+function fileURLToPathBuffer(path, options = kEmptyObject) {
+  const windows = options?.windows;
+  if (typeof path === 'string') {
+    path = new URL(path);
+  } else if (!isURL(path)) {
+    throw new ERR_INVALID_ARG_TYPE('path', ['string', 'URL'], path);
+  }
+  if (path.protocol !== 'file:') {
+    throw new ERR_INVALID_URL_SCHEME('file');
+  }
+  return (windows ?? isWindows) ? getPathBufferFromURLWin32(path) : getPathBufferFromURLPosix(path);
+}
+
 function pathToFileURL(filepath, options = kEmptyObject) {
   const windows = options?.windows ?? isWindows;
   const isUNC = windows && StringPrototypeStartsWith(filepath, '\\\\');
@@ -1571,6 +1688,7 @@ function getURLOrigin(url) {
 
 module.exports = {
   fileURLToPath,
+  fileURLToPathBuffer,
   pathToFileURL,
   toPathIfFileURL,
   installObjectURLMethods,
diff --git a/lib/url.js b/lib/url.js
@@ -60,6 +60,7 @@ const {
   domainToASCII,
   domainToUnicode,
   fileURLToPath,
+  fileURLToPathBuffer,
   pathToFileURL: _pathToFileURL,
   urlToHttpOptions,
   unsafeProtocol,
@@ -1037,5 +1038,6 @@ module.exports = {
   // Utilities
   pathToFileURL,
   fileURLToPath,
+  fileURLToPathBuffer,
   urlToHttpOptions,
 };
diff --git a/test/parallel/test-bootstrap-modules.js b/test/parallel/test-bootstrap-modules.js
@@ -105,6 +105,8 @@ expected.beforePreExec = new Set([
   'Internal Binding wasm_web_api',
   'NativeModule internal/events/abort_listener',
   'NativeModule internal/modules/typescript',
+  'NativeModule internal/data_url',
+  'NativeModule internal/mime',
 ]);
 
 expected.atRunTime = new Set([
diff --git a/test/parallel/test-fileurltopathbuffer.js b/test/parallel/test-fileurltopathbuffer.js
@@ -0,0 +1,70 @@
+'use strict';
+
+const common = require('../common');
+
+// This test does not work on macOS due to the way it handles
+// non-Unicode sequences in file names.
+if (common.isMacOS) {
+  common.skip('Test unsupported on OSX');
+}
+
+// Unfortunately, the test also does not work on Windows
+// because the writeFileSync operation will replace the
+// non-Unicode characters with replacement characters when
+// it normalizes the path.
+if (common.isWindows) {
+  common.skip('Test unsupported on Windows');
+}
+
+const tmpdir = require('../common/tmpdir');
+
+const {
+  existsSync,
+  writeFileSync,
+} = require('node:fs');
+
+const {
+  ok,
+  throws,
+} = require('node:assert');
+
+const {
+  sep,
+} = require('node:path');
+
+tmpdir.refresh();
+
+const {
+  fileURLToPath,
+  fileURLToPathBuffer,
+} = require('node:url');
+
+const kShiftJisName = '%82%A0%82%A2%82%A4';
+const kShiftJisBuffer = Buffer.from([0x82, 0xA0, 0x82, 0xA2, 0x82, 0xA4]);
+
+const testPath = tmpdir.fileURL(kShiftJisName);
+
+ok(testPath.pathname.endsWith(`/${kShiftJisName}`));
+
+const tmpdirBuffer = Buffer.from(tmpdir.path + sep, 'utf8');
+const testPathBuffer = Buffer.concat([tmpdirBuffer, kShiftJisBuffer]);
+
+// We can use the Buffer version of the path to create a file and check
+// its existence. But we cannot use the URL version because it contains
+// non-Unicode percent-encoded characters.
+throws(() => writeFileSync(testPath, 'test'), {
+  name: 'URIError',
+});
+
+writeFileSync(testPathBuffer, 'test');
+ok(existsSync(testPathBuffer));
+
+// Using fileURLToPath fails because the URL contains non-Unicode
+// percent-encoded characters.
+throws(() => existsSync(fileURLToPath(testPath)), {
+  name: 'URIError',
+});
+
+// This variation succeeds because the URL is converted to a Buffer of raw
+// percent-decoded bytes that are never interpreted as UTF-8 characters.
+ok(existsSync(fileURLToPathBuffer(testPath)));