feat: decode percent-encoded path in getPath (#2714)

* feat: decode percent-encoded path in `getPath` * refactor: Stop decoding URIs in the `param()` method, since they are already decoded in `getPath()` * test: add tests for decoding URI in path * chore: denoify * Revert "refactor: Stop decoding URIs in the `param()` method, since they are already decoded in `getPath()`" This reverts commit 7192497. * refactor: Replace "%25" before applying decodeURI() for avoid double decoding Co-authored-by: Szymon Marczak <[email protected]> * chore: denoify * refactor(utils): check existence of "%25" before replacing it with "%2525" Co-authored-by: Szymon Marczak <[email protected]> * feat(utils/url): Changed URL decoding to skip invalid sequences and decode as much as possible * chore: denoify --------- Co-authored-by: Szymon Marczak <[email protected]>
honojs · May 22, 2024 · 778ac10 · 778ac10
1 parent 8cb59e4
commit 778ac10
Show file tree

Hide file tree

Showing 4 changed files with 213 additions and 10 deletions.
diff --git a/benchmarks/utils/src/get-path.ts b/benchmarks/utils/src/get-path.ts
@@ -5,15 +5,61 @@ bench('noop', () => {})
 const request = new Request('http://localhost/about/me')
 
 group('getPath', () => {
-  bench('slice + indexOf', () => {
+  bench('slice + indexOf : w/o decodeURI', () => {
     const url = request.url
     const queryIndex = url.indexOf('?', 8)
-    url.slice(url.indexOf('/', 8), queryIndex === -1 ? undefined : queryIndex)
+    return url.slice(url.indexOf('/', 8), queryIndex === -1 ? undefined : queryIndex)
   })
 
-  bench('regexp', () => {
+  bench('regexp : w/o decodeURI', () => {
     const match = request.url.match(/^https?:\/\/[^/]+(\/[^?]*)/)
-    match ? match[1] : ''
+    return match ? match[1] : ''
+  })
+
+  bench('slice + indexOf', () => {
+    const url = request.url
+    const queryIndex = url.indexOf('?', 8)
+    const path = url.slice(url.indexOf('/', 8), queryIndex === -1 ? undefined : queryIndex)
+    return path.includes('%') ? decodeURIComponent(path) : path
+  })
+
+  bench('slice + for-loop + flag', () => {
+    const url = request.url
+    let start = url.indexOf('/', 8)
+    let i = start
+    let hasPercentEncoding = false
+    for (; i < url.length; i++) {
+      const charCode = url.charCodeAt(i)
+      if (charCode === 37) {
+        // '%'
+        hasPercentEncoding = true
+      } else if (charCode === 63) {
+        // '?'
+        break
+      }
+    }
+    return hasPercentEncoding ? decodeURIComponent(url.slice(start, i)) : url.slice(start, i)
+  })
+
+  bench('slice + for-loop + immediate return', () => {
+    const url = request.url
+    const start = url.indexOf('/', 8)
+    let i = start
+    for (; i < url.length; i++) {
+      const charCode = url.charCodeAt(i)
+      if (charCode === 37) {
+        // '%'
+        // If the path contains percent encoding, use `indexOf()` to find '?' and return the result immediately.
+        // Although this is a performance disadvantage, it is acceptable since we prefer cases that do not include percent encoding.
+        const queryIndex = url.indexOf('?', i)
+        const path = url.slice(start, queryIndex === -1 ? undefined : queryIndex)
+        return decodeURI(path.includes('%25') ? path.replace(/%25/g, '%2525') : path)
+      } else if (charCode === 63) {
+        // '?'
+        break
+      }
+    }
+    return url.slice(start, i)
   })
 })
 

diff --git a/deno_dist/utils/url.ts b/deno_dist/utils/url.ts
@@ -69,11 +69,48 @@ export const getPattern = (label: string): Pattern | null => {
   return null
 }
 
+/**
+ * Try to apply decodeURI() to given string.
+ * If it fails, skip invalid percent encoding or invalid UTF-8 sequences, and apply decodeURI() to the rest as much as possible.
+ * @param str The string to decode.
+ * @returns The decoded string that sometimes contains undecodable percent encoding.
+ * @example
+ * tryDecodeURI('Hello%20World') // 'Hello World'
+ * tryDecodeURI('Hello%20World/%A4%A2') // 'Hello World/%A4%A2'
+ */
+const tryDecodeURI = (str: string): string => {
+  try {
+    return decodeURI(str)
+  } catch {
+    return str.replace(/(?:%[0-9A-Fa-f]{2})+/g, (match) => {
+      try {
+        return decodeURI(match)
+      } catch {
+        return match
+      }
+    })
+  }
+}
+
 export const getPath = (request: Request): string => {
-  // Optimized: indexOf() + slice() is faster than RegExp
   const url = request.url
-  const queryIndex = url.indexOf('?', 8)
-  return url.slice(url.indexOf('/', 8), queryIndex === -1 ? undefined : queryIndex)
+  const start = url.indexOf('/', 8)
+  let i = start
+  for (; i < url.length; i++) {
+    const charCode = url.charCodeAt(i)
+    if (charCode === 37) {
+      // '%'
+      // If the path contains percent encoding, use `indexOf()` to find '?' and return the result immediately.
+      // Although this is a performance disadvantage, it is acceptable since we prefer cases that do not include percent encoding.
+      const queryIndex = url.indexOf('?', i)
+      const path = url.slice(start, queryIndex === -1 ? undefined : queryIndex)
+      return tryDecodeURI(path.includes('%25') ? path.replace(/%25/g, '%2525') : path)
+    } else if (charCode === 63) {
+      // '?'
+      break
+    }
+  }
+  return url.slice(start, i)
 }
 
 export const getQueryStrings = (url: string): string => {

diff --git a/src/hono.test.ts b/src/hono.test.ts
@@ -732,6 +732,89 @@ describe('Routing', () => {
       expect(res.status).toBe(404)
     })
   })
+
+  describe('Encoded path', () => {
+    let app: Hono
+    beforeEach(() => {
+      app = new Hono()
+    })
+
+    it('should decode path parameter', async () => {
+      app.get('/users/:id', (c) => c.text(`id is ${c.req.param('id')}`))
+
+      const res = await app.request('http://localhost/users/%C3%A7awa%20y%C3%AE%3F')
+      expect(res.status).toBe(200)
+      expect(await res.text()).toBe('id is çawa yî?')
+    })
+
+    it('should decode "/"', async () => {
+      app.get('/users/:id', (c) => c.text(`id is ${c.req.param('id')}`))
+
+      const res = await app.request('http://localhost/users/hono%2Fposts') // %2F is '/'
+      expect(res.status).toBe(200)
+      expect(await res.text()).toBe('id is hono/posts')
+    })
+
+    it('should decode alphabets', async () => {
+      app.get('/users/static', (c) => c.text('static'))
+
+      const res = await app.request('http://localhost/users/%73tatic') // %73 is 's'
+      expect(res.status).toBe(200)
+      expect(await res.text()).toBe('static')
+    })
+
+    it('should decode alphabets with invalid UTF-8 sequence', async () => {
+      app.get('/static/:path', (c) => {
+        try {
+          return c.text(`by c.req.param: ${c.req.param('path')}`) // this should throw an error
+        } catch (e) {
+          return c.text(`by c.req.url: ${c.req.url.replace(/.*\//, '')}`)
+        }
+      })
+
+      const res = await app.request('http://localhost/%73tatic/%A4%A2') // %73 is 's', %A4%A2 is invalid UTF-8 sequence
+      expect(res.status).toBe(200)
+      expect(await res.text()).toBe('by c.req.url: %A4%A2')
+    })
+
+    it('should decode alphabets with invalid percent encoding', async () => {
+      app.get('/static/:path', (c) => {
+        try {
+          return c.text(`by c.req.param: ${c.req.param('path')}`) // this should throw an error
+        } catch (e) {
+          return c.text(`by c.req.url: ${c.req.url.replace(/.*\//, '')}`)
+        }
+      })
+
+      const res = await app.request('http://localhost/%73tatic/%a') // %73 is 's', %a is invalid percent encoding
+      expect(res.status).toBe(200)
+      expect(await res.text()).toBe('by c.req.url: %a')
+    })
+
+    it('should be able to catch URIError', async () => {
+      app.onError((err, c) => {
+        if (err instanceof URIError) {
+          return c.text(err.message, 400)
+        }
+        throw err
+      })
+      app.get('/static/:path', (c) => {
+        return c.text(`by c.req.param: ${c.req.param('path')}`) // this should throw an error
+      })
+
+      const res = await app.request('http://localhost/%73tatic/%a') // %73 is 's', %a is invalid percent encoding
+      expect(res.status).toBe(400)
+      expect(await res.text()).toBe('URI malformed')
+    })
+
+    it('should not double decode', async () => {
+      app.get('/users/:id', (c) => c.text(`posts of ${c.req.param('id')}`))
+
+      const res = await app.request('http://localhost/users/%2525') // %25 is '%'
+      expect(res.status).toBe(200)
+      expect(await res.text()).toBe('posts of %25')
+    })
+  })
 })
 
 describe('param and query', () => {

diff --git a/src/utils/url.ts b/src/utils/url.ts
@@ -69,11 +69,48 @@ export const getPattern = (label: string): Pattern | null => {
   return null
 }
 
+/**
+ * Try to apply decodeURI() to given string.
+ * If it fails, skip invalid percent encoding or invalid UTF-8 sequences, and apply decodeURI() to the rest as much as possible.
+ * @param str The string to decode.
+ * @returns The decoded string that sometimes contains undecodable percent encoding.
+ * @example
+ * tryDecodeURI('Hello%20World') // 'Hello World'
+ * tryDecodeURI('Hello%20World/%A4%A2') // 'Hello World/%A4%A2'
+ */
+const tryDecodeURI = (str: string): string => {
+  try {
+    return decodeURI(str)
+  } catch {
+    return str.replace(/(?:%[0-9A-Fa-f]{2})+/g, (match) => {
+      try {
+        return decodeURI(match)
+      } catch {
+        return match
+      }
+    })
+  }
+}
+
 export const getPath = (request: Request): string => {
-  // Optimized: indexOf() + slice() is faster than RegExp
   const url = request.url
-  const queryIndex = url.indexOf('?', 8)
-  return url.slice(url.indexOf('/', 8), queryIndex === -1 ? undefined : queryIndex)
+  const start = url.indexOf('/', 8)
+  let i = start
+  for (; i < url.length; i++) {
+    const charCode = url.charCodeAt(i)
+    if (charCode === 37) {
+      // '%'
+      // If the path contains percent encoding, use `indexOf()` to find '?' and return the result immediately.
+      // Although this is a performance disadvantage, it is acceptable since we prefer cases that do not include percent encoding.
+      const queryIndex = url.indexOf('?', i)
+      const path = url.slice(start, queryIndex === -1 ? undefined : queryIndex)
+      return tryDecodeURI(path.includes('%25') ? path.replace(/%25/g, '%2525') : path)
+    } else if (charCode === 63) {
+      // '?'
+      break
+    }
+  }
+  return url.slice(start, i)
 }
 
 export const getQueryStrings = (url: string): string => {