From 17fae5adcc3b4a0fbeeccc7e3f6e71b807269345 Mon Sep 17 00:00:00 2001 From: Frederik Bolding Date: Tue, 12 Mar 2024 11:54:54 +0100 Subject: [PATCH] Use lexer for extraction of markdown links (#2261) Changes the implementation of the Markdown link extraction to use the lexer from `marked`. For the actual rendering we use https://github.com/syntax-tree/mdast-util-from-markdown but since that is ESM-only I have chosen to use `marked` in this PR. --- packages/snaps-utils/coverage.json | 6 +-- packages/snaps-utils/package.json | 1 + packages/snaps-utils/src/ui.test.ts | 60 +++++++++++++++++++++++++++++ packages/snaps-utils/src/ui.ts | 36 +++++++++++------ yarn.lock | 10 +++++ 5 files changed, 98 insertions(+), 15 deletions(-) diff --git a/packages/snaps-utils/coverage.json b/packages/snaps-utils/coverage.json index ed3cb454e0..648e009c22 100644 --- a/packages/snaps-utils/coverage.json +++ b/packages/snaps-utils/coverage.json @@ -1,6 +1,6 @@ { - "branches": 96.47, - "functions": 98.62, + "branches": 96.48, + "functions": 98.64, "lines": 98.74, - "statements": 94.48 + "statements": 94.51 } diff --git a/packages/snaps-utils/package.json b/packages/snaps-utils/package.json index d92bc49352..12a07afd99 100644 --- a/packages/snaps-utils/package.json +++ b/packages/snaps-utils/package.json @@ -66,6 +66,7 @@ "cron-parser": "^4.5.0", "fast-deep-equal": "^3.1.3", "fast-json-stable-stringify": "^2.1.0", + "marked": "^12.0.1", "rfdc": "^1.3.0", "semver": "^7.5.4", "ses": "^1.1.0", diff --git a/packages/snaps-utils/src/ui.test.ts b/packages/snaps-utils/src/ui.test.ts index 4c9a8c9ffd..5402601db3 100644 --- a/packages/snaps-utils/src/ui.test.ts +++ b/packages/snaps-utils/src/ui.test.ts @@ -19,6 +19,34 @@ describe('validateTextLinks', () => { expect(() => validateTextLinks('[](https://foo.bar)', () => false), ).not.toThrow(); + + expect(() => + validateTextLinks('[[test]](https://foo.bar)', () => false), + ).not.toThrow(); + + expect(() => + validateTextLinks('[test](https://foo.bar "foo bar baz")', () => false), + ).not.toThrow(); + + expect(() => + validateTextLinks('', () => false), + ).not.toThrow(); + + expect(() => + validateTextLinks( + `[foo][1] + [1]: https://foo.bar`, + () => false, + ), + ).not.toThrow(); + + expect(() => + validateTextLinks( + `[foo][1] + [1]: https://foo.bar "foo bar baz"`, + () => false, + ), + ).not.toThrow(); }); it('throws an error if an invalid link is found in text', () => { @@ -26,6 +54,38 @@ describe('validateTextLinks', () => { validateTextLinks('[test](http://foo.bar)', () => false), ).toThrow('Invalid URL: Protocol must be one of: https:, mailto:.'); + expect(() => + validateTextLinks('[[test]](http://foo.bar)', () => false), + ).toThrow('Invalid URL: Protocol must be one of: https:, mailto:.'); + + expect(() => validateTextLinks('', () => false)).toThrow( + 'Invalid URL: Protocol must be one of: https:, mailto:.', + ); + + expect(() => + validateTextLinks('[test](http://foo.bar "foo bar baz")', () => false), + ).toThrow('Invalid URL: Protocol must be one of: https:, mailto:.'); + + expect(() => + validateTextLinks( + `[foo][1] + [1]: http://foo.bar`, + () => false, + ), + ).toThrow('Invalid URL: Protocol must be one of: https:, mailto:.'); + + expect(() => + validateTextLinks( + `[foo][1] + [1]: http://foo.bar "foo bar baz"`, + () => false, + ), + ).toThrow('Invalid URL: Protocol must be one of: https:, mailto:.'); + + expect(() => validateTextLinks('[test](#code)', () => false)).toThrow( + 'Invalid URL: Unable to parse URL.', + ); + expect(() => validateTextLinks('[test](foo.bar)', () => false)).toThrow( 'Invalid URL: Unable to parse URL.', ); diff --git a/packages/snaps-utils/src/ui.ts b/packages/snaps-utils/src/ui.ts index c1bfe3982d..0b757a0959 100644 --- a/packages/snaps-utils/src/ui.ts +++ b/packages/snaps-utils/src/ui.ts @@ -1,11 +1,31 @@ import type { Component } from '@metamask/snaps-sdk'; import { NodeType } from '@metamask/snaps-sdk'; import { assert, AssertionError } from '@metamask/utils'; - -const MARKDOWN_LINK_REGEX = /\[(?[^\]]*)\]\((?[^)]+)\)/giu; +import type { Tokens } from 'marked'; +import { lexer, walkTokens } from 'marked'; const ALLOWED_PROTOCOLS = ['https:', 'mailto:']; +/** + * Extract all links from a Markdown text string using the `marked` lexer. + * + * @param text - The markdown text string. + * @returns A list of URLs linked to in the string. + */ +function getMarkdownLinks(text: string) { + const tokens = lexer(text); + const links: (Tokens.Link | Tokens.Generic)[] = []; + + // Walk the lexed tokens and collect all link tokens + walkTokens(tokens, (token) => { + if (token.type === 'link') { + links.push(token); + } + }); + + return links.map((link) => link?.href).filter(Boolean); +} + /** * Searches for markdown links in a string and checks them against the phishing list. * @@ -18,17 +38,9 @@ export function validateTextLinks( text: string, isOnPhishingList: (url: string) => boolean, ) { - const matches = String.prototype.matchAll.call(text, MARKDOWN_LINK_REGEX); - - for (const { groups } of matches) { - const link = groups?.url; - - /* This case should never happen with the regex but the TS type allows for undefined */ - /* istanbul ignore next */ - if (!link) { - continue; - } + const links = getMarkdownLinks(text); + for (const link of links) { try { const url = new URL(link); assert( diff --git a/yarn.lock b/yarn.lock index 9f0e31ee0e..75d4b0f7ca 100644 --- a/yarn.lock +++ b/yarn.lock @@ -6008,6 +6008,7 @@ __metadata: istanbul-lib-report: ^3.0.0 istanbul-reports: ^3.1.5 jest: ^29.0.2 + marked: ^12.0.1 memfs: ^3.4.13 prettier: ^2.7.1 prettier-plugin-packagejson: ^2.2.11 @@ -17209,6 +17210,15 @@ __metadata: languageName: node linkType: hard +"marked@npm:^12.0.1": + version: 12.0.1 + resolution: "marked@npm:12.0.1" + bin: + marked: bin/marked.js + checksum: 35ebc6c4612fcc028a1cd6419321e336be5b29d3feb68dfd5aaa7fcddb399c7873cd3291d60daf342db3eede747757e4e18515f349f0ee7b84ec24254f3a4190 + languageName: node + linkType: hard + "md5.js@npm:^1.3.4": version: 1.3.5 resolution: "md5.js@npm:1.3.5"