fix(gatsby-source-wordpress): HTML image regexes (#29778) (#29883)

Co-authored-by: gatsbybot <[email protected]> (cherry picked from commit f6edccf) Co-authored-by: Tyler Barnes <[email protected]>
gatsbyjs · Mar 1, 2021 · c2ea9b9 · c2ea9b9
1 parent 706a754
commit c2ea9b9
Show file tree

Hide file tree

Showing 3 changed files with 68 additions and 31 deletions.
diff --git a/packages/gatsby-source-wordpress/__tests__/process-node.test.js b/packages/gatsby-source-wordpress/__tests__/process-node.test.js
@@ -0,0 +1,25 @@
+import {
+  getImgSrcRemoteFileMatchesFromNodeString,
+  getImgTagMatchesWithUrl,
+} from "../dist/steps/source-nodes/create-nodes/process-node"
+
+test(`HTML image transformation regex matches images`, async () => {
+  const wpUrl = `http://wp.fakesite.com`
+
+  const nodeString = `<img src=\\"https://wp.fakesite.com/wp-content/uploads/2020/01/©SDM-Yep-©Hi-000-Header.jpg />
+
+  <img src=\\"http://wp.fakesite.com/wp-content/uploads/2020/01/©SDM-Yep-©Hi-000-Header.jpg />
+
+  <img src=\\"/wp-content/uploads/2020/01/©SDM-Yep-©Hi-000-Header.jpg />`
+
+  const matches = getImgSrcRemoteFileMatchesFromNodeString(nodeString)
+
+  expect(matches.length).toBe(3)
+
+  const imgTagMatches = getImgTagMatchesWithUrl({
+    nodeString,
+    wpUrl,
+  })
+
+  expect(imgTagMatches.length).toBe(3)
+})
diff --git a/packages/gatsby-source-wordpress/src/steps/source-nodes/create-nodes/process-node.js b/packages/gatsby-source-wordpress/src/steps/source-nodes/create-nodes/process-node.js
@@ -1,3 +1,4 @@
+/* eslint-disable no-useless-escape */
 import { isWebUri } from "valid-url"
 import { fluid } from "gatsby-plugin-sharp"
 import Img from "gatsby-image"
@@ -30,7 +31,7 @@ const getNodeEditLink = node => {
 
 const findReferencedImageNodeIds = ({ nodeString, pluginOptions, node }) => {
   // if the lazyNodes plugin option is set we don't need to find
-  // image node id's because those nodes will be fetched lazily in resolvers
+  // image node id's because those nodes will be fetched lazily in resolvers.
   if (pluginOptions.type.MediaItem.lazyNodes) {
     return []
   }
@@ -327,6 +328,17 @@ const getCheerioElementFromMatch = wpUrl => ({ match, tag = `img` }) => {
   }
 }
 
+const getCheerioElementsFromMatches = ({ imgTagMatches, wpUrl }) =>
+  imgTagMatches
+    .map(getCheerioElementFromMatch(wpUrl))
+    .filter(({ cheerioImg: { attribs } }) => {
+      if (!attribs.src) {
+        return false
+      }
+
+      return isWebUri(encodeURI(attribs.src))
+    })
+
 const getLargestSizeFromSizesAttribute = sizesString => {
   const sizesStringsArray = sizesString.split(`,`)
 
@@ -444,6 +456,28 @@ const cacheCreatedFileNodeBySrc = ({ node, src }) => {
   }
 }
 
+const imgSrcRemoteFileRegex = /(?:src=\\")((?:(?:https?|ftp|file):\/\/|www\.|ftp\.|\/)(?:[^'"])*\.(?:jpeg|jpg|png|gif|ico|mpg|ogv|svg|bmp|tif|tiff))(\?[^\\" \.]*|)(?=\\"| |\.)/gim
+
+export const getImgSrcRemoteFileMatchesFromNodeString = nodeString =>
+  execall(imgSrcRemoteFileRegex, nodeString).filter(({ subMatches }) => {
+    // if our match is json encoded, that means it's inside a JSON
+    // encoded string field.
+    const isInJSON = subMatches[0].includes(`\\/\\/`)
+
+    // we shouldn't process encoded JSON, so skip this match if it's JSON
+    return !isInJSON
+  })
+
+export const getImgTagMatchesWithUrl = ({ nodeString, wpUrl }) =>
+  execall(
+    /<img([\w\W]+?)[\/]?>/gim,
+    nodeString
+      // we don't want to match images inside pre
+      .replace(/<pre([\w\W]+?)[\/]?>.*(<\/pre>)/gim, ``)
+      // and code tags, so temporarily remove those tags and everything inside them
+      .replace(/<code([\w\W]+?)[\/]?>.*(<\/code>)/gim, ``)
+  ).filter(filterMatches(wpUrl))
+
 const replaceNodeHtmlImages = async ({
   nodeString,
   node,
@@ -456,38 +490,15 @@ const replaceNodeHtmlImages = async ({
     return nodeString
   }
 
-  const imgSrcRemoteFileRegex = /(?:src=\\")((?:(?:https?|ftp|file):\/\/|www\.|ftp\.|\/)(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[A-Z0-9+&@#/%=~_|$])\.(?:jpeg|jpg|png|gif|ico|mpg|ogv|svg|bmp|tif|tiff))(\?[^\\" .]*|)(?=\\"| |\.)/gim
+  const imageUrlMatches = getImgSrcRemoteFileMatchesFromNodeString(nodeString)
 
-  const imageUrlMatches = execall(imgSrcRemoteFileRegex, nodeString).filter(
-    ({ subMatches }) => {
-      // if our match is json encoded, that means it's inside a JSON
-      // encoded string field.
-      const isInJSON = subMatches[0].includes(`\\/\\/`)
-
-      // we shouldn't process encoded JSON, so skip this match if it's JSON
-      return !isInJSON
-    }
-  )
-
-  const imgTagMatches = execall(
-    /<img([\w\W]+?)[/]?>/gim,
-    nodeString
-      // we don't want to match images inside pre
-      .replace(/<pre([\w\W]+?)[/]?>.*(<\/pre>)/gim, ``)
-      // and code tags, so temporarily remove those tags and everything inside them
-      .replace(/<code([\w\W]+?)[/]?>.*(<\/code>)/gim, ``)
-  ).filter(filterMatches(wpUrl))
+  const imgTagMatches = getImgTagMatchesWithUrl({ nodeString, wpUrl })
 
   if (imageUrlMatches.length && imgTagMatches.length) {
-    const cheerioImages = imgTagMatches
-      .map(getCheerioElementFromMatch(wpUrl))
-      .filter(({ cheerioImg: { attribs } }) => {
-        if (!attribs.src) {
-          return false
-        }
-
-        return isWebUri(attribs.src)
-      })
+    const cheerioImages = getCheerioElementsFromMatches({
+      imgTagMatches,
+      wpUrl,
+    })
 
     const htmlMatchesToMediaItemNodesMap = await fetchNodeHtmlImageMediaItemNodes(
       {

diff --git a/...atsby-source-wordpress/src/steps/source-nodes/fetch-nodes/fetch-referenced-media-items.js b/...atsby-source-wordpress/src/steps/source-nodes/fetch-nodes/fetch-referenced-media-items.js
@@ -211,7 +211,8 @@ export const stripImageSizesFromUrl = url => {
   const fileExtension = urlToFileExtension(url)
 
   const imageSizesPattern = new RegExp(
-    `(?:[-_]([0-9]+)x([0-9]+))${fileExtension ? `.${fileExtension}` : ``}`
+    // eslint-disable-next-line no-useless-escape
+    `(?:[-_]([0-9]+)x([0-9]+))${fileExtension ? `\.${fileExtension}` : ``}`
   )
 
   let urlWithoutSizes = url.replace(imageSizesPattern, ``)