Handle images without an extension as 'image' MIME and '.image' extension (#178)

* Use a generic 'image' MIME type for inline images whose URL does not end in an image file extension
* Also bundle with EPUBs images that don’t have a characteristic file extension as `image` MIME type and `.image` file extension.
* Default MIME image type: docs & refactoring
danburzo · Aug 11, 2024 · fb5cd9f · fb5cd9f
1 parent 035a221
commit fb5cd9f
Show file tree

Hide file tree

Showing 6 changed files with 88 additions and 78 deletions.
diff --git a/index.js b/index.js
@@ -29,7 +29,7 @@ import {
 } from './src/constants/markdown.js';
 
 import slurp from './src/util/slurp.js';
-import fileMimetype from './src/util/file-mimetype.js';
+import { lookupMimetype } from './src/util/file-mimetype.js';
 import epubDate from './src/util/epub-date.js';
 import humanDate from './src/util/human-date.js';
 import outputPath from './src/util/output-path.js';
@@ -169,15 +169,15 @@ async function fetchContent(ref, fetchOptions = {}) {
 	if (!url) {
 		return {
 			buffer: await readFile(ref),
-			contentType: fileMimetype(ref)
+			contentType: lookupMimetype(ref)
 		};
 	}
 
 	if (url && url.protocol === 'file:') {
 		url = decodeURI(url.href.replace(/^file:\/\//, ''));
 		return {
 			buffer: await readFile(url),
-			contentType: fileMimetype(url)
+			contentType: lookupMimetype(url)
 		};
 	}
 
@@ -978,7 +978,7 @@ async function epubgen(data, output_path, options) {
 			remoteResources: remoteResources.map(entry => ({
 				id: entry.mapped.replace(/[^a-z0-9]/gi, ''),
 				href: entry.mapped,
-				mimetype: fileMimetype(entry.mapped)
+				mimetype: entry.mimetype
 			}))
 		});
 

diff --git a/src/constants/regex.js b/src/constants/regex.js
diff --git a/src/enhancements.js b/src/enhancements.js
@@ -1,6 +1,6 @@
 import { parseSrcset, stringifySrcset } from 'srcset';
 import replaceElementType from './replace-element-type.js';
-import { REGEX_IMAGE_URL } from './constants/regex.js';
+import { isImageURL } from './util/file-mimetype.js';
 
 /* 
 	Convert AMP markup to HMTL markup
@@ -52,7 +52,6 @@ function fixLazyLoadedImages(doc) {
 		<img src='original-size.png'/>
 */
 function imagesAtFullSize(doc) {
-	let include_pattern = REGEX_IMAGE_URL;
 	let exclude_patterns = [
 		/*
 			Exclude Wikipedia links to image file pages
@@ -85,7 +84,7 @@ function imagesAtFullSize(doc) {
 
 		// Only replace if the `href` matches an image file
 		if (
-			include_pattern.test(href) &&
+			isImageURL(href, doc) &&
 			!exclude_patterns.some(pattern => pattern.test(href))
 		) {
 			img.setAttribute('src', anchor.href);

diff --git a/src/inline-images.js b/src/inline-images.js
@@ -1,50 +1,29 @@
 import { parseSrcset, stringifySrcset } from 'srcset';
-import fileMimetype from './util/file-mimetype.js';
+import { getMimetypeFromURL, isImageURL } from './util/file-mimetype.js';
 import fetchBase64 from './util/fetch-base64.js';
 
-/* 
-	Note: it is unfortunate that we use two separate mechanisms
-	to discern when an URL points to an image, but here we are.
-
-	`image_mimetypes` here needs to be kept in sync with the
-	`REGEX_IMAGE_URL` constant!
-*/
-const image_mimetypes = new Set([
-	'image/avif',
-	'image/bmp',
-	'image/gif',
-	'image/jpeg',
-	'image/png',
-	'image/svg+xml',
-	'image/tiff',
-	'image/webp'
-]);
-
-function get_mime(src, doc) {
-	let pathname = src;
-	try {
-		pathname = new URL(src, doc.baseURI).pathname;
-	} catch (err) {
-		// no-op, probably due to bad `doc.baseURI`
-	}
-	return fileMimetype(pathname);
-}
-
 export default async function inlineImages(doc, fetchOptions = {}, out) {
 	if (out) {
 		out.write('Inlining images...\n');
 	}
 	let src_promises = Array.from(
 		doc.querySelectorAll('picture source[src], img[src]')
 	).map(async el => {
-		const mime = get_mime(el.src, doc);
-		if (mime && image_mimetypes.has(mime)) {
-			if (out) {
-				out.write(el.src + '\n');
-			}
-			let data = await fetchBase64(el.src, fetchOptions);
-			el.setAttribute('src', `data:${mime};base64,${data}`);
+		/*
+			For web pages using atypical URLs for images
+			let’s just use a generic MIME type and hope it works.
+			
+			For an example, see:
+			https://github.com/danburzo/percollate/issues/174
+		*/
+		let mime = isImageURL(el.src, doc)
+			? getMimetypeFromURL(el.src, doc)
+			: 'image';
+		if (out) {
+			out.write(el.src + '\n');
 		}
+		let data = await fetchBase64(el.src, fetchOptions);
+		el.setAttribute('src', `data:${mime};base64,${data}`);
 	});
 
 	let srcset_promises = Array.from(
@@ -71,18 +50,24 @@ export default async function inlineImages(doc, fetchOptions = {}, out) {
 				stringifySrcset(
 					await Promise.all(
 						items.map(async item => {
-							const mime = get_mime(item.url, doc);
-							if (mime && image_mimetypes.has(mime)) {
-								let data = await fetchBase64(
-									item.url,
-									fetchOptions
-								);
-								return {
-									...item,
-									url: `data:${mime};base64,${data}`
-								};
-							}
-							return item;
+							/*
+								For web pages using atypical URLs for images
+								let’s just use a generic MIME type and hope it works.
+								
+								For an example, see:
+								https://github.com/danburzo/percollate/issues/174
+							*/
+							let mime = isImageURL(item.url, doc)
+								? getMimetypeFromURL(item.url, doc)
+								: 'image';
+							let data = await fetchBase64(
+								item.url,
+								fetchOptions
+							);
+							return {
+								...item,
+								url: `data:${mime};base64,${data}`
+							};
 						})
 					)
 				)

diff --git a/src/remote-resources.js b/src/remote-resources.js
@@ -1,6 +1,10 @@
 import { randomUUID as uuid } from 'node:crypto';
 import { parseSrcset, stringifySrcset } from 'srcset';
-import { REGEX_IMAGE_URL } from './constants/regex.js';
+import {
+	getMimetypeFromURL,
+	extForMimetype,
+	isImageURL
+} from './util/file-mimetype.js';
 import { getUrlOrigin } from './util/url-origin.js';
 
 export default function remoteResources(doc) {
@@ -11,21 +15,24 @@ export default function remoteResources(doc) {
 		and return a uniquely generated file name instead.
 	 */
 	function collectAndReplace(src) {
-		let pathname = src;
-		try {
-			pathname = new URL(src, doc.baseURI).pathname;
-		} catch (err) {
-			// no-op, probably due to bad `doc.baseURI`.
-		}
-		let match = pathname.match(REGEX_IMAGE_URL);
-		if (!match) {
-			return src;
+		/*
+			If image URLs don’t have an extension with which
+			to figure out the image format, use the generic
+			`image` MIME media type and the `.image` extension
+			for EPUB remote resources.
+		*/
+		let mime = 'image',
+			ext = '.image';
+		if (isImageURL(src, doc)) {
+			mime = getMimetypeFromURL(src, doc);
+			ext = extForMimetype(mime);
 		}
 		if (!srcs.has(src)) {
 			srcs.set(src, {
 				original: src,
-				mapped: `rr-${uuid()}.${match[1]}`,
-				origin: getUrlOrigin(doc.baseURI)
+				mapped: `rr-${uuid()}${ext}`,
+				origin: getUrlOrigin(doc.baseURI),
+				mimetype: mime
 			});
 		}
 		return `./${srcs.get(src).mapped}`;

diff --git a/src/util/file-mimetype.js b/src/util/file-mimetype.js
@@ -1,11 +1,40 @@
 import mimetype from 'mimetype';
 
+const IMAGE_MIMETYPES = new Set([
+	'image/avif',
+	'image/bmp',
+	'image/gif',
+	'image/jpeg',
+	'image/png',
+	'image/svg+xml',
+	'image/tiff',
+	'image/webp'
+]);
+
 /*
 	Add newer image formats to the MIME type database.
  */
 mimetype.set('.webp', 'image/webp');
 mimetype.set('.avif', 'image/avif');
 
-export default function lookup(filepath) {
+export function lookupMimetype(filepath) {
 	return mimetype.lookup(filepath);
 }
+
+export function extForMimetype(type) {
+	return Object.entries(mimetype.catalog).find(it => it[1] === type)?.[0];
+}
+
+export function getMimetypeFromURL(src, doc) {
+	let pathname = src;
+	try {
+		pathname = new URL(src, doc.baseURI).pathname;
+	} catch (err) {
+		// no-op, probably due to bad `doc.baseURI`
+	}
+	return lookupMimetype(pathname);
+}
+
+export function isImageURL(src, doc) {
+	return IMAGE_MIMETYPES.has(getMimetypeFromURL(src, doc));
+}