Skip to content

Commit

Permalink
Handle images without an extension as 'image' MIME and '.image' exten…
Browse files Browse the repository at this point in the history
…sion (#178)

* Use a generic 'image' MIME type for inline images whose URL does not end in an image file extension

* Also bundle with EPUBs images that don’t have a characteristic file extension as `image` MIME type and `.image` file extension.

* Default MIME image type: docs & refactoring
  • Loading branch information
danburzo authored Aug 11, 2024
1 parent 035a221 commit fb5cd9f
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 78 deletions.
8 changes: 4 additions & 4 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ import {
} from './src/constants/markdown.js';

import slurp from './src/util/slurp.js';
import fileMimetype from './src/util/file-mimetype.js';
import { lookupMimetype } from './src/util/file-mimetype.js';
import epubDate from './src/util/epub-date.js';
import humanDate from './src/util/human-date.js';
import outputPath from './src/util/output-path.js';
Expand Down Expand Up @@ -169,15 +169,15 @@ async function fetchContent(ref, fetchOptions = {}) {
if (!url) {
return {
buffer: await readFile(ref),
contentType: fileMimetype(ref)
contentType: lookupMimetype(ref)
};
}

if (url && url.protocol === 'file:') {
url = decodeURI(url.href.replace(/^file:\/\//, ''));
return {
buffer: await readFile(url),
contentType: fileMimetype(url)
contentType: lookupMimetype(url)
};
}

Expand Down Expand Up @@ -978,7 +978,7 @@ async function epubgen(data, output_path, options) {
remoteResources: remoteResources.map(entry => ({
id: entry.mapped.replace(/[^a-z0-9]/gi, ''),
href: entry.mapped,
mimetype: fileMimetype(entry.mapped)
mimetype: entry.mimetype
}))
});

Expand Down
10 changes: 0 additions & 10 deletions src/constants/regex.js

This file was deleted.

5 changes: 2 additions & 3 deletions src/enhancements.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { parseSrcset, stringifySrcset } from 'srcset';
import replaceElementType from './replace-element-type.js';
import { REGEX_IMAGE_URL } from './constants/regex.js';
import { isImageURL } from './util/file-mimetype.js';

/*
Convert AMP markup to HTML markup
Expand Down Expand Up @@ -52,7 +52,6 @@ function fixLazyLoadedImages(doc) {
<img src='original-size.png'/>
*/
function imagesAtFullSize(doc) {
let include_pattern = REGEX_IMAGE_URL;
let exclude_patterns = [
/*
Exclude Wikipedia links to image file pages
Expand Down Expand Up @@ -85,7 +84,7 @@ function imagesAtFullSize(doc) {

// Only replace if the `href` matches an image file
if (
include_pattern.test(href) &&
isImageURL(href, doc) &&
!exclude_patterns.some(pattern => pattern.test(href))
) {
img.setAttribute('src', anchor.href);
Expand Down
81 changes: 33 additions & 48 deletions src/inline-images.js
Original file line number Diff line number Diff line change
@@ -1,50 +1,29 @@
import { parseSrcset, stringifySrcset } from 'srcset';
import fileMimetype from './util/file-mimetype.js';
import { getMimetypeFromURL, isImageURL } from './util/file-mimetype.js';
import fetchBase64 from './util/fetch-base64.js';

/*
Note: it is unfortunate that we use two separate mechanisms
to discern when an URL points to an image, but here we are.
`image_mimetypes` here needs to be kept in sync with the
`REGEX_IMAGE_URL` constant!
*/
const image_mimetypes = new Set([
'image/avif',
'image/bmp',
'image/gif',
'image/jpeg',
'image/png',
'image/svg+xml',
'image/tiff',
'image/webp'
]);

function get_mime(src, doc) {
let pathname = src;
try {
pathname = new URL(src, doc.baseURI).pathname;
} catch (err) {
// no-op, probably due to bad `doc.baseURI`
}
return fileMimetype(pathname);
}

export default async function inlineImages(doc, fetchOptions = {}, out) {
if (out) {
out.write('Inlining images...\n');
}
let src_promises = Array.from(
doc.querySelectorAll('picture source[src], img[src]')
).map(async el => {
const mime = get_mime(el.src, doc);
if (mime && image_mimetypes.has(mime)) {
if (out) {
out.write(el.src + '\n');
}
let data = await fetchBase64(el.src, fetchOptions);
el.setAttribute('src', `data:${mime};base64,${data}`);
/*
For web pages using atypical URLs for images
let’s just use a generic MIME type and hope it works.
For an example, see:
https://github.com/danburzo/percollate/issues/174
*/
let mime = isImageURL(el.src, doc)
? getMimetypeFromURL(el.src, doc)
: 'image';
if (out) {
out.write(el.src + '\n');
}
let data = await fetchBase64(el.src, fetchOptions);
el.setAttribute('src', `data:${mime};base64,${data}`);
});

let srcset_promises = Array.from(
Expand All @@ -71,18 +50,24 @@ export default async function inlineImages(doc, fetchOptions = {}, out) {
stringifySrcset(
await Promise.all(
items.map(async item => {
const mime = get_mime(item.url, doc);
if (mime && image_mimetypes.has(mime)) {
let data = await fetchBase64(
item.url,
fetchOptions
);
return {
...item,
url: `data:${mime};base64,${data}`
};
}
return item;
/*
For web pages using atypical URLs for images
let’s just use a generic MIME type and hope it works.
For an example, see:
https://github.com/danburzo/percollate/issues/174
*/
let mime = isImageURL(item.url, doc)
? getMimetypeFromURL(item.url, doc)
: 'image';
let data = await fetchBase64(
item.url,
fetchOptions
);
return {
...item,
url: `data:${mime};base64,${data}`
};
})
)
)
Expand Down
31 changes: 19 additions & 12 deletions src/remote-resources.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
import { randomUUID as uuid } from 'node:crypto';
import { parseSrcset, stringifySrcset } from 'srcset';
import { REGEX_IMAGE_URL } from './constants/regex.js';
import {
getMimetypeFromURL,
extForMimetype,
isImageURL
} from './util/file-mimetype.js';
import { getUrlOrigin } from './util/url-origin.js';

export default function remoteResources(doc) {
Expand All @@ -11,21 +15,24 @@ export default function remoteResources(doc) {
and return a uniquely generated file name instead.
*/
function collectAndReplace(src) {
let pathname = src;
try {
pathname = new URL(src, doc.baseURI).pathname;
} catch (err) {
// no-op, probably due to bad `doc.baseURI`.
}
let match = pathname.match(REGEX_IMAGE_URL);
if (!match) {
return src;
/*
If image URLs don’t have an extension with which
to figure out the image format, use the generic
`image` MIME media type and the `.image` extension
for EPUB remote resources.
*/
let mime = 'image',
ext = '.image';
if (isImageURL(src, doc)) {
mime = getMimetypeFromURL(src, doc);
ext = extForMimetype(mime);
}
if (!srcs.has(src)) {
srcs.set(src, {
original: src,
mapped: `rr-${uuid()}.${match[1]}`,
origin: getUrlOrigin(doc.baseURI)
mapped: `rr-${uuid()}${ext}`,
origin: getUrlOrigin(doc.baseURI),
mimetype: mime
});
}
return `./${srcs.get(src).mapped}`;
Expand Down
31 changes: 30 additions & 1 deletion src/util/file-mimetype.js
Original file line number Diff line number Diff line change
@@ -1,11 +1,40 @@
import mimetype from 'mimetype';

/*
	MIME types that count as images. `isImageURL()` reports
	a URL as an image only when the MIME type looked up for
	its pathname is a member of this set.
*/
const IMAGE_MIMETYPES = new Set([
'image/avif',
'image/bmp',
'image/gif',
'image/jpeg',
'image/png',
'image/svg+xml',
'image/tiff',
'image/webp'
]);

/*
	Add newer image formats to the MIME type database,
	so that `.webp` and `.avif` paths resolve to the
	`image/webp` and `image/avif` entries listed in
	`IMAGE_MIMETYPES`.
	NOTE(review): presumably the bundled catalog lacks
	these two extensions — confirm against the `mimetype` package.
*/
mimetype.set('.webp', 'image/webp');
mimetype.set('.avif', 'image/avif');

export default function lookup(filepath) {
export function lookupMimetype(filepath) {
return mimetype.lookup(filepath);
}

/*
	Reverse lookup in the MIME type catalog: given a MIME type,
	return the key of the first catalog entry whose value is
	strictly equal to `type`, or `undefined` when no entry matches.

	NOTE(review): catalog keys are presumably file extensions
	with a leading dot (e.g. `.png`), matching how entries are
	registered via `mimetype.set()` — confirm against the package.
*/
export function extForMimetype(type) {
	for (const [extension, registeredType] of Object.entries(
		mimetype.catalog
	)) {
		if (registeredType === type) {
			return extension;
		}
	}
	return undefined;
}

/*
	Look up the MIME type for a (possibly relative) URL.

	`src` is resolved against `doc.baseURI` and only the
	pathname of the result is passed to `lookupMimetype()`,
	so the query string and fragment are left out of the lookup.

	If the URL cannot be resolved (e.g. due to a bad or
	missing `doc.baseURI`), the raw `src` string is
	looked up instead.
*/
export function getMimetypeFromURL(src, doc) {
	const resolvePathname = () => {
		try {
			return new URL(src, doc.baseURI).pathname;
		} catch {
			// Best effort: fall back to the unresolved `src`.
			return src;
		}
	};
	return lookupMimetype(resolvePathname());
}

/*
	Tell whether a URL points to an image, judging by the MIME
	type that `getMimetypeFromURL()` returns for it.
	Returns `true` only when that MIME type is a member
	of `IMAGE_MIMETYPES`.
*/
export function isImageURL(src, doc) {
	const detectedType = getMimetypeFromURL(src, doc);
	return IMAGE_MIMETYPES.has(detectedType);
}

0 comments on commit fb5cd9f

Please sign in to comment.