Skip to content

Commit

Permalink
WIP: Check external resources in the main loop as well via asset.load…
Browse files Browse the repository at this point in the history
…({ metadataOnly: true })
  • Loading branch information
papandreou committed Apr 1, 2018
1 parent ada6936 commit a94ed21
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 226 deletions.
243 changes: 31 additions & 212 deletions lib/index.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
const AssetGraph = require('assetgraph');
const async = require('async');
const request = require('request');
const version = require('../package.json').version;
const relationDebugDescription = require('./relationDebugDescription');
const prettyBytes = require('pretty-bytes');
Expand Down Expand Up @@ -147,184 +146,6 @@ async function hyperlink(
};
}

/**
 * Create a node-style task that probes an external asset over HTTP and
 * records the outcome through `reportTest`.
 *
 * The first attempt is a HEAD request; every later attempt is a GET. The
 * check is retried when:
 *   - the first attempt fails with HPE_INVALID_CONSTANT,
 *   - the first attempt answers with a 4xx/5xx status,
 *   - the second attempt answers with 502 (retried after a one second pause).
 *
 * Reports that may be emitted: `external-check` (always, unless skipped or
 * retried), `content-type-missing` (2xx response without a Content-Type
 * header) and `external-redirect` (the response was reached via redirects).
 *
 * @param {object} asset - Asset to check. Reads `url`, `hostname`, `type`,
 *   `expectedTypes`, `contentType`, `urlOrDescription` and `_incoming`; may
 *   assign `contentType` and call `_tryUpgrade()`.
 * @param {number} [attempt=1] - 1-based attempt counter; selects the HTTP
 *   method and which retry rules apply.
 * @returns {function(function): void} Task that invokes its callback with no
 *   arguments once the check (including any retries) has been reported.
 */
function httpStatus(asset, attempt = 1) {
  const url = asset.url;
  const relations = asset._incoming;

  // Deduplicated debug descriptions of the relations pointing at the asset.
  // Kept as a function so each report describes the relations as they are at
  // the moment the report is built.
  const describeIncoming = () =>
    [...new Set(relations.map(r => r.debugDescription))].join(
      '\n '
    );

  const loadReport = {
    operator: 'external-check',
    name: `external-check ${url}`,
    at: describeIncoming(),
    expected: `200 ${url}`
  };

  return callback => {
    if (shouldSkip(loadReport)) {
      // Defer so the callback is never invoked synchronously on the skip path
      return setTimeout(callback);
    }

    request(
      {
        method: attempt === 1 ? 'head' : 'get',
        url: asset.url,
        strictSSL: true,
        gzip: true,
        headers: {
          'User-Agent': hyperlinkUserAgent,
          Accept:
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          'Accept-Encoding': 'gzip, deflate, sdch, br'
        }
      },
      (error, res) => {
        if (error) {
          const code = error.code;
          let actual = code || 'Unknown error';

          switch (code) {
            case 'ENOTFOUND':
              actual = `DNS missing: ${asset.hostname}`;
              break;
            case 'HPE_INVALID_CONSTANT':
              // Parse error on the HEAD response: retry once with GET
              if (attempt === 1) {
                return httpStatus(asset, attempt + 1)(callback);
              }
              break;
          }

          reportTest({
            ...loadReport,
            ok: false,
            actual
          });

          return callback();
        }

        const status = res.statusCode;

        if (status >= 200 && status < 300) {
          const contentType = res.headers['content-type'];
          if (contentType && asset.type) {
            // Extract the bare media type, dropping parameters such as charset
            const matchContentType = contentType.match(
              /^\s*([\w\-+.]+\/[\w-+.]+)(?:\s|;|$)/i
            );
            if (matchContentType && asset.expectedTypes) {
              asset.contentType = matchContentType[1].toLowerCase();
              asset._tryUpgrade();
            }
          } else if (!contentType) {
            const contentTypeMissingReport = {
              ok: false,
              name: `content-type-missing ${asset.urlOrDescription}`,
              operator: 'content-type-missing',
              expected:
                asset.contentType ||
                `A Content-Type compatible with ${asset.type}`,
              actual: contentType,
              at: describeIncoming()
            };

            if (!shouldSkip(contentTypeMissingReport)) {
              reportTest(contentTypeMissingReport);
            }
          }
        }

        // Some servers respond weirdly to HEAD requests. Make a second attempt with GET
        if (attempt === 1 && status >= 400 && status < 600) {
          return httpStatus(asset, attempt + 1)(callback);
        }

        // Some servers (jspm.io) respond with 502 if requesting HEAD, then GET too close in succession. Give the server a second to cool down
        if (attempt === 2 && status === 502) {
          setTimeout(() => httpStatus(asset, attempt + 1)(callback), 1000);
          return;
        }

        const redirects = res.request._redirect.redirects;
        if (redirects.length > 0) {
          // Each redirect record carries the status code that led TO its
          // redirectUri. Shift the codes one step so every log entry pairs a
          // url with the status that url answered with; the final url is
          // recorded as 200. Fresh objects are built so the request library's
          // own redirect records are left untouched.
          const log = [{ redirectUri: url }, ...redirects].map(
            (item, idx, arr) => ({
              ...item,
              statusCode: arr[idx + 1] ? arr[idx + 1].statusCode : 200
            })
          );

          const redirectReport = {
            operator: 'external-redirect',
            name: `external-redirect ${url}`,
            at: describeIncoming(),
            expected: `302 ${url} --> 200 ${log[log.length - 1].redirectUri}`
          };

          const actual = log
            .map(redirect => `${redirect.statusCode} ${redirect.redirectUri}`)
            .join(' --> ');

          if (!shouldSkip(redirectReport)) {
            // A single temporary redirect is allowed
            if ([302, 307].includes(log[0].statusCode)) {
              if (log.length < 3) {
                reportTest({
                  ...redirectReport,
                  expected: actual,
                  actual,
                  ok: true
                });
              } else {
                // Temporary redirect followed by further hops: a chain
                reportTest({
                  ...redirectReport,
                  expected: `${log[0].statusCode} ${url} --> 200 ${
                    log[log.length - 1].redirectUri
                  }`,
                  actual,
                  ok: false
                });
              }
            } else {
              // Any other first redirect status is reported as a failure
              reportTest({
                ...redirectReport,
                actual,
                ok: false
              });
            }
          }
        }

        if (status === 200) {
          reportTest({
            ...loadReport,
            ok: true,
            actual: loadReport.expected
          });

          return callback();
        }

        reportTest({
          ...loadReport,
          actual: `${status} ${url}`,
          ok: false
        });

        return callback();
      }
    );
  };
}

if (verbose) {
ag.on('addRelation', relation => {
console.error('addRelation', relation.toString());
Expand Down Expand Up @@ -424,9 +245,10 @@ async function hyperlink(
async function processAsset(asset) {
if (!processedAssets.has(asset)) {
processedAssets.add(asset);
const operator = asset._metadataOnly ? 'external-check' : 'load';
const loadReport = {
operator: 'load',
name: `load ${asset.urlOrDescription}`,
operator,
name: `${operator} ${asset.urlOrDescription}`,
expected: `200 ${asset.urlOrDescription}`
};

Expand All @@ -441,7 +263,8 @@ async function hyperlink(
}

try {
await asset.load();
// FIXME: Make sure we do a full load if an asset is added to the queue again in non-metadataOnly mode
await asset.load({ metadataOnly: asset._metadataOnly });

reportTest({
...loadReport,
Expand All @@ -462,6 +285,20 @@ async function hyperlink(
return;
}

// Report redirect responses (3xx). The report fails for a permanent redirect
// (301) and passes for every other 3xx status.
if (asset.statusCode >= 300 && asset.statusCode < 400) {
  // TODO: Warn about chains of temporary redirects
  const redirectRelation = asset.outgoingRelations.find(
    r => r.type === 'HttpRedirect'
  );
  // find() yields undefined when the asset has no HttpRedirect relation;
  // skip the report in that case instead of throwing on `.to`.
  if (redirectRelation) {
    reportTest({
      ok: asset.statusCode !== 301,
      operator: 'external-redirect',
      name: `external-redirect ${asset.url}`,
      at: loadReport.at,
      // The original template ended in a stray `}` that leaked into the message
      expected: `302 ${asset.url} --> 200 ${redirectRelation.to.url}`
    });
  }
}

for (const relation of asset.externalRelations) {
// Only do work for supported protocols
if (!['http:', 'https:', 'file:'].includes(relation.to.protocol)) {
Expand Down Expand Up @@ -547,8 +384,10 @@ async function hyperlink(
}

let follow;

if (
let metadataOnly = asset._metadataOnly;
if (['HttpRedirect', 'FileRedirect'].includes(relation.type)) {
follow = true;
} else if (
['HtmlPreconnectLink', 'HtmlDnsPrefetchLink'].includes(relation.type)
) {
follow = false;
Expand All @@ -568,7 +407,7 @@ async function hyperlink(
follow = true;
relation.to.stopProcessing = true;
} else {
relation.to.check = true;
metadataOnly = true;
}
}
} else if (
Expand All @@ -577,19 +416,19 @@ async function hyperlink(
if (followSourceMaps) {
follow = true;
} else {
relation.to.check = true;
metadataOnly = true;
}
} else if (
['SourceMapFile', 'SourceMapSource'].includes(relation.type)
) {
if (followSourceMaps) {
relation.to.check = true;
metadataOnly = true;
}
} else {
follow = true;
}

if (follow) {
if (follow || metadataOnly) {
if (assetTypesWithoutRelations.includes(relation.to.type)) {
// If we are handling local file-urls, follow but mark as end-of-line in processing
if (
Expand All @@ -599,15 +438,17 @@ async function hyperlink(
relation.to.stopProcessing = !recursive;
assetQueue.push(relation.to);
} else {
relation.to.check = true;
metadataOnly = true;
}
} else {
assetQueue.push(relation.to);
}
relation.to._metadataOnly = metadataOnly;
assetQueue.push(relation.to);
}
}

if (asset.type === 'Html') {
if (asset.type === 'Html' && !asset._metadataOnly) {
// Remember the set of ids in the document before unloading so incoming fragments can be checked:
asset.ids = new Set();
for (const element of Array.from(
Expand Down Expand Up @@ -680,28 +521,6 @@ async function hyperlink(
}
}

// Check urls
const assetsToCheck = ag
.findAssets({ check: true })
.filter(asset => !processedAssets.has(asset));
t.push({
name: `Crawling ${assetsToCheck.length} outgoing urls`
});

await new Promise((resolve, reject) =>
async.parallelLimit(
assetsToCheck.map(asset => httpStatus(asset)),
20,
err => {
if (err) {
reject(err);
} else {
resolve();
}
}
)
);

// Check Content-Type vs. incoming relation targetTypes:

for (const asset of ag.findAssets({ expectedTypes: { $exists: true } })) {
Expand Down
Loading

0 comments on commit a94ed21

Please sign in to comment.