Skip to content

Commit

Permalink
WIP: Check "external" resources in the main loop as well via asset.load({ metadataOnly: true })
Browse files Browse the repository at this point in the history
  • Loading branch information
papandreou committed Apr 1, 2018
1 parent 6ac9d9d commit 742d302
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 207 deletions.
225 changes: 29 additions & 196 deletions lib/index.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
const AssetGraph = require('assetgraph');
const async = require('async');
const request = require('request');
const version = require('../package.json').version;
const relationDebugDescription = require('./relationDebugDescription');
const prettyBytes = require('pretty-bytes');
Expand Down Expand Up @@ -140,171 +139,6 @@ async function hyperlink({
};
}

/**
 * Build a task that checks an external asset's URL over HTTP and reports the
 * outcome through `reportTest`.
 *
 * The first attempt issues a HEAD request; every later attempt issues a GET.
 * The task retries itself (by calling `httpStatus` with `attempt + 1`) when:
 *   - the first attempt fails with the error code `HPE_INVALID_CONSTANT`,
 *   - the first attempt answers with a 4xx/5xx status,
 *   - the second attempt answers with 502 (retried after a one second pause).
 *
 * @param {object} asset - Asset to check. Reads `url`, `_incoming`,
 *   `hostname`, `type`, `expectedTypes`, `contentType` and `urlOrDescription`;
 *   may assign `contentType` and call `_tryUpgrade()`.
 * @param {number} [attempt=1] - 1-based attempt counter.
 * @returns {function(function): *} Task taking a completion callback, which is
 *   invoked without arguments once the check has been reported (or skipped).
 */
function httpStatus(asset, attempt = 1) {
  const url = asset.url;
  const relations = asset._incoming;

  // De-duplicated debug descriptions of every incoming relation, one per line.
  const describeSources = () =>
    Array.from(
      new Set(relations.map(({ debugDescription }) => debugDescription))
    ).join('\n ');

  const loadReport = {
    operator: 'external-check',
    name: `external-check ${url}`,
    at: describeSources(),
    expected: `200 ${url}`
  };

  // Run the same check again as the next attempt, handing over the callback.
  const retry = callback => httpStatus(asset, attempt + 1)(callback);

  // Report a transport-level failure, or retry once for HPE_INVALID_CONSTANT.
  const onFailure = (error, callback) => {
    const { code } = error;

    if (code === 'HPE_INVALID_CONSTANT' && attempt === 1) {
      return retry(callback);
    }

    const actual =
      code === 'ENOTFOUND'
        ? `DNS missing: ${asset.hostname}`
        : code || 'Unknown error';

    reportTest({
      ...loadReport,
      ok: false,
      actual
    });

    return callback();
  };

  // For 2xx responses: record the served Content-Type, or report its absence.
  const inspectContentType = res => {
    const contentType = res.headers['content-type'];

    if (!contentType) {
      const missingReport = {
        ok: false,
        name: `content-type-missing ${asset.urlOrDescription}`,
        operator: 'content-type-missing',
        expected: asset.contentType || `A Content-Type compatible with ${asset.type}`,
        actual: contentType,
        at: describeSources()
      };

      if (!shouldSkip(missingReport)) {
        reportTest(missingReport);
      }
      return;
    }

    if (!asset.type) {
      return;
    }

    const mimeMatch = contentType.match(
      /^\s*([\w\-+.]+\/[\w-+.]+)(?:\s|;|$)/i
    );
    if (mimeMatch && asset.expectedTypes) {
      asset.contentType = mimeMatch[1].toLowerCase();
      asset._tryUpgrade();
    }
  };

  // Report the redirect chain that was followed to reach the final response.
  const reportRedirects = redirects => {
    // Pair every URL in the chain with the status code it answered with: the
    // status recorded on the next entry belongs to the current URL, and the
    // last URL in the chain is recorded as 200.
    const chain = [{ redirectUri: url }, ...redirects];
    for (let i = 0; i < chain.length; i += 1) {
      const next = chain[i + 1];
      chain[i].statusCode = next ? next.statusCode : 200;
    }

    const lastHop = chain[chain.length - 1];

    const redirectReport = {
      operator: 'external-redirect',
      name: `external-redirect ${url}`,
      at: describeSources(),
      expected: `302 ${url} --> 200 ${lastHop.redirectUri}`
    };

    const actual = chain
      .map(hop => `${hop.statusCode} ${hop.redirectUri}`)
      .join(' --> ');

    if (shouldSkip(redirectReport)) {
      return;
    }

    const firstStatus = chain[0].statusCode;
    let expected = redirectReport.expected;
    let ok = false;

    if (firstStatus === 302 || firstStatus === 307) {
      if (chain.length < 3) {
        // A single temporary redirect is allowed
        expected = actual;
        ok = true;
      } else {
        expected = `${firstStatus} ${url} --> 200 ${lastHop.redirectUri}`;
      }
    }

    reportTest({
      ...redirectReport,
      expected,
      actual,
      ok
    });
  };

  // Handle a received response: retry, report redirects, report final status.
  const onResponse = (res, callback) => {
    const status = res.statusCode;

    if (status >= 200 && status < 300) {
      inspectContentType(res);
    }

    // Some servers respond weirdly to HEAD requests. Make a second attempt with GET
    if (attempt === 1 && status >= 400 && status < 600) {
      return retry(callback);
    }

    // Some servers (jspm.io) respond with 502 when a HEAD and then a GET arrive
    // too close in succession. Give the server a second to cool down
    if (attempt === 2 && status === 502) {
      setTimeout(() => retry(callback), 1000);
      return;
    }

    const redirects = res.request._redirect.redirects;
    if (redirects.length > 0) {
      reportRedirects(redirects);
    }

    if (status !== 200) {
      reportTest({
        ...loadReport,
        actual: `${status} ${url}`,
        ok: false
      });
    } else {
      reportTest({
        ...loadReport,
        ok: true,
        actual: loadReport.expected
      });
    }

    return callback();
  };

  return callback => {
    if (shouldSkip(loadReport)) {
      // Skipped checks still complete asynchronously.
      return setTimeout(callback);
    }

    const options = {
      method: attempt === 1 ? 'head' : 'get',
      url: asset.url,
      strictSSL: true,
      gzip: true,
      headers: {
        'User-Agent': hyperlinkUserAgent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch, br'
      }
    };

    request(options, (error, res) =>
      error ? onFailure(error, callback) : onResponse(res, callback)
    );
  };
}

if (verbose) {
ag.on('addRelation', relation => {
console.error('addRelation', relation.toString());
Expand Down Expand Up @@ -394,9 +228,10 @@ async function hyperlink({
async function processAsset(asset) {
if (!processedAssets.has(asset)) {
processedAssets.add(asset);
const operator = asset._metadataOnly ? 'external-check' : 'load';
const loadReport = {
operator: 'load',
name: `load ${asset.urlOrDescription}`,
operator,
name: `${operator} ${asset.urlOrDescription}`,
expected: `200 ${asset.urlOrDescription}`
};

Expand All @@ -411,7 +246,8 @@ async function hyperlink({
}

try {
await asset.load();
// FIXME: Make sure we do a full load if an asset is added to the queue again in non-metadataOnly mode
await asset.load({metadataOnly: asset._metadataOnly});

reportTest({
...loadReport,
Expand All @@ -432,6 +268,18 @@ async function hyperlink({
return;
}

if (asset.statusCode >= 300 && asset.statusCode < 400) {
// TODO: Warn about chains of temporary redirects
const redirectRelation = asset.outgoingRelations.find(r => r.type === 'HttpRedirect');
reportTest({
ok: asset.statusCode !== 301,
operator: 'external-redirect',
name: `external-redirect ${asset.url}`,
at: loadReport.at,
expected: `302 ${asset.url} --> 200 ${redirectRelation.to.url}}`
});
}

for (const relation of asset.externalRelations) {
// Only do work for supported protocols
if (!['http:', 'https:', 'file:'].includes(relation.to.protocol)) {
Expand Down Expand Up @@ -463,7 +311,6 @@ async function hyperlink({
});
}
}

} else if (relation.to.type === 'Html') {
(relation.to.incomingFragments = relation.to.incomingFragments || []).push({
fragment,
Expand Down Expand Up @@ -509,8 +356,10 @@ async function hyperlink({
}

let follow;

if (['HtmlPreconnectLink', 'HtmlDnsPrefetchLink'].includes(relation.type)) {
let metadataOnly = asset._metadataOnly;
if (['HttpRedirect', 'FileRedirect'].includes(relation.type)) {
follow = true;
} else if (['HtmlPreconnectLink', 'HtmlDnsPrefetchLink'].includes(relation.type)) {
follow = false;
relation.to['check' + relation.type] = true;
} else if (['HtmlAnchor', 'SvgAnchor', 'HtmlIFrame'].includes(relation.type)) {
Expand All @@ -522,39 +371,41 @@ async function hyperlink({
follow = true;
relation.to.stopProcessing = true;
} else {
relation.to.check = true;
metadataOnly = true;
}
}
} else if (/^(?:JavaScript|Css)Source(?:Mapping)Url$/.test(relation.type)) {
if (followSourceMaps) {
follow = true;
} else {
relation.to.check = true;
metadataOnly = true;
}
} else if (['SourceMapFile', 'SourceMapSource'].includes(relation.type)) {
if (followSourceMaps) {
relation.to.check = true;
metadataOnly = true;
}
} else {
follow = true;
}

if (follow) {
if (follow || metadataOnly) {
if (assetTypesWithoutRelations.includes(relation.to.type)) {
// If we are handling local file-urls, follow but mark as end-of-line in processing
if (relation.from.protocol === 'file:' && relation.to.protocol === 'file:') {
relation.to.stopProcessing = !recursive;
assetQueue.push(relation.to);
} else {
relation.to.check = true;
metadataOnly = true;
}
} else {
assetQueue.push(relation.to);
}
relation.to._metadataOnly = metadataOnly;
assetQueue.push(relation.to);
}
}

if (asset.type === 'Html') {
if (asset.type === 'Html' && !asset._metadataOnly) {
// Remember the set of ids in the document before unloading so incoming fragments can be checked:
asset.ids = new Set();
for (const element of Array.from(asset.parseTree.querySelectorAll('[id]'))) {
Expand Down Expand Up @@ -622,24 +473,6 @@ async function hyperlink({
}
}

// Check urls
const assetsToCheck = ag.findAssets({check: true}).filter(asset => !processedAssets.has(asset));
t.push({
name: `Crawling ${assetsToCheck.length} outgoing urls`
});

await new Promise((resolve, reject) => async.parallelLimit(
assetsToCheck.map(asset => httpStatus(asset)),
20,
err => {
if (err) {
reject(err);
} else {
resolve();
}
}
));

// Check Content-Type vs. incoming relation targetTypes:

for (const asset of ag.findAssets({expectedTypes: {$exists: true}})) {
Expand Down
1 change: 0 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@
"async": "^2.6.0",
"optimist": "^0.6.1",
"pretty-bytes": "^4.0.2",
"request": "^2.83.0",
"tap-render": "Munter/tap-render#0.1.7-patch3",
"urltools": "^0.3.1"
},
Expand Down
20 changes: 10 additions & 10 deletions test/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ describe('hyperlink', function () {
name: 'load https://example.com/',
ok: true
});
t.push( { name: 'Crawling 2 outgoing urls' } );
// t.push( { name: 'Crawling 2 outgoing urls' } );
t.push(null, {
ok: true,
name: 'external-check https://google.com'
Expand Down Expand Up @@ -217,7 +217,7 @@ describe('hyperlink', function () {
ok: false,
operator: 'content-type-mismatch',
name: 'content-type-mismatch https://example.com/hey.png',
actual: 'Asset is used as both Image and Text',
actual: 'Asset is used as both Png and Text',
at: 'https://example.com/ (6:39) <img src="hey.png">'
});
});
Expand Down Expand Up @@ -260,8 +260,8 @@ describe('hyperlink', function () {
expect(t.push, 'to have a call satisfying', () => {
t.push(null, {
ok: false,
operator: 'content-type-missing',
name: 'content-type-missing https://example.com/hey.png',
operator: 'error',
actual: 'https://example.com/hey.png: No Content-Type response header received',
at: 'https://example.com/ (6:39) <img src="hey.png">'
});
});
Expand Down Expand Up @@ -332,9 +332,9 @@ describe('hyperlink', function () {
actual: expect.it('to begin with', 'ENOENT: no such file or directory')
});

t.push({
name: 'Crawling 0 outgoing urls'
});
// t.push({
// name: 'Crawling 0 outgoing urls'
// });

t.push({
name: 'Connecting to 0 hosts (checking <link rel="preconnect" href="...">'
Expand Down Expand Up @@ -673,12 +673,12 @@ describe('hyperlink', function () {
{
request: 'HEAD https://mycdn.com/404.eot',
response: 404
},
}/*,
// retry
{
request: 'GET https://mycdn.com/404.eot',
response: 404
}
}*/
]);

const t = new TapRender();
Expand All @@ -694,7 +694,7 @@ describe('hyperlink', function () {
operator: 'external-check',
name: 'external-check https://mycdn.com/404.eot',
expected: '200 https://mycdn.com/404.eot',
actual: '404 https://mycdn.com/404.eot'
actual: 'HTTP 404 Not Found'
});
});
});
Expand Down

0 comments on commit 742d302

Please sign in to comment.