Skip to content

Commit a7a829d

Browse files
committed
Check autolinks from bikeshed specs
Given w3c/reffy#1584
1 parent 3673f9d commit a7a829d

File tree

1 file changed

+156
-144
lines changed

1 file changed

+156
-144
lines changed

src/lib/study-backrefs.js

+156-144
Original file line numberDiff line numberDiff line change
@@ -237,168 +237,180 @@ function studyBackrefs (edResults, trResults = [], htmlFragments = {}, shortname
237237
trResults = trResults || [];
238238
const report = [];
239239

240-
const recordAnomaly = recordCategorizedAnomaly(report, 'links', possibleAnomalies);
241-
242240
edResults.forEach(spec => {
243241
if (shortnameFilter && spec.shortname !== shortnameFilter) return;
244-
Object.keys(spec.links?.rawlinks || {})
245-
.filter(matchSpecUrl)
246-
.forEach(link => {
247-
let shortname;
248-
if (spec.links.rawlinks[link].specShortname) {
249-
shortname = spec.links.rawlinks[link].specShortname;
250-
} else {
251-
let nakedLink = link;
252-
253-
// Ignoring links to PDF specs
254-
if (nakedLink.endsWith('.pdf')) {
255-
return;
256-
}
257-
258-
if (nakedLink.endsWith('.html')) {
259-
nakedLink = nakedLink.replace(/\/(Overview|overview|index)\.html$/, '/');
260-
}
261-
if (nakedLink[nakedLink.length - 1] !== '/' && !nakedLink.endsWith(".html")) {
262-
nakedLink += '/';
263-
}
264-
265-
// Detect links to dated specs
266-
const match = nakedLink.match(/www\.w3\.org\/TR\/[0-9]{4}\/([A-Z]+)-(.+)-[0-9]{8}\//);
267-
if (match) {
268-
// ED should not link to dated versions of the spec, unless it
269-
// voluntarily links to previous versions of itself
270-
if ((match[2] !== spec.shortname || outdatedShortnames[match[2]] === spec.shortname) && !['REC', 'NOTE'].includes(match[1])) {
271-
recordAnomaly(spec, 'datedUrls', link);
272-
}
242+
studyLinks(spec, spec.links?.rawlinks, report, edResults, trResults, htmlFragments);
243+
// given the current limitation of classification of links for bikeshed
244+
// https://github.com/w3c/reffy/issues/1584
245+
// we also check autolinks for bikeshed specs
246+
if (spec.generator === "bikeshed") {
247+
studyLinks(spec, spec.links?.autolinks, report, edResults, trResults, htmlFragments);
248+
}
249+
});
250+
return report;
251+
}
273252

274-
// TODO: consider pursuing the analysis with the non-dated version,
275-
// but note this may trigger some obscure broken fragment messages
276-
// when a fragment exists in the dated version but no longer exists
277-
// in the ED.
278-
return;
279-
}
253+
function studyLinks(spec, links, report, edResults, trResults, htmlFragments) {
254+
if (!links) return;
280255

281-
// Check whether the naked link matches any known URL in the crawl
282-
shortname = (edResults.find(r =>
283-
r.url === nakedLink ||
284-
(r.release && r.release.url === nakedLink) ||
285-
(r.nightly && r.nightly.url === nakedLink) ||
286-
(r.series && nakedLink === `https://www.w3.org/TR/${r.series.shortname}/`) && r.series.currentSpecification === r.shortname) || {}).shortname;
287-
288-
// If it does not match any known URL, try to compute a shortname out of
289-
// it directly.
290-
if (!shortname) {
291-
try {
292-
shortname = computeShortname(nakedLink);
293-
} catch (e) {
294-
recordAnomaly(spec, 'unknownSpecs', link);
295-
return;
296-
}
297-
}
298-
}
256+
const recordAnomaly = recordCategorizedAnomaly(report, 'links', possibleAnomalies);
299257

300-
if ((link.match(/w3\.org/) || link.match(/w3c\.github\.io/)) && shortNamesOfTransferedSpecs[shortname]) {
301-
// The specification should no longer be referenced.
302-
// In theory, we could still try to match the anchor against the
303-
// right spec. In practice, these outdated specs are sufficiently
304-
// outdated that it does not make a lot of sense to do so.
305-
recordAnomaly(spec, 'outdatedSpecs', link);
306-
return;
258+
Object.keys(links)
259+
.filter(matchSpecUrl)
260+
.forEach(link => {
261+
let shortname;
262+
if (links[link].specShortname) {
263+
shortname = links[link].specShortname;
264+
} else {
265+
let nakedLink = link;
266+
267+
// Ignoring links to PDF specs
268+
if (nakedLink.endsWith('.pdf')) {
269+
return;
270+
}
271+
272+
if (nakedLink.endsWith('.html')) {
273+
nakedLink = nakedLink.replace(/\/(Overview|overview|index)\.html$/, '/');
307274
}
308-
// Links to WHATWG commit snapshots
309-
if (link.match(/spec\.whatwg\.org/) && link.match(/commit-snapshots/)) {
310-
recordAnomaly(spec, 'outdatedSpecs', link);
311-
return;
275+
if (nakedLink[nakedLink.length - 1] !== '/' && !nakedLink.endsWith(".html")) {
276+
nakedLink += '/';
312277
}
313278

314-
if (link.match(/heycam\.github\.io/)) {
315-
recordAnomaly(spec, 'nonCanonicalRefs', link);
316-
shortname = 'webidl';
317-
}
318-
if (outdatedShortnames[shortname]) {
319-
shortname = outdatedShortnames[shortname];
320-
recordAnomaly(spec, 'nonCanonicalRefs', link);
279+
// Detect links to dated specs
280+
const match = nakedLink.match(/www\.w3\.org\/TR\/[0-9]{4}\/([A-Z]+)-(.+)-[0-9]{8}\//);
281+
if (match) {
282+
// ED should not link to dated versions of the spec, unless it
283+
// voluntarily links to previous versions of itself
284+
if ((match[2] !== spec.shortname || outdatedShortnames[match[2]] === spec.shortname) && !['REC', 'NOTE'].includes(match[1])) {
285+
recordAnomaly(spec, 'datedUrls', link);
286+
}
287+
288+
// TODO: consider pursuing the analysis with the non-dated version,
289+
// but note this may trigger some obscure broken fragment messages
290+
// when a fragment exists in the dated version but no longer exists
291+
// in the ED.
292+
return;
321293
}
322294

323-
// At this point, we managed to associate the link with a shortname,
324-
// let's check whether the shortname matches a spec in the crawl,
325-
// matching the exact spec shortname if possible, or the series
326-
// shortname otherwise (in which case we'll use the current spec)
327-
const sourceSpec =
328-
edResults.find(s => s.shortname === shortname) ||
329-
edResults.find(s => s.series.shortname === shortname && s.series.currentSpecification === s.shortname);
330-
if (!sourceSpec) {
331-
if (!shortnameOfNonNormativeDocs.includes(shortname)) {
295+
// Check whether the naked link matches any known URL in the crawl
296+
shortname = (edResults.find(r =>
297+
r.url === nakedLink ||
298+
(r.release && r.release.url === nakedLink) ||
299+
(r.nightly && r.nightly.url === nakedLink) ||
300+
(r.series && nakedLink === `https://www.w3.org/TR/${r.series.shortname}/`) && r.series.currentSpecification === r.shortname) || {}).shortname;
301+
302+
// If it does not match any known URL, try to compute a shortname out of
303+
// it directly.
304+
if (!shortname) {
305+
try {
306+
shortname = computeShortname(nakedLink);
307+
} catch (e) {
332308
recordAnomaly(spec, 'unknownSpecs', link);
309+
return;
333310
}
334-
return;
335311
}
336-
if (sourceSpec.error) {
337-
// no point in reporting an error on failed crawls
338-
return;
339-
}
340-
341-
// Self-references might be broken because of ED vs TR, ignore that
342-
if (shortname === spec.shortname || shortname === spec.series.shortname) {
343-
return;
312+
}
313+
314+
if ((link.match(/w3\.org/) || link.match(/w3c\.github\.io/)) && shortNamesOfTransferedSpecs[shortname]) {
315+
// The specification should no longer be referenced.
316+
// In theory, we could still try to match the anchor against the
317+
// right spec. In practice, these outdated specs are sufficiently
318+
// outdated that it does not make a lot of sense to do so.
319+
recordAnomaly(spec, 'outdatedSpecs', link);
320+
return;
321+
}
322+
// Links to WHATWG commit snapshots
323+
if (link.match(/spec\.whatwg\.org/) && link.match(/commit-snapshots/)) {
324+
recordAnomaly(spec, 'outdatedSpecs', link);
325+
return;
326+
}
327+
328+
if (link.match(/heycam\.github\.io/)) {
329+
recordAnomaly(spec, 'nonCanonicalRefs', link);
330+
shortname = 'webidl';
331+
}
332+
if (outdatedShortnames[shortname]) {
333+
shortname = outdatedShortnames[shortname];
334+
recordAnomaly(spec, 'nonCanonicalRefs', link);
335+
}
336+
337+
// At this point, we managed to associate the link with a shortname,
338+
// let's check whether the shortname matches a spec in the crawl,
339+
// matching the exact spec shortname if possible, or the series
340+
// shortname otherwise (in which case we'll use the current spec)
341+
const sourceSpec =
342+
edResults.find(s => s.shortname === shortname) ||
343+
edResults.find(s => s.series.shortname === shortname && s.series.currentSpecification === s.shortname);
344+
if (!sourceSpec) {
345+
if (!shortnameOfNonNormativeDocs.includes(shortname)) {
346+
recordAnomaly(spec, 'unknownSpecs', link);
344347
}
345-
346-
// Look for a corresponding entry in the TR crawl, which we'll use to
347-
// distinguish between broken links and "evolving" links (meaning links
348-
// that exist in the TR version but no longer exist in the ED)
349-
const trSourceSpec =
350-
trResults.find(s => s.shortname === shortname) ||
351-
trResults.find(s => s.series.shortname === shortname && s.series.currentSpecification === s.shortname) ||
352-
{};
353-
const headings = sourceSpec.headings || [];
354-
const dfns = sourceSpec.dfns || [];
355-
const ids = sourceSpec.ids || [];
356-
357-
// Check anchors
358-
const anchors = spec.links.rawlinks[link].anchors || [];
359-
for (const anchor of anchors) {
360-
const baseLink = (sourceSpec.nightly?.url === link || sourceSpec.nightly?.pages?.includes(link)) ? link : sourceSpec.nightly?.url;
361-
const matchFullNightlyLink = matchAnchor(baseLink, anchor);
362-
const matchFullReleaseLink = matchAnchor((sourceSpec.release || sourceSpec.nightly).url, anchor);
363-
const isKnownId = ids.find(matchFullNightlyLink);
364-
const heading = headings.find(h => matchFullNightlyLink(h.href));
365-
const dfn = dfns.find(d => matchFullNightlyLink(d.href));
366-
if (!isKnownId) {
367-
if ((trSourceSpec.ids || []).find(matchFullReleaseLink) && link.match(/w3\.org\/TR\//)) {
368-
recordAnomaly(spec, 'evolvingLinks', link + '#' + anchor);
348+
return;
349+
}
350+
if (sourceSpec.error) {
351+
// no point in reporting an error on failed crawls
352+
return;
353+
}
354+
355+
// Self-references might be broken because of ED vs TR, ignore that
356+
if (shortname === spec.shortname || shortname === spec.series.shortname) {
357+
return;
358+
}
359+
360+
// Look for a corresponding entry in the TR crawl, which we'll use to
361+
// distinguish between broken links and "evolving" links (meaning links
362+
// that exist in the TR version but no longer exist in the ED)
363+
const trSourceSpec =
364+
trResults.find(s => s.shortname === shortname) ||
365+
trResults.find(s => s.series.shortname === shortname && s.series.currentSpecification === s.shortname) ||
366+
{};
367+
const headings = sourceSpec.headings || [];
368+
const dfns = sourceSpec.dfns || [];
369+
const ids = sourceSpec.ids || [];
370+
371+
// Check anchors
372+
const anchors = links[link].anchors || [];
373+
for (const anchor of anchors) {
374+
const baseLink = (sourceSpec.nightly?.url === link || sourceSpec.nightly?.pages?.includes(link)) ? link : sourceSpec.nightly?.url;
375+
const matchFullNightlyLink = matchAnchor(baseLink, anchor);
376+
const matchFullReleaseLink = matchAnchor((sourceSpec.release || sourceSpec.nightly).url, anchor);
377+
const isKnownId = ids.find(matchFullNightlyLink);
378+
const heading = headings.find(h => matchFullNightlyLink(h.href));
379+
const dfn = dfns.find(d => matchFullNightlyLink(d.href));
380+
if (!isKnownId) {
381+
if ((trSourceSpec.ids || []).find(matchFullReleaseLink) && link.match(/w3\.org\/TR\//)) {
382+
recordAnomaly(spec, 'evolvingLinks', link + '#' + anchor);
383+
} else {
384+
if (link.startsWith('https://html.spec.whatwg.org/C') || link.startsWith('http://html.spec.whatwg.org/C')) {
385+
recordAnomaly(spec, 'nonCanonicalRefs', link);
386+
link = link.replace('http:', 'https:').replace('https://html.spec.whatwg.org/C', 'https://html.spec.whatwg.org/multipage');
387+
}
388+
// Links to single-page version of HTML spec
389+
if (link === 'https://html.spec.whatwg.org/' &&
390+
// is there an equivalent id in the multipage spec?
391+
ids.find(i => i.startsWith('https://html.spec.whatwg.org/multipage/') &&
392+
(i.endsWith('#' + anchor) || i.endsWith('#' + decodeURIComponent(anchor)) || i.endsWith('#' + encodeURIComponent(anchor))))) {
393+
// Should we keep track of those? ignoring for now
394+
} else if (link.startsWith('https://html.spec.whatwg.org/multipage') && htmlFragments &&
395+
htmlFragments[anchor] &&
396+
ids.find(matchAnchor(`https://html.spec.whatwg.org/multipage/${htmlFragments[anchor]}.html`, anchor))) {
397+
// Deal with anchors that are JS-redirected from
398+
// the multipage version of HTML
399+
recordAnomaly(spec, 'frailLinks', link + '#' + anchor);
400+
} else if (anchor.startsWith(':~:text=')) {
401+
// links using text fragments are inherently fragile
402+
recordAnomaly(spec, 'frailLinks', link + '#' + anchor);
369403
} else {
370-
if (link.startsWith('https://html.spec.whatwg.org/C') || link.startsWith('http://html.spec.whatwg.org/C')) {
371-
recordAnomaly(spec, 'nonCanonicalRefs', link);
372-
link = link.replace('http:', 'https:').replace('https://html.spec.whatwg.org/C', 'https://html.spec.whatwg.org/multipage');
373-
}
374-
// Links to single-page version of HTML spec
375-
if (link === 'https://html.spec.whatwg.org/' &&
376-
// is there an equivalent id in the multipage spec?
377-
ids.find(i => i.startsWith('https://html.spec.whatwg.org/multipage/') &&
378-
(i.endsWith('#' + anchor) || i.endsWith('#' + decodeURIComponent(anchor)) || i.endsWith('#' + encodeURIComponent(anchor))))) {
379-
// Should we keep track of those? ignoring for now
380-
} else if (link.startsWith('https://html.spec.whatwg.org/multipage') && htmlFragments &&
381-
htmlFragments[anchor] &&
382-
ids.find(matchAnchor(`https://html.spec.whatwg.org/multipage/${htmlFragments[anchor]}.html`, anchor))) {
383-
// Deal with anchors that are JS-redirected from
384-
// the multipage version of HTML
385-
recordAnomaly(spec, 'frailLinks', link + '#' + anchor);
386-
} else if (anchor.startsWith(':~:text=')) {
387-
// links using text fragments are inherently fragile
388-
recordAnomaly(spec, 'frailLinks', link + '#' + anchor);
389-
} else {
390-
recordAnomaly(spec, 'brokenLinks', link + '#' + anchor);
391-
}
404+
recordAnomaly(spec, 'brokenLinks', link + '#' + anchor);
392405
}
393-
} else if (!heading && !dfn) {
394-
recordAnomaly(spec, 'notDfn', link + '#' + anchor);
395-
} else if (dfn && dfn.access !== 'public') {
396-
recordAnomaly(spec, 'notExported', link + '#' + anchor);
397406
}
407+
} else if (!heading && !dfn) {
408+
recordAnomaly(spec, 'notDfn', link + '#' + anchor);
409+
} else if (dfn && dfn.access !== 'public') {
410+
recordAnomaly(spec, 'notExported', link + '#' + anchor);
398411
}
399-
});
400-
});
401-
return report;
412+
}
413+
});
402414
}
403415

404416
/**************************************************

0 commit comments

Comments
 (0)