Skip to content

Commit 04e8c41

Browse files
authored
Retrieve WHATWG spec titles from WHATWG database (#1666)
The build code used to rely on Specref to get the titles of WHATWG specifications. This update makes it fetch info for WHATWG specs directly from the WHATWG database. To save one request, the code uses the workstreams database, which fetch-groups also needs, instead of the biblio file. On top of adding a new `whatwg` value to the `"source"` field, this update also fixes the titles of the WHATWG specs: in Specref they end with "Standard", which matches the `<title>` tag, but the actual spec title in the `<h1>` and the title in the WHATWG database do not end with "Standard". #docallmeDOM The update also turns the `fetchJSON` function into a shared utility module (`src/fetch-json.js`) with an in-memory cache, which saves a few requests (not that many!) that are common to fetch-info and fetch-groups. Specific functions in fetch-info were also adjusted to do nothing when there are no specs of interest in the list (this speeds up tests a bit, but has no impact on a full build since, by definition, the full list contains specs of interest).
1 parent 475c375 commit 04e8c41

File tree

5 files changed

+136
-161
lines changed

5 files changed

+136
-161
lines changed

Diff for: schema/definitions.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@
6060

6161
"source": {
6262
"type": "string",
63-
"enum": ["w3c", "specref", "spec", "ietf"]
63+
"enum": ["w3c", "specref", "spec", "ietf", "whatwg"]
6464
},
6565

6666
"nightly": {

Diff for: src/fetch-groups.js

+7-19
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
import Octokit from "./octokit.js";
1010
import parseSpecUrl from "./parse-spec-url.js";
11+
import fetchJSON from "./fetch-json.js";
1112

1213

1314
/**
@@ -84,19 +85,6 @@ export default async function (specs, options) {
8485
// same fetch request again and again
8586
const cache = {};
8687

87-
// Helper function to retrieve a JSON resource or return null if resource
88-
// cannot be retrieved
89-
async function fetchJSON(url, options) {
90-
const body = cache[url] ?? await fetch(url, options).then(res => {
91-
if (res.status !== 200) {
92-
throw new Error(`W3C API returned an error for ${url}, status code is ${res.status}`);
93-
}
94-
return res.json();
95-
});
96-
cache[url] = body;
97-
return body;
98-
}
99-
10088
for (const spec of specs) {
10189
if (spec.__last?.standing === 'discontinued' &&
10290
(!spec.standing || spec.standing === 'discontinued')) {
@@ -113,7 +101,7 @@ export default async function (specs, options) {
113101
if (ietfName) {
114102
spec.organization = spec.organization ?? "IETF";
115103
if (spec.groups) continue;
116-
const ietfJson = await fetchJSON(`https://datatracker.ietf.org/doc/${ietfName[1]}/doc.json`);
104+
const ietfJson = await fetchJSON(`https://datatracker.ietf.org/doc/${ietfName[1]}/doc.json`, options);
117105
if (ietfJson.group?.type === "WG") {
118106
spec.groups = [{
119107
name: `${ietfJson.group.name} Working Group`,
@@ -152,7 +140,7 @@ export default async function (specs, options) {
152140
}
153141

154142
if (info && info.owner === "whatwg") {
155-
const workstreams = await fetchJSON("https://raw.githubusercontent.com/whatwg/sg/main/db.json");
143+
const workstreams = await fetchJSON("https://raw.githubusercontent.com/whatwg/sg/main/db.json", options);
156144
const workstream = workstreams.workstreams.find(ws => ws.standards.find(s => s.href === spec.url));
157145
if (!workstream) {
158146
throw new Error(`No WHATWG workstream found for ${spec.url}`);
@@ -214,11 +202,11 @@ export default async function (specs, options) {
214202
else if (info.type === "tr") {
215203
// Use the W3C API to find info about /TR specs
216204
const url = `https://api.w3.org/specifications/${info.name}/versions/latest`;
217-
let resp = await fetchJSON(url);
205+
let resp = await fetchJSON(url, options);
218206
if (!resp?._links?.deliverers) {
219207
throw new Error(`W3C API did not return deliverers for the spec`);
220208
}
221-
resp = await fetchJSON(resp._links.deliverers.href);
209+
resp = await fetchJSON(resp._links.deliverers.href, options);
222210

223211
if (!resp?._links?.deliverers) {
224212
throw new Error(`W3C API did not return deliverers for the spec`);
@@ -250,7 +238,7 @@ export default async function (specs, options) {
250238
url = new URL(spec.url);
251239
url.pathname = "/w3c.json";
252240
}
253-
const body = await fetchJSON(url.toString());
241+
const body = await fetchJSON(url.toString(), options);
254242

255243
// Note the "group" property is either an ID or an array of IDs
256244
groups = [body?.group].flat().filter(g => !!g);
@@ -261,7 +249,7 @@ export default async function (specs, options) {
261249
spec.groups = [];
262250
for (const id of groups) {
263251
const url = ('' + id).startsWith("https://") ? id : `https://api.w3.org/groups/${id}`;
264-
const info = await fetchJSON(url);
252+
const info = await fetchJSON(url, options);
265253
spec.groups.push({
266254
name: info.name,
267255
url: info._links.homepage.href

Diff for: src/fetch-info.js

+76-108
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ import loadSpec from "./load-spec.js";
4343
import computeShortname from "./compute-shortname.js";
4444
import Octokit from "./octokit.js";
4545
import ThrottledQueue from "./throttled-queue.js";
46+
import fetchJSON from "./fetch-json.js";
4647

4748
// Map spec statuses returned by Specref to those used in specs
4849
// Note we typically won't get /TR statuses from Specref, since all /TR URLs
@@ -55,8 +56,6 @@ const specrefStatusMapping = {
5556
"cg-draft": "Draft Community Group Report"
5657
};
5758

58-
const fetchQueue = new ThrottledQueue({ maxParallel: 2 });
59-
6059
async function useLastInfoForDiscontinuedSpecs(specs) {
6160
const results = {};
6261
for (const spec of specs) {
@@ -95,31 +94,22 @@ async function fetchInfoFromW3CApi(specs, options) {
9594
}
9695

9796
const url = `https://api.w3.org/specifications/${spec.shortname}/versions/latest`;
98-
const res = await fetchQueue.runThrottled(fetch, url, options);
99-
if (res.status === 404) {
100-
return;
101-
}
102-
if (res.status !== 200) {
103-
throw new Error(`W3C API returned an error, status code is ${res.status}, url was ${url}`);
104-
}
105-
106-
// Has the shortname changed from a W3C perspective?
107-
if (res.redirected) {
108-
const match = res.url.match(/\/specifications\/([^\/]+)\//);
109-
const w3cShortname = match ? match[1] : '';
110-
if (w3cShortname !== spec.shortname) {
111-
throw new Error(`W3C API redirects "${spec.shortname}" to ` +
112-
`"${w3cShortname}", update the shortname!`);
113-
}
114-
}
115-
116-
try {
117-
const body = await res.json();
118-
return body;
119-
}
120-
catch (err) {
121-
throw new Error("W3C API returned invalid JSON");
122-
}
97+
const body = await fetchJSON(url, options);
98+
99+
// The shortname of the specification may have changed. In such cases, the
100+
// W3C API silently redirects to the info for the new shortname, whereas we
101+
// want to make sure we use the latest shortname in browser-specs. The
102+
// actual shortname used by the W3C API does not appear explicitly in the
103+
// response to a "/versions/latest" request, but it appears implicitly in
104+
// the "_links/specification/href" URL.
105+
const match = body._links.specification.href.match(/\/specifications\/([^\/]+)$/);
106+
const shortname = match[1];
107+
if (shortname !== spec.shortname) {
108+
throw new Error(`W3C API redirects "${spec.shortname}" to ` +
109+
`"${shortname}", update the shortname!`);
110+
}
111+
112+
return body;
123113
}));
124114

125115
const seriesShortnames = new Set();
@@ -153,28 +143,15 @@ async function fetchInfoFromW3CApi(specs, options) {
153143
// Fetch info on the series
154144
const seriesInfo = await Promise.all([...seriesShortnames].map(async shortname => {
155145
const url = `https://api.w3.org/specification-series/${shortname}`;
156-
const res = await fetchQueue.runThrottled(fetch, url, options);
157-
if (res.status === 404) {
158-
return;
159-
}
160-
if (res.status !== 200) {
161-
throw new Error(`W3C API returned an error, status code is ${res.status}`);
162-
}
163-
try {
164-
const body = await res.json();
165-
166-
// The CSS specs and the CSS snapshots have different series shortnames for
167-
// us ("CSS" vs. "css"), but the W3C API is case-insentive, mixes the two
168-
// series, and claims that the series shortname is "CSS" or "css"
169-
// depending on which spec got published last. Let's get back to the
170-
// shortname we requested.
171-
body.shortname = shortname;
172-
173-
return body;
174-
}
175-
catch (err) {
176-
throw new Error("W3C API returned invalid JSON");
177-
}
146+
const body = await fetchJSON(url, options);
147+
148+
// The CSS specs and the CSS snapshots have different series shortnames for
149+
// us ("CSS" vs. "css"), but the W3C API is case-insensitive, mixes the two
150+
// series, and claims that the series shortname is "CSS" or "css"
151+
// depending on which spec got published last. Let's get back to the
152+
// shortname we requested.
153+
body.shortname = shortname;
154+
return body;
178155
}));
179156

180157
results.__series = {};
@@ -207,6 +184,36 @@ async function fetchInfoFromW3CApi(specs, options) {
207184
return results;
208185
}
209186

187+
async function fetchInfoFromWHATWG(specs, options) {
188+
const whatwgRe = /\.whatwg\.org/;
189+
if (!specs.find(spec => spec.url.match(whatwgRe))) {
190+
return {};
191+
}
192+
193+
// Note: The WHATWG biblio.json file could also be used, but we're going to
194+
// need the workstreams database in any case in fetch-groups, so let's fetch
195+
// the database directly (this will put it in cache for fetch-groups)
196+
const url = 'https://raw.githubusercontent.com/whatwg/sg/main/db.json';
197+
const db = await fetchJSON(url, options);
198+
const standards = db.workstreams.map(ws => ws.standards).flat();
199+
200+
const specInfo = {};
201+
for (const spec of specs) {
202+
if (!spec.url.match(/\.whatwg\.org/)) {
203+
continue;
204+
}
205+
const entry = standards.find(std => std.href === spec.url);
206+
if (!entry) {
207+
console.warn(`[warning] WHATWG spec at ${spec.url} not found in WHATWG database`);
208+
continue;
209+
}
210+
specInfo[spec.shortname] = {
211+
nightly: { url: spec.url, status: 'Living Standard' },
212+
title: entry.name
213+
};
214+
}
215+
return specInfo;
216+
}
210217

211218
async function fetchInfoFromSpecref(specs, options) {
212219
function chunkArray(arr, len) {
@@ -224,11 +231,7 @@ async function fetchInfoFromSpecref(specs, options) {
224231
// API does not return the "source" field, so we need to retrieve the list
225232
// ourselves from Specref's GitHub repository.
226233
const specrefBrowserspecsUrl = "https://raw.githubusercontent.com/tobie/specref/main/refs/browser-specs.json";
227-
const browserSpecsResponse = await fetch(specrefBrowserspecsUrl, options);
228-
if (browserSpecsResponse.status !== 200) {
229-
throw new Error(`Could not retrieve specs contributed by browser-specs to Speref, status code is ${browserSpecsResponse.status}`);
230-
}
231-
const browserSpecs = await browserSpecsResponse.json();
234+
const browserSpecs = await fetchJSON(specrefBrowserspecsUrl, options);
232235
specs = specs.filter(spec => !browserSpecs[spec.shortname.toUpperCase()]);
233236

234237
// Browser-specs now acts as source for Specref for the WICG specs and W3C
@@ -244,18 +247,7 @@ async function fetchInfoFromSpecref(specs, options) {
244247
const chunksRes = await Promise.all(chunks.map(async chunk => {
245248
let specrefUrl = "https://api.specref.org/bibrefs?refs=" +
246249
chunk.map(spec => spec.shortname).join(',');
247-
248-
const res = await fetchQueue.runThrottled(fetch, specrefUrl, options);
249-
if (res.status !== 200) {
250-
throw new Error(`Could not query Specref, status code is ${res.status}`);
251-
}
252-
try {
253-
const body = await res.json();
254-
return body;
255-
}
256-
catch (err) {
257-
throw new Error("Specref returned invalid JSON");
258-
}
250+
return fetchJSON(specrefUrl, options);
259251
}));
260252

261253
const results = {};
@@ -315,54 +307,17 @@ async function fetchInfoFromSpecref(specs, options) {
315307

316308

317309
async function fetchInfoFromIETF(specs, options) {
318-
async function fetchJSONDoc(draftName) {
319-
const url = `https://datatracker.ietf.org/doc/${draftName}/doc.json`;
320-
const res = await fetchQueue.runThrottled(fetch, url, options);
321-
if (res.status !== 200) {
322-
throw new Error(`IETF datatracker returned an error for ${url}, status code is ${res.status}`);
323-
}
324-
try {
325-
return await res.json();
326-
}
327-
catch (err) {
328-
throw new Error(`IETF datatracker returned invalid JSON for ${url}`);
329-
}
330-
}
331-
332310
async function fetchRFCName(docUrl) {
333-
const res = await fetchQueue.runThrottled(fetch, docUrl, options);
334-
if (res.status !== 200) {
335-
throw new Error(`IETF datatracker returned an error for ${url}, status code is ${res.status}`);
336-
}
337-
try {
338-
const body = await res.json();
339-
if (!body.rfc) {
340-
throw new Error(`Could not find an RFC name in ${docUrl}`);
341-
}
342-
return `rfc${body.rfc}`;
343-
}
344-
catch (err) {
345-
throw new Error(`IETF datatracker returned invalid JSON for ${url}`);
346-
}
311+
const body = await fetchJSON(docUrl, options);
312+
return `rfc${body.rfc}`;
347313
}
348314

349315
async function fetchObsoletedBy(draftName) {
350316
if (!draftName.startsWith('rfc')) {
351317
return [];
352318
}
353319
const url = `https://datatracker.ietf.org/api/v1/doc/relateddocument/?format=json&relationship__slug__in=obs&target__name__in=${draftName}`;
354-
const res = await fetchQueue.runThrottled(fetch, url, options);
355-
if (res.status !== 200) {
356-
throw new Error(`IETF datatracker returned an error for ${url}, status code is ${res.status}`);
357-
}
358-
let body;
359-
try {
360-
body = await res.json();
361-
}
362-
catch (err) {
363-
throw new Error(`IETF datatracker returned invalid JSON for ${url}`);
364-
}
365-
320+
const body = await fetchJSON(url, options);
366321
return Promise.all(body.objects
367322
.map(obj => `https://datatracker.ietf.org${obj.source}`)
368323
.map(fetchRFCName));
@@ -388,6 +343,15 @@ async function fetchInfoFromIETF(specs, options) {
388343
return paths.filter(p => p.path.match(/^specs\/rfc\d+\.html$/))
389344
.map(p => p.path.match(/(rfc\d+)\.html$/)[1]);
390345
}
346+
347+
// IETF can only provide information about IETF specs, no need to fetch the
348+
// list of RFCs of the HTTP WG if there's no IETF spec in the list.
349+
if (!specs.find(spec =>
350+
spec.url.match(/\.rfc-editor\.org/) ||
351+
spec.url.match(/datatracker\.ietf\.org/))) {
352+
return {};
353+
}
354+
391355
const httpwgRFCs = await getHttpwgRFCs();
392356

393357
const info = await Promise.all(specs.map(async spec => {
@@ -404,7 +368,8 @@ async function fetchInfoFromIETF(specs, options) {
404368
if (!draftName) {
405369
throw new Error(`IETF document follows an unexpected URL pattern: ${spec.url}`);
406370
}
407-
const jsonDoc = await fetchJSONDoc(draftName[1]);
371+
const draftUrl = `https://datatracker.ietf.org/doc/${draftName[1]}/doc.json`;
372+
const jsonDoc = await fetchJSON(draftUrl, options);
408373
const lastRevision = jsonDoc.rev_history.pop();
409374
if (lastRevision.name !== draftName[1]) {
410375
throw new Error(`IETF spec ${spec.url} published under a new name "${lastRevision.name}". Canonical URL must be updated accordingly.`);
@@ -645,13 +610,16 @@ async function fetchInfo(specs, options) {
645610
{ name: 'discontinued', fn: useLastInfoForDiscontinuedSpecs },
646611
{ name: 'w3c', fn: fetchInfoFromW3CApi },
647612
{ name: 'ietf', fn: fetchInfoFromIETF },
613+
{ name: 'whatwg', fn: fetchInfoFromWHATWG },
648614
{ name: 'specref', fn: fetchInfoFromSpecref },
649615
{ name: 'spec', fn: fetchInfoFromSpecs }
650616
];
651617
let remainingSpecs = specs;
652618
for (let i = 0; i < steps.length ; i++) {
653619
const step = steps[i];
654-
info[step.name] = await step.fn(remainingSpecs, options);
620+
info[step.name] = remainingSpecs.length > 0 ?
621+
await step.fn(remainingSpecs, options) :
622+
{};
655623
remainingSpecs = remainingSpecs.filter(spec => !info[step.name][spec.shortname]);
656624
}
657625

Diff for: src/fetch-json.js

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import ThrottledQueue from "./throttled-queue.js";
2+
3+
// Make sure we remain "friendly" with servers
4+
const fetchQueue = new ThrottledQueue({ maxParallel: 2 });
5+
6+
// Maintain a cache of fetched JSON resources in memory to avoid sending the
7+
// same fetch request again and again
8+
const cache = {};
9+
10+
/**
11+
* Fetch a JSON URL
12+
*/
13+
export default async function (url, options) {
14+
if (cache[url]) {
15+
return structuredClone(cache[url]);
16+
}
17+
const res = await fetchQueue.runThrottled(fetch, url, options);
18+
if (res.status === 404) {
19+
return null;
20+
}
21+
if (res.status !== 200) {
22+
throw new Error(`Server returned an error for ${url}, status code is ${res.status}`);
23+
}
24+
25+
try {
26+
const body = await res.json();
27+
cache[url] = body;
28+
return structuredClone(body);
29+
}
30+
catch (err) {
31+
throw new Error(`Server returned invalid JSON for ${url}`);
32+
}
33+
}

0 commit comments

Comments
 (0)