Skip to content

Commit 75e8a11

Browse files
committed
large refactoring (part #2)
1 parent 6cc7ea2 commit 75e8a11

17 files changed

+299
-100
lines changed

website/docs-scraper/extract-content-from-markdown.mjs

Lines changed: 56 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,12 @@
33
* SPDX-License-Identifier: MPL-2.0
44
*/
55

6+
import fs from 'fs-extra';
7+
8+
import _ from 'lodash';
9+
610
// remark
711
import { unified } from 'unified';
8-
import remarkParse from 'remark-parse';
912
import { remark } from 'remark';
1013
import { visit } from 'unist-util-visit';
1114
import { selectAll } from 'unist-util-select';
@@ -15,26 +18,32 @@ import { toString } from 'mdast-util-to-string';
1518

1619
import handlebars from 'handlebars';
1720

18-
import _ from 'lodash';
19-
2021
// plugins
2122
import remarkGfm from 'remark-gfm';
22-
import remarkHtml from 'remark-html';
23+
import remarkParse from 'remark-parse';
24+
// import remarkHtml from 'remark-html';
2325
import remarkRehype from 'remark-rehype';
2426
import rehypeRaw from 'rehype-raw';
2527
import rehypeStringify from 'rehype-stringify';
28+
import remarkFrontmatter from 'remark-frontmatter';
2629

2730
// local/custom
2831
import { WCAG_CRITERIA } from './parts/getWcagCriteria.mjs';
29-
import { replaceDocTags } from './parts/replaceDocTags.mjs';
32+
import { replaceDocComponentApiTags } from './parts/replaceDocComponentApiTags.mjs';
33+
import { removeIgnoredContent } from './parts/removeIgnoredContent.mjs';
3034
import { remarkRemoveComments } from './parts/remarkRemoveComments.mjs';
3135
import { remarkRemoveCodeBlocks } from './parts/remarkRemoveCodeBlocks.mjs';
32-
import { remarkRemoveEmptyParagraphs } from './parts/remarkRemoveEmptyParagraphs.mjs';
3336
import { remarkProcessCustomImageFormat } from './parts/remarkProcessCustomImageFormat.mjs';
34-
import { remarkStripHeliosContentBlocksDelimiters } from './parts/remarkStripHeliosContentBlocksDelimiters.mjs';
37+
import { remarkStripContentBlocksDelimiters } from './parts/remarkStripContentBlocksDelimiters.mjs';
38+
import { remarkProcessDocWcagList } from './parts/remarkProcessDocWcagList.mjs';
39+
import { remarkStripDocA11ySupport } from './parts/remarkStripDocA11ySupport.mjs';
40+
import { remarkStripDocBadge } from './parts/remarkStripDocBadge.mjs';
41+
import { remarkStripDocLayout } from './parts/remarkStripDocLayout.mjs';
3542
import { remarkStripHeliosHandlebarsExpressions } from './parts/remarkStripHeliosHandlebarsExpressions.mjs';
3643
// import { remarkStripHeliosReleaseNotesMetadata } from './remarkStripHeliosReleaseNotesMetadata.mjs';
3744
import { remarkHtmlSanitise } from './parts/remarkHtmlSanitise.mjs';
45+
import { rehypeRemoveDocListContainer } from './parts/rehypeRemoveDocListContainer.mjs';
46+
import { rehypeRemoveEmptyParagraphs } from './parts/rehypeRemoveEmptyParagraphs.mjs';
3847
import { setNodesHierarchy } from './parts/setNodesHierarchy.mjs';
3948
import { stringifyChildNodes } from './parts/stringifyChildNodes.mjs';
4049

@@ -186,19 +195,32 @@ export async function parseMarkdown(markdownContent) {
186195
// PROCESSING PIPELINE
187196
// --------------------
188197

189-
// we need to convert the `<Doc::***>` components to web-components-like code (HTML compatible)
190-
const standardizedContent = replaceDocTags(markdownContent);
198+
const testFile =
199+
'/Users/cristianorastelli/src/hashicorp/design-system/website/docs/testing/markdown/scraping-playground.md';
200+
201+
// TODO! remove! override
202+
const fileContent = await fs.readFile(testFile);
203+
markdownContent = fileContent.toString();
204+
205+
// remove content included in `<!-- algolia-ignore-[start/end] -->` delimiters
206+
markdownContent = removeIgnoredContent(markdownContent);
207+
208+
// replace `<Doc::ComponentApi(::*)>` tags with web-components-like `<doc-component-api>` custom tags (HTML compatible)
209+
markdownContent = replaceDocComponentApiTags(markdownContent);
191210

192211
// MARKDOWN AST PROCESSING
212+
// -----------------------
193213

194214
let tree = await unified()
195215
// convert the markdown to AST
196216
.use(remarkParse)
197217
// interpret special GFM markdown format
198218
.use(remarkGfm)
219+
// interpret the frontmatter block
220+
.use(remarkFrontmatter, { type: 'yaml', marker: '-' })
199221
// convert markdown to HTML (TODO do we need it? what happens if we remove this?)
200222
// .use(remarkHtml, { sanitize: remarkHtmlSanitise })
201-
.parse(standardizedContent);
223+
.parse(markdownContent);
202224

203225
// ✅ process custom images (showdown.js format)
204226
tree = await unified().use(remarkProcessCustomImageFormat).run(tree);
@@ -210,42 +232,51 @@ export async function parseMarkdown(markdownContent) {
210232
tree = await unified().use(remarkRemoveCodeBlocks).run(tree);
211233

212234
// ✅ remove content blocks delimiters
213-
tree = await unified()
214-
.use(remarkStripHeliosContentBlocksDelimiters)
215-
.run(tree);
235+
tree = await unified().use(remarkStripContentBlocksDelimiters).run(tree);
216236

217-
// 🤔 remove handlebars expressions (`{{...}}`) so they don't pollute the paragraphs
218-
tree = await unified().use(remarkStripHeliosHandlebarsExpressions).run(tree);
237+
// ✅ process <Doc::WcagList/> elements and convert them to custom AST nodes
238+
tree = await unified().use(remarkProcessDocWcagList).run(tree);
219239

220-
// ✅ remove empty paragraphs
221-
tree = await unified().use(remarkRemoveEmptyParagraphs).run(tree);
240+
// ✅ remove some <Doc::***/> elements
241+
tree = await unified().use(remarkStripDocA11ySupport).run(tree);
242+
tree = await unified().use(remarkStripDocBadge).run(tree);
243+
tree = await unified().use(remarkStripDocLayout).run(tree);
244+
245+
// 🤔 remove handlebars expressions (`{{...}}`) so they don't pollute the paragraphs
246+
// tree = await unified().use(remarkStripHeliosHandlebarsExpressions).run(tree);
222247

223248
// ✅ associate to each node the hierarchy in terms of headings level
224249
tree = await unified().use(setNodesHierarchy).run(tree);
225250

226-
// console.log('TREE BEFORE', JSON.stringify(tree, null, 2));
227-
// console.log('TREE AFTER', JSON.stringify(tree, null, 2));
251+
console.log('TREE BEFORE', JSON.stringify(tree, null, 2));
228252

229253
// HTML AST PROCESSING
254+
// -------------------
230255

231-
const html = await unified()
256+
const htmlTree = await unified()
232257
.use(remarkParse)
233-
// .use(remarkHtml, { sanitize: remarkHtmlSanitise })
234258
// .use(remarkRehype)
235259
.use(remarkRehype, { allowDangerousHtml: true })
236260
.use(rehypeRaw)
237-
.use(rehypeStringify)
261+
.use(rehypeStringify, { closeSelfClosing: true })
238262
.run(tree);
239263

240-
console.log('HTML', JSON.stringify(html, null, 2));
264+
// ✅ remove Doc::ListContainer (doc::listcontainer) wrappers
265+
tree = await unified().use(rehypeRemoveDocListContainer).run(htmlTree);
266+
267+
// ✅ remove empty paragraphs
268+
tree = await unified().use(rehypeRemoveEmptyParagraphs).run(htmlTree);
269+
270+
console.log('HTML AFTER', JSON.stringify(htmlTree, null, 2));
241271

242-
// process the relevant nodes
272+
// EXTRACT CONTENT FROM RELEVANT NODES
273+
// -----------------------------------
243274

244275
// // parse and index "doc" (ember) nodes for special components
245276
// .use(wcagListMapper)
246277
// .use(componentApiMapper)
247278

248-
// parse and index standard nodes
279+
// parse and index relevant nodes
249280
// await unified()
250281
// .use(headingMapper)
251282
// .use(paragraphMapper)

website/docs-scraper/index-website-content.mjs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@
1111
// DEV MODE VARIABLES
1212

1313
// used in development mode to skip API calls
14-
const DEV_SKIP_API_CALLS = false;
14+
const DEV_SKIP_API_CALLS = true;
1515
// used in development to process only the "testing" markdown files
16-
const DEV_MARKDOWN_TESTING = false;
16+
const DEV_MARKDOWN_TESTING = true;
1717

1818
// ===================================================
1919

website/docs-scraper/parts/debugLogNodes.mjs

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,21 @@
66
import { visit } from 'unist-util-visit';
77

88
export const debugLogNodes = () => (tree) => {
9+
console.log('==========================================================');
910
visit(tree, (node, _index, parent) => {
1011
// if (node.type === 'text') {
11-
if (node.type !== 'root') {
12-
// if (node.type === 'text' && parent.type === 'paragraph') {
13-
console.log('TYPE', node.type);
14-
console.log('LOG', JSON.stringify(node, null, 2));
15-
// if (node.value) {
16-
// console.log('VALUE', node.value);
17-
// }
18-
console.log('-----------------');
12+
// if (node.type !== 'root') {
13+
// if (node.type === 'text' && parent.type === 'paragraph') {
14+
console.log('TYPE', node.type);
15+
if (node.value) {
16+
console.log('VALUE', node.value);
1917
}
18+
// console.log('LOG', JSON.stringify(node, null, 2));
19+
// if (node.value) {
20+
// console.log('VALUE', node.value);
21+
// }
22+
console.log('-----------------');
23+
// }
2024
});
25+
console.log('==========================================================');
2126
};
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
/**
2+
* Copyright (c) HashiCorp, Inc.
3+
* SPDX-License-Identifier: MPL-2.0
4+
*/
5+
6+
import { remove } from 'unist-util-remove';
7+
8+
/**
 * Rehype plugin that deletes every `doc::listcontainer` element from the
 * HTML AST.
 *
 * NOTE(review): `remove` drops the matching node itself, so whatever is
 * nested inside the container is dropped with it — confirm this is the
 * intended behaviour (as opposed to unwrapping the container's children).
 *
 * @returns {(tree: object) => void} transformer that mutates the tree in place
 */
export const rehypeRemoveDocListContainer = () => (tree) => {
  // `remove(tree[, options], test)`: the middle argument is an options bag,
  // not a node-type filter, so the element check lives in the test itself
  remove(
    tree,
    (node) => node.type === 'element' && node.tagName === 'doc::listcontainer'
  );
};
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
/**
2+
* Copyright (c) HashiCorp, Inc.
3+
* SPDX-License-Identifier: MPL-2.0
4+
*/
5+
6+
import { visit, SKIP } from 'unist-util-visit';
7+
8+
/**
 * Rehype plugin that removes `<p>` elements which are empty: paragraphs with
 * no `children`, with an empty `children` list, or whose children are all
 * text nodes containing only whitespace.
 *
 * @returns {(tree: object) => void} transformer that mutates the tree in place
 */
export const rehypeRemoveEmptyParagraphs = () => (tree) => {
  const isBlankText = (child) =>
    child.type === 'text' && child.value.trim() === '';

  visit(tree, 'element', (element, position, container) => {
    if (element.tagName !== 'p') {
      return;
    }

    const kids = element.children;
    const hasContent =
      Boolean(kids) && kids.length > 0 && !kids.every(isBlankText);
    if (hasContent) {
      return;
    }

    // see: https://unifiedjs.com/learn/recipe/remove-node/
    // after splicing, the next sibling sits at the same index, so the
    // traversal is told to skip the removed node and resume from there
    container.children.splice(position, 1);
    return [SKIP, position];
  });
};
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
/**
2+
* Copyright (c) HashiCorp, Inc.
3+
* SPDX-License-Identifier: MPL-2.0
4+
*/
5+
6+
import { visit } from 'unist-util-visit';
7+
8+
/**
 * Remark plugin that turns a text node containing a `<Doc::WcagList />`
 * invocation into a custom `doc-wcag-list` node.
 *
 * The matched node is mutated in place: its `type` becomes `doc-wcag-list`,
 * the criteria passed to the `array` helper are stored (unquoted) in
 * `criteria`, and the original `value` is removed.
 *
 * @returns {(tree: object) => void} transformer that mutates the tree in place
 */
export const remarkProcessDocWcagList = () => (tree) => {
  visit(tree, 'text', (node) => {
    // eg. <Doc::WcagList @criteriaList={{array "1.1.1" "1.2.1" ... }} />
    // - the capture is lazy so it stops at the first `}}`
    // - whitespace around the list and before `/>` is optional
    const match = node.value.match(
      /<Doc::WcagList\s+@criteriaList=\{\{array\b\s*(.*?)\s*\}\}\s*\/>/i
    );
    if (!match) {
      return;
    }

    node.type = 'doc-wcag-list';
    // split on any run of whitespace and drop empty entries, so repeated or
    // trailing spaces inside `{{array ...}}` never produce '' criteria
    node.criteria = match[1].replaceAll('"', '').split(/\s+/).filter(Boolean);
    // NOTE(review): shallow copy of `position` kept from the original implementation
    node.position = { ...node.position };
    delete node.value;
  });
};

website/docs-scraper/parts/remarkRemoveEmptyParagraphs.mjs

Lines changed: 0 additions & 22 deletions
This file was deleted.
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
/**
2+
* Copyright (c) HashiCorp, Inc.
3+
* SPDX-License-Identifier: MPL-2.0
4+
*/
5+
6+
import { visit } from 'unist-util-visit';
7+
8+
/**
 * Remark plugin that blanks out content-block delimiter lines in text nodes,
 * i.e. every line that starts with `!!!` (with or without trailing text on
 * the same line). Line breaks around the delimiters are left in place.
 *
 * @returns {(tree: object) => void} transformer that mutates the tree in place
 */
export const remarkStripContentBlocksDelimiters = () => (tree) => {
  visit(tree, 'text', (node) => {
    // a single multiline pass covers both opening (`!!! Info`) and closing
    // (`!!!`) delimiters; a follow-up `\n!!!` pass could never match, since
    // any `!!!` that follows a newline is already at the start of a line
    node.value = node.value.replace(/^!!!.*$/gm, '');
  });
};
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
/**
2+
* Copyright (c) HashiCorp, Inc.
3+
* SPDX-License-Identifier: MPL-2.0
4+
*/
5+
6+
import { visit } from 'unist-util-visit';
7+
8+
/**
 * Remark plugin that deletes every `<Doc::A11ySupport />` tag (matched
 * case-insensitively) from the value of each text node.
 *
 * @returns {(tree: object) => void} transformer that mutates the tree in place
 */
export const remarkStripDocA11ySupport = () => {
  // eg. <Doc::A11ySupport />
  const a11ySupportTag = /<Doc::A11ySupport \/>/gi;

  return (tree) => {
    visit(tree, 'text', (textNode) => {
      const stripped = textNode.value.replace(a11ySupportTag, '');
      textNode.value = stripped;
    });
  };
};
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
/**
2+
* Copyright (c) HashiCorp, Inc.
3+
* SPDX-License-Identifier: MPL-2.0
4+
*/
5+
6+
import { visit } from 'unist-util-visit';
7+
8+
/**
 * Remark plugin that deletes `<Doc::Badge ...>...</Doc::Badge>` elements
 * (opening tag, content and closing tag; matched case-insensitively) from the
 * value of each text node.
 *
 * @returns {(tree: object) => void} transformer that mutates the tree in place
 */
export const remarkStripDocBadge = () => (tree) => {
  visit(tree, 'text', (node) => {
    // eg. <Doc::Badge @type='success'>Conformant</Doc::Badge>
    // the `g` flag matters: a single text node can hold several badges, and
    // without it only the first one would be stripped
    node.value = node.value.replace(/<Doc::Badge .*?>.*?<\/Doc::Badge>/gi, '');
  });
};

0 commit comments

Comments
 (0)