large refactoring (part #2)

didoo · didoo · commit 75e8a117de87 · 2023-11-29T22:41:57.000Z
diff --git a/website/docs-scraper/extract-content-from-markdown.mjs b/website/docs-scraper/extract-content-from-markdown.mjs
@@ -3,9 +3,12 @@
  * SPDX-License-Identifier: MPL-2.0
  */
 
+import fs from 'fs-extra';
+
+import _ from 'lodash';
+
 // remark
 import { unified } from 'unified';
-import remarkParse from 'remark-parse';
 import { remark } from 'remark';
 import { visit } from 'unist-util-visit';
 import { selectAll } from 'unist-util-select';
@@ -15,26 +18,32 @@ import { toString } from 'mdast-util-to-string';
 
 import handlebars from 'handlebars';
 
-import _ from 'lodash';
-
 // plugins
 import remarkGfm from 'remark-gfm';
-import remarkHtml from 'remark-html';
+import remarkParse from 'remark-parse';
+// import remarkHtml from 'remark-html';
 import remarkRehype from 'remark-rehype';
 import rehypeRaw from 'rehype-raw';
 import rehypeStringify from 'rehype-stringify';
+import remarkFrontmatter from 'remark-frontmatter';
 
 // local/custom
 import { WCAG_CRITERIA } from './parts/getWcagCriteria.mjs';
-import { replaceDocTags } from './parts/replaceDocTags.mjs';
+import { replaceDocComponentApiTags } from './parts/replaceDocComponentApiTags.mjs';
+import { removeIgnoredContent } from './parts/removeIgnoredContent.mjs';
 import { remarkRemoveComments } from './parts/remarkRemoveComments.mjs';
 import { remarkRemoveCodeBlocks } from './parts/remarkRemoveCodeBlocks.mjs';
-import { remarkRemoveEmptyParagraphs } from './parts/remarkRemoveEmptyParagraphs.mjs';
 import { remarkProcessCustomImageFormat } from './parts/remarkProcessCustomImageFormat.mjs';
-import { remarkStripHeliosContentBlocksDelimiters } from './parts/remarkStripHeliosContentBlocksDelimiters.mjs';
+import { remarkStripContentBlocksDelimiters } from './parts/remarkStripContentBlocksDelimiters.mjs';
+import { remarkProcessDocWcagList } from './parts/remarkProcessDocWcagList.mjs';
+import { remarkStripDocA11ySupport } from './parts/remarkStripDocA11ySupport.mjs';
+import { remarkStripDocBadge } from './parts/remarkStripDocBadge.mjs';
+import { remarkStripDocLayout } from './parts/remarkStripDocLayout.mjs';
 import { remarkStripHeliosHandlebarsExpressions } from './parts/remarkStripHeliosHandlebarsExpressions.mjs';
 // import { remarkStripHeliosReleaseNotesMetadata } from './remarkStripHeliosReleaseNotesMetadata.mjs';
 import { remarkHtmlSanitise } from './parts/remarkHtmlSanitise.mjs';
+import { rehypeRemoveDocListContainer } from './parts/rehypeRemoveDocListContainer.mjs';
+import { rehypeRemoveEmptyParagraphs } from './parts/rehypeRemoveEmptyParagraphs.mjs';
 import { setNodesHierarchy } from './parts/setNodesHierarchy.mjs';
 import { stringifyChildNodes } from './parts/stringifyChildNodes.mjs';
 
@@ -186,19 +195,32 @@ export async function parseMarkdown(markdownContent) {
   // PROCESSING PIPELINE
   // --------------------
 
-  // we need to convert the `<Doc::***>` components to web-components-like code (HTML compatible)
-  const standardizedContent = replaceDocTags(markdownContent);
+  const testFile =
+    '/Users/cristianorastelli/src/hashicorp/design-system/website/docs/testing/markdown/scraping-playground.md';
+
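+  // TODO: remove before merging! dev-only override: discards the `markdownContent` argument and reads the hard-coded local test file instead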
+  const fileContent = await fs.readFile(testFile);
+  markdownContent = fileContent.toString();
+
+  // remove content included in `<!-- algolia-ignore-[start/end] -->` delimiters
+  markdownContent = removeIgnoredContent(markdownContent);
+
+  // replace `<Doc::ComponentApi(::*)>` tags with web-components-like `<doc-component-api>` custom tags (HTML compatible)
+  markdownContent = replaceDocComponentApiTags(markdownContent);
 
   // MARKDOWN AST PROCESSING
+  // -----------------------
 
   let tree = await unified()
     // convert the markdown to AST
     .use(remarkParse)
     // interpret special GFM markdown format
     .use(remarkGfm)
+    // interpret the frontmatter block
+    .use(remarkFrontmatter, { type: 'yaml', marker: '-' })
     // convert markdown to HTML (TODO do we need it? what happens if we remove this?)
     // .use(remarkHtml, { sanitize: remarkHtmlSanitise })
-    .parse(standardizedContent);
+    .parse(markdownContent);
 
   // ✅ process custom images (showdown.js format)
   tree = await unified().use(remarkProcessCustomImageFormat).run(tree);
@@ -210,42 +232,51 @@ export async function parseMarkdown(markdownContent) {
   tree = await unified().use(remarkRemoveCodeBlocks).run(tree);
 
   // ✅ remove content blocks delimiters
-  tree = await unified()
-    .use(remarkStripHeliosContentBlocksDelimiters)
-    .run(tree);
+  tree = await unified().use(remarkStripContentBlocksDelimiters).run(tree);
 
-  // 🤔 remove handlebars expressions (`{{...}}`) so they don't pollute the paragraphs
-  tree = await unified().use(remarkStripHeliosHandlebarsExpressions).run(tree);
+  // ✅ process <Doc::WcagList/> elements and convert them to a custom AST node
+  tree = await unified().use(remarkProcessDocWcagList).run(tree);
 
-  // ✅ remove empty paragraphs
-  tree = await unified().use(remarkRemoveEmptyParagraphs).run(tree);
+  // ✅ remove some <Doc::***/> elements
+  tree = await unified().use(remarkStripDocA11ySupport).run(tree);
+  tree = await unified().use(remarkStripDocBadge).run(tree);
+  tree = await unified().use(remarkStripDocLayout).run(tree);
+
+  // 🤔 remove handlebars expressions (`{{...}}`) so they don't pollute the paragraphs
+  // tree = await unified().use(remarkStripHeliosHandlebarsExpressions).run(tree);
 
   // ✅ associate to each node the hierarchy in terms of headings level
   tree = await unified().use(setNodesHierarchy).run(tree);
 
-  // console.log('TREE BEFORE', JSON.stringify(tree, null, 2));
-  // console.log('TREE AFTER', JSON.stringify(tree, null, 2));
+  console.log('TREE BEFORE', JSON.stringify(tree, null, 2));
 
   // HTML AST PROCESSING
+  // -------------------
 
-  const html = await unified()
+  const htmlTree = await unified()
     .use(remarkParse)
-    // .use(remarkHtml, { sanitize: remarkHtmlSanitise })
     // .use(remarkRehype)
     .use(remarkRehype, { allowDangerousHtml: true })
     .use(rehypeRaw)
-    .use(rehypeStringify)
+    .use(rehypeStringify, { closeSelfClosing: true })
     .run(tree);
 
-  console.log('HTML', JSON.stringify(html, null, 2));
+  // ✅ remove Doc::ListContainer (doc::listcontainer) wrappers
+  tree = await unified().use(rehypeRemoveDocListContainer).run(htmlTree);
+
+  // ✅ remove empty paragraphs
+  tree = await unified().use(rehypeRemoveEmptyParagraphs).run(htmlTree);
+
+  console.log('HTML AFTER', JSON.stringify(htmlTree, null, 2));
 
-  // process the relevant nodes
+  // EXTRACT CONTENT FROM RELEVANT NODES
+  // -----------------------------------
 
   // // parse and index "doc" (ember) nodes for special components
   // .use(wcagListMapper)
   // .use(componentApiMapper)
 
-  // parse and index standard nodes
+  // parse and index relevant nodes
   // await unified()
   //   .use(headingMapper)
   //   .use(paragraphMapper)
diff --git a/website/docs-scraper/index-website-content.mjs b/website/docs-scraper/index-website-content.mjs
@@ -11,9 +11,9 @@
 // DEV MODE VARIABLES
 
 // used in development mode to skip API calls
-const DEV_SKIP_API_CALLS = false;
+const DEV_SKIP_API_CALLS = true;
 // used in development to process only the "testing" markdown files
-const DEV_MARKDOWN_TESTING = false;
+const DEV_MARKDOWN_TESTING = true;
 
 // ===================================================
 
diff --git a/website/docs-scraper/parts/debugLogNodes.mjs b/website/docs-scraper/parts/debugLogNodes.mjs
@@ -6,16 +6,21 @@
 import { visit } from 'unist-util-visit';
 
 export const debugLogNodes = () => (tree) => {
+  console.log('==========================================================');
   visit(tree, (node, _index, parent) => {
     // if (node.type === 'text') {
-    if (node.type !== 'root') {
-      // if (node.type === 'text' && parent.type === 'paragraph') {
-      console.log('TYPE', node.type);
-      console.log('LOG', JSON.stringify(node, null, 2));
-      // if (node.value) {
-      //   console.log('VALUE', node.value);
-      // }
-      console.log('-----------------');
+    // if (node.type !== 'root') {
+    // if (node.type === 'text' && parent.type === 'paragraph') {
+    console.log('TYPE', node.type);
+    if (node.value) {
+      console.log('VALUE', node.value);
     }
+    // console.log('LOG', JSON.stringify(node, null, 2));
+    // if (node.value) {
+    //   console.log('VALUE', node.value);
+    // }
+    console.log('-----------------');
+    // }
   });
+  console.log('==========================================================');
 };
diff --git a/website/docs-scraper/parts/rehypeRemoveDocListContainer.mjs b/website/docs-scraper/parts/rehypeRemoveDocListContainer.mjs
@@ -0,0 +1,24 @@
+/**
+ * Copyright (c) HashiCorp, Inc.
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+import { remove } from 'unist-util-remove';
+
+/**
+ * Rehype plugin that deletes every `doc::listcontainer` element from the tree.
+ *
+ * NOTE(review): `remove` drops the matched node together with its whole
+ * subtree. If the intent is only to unwrap the container and keep the list
+ * inside it, the children must be spliced into the parent instead - confirm.
+ *
+ * @returns {(tree: object) => void} transformer that edits `tree` in place
+ */
+export const rehypeRemoveDocListContainer = () => (tree) => {
+  // `remove(tree[, options], test)`: the test is the last argument, so the node
+  // type check lives inside the predicate rather than in the `options` slot
+  remove(
+    tree,
+    (node) => node.type === 'element' && node.tagName === 'doc::listcontainer'
+  );
+};
diff --git a/website/docs-scraper/parts/rehypeRemoveEmptyParagraphs.mjs b/website/docs-scraper/parts/rehypeRemoveEmptyParagraphs.mjs
@@ -0,0 +1,35 @@
+/**
+ * Copyright (c) HashiCorp, Inc.
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+import { visit, SKIP } from 'unist-util-visit';
+
+/**
+ * Rehype plugin that deletes `<p>` elements which have no children, or whose
+ * children are all whitespace-only text nodes.
+ *
+ * @returns {(tree: object) => void} transformer that edits `tree` in place
+ */
+export const rehypeRemoveEmptyParagraphs = () => (tree) => {
+  const isBlankText = (child) =>
+    child.type === 'text' && child.value.trim() === '';
+
+  visit(tree, 'element', (element, position, container) => {
+    if (element.tagName !== 'p') {
+      return undefined;
+    }
+
+    const kids = element.children;
+    const hasContent =
+      Boolean(kids) && kids.length !== 0 && !kids.every(isBlankText);
+    if (hasContent) {
+      return undefined;
+    }
+
+    // see: https://unifiedjs.com/learn/recipe/remove-node/
+    // the following sibling slides into `position`, so the walk resumes there
+    container.children.splice(position, 1);
+    return [SKIP, position];
+  });
+};
diff --git a/website/docs-scraper/parts/remarkProcessDocWcagList.mjs b/website/docs-scraper/parts/remarkProcessDocWcagList.mjs
@@ -0,0 +1,36 @@
+/**
+ * Copyright (c) HashiCorp, Inc.
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+import { visit } from 'unist-util-visit';
+
+/**
+ * Remark plugin that converts a text node holding a
+ * `<Doc::WcagList @criteriaList={{array ...}} />` tag into a custom
+ * `doc-wcag-list` node carrying the listed criteria in `node.criteria`.
+ *
+ * The whole text node is converted (its `value` is deleted), so any other text
+ * sharing the node with the tag is dropped.
+ *
+ * @returns {(tree: object) => void} transformer that edits `tree` in place
+ */
+export const remarkProcessDocWcagList = () => (tree) => {
+  visit(tree, 'text', (node) => {
+    // eg. <Doc::WcagList @criteriaList={{array "1.1.1" "1.2.1" ... }} />
+    const match = node.value.match(
+      /<Doc::WcagList @criteriaList={{array (.*)}} \/>/i
+    );
+    if (match) {
+      node.type = 'doc-wcag-list';
+      // split on whitespace runs and drop empty entries, so doubled or trailing
+      // spaces inside `{{array ... }}` never yield '' criteria
+      node.criteria = match[1]
+        .replaceAll('"', '')
+        .split(/\s+/)
+        .filter(Boolean);
+      node.position = { ...node.position };
+      delete node.value;
+    }
+  });
+};
diff --git a/website/docs-scraper/parts/remarkRemoveEmptyParagraphs.mjs b/website/docs-scraper/parts/remarkRemoveEmptyParagraphs.mjs
diff --git a/website/docs-scraper/parts/remarkStripContentBlocksDelimiters.mjs b/website/docs-scraper/parts/remarkStripContentBlocksDelimiters.mjs
@@ -0,0 +1,21 @@
+/**
+ * Copyright (c) HashiCorp, Inc.
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+import { visit } from 'unist-util-visit';
+
+/**
+ * Remark plugin that strips the `!!!` content-block delimiter lines from every
+ * text node.
+ *
+ * @returns {(tree: object) => void} transformer that edits `tree` in place
+ */
+export const remarkStripContentBlocksDelimiters = () => (tree) => {
+  visit(tree, 'text', (textNode) => {
+    // first pass: empty out each line that starts with `!!!` (rest of line too)
+    const withoutDelimiterLines = textNode.value.replace(/^!!!.*$/gm, '');
+    // second pass: drop a bare `!!!` line along with the newline before it
+    textNode.value = withoutDelimiterLines.replace(/\n!!!$/gm, '');
+  });
+};
diff --git a/website/docs-scraper/parts/remarkStripDocA11ySupport.mjs b/website/docs-scraper/parts/remarkStripDocA11ySupport.mjs
@@ -0,0 +1,22 @@
+/**
+ * Copyright (c) HashiCorp, Inc.
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+import { visit } from 'unist-util-visit';
+
+/**
+ * Remark plugin that removes the `<Doc::A11ySupport />` tag (matched
+ * case-insensitively) from every text node.
+ *
+ * @returns {(tree: object) => void} transformer that edits `tree` in place
+ */
+export const remarkStripDocA11ySupport = () => (tree) => {
+  // eg. <Doc::A11ySupport />
+  const a11ySupportTag = /<Doc::A11ySupport \/>/gi;
+
+  visit(tree, 'text', (textNode) => {
+    const stripped = textNode.value.replace(a11ySupportTag, '');
+    textNode.value = stripped;
+  });
+};
diff --git a/website/docs-scraper/parts/remarkStripDocBadge.mjs b/website/docs-scraper/parts/remarkStripDocBadge.mjs
@@ -0,0 +1,20 @@
+/**
+ * Copyright (c) HashiCorp, Inc.
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+import { visit } from 'unist-util-visit';
+
+/**
+ * Remark plugin that removes every `<Doc::Badge ...>...</Doc::Badge>` element
+ * (tags and inner text) from the text nodes.
+ *
+ * @returns {(tree: object) => void} transformer that edits `tree` in place
+ */
+export const remarkStripDocBadge = () => (tree) => {
+  visit(tree, 'text', (node) => {
+    //eg. <Doc::Badge @type='success'>Conformant</Doc::Badge>
+    // `g` flag: a text node can hold several badges, all of them must go
+    node.value = node.value.replace(/<Doc::Badge .*?>.*?<\/Doc::Badge>/gi, '');
+  });
+};
diff --git a/website/docs-scraper/parts/remarkStripDocLayout.mjs b/website/docs-scraper/parts/remarkStripDocLayout.mjs
@@ -0,0 +1,23 @@
+/**
+ * Copyright (c) HashiCorp, Inc.
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+import { visit } from 'unist-util-visit';
+
+/**
+ * Remark plugin that removes the opening (`<Doc::Layout ...>`) and closing
+ * (`</Doc::Layout>`) layout tags from every text node; whatever sits between
+ * the two tags is left in place.
+ *
+ * @returns {(tree: object) => void} transformer that edits `tree` in place
+ */
+export const remarkStripDocLayout = () => (tree) => {
+  const openingTag = /<Doc::Layout .*?>/gim;
+  const closingTag = /<\/Doc::Layout>/gim;
+
+  visit(tree, 'text', (textNode) => {
+    const withoutOpening = textNode.value.replace(openingTag, '');
+    textNode.value = withoutOpening.replace(closingTag, '');
+  });
+};
diff --git a/website/docs-scraper/parts/remarkStripHeliosContentBlocksDelimiters.mjs b/website/docs-scraper/parts/remarkStripHeliosContentBlocksDelimiters.mjs
diff --git a/website/docs-scraper/parts/removeIgnoredContent.mjs b/website/docs-scraper/parts/removeIgnoredContent.mjs
@@ -0,0 +1,19 @@
+/**
+ * Copyright (c) HashiCorp, Inc.
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+/**
+ * Strips every `<!-- algolia-ignore-start -->...<!-- algolia-ignore-end -->`
+ * section, delimiters included, from the markdown source.
+ *
+ * @param {string} markdownContent - raw markdown text
+ * @returns {string} the text with each ignored section replaced by ''
+ */
+export const removeIgnoredContent = (markdownContent) => {
+  // lazy `[\s\S]*?`: a start delimiter pairs with the nearest end delimiter
+  const ignoredSection =
+    /(<!-- algolia-ignore-start -->[\s\S]*?<!-- algolia-ignore-end -->)/gim;
+
+  return markdownContent.replace(ignoredSection, '');
+};
diff --git a/website/docs-scraper/parts/replaceDocComponentApiTags.mjs b/website/docs-scraper/parts/replaceDocComponentApiTags.mjs
@@ -0,0 +1,29 @@
+/**
+ * Copyright (c) HashiCorp, Inc.
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+/**
+ * Rewrites the `<Doc::ComponentApi as |C|>` block and its `<C.Property>`
+ * children into the HTML-compatible custom tags `<doc-component-api>` and
+ * `<doc-component-api-property>`.
+ *
+ * @param {string} markdownContent - raw markdown text
+ * @returns {string} the text with the tags renamed
+ */
+export const replaceDocComponentApiTags = (markdownContent) => {
+  const withOpeningTag = markdownContent.replaceAll(
+    '<Doc::ComponentApi as |C|>',
+    '<doc-component-api>'
+  );
+  const withClosingTag = withOpeningTag.replaceAll(
+    '</Doc::ComponentApi>',
+    '</doc-component-api>'
+  );
+
+  // keep the captured `<` or `</` and swap only the `C.Property` name
+  return withClosingTag.replace(
+    /(<\/?)C\.Property/gim,
+    (_fullMatch, tagStart) => tagStart + 'doc-component-api-property'
+  );
+};
diff --git a/website/docs-scraper/parts/replaceDocTags.mjs b/website/docs-scraper/parts/replaceDocTags.mjs
diff --git a/website/package.json b/website/package.json
diff --git a/yarn.lock b/yarn.lock