33 * SPDX-License-Identifier: MPL-2.0
44 */
55
6+ import fs from 'fs-extra' ;
7+
8+ import _ from 'lodash' ;
9+
610// remark
711import { unified } from 'unified' ;
8- import remarkParse from 'remark-parse' ;
912import { remark } from 'remark' ;
1013import { visit } from 'unist-util-visit' ;
1114import { selectAll } from 'unist-util-select' ;
@@ -15,26 +18,32 @@ import { toString } from 'mdast-util-to-string';
1518
1619import handlebars from 'handlebars' ;
1720
18- import _ from 'lodash' ;
19-
2021// plugins
2122import remarkGfm from 'remark-gfm' ;
22- import remarkHtml from 'remark-html' ;
23+ import remarkParse from 'remark-parse' ;
24+ // import remarkHtml from 'remark-html';
2325import remarkRehype from 'remark-rehype' ;
2426import rehypeRaw from 'rehype-raw' ;
2527import rehypeStringify from 'rehype-stringify' ;
28+ import remarkFrontmatter from 'remark-frontmatter' ;
2629
2730// local/custom
2831import { WCAG_CRITERIA } from './parts/getWcagCriteria.mjs' ;
29- import { replaceDocTags } from './parts/replaceDocTags.mjs' ;
32+ import { replaceDocComponentApiTags } from './parts/replaceDocComponentApiTags.mjs' ;
33+ import { removeIgnoredContent } from './parts/removeIgnoredContent.mjs' ;
3034import { remarkRemoveComments } from './parts/remarkRemoveComments.mjs' ;
3135import { remarkRemoveCodeBlocks } from './parts/remarkRemoveCodeBlocks.mjs' ;
32- import { remarkRemoveEmptyParagraphs } from './parts/remarkRemoveEmptyParagraphs.mjs' ;
3336import { remarkProcessCustomImageFormat } from './parts/remarkProcessCustomImageFormat.mjs' ;
34- import { remarkStripHeliosContentBlocksDelimiters } from './parts/remarkStripHeliosContentBlocksDelimiters.mjs' ;
37+ import { remarkStripContentBlocksDelimiters } from './parts/remarkStripContentBlocksDelimiters.mjs' ;
38+ import { remarkProcessDocWcagList } from './parts/remarkProcessDocWcagList.mjs' ;
39+ import { remarkStripDocA11ySupport } from './parts/remarkStripDocA11ySupport.mjs' ;
40+ import { remarkStripDocBadge } from './parts/remarkStripDocBadge.mjs' ;
41+ import { remarkStripDocLayout } from './parts/remarkStripDocLayout.mjs' ;
3542import { remarkStripHeliosHandlebarsExpressions } from './parts/remarkStripHeliosHandlebarsExpressions.mjs' ;
3643// import { remarkStripHeliosReleaseNotesMetadata } from './remarkStripHeliosReleaseNotesMetadata.mjs';
3744import { remarkHtmlSanitise } from './parts/remarkHtmlSanitise.mjs' ;
45+ import { rehypeRemoveDocListContainer } from './parts/rehypeRemoveDocListContainer.mjs' ;
46+ import { rehypeRemoveEmptyParagraphs } from './parts/rehypeRemoveEmptyParagraphs.mjs' ;
3847import { setNodesHierarchy } from './parts/setNodesHierarchy.mjs' ;
3948import { stringifyChildNodes } from './parts/stringifyChildNodes.mjs' ;
4049
@@ -186,19 +195,32 @@ export async function parseMarkdown(markdownContent) {
186195 // PROCESSING PIPELINE
187196 // --------------------
188197
189- // we need to convert the `<Doc::***>` components to web-components-like code (HTML compatible)
190- const standardizedContent = replaceDocTags ( markdownContent ) ;
198+ const testFile =
199+ '/Users/cristianorastelli/src/hashicorp/design-system/website/docs/testing/markdown/scraping-playground.md' ;
200+
201+ // TODO! remove! override
202+ const fileContent = await fs . readFile ( testFile ) ;
203+ markdownContent = fileContent . toString ( ) ;
204+
205+ // remove content included in `<!-- algolia-ignore-[start/end] -->` delimiters
206+ markdownContent = removeIgnoredContent ( markdownContent ) ;
207+
208+ // replace `<Doc::ComponentApi(::*)>` tags with web-components-like `<doc-component-api>` custom tags (HTML compatible)
209+ markdownContent = replaceDocComponentApiTags ( markdownContent ) ;
191210
192211 // MARKDOWN AST PROCESSING
212+ // -----------------------
193213
194214 let tree = await unified ( )
195215 // convert the markdown to AST
196216 . use ( remarkParse )
197217 // interpret special GFM markdown format
198218 . use ( remarkGfm )
219+ // interpret the frontmatter block
220+ . use ( remarkFrontmatter , { type : 'yaml' , marker : '-' } )
199221 // convert markdown to HTML (TODO do we need it? what happens if we remove this?)
200222 // .use(remarkHtml, { sanitize: remarkHtmlSanitise })
201- . parse ( standardizedContent ) ;
223+ . parse ( markdownContent ) ;
202224
203225 // ✅ process custom images (showdown.js format)
204226 tree = await unified ( ) . use ( remarkProcessCustomImageFormat ) . run ( tree ) ;
@@ -210,42 +232,51 @@ export async function parseMarkdown(markdownContent) {
210232 tree = await unified ( ) . use ( remarkRemoveCodeBlocks ) . run ( tree ) ;
211233
212234 // ✅ remove content blocks delimiters
213- tree = await unified ( )
214- . use ( remarkStripHeliosContentBlocksDelimiters )
215- . run ( tree ) ;
235+ tree = await unified ( ) . use ( remarkStripContentBlocksDelimiters ) . run ( tree ) ;
216236
217- // 🤔 remove handlebars expressions (`{{...}}`) so they don't pollute the paragraphs
218- tree = await unified ( ) . use ( remarkStripHeliosHandlebarsExpressions ) . run ( tree ) ;
237+ // ✅ process <Doc::WcagList/> elements and convert them to custom AST nodes
238+ tree = await unified ( ) . use ( remarkProcessDocWcagList ) . run ( tree ) ;
219239
220- // ✅ remove empty paragraphs
221- tree = await unified ( ) . use ( remarkRemoveEmptyParagraphs ) . run ( tree ) ;
240+ // ✅ remove some <Doc::***/> elements
241+ tree = await unified ( ) . use ( remarkStripDocA11ySupport ) . run ( tree ) ;
242+ tree = await unified ( ) . use ( remarkStripDocBadge ) . run ( tree ) ;
243+ tree = await unified ( ) . use ( remarkStripDocLayout ) . run ( tree ) ;
244+
245+ // 🤔 remove handlebars expressions (`{{...}}`) so they don't pollute the paragraphs
246+ // tree = await unified().use(remarkStripHeliosHandlebarsExpressions).run(tree);
222247
223248 // ✅ associate to each node the hierarchy in terms of headings level
224249 tree = await unified ( ) . use ( setNodesHierarchy ) . run ( tree ) ;
225250
226- // console.log('TREE BEFORE', JSON.stringify(tree, null, 2));
227- // console.log('TREE AFTER', JSON.stringify(tree, null, 2));
251+ console . log ( 'TREE BEFORE' , JSON . stringify ( tree , null , 2 ) ) ;
228252
229253 // HTML AST PROCESSING
254+ // -------------------
230255
231- const html = await unified ( )
256+ const htmlTree = await unified ( )
232257 . use ( remarkParse )
233- // .use(remarkHtml, { sanitize: remarkHtmlSanitise })
234258 // .use(remarkRehype)
235259 . use ( remarkRehype , { allowDangerousHtml : true } )
236260 . use ( rehypeRaw )
237- . use ( rehypeStringify )
261+ . use ( rehypeStringify , { closeSelfClosing : true } )
238262 . run ( tree ) ;
239263
240- console . log ( 'HTML' , JSON . stringify ( html , null , 2 ) ) ;
264+ // ✅ remove Doc::ListContainer (doc::listcontainer) wrappers
265+ tree = await unified ( ) . use ( rehypeRemoveDocListContainer ) . run ( htmlTree ) ;
266+
267+ // ✅ remove empty paragraphs
268+ tree = await unified ( ) . use ( rehypeRemoveEmptyParagraphs ) . run ( htmlTree ) ;
269+
270+ console . log ( 'HTML AFTER' , JSON . stringify ( htmlTree , null , 2 ) ) ;
241271
242- // process the relevant nodes
272+ // EXTRACT CONTENT FROM RELEVANT NODES
273+ // -----------------------------------
243274
244275 // // parse and index "doc" (ember) nodes for special components
245276 // .use(wcagListMapper)
246277 // .use(componentApiMapper)
247278
248- // parse and index standard nodes
279+ // parse and index relevant nodes
249280 // await unified()
250281 // .use(headingMapper)
251282 // .use(paragraphMapper)
0 commit comments