document an automation workflow in nodejs from fetch(url) through mozilla / readability through mixmark-io / turndown
In this document, we will outline an automation workflow in Node.js using the mozilla-readability
, jsdom
, and turndown
libraries. The workflow will consist of the following steps:
- Fetch the content of a given URL
- Extract the main readable content using Mozilla's Readability library
- Convert the extracted content to Markdown using Turndown
First, ensure you have Node.js installed and the required packages installed using npm:
npm install @mozilla/readability
npm install turndown
Now, let's create a script called automationWorkflow.js and follow these steps:
-
Import the required libraries:
const { Readability } = require('@mozilla/readability'); const { JSDOM } = require('jsdom'); const TurndownService = require('turndown'); const fs = require('fs').promises;
-
Create a function to fetch the content of a given URL:
async function fetchURLContent(url) { try { const response = await fetch(url); const html = await response.text(); return html; } catch (error) { console.error(`Error fetching URL content: ${error}`); } }
-
Create a function to extract the main readable content using Mozilla's Readability library:
function extractReadableContent(html, url) { try { const dom = new JSDOM(html, url); const reader = new Readability(dom.window.document); const article = reader.parse(); return article; } catch (error) { console.error(`Error extracting readable content: ${error}`); } }
-
Create a function to convert the extracted content to Markdown using ShowdownJS:
function convertToMarkdown(html) { try { const turndownService = new TurndownService(); const markdown = turndownService.turndown(html); return markdown; } catch (error) { console.error(`Error converting HTML to Markdown: ${error}`); } }
-
Create an async function to execute the entire workflow:
async function runWorkflow(url) { const html = await fetchURLContent(url); const article = extractReadableContent(html, url); if (article) { const markdown = convertToMarkdown(article.content); console.log(markdown); } else { console.error('Unable to extract readable content from the URL'); } }
-
Execute the workflow with a sample URL:
const url = 'https://example.com/article'; runWorkflow(url);
Save the script and run it using the following command:
node automationWorkflow.js
This will output the main readable content of the provided URL in Markdown format. You can modify the script as needed to further customize the automation workflow.