Skip to content

Latest commit

 

History

History
86 lines (65 loc) · 3.07 KB

ChatGPT4-workflow.md

File metadata and controls

86 lines (65 loc) · 3.07 KB

Document an automation workflow in Node.js that retrieves a specified /wiki/ page from simple.wikipedia.org; removes all nodes with class="reference"; removes all nodes with "editsection" in the class; removes all nodes including and after id="References"; sends the HTML through @mozilla/readability; makes an array of all links with a URL beginning with /wiki/; removes all links from the HTML; passes the HTML through turndown; and returns the Markdown and the array of URLs.

To implement the described automation workflow in Node.js, you will need to install a few packages. Make sure you have Node.js installed, then create a new project and run the following commands:

npm init -y
npm install axios
npm install jsdom
npm install @mozilla/readability
npm install turndown

Now, create a file called wikiToMarkdown.js and add the following code:

const axios = require('axios');
const { JSDOM } = require('jsdom');
const { Readability } = require('@mozilla/readability');
const TurndownService = require('turndown');

/**
 * Removes `startNode` and every node that follows it in document order.
 *
 * Following nodes are not necessarily siblings of `startNode`: when the start
 * node is nested (for example inside a heading), the content that comes after
 * it lives at an ancestor level. The walk therefore climbs the ancestor chain
 * and, at each level, drops everything to the right of the current node.
 *
 * @param {Node} startNode - First node to remove.
 */
function removeNodeAndFollowing(startNode) {
  for (let node = startNode; node && node.parentNode; node = node.parentNode) {
    while (node.nextSibling) {
      node.parentNode.removeChild(node.nextSibling);
    }
  }
  // Detach the start node last; once detached it no longer has siblings to walk.
  startNode.remove();
}

/**
 * Fetches a page from simple.wikipedia.org and converts its main content to
 * Markdown, returning the `/wiki/` links found in that content separately.
 *
 * Steps: strip `.reference` nodes, strip nodes whose class contains
 * "editsection", drop everything from `#References` onward, extract the
 * article with Readability, collect the `/wiki/` link targets, unwrap all
 * links (their text is kept), and convert the result with Turndown.
 *
 * @param {string} wikiPage - Page title as it appears after `/wiki/` in the URL.
 * @returns {Promise<{markdown: string, wikiLinks: string[]}>} The Markdown text
 *   and the `href` values of article links that begin with `/wiki/`.
 * @throws {Error} If the request fails (rejection from axios) or Readability
 *   cannot extract an article from the page.
 */
async function getSimpleWikiMarkdown(wikiPage) {
  const wikiUrl = `https://simple.wikipedia.org/wiki/${wikiPage}`;
  const response = await axios.get(wikiUrl);

  // NOTE(review): no `url` option is passed to JSDOM, so hrefs are expected to
  // stay site-relative ("/wiki/...") through Readability — confirm before
  // adding a base URL, because the link selector below relies on that prefix.
  const dom = new JSDOM(response.data);
  const document = dom.window.document;

  // Remove nodes with class="reference"
  document.querySelectorAll('.reference').forEach((ref) => ref.remove());

  // Remove nodes with "editsection" in the class
  document
    .querySelectorAll('[class*="editsection"]')
    .forEach((section) => section.remove());

  // Remove all nodes including and after id="References"
  const refsElem = document.querySelector('#References');
  if (refsElem) {
    // The id presumably sits on (or inside) the section heading; start from the
    // enclosing heading when there is one so no empty heading is left behind.
    const startNode = refsElem.closest('h1, h2, h3, h4, h5, h6') || refsElem;
    removeNodeAndFollowing(startNode);
  }

  // Send the HTML through @mozilla/readability
  const article = new Readability(document).parse();
  if (!article || !article.content) {
    throw new Error(`Readability could not extract an article from ${wikiUrl}`);
  }

  // `article.content` is an HTML string, so links must be collected from and
  // removed in a fresh parse of that string; touching the original document at
  // this point would not change what gets converted to Markdown.
  const articleBody = new JSDOM(article.content).window.document.body;

  // Make an array of all links with a URL beginning with /wiki/
  const wikiLinks = Array.from(
    articleBody.querySelectorAll('a[href^="/wiki/"]'),
    (link) => link.getAttribute('href')
  );

  // Remove all links from the HTML, keeping their text so sentences stay intact
  articleBody.querySelectorAll('a[href]').forEach((link) => {
    link.replaceWith(...link.childNodes);
  });

  // Pass the HTML through turndown
  const turndownService = new TurndownService();
  const markdown = turndownService.turndown(articleBody.innerHTML);

  // Return the markdown and the array of URLs
  return { markdown, wikiLinks };
}

// CommonJS export: the function itself is the module's export value, so
// callers receive it directly from require().
module.exports = getSimpleWikiMarkdown;

Now you can use the getSimpleWikiMarkdown function from another file, like this:

const getSimpleWikiMarkdown = require('./wikiToMarkdown');

// Example run: convert the "Node.js" page and print both results.
// The async IIFE returns a promise; the trailing .catch() reports any fetch or
// parse failure and marks the process as failed instead of leaving the
// rejection unhandled.
(async () => {
  const { markdown, wikiLinks } = await getSimpleWikiMarkdown('Node.js');

  console.log('Markdown content:');
  console.log(markdown);

  console.log('Wiki links:');
  console.log(wikiLinks);
})().catch((err) => {
  console.error('Failed to convert wiki page:', err);
  process.exitCode = 1;
});

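This code fetches the specified Simple English Wikipedia page, processes it as described, and returns a promise that resolves to an object with two properties: `markdown` (the Markdown content) and `wikiLinks` (the array of `/wiki/` URLs).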