Converting to using fast-xml-parser (#157)

seantomburke · web-flow · commit 14f44f0deef6 · 2024-11-08T01:00:38.000-08:00
* Converting to using fast-xml-parser

* Removing xml2js

* Don't expose the parser unnecessarily

* Adding parser options

* Adding parser options

---------

Co-authored-by: Sean Thomas Burke <seantomburke@users.noreply.github.com>
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -86,9 +86,9 @@
     "typescript": "^4.1.2"
   },
   "dependencies": {
+    "fast-xml-parser": "^4.5.0",
     "got": "^11.8.0",
     "is-gzip": "2.0.0",
-    "p-limit": "^3.1.0",
-    "xml2js": "^0.5.0"
+    "p-limit": "^3.1.0"
   }
 }
diff --git a/src/assets/sitemapper.js b/src/assets/sitemapper.js
@@ -6,7 +6,7 @@
  * @author Sean Burke <@seantomburke>
  */
 
-import { parseStringPromise } from 'xml2js';
+import { XMLParser } from 'fast-xml-parser';
 import got from 'got';
 import zlib from 'zlib';
 import pLimit from 'p-limit';
@@ -95,6 +95,7 @@ export default class Sitemapper {
       errors: results.errors || [],
     };
   }
+
   /**
    * Get the timeout
    *
@@ -174,7 +175,7 @@ export default class Sitemapper {
   }
 
   /**
-   * Requests the URL and uses parseStringPromise to parse through and find the data
+   * Requests the URL and uses fast-xml-parser to parse through and find the data
    *
    * @private
    * @param {string} [url] - the Sitemaps url (e.g https://wp.seantburke.com/sitemap.xml)
@@ -218,8 +219,10 @@ export default class Sitemapper {
         responseBody = response.body;
       }
 
-      // otherwise parse the XML that was returned.
-      const data = await parseStringPromise(responseBody);
+      // Parse XML using fast-xml-parser
+      const parser = new XMLParser();
+
+      const data = parser.parse(responseBody.toString());
 
       // return the results
       return { error: null, data };
@@ -312,26 +315,32 @@ export default class Sitemapper {
         if (this.debug) {
           console.debug(`Urlset found during "crawl('${url}')"`);
         }
-        // filter out any urls that are older than the lastmod
-        const sites = data.urlset.url
+
+        // Convert single object to array if needed
+        const urlArray = Array.isArray(data.urlset.url)
+          ? data.urlset.url
+          : [data.urlset.url];
+
+        // Begin filtering the urls
+        const sites = urlArray
           .filter((site) => {
             if (this.lastmod === 0) return true;
             if (site.lastmod === undefined) return false;
-            const modified = new Date(site.lastmod[0]).getTime();
+            const modified = new Date(site.lastmod).getTime();
 
             return modified >= this.lastmod;
           })
           .filter((site) => {
-            return !this.isExcluded(site.loc[0]);
+            return !this.isExcluded(site.loc);
           })
           .map((site) => {
             if (!this.fields) {
-              return site.loc && site.loc[0];
+              return site.loc;
             } else {
               let fields = {};
               for (const [field, active] of Object.entries(this.fields)) {
                 if (active && site[field]) {
-                  fields[field] = site[field][0];
+                  fields[field] = site[field];
                 }
               }
               return fields;
@@ -349,7 +358,7 @@ export default class Sitemapper {
         }
         // Map each child url into a promise to create an array of promises
         const sitemap = data.sitemapindex.sitemap
-          .map((map) => map.loc && map.loc[0])
+          .map((map) => map.loc)
           .filter((url) => {
             return !this.isExcluded(url);
           });
@@ -441,8 +450,8 @@ export default class Sitemapper {
    * @param {Buffer} body - body of the gzipped file
    * @returns {boolean}
    */
-  decompressResponseBody(body) {
-    return new Promise((resolve, reject) => {
+  async decompressResponseBody(body) {
+    return await new Promise((resolve, reject) => {
       const buffer = Buffer.from(body);
       zlib.gunzip(buffer, (err, result) => {
         if (err) {
@@ -488,7 +497,7 @@ export default class Sitemapper {
  *
  * @typedef {Object} ParseData
  *
- * @property {Error} error that either comes from `parseStringPromise` or `got` or custom error
+ * @property {Error} error that either comes from fast-xml-parser or `got` or custom error
  * @property {Object} data
  * @property {string} data.url - URL of sitemap
  * @property {Array} data.urlset - Array of returned URLs
diff --git a/src/examples/index.js b/src/examples/index.js
@@ -1,12 +1,12 @@
 import Sitemapper from '../assets/sitemapper';
 
 // URL to be crawled
-const exampleURL = 'https://www.walmart.com/sitemap_topic.xml';
+const exampleURL = 'https://wp.seantburke.com/sitemap.xml';
 
 // Instantiate an instance
 const sitemapper = new Sitemapper({
   url: exampleURL, // url to crawl
-  debug: false, // don't show debug logs
+  debug: true, // show debug logs
   timeout: 10000, // 10 seconds
   concurrency: 10, // Number of maximum concurrent sitemap crawl threads
   retries: 0, // Number of retry attempts in case of error response (e.g. 404 or timeout)

Original file line number	Diff line number	Diff line change
`@@ -86,9 +86,9 @@`
`86`	`86`	`"typescript": "^4.1.2"`
`87`	`87`	`},`
`88`	`88`	`"dependencies": {`
	`89`	`+ "fast-xml-parser": "^4.5.0",`
`89`	`90`	`"got": "^11.8.0",`
`90`	`91`	`"is-gzip": "2.0.0",`
`91`		`- "p-limit": "^3.1.0",`
`92`		`- "xml2js": "^0.5.0"`
	`92`	`+ "p-limit": "^3.1.0"`
`93`	`93`	`}`
`94`	`94`	`}`