Skip to content

Commit 14f44f0

Browse files
authored
Converting to using fast-xml-parser (#157)
* Converting to using fast-xml-parser * Removing xml2js * Don't expose the parser unnecessarily * Adding parser options * Adding parser options --------- Co-authored-by: Sean Thomas Burke <[email protected]>
1 parent 867cd5a commit 14f44f0

File tree

4 files changed

+68
-64
lines changed

4 files changed

+68
-64
lines changed

Diff for: package-lock.json

+41-46
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Diff for: package.json

+2-2
Original file line numberDiff line numberDiff line change
@@ -86,9 +86,9 @@
8686
"typescript": "^4.1.2"
8787
},
8888
"dependencies": {
89+
"fast-xml-parser": "^4.5.0",
8990
"got": "^11.8.0",
9091
"is-gzip": "2.0.0",
91-
"p-limit": "^3.1.0",
92-
"xml2js": "^0.5.0"
92+
"p-limit": "^3.1.0"
9393
}
9494
}

Diff for: src/assets/sitemapper.js

+23-14
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
* @author Sean Burke <@seantomburke>
77
*/
88

9-
import { parseStringPromise } from 'xml2js';
9+
import { XMLParser } from 'fast-xml-parser';
1010
import got from 'got';
1111
import zlib from 'zlib';
1212
import pLimit from 'p-limit';
@@ -95,6 +95,7 @@ export default class Sitemapper {
9595
errors: results.errors || [],
9696
};
9797
}
98+
9899
/**
99100
* Get the timeout
100101
*
@@ -174,7 +175,7 @@ export default class Sitemapper {
174175
}
175176

176177
/**
177-
* Requests the URL and uses parseStringPromise to parse through and find the data
178+
* Requests the URL and uses fast-xml-parser to parse through and find the data
178179
*
179180
* @private
180181
* @param {string} [url] - the Sitemaps url (e.g https://wp.seantburke.com/sitemap.xml)
@@ -218,8 +219,10 @@ export default class Sitemapper {
218219
responseBody = response.body;
219220
}
220221

221-
// otherwise parse the XML that was returned.
222-
const data = await parseStringPromise(responseBody);
222+
// Parse XML using fast-xml-parser
223+
const parser = new XMLParser();
224+
225+
const data = parser.parse(responseBody.toString());
223226

224227
// return the results
225228
return { error: null, data };
@@ -312,26 +315,32 @@ export default class Sitemapper {
312315
if (this.debug) {
313316
console.debug(`Urlset found during "crawl('${url}')"`);
314317
}
315-
// filter out any urls that are older than the lastmod
316-
const sites = data.urlset.url
318+
319+
// Convert single object to array if needed
320+
const urlArray = Array.isArray(data.urlset.url)
321+
? data.urlset.url
322+
: [data.urlset.url];
323+
324+
// Begin filtering the urls
325+
const sites = urlArray
317326
.filter((site) => {
318327
if (this.lastmod === 0) return true;
319328
if (site.lastmod === undefined) return false;
320-
const modified = new Date(site.lastmod[0]).getTime();
329+
const modified = new Date(site.lastmod).getTime();
321330

322331
return modified >= this.lastmod;
323332
})
324333
.filter((site) => {
325-
return !this.isExcluded(site.loc[0]);
334+
return !this.isExcluded(site.loc);
326335
})
327336
.map((site) => {
328337
if (!this.fields) {
329-
return site.loc && site.loc[0];
338+
return site.loc;
330339
} else {
331340
let fields = {};
332341
for (const [field, active] of Object.entries(this.fields)) {
333342
if (active && site[field]) {
334-
fields[field] = site[field][0];
343+
fields[field] = site[field];
335344
}
336345
}
337346
return fields;
@@ -349,7 +358,7 @@ export default class Sitemapper {
349358
}
350359
// Map each child url into a promise to create an array of promises
351360
const sitemap = data.sitemapindex.sitemap
352-
.map((map) => map.loc && map.loc[0])
361+
.map((map) => map.loc)
353362
.filter((url) => {
354363
return !this.isExcluded(url);
355364
});
@@ -441,8 +450,8 @@ export default class Sitemapper {
441450
* @param {Buffer} body - body of the gzipped file
442451
* @returns {Promise}
443452
*/
444-
decompressResponseBody(body) {
445-
return new Promise((resolve, reject) => {
453+
async decompressResponseBody(body) {
454+
return await new Promise((resolve, reject) => {
446455
const buffer = Buffer.from(body);
447456
zlib.gunzip(buffer, (err, result) => {
448457
if (err) {
@@ -488,7 +497,7 @@ export default class Sitemapper {
488497
*
489498
* @typedef {Object} ParseData
490499
*
491-
* @property {Error} error that either comes from `parseStringPromise` or `got` or custom error
500+
* @property {Error} error that either comes from fast-xml-parser or `got` or custom error
492501
* @property {Object} data
493502
* @property {string} data.url - URL of sitemap
494503
* @property {Array} data.urlset - Array of returned URLs

Diff for: src/examples/index.js

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
import Sitemapper from '../assets/sitemapper';
22

33
// URL to be crawled
4-
const exampleURL = 'https://www.walmart.com/sitemap_topic.xml';
4+
const exampleURL = 'https://wp.seantburke.com/sitemap.xml';
55

66
// Instantiate an instance
77
const sitemapper = new Sitemapper({
88
url: exampleURL, // url to crawl
9-
debug: false, // don't show debug logs
9+
debug: true, // show debug logs
1010
timeout: 10000, // 10 seconds
1111
concurrency: 10, // Number of maximum concurrent sitemap crawl threads
1212
retries: 0, // Number of retry attempts in case of error response (e.g. 404 or timeout)

0 commit comments

Comments
 (0)