Skip to content

Commit 2b0894a

Browse files
authored
Exclusions option to filter urls (#148)
1 parent 89048e9 commit 2b0894a

File tree

3 files changed

+102
-4
lines changed

3 files changed

+102
-4
lines changed

Diff for: sitemapper.d.ts

+1
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ export interface SitemapperOptions {
2020
timeout?: number;
2121
url?: string;
2222
fields?: {[name: string]: boolean};
23+
exclusions?: RegExp[];
2324
}
2425

2526
declare class Sitemapper {

Diff for: src/assets/sitemapper.js

+23-4
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,13 @@ export default class Sitemapper {
2828
* @params {boolean} [options.rejectUnauthorized] - If true (default), it will throw on invalid certificates, such as expired or self-signed ones.
2929
* @params {lastmod} [options.lastmod] - the minimum lastmod value for urls
3030
* @params {hpagent.HttpProxyAgent|hpagent.HttpsProxyAgent} [options.proxyAgent] - instance of npm "hpagent" HttpProxyAgent or HttpsProxyAgent to be passed to npm "got"
31+
* @params {Array<RegExp>} [options.exclusions] - Array of regex patterns to exclude URLs
3132
*
3233
* @example let sitemap = new Sitemapper({
3334
* url: 'https://wp.seantburke.com/sitemap.xml',
3435
* timeout: 15000,
35-
* lastmod: 1630693759
36+
* lastmod: 1630693759,
37+
* exclusions: [/foo\.com/, /bar\.xml/] // Filters out URLs matching these patterns
3638
* });
3739
*/
3840
constructor(options) {
@@ -49,6 +51,7 @@ export default class Sitemapper {
4951
settings.rejectUnauthorized === false ? false : true;
5052
this.fields = settings.fields || false;
5153
this.proxyAgent = settings.proxyAgent || {};
54+
this.exclusions = settings.exclusions || [];
5255
}
5356

5457
/**
@@ -319,6 +322,9 @@ export default class Sitemapper {
319322

320323
return modified >= this.lastmod;
321324
})
325+
.filter((site) => {
326+
return !this.isExcluded(site.loc[0])
327+
})
322328
.map((site) => {
323329
if( !this.fields) {
324330
return site.loc && site.loc[0];
@@ -343,9 +349,11 @@ export default class Sitemapper {
343349
console.debug(`Additional sitemap found during "crawl('${url}')"`);
344350
}
345351
// Map each child url into a promise to create an array of promises
346-
const sitemap = data.sitemapindex.sitemap.map(
347-
(map) => map.loc && map.loc[0]
348-
);
352+
const sitemap = data.sitemapindex.sitemap
353+
.map((map) => map.loc && map.loc[0])
354+
.filter((url) => {
355+
return !this.isExcluded(url)
356+
});
349357

350358
// Parse all child urls within the concurrency limit in the settings
351359
const limit = pLimit(this.concurrency);
@@ -446,6 +454,17 @@ export default class Sitemapper {
446454
});
447455
});
448456
}
457+
458+
/**
459+
* Checks if a urls is excluded based on the exclusion patterns.
460+
*
461+
* @param {string} url - The URL to check.
462+
* @returns {boolean} Returns true if the urls is excluded, false otherwise.
463+
*/
464+
isExcluded(url) {
465+
if (this.exclusions.length === 0) return false;
466+
return this.exclusions.some((pattern) => pattern.test(url));
467+
}
449468
}
450469

451470
/**

Diff for: src/tests/test.js

+78
Original file line numberDiff line numberDiff line change
@@ -263,4 +263,82 @@ describe('Sitemapper', function () {
263263
});
264264
});
265265
});
266+
267+
describe('exclusions option', function () {
  // Control case: with unrelated exclusion patterns, the URL that the next
  // test expects to be filtered out must still be present in the results.
  it('should prevent false positive', function (done) {
    this.timeout(30000);
    const url = 'https://wp.seantburke.com/sitemap.xml';
    // exclude video and image sitemap index urls
    sitemapper.exclusions = [/video/, /image/];
    sitemapper.fetch(url)
      .then(data => {
        // Assertions are invoked as methods (matching `result.should.be.false()`
        // elsewhere in this file); a bare property access would check nothing.
        data.sites.should.be.Array();
        data.sites.includes('https://wp.seantburke.com/?page_id=2').should.be.true();
        done();
      })
      .catch(error => {
        console.error('Test failed');
        done(error);
      });
  });

  it('should filter out page_id urls', function (done) {
    this.timeout(30000);
    const url = 'https://wp.seantburke.com/sitemap.xml';
    // exclude every url containing "page_id", which covers ?page_id=2
    sitemapper.exclusions = [/page_id/];
    sitemapper.fetch(url)
      .then(data => {
        data.sites.should.be.Array();
        data.sites.includes('https://wp.seantburke.com/?page_id=2').should.be.false();
        done();
      })
      .catch(error => {
        console.error('Test failed');
        done(error);
      });
  });
});
303+
304+
// Unit tests for Sitemapper#isExcluded; each case assigns the patterns it
// needs to `sitemapper.exclusions` before calling the method.
describe('isExcluded method', function () {
  it('should return false when no exclusions are set', function () {
    const result = sitemapper.isExcluded('https://foo.com/page1');
    result.should.be.false();
  });

  it('should return false when url does not match any exclusion patterns', function () {
    sitemapper.exclusions = [/\.pdf$/, /private/];
    const result = sitemapper.isExcluded('https://foo.com/page1');
    result.should.be.false();
  });

  // Title corrected: a matching url is excluded, so the expected result is true.
  it('should return true when url matches an exclusion pattern', function () {
    sitemapper.exclusions = [/\.pdf$/, /private/];
    const result = sitemapper.isExcluded('https://foo.com/document.pdf');
    result.should.be.true();
  });

  it('should return true when url matches any of multiple exclusion patterns', function () {
    sitemapper.exclusions = [/\.pdf$/, /private/, /temp/];
    const result = sitemapper.isExcluded('https://foo.com/private/temp.html');
    result.should.be.true();
  });

  it('should handle complex regex patterns correctly', function () {
    sitemapper.exclusions = [/^https:\/\/foo\.com\/([a-z]{2})\/private/];
    const result1 = sitemapper.isExcluded('https://foo.com/en/private/page');
    const result2 = sitemapper.isExcluded('https://foo.com/en/public/page');
    result1.should.be.true();
    result2.should.be.false();
  });

  it('should handle case sensitivity correctly', function () {
    // The `i` flag makes the pattern match regardless of letter case.
    sitemapper.exclusions = [/private/i];
    const result1 = sitemapper.isExcluded('https://foo.com/PRIVATE/page');
    const result2 = sitemapper.isExcluded('https://foo.com/Private/page');
    result1.should.be.true();
    result2.should.be.true();
  });
});
266344
});

0 commit comments

Comments
 (0)