Skip to content

Commit

Permalink
feat: speed up the scrapper detection by domain (#37)
Browse files Browse the repository at this point in the history
  • Loading branch information
Victor Fernandes authored Feb 1, 2024
1 parent ad85386 commit 8ce73f1
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 16 deletions.
27 changes: 14 additions & 13 deletions src/background.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { getCurrentTab, runScrapper, ScrapperOptions } from './utils/chrome';
import { getCurrentTab, getDomainName, runScrapper, ScrapperOptions } from './utils/chrome';
import scrapperOptions from './scrappers';

function urlMatchesPatternUrl(url: string, patternURL: string) {
Expand All @@ -14,19 +14,20 @@ function urlMatchesPatternUrl(url: string, patternURL: string) {
function getScrapperOptionsByUrl(url: string, title: string): ScrapperOptions | null {
let options;

for (let i = 0; i < scrapperOptions.length; i++) {
const patternUrl = scrapperOptions[i].url;
const domain = getDomainName(url);

if (typeof patternUrl === 'string' && urlMatchesPatternUrl(url, patternUrl)) {
options = scrapperOptions[i];
break;
} else if (
Array.isArray(patternUrl) &&
patternUrl.some((scrapperURL: string) => urlMatchesPatternUrl(url, scrapperURL))
) {
options = scrapperOptions[i];
break;
}
if (domain && scrapperOptions.has(domain)) {
const scrappers = scrapperOptions.get(domain)!;

const scrapper = scrappers.find((scrapper) => {
if (Array.isArray(scrapper.url)) {
return scrapper.url.some((scrapperURL: string) => urlMatchesPatternUrl(url, scrapperURL));
} else {
return urlMatchesPatternUrl(url, scrapper.url);
}
});

options = scrapper;
}

if (options) {
Expand Down
27 changes: 27 additions & 0 deletions src/scrappers/booking.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Scrapper definition for Booking.com search result pages.
# `url` is the pattern compared against the current tab URL; its host is also
# used to index this definition by domain name.
# NOTE(review): the trailing `*` is presumably a wildcard — confirm in urlMatchesPatternUrl.
url: https://www.booking.com/searchresults*
# NOTE(review): presumably the heading shown for the extracted data — confirm against the consumer.
header: Booking.com search results
# NOTE(review): looks like the CSS selector for one result card (one output row per match) — confirm.
listElementsQuery: '[data-testid="property-card"]'
# One entry per extracted field: `title` names it, `query` is a CSS selector and
# `type` picks the extraction mode.
# NOTE(review): assumes `query` is resolved relative to each list element — confirm.
elementParser:
- title: Title
query: '[data-testid="title"]'
type: text

- title: Review score
query: '[data-testid="review-score"] div:nth-child(1)'
type: text

- title: Distance
query: '[data-testid="distance"]'
type: text

- title: Price w/ discounts
query: '[data-testid="price-and-discounted-price"]'
type: text

- title: Offers
query: '[data-testid="gallery-ribbon"]'
type: text

# NOTE(review): `clean-url` presumably yields the link target with tracking parameters removed — confirm.
- title: Booking URL
query: a
type: clean-url
22 changes: 19 additions & 3 deletions src/scrappers/index.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,27 @@
import { ScrapperOptions } from '../utils/chrome';
import { getDomainName, ScrapperOptions } from '../utils/chrome';

// Load every `*.yml` scrapper definition that sits next to this module.
// NOTE(review): presumably Vite's `import.meta.glob`; `eager: true` should make the modules
// available synchronously — confirm against the bundler in use.
// @ts-ignore
const data = import.meta.glob('./*.yml', { eager: true });
// NOTE(review): the two `scrappers` declarations below appear to be the two sides of the diff —
// the flat array looks like the pre-commit version, the Map the post-commit one.
const scrappers: Array<ScrapperOptions> = [];
// Scrapper definitions grouped by the domain name derived from their `url`.
const scrappers = new Map<string, Array<ScrapperOptions>>();

for (const scrapperPath in data) {
// NOTE(review): looks like the pre-commit behaviour (append to the flat array).
scrappers.push(data[scrapperPath].default);
const url = data[scrapperPath].default.url;
let hostname = '';

// `url` may be a single pattern or a list of patterns.
// NOTE(review): only the first entry of a list is used to pick the domain key, so a
// definition whose patterns span several domains is indexed under the first one only.
if (Array.isArray(url)) {
hostname = getDomainName(url[0]);
} else {
hostname = getDomainName(url);
}

// Definitions for which no domain name could be derived are left out of the map.
if (hostname) {
if (scrappers.has(hostname)) {
// Append to the existing bucket by storing a fresh array for this domain.
const options = scrappers.get(hostname)!;
scrappers.set(hostname, [...options, data[scrapperPath].default]);
} else {
scrappers.set(hostname, [data[scrapperPath].default]);
}
}
}

export default scrappers;
17 changes: 17 additions & 0 deletions src/utils/chrome.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,23 @@ import { customScrapper } from './scrappers/custom';
import { scrapHTMLTables } from './scrappers/html-tables';
import { scrapDivHTMLTables, ScrapDivTablesOptions } from './scrappers/div-tables';

/**
 * Second-level labels that commonly combine with a two-letter country code to
 * form a two-part public suffix (e.g. `co.uk`, `com.br`, `org.au`).
 * This is a heuristic, not a full public-suffix list.
 */
const COMPOUND_TLD_SECOND_LEVELS = new Set(['ac', 'co', 'com', 'edu', 'gov', 'net', 'org']);

/**
 * Extracts the registrable domain label from a URL, i.e. the hostname label
 * that sits immediately before the top-level domain.
 *
 * Examples: `https://www.booking.com/x` -> `booking`,
 * `https://www.booking.co.uk/x` -> `booking`, `http://localhost:3000` -> `localhost`.
 *
 * @param url - Absolute URL (or URL pattern that still parses as a URL).
 * @returns The domain label, or an empty string when the URL has no hostname.
 * @throws TypeError when `url` cannot be parsed by the `URL` constructor.
 */
export function getDomainName(url: string): string {
  const { hostname } = new URL(url);

  // Drop empty labels so a trailing root dot ("example.com.") is not mistaken for the TLD.
  const labels = hostname.split('.').filter((label) => label.length > 0);

  // Single-label hosts (e.g. "localhost") have no TLD to strip; an empty hostname yields ''.
  if (labels.length <= 1) {
    return labels[0] ?? '';
  }

  const topLevel = labels[labels.length - 1];
  const secondLevel = labels[labels.length - 2];

  // The previous check (`hostname.endsWith(lastLabel)`) was always true, so two-part
  // suffixes were never detected. Treat "<common second level>.<2-letter ccTLD>" as a
  // two-part suffix, but only when a label remains in front of it.
  const tldCount =
    labels.length >= 3 && topLevel.length === 2 && COMPOUND_TLD_SECOND_LEVELS.has(secondLevel)
      ? 2
      : 1;

  return labels[labels.length - 1 - tldCount];
}

export async function getCurrentTab() {
const [tab] = await chrome.tabs.query({
active: true,
Expand Down

0 comments on commit 8ce73f1

Please sign in to comment.