Commit: Create top-1m-pricelink-to-cache.ts
wanghaisheng authored Jan 15, 2025
1 parent b847ba7 commit 325845c
Showing 1 changed file with 219 additions and 0 deletions.
219 changes: 219 additions & 0 deletions prices/top-1m-pricelink-to-cache.ts
@@ -0,0 +1,219 @@
import fs from 'fs';
import process from 'node:process';
import { execSync } from 'node:child_process';
import * as unzipper from 'unzipper';
import csvParser from 'csv-parser';
import axios from 'axios'; // HTTP client providing the get()/status/data API used below
import Database from 'better-sqlite3';
import * as xml2js from 'xml2js';

// Constants for downloading and processing
const ZIP_FILE_URL = 'https://tranco-list.eu/top-1m.csv.zip';
const ZIP_FILE_PATH = 'top-1m.csv.zip';
const EXTRACTED_CSV_PATH = 'top-1m.csv'; // This will be used after unzipping the file

// Configuration options
const MAX_CONCURRENT_REQUESTS = 100;
const RATE_LIMIT_DELAY = 1000;
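// With these defaults the script issues at most 100 concurrent requests per batch
// and sleeps 1 s between batches, i.e. an upper bound of roughly 100 domains/s
// (less in practice, since each domain may need several sequential fetches).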

const filePath = process.argv[2];
if (!filePath) {
  console.error(`File path missing. Usage:
  $ pnpm tsx prices/top-1m-pricelink-to-cache.ts data/persisted-to-cache/database.db
`);
  process.exit(1);
}

// Database connection
const db = new Database(filePath);
db.pragma('journal_mode = WAL'); // WAL mode: readers do not block this writer

// Create the results table if it does not exist
db.prepare(`
  CREATE TABLE IF NOT EXISTS domain_results (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    domain TEXT NOT NULL,
    found_price_urls TEXT,
    checked_urls TEXT,
    timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
  )
`).run();
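// found_price_urls and checked_urls are stored as JSON-encoded arrays, so they
// can be unpacked later with SQLite's JSON1 functions, e.g.:
//   SELECT domain, value FROM domain_results, json_each(found_price_urls);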

// Function to download and unzip the file
const downloadAndUnzip = async (): Promise<void> => {
  console.log(`Downloading file from ${ZIP_FILE_URL}...`);
  execSync(`wget -q -O ${ZIP_FILE_PATH} ${ZIP_FILE_URL}`); // requires wget on PATH
  console.log('Unzipping the downloaded file...');
  // Wait for extraction to finish before the caller reads the CSV
  await new Promise<void>((resolve, reject) => {
    fs.createReadStream(ZIP_FILE_PATH)
      .pipe(unzipper.Extract({ path: '.' }))
      .on('close', resolve)
      .on('error', reject);
  });
  console.log('File unzipped successfully.');
};

// Function to parse the CSV and extract domains.
// The Tranco CSV has no header row; each line is `rank,domain`.
const parseDomainsFromCSV = async (csvFilePath: string): Promise<string[]> => {
  return new Promise((resolve, reject) => {
    const domains: string[] = [];
    fs.createReadStream(csvFilePath)
      .pipe(csvParser({ headers: ['rank', 'domain'] }))
      .on('data', (row) => {
        // The domain is in the second column
        domains.push(row.domain);
      })
      .on('end', () => {
        console.log(`Parsed ${domains.length} domains.`);
        resolve(domains);
      })
      .on('error', reject);
  });
};

// Function to fetch robots.txt
const fetchRobotsTxt = async (domain: string): Promise<string | null> => {
  const url = `http://${domain}/robots.txt`;
  try {
    // 10 s timeout so unresponsive hosts don't stall a whole batch
    const response = await axios.get(url, { timeout: 10000 });
    if (response.status === 200) {
      return response.data;
    }
    return null;
  } catch (error) {
    console.error(`Error fetching robots.txt for ${domain}: ${error}`);
    return null;
  }
};

// Function to parse the sitemap URLs out of a robots.txt body
const parseSitemapUrls = (robotsTxt: string): string[] => {
  const sitemapUrls: string[] = [];
  const lines = robotsTxt.split('\n');
  for (const line of lines) {
    if (line.toLowerCase().startsWith('sitemap:')) {
      // Take everything after the first ':' so the URL's own '://' survives
      // (split(':', 2) would truncate the value at 'https')
      const sitemapUrl = line.slice(line.indexOf(':') + 1).trim();
      if (sitemapUrl) sitemapUrls.push(sitemapUrl);
    }
  }
  return sitemapUrls;
};

// Function to fetch and parse the sitemap for price URLs
const fetchAndParseSitemap = async (sitemapUrl: string): Promise<string[]> => {
  const foundPriceUrls: string[] = [];
  try {
    const response = await axios.get(sitemapUrl, { timeout: 10000 });
    if (response.status === 200) {
      const parser = new xml2js.Parser();
      const result = await parser.parseStringPromise(response.data);
      // Only flat <urlset> sitemaps carry page URLs; guard against other roots
      const urls = result?.urlset?.url || [];
      for (const url of urls) {
        const loc = url.loc && url.loc[0];
        if (loc && loc.includes('price')) {
          foundPriceUrls.push(loc);
        }
      }
    }
  } catch (error) {
    console.error(`Error fetching or parsing sitemap ${sitemapUrl}: ${error}`);
  }
  return foundPriceUrls;
};

// Function to check fallback URLs if no price URLs are found
const checkFallbackUrls = async (domain: string): Promise<string[]> => {
  const urlsToCheck = [`http://${domain}/price`, `http://${domain}/price-plan`];
  const checkedUrls: string[] = [];
  for (const url of urlsToCheck) {
    try {
      // validateStatus accepts any status so a 404 is recorded, not thrown
      const response = await axios.get(url, { timeout: 10000, validateStatus: () => true });
      if (response.status === 404) {
        checkedUrls.push(`404 Not Found: ${url}`);
      } else {
        checkedUrls.push(`Found: ${url} (Status: ${response.status})`);
      }
    } catch (error) {
      checkedUrls.push(`Error: ${url} - ${error}`);
    }
  }
  return checkedUrls;
};

// Prepare the insert once; better-sqlite3 statements are reusable and synchronous
const insertResult = db.prepare(`
  INSERT INTO domain_results (domain, found_price_urls, checked_urls)
  VALUES (@domain, @found_price_urls, @checked_urls)
`);

// Function to save one domain's results
const saveDomainResults = (domain: string, foundPriceUrls: string[], checkedUrls: string[]): void => {
  insertResult.run({
    domain,
    found_price_urls: JSON.stringify(foundPriceUrls),
    checked_urls: JSON.stringify(checkedUrls)
  });
};

// Function to process domains in batches with bounded concurrency
const processDomains = async (domains: string[]): Promise<void> => {
  const queue = [...domains];

  const processDomain = async (domain: string) => {
    console.log(`Checking domain: ${domain}`);
    const robotsTxt = await fetchRobotsTxt(domain);
    let foundPriceUrls: string[] = [];
    let checkedUrls: string[] = [];

    if (robotsTxt) {
      const sitemapUrls = parseSitemapUrls(robotsTxt);
      for (const sitemapUrl of sitemapUrls) {
        const priceUrls = await fetchAndParseSitemap(sitemapUrl);
        foundPriceUrls = foundPriceUrls.concat(priceUrls);
      }
    }

    // Fall back to probing common price paths only when the sitemaps yield nothing
    if (foundPriceUrls.length === 0) {
      checkedUrls = await checkFallbackUrls(domain);
    }

    saveDomainResults(domain, foundPriceUrls, checkedUrls);
    console.log(`Results saved for domain: ${domain}`);
  };

  // Drain the queue one batch at a time; a loop avoids the unbounded recursion
  // of a runBatch() that calls itself for ~10,000 batches of a 1M-domain list
  while (queue.length > 0) {
    const domainBatch = queue.splice(0, MAX_CONCURRENT_REQUESTS);
    await Promise.all(domainBatch.map((domain) => processDomain(domain)));

    if (queue.length > 0) {
      // Rate limit between batches
      await new Promise((resolve) => setTimeout(resolve, RATE_LIMIT_DELAY));
    }
  }

  console.log('All domains processed and results saved.');
};

// Main function to download, unzip, parse CSV, and process domains
const main = async () => {
  try {
    await downloadAndUnzip(); // Download and unzip the file
    const domains = await parseDomainsFromCSV(EXTRACTED_CSV_PATH); // Parse domains from CSV
    await processDomains(domains); // Process domains in batches
  } catch (error) {
    console.error('Error in processing domains:', error);
  }
};

main();
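// Usage sketch (package names are taken from the imports above; wget must be on PATH):
//   pnpm add better-sqlite3 unzipper csv-parser axios xml2js
//   pnpm tsx prices/top-1m-pricelink-to-cache.ts data/persisted-to-cache/database.db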
