From 58ab05730fe530d7963c8388cd6bdb5f058321d8 Mon Sep 17 00:00:00 2001
From: alvaro-naves
Date: Sat, 9 Oct 2021 16:24:28 +0100
Subject: [PATCH] Updated request using pool and added device parameter

---
 google-index-checker.js | 146 +++++++++++++++++++---------------------
 lib/poolRequest.js      |  29 ++++++++
 lib/url-encoder.js      |  15 +++--
 package.json            |   5 +-
 4 files changed, 111 insertions(+), 84 deletions(-)
 create mode 100644 lib/poolRequest.js

diff --git a/google-index-checker.js b/google-index-checker.js
index d85dce7..c27d8f8 100644
--- a/google-index-checker.js
+++ b/google-index-checker.js
@@ -1,37 +1,47 @@
 // Required Modules
 import 'dotenv/config'
 import chalk from 'chalk' // Terminal and string styling
-import { createWriteStream } from 'fs' // Node file system module
+import axios from 'axios' // Axios client
+import { createWriteStream } from 'fs' // Node file system module
 import { access } from 'fs/promises' // Promises Node file system module
 import { parseCSV } from './lib/parser.js' // Convert csv to json module
-import axios from 'axios' // Axios client
-import { requestUrl, compareUrl } from './lib/url-encoder.js' // Encoding functions
+import { googlelify, encodeURL } from './lib/url-encoder.js' // Encoding functions
 import { timer } from './lib/timer.js' // Timer function
+import { poolRequest } from './lib/poolRequest.js' // Concurrency pool
+import sanitizeHtml from 'sanitize-html' // Strips HTML down to allowed tags
+
 // Settings
 const { yellow, cyan, white, red, green } = chalk
 const start = Date.now() // Date counter to check duration of script
 const site = 'https://www.google.com/search?q=' // Google search query
 const urlsFile = './urls.csv' // File containing all the urls to check
 const apiUrl = 'http://api.scraperapi.com/?api_key=' // ScraperAPI url
+const params = '&device_type=desktop' // ScraperAPI device parameter
 const apiKey = process.env.SCRAPERAPI_KEY
-let totalUrls = 0
-let notIndexCounter = 0
-let instances = 0
-let urls = []
+let urls = [] // Filled at startup; runRequest reads urls.length for its counter
+let count = 1
+let notIndexedCounter = 0

-// Check if file exist and count number of urls, if it does not exists, exit with message
+// Collect URLs, get the allowed concurrency and run the requests in a pool
 ;(async () => {
+  urls = await getUrls()
+  const concurrent = await getConcurrent()
+  await poolRequest(concurrent, [...urls], runRequest, 'timeout')
+  finalMessage(urls.length)
+})()
+
+// Gather URLs from file
+async function getUrls() {
   try {
     await access(urlsFile)
-    urls = await parseCSV(urlsFile)
-    totalUrls = urls.length
-    batchRequest()
+    return await parseCSV(urlsFile)
   } catch {
     console.log(yellow('No urls.csv file found.'))
-    process.exit()
+    process.exit(1)
   }
-})()
+}

+// Connect to the API to get the allowed number of concurrent requests
 async function getConcurrent() {
   try {
     const { data } = await axios(`http://api.scraperapi.com/account?api_key=${apiKey}`)
@@ -42,63 +52,36 @@ async function getConcurrent() {
         `${error.response.status} - Incorrect or missing API key please check your APIKEY.js file and make sure it includes a correct API key from https://www.scraperapi.com/`
       )
     } else {
-      console.error('There is a problem connecting to Scraperapi')
-      process.exit(1)
+      console.error('There is a problem connecting to ScraperAPI, please try again later')
+      process.exit(1)
     }
   }
 }

-// Batch request with maximun according to account
-async function batchRequest() {
-  console.log(green('Requesting...'))
-  instances = 1
-
-  const data = [...urls]
-  const concurrent = await getConcurrent()
+// Request a single url and record its indexation status
+async function runRequest(url) {
+  try {
+    // Prepare the url to search the way Google formats it
+    const requestUrl = googlelify(url)

-  while (data.length) {
-    await new Promise((resolve) => setTimeout(resolve, 1000))
+    // HTTP request through axios and ScraperAPI, querying Google with the encoded url
+    const { data, status } = await axios(
+      `${apiUrl}${apiKey}&url=${site}${requestUrl}${params}`
+    )

-    const batch = data.splice(0, concurrent).map((url) => runRequest(url, urls.length))
+    // Check whether the url appears in Google's search results
+    const indexation = matchResponse(url, data)

-    const results = await Promise.allSettled(batch)
+    // Print each url to the terminal with its counter, status code and result
+    const counter = `${count++}/${urls.length}`
+    const statusPrint = green.bold(status)
+    const indexPrint = white.bold(indexation)

-    for (const { status, reason } of results) {
-      if (status === 'rejected') data.push(reason)
-    }
-  }
+    console.log(cyan(`Checking: ${counter} ${url} ${statusPrint} ${indexPrint}`))

-  finalMessage()
-}
+    // Open results.csv in append mode so concurrent requests accumulate rows
+    const stream = createWriteStream('./results.csv', { flags: 'a', encoding: 'utf8' })

-// HTTP request async promise
-async function runRequest(url, len) {
-  // Make requests with encoded URL through axios with header options and write it to results
-  try {
-    // URL encoded for request
-    const request = requestUrl(url)
-    // URL encoded to check indexation
-    const compare = compareUrl(url, false)
-    // URL encoded for discrepancies
-    const utfEncoded = compareUrl(url, true)
-    // HTTPS request using axios, scraperapi, google and the enconded url
-    const res = await axios(`${apiUrl}${apiKey}&url=${site}${request}`)
-    const indexation = matchResponse(url, res.data, compare, utfEncoded)
-      ? 'Indexed'
-      : 'Not Indexed'
-    // Print to terminal each url, its number and status code
-    console.log(
-      cyan(
-        `Checking: ${instances++}/${len} ${url} ${green.bold(res.status)} ${white.bold(
-          indexation
-        )}`
-      )
-    )
-    // Create append streamclear
-    const stream = createWriteStream('./results.csv', {
-      flags: 'a',
-      encoding: 'utf8'
-    })
     // Append evaluation from response to file
     stream.write(`${url}, ${indexation}\n`)
     // End stream to avoid accumulation
@@ -107,38 +90,51 @@ async function runRequest(url, len) {
     // Request made and server responded
     const status = error.response ? error.response.status : 500

+    if (status === 429) {
+      console.error('Too many requests, something went wrong, check with ScraperAPI')
+      process.exit(1)
+    }
+
     // Log with different color to highlight the error
-    console.error(yellow(`Error: ${url} ${red(status)} ${green('Repeating')}`))
+    console.error(yellow(`Error: ${url} ${red(status)} ${green('Retrying')}`))

-    throw url
+    throw {
+      name: 'timeout', // Matches the pool's exception tag so the url is re-queued
+      error: url
+    }
   }
 }

 // Compare url against google response url
-function matchResponse(url, res, compare, utfEncoded) {
-  let matchURL = false
-
-  if (url.includes(`'`)) {
-    matchURL = res.includes(`href="${compare}"`) | res.includes(`href="${utfEncoded}"`)
+function matchResponse(url, res) {
+  // Strip the response down to <a> tags with their href attributes
+  const content = sanitizeHtml(res, {
+    allowedTags: ['a'],
+    allowedAttributes: { a: ['href'] }
+  })
+
+  // Default to Not Indexed
+  let indexResult = 'Not Indexed'
+
+  // The url counts as indexed if its encoded form appears in a result link
+  if (content.includes(`href="${encodeURL(url)}"`)) {
+    indexResult = 'Indexed'
   } else {
-    matchURL = res.includes(`href="${compare}"`)
+    notIndexedCounter += 1
   }

-  // Counter foreach not index
-  if (!matchURL) notIndexCounter += 1
-
-  return matchURL
+  return indexResult
 }

-function finalMessage() {
+function finalMessage(totalUrls) {
   console.log(
     `\n${totalUrls} URLS, results.csv file successfully written in ${timer(
       Date.now() - start
     )}\n`
   )
   console.log(
-    `${green.bold(`Indexed: ` + (totalUrls - notIndexCounter))}\n${red.bold(
-      `Not indexed: ` + notIndexCounter + `\n`
+    `${green.bold(`Indexed: ` + (totalUrls - notIndexedCounter))}\n${red.bold(
+      `Not indexed: ` + notIndexedCounter + `\n`
     )}`
   )
 }
diff --git a/lib/poolRequest.js b/lib/poolRequest.js
new file mode 100644
index 0000000..51752cb
--- /dev/null
+++ b/lib/poolRequest.js
@@ -0,0 +1,29 @@
+// Promise pool: run iteratorFn over array with at most poolLimit calls in flight
+export async function poolRequest(poolLimit, array, iteratorFn, exception) {
+  const promises = []
+  const racers = []
+
+  for (const item of array) {
+    const pro = Promise.resolve().then(() => iteratorFn(item, array))
+    promises.push(pro)
+
+    if (poolLimit <= array.length) {
+      const racer = pro.then(() => racers.splice(racers.indexOf(racer), 1)) // Frees its slot once settled
+      racers.push(racer)
+      if (racers.length >= poolLimit) {
+        await Promise.race(racers).catch((err) => console.log(err)) // Wait for a slot; failures resurface via allSettled
+      }
+    }
+  }
+
+  const results = await Promise.allSettled(promises)
+
+  for (const { status, reason } of results) {
+    if (status === 'rejected') {
+      const { name, error } = reason
+      if (name === exception) { // Re-queue rejections that carry the retry tag
+        await poolRequest(poolLimit, [error], iteratorFn, exception)
+      }
+    }
+  }
+}
diff --git a/lib/url-encoder.js b/lib/url-encoder.js
index 3fd3b43..c28d902 100644
--- a/lib/url-encoder.js
+++ b/lib/url-encoder.js
@@ -1,5 +1,5 @@
 // Encode URL as google does by removing quotes or their encoded version
-export const requestUrl = (url) => {
+export const googlelify = (url) => {
   // DecodeURI if not malformed
   try {
     url = decodeURI(url)
@@ -17,7 +17,7 @@ export const requestUrl = (url) => {
 }

 // The URL used to compare against google source code
-export const compareUrl = (url, utfChar) => {
+export const encodeURL = (url) => {
   // We got a url that already has encoding so return as it is
   if (url.includes('%')) {
     url = url.replace(/ /g, '%20')
@@ -26,16 +26,17 @@ export const compareUrl = (url, utfChar) => {
     // We got a clean url that needs encoding
     url = encodeURI(url)
     // We have a list of reserve characters we need to un-encode
-    const reserve = { '%5B': '[', '%5D': ']' }
+    const reserve = {}
+    reserve[`%5B`] = '['
+    reserve[`%5D`] = ']'
+    reserve[`'`] = '%27' // encodeURI leaves apostrophes alone, but Google prints them as %27
+
     // We create a regex for those reserve characters
     const regex = new RegExp(Object.keys(reserve).join('|'), 'g')
     // And replace only those present
-    url = url.replace(regex, (match) => reserve[match]).replace(/^%22|%22$/g, '')
+    url = url.replace(regex, (match) => reserve[match])
   }

-  // Create a different url for servers that encode apostophe
-  if (utfChar) url = url.replace(/\'/g, '%27')
-
   // Covert & to &amp;
   return url.replace(/&/g, '&amp;')
 }
diff --git a/package.json b/package.json
index 869c70b..e67c780 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "google-index-checker",
-  "version": "3.2.0",
+  "version": "3.3.0",
   "license": "MIT",
   "description": "Check Google indexation status at scale",
   "author": "Álvaro Fernández & José Hernando | Builtvsible",
@@ -18,6 +18,7 @@
     "axios": "^0.21.4",
     "chalk": "^2.4.2",
     "csv-parser": "^3.0.0",
-    "dotenv": "^10.0.0"
+    "dotenv": "^10.0.0",
+    "sanitize-html": "^2.5.1"
   }
 }
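
For reference, a minimal sketch of how poolRequest is meant to be driven, assuming
Node 14.8+ ESM for top-level await; the file name, the demo URLs, the fakeRequest
helper and the simulated failure are all made up for illustration:

  // demo-pool.js: hypothetical driver for lib/poolRequest.js
  import { poolRequest } from './lib/poolRequest.js'

  const demoUrls = ['https://example.com/a', 'https://example.com/b', 'https://example.com/c']

  let calls = 0
  async function fakeRequest(url) {
    // Fail the first call with the same tagged shape runRequest throws,
    // so poolRequest re-queues that url for another attempt
    calls += 1
    if (calls === 1) throw { name: 'timeout', error: url }
    console.log(`done: ${url}`)
  }

  // At most 2 promises in flight; rejections tagged 'timeout' are retried
  await poolRequest(2, demoUrls, fakeRequest, 'timeout')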
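
Similarly, a sketch of the outputs the reworked encodeURL should produce, with
illustrative URLs; the expected values follow from encodeURI plus the reserve map
and the final &-to-&amp; replacement:

  import { encodeURL } from './lib/url-encoder.js'

  // Illustrative inputs; expected outputs shown in the trailing comments
  console.log(encodeURL('https://example.com/página'))   // https://example.com/p%C3%A1gina
  console.log(encodeURL("https://example.com/it's"))     // https://example.com/it%27s (apostrophe forced to %27)
  console.log(encodeURL('https://example.com/a[1]'))     // https://example.com/a[1] ([ and ] un-encoded)
  console.log(encodeURL('https://example.com/?a=1&b=2')) // https://example.com/?a=1&amp;b=2 (as Google prints links)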
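
And a sketch of what the sanitize-html call in matchResponse leaves behind, using
a made-up fragment of a results page:

  import sanitizeHtml from 'sanitize-html'

  // A made-up fragment of a Google results page
  const html = '<div class="g"><a href="https://example.com/" ping="/url?q=x">Example</a></div>'

  const content = sanitizeHtml(html, {
    allowedTags: ['a'],
    allowedAttributes: { a: ['href'] }
  })

  // Disallowed tags are dropped but their contents kept, and only href survives:
  // <a href="https://example.com/">Example</a>
  console.log(content)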