Updated request using pool and added device parameter
alvaro-escalante committed Oct 9, 2021
1 parent 026b06d commit 58ab057
Showing 4 changed files with 112 additions and 85 deletions.
146 changes: 71 additions & 75 deletions google-index-checker.js
@@ -1,37 +1,47 @@
 // Required Modules
 import 'dotenv/config'
 import chalk from 'chalk' // Terminal and string styling
-import { createWriteStream } from 'fs' // Node file system module
-import axios from 'axios' // Axios client
+import { createWriteStream, writeFileSync } from 'fs' // Node file system module
 import { access } from 'fs/promises' // Promises Node file system module
 import { parseCSV } from './lib/parser.js' // Convert csv to json module
+import axios from 'axios' // Axios client
-import { requestUrl, compareUrl } from './lib/url-encoder.js' // Encoding functions
+import { googlelify, encodeURL } from './lib/url-encoder.js' // Encoding functions
 import { timer } from './lib/timer.js' // Timer function
+import { poolRequest } from './lib/poolRequest.js'
+import sanitizeHtml from 'sanitize-html'

 // Settings
 const { yellow, cyan, white, red, green } = chalk
 const start = Date.now() // Date counter to check duration of script
 const site = 'https://www.google.com/search?q=' // Google search query
 const urlsFile = './urls.csv' // File containing all the urls to check
 const apiUrl = 'http://api.scraperapi.com/?api_key=' // ScraperAPI url
+const params = '&device_type=desktop'
 const apiKey = process.env.SCRAPERAPI_KEY

-let totalUrls = 0
-let notIndexCounter = 0
-let instances = 0
 let urls = []
-// Check if file exist and count number of urls, if it does not exists, exit with message
+let count = 1
+let notIndexedCounter = 0

+// Collect URLS, get concurrent max and run requests in pool
 ;(async () => {
+  urls = await getUrls()
+  const concurrent = await getConcurrent()
+  await poolRequest(concurrent, [...urls], runRequest, 'timeout')
+  finalMessage(urls.length)
+})()

+// Gather URLS from file
+async function getUrls() {
   try {
     await access(urlsFile)
-    urls = await parseCSV(urlsFile)
-    totalUrls = urls.length
-    batchRequest()
+    return await parseCSV(urlsFile)
   } catch {
     console.log(yellow('No urls.csv file found.'))
-    process.exit()
+    process.exit(1)
   }
-})()
+}

 // Connect to API to get allowed number of concurrent requests
 async function getConcurrent() {
   try {
     const { data } = await axios(`http://api.scraperapi.com/account?api_key=${apiKey}`)
@@ -42,63 +52,36 @@ async function getConcurrent() {
       `${error.response.status} - Incorrect or missing API key please check your APIKEY.js file and make sure it includes a correct API key from https://www.scraperapi.com/`
     )
   } else {
-    console.error('There is a problem connecting to Scraperapi')
-    process.exit(1)
+    console.error('There is a problem connecting to Scraperapi, please try again later')
+    process.exit()
   }
 }
 }

-// Batch request with maximun according to account
-async function batchRequest() {
-  console.log(green('Requesting...'))
-  instances = 1

-  const data = [...urls]
-  const concurrent = await getConcurrent()
+// HTTP request async
+async function runRequest(url) {
+  try {
+    // Prepare url to search like google does
+    const requestUrl = googlelify(url)

-  while (data.length) {
-    await new Promise((resolve) => setTimeout(resolve, 1000))
+    // HTTP request using axios, scraperapi, google and the encoded url
+    const { data, status } = await axios(
+      `${apiUrl}${apiKey}&url=${site}${requestUrl}${params}`
+    )

-    const batch = data.splice(0, concurrent).map((url) => runRequest(url, urls.length))
+    // Check if it matches google search results
+    const indexation = matchResponse(url, data)

-    const results = await Promise.allSettled(batch)
+    // Print to terminal each url, its number and status code
+    const counter = `${count++}/${urls.length}`
+    const statusPrint = green.bold(status)
+    const indexPrint = white.bold(indexation)

-    for (const { status, reason } of results) {
-      if (status === 'rejected') data.push(reason)
-    }
-  }
+    console.log(cyan(`Checking: ${counter} ${url} ${statusPrint} ${indexPrint}`))

-  finalMessage()
-}
+    // Create, append and clear stream
+    const stream = createWriteStream('./results.csv', 'utf8')

-// HTTP request async promise
-async function runRequest(url, len) {
-  // Make requests with encoded URL through axios with header options and write it to results
-  try {
-    // URL encoded for request
-    const request = requestUrl(url)
-    // URL encoded to check indexation
-    const compare = compareUrl(url, false)
-    // URL encoded for discrepancies
-    const utfEncoded = compareUrl(url, true)
-    // HTTPS request using axios, scraperapi, google and the enconded url
-    const res = await axios(`${apiUrl}${apiKey}&url=${site}${request}`)
-    const indexation = matchResponse(url, res.data, compare, utfEncoded)
-      ? 'Indexed'
-      : 'Not Indexed'
-    // Print to terminal each url, its number and status code
-    console.log(
-      cyan(
-        `Checking: ${instances++}/${len} ${url} ${green.bold(res.status)} ${white.bold(
-          indexation
-        )}`
-      )
-    )
-    // Create append stream, clear
-    const stream = createWriteStream('./results.csv', {
-      flags: 'a',
-      encoding: 'utf8'
-    })
     // Append evaluation from response to file
     stream.write(`${url}, ${indexation}\n`)
     // End stream to avoid accumulation
@@ -107,38 +90,51 @@ async function runRequest(url, len) {
     // Request made and server responded
     const status = error.response ? error.response.status : 500

+    if (status === 429) {
+      console.error('Too many requests, something went wrong, check with ScraperAPI')
+      process.exit(1)
+    }

     // Log with different color to highlight the error
-    console.error(yellow(`Error: ${url} ${red(status)} ${green('Repeating')}`))
+    console.error(yellow(`Error: ${url} ${red(status)} ${green('Re-trying')}`))

-    throw url
+    throw {
+      name: 'timeout',
+      error: url
+    }
   }
 }

 // Compare url against google response url
-function matchResponse(url, res, compare, utfEncoded) {
-  let matchURL = false
-
-  if (url.includes(`'`)) {
-    matchURL = res.includes(`href="${compare}"`) | res.includes(`href="${utfEncoded}"`)
+function matchResponse(url, res) {
+  // Look for a tags with href attribute only
+  const content = sanitizeHtml(res, {
+    allowedTags: ['a'],
+    allowedAttributes: { a: ['href'] }
+  })
+
+  // Set not indexed to start
+  let indexResult = 'Not Indexed'
+
+  // If the encoded version of the URL is on google
+  if (content.includes(`href="${encodeURL(url)}"`)) {
+    indexResult = 'Indexed'
   } else {
-    matchURL = res.includes(`href="${compare}"`)
+    notIndexedCounter += 1
   }

-  // Counter for each not indexed
-  if (!matchURL) notIndexCounter += 1
-
-  return matchURL
+  return indexResult
 }

-function finalMessage() {
+function finalMessage(totalUrls) {
console.log(
`\n${totalUrls} URLS, results.csv file successfully written in ${timer(
Date.now() - start
)}\n`
)
console.log(
-    `${green.bold(`Indexed: ` + (totalUrls - notIndexCounter))}\n${red.bold(
-      `Not indexed: ` + notIndexCounter + `\n`
+    `${green.bold(`Indexed: ` + (totalUrls - notIndexedCounter))}\n${red.bold(
+      `Not indexed: ` + notIndexedCounter + `\n`
)}`
)
}
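
A note on the new matching approach: the sketch below shows what the sanitize-html step reduces a results page to before the href comparison runs. The HTML fragment and URL are invented for illustration; only the options visible in the diff are assumed.

import sanitizeHtml from 'sanitize-html'

// Invented stand-in for a Google results page
const page =
  '<div class="g"><a href="https://example.com/page"><h3>Title</h3></a><script>track()</script></div>'

// Keep anchor tags and their href attributes only, as the new matchResponse does
const content = sanitizeHtml(page, {
  allowedTags: ['a'],
  allowedAttributes: { a: ['href'] }
})

console.log(content) // <a href="https://example.com/page">Title</a>
console.log(content.includes('href="https://example.com/page"')) // true, so 'Indexed'

Reducing the page to anchors first means the href check cannot be fooled by the URL merely appearing in scripts or visible text.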
29 changes: 29 additions & 0 deletions lib/poolRequest.js
@@ -0,0 +1,29 @@
+// Pool request
+export async function poolRequest(poolLimit, array, iteratorFn, exception) {
+  const promises = []
+  const racers = []
+
+  for (const item of array) {
+    const pro = Promise.resolve().then(() => iteratorFn(item, array))
+    promises.push(pro)
+
+    if (poolLimit <= array.length) {
+      const racer = pro.then(() => racers.splice(racers.indexOf(racer), 1))
+      racers.push(racer)
+      if (racers.length >= poolLimit) {
+        await Promise.race(racers).catch((err) => console.log(err))
+      }
+    }
+  }
+
+  const results = await Promise.allSettled(promises)
+
+  for (const { status, reason } of results) {
+    if (status === 'rejected') {
+      const { name, error } = reason
+      if (name === exception) {
+        await poolRequest(poolLimit, [error], iteratorFn, 'timeout')
+      }
+    }
+  }
+}
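
A minimal usage sketch of the new pool helper. The task function, delays, and forced failure are invented for illustration; it assumes Node with ES modules, so top-level await works.

import { poolRequest } from './lib/poolRequest.js'

let failedOnce = false

// Fake async task standing in for runRequest
const task = async (n) => {
  await new Promise((resolve) => setTimeout(resolve, 100))
  if (n === 3 && !failedOnce) {
    failedOnce = true
    // Same shape runRequest throws, so the pool re-queues it
    throw { name: 'timeout', error: n }
  }
  console.log(`done ${n}`)
}

// At most 2 tasks in flight; rejections named 'timeout' are re-run
await poolRequest(2, [1, 2, 3, 4, 5], task, 'timeout')

The racers array caps how many promises run at once, while promises collects every outcome for Promise.allSettled, which is what lets 'timeout' rejections be fed back through the pool.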
15 changes: 8 additions & 7 deletions lib/url-encoder.js
@@ -1,5 +1,5 @@
 // Encode URL as google does by removing quotes or their encoded version
-export const requestUrl = (url) => {
+export const googlelify = (url) => {
   // DecodeURI if not malformed
   try {
     url = decodeURI(url)
@@ -17,7 +17,7 @@ export const requestUrl = (url) => {
 }

 // The URL used to compare against google source code
-export const compareUrl = (url, utfChar) => {
+export const encodeURL = (url) => {
   // We got a url that already has encoding so return as it is
   if (url.includes('%')) {
     url = url.replace(/ /g, '%20')
@@ -26,16 +26,17 @@ export const compareUrl = (url, utfChar) => {
   // We got a clean url that needs encoding
   url = encodeURI(url)
   // We have a list of reserve characters we need to un-encode
-  const reserve = { '%5B': '[', '%5D': ']' }
+  const reserve = {}
+  reserve[`%5B`] = '['
+  reserve[`%5D`] = ']'
+  reserve[`'`] = '%27'

   // We create a regex for those reserve characters
   const regex = new RegExp(Object.keys(reserve).join('|'), 'g')
   // And replace only those present
-  url = url.replace(regex, (match) => reserve[match]).replace(/^%22|%22$/g, '')
+  url = url.replace(regex, (match) => reserve[match])
 }

-  // Create a different url for servers that encode apostophe
-  if (utfChar) url = url.replace(/\'/g, '%27')

   // Covert & to &amp;
   return url.replace(/&/g, '&amp;')
 }
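
For reference, a sketch of what the renamed encodeURL returns, with the output worked out from the branches above; the example URL is invented.

import { encodeURL } from './lib/url-encoder.js'

// Clean URL: encodeURI percent-encodes the space and brackets, the reserve
// map restores [ and ] and swaps ' for %27, then & becomes &amp;
console.log(encodeURL("https://example.com/it's [new]?a=1&b=2"))
// -> https://example.com/it%27s%20[new]?a=1&amp;b=2

The bracket and apostrophe rules are needed because encodeURI percent-encodes [ and ] but leaves ' untouched, presumably the opposite of how they appear in Google's markup.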
7 changes: 4 additions & 3 deletions package.json
@@ -1,6 +1,6 @@
 {
   "name": "google-index-checker",
-  "version": "3.2.0",
+  "version": "3.3.0",
   "license": "MIT",
   "description": "Check Google indexation status at scale",
   "author": "Álvaro Fernández & José Hernando | Builtvsible",
@@ -18,6 +18,7 @@
"axios": "^0.21.4",
"chalk": "^2.4.2",
"csv-parser": "^3.0.0",
"dotenv": "^10.0.0"
"dotenv": "^10.0.0",
"sanitize-html": "^2.5.1"
}
}
}
