From 58ab05730fe530d7963c8388cd6bdb5f058321d8 Mon Sep 17 00:00:00 2001
From: alvaro-naves
Date: Sat, 9 Oct 2021 16:24:28 +0100
Subject: [PATCH] Updated request using pool and added device parameter

---
 google-index-checker.js | 146 +++++++++++++++++++---------------------
 lib/poolRequest.js      |  29 ++++++++
 lib/url-encoder.js      |  15 +++--
 package.json            |   5 +-
 4 files changed, 111 insertions(+), 84 deletions(-)
 create mode 100644 lib/poolRequest.js

diff --git a/google-index-checker.js b/google-index-checker.js
index d85dce7..c27d8f8 100644
--- a/google-index-checker.js
+++ b/google-index-checker.js
@@ -1,37 +1,47 @@
 // Required Modules
 import 'dotenv/config'
 import chalk from 'chalk' // Terminal and string styling
-import { createWriteStream } from 'fs' // Node file system module
+import axios from 'axios' // Axios client
+import { createWriteStream } from 'fs' // Node file system module
 import { access } from 'fs/promises' // Promises Node file system module
 import { parseCSV } from './lib/parser.js' // Convert csv to json module
-import axios from 'axios' // Axios client
-import { requestUrl, compareUrl } from './lib/url-encoder.js' // Encoding functions
+import { googlelify, encodeURL } from './lib/url-encoder.js' // Encoding functions
 import { timer } from './lib/timer.js' // Timer function
+import { poolRequest } from './lib/poolRequest.js' // Concurrency pool
+import sanitizeHtml from 'sanitize-html' // Strips HTML down to allowed tags
+
 // Settings
 const { yellow, cyan, white, red, green } = chalk
 const start = Date.now() // Date counter to check duration of script
 const site = 'https://www.google.com/search?q=' // Google search query
 const urlsFile = './urls.csv' // File containing all the urls to check
 const apiUrl = 'http://api.scraperapi.com/?api_key=' // ScraperAPI url
+const params = '&device_type=desktop' // ScraperAPI device parameter
 const apiKey = process.env.SCRAPERAPI_KEY
-let totalUrls = 0
-let notIndexCounter = 0
-let instances = 0
-let urls = []
+let urls = [] // Filled at startup; runRequest reads urls.length for its counter
+let count = 1
+let notIndexedCounter = 0

-// Check if file exist and count number of urls, if it does not exists, exit with message
+// Collect URLs, get the allowed concurrency and run the requests in a pool
 ;(async () => {
+  urls = await getUrls()
+  const concurrent = await getConcurrent()
+  await poolRequest(concurrent, [...urls], runRequest, 'timeout')
+  finalMessage(urls.length)
+})()
+
+// Gather URLs from file
+async function getUrls() {
   try {
     await access(urlsFile)
-    urls = await parseCSV(urlsFile)
-    totalUrls = urls.length
-    batchRequest()
+    return await parseCSV(urlsFile)
   } catch {
     console.log(yellow('No urls.csv file found.'))
-    process.exit()
+    process.exit(1)
   }
-})()
+}

+// Connect to the API to get the allowed number of concurrent requests
 async function getConcurrent() {
   try {
     const { data } = await axios(`http://api.scraperapi.com/account?api_key=${apiKey}`)
@@ -42,63 +52,36 @@ async function getConcurrent() {
         `${error.response.status} - Incorrect or missing API key please check your APIKEY.js file and make sure it includes a correct API key from https://www.scraperapi.com/`
       )
     } else {
-      console.error('There is a problem connecting to Scraperapi')
-      process.exit(1)
+      console.error('There is a problem connecting to ScraperAPI, please try again later')
+      process.exit(1)
     }
   }
 }

-// Batch request with maximun according to account
-async function batchRequest() {
-  console.log(green('Requesting...'))
-  instances = 1
-
-  const data = [...urls]
-  const concurrent = await getConcurrent()
+// Request a single url and record its indexation status
+async function runRequest(url) {
+  try {
+    // Prepare the url to search the way Google formats it
+    const requestUrl = googlelify(url)

-  while (data.length) {
-    await new Promise((resolve) => setTimeout(resolve, 1000))
+    // HTTP request through axios and ScraperAPI, querying Google with the encoded url
+    const { data, status } = await axios(
+      `${apiUrl}${apiKey}&url=${site}${requestUrl}${params}`
+    )

-    const batch = data.splice(0, concurrent).map((url) => runRequest(url, urls.length))
+    // Check whether the url appears in Google's search results
+    const indexation = matchResponse(url, data)

-    const results = await Promise.allSettled(batch)
+    // Print each url to the terminal with its counter, status code and result
+    const counter = `${count++}/${urls.length}`
+    const statusPrint = green.bold(status)
+    const indexPrint = white.bold(indexation)

-    for (const { status, reason } of results) {
-      if (status === 'rejected') data.push(reason)
-    }
-  }
+    console.log(cyan(`Checking: ${counter} ${url} ${statusPrint} ${indexPrint}`))

-  finalMessage()
-}
+    // Open results.csv in append mode so concurrent requests accumulate rows
+    const stream = createWriteStream('./results.csv', { flags: 'a', encoding: 'utf8' })

-// HTTP request async promise
-async function runRequest(url, len) {
-  // Make requests with encoded URL through axios with header options and write it to results
-  try {
-    // URL encoded for request
-    const request = requestUrl(url)
-    // URL encoded to check indexation
-    const compare = compareUrl(url, false)
-    // URL encoded for discrepancies
-    const utfEncoded = compareUrl(url, true)
-    // HTTPS request using axios, scraperapi, google and the enconded url
-    const res = await axios(`${apiUrl}${apiKey}&url=${site}${request}`)
-    const indexation = matchResponse(url, res.data, compare, utfEncoded)
-      ? 'Indexed'
-      : 'Not Indexed'
-    // Print to terminal each url, its number and status code
-    console.log(
-      cyan(
-        `Checking: ${instances++}/${len} ${url} ${green.bold(res.status)} ${white.bold(
-          indexation
-        )}`
-      )
-    )
-    // Create append streamclear
-    const stream = createWriteStream('./results.csv', {
-      flags: 'a',
-      encoding: 'utf8'
-    })
     // Append evaluation from response to file
     stream.write(`${url}, ${indexation}\n`)
     // End stream to avoid accumulation
@@ -107,38 +90,51 @@ async function runRequest(url, len) {
     // Request made and server responded
     const status = error.response ? error.response.status : 500

+    if (status === 429) {
+      console.error('Too many requests, something went wrong, check with ScraperAPI')
+      process.exit(1)
+    }
+
     // Log with different color to highlight the error
-    console.error(yellow(`Error: ${url} ${red(status)} ${green('Repeating')}`))
+    console.error(yellow(`Error: ${url} ${red(status)} ${green('Retrying')}`))

-    throw url
+    throw {
+      name: 'timeout', // Matches the pool's exception tag so the url is re-queued
+      error: url
+    }
   }
 }

 // Compare url against google response url
-function matchResponse(url, res, compare, utfEncoded) {
-  let matchURL = false
-
-  if (url.includes(`'`)) {
-    matchURL = res.includes(`href="${compare}"`) | res.includes(`href="${utfEncoded}"`)
+function matchResponse(url, res) {
+  // Strip the response down to <a> tags with their href attributes
+  const content = sanitizeHtml(res, {
+    allowedTags: ['a'],
+    allowedAttributes: { a: ['href'] }
+  })
+
+  // Default to Not Indexed
+  let indexResult = 'Not Indexed'
+
+  // The url counts as indexed if its encoded form appears in a result link
+  if (content.includes(`href="${encodeURL(url)}"`)) {
+    indexResult = 'Indexed'
   } else {
-    matchURL = res.includes(`href="${compare}"`)
+    notIndexedCounter += 1
   }

-  // Counter foreach not index
-  if (!matchURL) notIndexCounter += 1
-
-  return matchURL
+  return indexResult
 }

-function finalMessage() {
+function finalMessage(totalUrls) {
   console.log(
     `\n${totalUrls} URLS, results.csv file successfully written in ${timer(
       Date.now() - start
     )}\n`
   )
   console.log(
-    `${green.bold(`Indexed: ` + (totalUrls - notIndexCounter))}\n${red.bold(
-      `Not indexed: ` + notIndexCounter + `\n`
+    `${green.bold(`Indexed: ` + (totalUrls - notIndexedCounter))}\n${red.bold(
+      `Not indexed: ` + notIndexedCounter + `\n`
     )}`
   )
 }
diff --git a/lib/poolRequest.js b/lib/poolRequest.js
new file mode 100644
index 0000000..51752cb
--- /dev/null
+++ b/lib/poolRequest.js
@@ -0,0 +1,29 @@
+// Promise pool: run iteratorFn over array with at most poolLimit calls in flight
+export async function poolRequest(poolLimit, array, iteratorFn, exception) {
+  const promises = []
+  const racers = []
+
+  for (const item of array) {
+    const pro = Promise.resolve().then(() => iteratorFn(item, array))
+    promises.push(pro)
+
+    if (poolLimit <= array.length) {
+      const racer = pro.then(() => racers.splice(racers.indexOf(racer), 1)) // Frees its slot once settled
+      racers.push(racer)
+      if (racers.length >= poolLimit) {
+        await Promise.race(racers).catch((err) => console.log(err)) // Wait for a slot; failures resurface via allSettled
+      }
+    }
+  }
+
+  const results = await Promise.allSettled(promises)
+
+  for (const { status, reason } of results) {
+    if (status === 'rejected') {
+      const { name, error } = reason
+      if (name === exception) { // Re-queue rejections that carry the retry tag
+        await poolRequest(poolLimit, [error], iteratorFn, exception)
+      }
+    }
+  }
+}
diff --git a/lib/url-encoder.js b/lib/url-encoder.js
index 3fd3b43..c28d902 100644
--- a/lib/url-encoder.js
+++ b/lib/url-encoder.js
@@ -1,5 +1,5 @@
 // Encode URL as google does by removing quotes or their encoded version
-export const requestUrl = (url) => {
+export const googlelify = (url) => {
   // DecodeURI if not malformed
   try {
     url = decodeURI(url)
@@ -17,7 +17,7 @@ export const requestUrl = (url) => {
 }

 // The URL used to compare against google source code
-export const compareUrl = (url, utfChar) => {
+export const encodeURL = (url) => {
   // We got a url that already has encoding so return as it is
   if (url.includes('%')) {
     url = url.replace(/ /g, '%20')
@@ -26,16 +26,17 @@ export const compareUrl = (url, utfChar) => {
     // We got a clean url that needs encoding
     url = encodeURI(url)
     // We have a list of reserve characters we need to un-encode
-    const reserve = { '%5B': '[', '%5D': ']' }
+    const reserve = {}
+    reserve[`%5B`] = '['
+    reserve[`%5D`] = ']'
+    reserve[`'`] = '%27' // encodeURI leaves apostrophes alone, but Google prints them as %27
+
     // We create a regex for those reserve characters
     const regex = new RegExp(Object.keys(reserve).join('|'), 'g')
     // And replace only those present
-    url = url.replace(regex, (match) => reserve[match]).replace(/^%22|%22$/g, '')
+    url = url.replace(regex, (match) => reserve[match])
   }

-  // Create a different url for servers that encode apostophe
-  if (utfChar) url = url.replace(/\'/g, '%27')
-
   // Covert & to &amp;
   return url.replace(/&/g, '&amp;')
 }
diff --git a/package.json b/package.json
index 869c70b..e67c780 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "google-index-checker",
-  "version": "3.2.0",
+  "version": "3.3.0",
   "license": "MIT",
   "description": "Check Google indexation status at scale",
   "author": "Álvaro Fernández & José Hernando | Builtvsible",
@@ -18,6 +18,7 @@
     "axios": "^0.21.4",
     "chalk": "^2.4.2",
     "csv-parser": "^3.0.0",
-    "dotenv": "^10.0.0"
+    "dotenv": "^10.0.0",
+    "sanitize-html": "^2.5.1"
   }
 }
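
For reference, a minimal sketch of how poolRequest is meant to be driven, assuming
Node 14.8+ ESM for top-level await; the file name, the demo URLs, the fakeRequest
helper and the simulated failure are all made up for illustration:

  // demo-pool.js: hypothetical driver for lib/poolRequest.js
  import { poolRequest } from './lib/poolRequest.js'

  const demoUrls = ['https://example.com/a', 'https://example.com/b', 'https://example.com/c']

  let calls = 0
  async function fakeRequest(url) {
    // Fail the first call with the same tagged shape runRequest throws,
    // so poolRequest re-queues that url for another attempt
    calls += 1
    if (calls === 1) throw { name: 'timeout', error: url }
    console.log(`done: ${url}`)
  }

  // At most 2 promises in flight; rejections tagged 'timeout' are retried
  await poolRequest(2, demoUrls, fakeRequest, 'timeout')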
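
Similarly, a sketch of the outputs the reworked encodeURL should produce, with
illustrative URLs; the expected values follow from encodeURI plus the reserve map
and the final &-to-&amp; replacement:

  import { encodeURL } from './lib/url-encoder.js'

  // Illustrative inputs; expected outputs shown in the trailing comments
  console.log(encodeURL('https://example.com/página'))   // https://example.com/p%C3%A1gina
  console.log(encodeURL("https://example.com/it's"))     // https://example.com/it%27s (apostrophe forced to %27)
  console.log(encodeURL('https://example.com/a[1]'))     // https://example.com/a[1] ([ and ] un-encoded)
  console.log(encodeURL('https://example.com/?a=1&b=2')) // https://example.com/?a=1&amp;b=2 (as Google prints links)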
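
And a sketch of what the sanitize-html call in matchResponse leaves behind, using
a made-up fragment of a results page:

  import sanitizeHtml from 'sanitize-html'

  // A made-up fragment of a Google results page
  const html = '<div class="g"><a href="https://example.com/" ping="/url?q=x">Example</a></div>'

  const content = sanitizeHtml(html, {
    allowedTags: ['a'],
    allowedAttributes: { a: ['href'] }
  })

  // Disallowed tags are dropped but their contents kept, and only href survives:
  // <a href="https://example.com/">Example</a>
  console.log(content)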