|
2 | 2 | """ELM Web Scraping - Google search."""
|
3 | 3 | import os
|
4 | 4 | import json
|
5 |
| -import pprint |
6 | 5 | import asyncio
|
7 | 6 | import logging
|
8 | 7 | import requests
|
9 |
| -from itertools import zip_longest, chain |
10 |
| -from contextlib import AsyncExitStack |
11 | 8 |
|
12 | 9 | from apiclient.discovery import build
|
13 | 10 | from rebrowser_playwright.async_api import (
|
14 |
| - TimeoutError as PlaywrightTimeoutError |
15 |
| -) |
| 11 | + TimeoutError as PlaywrightTimeoutError) |
16 | 12 |
|
17 |
| -from elm.web.file_loader import AsyncFileLoader |
18 | 13 | from elm.web.search.base import (PlaywrightSearchEngineLinkSearch,
|
19 | 14 | APISearchEngineLinkSearch)
|
20 | 15 |
|
@@ -53,14 +48,29 @@ class PlaywrightGoogleLinkSearch(PlaywrightSearchEngineLinkSearch):
|
53 | 48 | async def _perform_search(self, page, search_query):
|
54 | 49 | """Fill in search bar with user query and hit enter"""
|
55 | 50 | logger.trace("Finding search bar for query: %r", search_query)
|
| 51 | + await self._fill_in_search_bar(page, search_query) |
| 52 | + logger.trace("Hitting enter for query: %r", search_query) |
| 53 | + await page.keyboard.press('Enter') |
| 54 | + |
| 55 | + async def _fill_in_search_bar(self, page, search_query): |
| 56 | + """Attempt to find and fill the search bar several ways""" |
56 | 57 | try:
|
57 |
| - await page.get_by_label("Search", exact=True).fill(search_query) |
| 58 | + return await (page |
| 59 | + .get_by_label("Search", exact=True) |
| 60 | + .fill(search_query)) |
58 | 61 | except PlaywrightTimeoutError:
|
59 |
| - search_bar = page.locator('[autofocus]') |
| 62 | + pass |
| 63 | + |
| 64 | + search_bar = page.locator('[name="q"]') |
| 65 | + try: |
60 | 66 | await search_bar.clear()
|
61 |
| - await search_bar.fill(search_query) |
62 |
| - logger.trace("Hitting enter for query: %r", search_query) |
63 |
| - await page.keyboard.press('Enter') |
| 67 | + return await search_bar.fill(search_query) |
| 68 | + except PlaywrightTimeoutError: |
| 69 | + pass |
| 70 | + |
| 71 | + search_bar = page.locator('[autofocus]') |
| 72 | + await search_bar.clear() |
| 73 | + return await search_bar.fill(search_query) |
64 | 74 |
|
65 | 75 |
|
66 | 76 | class PlaywrightGoogleCSELinkSearch(PlaywrightSearchEngineLinkSearch):
|
@@ -169,121 +179,3 @@ async def _search(self, query, num_results=10):
|
169 | 179 | results = json.loads(response.text).get('organic', {})
|
170 | 180 | return list(filter(None, (result.get("link", "").replace("+", "%20")
|
171 | 181 | for result in results)))
|
172 |
| - |
173 |
| - |
174 |
async def google_results_as_docs(
    queries,
    num_urls=None,
    browser_semaphore=None,
    task_name=None,
    **file_loader_kwargs,
):
    """Download documents for the top Google results of several queries.

    Parameters
    ----------
    queries : collection of str
        Google search queries. The results of all queries are pooled
        (interleaved) before the top `num_urls` unique links are kept.
    num_urls : int, optional
        Number of unique top search result URLs to download as docs.
        If this is smaller than ``len(queries)``, some queries may not
        contribute any URL to the output. By default, ``None``, which
        resolves to ``3 * len(queries)``.
    browser_semaphore : :class:`asyncio.Semaphore`, optional
        Limits how many playwright browsers run concurrently. ``None``
        (default) applies no limit.
    task_name : str, optional
        Name forwarded to :func:`asyncio.create_task` for the search
        tasks. By default, ``None``.
    **file_loader_kwargs
        Keyword arguments for
        :class:`elm.web.file_loader.AsyncFileLoader`. If present, the
        "pw_launch_kwargs" entry is also forwarded to the
        :class:`elm.web.google_search.PlaywrightGoogleLinkSearch` used
        for the URL search.

    Returns
    -------
    list of :class:`elm.web.document.BaseDocument`
        Documents for the top `num_urls` results across all `queries`.
    """
    pw_launch_kwargs = file_loader_kwargs.get("pw_launch_kwargs", {})
    search_results = await _find_urls(
        queries,
        num_results=10,
        browser_sem=browser_semaphore,
        task_name=task_name,
        **pw_launch_kwargs
    )
    # `or` (not an `is None` check) on purpose: a falsy num_urls of 0 also
    # falls back to the 3-per-query default, matching historical behavior
    if not num_urls:
        num_urls = 3 * len(queries)
    urls = _down_select_urls(search_results, num_urls=num_urls)
    logger.debug("Downloading documents for URLS: \n\t-%s", "\n\t-".join(urls))
    return await _load_docs(urls, browser_semaphore, **file_loader_kwargs)
229 |
| - |
230 |
| - |
231 |
async def _find_urls(
    queries, num_results=10, browser_sem=None, task_name=None, **kwargs
):
    """Launch one google search task per query and gather all results."""
    tasks = []
    for query in queries:
        search_coro = _search_single(
            query, browser_sem, num_results=num_results, **kwargs
        )
        tasks.append(asyncio.create_task(search_coro, name=task_name))
    return await asyncio.gather(*tasks)
245 |
| - |
246 |
| - |
247 |
async def _search_single(question, browser_sem, num_results=10, **kwargs):
    """Run one google search, optionally gated by a browser semaphore."""
    engine = PlaywrightGoogleLinkSearch(**kwargs)
    if browser_sem is None:
        # An empty AsyncExitStack is a no-op async context manager, so the
        # `async with` below works whether or not a semaphore was given
        browser_sem = AsyncExitStack()

    logger.trace("Single search browser_semaphore=%r", browser_sem)
    async with browser_sem:
        logger.trace("Starting search for %r with browser_semaphore=%r",
                     question, browser_sem)
        return await engine.results(question, num_results=num_results)
258 |
| - |
259 |
| - |
260 |
| -def _down_select_urls(search_results, num_urls=5): |
261 |
| - """Select the top 5 URLs.""" |
262 |
| - all_urls = chain.from_iterable( |
263 |
| - zip_longest(*[results[0] for results in search_results]) |
264 |
| - ) |
265 |
| - urls = set() |
266 |
| - for url in all_urls: |
267 |
| - if not url: |
268 |
| - continue |
269 |
| - urls.add(url) |
270 |
| - if len(urls) == num_urls: |
271 |
| - break |
272 |
| - return urls |
273 |
| - |
274 |
| - |
275 |
async def _load_docs(urls, browser_semaphore=None, **kwargs):
    """Fetch a document per URL, dropping any that come back empty."""
    logger.trace("Downloading docs for the following URL's:\n%r", urls)
    logger.trace("kwargs for AsyncFileLoader:\n%s",
                 pprint.PrettyPrinter().pformat(kwargs))
    loader = AsyncFileLoader(browser_semaphore=browser_semaphore, **kwargs)
    docs = await loader.fetch_all(*urls)

    # Log how many pages each source produced before filtering empties
    page_lens = {doc.metadata.get("source", "Unknown"): len(doc.pages)
                 for doc in docs}
    logger.debug("Loaded the following number of pages for docs:\n%s",
                 pprint.PrettyPrinter().pformat(page_lens))
    return [doc for doc in docs if not doc.empty]
0 commit comments