Commit d2e56d3

Merge pull request #49 from NREL/pp/search_with_fallbacks
Add fallback options to search
2 parents 3f75787 + 7902e64 commit d2e56d3

12 files changed (+398 -233 lines)

docs/source/dev/ords_architecture.rst (+4 -4)

@@ -760,7 +760,7 @@ We give a rough breakdown of the following call:
 .. code-block:: python

     import asyncio
-    from elm.web.google_search import google_results_as_docs
+    from elm.web.search import web_search_links_as_docs

     QUERIES = [
         "NREL wiki",
@@ -769,7 +769,7 @@ We give a rough breakdown of the following call:
     ]

     async def main():
-        docs = await google_results_as_docs(QUERIES, num_urls=4)
+        docs = await web_search_links_as_docs(QUERIES, num_urls=4)
         return docs

     if __name__ == "__main__":
@@ -778,7 +778,7 @@ We give a rough breakdown of the following call:

 **Step-by-Step:**

-1. :func:`~elm.web.google_search.google_results_as_docs()` is invoked with 3 queries and ``num_urls=4``.
+1. :func:`~elm.web.search.run.web_search_links_as_docs()` is invoked with 3 queries and ``num_urls=4``.
 2. Each of the three queries is processed asynchronously, creating a :class:`~elm.web.google_search.PlaywrightGoogleLinkSearch` instance and retrieving the top URL results.
 3. Internal code reduces the URL lists returned from each of the queries into the top 4 URLs.
 4. :class:`~elm.web.file_loader.AsyncFileLoader` asynchronously downloads the content for each of the top 4 URLs, determines the document type the content should be stored
@@ -789,7 +789,7 @@ We give a rough breakdown of the following call:
 .. mermaid::

     sequenceDiagram
-        participant A as google_results_as_docs()
+        participant A as web_search_links_as_docs()
         participant B as PlaywrightGoogleLinkSearch
         participant D as AsyncFileLoader
         participant E as HTMLDocument
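For reference, a complete runnable version of the snippet these docs update might look like the following. The final ``asyncio.run`` line is an assumption, since the hunk context cuts off at the ``if __name__ == "__main__":`` guard:

    import asyncio

    from elm.web.search import web_search_links_as_docs

    QUERIES = ["NREL wiki"]


    async def main():
        # Fetch documents for the top 4 unique URLs across all queries
        return await web_search_links_as_docs(QUERIES, num_urls=4)


    if __name__ == "__main__":
        docs = asyncio.run(main())  # assumed entry point; not shown in the hunk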

elm/exceptions.py (+8)

@@ -6,5 +6,13 @@ class ELMError(Exception):
     """Generic ELM Error."""


+class ELMKeyError(ELMError, KeyError):
+    """ELM Key Error."""
+
+
+class ELMInputError(ELMError, ValueError):
+    """ELM Input (Value) Error."""
+
+
 class ELMRuntimeError(ELMError, RuntimeError):
     """ELM RuntimeError."""

elm/ords/download.py (+2 -2)

@@ -7,7 +7,7 @@
 from elm.ords.services.threaded import TempFileCache
 from elm.ords.validation.location import CountyValidator
 from elm.web.document import PDFDocument
-from elm.web.search.google import google_results_as_docs
+from elm.web.search import web_search_links_as_docs
 from elm.web.utilities import filter_documents


@@ -101,7 +101,7 @@ async def _docs_from_google_search(
             "file_cache_coroutine": TempFileCache.call,
         }
     )
-    return await google_results_as_docs(
+    return await web_search_links_as_docs(
         queries,
         num_urls=num_urls,
         browser_semaphore=browser_semaphore,
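The ``browser_semaphore`` forwarded here is a standard ``asyncio.Semaphore`` used to cap how many Playwright browsers are open at once (per the docstring of the removed ``google_results_as_docs``, below). A sketch of how a caller might bound concurrency, with the queries and limit chosen purely for illustration:

    import asyncio

    from elm.web.search import web_search_links_as_docs


    async def main():
        # Allow at most 2 browser instances open concurrently
        browser_semaphore = asyncio.Semaphore(2)
        return await web_search_links_as_docs(
            ["NREL wiki", "NREL ELM"],
            num_urls=4,
            browser_semaphore=browser_semaphore,
        )


    docs = asyncio.run(main())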

elm/version.py (+1 -1)

@@ -2,4 +2,4 @@
 ELM version number
 """

-__version__ = "0.0.9"
+__version__ = "0.0.10"

elm/web/file_loader.py (+2 -1)

@@ -19,7 +19,8 @@

 async def _read_pdf_doc(pdf_bytes, **kwargs):
     """Default read PDF function (runs in main thread)"""
-    pages = read_pdf(pdf_bytes)
+    verbose = kwargs.pop("verbose", True)
+    pages = read_pdf(pdf_bytes, verbose=verbose)
     return PDFDocument(pages, **kwargs)

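Note the ``kwargs.pop`` here: ``verbose`` is consumed before the remaining keyword arguments are forwarded to ``PDFDocument``, so the flag only reaches ``read_pdf``. The same pattern in isolation (names are generic stand-ins, not ELM code):

    def read_pdf(pdf_bytes, verbose=True):
        """Stand-in for the real parser; just echoes the flag."""
        if verbose:
            print("parsing PDF...")
        return ["page 1", "page 2"]


    def make_doc(pdf_bytes, **kwargs):
        # Consume the flag so it is not forwarded to the constructor
        verbose = kwargs.pop("verbose", True)
        pages = read_pdf(pdf_bytes, verbose=verbose)
        return {"pages": pages, **kwargs}  # no "verbose" key left here


    doc = make_doc(b"%PDF-1.7", verbose=False, source="example.pdf")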
elm/web/search/__init__.py (+3)

@@ -0,0 +1,3 @@
+"""ELM web search functions"""
+
+from .run import web_search_links_as_docs

elm/web/search/base.py (+5)

@@ -154,6 +154,11 @@ async def _get_links(self, queries, num_results):
    async def _extract_links(self, page, num_results):
        """Extract links for top `num_results` on page"""
        links = await asyncio.to_thread(page.locator, self._SE_SR_TAG)
+
+        if not self.launch_kwargs.get("headless", True):
+            # Viz purposes only
+            [await links.nth(i).hover() for i in range(num_results)]
+
        return [await links.nth(i).get_attribute("href")
                for i in range(num_results)]

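The hover loop only runs when the browser is launched headful, as a visual aid while debugging. Assuming launch kwargs are forwarded to the search class the way the removed ``_search_single`` did it (``PlaywrightGoogleLinkSearch(**pw_launch_kwargs)``), a headful debug run might look like:

    import asyncio

    from elm.web.search.google import PlaywrightGoogleLinkSearch


    async def main():
        # Assumed: headless=False passes through to launch_kwargs and
        # triggers the hover-over-results visualization above
        search = PlaywrightGoogleLinkSearch(headless=False)
        return await search.results("NREL wiki", num_results=5)


    links = asyncio.run(main())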

elm/web/search/google.py (+21 -129)

@@ -2,19 +2,14 @@
 """ELM Web Scraping - Google search."""
 import os
 import json
-import pprint
 import asyncio
 import logging
 import requests
-from itertools import zip_longest, chain
-from contextlib import AsyncExitStack

 from apiclient.discovery import build
 from rebrowser_playwright.async_api import (
-    TimeoutError as PlaywrightTimeoutError
-)
+    TimeoutError as PlaywrightTimeoutError)

-from elm.web.file_loader import AsyncFileLoader
 from elm.web.search.base import (PlaywrightSearchEngineLinkSearch,
                                  APISearchEngineLinkSearch)

@@ -53,14 +48,29 @@ class PlaywrightGoogleLinkSearch(PlaywrightSearchEngineLinkSearch):
     async def _perform_search(self, page, search_query):
         """Fill in search bar with user query and hit enter"""
         logger.trace("Finding search bar for query: %r", search_query)
+        await self._fill_in_search_bar(page, search_query)
+        logger.trace("Hitting enter for query: %r", search_query)
+        await page.keyboard.press('Enter')
+
+    async def _fill_in_search_bar(self, page, search_query):
+        """Attempt to find and fill the search bar several ways"""
         try:
-            await page.get_by_label("Search", exact=True).fill(search_query)
+            return await (page
+                          .get_by_label("Search", exact=True)
+                          .fill(search_query))
         except PlaywrightTimeoutError:
-            search_bar = page.locator('[autofocus]')
+            pass
+
+        search_bar = page.locator('[name="q"]')
+        try:
             await search_bar.clear()
-            await search_bar.fill(search_query)
-            logger.trace("Hitting enter for query: %r", search_query)
-            await page.keyboard.press('Enter')
+            return await search_bar.fill(search_query)
+        except PlaywrightTimeoutError:
+            pass
+
+        search_bar = page.locator('[autofocus]')
+        await search_bar.clear()
+        return await search_bar.fill(search_query)


 class PlaywrightGoogleCSELinkSearch(PlaywrightSearchEngineLinkSearch):
@@ -169,121 +179,3 @@ async def _search(self, query, num_results=10):
         results = json.loads(response.text).get('organic', {})
         return list(filter(None, (result.get("link", "").replace("+", "%20")
                                   for result in results)))
-
-
-async def google_results_as_docs(
-    queries,
-    num_urls=None,
-    browser_semaphore=None,
-    task_name=None,
-    **file_loader_kwargs,
-):
-    """Retrieve top ``N`` google search results as document instances.
-
-    Parameters
-    ----------
-    queries : collection of str
-        Collection of strings representing google queries. Documents for
-        the top `num_urls` google search results (from all of these
-        queries _combined_ will be returned from this function.
-    num_urls : int, optional
-        Number of unique top Google search result to return as docs. The
-        google search results from all queries are interleaved and the
-        top `num_urls` unique URL's are downloaded as docs. If this
-        number is less than ``len(queries)``, some of your queries may
-        not contribute to the final output. By default, ``None``, which
-        sets ``num_urls = 3 * len(queries)``.
-    browser_semaphore : :class:`asyncio.Semaphore`, optional
-        Semaphore instance that can be used to limit the number of
-        playwright browsers open concurrently. If ``None``, no limits
-        are applied. By default, ``None``.
-    task_name : str, optional
-        Optional task name to use in :func:`asyncio.create_task`.
-        By default, ``None``.
-    **file_loader_kwargs
-        Keyword-argument pairs to initialize
-        :class:`elm.web.file_loader.AsyncFileLoader` with. If found, the
-        "pw_launch_kwargs" key in these will also be used to initialize
-        the :class:`elm.web.google_search.PlaywrightGoogleLinkSearch`
-        used for the google URL search. By default, ``None``.
-
-    Returns
-    -------
-    list of :class:`elm.web.document.BaseDocument`
-        List of documents representing the top `num_urls` results from
-        the google searches across all `queries`.
-    """
-    pw_launch_kwargs = file_loader_kwargs.get("pw_launch_kwargs", {})
-    urls = await _find_urls(
-        queries,
-        num_results=10,
-        browser_sem=browser_semaphore,
-        task_name=task_name,
-        **pw_launch_kwargs
-    )
-    num_urls = num_urls or 3 * len(queries)
-    urls = _down_select_urls(urls, num_urls=num_urls)
-    logger.debug("Downloading documents for URLS: \n\t-%s", "\n\t-".join(urls))
-    docs = await _load_docs(urls, browser_semaphore, **file_loader_kwargs)
-    return docs
-
-
-async def _find_urls(
-    queries, num_results=10, browser_sem=None, task_name=None, **kwargs
-):
-    """Parse google search output for URLs."""
-    searchers = [
-        asyncio.create_task(
-            _search_single(
-                query, browser_sem, num_results=num_results, **kwargs
-            ),
-            name=task_name,
-        )
-        for query in queries
-    ]
-    return await asyncio.gather(*searchers)
-
-
-async def _search_single(question, browser_sem, num_results=10, **kwargs):
-    """Perform a single google search."""
-    if browser_sem is None:
-        browser_sem = AsyncExitStack()
-
-    search_engine = PlaywrightGoogleLinkSearch(**kwargs)
-    logger.trace("Single search browser_semaphore=%r", browser_sem)
-    async with browser_sem:
-        logger.trace("Starting search for %r with browser_semaphore=%r",
-                     question, browser_sem)
-        return await search_engine.results(question, num_results=num_results)
-
-
-def _down_select_urls(search_results, num_urls=5):
-    """Select the top 5 URLs."""
-    all_urls = chain.from_iterable(
-        zip_longest(*[results[0] for results in search_results])
-    )
-    urls = set()
-    for url in all_urls:
-        if not url:
-            continue
-        urls.add(url)
-        if len(urls) == num_urls:
-            break
-    return urls
-
-
-async def _load_docs(urls, browser_semaphore=None, **kwargs):
-    """Load a document for each input URL."""
-    logger.trace("Downloading docs for the following URL's:\n%r", urls)
-    logger.trace("kwargs for AsyncFileLoader:\n%s",
-                 pprint.PrettyPrinter().pformat(kwargs))
-    file_loader = AsyncFileLoader(
-        browser_semaphore=browser_semaphore, **kwargs
-    )
-    docs = await file_loader.fetch_all(*urls)
-
-    page_lens = {doc.metadata.get("source", "Unknown"): len(doc.pages)
-                 for doc in docs}
-    logger.debug("Loaded the following number of pages for docs:\n%s",
-                 pprint.PrettyPrinter().pformat(page_lens))
-    return [doc for doc in docs if not doc.empty]
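The new ``_fill_in_search_bar`` tries three locator strategies in order, swallowing ``PlaywrightTimeoutError`` between attempts so that only the final strategy's failure propagates. The same cascade could be factored as a loop over candidate locators; a sketch of that alternative, not the code this PR ships (the shipped version also ``clear()``s the bar for the fallback locators, omitted here for brevity):

    from rebrowser_playwright.async_api import (
        TimeoutError as PlaywrightTimeoutError)


    async def fill_first_matching(page, search_query):
        """Try several search-bar locators; re-raise only the final failure."""
        candidates = (
            lambda: page.get_by_label("Search", exact=True),
            lambda: page.locator('[name="q"]'),
            lambda: page.locator('[autofocus]'),
        )
        for i, get_locator in enumerate(candidates):
            try:
                return await get_locator().fill(search_query)
            except PlaywrightTimeoutError:
                if i == len(candidates) - 1:
                    raise  # no fallback left; surface the timeout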
