|
2 | 2 | """ELM Web Scraping - Google search."""
|
3 | 3 | import os
|
4 | 4 | import json
|
5 |
| -import pprint |
6 | 5 | import asyncio
|
7 | 6 | import logging
|
8 | 7 | import requests
|
9 |
| -from itertools import zip_longest, chain |
10 |
| -from contextlib import AsyncExitStack |
11 | 8 |
|
12 | 9 | from apiclient.discovery import build
|
13 | 10 | from rebrowser_playwright.async_api import (
|
14 |
| - TimeoutError as PlaywrightTimeoutError |
15 |
| -) |
| 11 | + TimeoutError as PlaywrightTimeoutError) |
16 | 12 |
|
17 |
| -from elm.web.file_loader import AsyncFileLoader |
18 | 13 | from elm.web.search.base import (PlaywrightSearchEngineLinkSearch,
|
19 | 14 | APISearchEngineLinkSearch)
|
20 | 15 |
|
@@ -53,14 +48,29 @@ class PlaywrightGoogleLinkSearch(PlaywrightSearchEngineLinkSearch):
|
53 | 48 | async def _perform_search(self, page, search_query):
|
54 | 49 | """Fill in search bar with user query and hit enter"""
|
55 | 50 | logger.trace("Finding search bar for query: %r", search_query)
|
| 51 | + await self._fill_in_search_bar(page, search_query) |
| 52 | + logger.trace("Hitting enter for query: %r", search_query) |
| 53 | + await page.keyboard.press('Enter') |
| 54 | + |
| 55 | + async def _fill_in_search_bar(self, page, search_query): |
| 56 | + """Attempt to find and fill the search bar several ways""" |
56 | 57 | try:
|
57 |
| - await page.get_by_label("Search", exact=True).fill(search_query) |
| 58 | + return await (page |
| 59 | + .get_by_label("Search", exact=True) |
| 60 | + .fill(search_query)) |
58 | 61 | except PlaywrightTimeoutError:
|
59 |
| - search_bar = page.locator('[autofocus]') |
| 62 | + pass |
| 63 | + |
| 64 | + search_bar = page.locator('[name="q"]') |
| 65 | + try: |
60 | 66 | await search_bar.clear()
|
61 |
| - await search_bar.fill(search_query) |
62 |
| - logger.trace("Hitting enter for query: %r", search_query) |
63 |
| - await page.keyboard.press('Enter') |
| 67 | + return await search_bar.fill(search_query) |
| 68 | + except PlaywrightTimeoutError: |
| 69 | + pass |
| 70 | + |
| 71 | + search_bar = page.locator('[autofocus]') |
| 72 | + await search_bar.clear() |
| 73 | + return await search_bar.fill(search_query) |
64 | 74 |
|
65 | 75 |
|
66 | 76 | class PlaywrightGoogleCSELinkSearch(PlaywrightSearchEngineLinkSearch):
|
@@ -169,121 +179,3 @@ async def _search(self, query, num_results=10):
|
169 | 179 | results = json.loads(response.text).get('organic', {})
|
170 | 180 | return list(filter(None, (result.get("link", "").replace("+", "%20")
|
171 | 181 | for result in results)))
|
172 |
| - |
173 |
| - |
174 |
async def google_results_as_docs(
    queries,
    num_urls=None,
    browser_semaphore=None,
    task_name=None,
    **file_loader_kwargs,
):
    """Download documents for the top Google results of several queries.

    Parameters
    ----------
    queries : collection of str
        Google search queries. The results of all queries are pooled
        (interleaved) before the top `num_urls` unique links are kept.
    num_urls : int, optional
        Number of unique top search result URLs to download as docs.
        If this is smaller than ``len(queries)``, some queries may not
        contribute any URL to the output. By default, ``None``, which
        resolves to ``3 * len(queries)``.
    browser_semaphore : :class:`asyncio.Semaphore`, optional
        Limits how many playwright browsers run concurrently. ``None``
        (default) applies no limit.
    task_name : str, optional
        Name forwarded to :func:`asyncio.create_task` for the search
        tasks. By default, ``None``.
    **file_loader_kwargs
        Keyword arguments for
        :class:`elm.web.file_loader.AsyncFileLoader`. If present, the
        "pw_launch_kwargs" entry is also forwarded to the
        :class:`elm.web.google_search.PlaywrightGoogleLinkSearch` used
        for the URL search.

    Returns
    -------
    list of :class:`elm.web.document.BaseDocument`
        Documents for the top `num_urls` results across all `queries`.
    """
    pw_launch_kwargs = file_loader_kwargs.get("pw_launch_kwargs", {})
    search_results = await _find_urls(
        queries,
        num_results=10,
        browser_sem=browser_semaphore,
        task_name=task_name,
        **pw_launch_kwargs
    )
    # `or` (not an `is None` check) on purpose: a falsy num_urls of 0 also
    # falls back to the 3-per-query default, matching historical behavior
    if not num_urls:
        num_urls = 3 * len(queries)
    urls = _down_select_urls(search_results, num_urls=num_urls)
    logger.debug("Downloading documents for URLS: \n\t-%s", "\n\t-".join(urls))
    return await _load_docs(urls, browser_semaphore, **file_loader_kwargs)
229 |
| - |
230 |
| - |
231 |
async def _find_urls(
    queries, num_results=10, browser_sem=None, task_name=None, **kwargs
):
    """Launch one google search task per query and gather all results."""
    tasks = []
    for query in queries:
        search_coro = _search_single(
            query, browser_sem, num_results=num_results, **kwargs
        )
        tasks.append(asyncio.create_task(search_coro, name=task_name))
    return await asyncio.gather(*tasks)
245 |
| - |
246 |
| - |
247 |
async def _search_single(question, browser_sem, num_results=10, **kwargs):
    """Run one google search, optionally gated by a browser semaphore."""
    engine = PlaywrightGoogleLinkSearch(**kwargs)
    if browser_sem is None:
        # An empty AsyncExitStack is a no-op async context manager, so the
        # `async with` below works whether or not a semaphore was given
        browser_sem = AsyncExitStack()

    logger.trace("Single search browser_semaphore=%r", browser_sem)
    async with browser_sem:
        logger.trace("Starting search for %r with browser_semaphore=%r",
                     question, browser_sem)
        return await engine.results(question, num_results=num_results)
258 |
| - |
259 |
| - |
260 |
| -def _down_select_urls(search_results, num_urls=5): |
261 |
| - """Select the top 5 URLs.""" |
262 |
| - all_urls = chain.from_iterable( |
263 |
| - zip_longest(*[results[0] for results in search_results]) |
264 |
| - ) |
265 |
| - urls = set() |
266 |
| - for url in all_urls: |
267 |
| - if not url: |
268 |
| - continue |
269 |
| - urls.add(url) |
270 |
| - if len(urls) == num_urls: |
271 |
| - break |
272 |
| - return urls |
273 |
| - |
274 |
| - |
275 |
async def _load_docs(urls, browser_semaphore=None, **kwargs):
    """Fetch a document per URL, dropping any that come back empty."""
    logger.trace("Downloading docs for the following URL's:\n%r", urls)
    logger.trace("kwargs for AsyncFileLoader:\n%s",
                 pprint.PrettyPrinter().pformat(kwargs))
    loader = AsyncFileLoader(browser_semaphore=browser_semaphore, **kwargs)
    docs = await loader.fetch_all(*urls)

    # Log how many pages each source produced before filtering empties
    page_lens = {doc.metadata.get("source", "Unknown"): len(doc.pages)
                 for doc in docs}
    logger.debug("Loaded the following number of pages for docs:\n%s",
                 pprint.PrettyPrinter().pformat(page_lens))
    return [doc for doc in docs if not doc.empty]
0 commit comments