forked from langchain-ai/langchain
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: adds
UnstructuredURLLoader
for loading data from urls (langch…
…ain-ai#979) ### Summary Adds a `UnstructuredURLLoader` that supports loading data from a list of URLs. ### Testing ```python from langchain.document_loaders import UnstructuredURLLoader urls = [ "https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-8-2023", "https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-9-2023" ] loader = UnstructuredURLLoader(urls=urls) raw_documents = loader.load() ```
- Loading branch information
1 parent
d7c4540
commit da86016
Showing
3 changed files
with
112 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "2dfc4698", | ||
"metadata": {}, | ||
"source": [ | ||
"# URL\n", | ||
"\n", | ||
"This covers how to load HTML documents from a list of URLs into a document format that we can use downstream." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "16c3699e", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from langchain.document_loaders import UnstructuredURLLoader" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "836fbac1", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"urls = [\n", | ||
" \"https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-8-2023\",\n", | ||
" \"https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-9-2023\"\n", | ||
"]\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"id": "00f46fda", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"loader = UnstructuredURLLoader(urls=urls)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"id": "b68a26b3", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"data = loader.load()" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.8.13" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
"""Loader that loads HTML documents from URLs.""" | ||
from typing import List | ||
|
||
from langchain.docstore.document import Document | ||
from langchain.document_loaders.base import BaseLoader | ||
|
||
|
||
class UnstructuredURLLoader(BaseLoader):
    """Loader that uses unstructured's HTML partitioner to load a list of URLs."""

    def __init__(self, urls: List[str]):
        """Remember the URLs to load, after checking unstructured is importable.

        Args:
            urls: The URLs that ``load`` will process, in order.

        Raises:
            ValueError: If the ``unstructured`` package cannot be imported.
        """
        try:
            # Imported only to verify availability; the name itself is unused.
            import unstructured  # noqa:F401
        except ImportError:
            message = (
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            )
            raise ValueError(message)
        self.urls = urls

    def load(self) -> List[Document]:
        """Build one Document per URL.

        Each URL is handed to ``partition_html(url=...)``; the string forms of
        the returned elements are joined with blank lines to form the page
        content, and the URL is recorded under the ``"source"`` metadata key.

        Returns:
            The Documents, in the same order as ``self.urls``.
        """
        # Deferred import: the dependency is only needed once loading starts.
        from unstructured.partition.html import partition_html

        def _as_document(address: str) -> Document:
            # Convert a single URL into a Document.
            pieces = partition_html(url=address)
            body = "\n\n".join(str(piece) for piece in pieces)
            return Document(page_content=body, metadata={"source": address})

        return [_as_document(address) for address in self.urls]