From 91bd5db84afcfa3f15db71fd5ef9fc019b84443e Mon Sep 17 00:00:00 2001 From: Christophe Bornet Date: Wed, 11 Sep 2024 12:33:23 +0200 Subject: [PATCH] Add format option with raw HTML to URL component --- .../base/langflow/components/data/URL.py | 16 +++++++++++--- .../starter_projects/Blog Writer.json | 22 ++++++++++++++++++- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/src/backend/base/langflow/components/data/URL.py b/src/backend/base/langflow/components/data/URL.py index e9f507a3838..012180bc71e 100644 --- a/src/backend/base/langflow/components/data/URL.py +++ b/src/backend/base/langflow/components/data/URL.py @@ -1,10 +1,10 @@ import re -from langchain_community.document_loaders.web_base import WebBaseLoader +from langchain_community.document_loaders import AsyncHtmlLoader, WebBaseLoader from langflow.helpers.data import data_to_text from langflow.custom import Component -from langflow.io import MessageTextInput, Output +from langflow.io import DropdownInput, MessageTextInput, Output from langflow.schema import Data from langflow.schema.message import Message @@ -22,6 +22,13 @@ class URLComponent(Component): info="Enter one or more URLs, by clicking the '+' button.", is_list=True, ), + DropdownInput( + name="format", + display_name="Output format", + info="Output format. Use 'Text' to extract the text from the HTML or 'Raw HTML' for the raw HTML content.", + options=["Text", "Raw HTML"], + value="Text", + ), ] outputs = [ @@ -64,7 +71,10 @@ def ensure_url(self, string: str) -> str: def fetch_content(self) -> list[Data]: urls = [self.ensure_url(url.strip()) for url in self.urls if url.strip()] - loader = WebBaseLoader(web_paths=urls, encoding="utf-8") + if self.format == "Raw HTML": + loader = AsyncHtmlLoader(web_path=urls, encoding="utf-8") + else: + loader = WebBaseLoader(web_paths=urls, encoding="utf-8") docs = loader.load() data = [Data(text=doc.page_content, **doc.metadata) for doc in docs] self.status = data diff --git a/src/backend/base/langflow/initial_setup/starter_projects/Blog Writer.json b/src/backend/base/langflow/initial_setup/starter_projects/Blog Writer.json index 811d71a728e..3d3ab51a17f 100644 --- a/src/backend/base/langflow/initial_setup/starter_projects/Blog Writer.json +++ b/src/backend/base/langflow/initial_setup/starter_projects/Blog Writer.json @@ -200,7 +200,27 @@ "show": true, "title_case": false, "type": "code", - "value": "import re\n\nfrom langchain_community.document_loaders.web_base import WebBaseLoader\n\nfrom langflow.helpers.data import data_to_text\nfrom langflow.custom import Component\nfrom langflow.io import MessageTextInput, Output\nfrom langflow.schema import Data\nfrom langflow.schema.message import Message\n\n\nclass URLComponent(Component):\n display_name = \"URL\"\n description = \"Fetch content from one or more URLs.\"\n icon = \"layout-template\"\n name = \"URL\"\n\n inputs = [\n MessageTextInput(\n name=\"urls\",\n display_name=\"URLs\",\n info=\"Enter one or more URLs, by clicking the '+' button.\",\n is_list=True,\n ),\n ]\n\n outputs = [\n Output(display_name=\"Data\", name=\"data\", method=\"fetch_content\"),\n Output(display_name=\"Text\", name=\"text\", method=\"fetch_content_text\"),\n ]\n\n def ensure_url(self, string: str) -> str:\n \"\"\"\n Ensures the given string is a URL by adding 'http://' if it doesn't start with 'http://' or 'https://'.\n Raises an error if the string is not a valid URL.\n\n Parameters:\n string (str): The string to be checked and possibly modified.\n\n Returns:\n str: The modified string that is ensured to be a URL.\n\n Raises:\n ValueError: If the string is not a valid URL.\n \"\"\"\n if not string.startswith((\"http://\", \"https://\")):\n string = \"http://\" + string\n\n # Basic URL validation regex\n url_regex = re.compile(\n r\"^(https?:\\/\\/)?\" # optional protocol\n r\"(www\\.)?\" # optional www\n r\"([a-zA-Z0-9.-]+)\" # domain\n r\"(\\.[a-zA-Z]{2,})?\" # top-level domain\n r\"(:\\d+)?\" # optional port\n r\"(\\/[^\\s]*)?$\", # optional path\n re.IGNORECASE,\n )\n\n if not url_regex.match(string):\n raise ValueError(f\"Invalid URL: {string}\")\n\n return string\n\n def fetch_content(self) -> list[Data]:\n urls = [self.ensure_url(url.strip()) for url in self.urls if url.strip()]\n loader = WebBaseLoader(web_paths=urls, encoding=\"utf-8\")\n docs = loader.load()\n data = [Data(text=doc.page_content, **doc.metadata) for doc in docs]\n self.status = data\n return data\n\n def fetch_content_text(self) -> Message:\n data = self.fetch_content()\n\n result_string = data_to_text(\"{text}\", data)\n self.status = result_string\n return Message(text=result_string)\n" + "value": "import re\n\nfrom langchain_community.document_loaders import AsyncHtmlLoader, WebBaseLoader\n\nfrom langflow.helpers.data import data_to_text\nfrom langflow.custom import Component\nfrom langflow.io import DropdownInput, MessageTextInput, Output\nfrom langflow.schema import Data\nfrom langflow.schema.message import Message\n\n\nclass URLComponent(Component):\n display_name = \"URL\"\n description = \"Fetch content from one or more URLs.\"\n icon = \"layout-template\"\n name = \"URL\"\n\n inputs = [\n MessageTextInput(\n name=\"urls\",\n display_name=\"URLs\",\n info=\"Enter one or more URLs, by clicking the '+' button.\",\n is_list=True,\n ),\n DropdownInput(\n name=\"format\",\n display_name=\"Output format\",\n info=\"Output format. Use 'Text' to extract the text from the HTML or 'Raw HTML' for the raw HTML content.\",\n options=[\"Text\", \"Raw HTML\"],\n value=\"Text\",\n ),\n ]\n\n outputs = [\n Output(display_name=\"Data\", name=\"data\", method=\"fetch_content\"),\n Output(display_name=\"Text\", name=\"text\", method=\"fetch_content_text\"),\n ]\n\n def ensure_url(self, string: str) -> str:\n \"\"\"\n Ensures the given string is a URL by adding 'http://' if it doesn't start with 'http://' or 'https://'.\n Raises an error if the string is not a valid URL.\n\n Parameters:\n string (str): The string to be checked and possibly modified.\n\n Returns:\n str: The modified string that is ensured to be a URL.\n\n Raises:\n ValueError: If the string is not a valid URL.\n \"\"\"\n if not string.startswith((\"http://\", \"https://\")):\n string = \"http://\" + string\n\n # Basic URL validation regex\n url_regex = re.compile(\n r\"^(https?:\\/\\/)?\" # optional protocol\n r\"(www\\.)?\" # optional www\n r\"([a-zA-Z0-9.-]+)\" # domain\n r\"(\\.[a-zA-Z]{2,})?\" # top-level domain\n r\"(:\\d+)?\" # optional port\n r\"(\\/[^\\s]*)?$\", # optional path\n re.IGNORECASE,\n )\n\n if not url_regex.match(string):\n raise ValueError(f\"Invalid URL: {string}\")\n\n return string\n\n def fetch_content(self) -> list[Data]:\n urls = [self.ensure_url(url.strip()) for url in self.urls if url.strip()]\n if self.format == \"Raw HTML\":\n loader = AsyncHtmlLoader(web_path=urls, encoding=\"utf-8\")\n else:\n loader = WebBaseLoader(web_paths=urls, encoding=\"utf-8\")\n docs = loader.load()\n data = [Data(text=doc.page_content, **doc.metadata) for doc in docs]\n self.status = data\n return data\n\n def fetch_content_text(self) -> Message:\n data = self.fetch_content()\n\n result_string = data_to_text(\"{text}\", data)\n self.status = result_string\n return Message(text=result_string)\n" + }, + "format": { + "_input_type": "DropdownInput", + "advanced": false, + "combobox": false, + "display_name": "Output format", + "dynamic": false, + "info": "Output format. Use 'Text' to extract the text from the HTML or 'Raw HTML' for the raw HTML content.", + "name": "format", + "options": [ + "Text", + "Raw HTML" + ], + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "trace_as_metadata": true, + "type": "str", + "value": "Text" }, "urls": { "advanced": false,