Merge pull request #9 from import-ai/feature/reader_timeout

LucienShui · web-flow · commit e57a623f8490 · 2025-01-23T23:29:40.000+08:00
Add timeout to html reader
diff --git a/tests/function/test_html_reader.py b/tests/function/test_html_reader.py
@@ -8,18 +8,20 @@
 from common import project_root
 from common.trace_info import TraceInfo
 from tests.helper.fixture import trace_info
-from wizard.config import OpenAIConfig
+from wizard.config import OpenAIConfig, ReaderConfig
 from wizard.entity import Task
 from wizard.wand.functions.html_reader import HTMLReader
 
 
 @pytest.fixture(scope="function")
-def openai_config() -> OpenAIConfig:
+def reader_config() -> ReaderConfig:  # fixture: reader settings built from variables loaded out of .env
     load_dotenv(dotenv_path=project_root.path(".env"))
-    return OpenAIConfig(
-        api_key=os.environ["MBW_TASK_READER_API_KEY"],
-        base_url=os.environ["MBW_TASK_READER_BASE_URL"],
-        model=os.environ["MBW_TASK_READER_MODEL"],
+    return ReaderConfig(  # no timeout passed, so ReaderConfig's default (180) applies
+        openai=OpenAIConfig(  # env var names now carry an extra OPENAI segment
+            api_key=os.environ["MBW_TASK_READER_OPENAI_API_KEY"],
+            base_url=os.environ["MBW_TASK_READER_OPENAI_BASE_URL"],
+            model=os.environ["MBW_TASK_READER_OPENAI_MODEL"],
+        )
     )
 
 
@@ -29,15 +31,15 @@ def task() -> Task:
         return pickle.load(f)
 
 
-async def test_html_reader(openai_config: OpenAIConfig, task: Task, trace_info: TraceInfo):
-    c = HTMLReader(openai_config)
+async def test_html_reader(reader_config: ReaderConfig, task: Task, trace_info: TraceInfo):
+    c = HTMLReader(reader_config)
     result = await c.run(task, trace_info)
     print(jsonlib.dumps(result, ensure_ascii=False, separators=(",", ":")))
     # assert "Implement a notification system for updates and alerts." in result["markdown"]
 
 
-async def test_html_clean(openai_config: OpenAIConfig, task: Task):
-    c = HTMLReader(openai_config)
+async def test_html_clean(reader_config: ReaderConfig, task: Task):
+    c = HTMLReader(reader_config)
     html = task.input["html"]
     url = task.input["url"]
     print(f"raw length: {len(html)}")
diff --git a/wizard/config.py b/wizard/config.py
@@ -35,8 +35,13 @@ class BackendConfig(BaseModel):
     base_url: str
 
 
+class ReaderConfig(BaseModel):  # settings consumed by HTMLReader.__init__
+    openai: OpenAIConfig  # api_key/base_url build the AsyncOpenAI client; model is stored on the reader
+    timeout: float = Field(default=180, description="timeout second for reading html")  # seconds; passed to asyncio.wait_for in HTMLReader.run
+
+
 class TaskConfig(BaseModel):
-    reader: OpenAIConfig
+    reader: ReaderConfig
 
 
 class Config(BaseModel):
diff --git a/wizard/wand/functions/html_reader.py b/wizard/wand/functions/html_reader.py
@@ -1,12 +1,13 @@
 import asyncio
 import json as jsonlib
 import re
+from urllib.parse import urlparse
 
 from bs4 import BeautifulSoup, Comment, Tag
 from openai import AsyncOpenAI
 
 from common.trace_info import TraceInfo
-from wizard.config import OpenAIConfig
+from wizard.config import OpenAIConfig, ReaderConfig
 from wizard.entity import Task
 from wizard.wand.functions.base_function import BaseFunction
 
@@ -46,13 +47,17 @@ class HTMLReader(BaseFunction):
         "news.qq.com": {
             "name": "div",
             "class_": "content-article"
+        },
+        "zhuanlan.zhihu.com": {
+            "name": "article"
         }
-
     }
 
-    def __init__(self, openai_config: OpenAIConfig):
+    def __init__(self, reader_config: ReaderConfig):  # now takes the wrapping reader config instead of the bare OpenAI config
+        openai_config: OpenAIConfig = reader_config.openai  # OpenAI settings are nested under the reader config
         self.client = AsyncOpenAI(api_key=openai_config.api_key, base_url=openai_config.base_url)
-        self.model = openai_config.model
+        self.model: str = openai_config.model
+        self.timeout: float = reader_config.timeout  # seconds; used as the asyncio.wait_for timeout for each extraction in run()
 
     @classmethod
     def content_selector(cls, url: str, soup: BeautifulSoup) -> Tag:
@@ -127,7 +132,9 @@ def create_prompt(cls, text: str, instruction: str = None, schema: str = None) -
         if not instruction:
             instruction = "Extract the main content from the given HTML and convert it to Markdown format."
         if schema:
-            instruction = "Extract the specified information from the given HTML and present it in a structured JSON format. If any of the fields are not found in the HTML document, set their values to `Unknown` in the JSON output."
+            instruction = ("Extract the specified information from the given HTML and present it in a structured JSON "
+                           "format. If any of the fields are not found in the HTML document, set their values to "
+                           "`Unknown` in the JSON output.")
             prompt = f"{instruction}\n```html\n{text}\n```\nThe JSON schema is as follows:```json\n{schema}\n```"
         else:
             prompt = f"{instruction}\n```html\n{text}\n```"
@@ -176,14 +183,31 @@ async def run(self, task: Task, trace_info: TraceInfo, stream: bool = False) ->
         html = input_dict["html"]
         url = input_dict["url"]
 
+        domain: str = urlparse(url).netloc
+        trace_info = trace_info.bind(domain=domain)
+
         cleaned_html = self.clean_html(url, html, clean_svg=True, clean_base64=True, remove_atts=True,
                                        compress=True, remove_empty_tag=True, enable_content_selector=True)
-        trace_info.info({"len(html)": len(html), "len(cleaned_html)": len(cleaned_html)})
-
-        metadata, content = await asyncio.gather(
-            self.extract_content(cleaned_html, schema=self.SCHEMA),
-            self.extract_content(cleaned_html, stream=stream)
-        )
+        trace_info.info({  # size metrics for the cleaning step
+            "len(html)": len(html),
+            "len(cleaned_html)": len(cleaned_html),
+            "compress_rate": f"{len(cleaned_html) * 100 / max(len(html), 1):.2f}%"  # max() guards empty html against ZeroDivisionError
+        })
+
+        metadata_task = asyncio.create_task(self.extract_content(cleaned_html, schema=self.SCHEMA))
+        content_task = asyncio.create_task(self.extract_content(cleaned_html, stream=stream))
+
+        try:
+            metadata = await asyncio.wait_for(metadata_task, timeout=self.timeout)
+        except asyncio.TimeoutError:
+            trace_info.error({"error": "metadata TimeoutError"})
+            metadata = {}
+
+        try:
+            content = await asyncio.wait_for(content_task, timeout=self.timeout)
+        except asyncio.TimeoutError:
+            trace_info.error({"error": "content TimeoutError"})
+            content = "Timeout, please retry."
 
         filtered_metadata: dict = {k: v for k, v in metadata.items() if v != "Unknown"}