|
1 | 1 | import asyncio
|
2 | 2 | import json as jsonlib
|
3 | 3 | import re
|
| 4 | +from urllib.parse import urlparse |
4 | 5 |
|
5 | 6 | from bs4 import BeautifulSoup, Comment, Tag
|
6 | 7 | from openai import AsyncOpenAI
|
7 | 8 |
|
8 | 9 | from common.trace_info import TraceInfo
|
9 |
| -from wizard.config import OpenAIConfig |
| 10 | +from wizard.config import OpenAIConfig, ReaderConfig |
10 | 11 | from wizard.entity import Task
|
11 | 12 | from wizard.wand.functions.base_function import BaseFunction
|
12 | 13 |
|
@@ -46,13 +47,17 @@ class HTMLReader(BaseFunction):
|
46 | 47 | "news.qq.com": {
|
47 | 48 | "name": "div",
|
48 | 49 | "class_": "content-article"
|
| 50 | + }, |
| 51 | + "zhuanlan.zhihu.com": { |
| 52 | + "name": "article" |
49 | 53 | }
|
50 |
| - |
51 | 54 | }
|
52 | 55 |
|
53 |
| - def __init__(self, openai_config: OpenAIConfig): |
| 56 | + def __init__(self, reader_config: ReaderConfig): |
| 57 | + openai_config: OpenAIConfig = reader_config.openai |
54 | 58 | self.client = AsyncOpenAI(api_key=openai_config.api_key, base_url=openai_config.base_url)
|
55 |
| - self.model = openai_config.model |
| 59 | + self.model: str = openai_config.model |
| 60 | + self.timeout: float = reader_config.timeout |
56 | 61 |
|
57 | 62 | @classmethod
|
58 | 63 | def content_selector(cls, url: str, soup: BeautifulSoup) -> Tag:
|
@@ -127,7 +132,9 @@ def create_prompt(cls, text: str, instruction: str = None, schema: str = None) -
|
127 | 132 | if not instruction:
|
128 | 133 | instruction = "Extract the main content from the given HTML and convert it to Markdown format."
|
129 | 134 | if schema:
|
130 |
| - instruction = "Extract the specified information from the given HTML and present it in a structured JSON format. If any of the fields are not found in the HTML document, set their values to `Unknown` in the JSON output." |
| 135 | + instruction = ("Extract the specified information from the given HTML and present it in a structured JSON " |
| 136 | + "format. If any of the fields are not found in the HTML document, set their values to " |
| 137 | + "`Unknown` in the JSON output.") |
131 | 138 | prompt = f"{instruction}\n```html\n{text}\n```\nThe JSON schema is as follows:```json\n{schema}\n```"
|
132 | 139 | else:
|
133 | 140 | prompt = f"{instruction}\n```html\n{text}\n```"
|
@@ -176,14 +183,31 @@ async def run(self, task: Task, trace_info: TraceInfo, stream: bool = False) ->
|
176 | 183 | html = input_dict["html"]
|
177 | 184 | url = input_dict["url"]
|
178 | 185 |
|
| 186 | + domain: str = urlparse(url).netloc |
| 187 | + trace_info = trace_info.bind(domain=domain) |
| 188 | + |
179 | 189 | cleaned_html = self.clean_html(url, html, clean_svg=True, clean_base64=True, remove_atts=True,
|
180 | 190 | compress=True, remove_empty_tag=True, enable_content_selector=True)
|
181 |
| - trace_info.info({"len(html)": len(html), "len(cleaned_html)": len(cleaned_html)}) |
182 |
| - |
183 |
| - metadata, content = await asyncio.gather( |
184 |
| - self.extract_content(cleaned_html, schema=self.SCHEMA), |
185 |
| - self.extract_content(cleaned_html, stream=stream) |
186 |
| - ) |
| 191 | + trace_info.info({ |
| 192 | + "len(html)": len(html), |
| 193 | + "len(cleaned_html)": len(cleaned_html), |
| 194 | + "compress_rate": f"{len(cleaned_html) * 100 / len(html): .2f}%" |
| 195 | + }) |
| 196 | + |
| 197 | + metadata_task = asyncio.create_task(self.extract_content(cleaned_html, schema=self.SCHEMA)) |
| 198 | + content_task = asyncio.create_task(self.extract_content(cleaned_html, stream=stream)) |
| 199 | + |
| 200 | + try: |
| 201 | + metadata = await asyncio.wait_for(metadata_task, timeout=self.timeout) |
| 202 | + except asyncio.TimeoutError: |
| 203 | + trace_info.error({"error": "metadata TimeoutError"}) |
| 204 | + metadata = {} |
| 205 | + |
| 206 | + try: |
| 207 | + content = await asyncio.wait_for(content_task, timeout=self.timeout) |
| 208 | + except asyncio.TimeoutError: |
| 209 | + trace_info.error({"error": "content TimeoutError"}) |
| 210 | + content = "Timeout, please retry." |
187 | 211 |
|
188 | 212 | filtered_metadata: dict = {k: v for k, v in metadata.items() if v != "Unknown"}
|
189 | 213 |
|
|
0 commit comments