From be5166d9904f659129335f4a32e1455bc80998f6 Mon Sep 17 00:00:00 2001 From: liukaiwen Date: Tue, 30 Sep 2025 17:38:25 +0800 Subject: [PATCH] feat: add html parse api url parsing --- llm_web_kit/api/dependencies.py | 1 + llm_web_kit/api/requirements.txt | 1 + llm_web_kit/api/services/html_service.py | 44 +++++++++++++++++-- .../parser/layout_batch_parser.py | 35 ++++++++------- 4 files changed, 62 insertions(+), 19 deletions(-) diff --git a/llm_web_kit/api/dependencies.py b/llm_web_kit/api/dependencies.py index 32eadeb6..a2814fce 100644 --- a/llm_web_kit/api/dependencies.py +++ b/llm_web_kit/api/dependencies.py @@ -31,6 +31,7 @@ class Settings(BaseSettings): # 模型配置 model_path: Optional[str] = None max_content_length: int = 10 * 1024 * 1024 # 10MB + crawl_url: str = "http://10.140.0.94:9500/crawl" # 缓存配置 cache_ttl: int = 3600 # 1小时 diff --git a/llm_web_kit/api/requirements.txt b/llm_web_kit/api/requirements.txt index c7cbd56f..fc428808 100644 --- a/llm_web_kit/api/requirements.txt +++ b/llm_web_kit/api/requirements.txt @@ -3,6 +3,7 @@ aiohttp>=3.9.0 # FastAPI 相关依赖 fastapi>=0.104.0 +httpx>=0.27.0 pydantic>=2.0.0 pydantic-settings>=2.0.0 diff --git a/llm_web_kit/api/services/html_service.py b/llm_web_kit/api/services/html_service.py index ba124e27..0bb247a0 100644 --- a/llm_web_kit/api/services/html_service.py +++ b/llm_web_kit/api/services/html_service.py @@ -5,9 +5,11 @@ from typing import Any, Dict, Optional -from llm_web_kit.simple import extract_content_from_main_html +import httpx -from ..dependencies import get_inference_service, get_logger, get_settings +from llm_web_kit.api.dependencies import (get_inference_service, get_logger, + get_settings) +from llm_web_kit.simple import extract_content_from_main_html logger = get_logger(__name__) settings = get_settings() @@ -37,8 +39,26 @@ async def parse_html( ) -> Dict[str, Any]: """解析 HTML 内容.""" try: + if not html_content and url: + logger.info(f'HTML 内容为空,尝试从 URL 爬取: {url}') + try: + async with httpx.AsyncClient() as client: + response = await client.post(settings.crawl_url, json={'url': url}, timeout=60) + response.raise_for_status() + data = response.json() + html_content = data.get('html') + if not html_content: + raise ValueError('爬取成功,但未返回 HTML 内容') + logger.info(f'URL 爬取成功,内容长度: {len(html_content)}') + except httpx.RequestError as exc: + logger.error(f"调用爬虫服务失败: {exc}") + raise ValueError(f"无法从 URL 爬取内容: {exc}") + except Exception as e: + logger.error(f'爬取或解析爬取内容失败: {e}') + raise ValueError(f'处理爬取内容时发生错误: {e}') + if not html_content: - raise ValueError('必须提供 HTML 内容') + raise ValueError('必须提供 HTML 内容或有效的 URL') # 延迟导入,避免模块导入期异常导致服务类不可用 try: @@ -83,3 +103,21 @@ async def _parse_with_model(self, html_content: str, options: Optional[Dict[str, if self._inference_service is None: self._inference_service = get_inference_service() return await self._inference_service.inference(html_content, options or {}) + + +if __name__ == '__main__': + import asyncio + + # 重新导入以确保加载最新的代码,绕过缓存问题 + from llm_web_kit.api.dependencies import get_settings + settings = get_settings() + + async def main(): + async with httpx.AsyncClient() as client: + response = await client.post(settings.crawl_url, json={'url': 'https://aws.amazon.com/what-is/retrieval-augmented-generation/'}, timeout=60) + response.raise_for_status() + data = response.json() + html_content = data.get('html') + print(html_content) + + asyncio.run(main()) diff --git a/llm_web_kit/main_html_parser/parser/layout_batch_parser.py b/llm_web_kit/main_html_parser/parser/layout_batch_parser.py index 5d6ef2e4..48a9f5c5 100644 --- a/llm_web_kit/main_html_parser/parser/layout_batch_parser.py +++ b/llm_web_kit/main_html_parser/parser/layout_batch_parser.py @@ -73,22 +73,25 @@ def parse(self, pre_data: PreDataJson) -> PreDataJson: content, body = self.process(html_source, template_dict_html) # 相似度计算 - if pre_data.get(PreDataJsonKey.TYPICAL_MAIN_HTML, None): - template_main_html = pre_data[PreDataJsonKey.TYPICAL_MAIN_HTML] - if pre_data.get(PreDataJsonKey.SIMILARITY_LAYER, None): - layer = pre_data[PreDataJsonKey.SIMILARITY_LAYER] - else: - layer = self.__get_max_width_layer(template_data) - feature1 = get_feature(template_main_html) - feature2 = get_feature(body) - sim = None - if feature1 is not None and feature2 is not None: - sim = similarity(feature1, feature2, layer_n=layer) - pre_data[PreDataJsonKey.MAIN_HTML_SIM] = sim - if sim is None or sim < SIMILARITY_THRESHOLD: - pre_data[PreDataJsonKey.MAIN_HTML_SUCCESS] = False - else: - pre_data[PreDataJsonKey.MAIN_HTML_SUCCESS] = True + try: + if pre_data.get(PreDataJsonKey.TYPICAL_MAIN_HTML, None): + template_main_html = pre_data[PreDataJsonKey.TYPICAL_MAIN_HTML] + if pre_data.get(PreDataJsonKey.SIMILARITY_LAYER, None): + layer = pre_data[PreDataJsonKey.SIMILARITY_LAYER] + else: + layer = self.__get_max_width_layer(template_data) + feature1 = get_feature(template_main_html) + feature2 = get_feature(body) + sim = None + if feature1 is not None and feature2 is not None: + sim = similarity(feature1, feature2, layer_n=layer) + pre_data[PreDataJsonKey.MAIN_HTML_SIM] = sim + if sim is None or sim < SIMILARITY_THRESHOLD: + pre_data[PreDataJsonKey.MAIN_HTML_SUCCESS] = False + else: + pre_data[PreDataJsonKey.MAIN_HTML_SUCCESS] = True + except Exception: + pre_data[PreDataJsonKey.MAIN_HTML_SUCCESS] = None # 结果返回 pre_data[PreDataJsonKey.MAIN_HTML] = content