Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions llm_web_kit/api/dependencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class Settings(BaseSettings):
# 模型配置
model_path: Optional[str] = None
max_content_length: int = 10 * 1024 * 1024 # 10MB
crawl_url: str = "http://10.140.0.94:9500/crawl"

# 缓存配置
cache_ttl: int = 3600 # 1小时
Expand Down
1 change: 1 addition & 0 deletions llm_web_kit/api/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ aiohttp>=3.9.0

# FastAPI 相关依赖
fastapi>=0.104.0
httpx>=0.27.0
pydantic>=2.0.0
pydantic-settings>=2.0.0

Expand Down
44 changes: 41 additions & 3 deletions llm_web_kit/api/services/html_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@

from typing import Any, Dict, Optional

from llm_web_kit.simple import extract_content_from_main_html
import httpx

from ..dependencies import get_inference_service, get_logger, get_settings
from llm_web_kit.api.dependencies import (get_inference_service, get_logger,
get_settings)
from llm_web_kit.simple import extract_content_from_main_html

logger = get_logger(__name__)
settings = get_settings()
Expand Down Expand Up @@ -37,8 +39,26 @@ async def parse_html(
) -> Dict[str, Any]:
"""解析 HTML 内容."""
try:
if not html_content and url:
logger.info(f'HTML 内容为空,尝试从 URL 爬取: {url}')
try:
async with httpx.AsyncClient() as client:
response = await client.post(settings.crawl_url, json={'url': url}, timeout=60)
response.raise_for_status()
data = response.json()
html_content = data.get('html')
if not html_content:
raise ValueError('爬取成功,但未返回 HTML 内容')
logger.info(f'URL 爬取成功,内容长度: {len(html_content)}')
except httpx.RequestError as exc:
logger.error(f"调用爬虫服务失败: {exc}")
raise ValueError(f"无法从 URL 爬取内容: {exc}")
except Exception as e:
logger.error(f'爬取或解析爬取内容失败: {e}')
raise ValueError(f'处理爬取内容时发生错误: {e}')

if not html_content:
raise ValueError('必须提供 HTML 内容')
raise ValueError('必须提供 HTML 内容或有效的 URL')

# 延迟导入,避免模块导入期异常导致服务类不可用
try:
Expand Down Expand Up @@ -83,3 +103,21 @@ async def _parse_with_model(self, html_content: str, options: Optional[Dict[str,
if self._inference_service is None:
self._inference_service = get_inference_service()
return await self._inference_service.inference(html_content, options or {})


if __name__ == '__main__':
    import asyncio

    # Manual smoke test: POST a sample URL to the configured crawl service
    # and dump whatever HTML it returns to stdout.
    from llm_web_kit.api.dependencies import get_settings
    settings = get_settings()

    async def main():
        """Fetch one page through the crawler endpoint and print the HTML."""
        payload = {'url': 'https://aws.amazon.com/what-is/retrieval-augmented-generation/'}
        async with httpx.AsyncClient() as client:
            resp = await client.post(settings.crawl_url, json=payload, timeout=60)
            resp.raise_for_status()
            # The crawl service responds with JSON; the page markup lives
            # under the 'html' key (None if absent).
            print(resp.json().get('html'))

    asyncio.run(main())
35 changes: 19 additions & 16 deletions llm_web_kit/main_html_parser/parser/layout_batch_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,22 +73,25 @@ def parse(self, pre_data: PreDataJson) -> PreDataJson:
content, body = self.process(html_source, template_dict_html)

# 相似度计算
if pre_data.get(PreDataJsonKey.TYPICAL_MAIN_HTML, None):
template_main_html = pre_data[PreDataJsonKey.TYPICAL_MAIN_HTML]
if pre_data.get(PreDataJsonKey.SIMILARITY_LAYER, None):
layer = pre_data[PreDataJsonKey.SIMILARITY_LAYER]
else:
layer = self.__get_max_width_layer(template_data)
feature1 = get_feature(template_main_html)
feature2 = get_feature(body)
sim = None
if feature1 is not None and feature2 is not None:
sim = similarity(feature1, feature2, layer_n=layer)
pre_data[PreDataJsonKey.MAIN_HTML_SIM] = sim
if sim is None or sim < SIMILARITY_THRESHOLD:
pre_data[PreDataJsonKey.MAIN_HTML_SUCCESS] = False
else:
pre_data[PreDataJsonKey.MAIN_HTML_SUCCESS] = True
try:
if pre_data.get(PreDataJsonKey.TYPICAL_MAIN_HTML, None):
template_main_html = pre_data[PreDataJsonKey.TYPICAL_MAIN_HTML]
if pre_data.get(PreDataJsonKey.SIMILARITY_LAYER, None):
layer = pre_data[PreDataJsonKey.SIMILARITY_LAYER]
else:
layer = self.__get_max_width_layer(template_data)
feature1 = get_feature(template_main_html)
feature2 = get_feature(body)
sim = None
if feature1 is not None and feature2 is not None:
sim = similarity(feature1, feature2, layer_n=layer)
pre_data[PreDataJsonKey.MAIN_HTML_SIM] = sim
if sim is None or sim < SIMILARITY_THRESHOLD:
pre_data[PreDataJsonKey.MAIN_HTML_SUCCESS] = False
else:
pre_data[PreDataJsonKey.MAIN_HTML_SUCCESS] = True
except Exception:
pre_data[PreDataJsonKey.MAIN_HTML_SUCCESS] = None

# 结果返回
pre_data[PreDataJsonKey.MAIN_HTML] = content
Expand Down