diff --git a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py index 3c37da8e..05b8352e 100644 --- a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py +++ b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py @@ -3,6 +3,7 @@ import uuid from typing import Dict, List, Tuple +from bs4 import BeautifulSoup from lxml import etree, html from selectolax.parser import HTMLParser @@ -858,8 +859,12 @@ def simplify_html(html_str) -> etree.Element: _xpath_mapping: xpath映射 """ # 使用selectolax的HTMLParser来修复html - soup = HTMLParser(html_str) - fixed_html = soup.html + try: + soup = HTMLParser(html_str) + fixed_html = soup.html + except Exception: + soup = BeautifulSoup(html_str, 'html.parser') + fixed_html = str(soup) preprocessed_html = remove_xml_declaration(fixed_html) # 注释通过lxml的HTMLParser的remove_comments参数处理