- 2024/11/25: Project Initialization
llm-web-kit is a Python library that extracts the main content of HTML pages and converts it into structured Markdown.
- Remove headers, footers, footnotes, page numbers, etc., to ensure semantic coherence.
- Output text in human-readable order, suitable for single-column, multi-column, and complex layouts.
This diagram shows three main HTML content extraction methods:
- extract by magic_html+recognize: Two-stage complete extraction that first uses magic-html to extract main content, then converts it to structured markdown.
- only extract by recognize: Direct content recognition that converts main_html to structured format without main content identification.
- only extract main_html by magic-html: First-stage only extraction that identifies and extracts main content area while preserving HTML structure.
from llm_web_kit.simple import extract_content_from_html_with_magic_html
from loguru import logger
def extract(url:str, html:str) -> str:
    """Run the magic-html based extraction on one page.

    Args:
        url: Address of the page, handed to the extractor unchanged.
        html: Raw HTML source of the page.

    Returns:
        Whatever ``extract_content_from_html_with_magic_html`` returns, or
        ``None`` when it raises (the exception is logged, not propagated).
    """
    # Alternative call forms:
    #   extract_content_from_html_with_magic_html(url, html, 'mm_md')
    #   extract_content_from_html_with_magic_html(url, html, 'mm_md', use_raw_image_url=True)
    #   extract_content_from_html_with_magic_html(url, html, language='zh')
    try:
        return extract_content_from_html_with_magic_html(url, html)
    except Exception as exc:
        logger.exception(exc)
    return None
if __name__ == "__main__":
    # Inline sample page: a price box styled `display: none` followed by
    # one ordinary paragraph.
    html = """<html><body>
<div class="options-div-0-0 option-box__items" style="display: none;">
<span class="bedroom-rate__title">Room Only Rate</span>
<span class="bedroom-rate__price">£1,230.00</span>
</div>
<p>正常内容</p>
</body></html>"""
    # The sample is not tied to a real address, so the URL is left empty.
    url = ""
    markdown = extract(url, html)
    print(markdown)
from llm_web_kit.simple import extract_content_from_main_html
from loguru import logger
def extract(url:str, html:str) -> str:
    """Convert HTML with ``extract_content_from_main_html``.

    Args:
        url: Address of the page, handed to the extractor unchanged.
        html: HTML source to convert.

    Returns:
        The extractor's result, or ``None`` when it raises (the exception
        is logged, not propagated).
    """
    # Alternative call forms:
    #   extract_content_from_main_html(url, html, 'mm_md')
    #   extract_content_from_main_html(url, html, 'mm_md', use_raw_image_url=True)
    #   extract_content_from_main_html(url, html, language='zh')
    try:
        converted = extract_content_from_main_html(url, html)
    except Exception as err:
        logger.exception(err)
        return None
    else:
        return converted
if __name__ == "__main__":
    # No real address belongs to the inline sample below.
    url = ""
    # Sample input: a `display: none` price box plus one normal paragraph.
    html = """<html><body>
<div class="options-div-0-0 option-box__items" style="display: none;">
<span class="bedroom-rate__title">Room Only Rate</span>
<span class="bedroom-rate__price">£1,230.00</span>
</div>
<p>正常内容</p>
</body></html>"""
    markdown = extract(url, html)
    print(markdown)
from llm_web_kit.simple import extract_main_html_only
from loguru import logger
def extract(url:str, html:str) -> str:
    """Return only the main-content HTML of a page.

    Args:
        url: Address of the page, handed to the extractor unchanged.
        html: Raw HTML source of the page.

    Returns:
        The result of ``extract_main_html_only``, or ``None`` when it
        raises (the exception is logged, not propagated).
    """
    try:
        return extract_main_html_only(url, html)
    except Exception as exc:
        logger.exception(exc)
    return None
if __name__ == "__main__":
    # Sample page used to demonstrate main-HTML extraction: a hidden
    # (`display: none`) price box and a single visible paragraph.
    html = """<html><body>
<div class="options-div-0-0 option-box__items" style="display: none;">
<span class="bedroom-rate__title">Room Only Rate</span>
<span class="bedroom-rate__price">£1,230.00</span>
</div>
<p>正常内容</p>
</body></html>"""
    url = ""  # the inline sample has no source address
    main_html = extract(url, html)
    print(main_html)
import traceback
from loguru import logger
from llm_web_kit.main_html_parser.simplify_html.simplify_html import simplify_html
from llm_web_kit.input.pre_data_json import PreDataJson, PreDataJsonKey
from llm_web_kit.main_html_parser.parser.tag_mapping import MapItemToHtmlTagsParser
def extract(response_json: dict, html:str) -> tuple:
    """Map an LLM labelling response back onto the page to get its main HTML.

    The page is passed through ``simplify_html``; the second element of its
    result, the original HTML and ``response_json`` are stored in a
    ``PreDataJson`` which is then processed by
    ``MapItemToHtmlTagsParser.parse_single``.

    Args:
        response_json: Stored under ``PreDataJsonKey.LLM_RESPONSE``. The
            accompanying example passes a dict of ``'item_id N'`` keys with
            0/1 values.
        html: Raw HTML source of the page.

    Returns:
        A ``(main_html, is_success)`` pair read from the parser output.
        When any step raises, the exception is logged and ``(None, False)``
        is returned, so callers can always unpack two values.
    """
    try:
        _, typical_raw_tag_html, _ = simplify_html(html)
        pre_data = PreDataJson({})
        pre_data[PreDataJsonKey.TYPICAL_RAW_TAG_HTML] = typical_raw_tag_html
        pre_data[PreDataJsonKey.TYPICAL_RAW_HTML] = html
        pre_data['success_label_enable'] = True
        pre_data[PreDataJsonKey.LLM_RESPONSE] = response_json
        parser = MapItemToHtmlTagsParser({})
        pre_data = parser.parse_single(pre_data)
        main_html = pre_data[PreDataJsonKey.TYPICAL_MAIN_HTML]
        is_success = pre_data[PreDataJsonKey.TYPICAL_MAIN_HTML_SUCCESS]
        return main_html, is_success
    except Exception as e:
        logger.exception(e)
        # Keep the two-element shape on failure: a bare None would make
        # `main_html, is_success = extract(...)` raise TypeError.
        return None, False
if __name__ == "__main__":
    # Page source to process; left empty in this example.
    html = ""
    # Example response: one 0/1 value per 'item_id N' key.
    response_json = {
        "item_id 1": 0,
        "item_id 2": 1,
        "item_id 3": 1,
    }
    main_html, is_success = extract(response_json, html)
from llm_web_kit.libs.html_utils import get_plain_text_fast
# HTML to take the plain text from; left empty in this example.
html_source = ""
# Get the plain text of the HTML through the library helper.
text = get_plain_text_fast(html_source)
# NOTE(review): detect_lang is not imported in this snippet; presumably a
# follow-up language-detection step on the extracted text. Confirm its import path.
# language = detect_lang(text)