Changelog

English | 简体中文

Changelog

2024/11/25: Project Initialization

llm-web-kit

Project Introduction

llm-web-kit is a Python library for extracting the main content of HTML pages and converting it into structured formats such as Markdown.

Key Features

Remove headers, footers, footnotes, page numbers, etc., to ensure semantic coherence.
Output text in human-readable order, suitable for single-column, multi-column, and complex layouts.

Quick Start

This diagram shows three main HTML content extraction methods:

extract by magic_html+recognize: Complete two-stage extraction that first uses magic-html to extract the main content, then converts it to structured Markdown.
only extract by recognize: Direct content recognition that converts main_html to a structured format, skipping main-content identification.
only extract main_html by magic-html: First-stage-only extraction that identifies and extracts the main content area while preserving the HTML structure.

extract by magic_html+recognize

from llm_web_kit.simple import extract_content_from_html_with_magic_html
from loguru import logger

def extract(url:str, html:str) -> str:
    """Extract content from ``html`` via the magic-html based helper.

    Delegates to ``extract_content_from_html_with_magic_html(url, html)`` and
    returns its result unchanged. If the helper raises, the exception is
    logged with loguru and ``None`` is returned instead.

    Other call forms of the same helper:
        extract_content_from_html_with_magic_html(url, html, 'mm_md')
        extract_content_from_html_with_magic_html(url, html, 'mm_md', use_raw_image_url=True)
        extract_content_from_html_with_magic_html(url, html, language='zh')
    """
    try:
        result = extract_content_from_html_with_magic_html(url, html)
    except Exception as err:
        # Best-effort: log the full traceback and signal failure with None.
        logger.exception(err)
        return None
    return result

if __name__ == "__main__":
    # Demo input: a block hidden with "display: none" followed by one
    # visible paragraph.
    url = ""
    html = '''<html><body>
    <div class="options-div-0-0 option-box__items" style="display: none;">
        <span class="bedroom-rate__title">Room Only Rate</span>
        <span class="bedroom-rate__price">£1,230.00</span>
    </div>
    <p>正常内容</p>
    </body></html>'''
    # Prints the extraction result (None if extract() logged an error).
    print(extract(url, html))

only extract by recognize

from llm_web_kit.simple import extract_content_from_main_html
from loguru import logger

def extract(url:str, html:str) -> str:
    """Convert already-isolated main HTML via the recognize-only helper.

    Delegates to ``extract_content_from_main_html(url, html)`` and returns
    its result unchanged. If the helper raises, the exception is logged with
    loguru and ``None`` is returned instead.

    Other call forms of the same helper:
        extract_content_from_main_html(url, html, 'mm_md')
        extract_content_from_main_html(url, html, 'mm_md', use_raw_image_url=True)
        extract_content_from_main_html(url, html, language='zh')
    """
    try:
        converted = extract_content_from_main_html(url, html)
    except Exception as exc:
        # Best-effort: log the full traceback, then fall through to None.
        logger.exception(exc)
    else:
        return converted
    return None

if __name__ == "__main__":
    # Demo input: a block hidden with "display: none" followed by one
    # visible paragraph.
    url = ""
    html = '''<html><body>
    <div class="options-div-0-0 option-box__items" style="display: none;">
        <span class="bedroom-rate__title">Room Only Rate</span>
        <span class="bedroom-rate__price">£1,230.00</span>
    </div>
    <p>正常内容</p>
    </body></html>'''
    # Prints the extraction result (None if extract() logged an error).
    print(extract(url, html))

only extract main_html by magic-html

from llm_web_kit.simple import extract_main_html_only
from loguru import logger

def extract(url:str, html:str) -> str:
    """Return only the main HTML of ``html``.

    Delegates to ``extract_main_html_only(url, html)`` and returns its result
    unchanged. If the helper raises, the exception is logged with loguru and
    ``None`` is returned instead.
    """
    try:
        return extract_main_html_only(url, html)
    except Exception as error:
        # Best-effort: log the full traceback and signal failure with None.
        logger.exception(error)
        return None

if __name__ == "__main__":
    # Demo input: a block hidden with "display: none" followed by one
    # visible paragraph.
    url = ""
    html = '''<html><body>
    <div class="options-div-0-0 option-box__items" style="display: none;">
        <span class="bedroom-rate__title">Room Only Rate</span>
        <span class="bedroom-rate__price">£1,230.00</span>
    </div>
    <p>正常内容</p>
    </body></html>'''
    # Prints the extracted main HTML (None if extract() logged an error).
    print(extract(url, html))

extract main_html by model response

import traceback
from loguru import logger
from llm_web_kit.main_html_parser.simplify_html.simplify_html import simplify_html
from llm_web_kit.input.pre_data_json import PreDataJson, PreDataJsonKey
from llm_web_kit.main_html_parser.parser.tag_mapping import MapItemToHtmlTagsParser

def extract(response_json: dict, html:str) -> tuple:
    """Derive the main HTML of a page from a model response.

    The raw HTML is simplified with ``simplify_html``; the tagged variant,
    the raw HTML and the model response are placed in a ``PreDataJson`` and
    handed to ``MapItemToHtmlTagsParser.parse_single``.

    Args:
        response_json: Model response. The usage example passes a dict that
            maps ``'item_id N'`` keys to 0/1 values (exact meaning of the
            values — TODO confirm against the parser).
        html: Raw HTML source of the page.

    Returns:
        A ``(main_html, is_success)`` pair read from the parser output. If
        any step raises, the exception is logged with loguru and
        ``(None, False)`` is returned, so callers that unpack two values
        (as the usage example does) do not hit a ``TypeError``.
    """
    try:
        # Only the second element of simplify_html's result is used here.
        _, typical_raw_tag_html, _ = simplify_html(html)
        pre_data = PreDataJson({})
        pre_data[PreDataJsonKey.TYPICAL_RAW_TAG_HTML] = typical_raw_tag_html
        pre_data[PreDataJsonKey.TYPICAL_RAW_HTML] = html
        # NOTE(review): presumably asks the parser to also emit the success
        # flag read below — confirm.
        pre_data['success_label_enable'] = True
        pre_data[PreDataJsonKey.LLM_RESPONSE] = response_json
        parser = MapItemToHtmlTagsParser({})
        pre_data = parser.parse_single(pre_data)
        main_html = pre_data[PreDataJsonKey.TYPICAL_MAIN_HTML]
        is_success = pre_data[PreDataJsonKey.TYPICAL_MAIN_HTML_SUCCESS]
        return main_html, is_success
    except Exception as e:
        logger.exception(e)
    # Keep the return shape identical on failure: a bare None here would
    # break "main_html, is_success = extract(...)".
    return None, False

if __name__ == "__main__":
    # Example model response: one entry per item id.
    response_json = {
        'item_id 1': 0,
        'item_id 2': 1,
        'item_id 3': 1,
    }
    # Placeholder: supply the page's raw HTML here.
    html = ""
    main_html, is_success = extract(response_json, html)

extract plain text from html source

from llm_web_kit.libs.html_utils import get_plain_text_fast
# Placeholder: supply the raw HTML source to convert.
html_source = ""
# Extract the plain text of the HTML source.
text = get_plain_text_fast(html_source)
# Optional follow-up (detect_lang is not imported in this snippet):
# language = detect_lang(text)

Name		Name	Last commit message	Last commit date
Latest commit History 615 Commits
.github		.github
bench		bench
docs		docs
jupyter		jupyter
llm_web_kit		llm_web_kit
requirements		requirements
tests		tests
.codecov.yml		.codecov.yml
.gitignore		.gitignore
.pre-commit-config.yaml		.pre-commit-config.yaml
API_zh-CN.md		API_zh-CN.md
README.md		README.md
README_zh-CN.md		README_zh-CN.md
requirements.txt		requirements.txt
setup.py		setup.py
update_version.py		update_version.py

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Repository files navigation

Changelog

Table of Contents

llm-web-kit

Project Introduction

Key Features

Quick Start

extract by magic_html+recognize

only extract by recognize

only extract main_html by magic-html

extract main_html by model response

extract plain text from html source

Pipeline

Usage

TODO

Known Issues

FAQ

contributors

License Information

Acknowledgments

Citation

Star History

links

About

Uh oh!

Releases 13

Packages

Uh oh!

Contributors 21

Uh oh!

Languages

ccprocessor/llm-webkit-mirror

Folders and files

Latest commit

History

Repository files navigation

Changelog

Table of Contents

llm-web-kit

Project Introduction

Key Features

Quick Start

extract by magic_html+recognize

only extract by recognize

only extract main_html by magic-html

extract main_html by model response

extract plain text from html source

Pipeline

Usage

TODO

Known Issues

FAQ

contributors

License Information

Acknowledgments

Citation

Star History

links

About

Resources

Uh oh!

Stars

Watchers

Forks

Releases 13

Packages 0

Uh oh!

Contributors 21

Uh oh!

Languages

Packages