Skip to content

ccprocessor/llm-webkit-mirror

Repository files navigation

Changelog

  • 2024/11/25: Project Initialization

Table of Contents

  1. llm-web-kit
  2. TODO
  3. Known Issues
  4. FAQ
  5. All Thanks To Our Contributors
  6. License Information
  7. Acknowledgments
  8. Citation
  9. Star History
  10. Links

llm-web-kit

Project Introduction

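llm-web-kit is a Python library for extracting the main content from HTML pages and converting it into structured formats such as Markdown.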

Key Features

  • Remove headers, footers, footnotes, page numbers, etc., to ensure semantic coherence.
  • Output text in human-readable order, suitable for single-column, multi-column, and complex layouts.

Quick Start

extract_method picture

This diagram shows three main HTML content extraction methods:

  1. extract by magic_html+recognize: Two-stage complete extraction that first uses magic-html to extract main content, then converts it to structured markdown.

  2. only extract by recognize: Direct content recognition that converts main_html to structured format without main content identification.

  3. only extract main_html by magic-html: First-stage only extraction that identifies and extracts main content area while preserving HTML structure.

extract by magic_html+recognize

from llm_web_kit.simple import extract_content_from_html_with_magic_html
from loguru import logger

def extract(url: str, html: str) -> "str | None":
    """Run the two-stage (magic-html + recognize) extraction on one page.

    Args:
        url: URL of the page; forwarded to the extractor unchanged.
        html: Raw HTML source of the page.

    Returns:
        The value returned by ``extract_content_from_html_with_magic_html``
        for its default output format, or ``None`` if the extractor raises.
    """
    try:
        # Alternative call forms:
        #   extract_content_from_html_with_magic_html(url, html, 'mm_md')
        #   extract_content_from_html_with_magic_html(url, html, 'mm_md', use_raw_image_url=True)
        #   extract_content_from_html_with_magic_html(url, html, language='zh')
        return extract_content_from_html_with_magic_html(url, html)
    except Exception as e:
        # Best-effort example: log the traceback and report failure as None.
        logger.exception(e)
        return None

if __name__ == "__main__":
    # Demo input: a rate box hidden with ``display: none`` followed by one
    # visible paragraph. The URL is left empty for this example.
    page_url = ""
    page_html = '''<html><body>
    <div class="options-div-0-0 option-box__items" style="display: none;">
        <span class="bedroom-rate__title">Room Only Rate</span>
        <span class="bedroom-rate__price">£1,230.00</span>
    </div>
    <p>正常内容</p>
    </body></html>'''
    # Print whatever extract() returns (None if extraction failed).
    print(extract(page_url, page_html))

only extract by recognize

from llm_web_kit.simple import extract_content_from_main_html
from loguru import logger

def extract(url: str, html: str) -> "str | None":
    """Run the recognize-only extraction on one page.

    Args:
        url: URL of the page; forwarded to the extractor unchanged.
        html: HTML to convert. NOTE(review): the README describes this mode as
            converting ``main_html`` without main-content identification, so
            this is presumably already-extracted main HTML — confirm.

    Returns:
        The value returned by ``extract_content_from_main_html`` for its
        default output format, or ``None`` if the extractor raises.
    """
    try:
        # Alternative call forms:
        #   extract_content_from_main_html(url, html, 'mm_md')
        #   extract_content_from_main_html(url, html, 'mm_md', use_raw_image_url=True)
        #   extract_content_from_main_html(url, html, language='zh')
        return extract_content_from_main_html(url, html)
    except Exception as e:
        # Best-effort example: log the traceback and report failure as None.
        logger.exception(e)
        return None

if __name__ == "__main__":
    # Demo input: a rate box hidden with ``display: none`` followed by one
    # visible paragraph. The URL is left empty for this example.
    page_url = ""
    page_html = '''<html><body>
    <div class="options-div-0-0 option-box__items" style="display: none;">
        <span class="bedroom-rate__title">Room Only Rate</span>
        <span class="bedroom-rate__price">£1,230.00</span>
    </div>
    <p>正常内容</p>
    </body></html>'''
    # Print whatever extract() returns (None if extraction failed).
    print(extract(page_url, page_html))

only extract main_html by magic-html

from llm_web_kit.simple import extract_main_html_only
from loguru import logger

def extract(url: str, html: str) -> "str | None":
    """Run only the first stage: extract the main HTML of one page.

    Args:
        url: URL of the page; forwarded to the extractor unchanged.
        html: Raw HTML source of the page.

    Returns:
        The value returned by ``extract_main_html_only``, or ``None`` if the
        extractor raises.
    """
    try:
        return extract_main_html_only(url, html)
    except Exception as e:
        # Best-effort example: log the traceback and report failure as None.
        logger.exception(e)
        return None

if __name__ == "__main__":
    # Demo input: a rate box hidden with ``display: none`` followed by one
    # visible paragraph. The URL is left empty for this example.
    page_url = ""
    page_html = '''<html><body>
    <div class="options-div-0-0 option-box__items" style="display: none;">
        <span class="bedroom-rate__title">Room Only Rate</span>
        <span class="bedroom-rate__price">£1,230.00</span>
    </div>
    <p>正常内容</p>
    </body></html>'''
    # Print whatever extract() returns (None if extraction failed).
    print(extract(page_url, page_html))

extract main_html by model response

import traceback
from loguru import logger
from llm_web_kit.main_html_parser.simplify_html.simplify_html import simplify_html
from llm_web_kit.input.pre_data_json import PreDataJson, PreDataJsonKey
from llm_web_kit.main_html_parser.parser.tag_mapping import MapItemToHtmlTagsParser

def extract(response_json: dict, html: str) -> tuple:
    """Build the main HTML of a page from a model's per-item response.

    Args:
        response_json: The model response, keyed by item id (for example
            ``{'item_id 1': 0, 'item_id 2': 1}``). It is stored under
            ``PreDataJsonKey.LLM_RESPONSE`` without modification.
        html: Raw HTML source of the page.

    Returns:
        A 2-tuple ``(main_html, is_success)`` read from the parser output.
        When any step raises, the exception is logged and ``(None, False)``
        is returned so that callers can always unpack two values.
    """
    try:
        # simplify_html returns three values; only the second one is needed.
        _, typical_raw_tag_html, _ = simplify_html(html)

        # Assemble the parser input record.
        pre_data = PreDataJson({})
        pre_data[PreDataJsonKey.TYPICAL_RAW_TAG_HTML] = typical_raw_tag_html
        pre_data[PreDataJsonKey.TYPICAL_RAW_HTML] = html
        pre_data['success_label_enable'] = True
        pre_data[PreDataJsonKey.LLM_RESPONSE] = response_json

        parser = MapItemToHtmlTagsParser({})
        pre_data = parser.parse_single(pre_data)

        main_html = pre_data[PreDataJsonKey.TYPICAL_MAIN_HTML]
        is_success = pre_data[PreDataJsonKey.TYPICAL_MAIN_HTML_SUCCESS]
        return main_html, is_success
    except Exception as e:
        logger.exception(e)
        # Keep the return shape consistent with the success path: a bare None
        # would make ``main_html, is_success = extract(...)`` raise TypeError.
        return None, False

if __name__ == "__main__":
    # Placeholder inputs: a sample model response keyed by item id, and an
    # empty HTML string to be replaced with a real page source.
    sample_response = {'item_id 1': 0, 'item_id 2': 1, 'item_id 3': 1}
    raw_html = ""
    main_html, is_success = extract(sample_response, raw_html)

extract plain text from html source

from llm_web_kit.libs.html_utils import get_plain_text_fast
# Placeholder: replace the empty string with the HTML source to process.
html_source = ""
# Extract the plain text of the HTML source.
text = get_plain_text_fast(html_source)
# Optional follow-up, left disabled because detect_lang is not imported in
# this snippet:
# language = detect_lang(text)

Pipeline

  1. HTML pre-dedup
  2. domain clustering
  3. layout clustering
  4. typical layout node selection
  5. HTML node selection by LLM
  6. HTML parsing, layout by layout

Usage

TODO

Known Issues

FAQ

contributors

contributors

License Information

Acknowledgments

Citation

Star History

Star History Chart

links

About

No description, website, or topics provided.

Resources

Stars

Watchers

Forks

Packages

No packages published

Contributors 21