Skip to content

Commit 1f2b422

Browse files
committed
Hub: Inquire information from https://llmtxt.dev/hub
1 parent 68fe7b7 commit 1f2b422

File tree

3 files changed

+81
-0
lines changed

3 files changed

+81
-0
lines changed

src/cratedb_about/cli.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import logging
22
import typing as t
33
from pathlib import Path
4+
from pprint import pprint
45

56
import click
67
from pueblo.util.cli import boot_click
78

89
from cratedb_about.bundle.llmstxt import LllmsTxtBuilder
10+
from cratedb_about.hub.model import LLMsTxtHub
911
from cratedb_about.outline import CrateDbKnowledgeOutline
1012
from cratedb_about.query.core import CrateDbKnowledgeConversation
1113
from cratedb_about.query.model import Example
@@ -99,6 +101,17 @@ def bundle(ctx: click.Context, url: str, format_: str, outdir: Path) -> None:
99101
logger.info("Ready.")
100102

101103

104+
@cli.command()
@click.pass_context
def hub(ctx: click.Context) -> None:
    """
    Inquire information from https://llmtxt.dev/hub.
    """
    # `fetch()` fills the hub object in place and returns that same object,
    # so building the instance first and reading `.items` afterwards is
    # equivalent to chaining the two calls.
    directory = LLMsTxtHub()
    directory.fetch()

    # Dump the collected hub items to stdout in pretty-printed form.
    pprint(directory.items)
    logger.info("Ready.")
113+
114+
102115
@cli.command()
103116
@click.argument("question", type=str, required=False)
104117
@click.option("--backend", type=click.Choice(["openai", "claude"]), default="openai")

src/cratedb_about/hub/__init__.py

Whitespace-only changes.

src/cratedb_about/hub/model.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
import dataclasses
2+
import logging
3+
import typing as t
4+
5+
from bs4 import BeautifulSoup
6+
7+
from cratedb_about.util import get_cache_client
8+
9+
10+
logger = logging.getLogger(__name__)
11+
12+
13+
@dataclasses.dataclass
class Resource:
    """
    A single linked resource of a hub item, identified by its URL.
    """

    # Address of the resource.
    url: str
    # Length of the fetched response text; -1 means "not acquired (yet)".
    size: int = dataclasses.field(default=-1)
17+
18+
19+
@dataclasses.dataclass
class LLMsTxtHubItem:
    """
    One entry (card) of the llms.txt hub listing.
    """

    # Text of the card's heading.
    title: str
    # Text of the card's website paragraph.
    website: str
    # Text of the card's description paragraph.
    description: str
    # Value of the `src` attribute of the card's image.
    logo: str

    # Both collections default to a fresh, empty list per instance.
    tags: t.List[str] = dataclasses.field(default_factory=list)
    resources: t.List[Resource] = dataclasses.field(default_factory=list)
27+
28+
29+
class LLMsTxtHub:
    """
    Reader for the llms.txt hub listing at https://llmtxt.dev/hub.

    `fetch()` downloads the hub page, converts every element carrying the
    `website-card` class into an `LLMsTxtHubItem`, and then records a size
    for each resource linked from those cards.
    """

    url: str = "https://llmtxt.dev/hub"

    def __init__(self):
        self.items: t.List[LLMsTxtHubItem] = []
        # Client from the project's cache helper; ttl is 60*60*24
        # (presumably seconds, i.e. one day -- confirm against the helper).
        self.client = get_cache_client(ttl=60 * 60 * 24)

    def fetch(self) -> "LLMsTxtHub":
        """
        Download the hub page, populate `self.items`, and acquire sizes.

        :return: This instance, to permit call chaining.
        """
        # NOTE(review): the response object itself (not `.text`) is handed
        # to BeautifulSoup, unchanged from the previous behavior -- confirm
        # the client's response type is accepted as markup.
        index_html = self.client.get(self.url)
        bs = BeautifulSoup(index_html, "html.parser")
        cards = bs.find_all(attrs={"class": "website-card"})
        self.items = [self.card_to_model(card) for card in cards]
        self.acquire_sizes()
        return self

    def acquire_sizes(self) -> None:
        """
        Fetch each resource of each item and store the length of its text.

        This is best-effort: a failure on one resource is logged as a
        warning and leaves that resource's `size` untouched, so the
        remaining resources are still processed.
        """
        logger.info("Acquiring sizes for %s items", len(self.items))
        for item in self.items:
            logger.info("Acquiring size for %s", item)
            for resource in item.resources:
                try:
                    response = self.client.get(resource.url)
                    # Size is the number of characters of the response text.
                    resource.size = len(response.text)
                except Exception as e:
                    # Name the failing URL: an item may carry several
                    # resources, and logging only the item would not tell
                    # which one could not be fetched.
                    logger.warning(
                        "Failed to acquire size for %s (%s): %s",
                        resource.url,
                        item.title,
                        e,
                    )

    @staticmethod
    def card_to_model(card) -> "LLMsTxtHubItem":
        """
        Convert one parsed `website-card` element into an `LLMsTxtHubItem`.

        :param card: Parsed element of a single hub card.
        :return: The item populated from the card's first `div`.
        """
        # All lookups are made within the first `div` found in the card.
        body = card.find(name="div")
        title = body.find(name="h3").text
        tags = [span.text for span in body.find_all(name="span")]
        website = body.find(name="p", attrs={"class": "text-sm"}).text
        # The description paragraph is the one that also has a `title` attribute.
        description = body.find(name="p", attrs={"class": "text-sm", "title": True}).text
        logo_url = body.find(name="img").get("src")
        # Anchors without a usable `href` are skipped: they would otherwise
        # yield `Resource(url=None)`, which can never be fetched.
        hrefs = (anchor.get("href") for anchor in body.find_all(name="a"))
        resources = [Resource(url=href) for href in hrefs if href]
        return LLMsTxtHubItem(
            title=title,
            website=website,
            description=description,
            logo=logo_url,
            tags=tags,
            resources=resources,
        )

0 commit comments

Comments
 (0)