|
| 1 | +import dataclasses |
| 2 | +import logging |
| 3 | +import typing as t |
| 4 | + |
| 5 | +from bs4 import BeautifulSoup |
| 6 | + |
| 7 | +from cratedb_about.util import get_cache_client |
| 8 | + |
| 9 | + |
| 10 | +logger = logging.getLogger(__name__) |
| 11 | + |
| 12 | + |
@dataclasses.dataclass()
class Resource:
    """A single URL attached to a hub entry, plus the size of its content."""

    # Address of the resource.
    url: str
    # Length of the fetched response text; stays at -1 until it is measured.
    size: int = dataclasses.field(default=-1)
| 17 | + |
| 18 | + |
@dataclasses.dataclass()
class LLMsTxtHubItem:
    """
    One entry ("card") of the llms.txt hub listing.

    Instances are built by `LLMsTxtHub.card_to_model` from the HTML of a
    single card element.
    """

    # Text of the card's heading.
    title: str
    # Text of the card's website paragraph.
    website: str
    # Text of the card's description paragraph.
    description: str
    # Value of the `src` attribute of the card's image.
    logo: str
    # Texts of the card's `<span>` elements; a fresh list per instance.
    tags: t.List[str] = dataclasses.field(default_factory=list)
    # Resources built from the card's anchor hrefs; a fresh list per instance.
    resources: t.List[Resource] = dataclasses.field(default_factory=list)
| 27 | + |
| 28 | + |
class LLMsTxtHub:
    """
    Scrape the llms.txt hub index page into a list of `LLMsTxtHubItem`.

    Usage: `LLMsTxtHub().fetch().items`.
    """

    # Address of the hub index page that lists all entries.
    url: str = "https://llmtxt.dev/hub"

    def __init__(self) -> None:
        """Start with an empty item list and a client obtained with a one-day TTL."""
        self.items: t.List[LLMsTxtHubItem] = []
        # 60 * 60 * 24 seconds == one day.
        self.client = get_cache_client(ttl=60 * 60 * 24)

    def fetch(self) -> "LLMsTxtHub":
        """
        Download the hub index, parse all cards, and measure their resources.

        Returns:
            This instance, with `items` replaced by the freshly parsed
            entries, so calls can be chained.

        Raises:
            Whatever the client raises when the index page cannot be
            retrieved, and whatever `card_to_model` raises for a card that
            lacks one of the expected elements.
        """
        response = self.client.get(self.url)
        # Parse the decoded body, the same way `acquire_sizes` reads
        # `response.text`. Fall back to the raw return value in case the
        # client hands back the markup directly.
        index_html = getattr(response, "text", response)
        bs = BeautifulSoup(index_html, "html.parser")
        cards = bs.find_all(attrs={"class": "website-card"})
        self.items = [self.card_to_model(card) for card in cards]
        self.acquire_sizes()
        return self

    def acquire_sizes(self) -> None:
        """
        Fetch every resource of every item and record the length of its text.

        This is best-effort: a failing resource is logged and keeps its
        previous `size`, and processing continues with the next one.
        """
        logger.info("Acquiring sizes for %d items", len(self.items))
        for item in self.items:
            logger.info("Acquiring size for %s", item)
            for resource in item.resources:
                try:
                    response = self.client.get(resource.url)
                    # Length of the decoded text, i.e. characters, not bytes.
                    resource.size = len(response.text)
                except Exception as e:
                    # Deliberately broad: one unreachable or malformed
                    # resource must not abort the whole run. Name the exact
                    # URL, because an item may carry several resources.
                    logger.warning(
                        "Failed to acquire size for %s (%s): %s",
                        resource.url,
                        item.title,
                        e,
                    )

    @staticmethod
    def card_to_model(card) -> "LLMsTxtHubItem":
        """
        Convert one parsed card element into an `LLMsTxtHubItem`.

        Args:
            card: An element as returned by `find_all` in `fetch`; only its
                first `<div>` is inspected.

        Returns:
            The populated item. Each anchor that has a non-empty `href`
            becomes one `Resource` with an unmeasured size.

        Raises:
            AttributeError: presumably when the `<div>`, `<h3>`, one of the
                `<p>` elements, or the `<img>` is missing, because the
                lookup then yields `None` — confirm against the
                BeautifulSoup documentation.
        """
        divs = card.find(name="div")
        title = divs.find(name="h3").text
        tags = [tag.text for tag in divs.find_all(name="span")]
        # The first `<p class="text-sm">` is taken as the website.
        website = divs.find(name="p", attrs={"class": "text-sm"}).text
        # `"title": True` selects the paragraph that carries a `title` attribute.
        description = divs.find(
            name="p", attrs={"class": "text-sm", "title": True}
        ).text
        logo_url = divs.find(name="img").get("src")
        # Skip anchors without a usable href: they would yield
        # `Resource(url=None)` and a guaranteed failed request later.
        hrefs = (anchor.get("href") for anchor in divs.find_all(name="a"))
        resources = [Resource(url=href) for href in hrefs if href]
        return LLMsTxtHubItem(
            title=title,
            website=website,
            description=description,
            logo=logo_url,
            tags=tags,
            resources=resources,
        )
0 commit comments