Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions src/cratedb_about/cli.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import logging
import typing as t
from pathlib import Path
from pprint import pprint
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Use dedicated formatters instead of pprint

The pprint module is flagged by the linter as it's generally intended for debugging rather than production code.

Apply this diff to use structured formatters instead:

-from pprint import pprint
+import dataclasses
+import json

Committable suggestion skipped: line range outside the PR's diff.

🤖 Prompt for AI Agents
In src/cratedb_about/cli.py at line 4, replace the use of the pprint module with
a dedicated structured formatter suitable for production, such as the logging
module's formatters or JSON formatting. Remove the import of pprint and update
the code to output data using the chosen formatter to ensure cleaner, more
maintainable, and production-appropriate output.


import click
from pueblo.util.cli import boot_click

from cratedb_about.bundle.llmstxt import LllmsTxtBuilder
from cratedb_about.hub.model import LLMsTxtHub
from cratedb_about.outline import CrateDbKnowledgeOutline
from cratedb_about.query.core import CrateDbKnowledgeConversation
from cratedb_about.query.model import Example
Expand Down Expand Up @@ -99,6 +101,17 @@ def bundle(ctx: click.Context, url: str, format_: str, outdir: Path) -> None:
logger.info("Ready.")


@cli.command()
@click.pass_context
def hub(ctx: click.Context) -> None:
    """
    Inquire information from https://llmtxt.dev/hub.
    """
    # NOTE: the docstring above doubles as the command's help text, so it is
    # kept verbatim; implementation notes live in comments instead.
    #
    # `fetch()` fills the hub instance in place and returns that same
    # instance, so building it first and fetching second is equivalent to
    # chaining the two calls.
    directory = LLMsTxtHub()
    directory.fetch()

    # Dump the collected entries to stdout.
    pprint(directory.items)
    logger.info("Ready.")


@cli.command()
@click.argument("question", type=str, required=False)
@click.option("--backend", type=click.Choice(["openai", "claude"]), default="openai")
Expand Down
Empty file.
68 changes: 68 additions & 0 deletions src/cratedb_about/hub/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import dataclasses
import logging
import typing as t

from bs4 import BeautifulSoup

from cratedb_about.util import get_cache_client


logger = logging.getLogger(__name__)


@dataclasses.dataclass
class Resource:
    """
    A URL linked from a hub entry, plus the size recorded for it.
    """

    # Address of the linked resource.
    url: str

    # Size recorded for the resource. The default of -1 is what remains when
    # no size has been assigned after construction.
    size: int = -1


@dataclasses.dataclass
class LLMsTxtHubItem:
    """
    One entry of the hub listing.

    The four text fields are required; the two list fields default to a fresh
    empty list per instance.
    """

    # Required text fields.
    title: str
    website: str
    description: str
    logo: str

    # Optional collections. `default_factory` gives every instance its own
    # list instead of sharing one mutable default.
    tags: t.List[str] = dataclasses.field(default_factory=list)
    resources: t.List[Resource] = dataclasses.field(default_factory=list)


class LLMsTxtHub:
    """
    Collect the entries listed on the hub page at `url`.

    Typical use is `LLMsTxtHub().fetch().items`: `fetch()` downloads the hub
    page, converts every element carrying the `website-card` class into an
    `LLMsTxtHubItem`, and then tries to record a size for each linked resource.
    """

    url: str = "https://llmtxt.dev/hub"

    def __init__(self):
        # Filled by `fetch()`; empty until then.
        self.items: t.List[LLMsTxtHubItem] = []
        # HTTP client from the project helper. The TTL is 60*60*24, i.e. one
        # day if the helper interprets it as seconds -- TODO confirm the unit.
        self.client = get_cache_client(ttl=60 * 60 * 24)

    def fetch(self) -> "LLMsTxtHub":
        """
        Download the hub page and populate `self.items`.

        Returns:
            This instance, so calls can be chained.

        Raises:
            ValueError: If a card lacks one of its required elements
                (see `card_to_model`).
            Exception: Whatever the HTTP client raises for a transport failure
                or, via `raise_for_status()`, for an unsuccessful status code.
        """
        response = self.client.get(self.url)
        # Without this check an error page would be parsed like the real
        # index and silently yield zero items.
        response.raise_for_status()
        # Parse the decoded text, the same attribute `acquire_sizes` reads,
        # rather than handing the response object itself to the parser.
        soup = BeautifulSoup(response.text, "html.parser")
        cards = soup.find_all(attrs={"class": "website-card"})
        self.items = [self.card_to_model(card) for card in cards]
        self.acquire_sizes()
        return self

    def acquire_sizes(self) -> None:
        """
        Record a size for every resource of every item, best effort.

        The size is the number of characters in the decoded response text,
        not the byte count. A resource whose download fails is logged and
        keeps its previous `size` value; the loop continues with the next one.
        """
        logger.info("Acquiring sizes for %d items", len(self.items))
        for item in self.items:
            logger.info("Acquiring size for %s", item)
            for resource in item.resources:
                # Deliberately broad: one unreachable resource must not abort
                # the whole run.
                try:
                    response = self.client.get(resource.url)
                    # Do not record the length of an error page as the size.
                    response.raise_for_status()
                    resource.size = len(response.text)
                except Exception as e:
                    # Name the resource that failed, not the whole item.
                    logger.warning(
                        "Failed to acquire size for %s (item %r): %s",
                        resource.url,
                        item.title,
                        e,
                    )

    @staticmethod
    def _find_required(parent, label, **criteria):
        """Return the first match for `criteria` under `parent`, or raise ValueError."""
        element = parent.find(**criteria)
        # Compare against None explicitly instead of relying on the
        # truthiness of the parser's element objects.
        if element is None:
            raise ValueError(f"Hub card is missing its {label} element")
        return element

    @staticmethod
    def card_to_model(card) -> LLMsTxtHubItem:
        """
        Convert one hub card element into an `LLMsTxtHubItem`.

        All lookups happen inside the first `<div>` of the card:

        - title: text of the first `<h3>`
        - tags: text of every `<span>`
        - website: text of the first `<p class="text-sm">`
        - description: text of the first `<p class="text-sm">` that also has
          a `title` attribute
        - logo: `src` of the first `<img>`, or an empty string when absent
        - resources: one `Resource` per `<a>` that has a non-empty `href`

        Args:
            card: Parsed element for one card, offering `find` / `find_all`.

        Returns:
            The populated item.

        Raises:
            ValueError: If the container, title, website, or description
                element is missing.
        """
        require = LLMsTxtHub._find_required

        container = require(card, "container (div)", name="div")
        title = require(container, "title (h3)", name="h3").text
        tags = [span.text for span in container.find_all(name="span")]
        website = require(
            container, "website (p.text-sm)", name="p", attrs={"class": "text-sm"}
        ).text
        description = require(
            container,
            "description (p.text-sm[title])",
            name="p",
            attrs={"class": "text-sm", "title": True},
        ).text

        # The logo is optional; fall back to an empty string so the field
        # keeps its declared `str` type.
        logo_elem = container.find(name="img")
        logo_url = (logo_elem.get("src") or "") if logo_elem is not None else ""

        # Anchors without an href carry nothing to download; skip them rather
        # than creating a Resource whose url is None.
        hrefs = (anchor.get("href") for anchor in container.find_all(name="a"))
        resources = [Resource(url=href) for href in hrefs if href]

        return LLMsTxtHubItem(
            title=title,
            website=website,
            description=description,
            logo=logo_url,
            tags=tags,
            resources=resources,
        )
Comment on lines +55 to +68
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Add error handling and type annotations to card_to_model method

The card_to_model method lacks error handling for HTML parsing and is missing a return type annotation. If the HTML structure changes, the method will fail with confusing errors.

Apply this diff to add error handling and type annotation:

@staticmethod
-def card_to_model(card):
-    divs = card.find(name="div")
-    title = divs.find(name="h3").text
-    tags = []
-    for tag in divs.find_all(name="span"):
-        tags.append(tag.text)
-    website = divs.find(name="p", attrs={"class": "text-sm"}).text
-    description = divs.find(name="p", attrs={"class": "text-sm", "title": True}).text
-    logo_url = divs.find(name="img").get("src")
-    resources = []
-    for anchor in divs.find_all(name="a"):
-        resources.append(Resource(url=anchor.get("href")))
-    return LLMsTxtHubItem(title=title, website=website, description=description, logo=logo_url, tags=tags, resources=resources)
+def card_to_model(card) -> LLMsTxtHubItem:
+    try:
+        divs = card.find(name="div")
+        if not divs:
+            raise ValueError("Card structure doesn't contain expected div element")
+            
+        title_elem = divs.find(name="h3")
+        if not title_elem:
+            raise ValueError("Missing title element (h3)")
+        title = title_elem.text
+        
+        tags = []
+        for tag in divs.find_all(name="span"):
+            tags.append(tag.text)
+            
+        website_elem = divs.find(name="p", attrs={"class": "text-sm"})
+        if not website_elem:
+            raise ValueError("Missing website element (p.text-sm)")
+        website = website_elem.text
+        
+        desc_elem = divs.find(name="p", attrs={"class": "text-sm", "title": True})
+        if not desc_elem:
+            raise ValueError("Missing description element (p.text-sm[title])")
+        description = desc_elem.text
+        
+        logo_elem = divs.find(name="img")
+        logo_url = logo_elem.get("src") if logo_elem else None
+        
+        resources = []
+        for anchor in divs.find_all(name="a"):
+            href = anchor.get("href")
+            if href:
+                resources.append(Resource(url=href))
+                
+        return LLMsTxtHubItem(
+            title=title, 
+            website=website, 
+            description=description, 
+            logo=logo_url, 
+            tags=tags, 
+            resources=resources
+        )
+    except Exception as e:
+        logger.error(f"Failed to parse card: {e}")
+        raise ValueError(f"Failed to parse card: {e}") from e
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
@staticmethod
def card_to_model(card):
divs = card.find(name="div")
title = divs.find(name="h3").text
tags = []
for tag in divs.find_all(name="span"):
tags.append(tag.text)
website = divs.find(name="p", attrs={"class": "text-sm"}).text
description = divs.find(name="p", attrs={"class": "text-sm", "title": True}).text
logo_url = divs.find(name="img").get("src")
resources = []
for anchor in divs.find_all(name="a"):
resources.append(Resource(url=anchor.get("href")))
return LLMsTxtHubItem(title=title, website=website, description=description, logo=logo_url, tags=tags, resources=resources)
@staticmethod
def card_to_model(card) -> LLMsTxtHubItem:
try:
divs = card.find(name="div")
if not divs:
raise ValueError("Card structure doesn't contain expected div element")
title_elem = divs.find(name="h3")
if not title_elem:
raise ValueError("Missing title element (h3)")
title = title_elem.text
tags = []
for tag in divs.find_all(name="span"):
tags.append(tag.text)
website_elem = divs.find(name="p", attrs={"class": "text-sm"})
if not website_elem:
raise ValueError("Missing website element (p.text-sm)")
website = website_elem.text
desc_elem = divs.find(name="p", attrs={"class": "text-sm", "title": True})
if not desc_elem:
raise ValueError("Missing description element (p.text-sm[title])")
description = desc_elem.text
logo_elem = divs.find(name="img")
logo_url = logo_elem.get("src") if logo_elem else None
resources = []
for anchor in divs.find_all(name="a"):
href = anchor.get("href")
if href:
resources.append(Resource(url=href))
return LLMsTxtHubItem(
title=title,
website=website,
description=description,
logo=logo_url,
tags=tags,
resources=resources,
)
except Exception as e:
logger.error(f"Failed to parse card: {e}")
raise ValueError(f"Failed to parse card: {e}") from e
🧰 Tools
🪛 Ruff (0.11.9)

68-68: Line too long (131 > 100)

(E501)

🤖 Prompt for AI Agents
In src/cratedb_about/hub/model.py around lines 55 to 68, the card_to_model
method lacks error handling for potential missing HTML elements and is missing a
return type annotation. Add appropriate try-except blocks to catch and handle
exceptions during HTML parsing, returning None or raising a clear error if
required elements are missing. Also, add a return type annotation indicating the
method returns an instance of LLMsTxtHubItem or None to improve code clarity and
robustness.

Loading