Merged

Changes from all commits (21 commits)
8478cd9
Merge pull request #1 from coleam00/main
Chillbruhhh Mar 8, 2025
3d5cda6
Merge branch 'coleam00:main' into main
Chillbruhhh Apr 3, 2025
0641c53
Merge branch 'coleam00:main' into main
Chillbruhhh Aug 14, 2025
e857719
Merge branch 'coleam00:main' into main
Chillbruhhh Aug 22, 2025
ddbe3c1
fixed the llms.txt/fulls-llm.txt/llms.md etc. to be crawled finally.…
Aug 22, 2025
a5b18ad
updated CodeRabbit's suggestion - resolved
Aug 22, 2025
e5024fe
refined per CodeRabbit's suggestions, take 2; should be the final take. did…
Aug 22, 2025
50c4b09
3rd time's the charm: added nit-picky thing from CodeRabbit. code rab…
Aug 22, 2025
907acf1
Merge branch 'coleam00:main' into main
Chillbruhhh Aug 22, 2025
ebd1ca5
Merge branch 'coleam00:main' into fix/(llms.txt)-not-crawling-links-i…
Chillbruhhh Aug 23, 2025
ad7f02f
Fixed progress bar accuracy and OpenAI API compatibility issues
Aug 23, 2025
8265c8b
removed gpt-5-handlings since that's a separate issue and doesn't pertai…
Aug 24, 2025
6f01340
Merge branch 'coleam00:main' into main
Chillbruhhh Aug 26, 2025
c86afb1
Merge branch 'coleam00:main' into main
Chillbruhhh Aug 31, 2025
1c40d28
fixed the llms-full.txt crawling issue. now crawls just that page whe…
Aug 31, 2025
f8303bd
Merge remote-tracking branch 'origin/main' into fix/(llms.txt)-not-cr…
Aug 31, 2025
a43b1df
fixed a few things so it will work with the current branch!
Aug 31, 2025
5df242e
added some enhancements to UI rendering as well and other little misc…
Aug 31, 2025
4a4dfe1
Merge branch 'coleam00:main' into main
Chillbruhhh Sep 2, 2025
7856407
Merge branch 'coleam00:main' into main
Chillbruhhh Sep 2, 2025
1b4d88f
updated for the new progress UI polling system, and rebased to main, …
claude Sep 3, 2025
87 changes: 83 additions & 4 deletions python/src/server/services/crawling/crawling_service.py
@@ -526,6 +526,40 @@ async def code_progress_callback(data: dict):
                f"Unregistered orchestration service on error | progress_id={self.progress_id}"
            )

    def _is_self_link(self, link: str, base_url: str) -> bool:
        """
        Check if a link is a self-referential link to the base URL.

        Handles query parameters, fragments, trailing slashes, and normalizes
        scheme/host/ports for accurate comparison.

        Args:
            link: The link to check
            base_url: The base URL to compare against

        Returns:
            True if the link is self-referential, False otherwise
        """
        try:
            from urllib.parse import urlparse

            def _core(u: str) -> str:
                p = urlparse(u)
                scheme = (p.scheme or "http").lower()
                host = (p.hostname or "").lower()
                port = p.port
                if (scheme == "http" and port in (None, 80)) or (scheme == "https" and port in (None, 443)):
                    port_part = ""
                else:
                    port_part = f":{port}" if port else ""
                path = p.path.rstrip("/")
                return f"{scheme}://{host}{port_part}{path}"

            return _core(link) == _core(base_url)
        except Exception as e:
            logger.warning(f"Error checking if link is self-referential: {e}", exc_info=True)
            # Fallback to simple string comparison
            return link.rstrip('/') == base_url.rstrip('/')

    async def _crawl_by_url_type(self, url: str, request: dict[str, Any]) -> tuple:
        """
        Detect URL type and perform appropriate crawling.
@@ -536,8 +570,8 @@ async def _crawl_by_url_type(self, url: str, request: dict[str, Any]) -> tuple:
        crawl_results = []
        crawl_type = None

-        if self.url_handler.is_txt(url):
+        if self.url_handler.is_txt(url) or self.url_handler.is_markdown(url):
            # Handle text files
            crawl_type = "llms-txt" if "llms" in url.lower() else "text_file"
            if self.progress_tracker:
                await self.progress_tracker.update(
@@ -550,9 +584,54 @@ async def _crawl_by_url_type(self, url: str, request: dict[str, Any]) -> tuple:
            crawl_results = await self.crawl_markdown_file(
                url,
                progress_callback=await self._create_crawl_progress_callback("crawling"),
-                start_progress=10,
-                end_progress=20,
+                start_progress=5,
+                end_progress=10,
            )
            # Check if this is a link collection file and extract links
            if crawl_results and len(crawl_results) > 0:
                content = crawl_results[0].get('markdown', '')
                if self.url_handler.is_link_collection_file(url, content):
                    # Extract links from the content
                    extracted_links = self.url_handler.extract_markdown_links(content, url)

                    # Filter out self-referential links to avoid redundant crawling
                    if extracted_links:
                        original_count = len(extracted_links)
                        extracted_links = [
                            link for link in extracted_links
                            if not self._is_self_link(link, url)
                        ]
                        self_filtered_count = original_count - len(extracted_links)
                        if self_filtered_count > 0:
                            logger.info(f"Filtered out {self_filtered_count} self-referential links from {original_count} extracted links")

                    # Filter out binary files (PDFs, images, archives, etc.) to avoid wasteful crawling
                    if extracted_links:
                        original_count = len(extracted_links)
                        extracted_links = [link for link in extracted_links if not self.url_handler.is_binary_file(link)]
                        filtered_count = original_count - len(extracted_links)
                        if filtered_count > 0:
                            logger.info(f"Filtered out {filtered_count} binary files from {original_count} extracted links")

                    if extracted_links:
                        # Crawl the extracted links using batch crawling
                        logger.info(f"Crawling {len(extracted_links)} extracted links from {url}")
                        batch_results = await self.crawl_batch_with_progress(
                            extracted_links,
                            max_concurrent=request.get('max_concurrent'),  # None -> use DB settings
                            progress_callback=await self._create_crawl_progress_callback("crawling"),
                            start_progress=10,
                            end_progress=20,
                        )

                        # Combine original text file results with batch results
                        crawl_results.extend(batch_results)
                        crawl_type = "link_collection_with_crawled_links"

                        logger.info(f"Link collection crawling completed: {len(crawl_results)} total results (1 text file + {len(batch_results)} extracted links)")
                    else:
                        logger.info(f"No valid links found in link collection file: {url}")
            logger.info(f"Text file crawling completed: {len(crawl_results)} results")

        elif self.url_handler.is_sitemap(url):
            # Handle sitemaps
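
A minimal standalone sketch of the normalization that _is_self_link applies before comparing URLs (the _core helper restated outside the class; the sample URLs are illustrative, not taken from the PR):

from urllib.parse import urlparse

def core(u: str) -> str:
    """Reduce a URL to scheme://host[:port]/path, dropping default ports,
    query strings, fragments, and trailing slashes."""
    p = urlparse(u)
    scheme = (p.scheme or "http").lower()
    host = (p.hostname or "").lower()
    port = p.port
    if (scheme == "http" and port in (None, 80)) or (scheme == "https" and port in (None, 443)):
        port_part = ""
    else:
        port_part = f":{port}" if port else ""
    path = p.path.rstrip("/")
    return f"{scheme}://{host}{port_part}{path}"

# Default ports, query strings, fragments, and trailing slashes all collapse:
assert core("https://Example.com:443/llms.txt?ref=1#top") == "https://example.com/llms.txt"
assert core("https://example.com/llms.txt/") == core("https://example.com/llms.txt")
# Scheme still participates in the comparison, so an http link to an https base is not a self-link:
assert core("http://example.com/llms.txt") != core("https://example.com/llms.txt")
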
191 changes: 187 additions & 4 deletions python/src/server/services/crawling/helpers/url_handler.py
@@ -6,7 +6,8 @@

import hashlib
import re
-from urllib.parse import urlparse
+from urllib.parse import urlparse, urljoin
+from typing import List, Optional

from ....config.logfire_config import get_logger

@@ -32,6 +33,26 @@ def is_sitemap(url: str) -> bool:
        except Exception as e:
            logger.warning(f"Error checking if URL is sitemap: {e}")
            return False

    @staticmethod
    def is_markdown(url: str) -> bool:
        """
        Check if a URL points to a markdown file (.md, .mdx, .markdown).

        Args:
            url: URL to check

        Returns:
            True if URL is a markdown file, False otherwise
        """
        try:
            parsed = urlparse(url)
            # Normalize to lowercase and ignore query/fragment
            path = parsed.path.lower()
            return path.endswith(('.md', '.mdx', '.markdown'))
        except Exception as e:
            logger.warning(f"Error checking if URL is markdown file: {e}", exc_info=True)
            return False

    @staticmethod
    def is_txt(url: str) -> bool:
@@ -45,9 +66,11 @@ def is_txt(url: str) -> bool:
            True if URL is a text file, False otherwise
        """
        try:
-            return url.endswith(".txt")
+            parsed = urlparse(url)
+            # Normalize to lowercase and ignore query/fragment
+            return parsed.path.lower().endswith('.txt')
        except Exception as e:
-            logger.warning(f"Error checking if URL is text file: {e}")
+            logger.warning(f"Error checking if URL is text file: {e}", exc_info=True)
            return False

    @staticmethod
@@ -240,7 +263,7 @@ def generate_unique_source_id(url: str) -> str:
            return hashlib.sha256(canonical.encode("utf-8")).hexdigest()[:16]

        except Exception as e:
-            # Redact sensitive query params from error logs
+            # Redacted sensitive query params from error logs
            try:
                redacted = url.split("?", 1)[0] if "?" in url else url
            except Exception:
@@ -251,6 +274,166 @@ def generate_unique_source_id(url: str) -> str:
            # Fallback: use a hash of the error message + url to still get something unique
            fallback = f"error_{redacted}_{str(e)}"
            return hashlib.sha256(fallback.encode("utf-8")).hexdigest()[:16]

    @staticmethod
    def extract_markdown_links(content: str, base_url: Optional[str] = None) -> List[str]:
        """
        Extract markdown-style links from text content.

        Args:
            content: Text content to extract links from
            base_url: Base URL to resolve relative links against

        Returns:
            List of absolute URLs found in the content
        """
        try:
            if not content:
                return []

            # Ultimate URL pattern with comprehensive format support:
            # 1) [text](url) - markdown links
            # 2) <https://...> - autolinks
            # 3) https://... - bare URLs with protocol
            # 4) //example.com - protocol-relative URLs
            # 5) www.example.com - scheme-less www URLs
            combined_pattern = re.compile(
                r'\[(?P<text>[^\]]*)\]\((?P<md>[^)]+)\)'  # named: md
                r'|<\s*(?P<auto>https?://[^>\s]+)\s*>'  # named: auto
                r'|(?P<bare>https?://[^\s<>()\[\]"]+)'  # named: bare
                r'|(?P<proto>//[^\s<>()\[\]"]+)'  # named: protocol-relative
                r'|(?P<www>www\.[^\s<>()\[\]"]+)'  # named: www.* without scheme
            )

            def _clean_url(u: str) -> str:
                # Trim whitespace and comprehensive trailing punctuation
                # Also remove invisible Unicode characters that can break URLs
                import unicodedata
                cleaned = u.strip().rstrip('.,;:)]>')
                # Remove invisible/control characters but keep valid URL characters
                cleaned = ''.join(c for c in cleaned if unicodedata.category(c) not in ('Cf', 'Cc'))
                return cleaned

            urls = []
            for match in re.finditer(combined_pattern, content):
                url = (
                    match.group('md')
                    or match.group('auto')
                    or match.group('bare')
                    or match.group('proto')
                    or match.group('www')
                )
                if not url:
                    continue
                url = _clean_url(url)

                # Skip empty URLs, anchors, and mailto links
                if not url or url.startswith('#') or url.startswith('mailto:'):
                    continue

                # Normalize all URL formats to https://
                if url.startswith('//'):
                    url = f'https:{url}'
                elif url.startswith('www.'):
                    url = f'https://{url}'

                # Convert relative URLs to absolute if base_url provided
                if base_url and not url.startswith(('http://', 'https://')):
                    try:
                        url = urljoin(base_url, url)
                    except Exception as e:
                        logger.warning(f"Failed to resolve relative URL {url} with base {base_url}: {e}")
                        continue

                # Only include HTTP/HTTPS URLs
                if url.startswith(('http://', 'https://')):
                    urls.append(url)

            # Remove duplicates while preserving order
            seen = set()
            unique_urls = []
            for url in urls:
                if url not in seen:
                    seen.add(url)
                    unique_urls.append(url)

            logger.info(f"Extracted {len(unique_urls)} unique links from content")
            return unique_urls

        except Exception as e:
            logger.error(f"Error extracting markdown links: {e}", exc_info=True)
            return []

    @staticmethod
    def is_link_collection_file(url: str, content: Optional[str] = None) -> bool:
        """
        Check if a URL/file appears to be a link collection file like llms.txt.

        Args:
            url: URL to check
            content: Optional content to analyze for link density

        Returns:
            True if file appears to be a link collection, False otherwise
        """
        try:
            # Extract filename from URL
            parsed = urlparse(url)
            filename = parsed.path.split('/')[-1].lower()

            # Check for specific link collection filenames
            # Note: "full-*" or "*-full" patterns are NOT link collections - they contain complete content, not just links
            link_collection_patterns = [
                # .txt variants - files that typically contain lists of links
                'llms.txt', 'links.txt', 'resources.txt', 'references.txt',
                # .md/.mdx/.markdown variants
                'llms.md', 'links.md', 'resources.md', 'references.md',
                'llms.mdx', 'links.mdx', 'resources.mdx', 'references.mdx',
                'llms.markdown', 'links.markdown', 'resources.markdown', 'references.markdown',
            ]

            # Direct filename match
            if filename in link_collection_patterns:
                logger.info(f"Detected link collection file by filename: {filename}")
                return True

            # Pattern-based detection for variations, but exclude "full" variants
            # Only match files that are likely link collections, not complete content files
            if filename.endswith(('.txt', '.md', '.mdx', '.markdown')):
                # Exclude files with "full" in the name - these typically contain complete content, not just links
                if 'full' not in filename:
                    # Match files that start with common link collection prefixes
                    base_patterns = ['llms', 'links', 'resources', 'references']
                    if any(filename.startswith(pattern + '.') or filename.startswith(pattern + '-') for pattern in base_patterns):
                        logger.info(f"Detected potential link collection file: {filename}")
                        return True

            # Content-based detection if content is provided
            if content:
                # Never treat "full" variants as link collections to preserve single-page behavior
                if 'full' in filename:
                    logger.info(f"Skipping content-based link-collection detection for full-content file: {filename}")
                    return False
                # Reuse extractor to avoid regex divergence and maintain consistency
                extracted_links = URLHandler.extract_markdown_links(content, url)
                total_links = len(extracted_links)

                # Calculate link density (links per 100 characters)
                content_length = len(content.strip())
                if content_length > 0:
                    link_density = (total_links * 100) / content_length

                    # If more than 2% of content is links, likely a link collection
                    if link_density > 2.0 and total_links > 3:
                        logger.info(f"Detected link collection by content analysis: {total_links} links, density {link_density:.2f}%")
                        return True

            return False

        except Exception as e:
            logger.warning(f"Error checking if file is link collection: {e}", exc_info=True)
            return False

    @staticmethod
    def extract_display_name(url: str) -> str:
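
A small usage sketch of the new URLHandler helpers; the import path and sample content below are assumptions for illustration, not taken from the PR:

from src.server.services.crawling.helpers.url_handler import URLHandler

sample = """
# Example project
- [Docs](https://example.com/docs)
- [API reference](/api)
- <https://example.com/changelog>
- www.example.com/blog
"""
base = "https://example.com/llms.txt"

# Relative links resolve against base_url; protocol-relative and www.* links
# are normalized to https://; duplicates are dropped while preserving order.
links = URLHandler.extract_markdown_links(sample, base)
# -> ['https://example.com/docs', 'https://example.com/api',
#     'https://example.com/changelog', 'https://www.example.com/blog']

# Query strings and fragments no longer defeat the extension checks:
assert URLHandler.is_txt("https://example.com/llms.txt?v=2") is True
assert URLHandler.is_markdown("https://example.com/notes.MD#intro") is True

# llms.txt matches by filename alone, so no content analysis is needed:
assert URLHandler.is_link_collection_file(base, sample) is True
# "full" variants are deliberately excluded - they hold complete content, not links:
assert URLHandler.is_link_collection_file("https://example.com/llms-full.txt") is False
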
@@ -219,4 +219,4 @@ async def generate_contextual_embeddings_batch(
        except Exception as e:
            search_logger.error(f"Error in contextual embedding batch: {e}")
            # Return non-contextual for all chunks
-            return [(chunk, False) for chunk in chunks]
+            return [(chunk, False) for chunk in chunks]
7 changes: 7 additions & 0 deletions python/src/server/services/storage/code_storage_service.py
@@ -955,6 +955,10 @@ async def add_code_examples_to_supabase(
                    "status": "code_storage",
                    "percentage": progress_percentage,
                    "log": f"Stored batch {batch_num}/{total_batches} of code examples",
                    # Stage-specific batch fields to prevent contamination with document storage
                    "code_current_batch": batch_num,
                    "code_total_batches": total_batches,
                    # Keep generic fields for backward compatibility
                    "batch_number": batch_num,
                    "total_batches": total_batches,
                })
@@ -966,4 +970,7 @@ async def add_code_examples_to_supabase(
                    "percentage": 100,
                    "log": f"Code storage completed. Stored {total_items} code examples.",
                    "total_items": total_items,
                    # Keep final batch info for code storage completion
                    "code_total_batches": (total_items + batch_size - 1) // batch_size,
                    "code_current_batch": (total_items + batch_size - 1) // batch_size,
                })
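
The motivation for the namespaced keys: document storage and code storage report into the same progress state, so when both stages wrote the generic batch_number/total_batches keys, the later writer clobbered the earlier one in the UI. A minimal sketch of the failure and a stage-aware read; the progress dict and reader are invented for illustration, not the actual polling code:

progress: dict = {}

# Both stages report through the same shared state:
progress.update({"document_current_batch": 7, "document_total_batches": 20,
                 "batch_number": 7, "total_batches": 20})   # document storage
progress.update({"code_current_batch": 2, "code_total_batches": 5,
                 "batch_number": 2, "total_batches": 5})    # code storage

# The generic keys reflect only the last writer - the document counts are gone:
assert progress["total_batches"] == 5

# A stage-aware reader prefers namespaced keys and falls back to the generic ones:
def batches_for(stage: str):
    current = progress.get(f"{stage}_current_batch", progress.get("batch_number"))
    total = progress.get(f"{stage}_total_batches", progress.get("total_batches"))
    return current, total

assert batches_for("document") == (7, 20)  # survives the code-stage update
assert batches_for("code") == (2, 5)
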
17 changes: 11 additions & 6 deletions python/src/server/services/storage/document_storage_service.py
@@ -349,6 +349,11 @@ async def embedding_progress_wrapper(message: str, percentage: float):

                # Simple batch completion info
                batch_info = {
                    # Stage-specific batch fields to prevent contamination with code examples
                    "document_completed_batches": completed_batches,
                    "document_total_batches": total_batches,
                    "document_current_batch": batch_num,
                    # Keep generic fields for backward compatibility
                    "completed_batches": completed_batches,
                    "total_batches": total_batches,
                    "current_batch": batch_num,
@@ -401,12 +406,12 @@ async def embedding_progress_wrapper(message: str, percentage: float):
                    "document_storage",
                    100,  # Ensure we report 100%
                    f"Document storage completed: {len(contents)} chunks stored in {total_batches} batches",
-                        completed_batches=total_batches,
-                        total_batches=total_batches,
-                        current_batch=total_batches,
-                        chunks_processed=len(contents),
-                        # DON'T send 'status': 'completed' - that's for the orchestration service only!
-                    )
+                    completed_batches=total_batches,
+                    total_batches=total_batches,
+                    current_batch=total_batches,
+                    chunks_processed=len(contents),
+                    # DON'T send 'status': 'completed' - that's for the orchestration service only!
+                )
            except Exception as e:
                search_logger.warning(f"Progress callback failed during completion: {e}. Storage still successful.")