diff --git a/python/src/server/services/crawling/crawling_service.py b/python/src/server/services/crawling/crawling_service.py
index cddb331d3c..e85c1fa250 100644
--- a/python/src/server/services/crawling/crawling_service.py
+++ b/python/src/server/services/crawling/crawling_service.py
@@ -526,6 +526,40 @@ async def code_progress_callback(data: dict):
                 f"Unregistered orchestration service on error | progress_id={self.progress_id}"
             )
 
+    def _is_self_link(self, link: str, base_url: str) -> bool:
+        """
+        Check whether a link is self-referential, i.e. points back to the base URL.
+        Ignores query parameters and fragments, strips trailing slashes, and
+        normalizes scheme, host, and default ports for an accurate comparison.
+
+        Args:
+            link: The link to check
+            base_url: The base URL to compare against
+
+        Returns:
+            True if the link is self-referential, False otherwise
+        """
+        try:
+            from urllib.parse import urlparse
+
+            def _core(u: str) -> str:
+                p = urlparse(u)
+                scheme = (p.scheme or "http").lower()
+                host = (p.hostname or "").lower()
+                port = p.port
+                if (scheme == "http" and port in (None, 80)) or (scheme == "https" and port in (None, 443)):
+                    port_part = ""
+                else:
+                    port_part = f":{port}" if port else ""
+                path = p.path.rstrip("/")
+                return f"{scheme}://{host}{port_part}{path}"
+
+            return _core(link) == _core(base_url)
+        except Exception as e:
+            logger.warning(f"Error checking if link is self-referential: {e}", exc_info=True)
+            # Fall back to a simple string comparison
+            return link.rstrip('/') == base_url.rstrip('/')
+
     async def _crawl_by_url_type(self, url: str, request: dict[str, Any]) -> tuple:
         """
         Detect URL type and perform appropriate crawling.
@@ -536,8 +570,8 @@ async def _crawl_by_url_type(self, url: str, request: dict[str, Any]) -> tuple:
         crawl_results = []
         crawl_type = None
 
-        if self.url_handler.is_txt(url):
-            # Handle text files
+        if self.url_handler.is_txt(url) or self.url_handler.is_markdown(url):
+            # Handle text and markdown files
             crawl_type = "llms-txt" if "llms" in url.lower() else "text_file"
             if self.progress_tracker:
                 await self.progress_tracker.update(
@@ -550,9 +584,54 @@ async def _crawl_by_url_type(self, url: str, request: dict[str, Any]) -> tuple:
             crawl_results = await self.crawl_markdown_file(
                 url,
                 progress_callback=await self._create_crawl_progress_callback("crawling"),
-                start_progress=10,
-                end_progress=20,
+                start_progress=5,
+                end_progress=10,
             )
+            # Check if this is a link collection file and extract links
+            if crawl_results and len(crawl_results) > 0:
+                content = crawl_results[0].get('markdown', '')
+                if self.url_handler.is_link_collection_file(url, content):
+                    # Extract links from the content
+                    extracted_links = self.url_handler.extract_markdown_links(content, url)
+
+                    # Filter out self-referential links to avoid redundant crawling
+                    if extracted_links:
+                        original_count = len(extracted_links)
+                        extracted_links = [
+                            link for link in extracted_links
+                            if not self._is_self_link(link, url)
+                        ]
+                        self_filtered_count = original_count - len(extracted_links)
+                        if self_filtered_count > 0:
+                            logger.info(f"Filtered out {self_filtered_count} self-referential links from {original_count} extracted links")
+
+                    # Filter out binary files (PDFs, images, archives, etc.) to avoid wasteful crawling
+                    if extracted_links:
+                        original_count = len(extracted_links)
+                        extracted_links = [link for link in extracted_links if not self.url_handler.is_binary_file(link)]
+                        filtered_count = original_count - len(extracted_links)
+                        if filtered_count > 0:
+                            logger.info(f"Filtered out {filtered_count} binary files from {original_count} extracted links")
+
+                    if extracted_links:
+                        # Crawl the extracted links using batch crawling
+                        logger.info(f"Crawling {len(extracted_links)} extracted links from {url}")
+                        batch_results = await self.crawl_batch_with_progress(
+                            extracted_links,
+                            max_concurrent=request.get('max_concurrent'),  # None -> use DB settings
+                            progress_callback=await self._create_crawl_progress_callback("crawling"),
+                            start_progress=10,
+                            end_progress=20,
+                        )
+
+                        # Combine original text file results with batch results
+                        crawl_results.extend(batch_results)
+                        crawl_type = "link_collection_with_crawled_links"
+
+                        logger.info(f"Link collection crawling completed: {len(crawl_results)} total results (1 text file + {len(batch_results)} extracted links)")
+                    else:
+                        logger.info(f"No valid links found in link collection file: {url}")
+
             logger.info(f"Text file crawling completed: {len(crawl_results)} results")
 
         elif self.url_handler.is_sitemap(url):
             # Handle sitemaps
diff --git a/python/src/server/services/crawling/helpers/url_handler.py b/python/src/server/services/crawling/helpers/url_handler.py
index 19d953529e..33c75c57b7 100644
--- a/python/src/server/services/crawling/helpers/url_handler.py
+++ b/python/src/server/services/crawling/helpers/url_handler.py
@@ -6,7 +6,8 @@
 import hashlib
 import re
-from urllib.parse import urlparse
+from urllib.parse import urlparse, urljoin
+from typing import List, Optional
 
 from ....config.logfire_config import get_logger
@@ -32,6 +33,26 @@ def is_sitemap(url: str) -> bool:
         except Exception as e:
             logger.warning(f"Error checking if URL is sitemap: {e}")
             return False
+
+    @staticmethod
+    def is_markdown(url: str) -> bool:
+        """
+        Check if a URL points to a markdown file (.md, .mdx, .markdown).
+
+        Args:
+            url: URL to check
+
+        Returns:
+            True if URL is a markdown file, False otherwise
+        """
+        try:
+            parsed = urlparse(url)
+            # Normalize to lowercase and ignore query/fragment
+            path = parsed.path.lower()
+            return path.endswith(('.md', '.mdx', '.markdown'))
+        except Exception as e:
+            logger.warning(f"Error checking if URL is markdown file: {e}", exc_info=True)
+            return False
 
     @staticmethod
     def is_txt(url: str) -> bool:
@@ -45,9 +66,11 @@ def is_txt(url: str) -> bool:
             True if URL is a text file, False otherwise
         """
         try:
-            return url.endswith(".txt")
+            parsed = urlparse(url)
+            # Normalize to lowercase and ignore query/fragment
+            return parsed.path.lower().endswith('.txt')
         except Exception as e:
-            logger.warning(f"Error checking if URL is text file: {e}")
+            logger.warning(f"Error checking if URL is text file: {e}", exc_info=True)
             return False
 
     @staticmethod
@@ -240,7 +263,7 @@ def generate_unique_source_id(url: str) -> str:
             return hashlib.sha256(canonical.encode("utf-8")).hexdigest()[:16]
         except Exception as e:
-            # Redact sensitive query params from error logs
+            # Redact sensitive query params before logging
            try:
                 redacted = url.split("?", 1)[0] if "?" in url else url
             except Exception:
@@ -251,6 +274,166 @@ def generate_unique_source_id(url: str) -> str:
             # Fallback: use a hash of the error message + url to still get something unique
             fallback = f"error_{redacted}_{str(e)}"
             return hashlib.sha256(fallback.encode("utf-8")).hexdigest()[:16]
+
+    @staticmethod
+    def extract_markdown_links(content: str, base_url: Optional[str] = None) -> List[str]:
+        """
+        Extract markdown-style links and bare URLs from text content.
+
+        Args:
+            content: Text content to extract links from
+            base_url: Base URL to resolve relative links against
+
+        Returns:
+            List of absolute URLs found in the content
+        """
+        try:
+            if not content:
+                return []
+
+            # Combined URL pattern covering the common link formats:
+            # 1) [text](url) - markdown links
+            # 2) <url> - autolinks
+            # 3) https://... - bare URLs with protocol
+            # 4) //example.com - protocol-relative URLs
+            # 5) www.example.com - scheme-less www URLs
+            combined_pattern = re.compile(
+                r'\[(?P<text>[^\]]*)\]\((?P<md>[^)]+)\)'  # named: md
+                r'|<\s*(?P<auto>https?://[^>\s]+)\s*>'  # named: auto
+                r'|(?P<bare>https?://[^\s<>()\[\]"]+)'  # named: bare
+                r'|(?P<proto>//[^\s<>()\[\]"]+)'  # named: protocol-relative
+                r'|(?P<www>www\.[^\s<>()\[\]"]+)'  # named: www.* without scheme
+            )
+
+            def _clean_url(u: str) -> str:
+                # Trim whitespace and common trailing punctuation
+                # Also remove invisible Unicode characters that can break URLs
+                import unicodedata
+                cleaned = u.strip().rstrip('.,;:)]>')
+                # Remove invisible/control characters but keep valid URL characters
+                cleaned = ''.join(c for c in cleaned if unicodedata.category(c) not in ('Cf', 'Cc'))
+                return cleaned
+
+            urls = []
+            for match in re.finditer(combined_pattern, content):
+                url = (
+                    match.group('md')
+                    or match.group('auto')
+                    or match.group('bare')
+                    or match.group('proto')
+                    or match.group('www')
+                )
+                if not url:
+                    continue
+                url = _clean_url(url)
+
+                # Skip empty URLs, anchors, and mailto links
+                if not url or url.startswith('#') or url.startswith('mailto:'):
+                    continue
+
+                # Normalize protocol-relative and scheme-less www URLs to https://
+                if url.startswith('//'):
+                    url = f'https:{url}'
+                elif url.startswith('www.'):
+                    url = f'https://{url}'
+
+                # Convert relative URLs to absolute if base_url provided
+                if base_url and not url.startswith(('http://', 'https://')):
+                    try:
+                        url = urljoin(base_url, url)
+                    except Exception as e:
+                        logger.warning(f"Failed to resolve relative URL {url} with base {base_url}: {e}")
+                        continue
+
+                # Only include HTTP/HTTPS URLs
+                if url.startswith(('http://', 'https://')):
+                    urls.append(url)
+
+            # Remove duplicates while preserving order
+            seen = set()
+            unique_urls = []
+            for url in urls:
+                if url not in seen:
+                    seen.add(url)
+                    unique_urls.append(url)
+
+            logger.info(f"Extracted {len(unique_urls)} unique links from content")
+            return unique_urls
+
+        except Exception as e:
+            logger.error(f"Error extracting markdown links: {e}", exc_info=True)
+            return []
+
+    @staticmethod
+    def is_link_collection_file(url: str, content: Optional[str] = None) -> bool:
+        """
+        Check if a URL/file appears to be a link collection file like llms.txt.
+
+        Args:
+            url: URL to check
+            content: Optional content to analyze for link density
+
+        Returns:
+            True if file appears to be a link collection, False otherwise
+        """
+        try:
+            # Extract filename from URL
+            parsed = urlparse(url)
+            filename = parsed.path.split('/')[-1].lower()
+
+            # Check for specific link collection filenames
+            # Note: "full-*" or "*-full" patterns are NOT link collections - they contain complete content, not just links
+            link_collection_patterns = [
+                # .txt variants - files that typically contain lists of links
+                'llms.txt', 'links.txt', 'resources.txt', 'references.txt',
+                # .md/.mdx/.markdown variants
+                'llms.md', 'links.md', 'resources.md', 'references.md',
+                'llms.mdx', 'links.mdx', 'resources.mdx', 'references.mdx',
+                'llms.markdown', 'links.markdown', 'resources.markdown', 'references.markdown',
+            ]
+
+            # Direct filename match
+            if filename in link_collection_patterns:
+                logger.info(f"Detected link collection file by filename: {filename}")
+                return True
+
+            # Pattern-based detection for variations, but exclude "full" variants
+            # Only match files that are likely link collections, not complete content files
+            if filename.endswith(('.txt', '.md', '.mdx', '.markdown')):
+                # Exclude files with "full" in the name - these typically contain complete content, not just links
+                if 'full' not in filename:
+                    # Match files that start with common link collection prefixes
+                    base_patterns = ['llms', 'links', 'resources', 'references']
+                    if any(filename.startswith(pattern + '.') or filename.startswith(pattern + '-') for pattern in base_patterns):
+                        logger.info(f"Detected potential link collection file: {filename}")
+                        return True
+
+            # Content-based detection if content is provided
+            if content:
+                # Never treat "full" variants as link collections to preserve single-page behavior
+                if 'full' in filename:
+                    logger.info(f"Skipping content-based link-collection detection for full-content file: {filename}")
+                    return False
+                # Reuse the extractor so detection and extraction share one regex
+                extracted_links = URLHandler.extract_markdown_links(content, url)
+                total_links = len(extracted_links)
+
+                # Calculate link density (links per 100 characters)
+                content_length = len(content.strip())
+                if content_length > 0:
+                    link_density = (total_links * 100) / content_length
+
+                    # More than 2 links per 100 characters (and more than 3 links total) suggests a link collection
+                    if link_density > 2.0 and total_links > 3:
+                        logger.info(f"Detected link collection by content analysis: {total_links} links, density {link_density:.2f}%")
+                        return True
+
+            return False
+
+        except Exception as e:
+            logger.warning(f"Error checking if file is link collection: {e}", exc_info=True)
+            return False
+
     @staticmethod
     def extract_display_name(url: str) -> str:
diff --git a/python/src/server/services/embeddings/contextual_embedding_service.py b/python/src/server/services/embeddings/contextual_embedding_service.py
index 7469d5adde..e72d81a512 100644
--- a/python/src/server/services/embeddings/contextual_embedding_service.py
+++ b/python/src/server/services/embeddings/contextual_embedding_service.py
@@ -219,4 +219,4 @@ async def generate_contextual_embeddings_batch(
     except Exception as e:
         search_logger.error(f"Error in contextual embedding batch: {e}")
         # Return non-contextual for all chunks
-        return [(chunk, False) for chunk in chunks]
+        return [(chunk, False) for chunk in chunks]
\ No newline at end of file
diff --git a/python/src/server/services/storage/code_storage_service.py b/python/src/server/services/storage/code_storage_service.py
index cacc7d7d12..e987939e1b 100644
--- a/python/src/server/services/storage/code_storage_service.py
+++ b/python/src/server/services/storage/code_storage_service.py
@@ -955,6 +955,10 @@ async def add_code_examples_to_supabase(
                     "status": "code_storage",
                     "percentage": progress_percentage,
                     "log": f"Stored batch {batch_num}/{total_batches} of code examples",
+                    # Stage-specific batch fields to prevent contamination with document storage
+                    "code_current_batch": batch_num,
+                    "code_total_batches": total_batches,
+                    # Keep generic fields for backward compatibility
                     "batch_number": batch_num,
                     "total_batches": total_batches,
                 })
@@ -966,4 +970,7 @@
             "percentage": 100,
             "log": f"Code storage completed. Stored {total_items} code examples.",
             "total_items": total_items,
+            # Keep final batch info for code storage completion
+            "code_total_batches": (total_items + batch_size - 1) // batch_size,
+            "code_current_batch": (total_items + batch_size - 1) // batch_size,
         })
diff --git a/python/src/server/services/storage/document_storage_service.py b/python/src/server/services/storage/document_storage_service.py
index 4fc07a1879..392394e896 100644
--- a/python/src/server/services/storage/document_storage_service.py
+++ b/python/src/server/services/storage/document_storage_service.py
@@ -349,6 +349,11 @@ async def embedding_progress_wrapper(message: str, percentage: float):
 
                 # Simple batch completion info
                 batch_info = {
+                    # Stage-specific batch fields to prevent contamination with code examples
+                    "document_completed_batches": completed_batches,
+                    "document_total_batches": total_batches,
+                    "document_current_batch": batch_num,
+                    # Keep generic fields for backward compatibility
                     "completed_batches": completed_batches,
                     "total_batches": total_batches,
                     "current_batch": batch_num,
@@ -401,12 +406,12 @@ async def embedding_progress_wrapper(message: str, percentage: float):
                     "document_storage",
                     100,  # Ensure we report 100%
                     f"Document storage completed: {len(contents)} chunks stored in {total_batches} batches",
-                completed_batches=total_batches,
-                total_batches=total_batches,
-                current_batch=total_batches,
-                chunks_processed=len(contents),
-                # DON'T send 'status': 'completed' - that's for the orchestration service only!
-            )
+                    completed_batches=total_batches,
+                    total_batches=total_batches,
+                    current_batch=total_batches,
+                    chunks_processed=len(contents),
+                    # DON'T send 'status': 'completed' - that's for the orchestration service only!
+                )
         except Exception as e:
             search_logger.warning(f"Progress callback failed during completion: {e}. Storage still successful.")