Merged

Changes from all commits (21 commits)
8478cd9
Merge pull request #1 from coleam00/main
Chillbruhhh Mar 8, 2025
3d5cda6
Merge branch 'coleam00:main' into main
Chillbruhhh Apr 3, 2025
0641c53
Merge branch 'coleam00:main' into main
Chillbruhhh Aug 14, 2025
e857719
Merge branch 'coleam00:main' into main
Chillbruhhh Aug 22, 2025
ddbe3c1
fixed the llms.txt/fulls-llm.txt/llms.md etc. to be crawled finally.…
Aug 22, 2025
a5b18ad
updated CodeRabbit's suggestion - resolved
Aug 22, 2025
e5024fe
refined per CodeRabbit's suggestions, take 2; should be the final take. did…
Aug 22, 2025
50c4b09
3rd time's the charm: added nit-picky thing from CodeRabbit. code rab…
Aug 22, 2025
907acf1
Merge branch 'coleam00:main' into main
Chillbruhhh Aug 22, 2025
ebd1ca5
Merge branch 'coleam00:main' into fix/(llms.txt)-not-crawling-links-i…
Chillbruhhh Aug 23, 2025
ad7f02f
Fixed progress bar accuracy and OpenAI API compatibility issues
Aug 23, 2025
8265c8b
removed gpt-5-handlings since that's a separate issue and doesn't pertai…
Aug 24, 2025
6f01340
Merge branch 'coleam00:main' into main
Chillbruhhh Aug 26, 2025
c86afb1
Merge branch 'coleam00:main' into main
Chillbruhhh Aug 31, 2025
1c40d28
fixed the llms-full.txt crawling issue. now crawls just that page whe…
Aug 31, 2025
f8303bd
Merge remote-tracking branch 'origin/main' into fix/(llms.txt)-not-cr…
Aug 31, 2025
a43b1df
fixed a few things so it will work with the current branch!
Aug 31, 2025
5df242e
added some enhancements to UI rendering as well and other little misc…
Aug 31, 2025
4a4dfe1
Merge branch 'coleam00:main' into main
Chillbruhhh Sep 2, 2025
7856407
Merge branch 'coleam00:main' into main
Chillbruhhh Sep 2, 2025
1b4d88f
updated for the new progress UI polling system, and rebased to main, …
claude Sep 3, 2025
87 changes: 83 additions & 4 deletions python/src/server/services/crawling/crawling_service.py
@@ -526,6 +526,40 @@ async def code_progress_callback(data: dict):
                f"Unregistered orchestration service on error | progress_id={self.progress_id}"
            )

    def _is_self_link(self, link: str, base_url: str) -> bool:
        """
        Check if a link is a self-referential link to the base URL.

        Handles query parameters, fragments, trailing slashes, and normalizes
        scheme/host/ports for accurate comparison.

        Args:
            link: The link to check
            base_url: The base URL to compare against

        Returns:
            True if the link is self-referential, False otherwise
        """
        try:
            from urllib.parse import urlparse

            def _core(u: str) -> str:
                p = urlparse(u)
                scheme = (p.scheme or "http").lower()
                host = (p.hostname or "").lower()
                port = p.port
                if (scheme == "http" and port in (None, 80)) or (scheme == "https" and port in (None, 443)):
                    port_part = ""
                else:
                    port_part = f":{port}" if port else ""
                path = p.path.rstrip("/")
                return f"{scheme}://{host}{port_part}{path}"

            return _core(link) == _core(base_url)
        except Exception as e:
            logger.warning(f"Error checking if link is self-referential: {e}", exc_info=True)
            # Fallback to simple string comparison
            return link.rstrip('/') == base_url.rstrip('/')

    async def _crawl_by_url_type(self, url: str, request: dict[str, Any]) -> tuple:
        """
        Detect URL type and perform appropriate crawling.
@@ -536,8 +570,8 @@ async def _crawl_by_url_type(self, url: str, request: dict[str, Any]) -> tuple:
        crawl_results = []
        crawl_type = None

-        if self.url_handler.is_txt(url):
+        if self.url_handler.is_txt(url) or self.url_handler.is_markdown(url):
            # Handle text files
            crawl_type = "llms-txt" if "llms" in url.lower() else "text_file"
            if self.progress_tracker:
                await self.progress_tracker.update(
@@ -550,9 +584,54 @@ async def _crawl_by_url_type(self, url: str, request: dict[str, Any]) -> tuple:
            crawl_results = await self.crawl_markdown_file(
                url,
                progress_callback=await self._create_crawl_progress_callback("crawling"),
-                start_progress=10,
-                end_progress=20,
+                start_progress=5,
+                end_progress=10,
            )
            # Check if this is a link collection file and extract links
            if crawl_results and len(crawl_results) > 0:
                content = crawl_results[0].get('markdown', '')
                if self.url_handler.is_link_collection_file(url, content):
                    # Extract links from the content
                    extracted_links = self.url_handler.extract_markdown_links(content, url)

                    # Filter out self-referential links to avoid redundant crawling
                    if extracted_links:
                        original_count = len(extracted_links)
                        extracted_links = [
                            link for link in extracted_links
                            if not self._is_self_link(link, url)
                        ]
                        self_filtered_count = original_count - len(extracted_links)
                        if self_filtered_count > 0:
                            logger.info(f"Filtered out {self_filtered_count} self-referential links from {original_count} extracted links")

                    # Filter out binary files (PDFs, images, archives, etc.) to avoid wasteful crawling
                    if extracted_links:
                        original_count = len(extracted_links)
                        extracted_links = [link for link in extracted_links if not self.url_handler.is_binary_file(link)]
                        filtered_count = original_count - len(extracted_links)
                        if filtered_count > 0:
                            logger.info(f"Filtered out {filtered_count} binary files from {original_count} extracted links")

                    if extracted_links:
                        # Crawl the extracted links using batch crawling
                        logger.info(f"Crawling {len(extracted_links)} extracted links from {url}")
                        batch_results = await self.crawl_batch_with_progress(
                            extracted_links,
                            max_concurrent=request.get('max_concurrent'),  # None -> use DB settings
                            progress_callback=await self._create_crawl_progress_callback("crawling"),
                            start_progress=10,
                            end_progress=20,
                        )

                        # Combine original text file results with batch results
                        crawl_results.extend(batch_results)
                        crawl_type = "link_collection_with_crawled_links"

                        logger.info(f"Link collection crawling completed: {len(crawl_results)} total results (1 text file + {len(batch_results)} extracted links)")
                    else:
                        logger.info(f"No valid links found in link collection file: {url}")
            logger.info(f"Text file crawling completed: {len(crawl_results)} results")

        elif self.url_handler.is_sitemap(url):
            # Handle sitemaps
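
A minimal standalone sketch of the normalization that _is_self_link applies before comparing URLs (the _core helper restated outside the class; the sample URLs are illustrative, not taken from the PR):

from urllib.parse import urlparse

def core(u: str) -> str:
    """Reduce a URL to scheme://host[:port]/path, dropping default ports,
    query strings, fragments, and trailing slashes."""
    p = urlparse(u)
    scheme = (p.scheme or "http").lower()
    host = (p.hostname or "").lower()
    port = p.port
    if (scheme == "http" and port in (None, 80)) or (scheme == "https" and port in (None, 443)):
        port_part = ""
    else:
        port_part = f":{port}" if port else ""
    path = p.path.rstrip("/")
    return f"{scheme}://{host}{port_part}{path}"

# Default ports, query strings, fragments, and trailing slashes all collapse:
assert core("https://Example.com:443/llms.txt?ref=1#top") == "https://example.com/llms.txt"
assert core("https://example.com/llms.txt/") == core("https://example.com/llms.txt")
# Scheme still participates in the comparison, so an http link to an https base is not a self-link:
assert core("http://example.com/llms.txt") != core("https://example.com/llms.txt")
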
191 changes: 187 additions & 4 deletions python/src/server/services/crawling/helpers/url_handler.py
@@ -6,7 +6,8 @@

import hashlib
import re
-from urllib.parse import urlparse
+from urllib.parse import urlparse, urljoin
+from typing import List, Optional

from ....config.logfire_config import get_logger

@@ -32,6 +33,26 @@ def is_sitemap(url: str) -> bool:
        except Exception as e:
            logger.warning(f"Error checking if URL is sitemap: {e}")
            return False

    @staticmethod
    def is_markdown(url: str) -> bool:
        """
        Check if a URL points to a markdown file (.md, .mdx, .markdown).

        Args:
            url: URL to check

        Returns:
            True if URL is a markdown file, False otherwise
        """
        try:
            parsed = urlparse(url)
            # Normalize to lowercase and ignore query/fragment
            path = parsed.path.lower()
            return path.endswith(('.md', '.mdx', '.markdown'))
        except Exception as e:
            logger.warning(f"Error checking if URL is markdown file: {e}", exc_info=True)
            return False

    @staticmethod
    def is_txt(url: str) -> bool:
@@ -45,9 +66,11 @@ def is_txt(url: str) -> bool:
            True if URL is a text file, False otherwise
        """
        try:
-            return url.endswith(".txt")
+            parsed = urlparse(url)
+            # Normalize to lowercase and ignore query/fragment
+            return parsed.path.lower().endswith('.txt')
        except Exception as e:
-            logger.warning(f"Error checking if URL is text file: {e}")
+            logger.warning(f"Error checking if URL is text file: {e}", exc_info=True)
            return False

    @staticmethod
@@ -240,7 +263,7 @@ def generate_unique_source_id(url: str) -> str:
            return hashlib.sha256(canonical.encode("utf-8")).hexdigest()[:16]

        except Exception as e:
-            # Redact sensitive query params from error logs
+            # Redacted sensitive query params from error logs
            try:
                redacted = url.split("?", 1)[0] if "?" in url else url
            except Exception:
@@ -251,6 +274,166 @@ def generate_unique_source_id(url: str) -> str:
            # Fallback: use a hash of the error message + url to still get something unique
            fallback = f"error_{redacted}_{str(e)}"
            return hashlib.sha256(fallback.encode("utf-8")).hexdigest()[:16]

    @staticmethod
    def extract_markdown_links(content: str, base_url: Optional[str] = None) -> List[str]:
        """
        Extract markdown-style links from text content.

        Args:
            content: Text content to extract links from
            base_url: Base URL to resolve relative links against

        Returns:
            List of absolute URLs found in the content
        """
        try:
            if not content:
                return []

            # Ultimate URL pattern with comprehensive format support:
            # 1) [text](url) - markdown links
            # 2) <https://...> - autolinks
            # 3) https://... - bare URLs with protocol
            # 4) //example.com - protocol-relative URLs
            # 5) www.example.com - scheme-less www URLs
            combined_pattern = re.compile(
                r'\[(?P<text>[^\]]*)\]\((?P<md>[^)]+)\)'  # named: md
                r'|<\s*(?P<auto>https?://[^>\s]+)\s*>'  # named: auto
                r'|(?P<bare>https?://[^\s<>()\[\]"]+)'  # named: bare
                r'|(?P<proto>//[^\s<>()\[\]"]+)'  # named: protocol-relative
                r'|(?P<www>www\.[^\s<>()\[\]"]+)'  # named: www.* without scheme
            )

            def _clean_url(u: str) -> str:
                # Trim whitespace and comprehensive trailing punctuation
                # Also remove invisible Unicode characters that can break URLs
                import unicodedata
                cleaned = u.strip().rstrip('.,;:)]>')
                # Remove invisible/control characters but keep valid URL characters
                cleaned = ''.join(c for c in cleaned if unicodedata.category(c) not in ('Cf', 'Cc'))
                return cleaned

            urls = []
            for match in re.finditer(combined_pattern, content):
                url = (
                    match.group('md')
                    or match.group('auto')
                    or match.group('bare')
                    or match.group('proto')
                    or match.group('www')
                )
                if not url:
                    continue
                url = _clean_url(url)

                # Skip empty URLs, anchors, and mailto links
                if not url or url.startswith('#') or url.startswith('mailto:'):
                    continue

                # Normalize all URL formats to https://
                if url.startswith('//'):
                    url = f'https:{url}'
                elif url.startswith('www.'):
                    url = f'https://{url}'

                # Convert relative URLs to absolute if base_url provided
                if base_url and not url.startswith(('http://', 'https://')):
                    try:
                        url = urljoin(base_url, url)
                    except Exception as e:
                        logger.warning(f"Failed to resolve relative URL {url} with base {base_url}: {e}")
                        continue

                # Only include HTTP/HTTPS URLs
                if url.startswith(('http://', 'https://')):
                    urls.append(url)

            # Remove duplicates while preserving order
            seen = set()
            unique_urls = []
            for url in urls:
                if url not in seen:
                    seen.add(url)
                    unique_urls.append(url)

            logger.info(f"Extracted {len(unique_urls)} unique links from content")
            return unique_urls

        except Exception as e:
            logger.error(f"Error extracting markdown links: {e}", exc_info=True)
            return []

    @staticmethod
    def is_link_collection_file(url: str, content: Optional[str] = None) -> bool:
        """
        Check if a URL/file appears to be a link collection file like llms.txt.

        Args:
            url: URL to check
            content: Optional content to analyze for link density

        Returns:
            True if file appears to be a link collection, False otherwise
        """
        try:
            # Extract filename from URL
            parsed = urlparse(url)
            filename = parsed.path.split('/')[-1].lower()

            # Check for specific link collection filenames
            # Note: "full-*" or "*-full" patterns are NOT link collections - they contain complete content, not just links
            link_collection_patterns = [
                # .txt variants - files that typically contain lists of links
                'llms.txt', 'links.txt', 'resources.txt', 'references.txt',
                # .md/.mdx/.markdown variants
                'llms.md', 'links.md', 'resources.md', 'references.md',
                'llms.mdx', 'links.mdx', 'resources.mdx', 'references.mdx',
                'llms.markdown', 'links.markdown', 'resources.markdown', 'references.markdown',
            ]

            # Direct filename match
            if filename in link_collection_patterns:
                logger.info(f"Detected link collection file by filename: {filename}")
                return True

            # Pattern-based detection for variations, but exclude "full" variants
            # Only match files that are likely link collections, not complete content files
            if filename.endswith(('.txt', '.md', '.mdx', '.markdown')):
                # Exclude files with "full" in the name - these typically contain complete content, not just links
                if 'full' not in filename:
                    # Match files that start with common link collection prefixes
                    base_patterns = ['llms', 'links', 'resources', 'references']
                    if any(filename.startswith(pattern + '.') or filename.startswith(pattern + '-') for pattern in base_patterns):
                        logger.info(f"Detected potential link collection file: {filename}")
                        return True

            # Content-based detection if content is provided
            if content:
                # Never treat "full" variants as link collections to preserve single-page behavior
                if 'full' in filename:
                    logger.info(f"Skipping content-based link-collection detection for full-content file: {filename}")
                    return False
                # Reuse extractor to avoid regex divergence and maintain consistency
                extracted_links = URLHandler.extract_markdown_links(content, url)
                total_links = len(extracted_links)

                # Calculate link density (links per 100 characters)
                content_length = len(content.strip())
                if content_length > 0:
                    link_density = (total_links * 100) / content_length

                    # If more than 2% of content is links, likely a link collection
                    if link_density > 2.0 and total_links > 3:
                        logger.info(f"Detected link collection by content analysis: {total_links} links, density {link_density:.2f}%")
                        return True

            return False

        except Exception as e:
            logger.warning(f"Error checking if file is link collection: {e}", exc_info=True)
            return False

    @staticmethod
    def extract_display_name(url: str) -> str:
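
A small usage sketch of the new URLHandler helpers; the import path and sample content below are assumptions for illustration, not taken from the PR:

from src.server.services.crawling.helpers.url_handler import URLHandler

sample = """
# Example project
- [Docs](https://example.com/docs)
- [API reference](/api)
- <https://example.com/changelog>
- www.example.com/blog
"""
base = "https://example.com/llms.txt"

# Relative links resolve against base_url; protocol-relative and www.* links
# are normalized to https://; duplicates are dropped while preserving order.
links = URLHandler.extract_markdown_links(sample, base)
# -> ['https://example.com/docs', 'https://example.com/api',
#     'https://example.com/changelog', 'https://www.example.com/blog']

# Query strings and fragments no longer defeat the extension checks:
assert URLHandler.is_txt("https://example.com/llms.txt?v=2") is True
assert URLHandler.is_markdown("https://example.com/notes.MD#intro") is True

# llms.txt matches by filename alone, so no content analysis is needed:
assert URLHandler.is_link_collection_file(base, sample) is True
# "full" variants are deliberately excluded - they hold complete content, not links:
assert URLHandler.is_link_collection_file("https://example.com/llms-full.txt") is False
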
@@ -219,4 +219,4 @@ async def generate_contextual_embeddings_batch(
        except Exception as e:
            search_logger.error(f"Error in contextual embedding batch: {e}")
            # Return non-contextual for all chunks
-            return [(chunk, False) for chunk in chunks]
+            return [(chunk, False) for chunk in chunks]
7 changes: 7 additions & 0 deletions python/src/server/services/storage/code_storage_service.py
@@ -955,6 +955,10 @@ async def add_code_examples_to_supabase(
                    "status": "code_storage",
                    "percentage": progress_percentage,
                    "log": f"Stored batch {batch_num}/{total_batches} of code examples",
                    # Stage-specific batch fields to prevent contamination with document storage
                    "code_current_batch": batch_num,
                    "code_total_batches": total_batches,
                    # Keep generic fields for backward compatibility
                    "batch_number": batch_num,
                    "total_batches": total_batches,
                })
@@ -966,4 +970,7 @@ async def add_code_examples_to_supabase(
                    "percentage": 100,
                    "log": f"Code storage completed. Stored {total_items} code examples.",
                    "total_items": total_items,
                    # Keep final batch info for code storage completion
                    "code_total_batches": (total_items + batch_size - 1) // batch_size,
                    "code_current_batch": (total_items + batch_size - 1) // batch_size,
                })
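
The motivation for the namespaced keys: document storage and code storage report into the same progress state, so when both stages wrote the generic batch_number/total_batches keys, the later writer clobbered the earlier one in the UI. A minimal sketch of the failure and a stage-aware read; the progress dict and reader are invented for illustration, not the actual polling code:

progress: dict = {}

# Both stages report through the same shared state:
progress.update({"document_current_batch": 7, "document_total_batches": 20,
                 "batch_number": 7, "total_batches": 20})   # document storage
progress.update({"code_current_batch": 2, "code_total_batches": 5,
                 "batch_number": 2, "total_batches": 5})    # code storage

# The generic keys reflect only the last writer - the document counts are gone:
assert progress["total_batches"] == 5

# A stage-aware reader prefers namespaced keys and falls back to the generic ones:
def batches_for(stage: str):
    current = progress.get(f"{stage}_current_batch", progress.get("batch_number"))
    total = progress.get(f"{stage}_total_batches", progress.get("total_batches"))
    return current, total

assert batches_for("document") == (7, 20)  # survives the code-stage update
assert batches_for("code") == (2, 5)
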
17 changes: 11 additions & 6 deletions python/src/server/services/storage/document_storage_service.py
@@ -349,6 +349,11 @@ async def embedding_progress_wrapper(message: str, percentage: float):

                # Simple batch completion info
                batch_info = {
                    # Stage-specific batch fields to prevent contamination with code examples
                    "document_completed_batches": completed_batches,
                    "document_total_batches": total_batches,
                    "document_current_batch": batch_num,
                    # Keep generic fields for backward compatibility
                    "completed_batches": completed_batches,
                    "total_batches": total_batches,
                    "current_batch": batch_num,
@@ -401,12 +406,12 @@ async def embedding_progress_wrapper(message: str, percentage: float):
                    "document_storage",
                    100,  # Ensure we report 100%
                    f"Document storage completed: {len(contents)} chunks stored in {total_batches} batches",
-                        completed_batches=total_batches,
-                        total_batches=total_batches,
-                        current_batch=total_batches,
-                        chunks_processed=len(contents),
-                        # DON'T send 'status': 'completed' - that's for the orchestration service only!
-                    )
+                    completed_batches=total_batches,
+                    total_batches=total_batches,
+                    current_batch=total_batches,
+                    chunks_processed=len(contents),
+                    # DON'T send 'status': 'completed' - that's for the orchestration service only!
+                )
            except Exception as e:
                search_logger.warning(f"Progress callback failed during completion: {e}. Storage still successful.")