diff --git a/python/src/server/services/crawling/crawling_service.py b/python/src/server/services/crawling/crawling_service.py index e05cd6ec19..55cd6d924a 100644 --- a/python/src/server/services/crawling/crawling_service.py +++ b/python/src/server/services/crawling/crawling_service.py @@ -74,10 +74,11 @@ def __init__(self, crawler=None, supabase_client=None, progress_id=None): self.url_handler = URLHandler() self.site_config = SiteConfig() self.markdown_generator = self.site_config.get_markdown_generator() + self.link_pruning_markdown_generator = self.site_config.get_link_pruning_markdown_generator() # Initialize strategies - self.batch_strategy = BatchCrawlStrategy(crawler, self.markdown_generator) - self.recursive_strategy = RecursiveCrawlStrategy(crawler, self.markdown_generator) + self.batch_strategy = BatchCrawlStrategy(crawler, self.link_pruning_markdown_generator) + self.recursive_strategy = RecursiveCrawlStrategy(crawler, self.link_pruning_markdown_generator) self.single_page_strategy = SinglePageCrawlStrategy(crawler, self.markdown_generator) self.sitemap_strategy = SitemapCrawlStrategy() diff --git a/python/src/server/services/crawling/helpers/site_config.py b/python/src/server/services/crawling/helpers/site_config.py index e7ea9f7ed6..846fe4509f 100644 --- a/python/src/server/services/crawling/helpers/site_config.py +++ b/python/src/server/services/crawling/helpers/site_config.py @@ -4,6 +4,7 @@ Handles site-specific configurations and detection. """ from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from crawl4ai.content_filter_strategy import PruningContentFilter from ....config.logfire_config import get_logger @@ -96,3 +97,33 @@ def get_markdown_generator(): "code_language_callback": lambda el: el.get('class', '').replace('language-', '') if el else '' } ) + + @staticmethod + def get_link_pruning_markdown_generator(): + """ + Get markdown generator for the recursive crawling strategy that cleans up pages crawled. 
+ + Returns: + Configured markdown generator + """ + prune_filter = PruningContentFilter( + threshold=0.2, + threshold_type="fixed" + ) + + return DefaultMarkdownGenerator( + content_source="html", # Use raw HTML to preserve code blocks + content_filter=prune_filter, + options={ + "mark_code": True, # Mark code blocks properly + "handle_code_in_pre": True, # Handle
<pre> tags
+ "body_width": 0, # No line wrapping
+ "skip_internal_links": True, # Add to reduce noise
+ "include_raw_html": False, # Prevent HTML in markdown
+ "escape": False, # Don't escape special chars in code
+ "decode_unicode": True, # Decode unicode characters
+ "strip_empty_lines": False, # Preserve empty lines in code
+ "preserve_code_formatting": True, # Custom option if supported
+ "code_language_callback": lambda el: el.get('class', '').replace('language-', '') if el else ''
+ }
+ )
diff --git a/python/src/server/services/crawling/strategies/batch.py b/python/src/server/services/crawling/strategies/batch.py
index 2834d55940..1457fdca48 100644
--- a/python/src/server/services/crawling/strategies/batch.py
+++ b/python/src/server/services/crawling/strategies/batch.py
@@ -231,12 +231,12 @@ async def report_progress(progress_val: int, message: str, status: str = "crawli
raise
processed += 1
- if result.success and result.markdown:
+ if result.success and result.markdown and result.markdown.fit_markdown:
# Map back to original URL
original_url = url_mapping.get(result.url, result.url)
successful_results.append({
"url": original_url,
- "markdown": result.markdown,
+ "markdown": result.markdown.fit_markdown,
"html": result.html, # Use raw HTML
})
else:
diff --git a/python/src/server/services/crawling/strategies/recursive.py b/python/src/server/services/crawling/strategies/recursive.py
index 436902ee75..d13b51d480 100644
--- a/python/src/server/services/crawling/strategies/recursive.py
+++ b/python/src/server/services/crawling/strategies/recursive.py
@@ -276,10 +276,10 @@ def normalize_url(url):
visited.add(norm_url)
total_processed += 1
- if result.success and result.markdown:
+ if result.success and result.markdown and result.markdown.fit_markdown:
results_all.append({
"url": original_url,
- "markdown": result.markdown,
+ "markdown": result.markdown.fit_markdown,
"html": result.html, # Always use raw HTML for code extraction
})
depth_successful += 1