Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions python/src/server/services/crawling/crawling_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,11 @@ def __init__(self, crawler=None, supabase_client=None, progress_id=None):
self.url_handler = URLHandler()
self.site_config = SiteConfig()
self.markdown_generator = self.site_config.get_markdown_generator()
self.link_pruning_markdown_generator = self.site_config.get_link_pruning_markdown_generator()

# Initialize strategies
self.batch_strategy = BatchCrawlStrategy(crawler, self.markdown_generator)
self.recursive_strategy = RecursiveCrawlStrategy(crawler, self.markdown_generator)
self.batch_strategy = BatchCrawlStrategy(crawler, self.link_pruning_markdown_generator)
self.recursive_strategy = RecursiveCrawlStrategy(crawler, self.link_pruning_markdown_generator)
self.single_page_strategy = SinglePageCrawlStrategy(crawler, self.markdown_generator)
self.sitemap_strategy = SitemapCrawlStrategy()

Expand Down
31 changes: 31 additions & 0 deletions python/src/server/services/crawling/helpers/site_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
Handles site-specific configurations and detection.
"""
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import PruningContentFilter

from ....config.logfire_config import get_logger

Expand Down Expand Up @@ -96,3 +97,33 @@ def get_markdown_generator():
"code_language_callback": lambda el: el.get('class', '').replace('language-', '') if el else ''
}
)

@staticmethod
def get_link_pruning_markdown_generator():
    """
    Get a markdown generator that prunes low-value page content before
    markdown conversion, using crawl4ai's PruningContentFilter.

    NOTE(review): despite the original wording, this generator is wired into
    both the batch and the recursive crawl strategies (see CrawlingService
    __init__); the filtered output is consumed via
    ``result.markdown.fit_markdown`` in those strategies.

    Returns:
        DefaultMarkdownGenerator configured with a fixed-threshold (0.2)
        PruningContentFilter and code-block-preserving markdown options.
    """
    # Fixed threshold: blocks scoring below 0.2 are pruned regardless of
    # page-relative density ("fixed" vs. the dynamic threshold mode).
    prune_filter = PruningContentFilter(
        threshold=0.2,
        threshold_type="fixed"
    )

    return DefaultMarkdownGenerator(
        content_source="html",  # Use raw HTML to preserve code blocks
        content_filter=prune_filter,
        options={
            "mark_code": True,  # Mark code blocks properly
            "handle_code_in_pre": True,  # Handle <pre><code> tags
            "body_width": 0,  # No line wrapping
            "skip_internal_links": True,  # Drop intra-page anchors to reduce noise
            "include_raw_html": False,  # Prevent HTML in markdown
            "escape": False,  # Don't escape special chars in code
            "decode_unicode": True,  # Decode unicode characters
            "strip_empty_lines": False,  # Preserve empty lines in code
            "preserve_code_formatting": True,  # NOTE(review): not a documented crawl4ai option — TODO confirm it is honored
            # Derive fence language from the element's class attr, stripping the
            # "language-" prefix; empty string when no element is supplied.
            "code_language_callback": lambda el: el.get('class', '').replace('language-', '') if el else ''
        }
    )
4 changes: 2 additions & 2 deletions python/src/server/services/crawling/strategies/batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,12 +231,12 @@ async def report_progress(progress_val: int, message: str, status: str = "crawli
raise

processed += 1
if result.success and result.markdown:
if result.success and result.markdown and result.markdown.fit_markdown:
# Map back to original URL
original_url = url_mapping.get(result.url, result.url)
successful_results.append({
"url": original_url,
"markdown": result.markdown,
"markdown": result.markdown.fit_markdown,
"html": result.html, # Use raw HTML
})
else:
Expand Down
4 changes: 2 additions & 2 deletions python/src/server/services/crawling/strategies/recursive.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,10 +276,10 @@ def normalize_url(url):
visited.add(norm_url)
total_processed += 1

if result.success and result.markdown:
if result.success and result.markdown and result.markdown.fit_markdown:
results_all.append({
"url": original_url,
"markdown": result.markdown,
"markdown": result.markdown.fit_markdown,
"html": result.html, # Always use raw HTML for code extraction
})
depth_successful += 1
Expand Down