diff --git a/.agent/scripts/watercrawl-helper.sh b/.agent/scripts/watercrawl-helper.sh new file mode 100755 index 000000000..5c690373c --- /dev/null +++ b/.agent/scripts/watercrawl-helper.sh @@ -0,0 +1,1056 @@ +#!/bin/bash +# shellcheck disable=SC2034,SC2155,SC2317,SC2329,SC2016,SC2181,SC1091,SC2154,SC2015,SC2086,SC2129,SC2030,SC2031,SC2119,SC2120,SC2001,SC2162,SC2088,SC2089,SC2090,SC2029,SC2006,SC2153 + +# WaterCrawl Helper Script +# Modern web crawling framework for LLM-ready data extraction +# +# SELF-HOSTED FIRST: This script prioritizes self-hosted deployment over cloud API. +# WaterCrawl can be deployed locally via Docker or on VPS/Cloudron. +# +# WaterCrawl transforms web content into structured, AI-ready data with: +# - Smart crawling with depth/domain/path controls +# - Web search engine integration +# - Sitemap generation and analysis +# - JavaScript rendering with screenshots +# - AI-powered content processing (OpenAI integration) +# - Extensible plugin system +# +# Usage: ./watercrawl-helper.sh [command] [options] +# Commands: +# docker-setup - Clone repo and prepare Docker deployment (RECOMMENDED) +# docker-start - Start WaterCrawl Docker containers +# docker-stop - Stop WaterCrawl Docker containers +# docker-logs - View Docker container logs +# coolify-deploy - Deploy to Coolify (self-hosted PaaS) +# setup - Install Node.js SDK (for API access) +# status - Check WaterCrawl configuration and connectivity +# scrape - Scrape a single URL +# crawl - Crawl a website with depth control +# search - Search the web using WaterCrawl's search engine +# sitemap - Generate sitemap for a website +# api-key - Configure WaterCrawl API key +# api-url - Configure custom API URL (for self-hosted) +# help - Show this help message +# +# Author: AI DevOps Framework +# Version: 1.0.0 +# License: MIT + +# Colors for output +readonly GREEN='\033[0;32m' +readonly BLUE='\033[0;34m' +readonly YELLOW='\033[1;33m' +readonly RED='\033[0;31m' +readonly PURPLE='\033[0;35m' +readonly 
NC='\033[0m' # No Color + +# Common constants +readonly ERROR_UNKNOWN_COMMAND="Unknown command:" +readonly HELP_SHOW_MESSAGE="Show this help message" + +# Constants +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" || exit +readonly SCRIPT_DIR +readonly CONFIG_DIR="$SCRIPT_DIR/../configs" +readonly MCP_ENV_FILE="$HOME/.config/aidevops/mcp-env.sh" +readonly WATERCRAWL_CLOUD_URL="https://app.watercrawl.dev" +readonly WATERCRAWL_LOCAL_URL="http://localhost" +readonly NPM_PACKAGE="@watercrawl/nodejs" +readonly WATERCRAWL_REPO="https://github.com/watercrawl/WaterCrawl.git" +readonly WATERCRAWL_DIR="$HOME/.aidevops/watercrawl" + +# Print functions +print_success() { + local message="$1" + echo -e "${GREEN}[OK] $message${NC}" + return 0 +} + +print_info() { + local message="$1" + echo -e "${BLUE}[INFO] $message${NC}" + return 0 +} + +print_warning() { + local message="$1" + echo -e "${YELLOW}[WARN] $message${NC}" + return 0 +} + +print_error() { + local message="$1" + echo -e "${RED}[ERROR] $message${NC}" + return 0 +} + +print_header() { + local message="$1" + echo -e "${PURPLE}=== $message ===${NC}" + return 0 +} + +# Load configuration from mcp-env.sh +load_config() { + if [[ -f "$MCP_ENV_FILE" ]]; then + # shellcheck source=/dev/null + source "$MCP_ENV_FILE" + fi + + # Default to local URL if self-hosted, otherwise cloud + if [[ -z "$WATERCRAWL_API_URL" ]]; then + if [[ -d "$WATERCRAWL_DIR" ]] && docker ps -q -f name=watercrawl 2>/dev/null | grep -q .; then + WATERCRAWL_API_URL="$WATERCRAWL_LOCAL_URL" + else + WATERCRAWL_API_URL="$WATERCRAWL_CLOUD_URL" + fi + fi + + return 0 +} + +# Load API key from mcp-env.sh +load_api_key() { + load_config + + if [[ -z "$WATERCRAWL_API_KEY" ]]; then + return 1 + fi + + return 0 +} + +# Check if Docker is available +check_docker() { + if ! command -v docker &> /dev/null; then + print_error "Docker is not installed. Please install Docker first." + return 1 + fi + + if ! 
docker info &> /dev/null; then + print_error "Docker daemon is not running. Please start Docker." + return 1 + fi + + return 0 +} + +# Check if Node.js is available +check_node() { + if ! command -v node &> /dev/null; then + print_error "Node.js is not installed. Please install Node.js 14+ first." + return 1 + fi + + local node_version + node_version=$(node -v | sed 's/v//' | cut -d. -f1) + + if [[ "$node_version" -lt 14 ]]; then + print_error "Node.js 14+ is required. Current version: $(node -v)" + return 1 + fi + + return 0 +} + +# Check if npm package is installed +check_npm_package() { + if npm list -g "$NPM_PACKAGE" &> /dev/null; then + return 0 + fi + + if npm list "$NPM_PACKAGE" &> /dev/null 2>&1; then + return 0 + fi + + return 1 +} + +# Setup Docker deployment (RECOMMENDED) +docker_setup() { + print_header "Setting up WaterCrawl Self-Hosted (Docker)" + + if ! check_docker; then + return 1 + fi + + # Create directory + mkdir -p "$WATERCRAWL_DIR" + + # Clone or update repository + if [[ -d "$WATERCRAWL_DIR/.git" ]]; then + print_info "Updating existing WaterCrawl installation..." + cd "$WATERCRAWL_DIR" || return 1 + git pull origin main + else + print_info "Cloning WaterCrawl repository..." + git clone "$WATERCRAWL_REPO" "$WATERCRAWL_DIR" + cd "$WATERCRAWL_DIR" || return 1 + fi + + # Setup environment file + if [[ ! -f "$WATERCRAWL_DIR/docker/.env" ]]; then + print_info "Creating environment configuration..." 
+ cp "$WATERCRAWL_DIR/docker/.env.example" "$WATERCRAWL_DIR/docker/.env" + + # Generate secure keys + local secret_key + secret_key=$(openssl rand -hex 32) + local api_encryption_key + api_encryption_key=$(python3 -c "from cryptography.fernet import Fernet; print(Fernet.generate_key().decode())" 2>/dev/null || openssl rand -base64 32) + + # Update .env with secure values + sed -i.bak "s|SECRET_KEY=.*|SECRET_KEY=$secret_key|" "$WATERCRAWL_DIR/docker/.env" + sed -i.bak "s|API_ENCRYPTION_KEY=.*|API_ENCRYPTION_KEY=$api_encryption_key|" "$WATERCRAWL_DIR/docker/.env" + rm -f "$WATERCRAWL_DIR/docker/.env.bak" + + print_success "Environment file created with secure keys" + else + print_info "Environment file already exists" + fi + + print_success "WaterCrawl Docker setup complete!" + print_info "" + print_info "Next steps:" + print_info "1. Review config: $WATERCRAWL_DIR/docker/.env" + print_info "2. Start services: $0 docker-start" + print_info "3. Create admin user: $0 docker-admin" + print_info "4. Access dashboard: http://localhost" + print_info "" + print_info "For production deployment on a domain, update these in .env:" + print_info " MINIO_EXTERNAL_ENDPOINT=your-domain.com" + print_info " MINIO_BROWSER_REDIRECT_URL=https://your-domain.com/minio-console/" + print_info " MINIO_SERVER_URL=https://your-domain.com/" + + return 0 +} + +# Start Docker containers +docker_start() { + print_header "Starting WaterCrawl Docker Containers" + + if ! check_docker; then + return 1 + fi + + if [[ ! -d "$WATERCRAWL_DIR/docker" ]]; then + print_error "WaterCrawl not installed. Run: $0 docker-setup" + return 1 + fi + + cd "$WATERCRAWL_DIR/docker" || return 1 + + print_info "Starting services..." + if docker compose up -d; then + print_success "WaterCrawl started successfully!" 
+ print_info "" + print_info "Services available at:" + print_info " Frontend: http://localhost" + print_info " API: http://localhost/api" + print_info " MinIO Console: http://localhost/minio-console" + print_info "" + print_info "View logs: $0 docker-logs" + + # Update API URL to local + configure_api_url "$WATERCRAWL_LOCAL_URL" + else + print_error "Failed to start WaterCrawl" + return 1 + fi + + return 0 +} + +# Stop Docker containers +docker_stop() { + print_header "Stopping WaterCrawl Docker Containers" + + if ! check_docker; then + return 1 + fi + + if [[ ! -d "$WATERCRAWL_DIR/docker" ]]; then + print_error "WaterCrawl not installed" + return 1 + fi + + cd "$WATERCRAWL_DIR/docker" || return 1 + + if docker compose down; then + print_success "WaterCrawl stopped" + else + print_error "Failed to stop WaterCrawl" + return 1 + fi + + return 0 +} + +# View Docker logs +docker_logs() { + local service="$1" + + if ! check_docker; then + return 1 + fi + + if [[ ! -d "$WATERCRAWL_DIR/docker" ]]; then + print_error "WaterCrawl not installed" + return 1 + fi + + cd "$WATERCRAWL_DIR/docker" || return 1 + + if [[ -n "$service" ]]; then + docker compose logs -f "$service" + else + docker compose logs -f + fi + + return 0 +} + +# Create admin user +docker_admin() { + print_header "Creating WaterCrawl Admin User" + + if ! check_docker; then + return 1 + fi + + if [[ ! -d "$WATERCRAWL_DIR/docker" ]]; then + print_error "WaterCrawl not installed" + return 1 + fi + + cd "$WATERCRAWL_DIR/docker" || return 1 + + print_info "Creating superuser (follow prompts)..." + docker compose exec app python manage.py createsuperuser + + return 0 +} + +# Deploy to Coolify +coolify_deploy() { + print_header "Deploying WaterCrawl to Coolify" + + print_info "WaterCrawl can be deployed to Coolify as a Docker Compose application." + print_info "" + print_info "Steps:" + print_info "1. In Coolify, create a new 'Docker Compose' resource" + print_info "2. 
Use Git repository: $WATERCRAWL_REPO" + print_info "3. Set Docker Compose path: docker/docker-compose.yml" + print_info "4. Configure environment variables in Coolify:" + print_info " - SECRET_KEY (generate secure key)" + print_info " - API_ENCRYPTION_KEY (generate secure key)" + print_info " - MINIO_EXTERNAL_ENDPOINT (your domain)" + print_info " - FRONTEND_URL (https://your-domain.com)" + print_info "5. Set up domain/SSL in Coolify" + print_info "6. Deploy!" + print_info "" + print_info "For detailed Coolify deployment, see:" + print_info " https://docs.watercrawl.dev/self-hosted/installation" + print_info "" + print_info "Or use coolify-helper.sh for automated deployment:" + print_info " bash .agent/scripts/coolify-helper.sh deploy watercrawl" + + return 0 +} + +# Setup Node.js SDK (for API access) +setup_sdk() { + print_header "Setting up WaterCrawl Node.js SDK" + + if ! check_node; then + return 1 + fi + + print_info "Installing WaterCrawl Node.js SDK..." + if npm install -g "$NPM_PACKAGE"; then + print_success "WaterCrawl SDK installed successfully" + else + print_warning "Global install failed, trying local install..." + if npm install "$NPM_PACKAGE"; then + print_success "WaterCrawl SDK installed locally" + else + print_error "Failed to install WaterCrawl SDK" + return 1 + fi + fi + + # Check for API key + if ! 
load_api_key; then + print_warning "WaterCrawl API key not configured" + print_info "" + print_info "For self-hosted: Create user at http://localhost, get API key from dashboard" + print_info "For cloud API: Get key from https://app.watercrawl.dev" + print_info "" + print_info "Then run: $0 api-key YOUR_API_KEY" + else + print_success "API key already configured" + fi + + return 0 +} + +# Configure API key +configure_api_key() { + local api_key="$1" + + if [[ -z "$api_key" ]]; then + print_error "API key is required" + print_info "Usage: $0 api-key YOUR_API_KEY" + print_info "" + print_info "For self-hosted: Get key from http://localhost dashboard" + print_info "For cloud API: Get key from https://app.watercrawl.dev" + return 1 + fi + + print_header "Configuring WaterCrawl API Key" + + # Create config directory if needed + mkdir -p "$(dirname "$MCP_ENV_FILE")" + + # Check if file exists and has the key + if [[ -f "$MCP_ENV_FILE" ]]; then + if grep -q "^export WATERCRAWL_API_KEY=" "$MCP_ENV_FILE"; then + # Update existing key + sed -i.bak "s|^export WATERCRAWL_API_KEY=.*|export WATERCRAWL_API_KEY=\"$api_key\"|" "$MCP_ENV_FILE" + rm -f "${MCP_ENV_FILE}.bak" + print_success "API key updated in $MCP_ENV_FILE" + else + # Append new key + echo "" >> "$MCP_ENV_FILE" + echo "# WaterCrawl API Key" >> "$MCP_ENV_FILE" + echo "export WATERCRAWL_API_KEY=\"$api_key\"" >> "$MCP_ENV_FILE" + print_success "API key added to $MCP_ENV_FILE" + fi + else + # Create new file + cat > "$MCP_ENV_FILE" << EOF +#!/bin/bash +# MCP Environment Variables +# This file is sourced by helper scripts to load API keys +# Permissions should be 600 (chmod 600 $MCP_ENV_FILE) + +# WaterCrawl Configuration +export WATERCRAWL_API_KEY="$api_key" +EOF + chmod 600 "$MCP_ENV_FILE" + print_success "Created $MCP_ENV_FILE with API key" + fi + + return 0 +} + +# Configure custom API URL (for self-hosted) +configure_api_url() { + local api_url="$1" + + if [[ -z "$api_url" ]]; then + print_error "API URL is required" 
+ print_info "Usage: $0 api-url http://your-watercrawl-instance.com" + return 1 + fi + + print_header "Configuring WaterCrawl API URL" + + # Create config directory if needed + mkdir -p "$(dirname "$MCP_ENV_FILE")" + + # Check if file exists and has the URL + if [[ -f "$MCP_ENV_FILE" ]]; then + if grep -q "^export WATERCRAWL_API_URL=" "$MCP_ENV_FILE"; then + # Update existing URL + sed -i.bak "s|^export WATERCRAWL_API_URL=.*|export WATERCRAWL_API_URL=\"$api_url\"|" "$MCP_ENV_FILE" + rm -f "${MCP_ENV_FILE}.bak" + print_success "API URL updated to: $api_url" + else + # Append new URL + echo "export WATERCRAWL_API_URL=\"$api_url\"" >> "$MCP_ENV_FILE" + print_success "API URL added: $api_url" + fi + else + # Create new file + cat > "$MCP_ENV_FILE" << EOF +#!/bin/bash +# MCP Environment Variables +# Permissions should be 600 (chmod 600 $MCP_ENV_FILE) + +# WaterCrawl Configuration +export WATERCRAWL_API_URL="$api_url" +EOF + chmod 600 "$MCP_ENV_FILE" + print_success "Created $MCP_ENV_FILE with API URL" + fi + + return 0 +} + +# Check status +check_status() { + print_header "WaterCrawl Status" + + load_config + + # Check Docker installation + if [[ -d "$WATERCRAWL_DIR" ]]; then + print_success "Self-hosted: Installed at $WATERCRAWL_DIR" + + if check_docker 2>/dev/null; then + cd "$WATERCRAWL_DIR/docker" 2>/dev/null || true + if docker compose ps 2>/dev/null | grep -q "Up"; then + print_success "Docker: Running" + docker compose ps 2>/dev/null | grep -E "NAME|watercrawl" || true + else + print_warning "Docker: Not running (run: $0 docker-start)" + fi + fi + else + print_info "Self-hosted: Not installed (run: $0 docker-setup)" + fi + + # Check Node.js SDK + if check_node 2>/dev/null; then + print_success "Node.js: $(node -v)" + if check_npm_package; then + local version + version=$(npm list -g "$NPM_PACKAGE" 2>/dev/null | grep "$NPM_PACKAGE" | sed 's/.*@//' || echo "installed") + print_success "SDK: $NPM_PACKAGE@$version" + else + print_info "SDK: Not installed (run: $0 
setup)" + fi + else + print_warning "Node.js: Not available" + fi + + # Check API configuration + print_info "API URL: ${WATERCRAWL_API_URL:-not configured}" + + if [[ -n "$WATERCRAWL_API_KEY" ]]; then + print_success "API Key: Configured" + + # Test API connectivity + local response + response=$(curl -s -o /dev/null -w "%{http_code}" \ + -H "Authorization: Bearer $WATERCRAWL_API_KEY" \ + "${WATERCRAWL_API_URL}/api/v1/core/crawl-requests/" 2>/dev/null) + + if [[ "$response" == "200" ]]; then + print_success "API: Connected" + elif [[ "$response" == "000" ]]; then + print_warning "API: Cannot connect to $WATERCRAWL_API_URL" + else + print_warning "API: HTTP $response" + fi + else + print_warning "API Key: Not configured (run: $0 api-key YOUR_KEY)" + fi + + print_info "" + print_info "Self-hosted docs: https://docs.watercrawl.dev/self-hosted/" + print_info "Cloud dashboard: https://app.watercrawl.dev" + + return 0 +} + +# Scrape a single URL +scrape_url() { + local url="$1" + local output_file="$2" + + if [[ -z "$url" ]]; then + print_error "URL is required" + print_info "Usage: $0 scrape [output.json]" + return 1 + fi + + if ! 
load_api_key; then + print_error "API key not configured" + print_info "Run: $0 api-key YOUR_API_KEY" + return 1 + fi + + print_header "Scraping: $url" + print_info "Using API: $WATERCRAWL_API_URL" + + # Create Node.js script for scraping + local temp_script + temp_script=$(mktemp /tmp/watercrawl_scrape_XXXXXX.mjs) + + cat > "$temp_script" << 'SCRIPT' +import { WaterCrawlAPIClient } from '@watercrawl/nodejs'; + +const apiKey = process.env.WATERCRAWL_API_KEY; +const apiUrl = process.env.WATERCRAWL_API_URL; +const url = process.argv[2]; + +if (!apiKey) { + console.error('Error: WATERCRAWL_API_KEY not set'); + process.exit(1); +} + +if (!url) { + console.error('Error: URL required'); + process.exit(1); +} + +const client = new WaterCrawlAPIClient(apiKey, apiUrl); + +try { + console.error('Scraping URL...'); + const result = await client.scrapeUrl(url, { + only_main_content: true, + include_links: true, + wait_time: 2000 + }); + + console.log(JSON.stringify(result, null, 2)); +} catch (error) { + console.error('Error:', error.message); + process.exit(1); +} +SCRIPT + + local result + if result=$(WATERCRAWL_API_KEY="$WATERCRAWL_API_KEY" WATERCRAWL_API_URL="$WATERCRAWL_API_URL" node "$temp_script" "$url" 2>&1); then + if [[ -n "$output_file" ]]; then + echo "$result" > "$output_file" + print_success "Results saved to: $output_file" + else + echo "$result" + fi + print_success "Scrape completed" + else + print_error "Scrape failed: $result" + rm -f "$temp_script" + return 1 + fi + + rm -f "$temp_script" + return 0 +} + +# Crawl a website +crawl_website() { + local url="$1" + local max_depth="${2:-2}" + local page_limit="${3:-50}" + local output_file="$4" + + if [[ -z "$url" ]]; then + print_error "URL is required" + print_info "Usage: $0 crawl [max_depth] [page_limit] [output.json]" + return 1 + fi + + if ! 
load_api_key; then + print_error "API key not configured" + print_info "Run: $0 api-key YOUR_API_KEY" + return 1 + fi + + print_header "Crawling: $url" + print_info "Using API: $WATERCRAWL_API_URL" + print_info "Max depth: $max_depth, Page limit: $page_limit" + + # Create Node.js script for crawling + local temp_script + temp_script=$(mktemp /tmp/watercrawl_crawl_XXXXXX.mjs) + + cat > "$temp_script" << 'SCRIPT' +import { WaterCrawlAPIClient } from '@watercrawl/nodejs'; + +const apiKey = process.env.WATERCRAWL_API_KEY; +const apiUrl = process.env.WATERCRAWL_API_URL; +const url = process.argv[2]; +const maxDepth = parseInt(process.argv[3]) || 2; +const pageLimit = parseInt(process.argv[4]) || 50; + +if (!apiKey) { + console.error('Error: WATERCRAWL_API_KEY not set'); + process.exit(1); +} + +if (!url) { + console.error('Error: URL required'); + process.exit(1); +} + +const client = new WaterCrawlAPIClient(apiKey, apiUrl); + +try { + console.error(`Creating crawl request (depth: ${maxDepth}, limit: ${pageLimit})...`); + + const crawlRequest = await client.createCrawlRequest( + url, + { + max_depth: maxDepth, + page_limit: pageLimit + }, + { + only_main_content: true, + include_links: true, + wait_time: 2000 + } + ); + + console.error(`Crawl started: ${crawlRequest.uuid}`); + console.error('Monitoring progress...'); + + const results = []; + for await (const event of client.monitorCrawlRequest(crawlRequest.uuid)) { + if (event.type === 'state') { + console.error(`Status: ${event.data.status}, Pages: ${event.data.number_of_documents}`); + } else if (event.type === 'result') { + results.push({ + url: event.data.url, + title: event.data.title, + content: event.data.result + }); + console.error(`Crawled: ${event.data.url}`); + } + } + + console.log(JSON.stringify({ + crawl_id: crawlRequest.uuid, + total_pages: results.length, + results: results + }, null, 2)); + +} catch (error) { + console.error('Error:', error.message); + process.exit(1); +} +SCRIPT + + local result + if 
result=$(WATERCRAWL_API_KEY="$WATERCRAWL_API_KEY" WATERCRAWL_API_URL="$WATERCRAWL_API_URL" node "$temp_script" "$url" "$max_depth" "$page_limit" 2>&1); then + if [[ -n "$output_file" ]]; then + echo "$result" | grep -v "^\(Status:\|Crawled:\|Creating\|Crawl started\|Monitoring\)" > "$output_file" + print_success "Results saved to: $output_file" + else + echo "$result" | grep -v "^\(Status:\|Crawled:\|Creating\|Crawl started\|Monitoring\)" + fi + print_success "Crawl completed" + else + print_error "Crawl failed" + echo "$result" >&2 + rm -f "$temp_script" + return 1 + fi + + rm -f "$temp_script" + return 0 +} + +# Search the web +search_web() { + local query="$1" + local limit="${2:-5}" + local output_file="$3" + + if [[ -z "$query" ]]; then + print_error "Search query is required" + print_info "Usage: $0 search [limit] [output.json]" + return 1 + fi + + if ! load_api_key; then + print_error "API key not configured" + print_info "Run: $0 api-key YOUR_API_KEY" + return 1 + fi + + print_header "Searching: $query" + print_info "Using API: $WATERCRAWL_API_URL" + print_info "Result limit: $limit" + + # Create Node.js script for searching + local temp_script + temp_script=$(mktemp /tmp/watercrawl_search_XXXXXX.mjs) + + cat > "$temp_script" << 'SCRIPT' +import { WaterCrawlAPIClient } from '@watercrawl/nodejs'; + +const apiKey = process.env.WATERCRAWL_API_KEY; +const apiUrl = process.env.WATERCRAWL_API_URL; +const query = process.argv[2]; +const limit = parseInt(process.argv[3]) || 5; + +if (!apiKey) { + console.error('Error: WATERCRAWL_API_KEY not set'); + process.exit(1); +} + +if (!query) { + console.error('Error: Query required'); + process.exit(1); +} + +const client = new WaterCrawlAPIClient(apiKey, apiUrl); + +try { + console.error(`Searching for: "${query}"...`); + + const results = await client.createSearchRequest( + query, + { + depth: 'basic', + search_type: 'web' + }, + limit, + true, // sync + true // download + ); + + console.log(JSON.stringify(results, null, 
2)); + +} catch (error) { + console.error('Error:', error.message); + process.exit(1); +} +SCRIPT + + local result + if result=$(WATERCRAWL_API_KEY="$WATERCRAWL_API_KEY" WATERCRAWL_API_URL="$WATERCRAWL_API_URL" node "$temp_script" "$query" "$limit" 2>&1); then + if [[ -n "$output_file" ]]; then + echo "$result" | grep -v "^Searching" > "$output_file" + print_success "Results saved to: $output_file" + else + echo "$result" | grep -v "^Searching" + fi + print_success "Search completed" + else + print_error "Search failed" + echo "$result" >&2 + rm -f "$temp_script" + return 1 + fi + + rm -f "$temp_script" + return 0 +} + +# Generate sitemap +generate_sitemap() { + local url="$1" + local output_file="$2" + local format="${3:-json}" + + if [[ -z "$url" ]]; then + print_error "URL is required" + print_info "Usage: $0 sitemap [output.json] [format: json|markdown|graph]" + return 1 + fi + + if ! load_api_key; then + print_error "API key not configured" + print_info "Run: $0 api-key YOUR_API_KEY" + return 1 + fi + + print_header "Generating sitemap: $url" + print_info "Using API: $WATERCRAWL_API_URL" + print_info "Format: $format" + + # Create Node.js script for sitemap + local temp_script + temp_script=$(mktemp /tmp/watercrawl_sitemap_XXXXXX.mjs) + + cat > "$temp_script" << 'SCRIPT' +import { WaterCrawlAPIClient } from '@watercrawl/nodejs'; + +const apiKey = process.env.WATERCRAWL_API_KEY; +const apiUrl = process.env.WATERCRAWL_API_URL; +const url = process.argv[2]; +const format = process.argv[3] || 'json'; + +if (!apiKey) { + console.error('Error: WATERCRAWL_API_KEY not set'); + process.exit(1); +} + +if (!url) { + console.error('Error: URL required'); + process.exit(1); +} + +const client = new WaterCrawlAPIClient(apiKey, apiUrl); + +try { + console.error(`Creating sitemap request for: ${url}...`); + + const sitemapRequest = await client.createSitemapRequest( + url, + { + include_subdomains: true, + ignore_sitemap_xml: false, + include_paths: [], + exclude_paths: [] + 
}, + true, // sync + true // download + ); + + // If sync returned the results directly + if (Array.isArray(sitemapRequest)) { + console.log(JSON.stringify(sitemapRequest, null, 2)); + } else if (typeof sitemapRequest === 'string') { + console.log(sitemapRequest); + } else { + // Need to get results separately + const results = await client.getSitemapResults(sitemapRequest.uuid, format); + if (typeof results === 'string') { + console.log(results); + } else { + console.log(JSON.stringify(results, null, 2)); + } + } + +} catch (error) { + console.error('Error:', error.message); + process.exit(1); +} +SCRIPT + + local result + if result=$(WATERCRAWL_API_KEY="$WATERCRAWL_API_KEY" WATERCRAWL_API_URL="$WATERCRAWL_API_URL" node "$temp_script" "$url" "$format" 2>&1); then + if [[ -n "$output_file" ]]; then + echo "$result" | grep -v "^Creating sitemap" > "$output_file" + print_success "Sitemap saved to: $output_file" + else + echo "$result" | grep -v "^Creating sitemap" + fi + print_success "Sitemap generated" + else + print_error "Sitemap generation failed" + echo "$result" >&2 + rm -f "$temp_script" + return 1 + fi + + rm -f "$temp_script" + return 0 +} + +# Show help +show_help() { + echo "WaterCrawl Helper Script" + echo "Modern web crawling framework for LLM-ready data extraction" + echo "" + echo "SELF-HOSTED FIRST: Prioritizes local Docker deployment over cloud API." 
+ echo "" + echo "Usage: $0 [command] [options]" + echo "" + echo "Self-Hosted Deployment (RECOMMENDED):" + echo " docker-setup - Clone repo and prepare Docker deployment" + echo " docker-start - Start WaterCrawl Docker containers" + echo " docker-stop - Stop WaterCrawl Docker containers" + echo " docker-logs [service] - View Docker container logs" + echo " docker-admin - Create admin user" + echo " coolify-deploy - Instructions for Coolify deployment" + echo "" + echo "SDK & Configuration:" + echo " setup - Install Node.js SDK" + echo " status - Check configuration and connectivity" + echo " api-key - Configure WaterCrawl API key" + echo " api-url - Configure custom API URL (self-hosted)" + echo "" + echo "Crawling Operations:" + echo " scrape [output.json] - Scrape a single URL" + echo " crawl [depth] [limit] [out] - Crawl website with depth control" + echo " search [limit] [out] - Search the web" + echo " sitemap [out] [format] - Generate sitemap (json|markdown|graph)" + echo "" + echo " help - $HELP_SHOW_MESSAGE" + echo "" + echo "Quick Start (Self-Hosted):" + echo " $0 docker-setup # Clone and configure" + echo " $0 docker-start # Start services" + echo " $0 docker-admin # Create admin user" + echo " # Login at http://localhost, get API key from dashboard" + echo " $0 api-key YOUR_API_KEY # Configure key" + echo " $0 scrape https://example.com # Test crawling" + echo "" + echo "Quick Start (Cloud API):" + echo " $0 setup # Install SDK" + echo " $0 api-url https://app.watercrawl.dev" + echo " $0 api-key YOUR_API_KEY # From app.watercrawl.dev" + echo " $0 scrape https://example.com" + echo "" + echo "Resources:" + echo " Self-hosted docs: https://docs.watercrawl.dev/self-hosted/" + echo " Cloud dashboard: https://app.watercrawl.dev" + echo " GitHub: https://github.com/watercrawl/WaterCrawl" + echo " Framework docs: .agent/tools/browser/watercrawl.md" + return 0 +} + +# Main function +main() { + local command="${1:-help}" + local param2="$2" + local param3="$3" + 
local param4="$4" + local param5="$5" + + case "$command" in + "docker-setup") + docker_setup + ;; + "docker-start") + docker_start + ;; + "docker-stop") + docker_stop + ;; + "docker-logs") + docker_logs "$param2" + ;; + "docker-admin") + docker_admin + ;; + "coolify-deploy"|"coolify") + coolify_deploy + ;; + "setup") + setup_sdk + ;; + "status") + check_status + ;; + "api-key") + configure_api_key "$param2" + ;; + "api-url") + configure_api_url "$param2" + ;; + "scrape") + scrape_url "$param2" "$param3" + ;; + "crawl") + crawl_website "$param2" "$param3" "$param4" "$param5" + ;; + "search") + search_web "$param2" "$param3" "$param4" + ;; + "sitemap") + generate_sitemap "$param2" "$param3" "$param4" + ;; + "help"|"-h"|"--help"|"") + show_help + ;; + *) + print_error "$ERROR_UNKNOWN_COMMAND $command" + show_help + return 1 + ;; + esac + return 0 +} + +main "$@" + +exit 0 diff --git a/.agent/subagent-index.toon b/.agent/subagent-index.toon index 49e361dd4..e2383d585 100644 --- a/.agent/subagent-index.toon +++ b/.agent/subagent-index.toon @@ -33,7 +33,7 @@ tools/build-agent/,Agent design - composing efficient agents,build-agent|agent-r tools/build-mcp/,MCP development - creating MCP servers,build-mcp|server-patterns|api-wrapper|transports|deployment tools/ai-assistants/,AI tool integration - configuring assistants,agno|capsolver|windsurf|configuration tools/ai-orchestration/,AI orchestration - visual builders and multi-agent,overview|langflow|crewai|autogen|openprose -tools/browser/,Browser automation - scraping and testing,agent-browser|stagehand|playwright|playwriter|crawl4ai|pagespeed|peekaboo +tools/browser/,Browser automation - scraping and testing,agent-browser|stagehand|playwright|playwriter|crawl4ai|watercrawl|pagespeed|peekaboo tools/mobile/,Mobile development - iOS/Android emulators,minisim tools/pdf/,PDF processing - form filling and digital signatures,overview|libpdf tools/ui/,UI components - design systems and debugging,shadcn|ui-skills|frontend-debugging 
@@ -73,7 +73,7 @@ bug-fixing,workflows/bug-fixing.md,Bug fix workflow feature-development,workflows/feature-development.md,Feature development workflow --> - WaterCrawl (cloud API with search) | +-> Bulk pages / structured CSS/XPath? --> Crawl4AI (fastest extraction, parallel) | +-> Need to login/interact first? --> Playwright or dev-browser, then extract | +-> Unknown structure, need AI to parse? --> Crawl4AI LLM mode or Stagehand extract() + | +-> Quick API without infrastructure? --> WaterCrawl (managed service) | +-> AUTOMATE (forms, clicks, multi-step)? | | @@ -128,26 +130,28 @@ Tested 2026-01-24, macOS ARM64 (Apple Silicon), headless, warm daemon. Median of ## Feature Matrix -| Feature | Playwright | dev-browser | agent-browser | Crawl4AI | Playwriter | Stagehand | -|---------|-----------|-------------|---------------|----------|------------|-----------| -| **Headless** | Yes | Yes | Yes (default) | Yes | No (your browser) | Yes | -| **Session persistence** | storageState | Profile dir | state save/load | user_data_dir | Your browser | Per-instance | -| **Cookie management** | Full API | Persistent | CLI commands | Persistent | Your browser | Per-instance | -| **Proxy support** | Full | Via launch args | No | Full (ProxyConfig) | Your browser | Via args | -| **SOCKS5/VPN** | Yes | Possible | No | Yes | Your browser | Via args | -| **Browser extensions** | Yes (persistent ctx) | Yes (profile) | No | No | Yes (yours) | Possible | -| **Multi-session** | Per-context | Named pages | --session flag | Per-crawl | Per-tab | Per-instance | -| **Form filling** | Full API | Full API | CLI fill/click | No | Full API | Natural language | -| **Screenshots** | Full API | Full API | CLI command | Built-in | Full API | Via page | -| **Data extraction** | evaluate() | evaluate() | eval command | CSS/XPath/LLM | evaluate() | extract() + schema | -| **Natural language** | No | No | No | LLM extraction | No | act/extract/observe | -| **Self-healing** | No | No | No | No | No | 
Yes | -| **AI-optimized output** | No | ARIA snapshots | Snapshot + refs | Markdown/JSON | No | Structured schemas | -| **Anti-detect** | rebrowser-patches | Via launch args | No | No | Your browser | Via Playwright | -| **Fingerprint rotation** | No (add Camoufox) | No | No | No | No | No | -| **Multi-profile** | storageState dirs | Profile dir | --session | user_data_dir | No | No | -| **Setup required** | npm install | Server running | npm install | pip/Docker | Extension click | npm + API key | -| **Interface** | JS/TS API | TS scripts | CLI | Python API | JS API | JS/Python SDK | +| Feature | Playwright | dev-browser | agent-browser | Crawl4AI | WaterCrawl | Playwriter | Stagehand | +|---------|-----------|-------------|---------------|----------|------------|------------|-----------| +| **Headless** | Yes | Yes | Yes (default) | Yes | Cloud API | No (your browser) | Yes | +| **Session persistence** | storageState | Profile dir | state save/load | user_data_dir | API sessions | Your browser | Per-instance | +| **Cookie management** | Full API | Persistent | CLI commands | Persistent | Via API | Your browser | Per-instance | +| **Proxy support** | Full | Via launch args | No | Full (ProxyConfig) | Datacenter+Residential | Your browser | Via args | +| **SOCKS5/VPN** | Yes | Possible | No | Yes | No | Your browser | Via args | +| **Browser extensions** | Yes (persistent ctx) | Yes (profile) | No | No | No | Yes (yours) | Possible | +| **Multi-session** | Per-context | Named pages | --session flag | Per-crawl | Per-request | Per-tab | Per-instance | +| **Form filling** | Full API | Full API | CLI fill/click | No | No | Full API | Natural language | +| **Screenshots** | Full API | Full API | CLI command | Built-in | PDF/Screenshot | Full API | Via page | +| **Data extraction** | evaluate() | evaluate() | eval command | CSS/XPath/LLM | Markdown/JSON | evaluate() | extract() + schema | +| **Natural language** | No | No | No | LLM extraction | No | No | 
act/extract/observe | +| **Self-healing** | No | No | No | No | No | No | Yes | +| **AI-optimized output** | No | ARIA snapshots | Snapshot + refs | Markdown/JSON | Markdown/JSON | No | Structured schemas | +| **Web search** | No | No | No | No | Yes | No | No | +| **Sitemap generation** | No | No | No | No | Yes | No | No | +| **Anti-detect** | rebrowser-patches | Via launch args | No | No | No | Your browser | Via Playwright | +| **Fingerprint rotation** | No (add Camoufox) | No | No | No | No | No | No | +| **Multi-profile** | storageState dirs | Profile dir | --session | user_data_dir | N/A | No | No | +| **Setup required** | npm install | Server running | npm install | pip/Docker | API key | Extension click | npm + API key | +| **Interface** | JS/TS API | TS scripts | CLI | Python API | REST/SDK | JS API | JS/Python SDK | ## Quick Reference @@ -157,6 +161,7 @@ Tested 2026-01-24, macOS ARM64 (Apple Silicon), headless, warm daemon. Median of | **dev-browser** | Persistent sessions, dev testing, TypeScript | Fast | `dev-browser-helper.sh setup && start` | | **agent-browser** | CLI/CI/CD, AI agents, parallel sessions | Fast (warm) | `agent-browser-helper.sh setup` | | **Crawl4AI** | Web scraping, bulk extraction, structured data | Fast | `pip install crawl4ai` (venv) | +| **WaterCrawl** | Cloud API, web search, sitemap generation | Fast | `watercrawl-helper.sh setup` + API key | | **Playwriter** | Existing browser, extensions, bypass detection | Medium | Chrome extension + `npx playwriter` | | **Stagehand** | Unknown pages, natural language, self-healing | Slow | `stagehand-helper.sh setup` + API key | | **Anti-detect** | Bot evasion, multi-account, fingerprint rotation | Medium | `anti-detect-helper.sh setup` | diff --git a/.agent/tools/browser/watercrawl.md b/.agent/tools/browser/watercrawl.md new file mode 100644 index 000000000..4fb1d6f75 --- /dev/null +++ b/.agent/tools/browser/watercrawl.md @@ -0,0 +1,482 @@ +--- +description: WaterCrawl - Modern web 
crawling for LLM-ready data +mode: subagent +tools: + read: true + write: false + edit: false + bash: true + glob: true + grep: true + webfetch: true + task: true +--- + +# WaterCrawl Integration Guide + + + +## Quick Reference + +- **Purpose**: Transform web content into LLM-ready structured data +- **Type**: Open-source, self-hosted first (Docker/Coolify), cloud API fallback +- **Self-Hosted**: `bash .agent/scripts/watercrawl-helper.sh docker-setup` +- **Cloud API**: `bash .agent/scripts/watercrawl-helper.sh api-url https://app.watercrawl.dev` + +**Self-Hosted Commands**: `docker-setup|docker-start|docker-stop|docker-logs|docker-admin|coolify-deploy` +**API Commands**: `setup|status|api-key|api-url|scrape|crawl|search|sitemap|help` + +**Key Features**: +- Smart crawling with depth/domain/path controls +- Web search engine integration (real-time web search) +- Sitemap generation and analysis +- JavaScript rendering with wait times +- AI-powered content processing (OpenAI integration) +- Extensible plugin system +- Proxy support (datacenter + residential) + +**Self-Hosted Endpoints** (default): +- Frontend: http://localhost +- API: http://localhost/api +- MinIO Console: http://localhost/minio-console + +**Installation Path**: `~/.aidevops/watercrawl/` + +**SDKs**: Node.js (`@watercrawl/nodejs`), Python (`watercrawl-py`), Go, PHP + +**Env Vars**: `WATERCRAWL_API_KEY`, `WATERCRAWL_API_URL` (stored in `~/.config/aidevops/mcp-env.sh`) + +**vs Crawl4AI**: Both self-hostable. WaterCrawl has web search + full web UI; Crawl4AI has CAPTCHA solving + Python-native. Use WaterCrawl for web search and team dashboards. Use Crawl4AI for CAPTCHA-heavy sites. + +**vs Firecrawl**: Similar features. WaterCrawl is fully open-source with self-hosting. + + +## Overview + +WaterCrawl is a modern web crawling framework that transforms web content into structured, LLM-ready data. It provides smart crawling controls, web search integration, and AI-powered content processing. 
+ +**Self-Hosted First**: This integration prioritizes self-hosted deployment via Docker or Coolify over the cloud API. Self-hosting gives you unlimited crawling, full control, and no per-page costs. + +### Key Capabilities + +| Feature | Description | +|---------|-------------| +| **Smart Crawling** | Depth, domain, and path controls for targeted extraction | +| **Web Search** | Real-time web search with language/country/time filters | +| **Sitemap Generation** | Automatic URL discovery and structure mapping | +| **JavaScript Rendering** | Full browser rendering with configurable wait times | +| **AI Processing** | Built-in OpenAI integration for content transformation | +| **Plugin System** | Extensible architecture for custom processing | +| **Proxy Support** | Datacenter and residential proxy integration | +| **Team Dashboard** | Full web UI for managing crawls and API keys | + +### When to Use WaterCrawl + +**Best for**: +- Self-hosted web crawling with full control +- Web search integration for AI agents +- Teams needing a dashboard and API key management +- Sitemap discovery and analysis +- LLM-ready markdown output + +**Consider alternatives when**: +- CAPTCHA solving required (use Crawl4AI + CapSolver) +- Browser automation/interaction needed (use Playwright) +- Need to use your own browser session (use Playwriter) + +## Quick Start (Self-Hosted - RECOMMENDED) + +### Docker Deployment + +```bash +# Clone and configure WaterCrawl +bash .agent/scripts/watercrawl-helper.sh docker-setup + +# Start services +bash .agent/scripts/watercrawl-helper.sh docker-start + +# Create admin user +bash .agent/scripts/watercrawl-helper.sh docker-admin + +# Access dashboard at http://localhost +# Get API key from dashboard, then: +bash .agent/scripts/watercrawl-helper.sh api-key YOUR_API_KEY + +# Test crawling +bash .agent/scripts/watercrawl-helper.sh scrape https://example.com +``` + +### Coolify Deployment + +For VPS deployment via Coolify (self-hosted PaaS): + +```bash 
+bash .agent/scripts/watercrawl-helper.sh coolify-deploy +``` + +This shows instructions for deploying WaterCrawl as a Docker Compose application in Coolify. + +## Quick Start (Cloud API) + +If you prefer the managed cloud service: + +```bash +# Install SDK +bash .agent/scripts/watercrawl-helper.sh setup + +# Point to cloud API +bash .agent/scripts/watercrawl-helper.sh api-url https://app.watercrawl.dev + +# Configure API key (get from https://app.watercrawl.dev) +bash .agent/scripts/watercrawl-helper.sh api-key YOUR_API_KEY + +# Check status +bash .agent/scripts/watercrawl-helper.sh status +``` + +### Basic Usage + +```bash +# Scrape a single URL +bash .agent/scripts/watercrawl-helper.sh scrape https://example.com + +# Crawl a website (depth 3, max 100 pages) +bash .agent/scripts/watercrawl-helper.sh crawl https://docs.example.com 3 100 output.json + +# Search the web +bash .agent/scripts/watercrawl-helper.sh search "AI web crawling" 10 results.json + +# Generate sitemap +bash .agent/scripts/watercrawl-helper.sh sitemap https://example.com sitemap.json +``` + +## Node.js SDK Usage + +### Installation + +```bash +npm install @watercrawl/nodejs +``` + +### Basic Scraping + +```javascript +import { WaterCrawlAPIClient } from '@watercrawl/nodejs'; + +const client = new WaterCrawlAPIClient(process.env.WATERCRAWL_API_KEY); + +// Simple URL scraping +const result = await client.scrapeUrl('https://example.com', { + only_main_content: true, + include_links: true, + wait_time: 2000 +}); + +console.log(result); +``` + +### Crawling with Monitoring + +```javascript +import { WaterCrawlAPIClient } from '@watercrawl/nodejs'; + +const client = new WaterCrawlAPIClient(process.env.WATERCRAWL_API_KEY); + +// Create crawl request +const crawlRequest = await client.createCrawlRequest( + 'https://docs.example.com', + { + max_depth: 3, + page_limit: 100, + allowed_domains: ['docs.example.com'], + exclude_paths: ['/api/*', '/admin/*'] + }, + { + only_main_content: true, + include_links: 
true, + wait_time: 2000 + } +); + +console.log(`Crawl started: ${crawlRequest.uuid}`); + +// Monitor progress with real-time events +for await (const event of client.monitorCrawlRequest(crawlRequest.uuid)) { + if (event.type === 'state') { + console.log(`Status: ${event.data.status}, Pages: ${event.data.number_of_documents}`); + } else if (event.type === 'result') { + console.log(`Crawled: ${event.data.url}`); + // Process event.data.result (markdown content) + } +} +``` + +### Batch Crawling + +```javascript +// Crawl multiple URLs in a single request +const batchRequest = await client.createBatchCrawlRequest( + [ + 'https://example.com/page1', + 'https://example.com/page2', + 'https://example.com/page3' + ], + { proxy_server: null }, + { wait_time: 1000, include_html: true } +); + +// Monitor same as regular crawl +for await (const event of client.monitorCrawlRequest(batchRequest.uuid)) { + // Handle events +} +``` + +### Web Search + +```javascript +// Search the web +const results = await client.createSearchRequest( + 'AI web crawling frameworks', + { + language: 'en', + country: 'us', + time_range: 'month', // any, hour, day, week, month, year + depth: 'advanced' // basic, advanced, ultimate + }, + 10, // result limit + true, // sync (wait for results) + true // download results +); + +for (const result of results) { + console.log(`${result.title}: ${result.url}`); + console.log(result.description); +} +``` + +### Sitemap Generation + +```javascript +// Generate sitemap +const sitemap = await client.createSitemapRequest( + 'https://example.com', + { + include_subdomains: true, + ignore_sitemap_xml: false, + include_paths: [], + exclude_paths: ['/admin/*'] + }, + true, // sync + true // download +); + +// Get in different formats +const jsonSitemap = await client.getSitemapResults(sitemap.uuid, 'json'); +const markdownSitemap = await client.getSitemapResults(sitemap.uuid, 'markdown'); +const graphSitemap = await client.getSitemapResults(sitemap.uuid, 'graph'); 
+``` + +## Python SDK Usage + +### Installation + +```bash +pip install watercrawl-py +``` + +### Basic Usage + +```python +from watercrawl import WaterCrawlAPIClient + +client = WaterCrawlAPIClient(api_key="your-api-key") + +# Simple scrape +result = client.scrape_url( + "https://example.com", + page_options={ + "only_main_content": True, + "include_links": True, + "wait_time": 2000 + } +) + +print(result) +``` + +### Async Crawling + +```python +import asyncio +from watercrawl import AsyncWaterCrawlAPIClient + +async def crawl_site(): + client = AsyncWaterCrawlAPIClient(api_key="your-api-key") + + crawl_request = await client.create_crawl_request( + url="https://docs.example.com", + spider_options={ + "max_depth": 3, + "page_limit": 100 + } + ) + + async for event in client.monitor_crawl_request(crawl_request.uuid): + if event["type"] == "result": + print(f"Crawled: {event['data']['url']}") + +asyncio.run(crawl_site()) +``` + +## Page Options Reference + +| Option | Type | Description | +|--------|------|-------------| +| `exclude_tags` | string[] | HTML tags to exclude from extraction | +| `include_tags` | string[] | HTML tags to include (whitelist) | +| `wait_time` | number | Wait time in ms after page load | +| `only_main_content` | boolean | Extract only main content (remove headers/footers) | +| `include_html` | boolean | Include raw HTML in result | +| `include_links` | boolean | Include discovered links | +| `timeout` | number | Request timeout in ms | +| `accept_cookies_selector` | string | CSS selector for cookie accept button | +| `locale` | string | Browser locale (e.g., "en-US") | +| `extra_headers` | object | Custom HTTP headers | +| `actions` | Action[] | Actions to perform (screenshot, pdf) | + +## Spider Options Reference + +| Option | Type | Description | +|--------|------|-------------| +| `max_depth` | number | Maximum crawl depth from start URL | +| `page_limit` | number | Maximum pages to crawl | +| `allowed_domains` | string[] | Domains 
allowed to crawl | +| `exclude_paths` | string[] | URL paths to exclude (glob patterns) | +| `include_paths` | string[] | URL paths to include (glob patterns) | +| `proxy_server` | string | Proxy server URL | + +## Proxy Integration + +WaterCrawl supports both datacenter and residential proxies: + +```javascript +// Using team proxies (configured in dashboard) +const crawlRequest = await client.createCrawlRequest( + 'https://example.com', + { proxy_server: 'team' }, // Use team proxy list + {} +); + +// Using custom proxy +const crawlRequest = await client.createCrawlRequest( + 'https://example.com', + { proxy_server: 'http://user:pass@proxy.example.com:8080' }, + {} +); +``` + +**Proxy tiers by plan**: +- Free: Team proxies only +- Startup: Datacenter proxies (10+ locations) +- Growth+: Premium residential proxies (40+ locations) + +## Self-Hosted Deployment + +WaterCrawl can be self-hosted using Docker: + +```bash +# Clone repository +git clone https://github.com/watercrawl/WaterCrawl.git +cd WaterCrawl + +# Configure environment +cp .env.example .env +# Edit .env with your settings + +# Start with Docker Compose +docker-compose up -d +``` + +See [DEPLOYMENT.md](https://github.com/watercrawl/WaterCrawl/blob/main/DEPLOYMENT.md) for full self-hosting guide. 
+
+## Plugin System
+
+WaterCrawl supports custom plugins for content processing:
+
+```bash
+# Install plugin base
+pip install watercrawl-plugin
+
+# Example: OpenAI content extraction
+pip install watercrawl-openai
+```
+
+**Available plugins**:
+- `watercrawl-openai`: LLM-powered content extraction
+- `watercrawl-plugin`: Base library for custom plugins
+
+## Comparison with Other Tools
+
+| Feature | WaterCrawl | Crawl4AI | Firecrawl |
+|---------|-----------|----------|-----------|
+| **Type** | Cloud API + Self-host | Self-hosted | Cloud API |
+| **Web Search** | Yes | No | No |
+| **CAPTCHA Solving** | No | Yes (CapSolver) | No |
+| **Open Source** | Yes | Yes | Partial |
+| **Free Tier** | 1,000 pages/month | Unlimited | 500 pages/month |
+| **Proxy Support** | Yes (datacenter + residential) | Yes | Yes |
+| **Plugin System** | Yes | Yes | No |
+| **JavaScript Rendering** | Yes | Yes | Yes |
+
+**Choose WaterCrawl when**:
+- You need web search integration
+- You want a self-hosted crawler with a full team dashboard
+- You need quick API access via the managed cloud service
+- You want the flexibility to move between cloud and self-hosted
+
+**Choose Crawl4AI when**:
+- You need high-volume local crawling
+- You need CAPTCHA solving
+- You want full control over the crawler
+- You're building a RAG pipeline
+
+## Troubleshooting
+
+### API Key Issues
+
+```bash
+# Check if key is configured
+bash .agent/scripts/watercrawl-helper.sh status
+
+# Reconfigure key
+bash .agent/scripts/watercrawl-helper.sh api-key YOUR_NEW_KEY
+```
+
+### Rate Limiting
+
+Free tier limits (cloud API only; self-hosted deployments are not limited):
+- 1,000 pages/month
+- 100 pages/day
+- Max depth: 2
+- Max pages per crawl: 50
+- 1 concurrent crawl
+
+Upgrade at https://app.watercrawl.dev for higher limits. 
+ +### Connection Issues + +```bash +# Test API connectivity +curl -H "Authorization: Bearer $WATERCRAWL_API_KEY" \ + https://app.watercrawl.dev/api/v1/core/crawl-requests/ +``` + +## Resources + +- **Dashboard**: https://app.watercrawl.dev +- **Documentation**: https://docs.watercrawl.dev +- **API Reference**: https://docs.watercrawl.dev/api/documentation/ +- **GitHub**: https://github.com/watercrawl/WaterCrawl +- **Node.js SDK**: https://github.com/watercrawl/watercrawl-nodejs +- **Python SDK**: https://github.com/watercrawl/watercrawl-py +- **Discord**: https://discord.com/invite/8bwgBWeXYr