diff --git a/documentation/src/pages/recipes/data/recipes/data-analysis-pipeline.yaml b/documentation/src/pages/recipes/data/recipes/data-analysis-pipeline.yaml new file mode 100644 index 000000000000..562ca1b37ae3 --- /dev/null +++ b/documentation/src/pages/recipes/data/recipes/data-analysis-pipeline.yaml @@ -0,0 +1,150 @@ +version: 1.0.0 +title: Data Analysis Pipeline +description: An advanced data analysis workflow that orchestrates multiple sub-recipes to clean, analyze, visualize, and report on datasets with intelligent format detection and conditional processing +author: + contact: ARYPROGRAMMER + +activities: + - Detect and validate data file format (CSV, JSON, Excel, Parquet) + - Perform automated data cleaning and quality assessment + - Conduct statistical analysis and identify patterns + - Generate interactive visualizations and charts + - Create comprehensive markdown reports with insights + - Export results in multiple formats + +instructions: | + You are a Data Analysis Pipeline orchestrator that intelligently processes datasets through multiple specialized stages. + + Your workflow: + 1. Detect the data format and validate structure + 2. Clean data and handle missing values + 3. Perform statistical analysis based on data type + 4. Generate appropriate visualizations + 5. Compile comprehensive reports + + Use sub-recipes for specialized tasks and coordinate their execution based on data characteristics. + Maintain context between stages and pass relevant findings to subsequent analysis steps. 
+ +parameters: + - key: data_file + input_type: string + requirement: required + description: Path to the data file to analyze (supports CSV, JSON, Excel, Parquet) + + - key: analysis_type + input_type: string + requirement: optional + default: "comprehensive" + description: Type of analysis - options are 'quick', 'comprehensive', 'statistical', 'exploratory' + + - key: output_dir + input_type: string + requirement: optional + default: "./analysis_output" + description: Directory where analysis results and visualizations will be saved + + - key: include_visualizations + input_type: string + requirement: optional + default: "true" + description: Whether to generate visualizations (true/false) + + - key: report_format + input_type: string + requirement: optional + default: "markdown" + description: Output report format - options are 'markdown', 'html', 'pdf' + +sub_recipes: + - name: "data_validator" + path: "./subrecipes/data-validator.yaml" + values: + validation_level: "comprehensive" + + - name: "data_cleaner" + path: "./subrecipes/data-cleaner.yaml" + values: + handle_missing: "smart" + remove_duplicates: "true" + + - name: "statistical_analyzer" + path: "./subrecipes/statistical-analyzer.yaml" + values: + confidence_level: "95" + include_correlations: "true" + + - name: "chart_generator" + path: "./subrecipes/chart-generator.yaml" + values: + chart_style: "modern" + color_scheme: "viridis" + +extensions: + - type: builtin + name: developer + display_name: Developer + timeout: 600 + bundled: true + description: For file operations, data processing, and script execution + + - type: builtin + name: memory + display_name: Memory + timeout: 300 + bundled: true + description: For storing analysis context and intermediate results across stages + + - type: stdio + name: filesystem + cmd: npx + args: + - -y + - "@modelcontextprotocol/server-filesystem" + - "{{ output_dir }}" + timeout: 300 + description: Enhanced filesystem operations for managing analysis outputs + 
+prompt: | + Analyze {{ data_file }} with {{ analysis_type }} mode. Output to {{ output_dir }}. + + CRITICAL: Handle file paths correctly for all operating systems. + - Detect the operating system (Windows/Linux/Mac) + - Use appropriate path separators (/ for Unix, \\ for Windows) + - Be careful to avoid escaping of slash or backslash characters + - Use os.path.join() or pathlib.Path for cross-platform paths + - Create output directories if they don't exist + + Workflow: + 1. Validate: Run data_validator subrecipe on {{ data_file }} + - Store validation results in memory + - Check for critical issues before proceeding + + 2. Clean: If issues found, run data_cleaner subrecipe + - Pass validation results to cleaner + - Handle cleaning errors gracefully + + {% if analysis_type == "statistical" or analysis_type == "comprehensive" %} + 3. Analyze: Run statistical_analyzer for stats and correlations + - Use cleaned data if available + - Store analysis results in memory + {% endif %} + + {% if include_visualizations == "true" %} + 4. Visualize: Run chart_generator for key charts + - Create output directory structure + - Handle visualization errors + {% endif %} + + 5. Report: Create brief {{ report_format }} summary + - Save to {{ output_dir }}/report.{{ report_format }} + - Use OS-compatible path construction + + Error Recovery: + - If a sub-recipe fails, continue with remaining stages if possible + - Log errors clearly with stage information + - Provide partial results if complete analysis fails + + For {{ analysis_type }}=="quick", skip heavy computations. Be efficient. + Use memory extension to pass results between stages. + Always verify paths work on the current OS before file operations. 
+ \ No newline at end of file diff --git a/documentation/src/pages/recipes/data/recipes/subrecipes/chart-generator.yaml b/documentation/src/pages/recipes/data/recipes/subrecipes/chart-generator.yaml new file mode 100644 index 000000000000..86b47af02986 --- /dev/null +++ b/documentation/src/pages/recipes/data/recipes/subrecipes/chart-generator.yaml @@ -0,0 +1,163 @@ +version: 1.0.0 +title: Chart Generator +description: Creates professional, publication-ready visualizations and interactive charts from data analysis results + +instructions: | + You are a data visualization expert. Create clear, informative, and aesthetically pleasing + visualizations that effectively communicate data insights. + + Use best practices in data visualization and choose appropriate chart types for different data patterns. + +parameters: + - key: data_file + input_type: string + requirement: required + description: Path to the data file to visualize + + - key: chart_style + input_type: string + requirement: optional + default: "modern" + description: Visual style - options are 'modern', 'classic', 'minimal', 'publication' + + - key: color_scheme + input_type: string + requirement: optional + default: "viridis" + description: Color palette - options are 'viridis', 'plasma', 'blues', 'greens', 'categorical' + + - key: output_format + input_type: string + requirement: optional + default: "png" + description: Image format - options are 'png', 'svg', 'pdf', 'html' + + - key: interactive + input_type: string + requirement: optional + default: "false" + description: Generate interactive visualizations using Plotly (true/false) + +extensions: + - type: builtin + name: developer + timeout: 600 + bundled: true + description: For creating and saving visualizations + +prompt: | + Create simple, clear visualizations for {{ data_file }}. + Style: {{ chart_style }}, Colors: {{ color_scheme }} + + Important: Handle file paths correctly for all operating systems. 
+ Use os.path.join() or pathlib for directory creation. + Detect OS for proper path separators. + + Quick viz workflow: + 1. Load data with pandas + 2. Create visualizations directory if not exists + 3. Create 2-3 essential charts: + - Histogram for numerical columns + - Bar chart for categorical columns + - Correlation heatmap if multiple numeric columns + 4. Save as {{ output_format }} to ./visualizations/ with OS-compatible paths + + Error handling: + - Create output directory if missing + - Handle matplotlib backend issues + - Skip charts if data insufficient + - Report any visualization errors + + Use matplotlib or seaborn. Keep it simple and fast. + + 3. Visualization Standards + Apply these best practices: + - Clear, descriptive titles + - Labeled axes with units + - Legends when multiple series + - Grid lines for readability (subtle) + - Appropriate aspect ratios + - Consistent color usage + - Annotations for key insights + + 4. Style Application + {% if chart_style == "modern" %} + - Clean, minimalist design + - Sans-serif fonts + - Subtle grid lines + - High contrast colors + - White/light backgrounds + {% elif chart_style == "classic" %} + - Traditional academic style + - Serif fonts + - Visible grid lines + - Conservative colors + - Formal layout + {% elif chart_style == "minimal" %} + - Absolute minimum decoration + - No grid lines + - Simple colors + - Maximum data-ink ratio + {% elif chart_style == "publication" %} + - Journal-ready formatting + - High DPI + - Grayscale-safe colors + - Clear for print + {% endif %} + + 5. Interactive Features (if enabled) + {% if interactive == "true" %} + Create interactive HTML visualizations with: + - Hover tooltips showing exact values + - Zoom and pan capabilities + - Toggle legend items + - Download buttons + - Responsive layout + Use Plotly or Bokeh for interactivity. + {% endif %} + + 6. Chart Gallery Generation + Create the following visualization set: + + A. 
Distribution Analysis (for each numerical column) + - Histogram with KDE overlay + - Box plot + - Q-Q plot for normality + + B. Relationship Analysis + - Correlation heatmap (if multiple numerical columns) + - Scatter plots for top correlated pairs + - Pair plot matrix + + C. Category Analysis (if categorical data exists) + - Bar charts for categorical frequencies + - Grouped bar charts for relationships + - Proportional visualizations + + D. Time Series (if temporal data exists) + - Line plots with trend lines + - Seasonal decomposition + - Moving averages + + E. Overview Dashboard + - Combined multi-panel figure + - Key metrics summary + - Highlight important patterns + + 7. Save Visualizations + - Save each chart with descriptive filename + - Use {{ output_format }} format + - Organize in subdirectories by type + - Create index.html gallery if interactive + - Generate thumbnail versions + + 8. Visualization Report + Create a summary document with: + - List of all generated visualizations + - Description of each chart + - Key insights visible in each viz + - Recommendations for interpretation + - Technical notes (libraries, settings used) + + Focus on clarity and insight communication over decorative elements. + Ensure all visualizations are self-explanatory and publication-ready. \ No newline at end of file diff --git a/documentation/src/pages/recipes/data/recipes/subrecipes/data-cleaner.yaml b/documentation/src/pages/recipes/data/recipes/subrecipes/data-cleaner.yaml new file mode 100644 index 000000000000..23657c5edef4 --- /dev/null +++ b/documentation/src/pages/recipes/data/recipes/subrecipes/data-cleaner.yaml @@ -0,0 +1,65 @@ +version: 1.0.0 +title: Data Cleaner +description: Performs intelligent data cleaning including missing value handling, duplicate removal, and outlier treatment + +instructions: | + You are a data cleaning specialist. Clean and prepare datasets for analysis while + preserving data integrity and documenting all transformations applied. 
 + + Use smart, context-aware cleaning strategies appropriate for the data type and domain. + +parameters: + - key: data_file + input_type: string + requirement: required + description: Path to the data file to clean + + - key: handle_missing + input_type: string + requirement: optional + default: "smart" + description: Strategy for missing values - options are 'remove', 'mean', 'median', 'mode', 'smart' + + - key: remove_duplicates + input_type: string + requirement: optional + default: "true" + description: Whether to remove duplicate rows (true/false) + + - key: outlier_strategy + input_type: string + requirement: optional + default: "flag" + description: How to handle outliers - options are 'keep', 'remove', 'flag', 'cap' + +extensions: + - type: builtin + name: developer + timeout: 600 + bundled: true + description: For data manipulation and cleaning operations + +prompt: | + Clean {{ data_file }} efficiently using pandas. + + Important: Handle file paths correctly for all operating systems. + Use os.path.join() or pathlib for cross-platform compatibility. + Detect OS and use appropriate separators. + + Strategy: {{ handle_missing }} for missing, {{ remove_duplicates }} duplicates, {{ outlier_strategy }} outliers + + Quick cleaning steps: + 1. Load data with pd.read_csv/json + 2. Drop duplicates if {{ remove_duplicates }}=="true" + 3. Handle missing: + - {{ handle_missing }}=="smart": fillna(median) for numeric, fillna(mode) for objects + - {{ handle_missing }}=="remove": dropna() + 4. Outliers: {% if outlier_strategy == "remove" %}IQR method{% else %}keep all{% endif %} + 5. Save as <original_filename>_cleaned.csv in same directory as input + + Error handling: + - Catch and report file I/O errors + - Handle empty dataframes gracefully + - Validate operations succeeded before proceeding + + Return brief report: rows before/after, columns cleaned, transformations applied. 
\ No newline at end of file diff --git a/documentation/src/pages/recipes/data/recipes/subrecipes/data-validator.yaml b/documentation/src/pages/recipes/data/recipes/subrecipes/data-validator.yaml new file mode 100644 index 000000000000..c1be6b97ffc7 --- /dev/null +++ b/documentation/src/pages/recipes/data/recipes/subrecipes/data-validator.yaml @@ -0,0 +1,57 @@ +version: 1.0.0 +title: Data Validator +description: Validates data file format, structure, and integrity with comprehensive error checking + +instructions: | + You are a data validation specialist. Your job is to thoroughly examine data files and + report on their structure, quality, and any issues that need attention before analysis. + + Be comprehensive but efficient in your validation checks. + +parameters: + - key: data_file + input_type: string + requirement: required + description: Path to the data file to validate + + - key: validation_level + input_type: string + requirement: optional + default: "standard" + description: Depth of validation - options are 'quick', 'standard', 'comprehensive' + +extensions: + - type: builtin + name: developer + timeout: 300 + bundled: true + description: For reading and examining data files + +prompt: | + Validate {{ data_file }} efficiently and concisely. + + Important: Handle file paths correctly for all operating systems. + Detect OS and use appropriate path separators. Avoid escaping issues. + + Use pandas to quickly analyze: + 1. Load data and detect format (CSV/JSON/Excel) + 2. Get basic info: shape, columns, dtypes, missing values + 3. Check for duplicates + {% if validation_level == "comprehensive" %} + 4. 
Compute basic statistics and detect outliers using describe() + {% endif %} + + Return a JSON report with: + - file_format, rows, columns + - data_types dict + - missing_values dict + - duplicates_count + - quality_score (0-100) + - issues list + + Error handling: + - If file not found, return error with clear message + - If format unsupported, suggest alternatives + - If loading fails, check encoding (try utf-8, latin1) + + Keep code simple, use pandas built-in methods, avoid loops. \ No newline at end of file diff --git a/documentation/src/pages/recipes/data/recipes/subrecipes/statistical-analyzer.yaml b/documentation/src/pages/recipes/data/recipes/subrecipes/statistical-analyzer.yaml new file mode 100644 index 000000000000..79464356705f --- /dev/null +++ b/documentation/src/pages/recipes/data/recipes/subrecipes/statistical-analyzer.yaml @@ -0,0 +1,58 @@ +version: 1.0.0 +title: Statistical Analyzer +description: Performs comprehensive statistical analysis including descriptive statistics, distributions, correlations, and hypothesis testing + +instructions: | + You are a statistical analysis expert. Perform rigorous statistical analysis on datasets + and extract meaningful insights using appropriate statistical methods. + + Explain findings in both technical and accessible terms. + +parameters: + - key: data_file + input_type: string + requirement: required + description: Path to the cleaned data file to analyze + + - key: confidence_level + input_type: string + requirement: optional + default: "95" + description: Confidence level for statistical tests (90, 95, 99) + + - key: include_correlations + input_type: string + requirement: optional + default: "true" + description: Whether to compute correlation matrix (true/false) + +extensions: + - type: builtin + name: developer + timeout: 300 + bundled: true + description: For statistical computations and analysis + +prompt: | + Quick statistical analysis of {{ data_file }} at {{ confidence_level }}% confidence. 
+ + Important: Handle file paths correctly for all operating systems. + Detect OS and use appropriate path separators (forward slash / or backslash \). + Be careful to avoid escaping issues with path characters. + + Use pandas efficiently: + 1. Load data with pd.read_csv or appropriate method + 2. Run df.describe() for numerical stats (mean, std, quartiles) + 3. df.value_counts() for categorical columns + {% if include_correlations == "true" %} + 4. df.corr() for correlation matrix + {% endif %} + 5. Print summary report with key insights + + Be concise. Avoid complex computations unless needed. + + If you encounter errors: + - Check file path formatting + - Verify file exists and is readable + - Handle missing dependencies gracefully + - Provide clear error messages \ No newline at end of file