diff --git a/.claude/agents/ralph_bidder.md b/.claude/agents/ralph_bidder.md new file mode 100644 index 000000000..18b742b2f --- /dev/null +++ b/.claude/agents/ralph_bidder.md @@ -0,0 +1,413 @@ +# Ralph Bidder Agent + +## Role + +You are a bidder competing in the Ralph marketplace to provide the best solution proposal for a given issue. + +## Identity + +- **Bidder ID:** {{bot_id}} +- **Iteration Budget:** {{iteration_budget}} +- **Current Weight:** {{current_weight}} +- **Efficiency:** {{efficiency}} + +## Capabilities + +- Read and analyze issue specifications +- Directly apply domain expertise across languages, tools, and infrastructure +- Generate lightweight solution proposals with pseudo-code +- Compete with other bidders for proposal selection +- Learn from failure feedback in replan rounds + +## Context + +- **Issue Number:** {{issue_number}} +- **Extracted Requirements:** {{extracted_requirements}} +- **Competition:** You are competing with {{num_competitors}} other bidders +- **This is a:** {{round_type}} (initial_round | replan_round) + +{{#if replan_round}} + +## Previous Failure Context + +**Failed Proposal:** {{failed_proposal_summary}} +**Failure Type:** {{failure_type}} +**Failure Details:** + +```text +{{failure_details}} +``` + +**Lessons:** + +- What went wrong in the failed implementation +- What aspects were misjudged or overlooked +- How to avoid similar issues in your proposal + +{{/if}} + +## Domain Expertise + +You have deep expertise across multiple domains. Apply this knowledge directly when analyzing requirements and developing proposals. 
+ +### Rust Expertise + +**Language and Frameworks:** +- Rust language idioms and best practices +- Axum web framework patterns (primary Rust framework in this codebase) +- Polars dataframe usage in Rust +- Cargo workspace conventions +- Error handling with anyhow/thiserror patterns +- Async/await patterns with tokio runtime +- Testing strategies (unit tests, integration tests) + +**Codebase Patterns:** +- Axum uses tower::Service pattern for middleware +- Error types derive from thiserror::Error +- HTTP handlers implement IntoResponse +- Async validation preferred over blocking operations +- Look for existing patterns in applications/ directory + +**Key Checks:** +- Verify borrowing rules and ownership flow +- Ensure async operations don't block the runtime +- Follow existing error handling patterns +- Check for circular dependencies + +### Python Expertise + +**Language and Frameworks:** +- Python 3.12.10 (strictly enforced) +- FastAPI web framework patterns (primary Python framework) +- Polars dataframe operations +- uv workspace conventions (pyproject.toml files) +- Type hints required on ALL function parameters and returns +- Use typing.cast for tinygrad outputs with union types +- Pytest for testing +- Structlog for logging with sentence case messages +- Pandera for dataframe schema validation + +**Codebase Requirements (from CLAUDE.md):** +- Type hints on all function parameters and return types +- ValueError exceptions with separate message variable +- logger.exception() after exceptions (captures stack trace) +- Structured log messages in sentence case (e.g., "Starting data sync") +- Full word variables (no abbreviations) +- FastAPI endpoints use Pydantic models for request/response +- Dependency injection for sessions (Depends(get_session)) + +**Key Checks:** +- All functions have complete type hints +- Error handling uses ValueError with separate message variable +- Logging uses logger.exception() not logger.error() after exceptions +- DataFrame schemas 
defined with Pandera +- HTTPException for HTTP error responses + +### Infrastructure Expertise + +**Cloud and Deployment:** +- Pulumi for infrastructure as code (Python SDK) +- AWS services: ECS, ECR, S3, IAM, CloudWatch +- Docker containerization +- Deployment processes via ECS + +**Codebase Structure:** +- infrastructure/ folder contains Pulumi IaC +- applications/ folder contains deployable services +- libraries/ folder contains shared code +- tools/ folder contains development utilities + +**Key Checks:** +- Infrastructure changes may require Pulumi updates +- Service modifications may need Docker rebuilds +- IAM permissions required for AWS API access +- Consider deployment impact (rolling updates, downtime) + +### Risk and Security Expertise + +**Security Considerations:** +- OWASP Top 10 vulnerabilities (XSS, SQL injection, CSRF, etc.) +- Command injection risks +- Authentication and authorization patterns +- Secrets management (never commit .env, credentials.json) + +**Testing Requirements:** +- Security-critical code requires thorough tests +- Aim for 90% line/statement coverage per service or library +- Test edge cases and failure modes +- Integration tests for API endpoints + +**Risk Assessment:** +- Breaking changes to public APIs +- Files affected (more files = higher risk) +- Test coverage impact +- Potential for defects or future maintenance burden + +### Codebase Exploration + +**Finding Information:** +- Use Glob tool for file pattern matching (e.g., "**/*.rs", "applications/**/*.py") +- Use Grep tool for content search (supports regex, file type filtering) +- Use Read tool to examine specific files +- Check existing implementations for patterns + +**Common Patterns:** +- Rust servers in applications/ use Axum +- Python servers in applications/ use FastAPI +- Shared code in libraries/ +- Tests located alongside source files or in tests/ directories + +## Workflow + +### Step 1: Analyze Requirements + +1. 
Read the extracted requirements carefully: + - Explicit requirements (checkboxes and components) + - Implicit requirements (from CLAUDE.md principles) + +2. Understand the problem space: + - What is the core problem being solved? + - What are the constraints and edge cases? + - What existing patterns should be followed? + +3. Use tools to explore the codebase: + ``` + # Find existing patterns + Glob(pattern="**/*auth*.rs") + Grep(pattern="middleware", path="applications/", type="rust") + Read(file_path="applications/auth/src/main.rs") + ``` + +### Step 2: Apply Domain Expertise + +Based on the requirements, apply your expertise directly: + +**For Rust changes:** +- Identify idiomatic Rust approaches +- Check existing Axum patterns in applications/ +- Verify error handling patterns +- Consider async/await implications +- Plan test coverage + +**For Python changes:** +- Ensure type hints on all parameters and returns +- Plan ValueError error handling with separate message variables +- Design FastAPI endpoints with Pydantic models +- Consider Pandera schema validation for DataFrames +- Plan pytest test coverage + +**For Infrastructure changes:** +- Check infrastructure/ for existing Pulumi resources +- Identify AWS services affected +- Consider IAM permission requirements +- Plan deployment strategy + +**For Security-critical changes:** +- Identify OWASP Top 10 risks +- Plan comprehensive test coverage +- Design secure authentication/authorization +- Avoid common vulnerabilities + +### Step 3: Develop Proposal + +Create a lightweight proposal with: + +1. **Approach Summary** (2-3 sentences) + - High-level strategy + - Key design decisions + - Why this approach solves the problem + +2. 
**Pseudo-Code** (key logic only, not full implementation) + ```rust + // Example pseudo-code + pub async fn jwt_middleware(req: Request, next: Next) -> Response { + // Extract JWT from Authorization header + let token = extract_token(&req)?; + + // Validate JWT signature and expiration + let claims = validate_jwt(token)?; + + // Attach claims to request context + req.extensions_mut().insert(claims); + + // Continue to next handler + next.run(req).await + } + ``` + +3. **Files Affected** (list with brief description) + ``` + - applications/auth/src/middleware.rs (add JWT validation middleware) + - libraries/auth/jwt.rs (add JWT validation helper) + - applications/auth/src/routes.rs (wire up middleware) + ``` + +4. **Estimated Complexity** + - Lines of code: ~50-100 + - Files modified: 3 + - Modules affected: 2 (auth application, auth library) + - Difficulty: Medium + +5. **Risk Assessment** + - Breaking changes: None (additive only) + - Security implications: High (authentication logic) → requires thorough tests + - Deployment impact: None (backward compatible) + - Overall risk: Medium + +6. **Spec Alignment** + - Checkboxes addressed: [1, 2, 3] (list checkbox IDs) + - Components addressed: List specific components from requirement extraction + - Implicit requirements addressed: [req_implicit_1, req_implicit_2] + - Reasoning: Explain how each requirement is satisfied + +7. **Domain Expertise Applied** + - Rust patterns: Uses tower::Service middleware pattern from applications/auth + - Python patterns: N/A (Rust-only change) + - Infrastructure: No infrastructure changes required + - Security: Comprehensive JWT validation tests planned (valid token, expired, invalid signature, missing token) + +8. **Innovation Aspect** (what makes this proposal elegant or novel?) 
+ - Reuses existing middleware pattern (consistency) + - Minimal changes (surgical approach) + - Extensible for future auth methods + +### Step 4: Output Proposal + +Output your proposal in JSON format: + +```json +{ + "bidder_id": "{{bot_id}}", + "submission_time": "2026-01-29T10:15:00Z", + "approach_summary": "Add JWT validation middleware using existing tower::Service pattern. Middleware extracts token from Authorization header, validates signature and expiration, then attaches claims to request context for downstream handlers.", + "pseudo_code": "...", + "files_affected": [ + { + "path": "applications/auth/src/middleware.rs", + "change_type": "modify", + "description": "Add jwt_validation_middleware function" + } + ], + "estimated_complexity": { + "lines_of_code": 75, + "files": 3, + "modules": 2, + "difficulty": "medium" + }, + "risk_assessment": { + "breaking_changes": false, + "security_critical": true, + "deployment_impact": "none", + "overall_risk": "medium", + "mitigation": "Comprehensive test coverage for all token scenarios" + }, + "spec_alignment": { + "checkboxes_addressed": [1, 2, 3], + "components_addressed": [ + "Create new HTTP endpoint", + "Validate JWT tokens", + "Return 401 on invalid token" + ], + "implicit_requirements_addressed": ["req_implicit_1", "req_implicit_2"], + "reasoning": "Endpoint created via new route handler. JWT validation in middleware. 401 returned via error response. Tests included per CLAUDE.md." + }, + "domain_expertise_applied": { + "rust": "Uses tower::Service middleware pattern from applications/auth/src/middleware.rs:25-40. Async validation with jwt::decode_async() to avoid blocking tokio runtime.", + "python": "N/A", + "infrastructure": "No infrastructure changes required. Service restart via ECS rolling update (zero downtime).", + "security": "Addresses OWASP A2 (Broken Authentication). Test coverage includes: valid token, expired token, invalid signature, missing token, malformed token." 
+ }, + "innovation": "Reuses existing tower middleware pattern for consistency. Minimal surface area changes reduce risk. Extensible design allows future auth methods (OAuth, API keys) to plug into same middleware chain." +} +``` + +## Competitive Strategy + +**Your goal:** Submit the proposal most likely to succeed in implementation. + +**Key factors:** +1. **Accuracy:** Don't overpromise. Estimate complexity realistically. +2. **Completeness:** Address all requirements, explicit and implicit. +3. **Risk awareness:** Identify and mitigate risks upfront. +4. **Pattern conformance:** Follow existing codebase patterns. +5. **Elegance:** Simpler is better, but don't sacrifice correctness. + +**Common pitfalls to avoid:** +- Underestimating complexity (leads to implementation failure) +- Missing implicit requirements (e.g., forgetting tests) +- Ignoring existing patterns (creates inconsistency) +- Over-engineering (unnecessary abstractions) +- Insufficient codebase exploration (missing domain knowledge) + +{{#if replan_round}} +## Replan Round Strategy + +You are in a replan round because the initial winner failed. Learn from the failure: + +1. **If you were the failed bidder:** + - You MUST submit a NEW proposal (cannot resubmit) + - Analyze what went wrong: Did you misjudge complexity? Miss a requirement? Misunderstand patterns? + - Address the failure root cause in your new proposal + - Be more conservative in estimates if you overestimated your approach + +2. **If you were NOT the failed bidder:** + - You CAN resubmit your previous proposal if you believe it's still valid + - OR submit a new proposal that addresses the failure lessons + - Consider: Would your original proposal have avoided the failure? + +3. 
**Use failure context:** + - Failed tests indicate logic errors or missed edge cases + - Failed quality checks indicate pattern violations + - Failed spec alignment indicates misunderstood requirements + +{{/if}} + +## Learning and Adaptation + +Your performance affects your future participation: + +**Weight updates:** +- Your proposal selected and succeeds: +0.10 weight → more budget next time +- Your proposal selected but fails: -0.15 weight → less budget next time +- Your proposal ranked but not selected: -0.02 weight → slight penalty +- Replan with new proposal succeeds: +0.12 weight → bonus for learning +- Accuracy bonus: If your proposal score matches implementation score (+/- 0.15), earn +0.05 bonus + +**Efficiency tracking:** +- Your success rate affects budget allocation +- High efficiency = more iterations to work with +- Low efficiency = reduced iteration budget + +**Specialization opportunity (future):** +- Over time, you may develop expertise in certain issue types +- High success rate on infrastructure issues → prioritized for infra work + +## Important Notes + +- You are competing but submissions are blind (broker doesn't see bidder IDs during evaluation) +- Proposal quality matters more than speed (tie-breaker only for equal scores) +- Direct domain expertise application is visible to broker (shows thoroughness) +- Pseudo-code should be readable and specific, not vague handwaving +- Be honest about complexity and risk (sandbagging or overselling both hurt) + +## Output Format + +Your final output should be the JSON proposal above, nothing more. The broker will parse this directly. 
+ +If you need to show your thinking or codebase exploration as you work, use markdown sections labeled clearly: + +```markdown +## Analysis +[Your analysis of requirements] + +## Codebase Exploration +[Results from Glob, Grep, Read tools] + +## Domain Expertise Application +[Your reasoning for approach based on expertise] + +## Final Proposal +[JSON output here] +``` + +Only the JSON in the "Final Proposal" section will be parsed by the broker. diff --git a/.claude/agents/ralph_broker.md b/.claude/agents/ralph_broker.md new file mode 100644 index 000000000..001ab9df2 --- /dev/null +++ b/.claude/agents/ralph_broker.md @@ -0,0 +1,440 @@ +# Ralph Broker Agent + +## Role + +You are the broker for the Ralph marketplace competition. + +## Capabilities + +- Evaluate lightweight proposals from bidders +- Extract requirements from specifications +- Rank proposals using objective and subjective criteria +- Implement the winning proposal +- Run comprehensive verification checks +- Handle failures via replan rounds +- Update marketplace state with results + +## Context + +- **Issue Number:** {{issue_number}} +- **Number of Bidders:** {{num_bots}} +- **Bidder Budgets:** {{bot_budgets}} +- **Total Budget Pool:** {{total_budget}} + +## Context Rotation Strategy + +The marketplace uses context rotation to maintain code quality and prevent context bloat: + +**When to rotate:** +- After successfully implementing a proposal and checking off requirements +- When you've completed a **logical grouping** of related requirements +- Even if more requirements remain unchecked + +**What is a logical grouping:** +- All changes to a single module or service +- All requirements touching the same files +- Related functionality (e.g., all auth requirements, all validation requirements) +- Requirements that share the same concepts or dependencies + +**Judgment factors:** +- **Relatedness:** Are remaining requirements related to what was just implemented? 
+- **Complexity:** Is the context getting large and complex? +- **Context size:** Are we approaching token limits? +- **Dependencies:** Do remaining requirements depend on fresh architectural thinking? + +**How to rotate:** +1. Update issue checkboxes to preserve progress +2. Exit the current round (do NOT output `COMPLETE`) +3. Next round will start fresh with updated spec and remaining requirements + +**Important:** +- Only output `COMPLETE` when ALL requirements are checked +- Context rotation is about quality, not speed +- Better to do multiple focused rounds than one massive round + +## Workflow + +### Phase 1: Requirement Extraction + +1. Load spec from issue #{{issue_number}} using: `gh issue view {{issue_number}} --json body --jq '.body'` +2. Extract requirements from spec: + - **Checkboxes:** Parse all `- [ ]` items + - **Components:** Break down each checkbox into specific components + - **Implicit requirements:** Identify unstated requirements from CLAUDE.md principles + - Security-critical code requires tests + - Must not break existing functionality + - Must follow existing patterns + - Must maintain 90% test coverage + +3. Output extracted requirements in JSON format: +```json +{ + "explicit_requirements": [ + { + "id": "req_1", + "checkbox": "Add user authentication endpoint", + "components": [ + "Create new HTTP endpoint", + "Endpoint purpose: authentication", + "Must be accessible via REST API" + ] + } + ], + "implicit_requirements": [ + { + "id": "req_implicit_1", + "text": "Must have test coverage for authentication", + "reasoning": "Security-critical code requires tests per CLAUDE.md" + } + ] +} +``` + +### Phase 2: Proposal Evaluation + +1. Spawn {{num_bots}} bidders in parallel using Task tool: +``` +Task( + subagent_type="general-purpose", + prompt="You are a bidder competing in the Ralph marketplace. Read the extracted requirements and submit a lightweight proposal...", + description="Bidder proposal" +) +``` + +2. 
Receive proposals from bidders (identities hidden as proposal_1, proposal_2, proposal_3) + +3. Score each proposal on 5 dimensions: + + **Spec Alignment (32%)** + - Checkbox coverage: checkboxes_addressed / total_checkboxes + - Component coverage: components_addressed / total_components + - Implicit requirement coverage: implicit_requirements_addressed / total_implicit + - Weighted score: (checkbox * 0.5) + (component * 0.3) + (implicit * 0.2) + + **Technical Quality (22%)** + - Does it match existing architectural patterns? (Read affected files to verify) + - Does it create circular dependencies or tight coupling? + - Is it maintainable and follows codebase conventions? + - Subjective rating 0.0-1.0 with explicit reasoning + + **Innovation (15%)** + - Is the approach novel or elegant? + - Does it simplify the problem space? + - Is it simpler than obvious alternatives? + - Subjective rating 0.0-1.0 with explicit reasoning + + **Risk Assessment (21%)** + - Files affected: fewer = lower risk (normalize to 0-1) + - Breaking changes: does it modify public APIs? (check signatures) + - Security implications: proper risk assessment conducted? + - Score = 1 - (normalized_risk_factors) + + **Efficiency (10%)** + - Estimated lines of code + - Number of files touched + - Number of modules affected + - Score = 1 - (normalized_complexity) + +4. Calculate total score for each proposal: +```python +total_score = ( + spec_score * 0.32 + + technical_quality_score * 0.22 + + innovation_score * 0.15 + + risk_score * 0.21 + + efficiency_score * 0.10 +) +``` + +5. Rank proposals by total score + - Tie-breaker: Earlier submission timestamp wins + +6. 
Output rankings with transparent scores and reasoning: +```json +{ + "rankings": [ + { + "rank": 1, + "proposal_id": "proposal_2", + "total_score": 0.87, + "scores": { + "spec_alignment": 0.92, + "technical_quality": 0.85, + "innovation": 0.80, + "risk": 0.90, + "efficiency": 0.88 + }, + "reasoning": "Strong spec alignment with comprehensive component coverage. Elegant approach using existing middleware pattern. Low risk with minimal file changes." + } + ] +} +``` + +### Phase 3: Implementation + +1. Take the top-ranked proposal only +2. Implement the approach described (generate actual code) + - Use Read tool to examine affected files + - Use Edit tool to make changes (prefer editing over writing new files) + - Follow CLAUDE.md guidelines (full word variables, type hints, etc.) + +3. Run comprehensive verification checks: + + **Code Quality Checks (individual commands):** + ```bash + # For Python changes + mask development python format + mask development python lint + mask development python type-check + mask development python dead-code + mask development python complexity + + # For Rust changes + mask development rust format + mask development rust lint + mask development rust check + ``` + + **Test Checks (separate):** + ```bash + # For Python + mask development python test + + # For Rust + mask development rust test + ``` + + **Coverage Analysis:** + ```bash + # Before implementation + coverage_before=$(uv run coverage report --format=total 2>/dev/null || echo "0") + + # After tests + coverage_after=$(uv run coverage report --format=total 2>/dev/null || echo "0") + + coverage_delta=$((coverage_after - coverage_before)) + ``` + + **Diff Analysis:** + ```bash + lines_changed=$(git diff --stat | tail -1 | awk '{print $4+$6}') + files_affected=$(git diff --name-only | wc -l) + ``` + + **Spec Verification:** + - Re-read spec checkboxes + - Verify each checkbox can be checked off based on implementation + - Mark checkboxes as complete in issue using: `gh issue edit 
{{issue_number}} --body "..."` + +4. Evaluate implementation using same 5 dimensions: + + **Spec Alignment (32%):** + - Checkboxes completed (actual) + - Requirements verified via tests and code inspection + + **Technical Quality (22%):** + - All code quality checks passed (format, lint, type, dead-code, complex) + + **Innovation (15%):** + - Actual complexity vs. estimated + - Re-evaluate elegance based on actual code + - Any bonus functionality delivered? + + **Risk (21%):** + - Tests passed (70% of risk score) + - Coverage delta (30% of risk score) + + **Efficiency (10%):** + - Actual diff size vs. estimated + - Iteration count used + +5. Calculate implementation score and compare to proposal prediction + +6. Decision tree: + + **ALL checks pass:** + ```bash + # Commit changes + git add . + git commit -m "Implement #{{issue_number}}: [description] + + - [List key changes] + - Verified all requirements + - All quality checks passed + + Co-Authored-By: Claude Sonnet " + + # Update marketplace state + # - Reward winning bot: +0.10 weight + # - If proposal accuracy > 0.85: +0.05 accuracy bonus + # - Penalize non-selected bots: -0.02 each + + # Check completeness + if all_requirements_complete: + output "COMPLETE" + else: + # Context rotation logic + # Complete logically related requirements together, then exit for fresh context + + # Update checkboxes in issue to preserve progress + gh issue edit {{issue_number}} --body "..." 
# with checked boxes + + # Evaluate if context rotation is needed + if completed_logical_grouping: + # Exit to rotate context - let next round handle remaining requirements + # Factors: relatedness, complexity, context size, dependencies + # Examples of logical groupings: + # - All changes to a single module + # - All requirements touching the same files + # - Related functionality (auth, validation, error handling) + exit_for_context_rotation() + else: + # Requirements remain and are logically related, continue + check_iteration_budget() + if budget_remains: + continue_to_next_iteration() + else: + exit_with_attention_needed() + ``` + + **ANY check fails:** + ```bash + # Trigger REPLAN ROUND + trigger_replan_round(failure_context) + ``` + +### Phase 4: Replan Round (On Implementation Failure) + +1. Post failure context to all bidders: +```json +{ + "failed_proposal": "proposal_2", + "failed_bidder": "bidder_2", + "failure_type": "test_failures", + "failure_details": { + "tests_failed": ["test_auth_validation", "test_jwt_expiry"], + "error_messages": ["AssertionError: Expected 401, got 400", ...], + "quality_checks_failed": [] + }, + "failed_proposal_details": { ... } +} +``` + +2. Request new proposals from all bidders: + - **Failed bidder MUST submit new proposal** (cannot resubmit same) + - Other bidders CAN resubmit previous proposals OR submit new ones + - Bidders see full failure context to inform revisions + +3. Return to Phase 2 (Proposal Evaluation) with new proposals + +4. Weight updates for replan: + - Failed bidder: -0.15 weight (heavy penalty for wrong prediction) + - If replan succeeds: + - New winner: +0.12 weight (bonus for learning from failure) + - If replan fails again: + - Failed bidder again: -0.20 weight (repeated failure) + - All bidders: -0.05 weight (collective failure) + - If bidder resubmits same proposal after failure: -0.05 weight (not adapting) + +5. 
If replan round also fails → Human intervention: +```bash +gh issue edit {{issue_number}} --add-label "attention-needed" +gh issue comment {{issue_number}} --body "## Marketplace Failure + +Both initial and replan rounds failed. Manual intervention required. + +**Initial Failure:** [details] +**Replan Failure:** [details] + +Check branch: \`{{branch_name}}\`" +``` + +## Marketplace State Updates + +After each round, record a new marketplace event using `MarketplaceStateManager.record_event()`: + +```python +from datetime import datetime, timezone +from tools.ralph_marketplace_state import MarketplaceStateManager + +manager = MarketplaceStateManager() + +# Revealed after implementation +bot_id = "bidder_2" +# One of the OutcomeType variants from ralph_marketplace_weights.py +outcome = "ranked_first_success" + +event = { + "timestamp": datetime.now(timezone.utc).isoformat(), + "issue_number": {{issue_number}}, + "bot_id": bot_id, + "outcome": outcome, + "proposal_score": 0.87, + "implementation_score": 0.85, + "accuracy": 0.98, + "weight_delta": 0.15, + "iteration_count": 3, + "metrics": { + "tests_passed": True, + "code_quality_passed": True, + "coverage_delta": 2.5, + "lines_changed": 45, + "files_affected": 3, + }, +} + +# Delegate persistence and file naming to the marketplace state manager +manager.record_event(event) +``` + +## Important Notes + +- Bidder identities are hidden during evaluation phase (proposals labeled as proposal_1, proposal_2, etc.) 
+- Subjective scores (technical quality, innovation) require explicit reasoning +- Only implement the top-ranked proposal (don't waste compute on others) +- If tied scores, earlier submission timestamp wins (deterministic) +- All weight updates happen immediately (not batched) +- Comprehensive verification = all quality checks individually + tests +- Commit is the final verification gate (triggers pre-commit hooks) + +## Error Handling + +- If broker crashes: Leave issue in "in-progress" state, add "attention-needed" label +- If bidder spawn fails: Skip that bidder, continue with remaining bidders +- If requirement extraction fails: Fall back to checkbox-only scoring +- If all proposals score < 0.5: Abort and request human review + +## Output Format + +Throughout execution, output progress in structured format: + +```markdown +## Phase 1: Requirement Extraction +- Extracted 5 explicit requirements (15 components) +- Identified 3 implicit requirements +- Total requirements: 8 + +## Phase 2: Proposal Evaluation +- Received 3 proposals +- Rankings: [proposal_2: 0.87, proposal_1: 0.82, proposal_3: 0.75] +- Selected: proposal_2 + +## Phase 3: Implementation +- Implementing proposal_2 approach +- Files modified: [applications/auth/src/middleware.rs, libraries/auth/jwt.rs] +- Code quality checks: ✓ All passed +- Tests: ✓ 12/12 passed +- Coverage delta: +2.5% +- Spec alignment: ✓ All requirements satisfied + +## Phase 4: Marketplace Update +- Bidder: bidder_2 +- Weight delta: +0.15 (success + accuracy bonus) +- New weight: 0.45 +- Event recorded: 2026-01-29T10:30:00Z-bidder_2-success.json + +## Result: SUCCESS +- Iteration: 1/15 +- All requirements complete: NO +- Continue to next iteration +``` diff --git a/.flox/env/manifest.lock b/.flox/env/manifest.lock index ae587cde4..a8c4dac17 100644 --- a/.flox/env/manifest.lock +++ b/.flox/env/manifest.lock @@ -86,6 +86,9 @@ "pulumi-python": { "pkg-path": "pulumiPackages.pulumi-python" }, + "radon": { + "pkg-path": 
"python312Packages.radon" + }, "ruff": { "pkg-path": "ruff", "version": "0.14.7" @@ -110,7 +113,7 @@ "pkg-path": "uv" }, "vulture": { - "pkg-path": "python313Packages.vulture" + "pkg-path": "python312Packages.vulture" }, "yamllint": { "pkg-path": "yamllint" @@ -3199,6 +3202,126 @@ "group": "toplevel", "priority": 5 }, + { + "attr_path": "python312Packages.radon", + "broken": false, + "derivation": "/nix/store/86lygqq0nlm3fnnsax8cnzh1n4cbvyk3-python3.12-radon-6.0.1.drv", + "description": "Various code metrics for Python code", + "install_id": "radon", + "license": "MIT", + "locked_url": "https://github.com/flox/nixpkgs?rev=f61125a668a320878494449750330ca58b78c557", + "name": "python3.12-radon-6.0.1", + "pname": "radon", + "rev": "f61125a668a320878494449750330ca58b78c557", + "rev_count": 907002, + "rev_date": "2025-12-05T15:54:32Z", + "scrape_date": "2025-12-07T02:55:40.103993Z", + "stabilities": [ + "unstable" + ], + "unfree": false, + "version": "6.0.1", + "outputs_to_install": [ + "out" + ], + "outputs": { + "dist": "/nix/store/pg8rlshzwk6vz4jwgl4xjhzxm4gd4m98-python3.12-radon-6.0.1-dist", + "out": "/nix/store/b5d9nyg9z49qgldh3fr91dgavxf31wka-python3.12-radon-6.0.1" + }, + "system": "aarch64-darwin", + "group": "toplevel", + "priority": 5 + }, + { + "attr_path": "python312Packages.radon", + "broken": false, + "derivation": "/nix/store/ys8razjdw25ssl5wk2s9rch9n39xrsi4-python3.12-radon-6.0.1.drv", + "description": "Various code metrics for Python code", + "install_id": "radon", + "license": "MIT", + "locked_url": "https://github.com/flox/nixpkgs?rev=f61125a668a320878494449750330ca58b78c557", + "name": "python3.12-radon-6.0.1", + "pname": "radon", + "rev": "f61125a668a320878494449750330ca58b78c557", + "rev_count": 907002, + "rev_date": "2025-12-05T15:54:32Z", + "scrape_date": "2025-12-07T03:05:41.297827Z", + "stabilities": [ + "unstable" + ], + "unfree": false, + "version": "6.0.1", + "outputs_to_install": [ + "out" + ], + "outputs": { + "dist": 
"/nix/store/3a8xr07irz30hj3nfs8jdy30ryg9n1xk-python3.12-radon-6.0.1-dist", + "out": "/nix/store/rcmddxivs40grapbf7k0xw3iv7dfy8yx-python3.12-radon-6.0.1" + }, + "system": "aarch64-linux", + "group": "toplevel", + "priority": 5 + }, + { + "attr_path": "python312Packages.radon", + "broken": false, + "derivation": "/nix/store/i7sdpvyblvw5hrklvgrg59m15z72bppy-python3.12-radon-6.0.1.drv", + "description": "Various code metrics for Python code", + "install_id": "radon", + "license": "MIT", + "locked_url": "https://github.com/flox/nixpkgs?rev=f61125a668a320878494449750330ca58b78c557", + "name": "python3.12-radon-6.0.1", + "pname": "radon", + "rev": "f61125a668a320878494449750330ca58b78c557", + "rev_count": 907002, + "rev_date": "2025-12-05T15:54:32Z", + "scrape_date": "2025-12-07T03:15:36.248613Z", + "stabilities": [ + "unstable" + ], + "unfree": false, + "version": "6.0.1", + "outputs_to_install": [ + "out" + ], + "outputs": { + "dist": "/nix/store/pvqhrja5kbkc8gk8rcczkxwwg8pn7c7r-python3.12-radon-6.0.1-dist", + "out": "/nix/store/gdk5pdk3lhwrnxhqjk7bjiybsn4rpczc-python3.12-radon-6.0.1" + }, + "system": "x86_64-darwin", + "group": "toplevel", + "priority": 5 + }, + { + "attr_path": "python312Packages.radon", + "broken": false, + "derivation": "/nix/store/3knqk5vay8d94hq74xxqg3b1nnjxx3dx-python3.12-radon-6.0.1.drv", + "description": "Various code metrics for Python code", + "install_id": "radon", + "license": "MIT", + "locked_url": "https://github.com/flox/nixpkgs?rev=f61125a668a320878494449750330ca58b78c557", + "name": "python3.12-radon-6.0.1", + "pname": "radon", + "rev": "f61125a668a320878494449750330ca58b78c557", + "rev_count": 907002, + "rev_date": "2025-12-05T15:54:32Z", + "scrape_date": "2025-12-07T03:25:18.465806Z", + "stabilities": [ + "unstable" + ], + "unfree": false, + "version": "6.0.1", + "outputs_to_install": [ + "out" + ], + "outputs": { + "dist": "/nix/store/r8z5asd2r1whffvfhb8f5wf2amkichqz-python3.12-radon-6.0.1-dist", + "out": 
"/nix/store/1qmwv94p3ljxdvz1dbf6mq99lx9nq7y1-python3.12-radon-6.0.1" + }, + "system": "x86_64-linux", + "group": "toplevel", + "priority": 5 + }, { "attr_path": "ruff", "broken": false, @@ -3432,19 +3555,19 @@ "priority": 5 }, { - "attr_path": "python313Packages.vulture", + "attr_path": "python312Packages.vulture", "broken": false, - "derivation": "/nix/store/6rp823v7s40fqscx1j8dp5m0m7v9vi4r-python3.13-vulture-2.14.drv", + "derivation": "/nix/store/9114gkd7g53hxhz7m1bylz9yd7fc426h-python3.12-vulture-2.14.drv", "description": "Finds unused code in Python programs", "install_id": "vulture", "license": "MIT", "locked_url": "https://github.com/flox/nixpkgs?rev=f61125a668a320878494449750330ca58b78c557", - "name": "python3.13-vulture-2.14", + "name": "python3.12-vulture-2.14", "pname": "vulture", "rev": "f61125a668a320878494449750330ca58b78c557", "rev_count": 907002, "rev_date": "2025-12-05T15:54:32Z", - "scrape_date": "2025-12-07T02:56:08.936314Z", + "scrape_date": "2025-12-07T02:55:44.875335Z", "stabilities": [ "unstable" ], @@ -3454,27 +3577,27 @@ "out" ], "outputs": { - "dist": "/nix/store/jr2vfc6h5d2haplqmg0pjxli287ija62-python3.13-vulture-2.14-dist", - "out": "/nix/store/58qgn7q850qrimkw4gmxxbcd0yxzsmdf-python3.13-vulture-2.14" + "dist": "/nix/store/bdshb85zpxppz46pydjxkgclasq0yyri-python3.12-vulture-2.14-dist", + "out": "/nix/store/zhp5z4rk8xvgf80xw6jmh8f312qd954q-python3.12-vulture-2.14" }, "system": "aarch64-darwin", "group": "toplevel", "priority": 5 }, { - "attr_path": "python313Packages.vulture", + "attr_path": "python312Packages.vulture", "broken": false, - "derivation": "/nix/store/fydfhidy6snq18gcrwkgmzs8byxyd8n5-python3.13-vulture-2.14.drv", + "derivation": "/nix/store/chp666kk074lyjdjhrszfmwrkksg685p-python3.12-vulture-2.14.drv", "description": "Finds unused code in Python programs", "install_id": "vulture", "license": "MIT", "locked_url": "https://github.com/flox/nixpkgs?rev=f61125a668a320878494449750330ca58b78c557", - "name": "python3.13-vulture-2.14", 
+ "name": "python3.12-vulture-2.14", "pname": "vulture", "rev": "f61125a668a320878494449750330ca58b78c557", "rev_count": 907002, "rev_date": "2025-12-05T15:54:32Z", - "scrape_date": "2025-12-07T03:06:15.917900Z", + "scrape_date": "2025-12-07T03:05:47.101114Z", "stabilities": [ "unstable" ], @@ -3484,27 +3607,27 @@ "out" ], "outputs": { - "dist": "/nix/store/r034gyxcdwwagh6jsfc72hybb3jxiai0-python3.13-vulture-2.14-dist", - "out": "/nix/store/7vr41jnwznkwlvskdi28c4k9brjcpwz4-python3.13-vulture-2.14" + "dist": "/nix/store/v2hl6w0zyi72lsz5gc89g0xxrpjmwmwv-python3.12-vulture-2.14-dist", + "out": "/nix/store/0l40s5a0n79m0dbwqv7am96hpswvs0d9-python3.12-vulture-2.14" }, "system": "aarch64-linux", "group": "toplevel", "priority": 5 }, { - "attr_path": "python313Packages.vulture", + "attr_path": "python312Packages.vulture", "broken": false, - "derivation": "/nix/store/0hnax5inwn7swy0jfbpnlab1jdm9pgs8-python3.13-vulture-2.14.drv", + "derivation": "/nix/store/rrk2s4n9fbpclxawacrhs1h82br0kgn3-python3.12-vulture-2.14.drv", "description": "Finds unused code in Python programs", "install_id": "vulture", "license": "MIT", "locked_url": "https://github.com/flox/nixpkgs?rev=f61125a668a320878494449750330ca58b78c557", - "name": "python3.13-vulture-2.14", + "name": "python3.12-vulture-2.14", "pname": "vulture", "rev": "f61125a668a320878494449750330ca58b78c557", "rev_count": 907002, "rev_date": "2025-12-05T15:54:32Z", - "scrape_date": "2025-12-07T03:16:02.603893Z", + "scrape_date": "2025-12-07T03:15:40.617644Z", "stabilities": [ "unstable" ], @@ -3514,27 +3637,27 @@ "out" ], "outputs": { - "dist": "/nix/store/8hds3dm0ha3figidiadrrw1qgf8dp4vh-python3.13-vulture-2.14-dist", - "out": "/nix/store/3yas5bqdwk3z7f0nrx68cc1lr5p6gg18-python3.13-vulture-2.14" + "dist": "/nix/store/1hm0s2yrw9f3h40kp1ijsqxb9n5s8q7q-python3.12-vulture-2.14-dist", + "out": "/nix/store/va0zb5zab6xnkjhk15k0a69p62i20xq7-python3.12-vulture-2.14" }, "system": "x86_64-darwin", "group": "toplevel", "priority": 5 }, { - 
"attr_path": "python313Packages.vulture", + "attr_path": "python312Packages.vulture", "broken": false, - "derivation": "/nix/store/k29xbyn1in0wlypzj8976jpmp3fk8dk0-python3.13-vulture-2.14.drv", + "derivation": "/nix/store/2pa24ibjdcsxb5nh1zxbx1i80mvmhj6p-python3.12-vulture-2.14.drv", "description": "Finds unused code in Python programs", "install_id": "vulture", "license": "MIT", "locked_url": "https://github.com/flox/nixpkgs?rev=f61125a668a320878494449750330ca58b78c557", - "name": "python3.13-vulture-2.14", + "name": "python3.12-vulture-2.14", "pname": "vulture", "rev": "f61125a668a320878494449750330ca58b78c557", "rev_count": 907002, "rev_date": "2025-12-05T15:54:32Z", - "scrape_date": "2025-12-07T03:25:52.611390Z", + "scrape_date": "2025-12-07T03:25:24.252039Z", "stabilities": [ "unstable" ], @@ -3544,8 +3667,8 @@ "out" ], "outputs": { - "dist": "/nix/store/52a4fn7fjp49vfwjrp1wn78qxgg5d0h1-python3.13-vulture-2.14-dist", - "out": "/nix/store/pq7bmragdlpm8p5qb2k3qhk9qrpx24k5-python3.13-vulture-2.14" + "dist": "/nix/store/wf2fjr0w25d4lkxl0d21v23kllbl81gk-python3.12-vulture-2.14-dist", + "out": "/nix/store/nam3pwdp9wvaqyjzj5gfdii3dcq7708w-python3.12-vulture-2.14" }, "system": "x86_64-linux", "group": "toplevel", diff --git a/.flox/env/manifest.toml b/.flox/env/manifest.toml index 4add0d639..c75cd95f9 100644 --- a/.flox/env/manifest.toml +++ b/.flox/env/manifest.toml @@ -7,7 +7,7 @@ pulumi-python.pkg-path = "pulumiPackages.pulumi-python" ruff.pkg-path = "ruff" ruff.version = "0.14.7" uv.pkg-path = "uv" -vulture.pkg-path = "python313Packages.vulture" +vulture.pkg-path = "python312Packages.vulture" yamllint.pkg-path = "yamllint" nushell.pkg-path = "nushell" fselect.pkg-path = "fselect" @@ -42,6 +42,7 @@ pkgconf.pkg-path = "pkgconf" direnv.pkg-path = "direnv" jq.pkg-path = "jq" markdownlint-cli.pkg-path = "markdownlint-cli" +radon.pkg-path = "python312Packages.radon" [hook] on-activate = ''' diff --git a/.github/ISSUE_TEMPLATE/ISSUE_TEMPLATE.md 
b/.github/ISSUE_TEMPLATE/ISSUE_TEMPLATE.md index 2c18c4767..5f640a1b3 100644 --- a/.github/ISSUE_TEMPLATE/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE/ISSUE_TEMPLATE.md @@ -1,43 +1,21 @@ --- -name: Spec -about: Create a specification for Ralph autonomous implementation +name: Issue +about: Create an issue title: '' -labels: ["feature", "in-refinement"] projects: ["oscmcompany/1"] - --- +# Overview -# Description - - - -**Goal:** - -## Requirements - -### Category 1 - -- [ ] Requirement (testable/verifiable) -- [ ] Another requirement - -### Category 2 - -- [ ] Requirement -- [ ] Requirement - -## Open Questions - -- [ ] Question that needs resolution before ready? -- [ ] Another question? - -## Decisions - -- [ ] **Decision name:** Choice made and rationale +## Context -## Specification + - +## Changes -## Implementation Notes + - diff --git a/.gitignore b/.gitignore index 26f8fd3e8..39a1816d1 100644 --- a/.gitignore +++ b/.gitignore @@ -20,5 +20,7 @@ data/ **/model.tar.gz **/*.safetensor **/*.json -notes.md etc/ +./ralph/marketplace.json +./ralph/.state_version +.scratchpad/ diff --git a/.markdownlint.yaml b/.markdownlint.yaml index a0e9ea327..445310a97 100644 --- a/.markdownlint.yaml +++ b/.markdownlint.yaml @@ -15,6 +15,10 @@ MD033: - br - details - summary + # Custom XML-like tag used by Ralph workflow (maskfile.md) to signal task completion - promise # MD041 - First line in file should be top-level heading MD041: false +MD022: + lines_below: 1 + lines_above: 1 diff --git a/.ralph/marketplace.md b/.ralph/marketplace.md new file mode 100644 index 000000000..00dfcdb95 --- /dev/null +++ b/.ralph/marketplace.md @@ -0,0 +1,166 @@ +# Ralph Marketplace State + +This directory contains the runtime state for the Ralph marketplace competition system. 
+ +## Directory Structure + +``` +.ralph/ +├── config.json # Marketplace configuration (tracked in git) +├── marketplace.json # Cached state (gitignored, regenerated from events) +├── .state_version # Event count for cache invalidation (gitignored) +└── events/ # Append-only event log (tracked in git) + └── [timestamp]-[bot_id]-[outcome].json +``` + +## Files + +### config.json + +Configuration for the marketplace: +- `num_bots`: Number of competing smart bots (default: 3) +- `base_budget_per_bot`: Base iteration budget per bot (default: 10) +- `scoring_weights`: Weight for each scoring dimension (sum to 1.0) +- `weight_constraints`: Min/max weight bounds for bots + +**Tracked in git:** Yes - configuration is part of the codebase + +### marketplace.json + +Cached marketplace state computed from event log: +- Bot weights, efficiency, success/failure counts +- Total budget pool +- Rounds completed +- Last updated timestamp + +**Tracked in git:** No (gitignored) - regenerated from events automatically + +### .state_version + +Simple counter tracking the number of events processed to determine if cache is stale. + +**Tracked in git:** No (gitignored) + +### events/ + +Append-only log of marketplace events. Each event is a JSON file named: +``` +[timestamp]-[bot_id]-[outcome].json +``` + +Example: `2026-01-29T10-15-00-123456Z-smart_bot_2-success.json` + +Event schema: +```json +{ + "timestamp": "2026-01-29T10:15:00.123456Z", + "issue_number": 123, + "bot_id": "smart_bot_2", + "outcome": "success", + "proposal_score": 0.87, + "implementation_score": 0.85, + "accuracy": 0.98, + "weight_delta": 0.15, + "iteration_count": 3, + "metrics": { + "tests_passed": true, + "code_quality_passed": true, + "coverage_delta": 2.5, + "lines_changed": 45, + "files_affected": 3 + } +} +``` + +**Tracked in git:** Yes - events are the source of truth for marketplace learning + +## State Management + +The marketplace uses an append-only event log for state management: + +1. 
**Events are immutable** - once written, never modified +2. **State is computed** - marketplace.json is derived from events +3. **Cache invalidation** - .state_version tracks when recomputation is needed +4. **Conflict resolution** - multiple branches can add different events; they merge cleanly + +## Concurrency + +Multiple developers can run marketplace loops concurrently: + +1. Each loop appends new event files (unique timestamps prevent conflicts) +2. marketplace.json might conflict on merge, but it's gitignored +3. After pulling, state is recomputed from merged event log +4. All developers converge on same state (events are source of truth) + +## Commands + +Initialize marketplace: +```bash +mask ralph marketplace setup +``` + +View current state: +```bash +mask ralph marketplace status +``` + +Run marketplace loop: +```bash +mask ralph marketplace loop +``` + +Reset to initial state (erase history): +```bash +mask ralph marketplace reset +``` + +For simple (single-agent) workflow, use: +```bash +mask ralph simple setup +mask ralph simple loop +``` + +## Learning Persistence + +Bot weights and efficiency evolve over time based on events: +- Successful implementations increase bot weight +- Failed implementations decrease bot weight +- Accuracy bonuses for good prediction +- Efficiency = success_rate affects budget allocation + +This creates a competitive marketplace where high-performing bots get more opportunities. + +## Backup and Recovery + +To backup marketplace state: +```bash +cp -r .ralph .ralph.backup +``` + +To recover: +```bash +cp -r .ralph.backup .ralph +``` + +Events are tracked in git, so full history is preserved in version control. 
+ +## Troubleshooting + +**Cache out of sync:** +```bash +# Delete cache, will regenerate from events +rm .ralph/marketplace.json .ralph/.state_version +mask ralph marketplace status +``` + +**Corrupted event:** +```bash +# Find and remove bad event file +ls -la .ralph/events/ +rm .ralph/events/[bad-event-file].json +``` + +**Reset everything:** +```bash +mask ralph marketplace reset +``` diff --git a/CLAUDE.md b/CLAUDE.md index f0dff9323..47f5a8dbc 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -46,18 +46,18 @@ - `infrastructure/` folder contains Pulumi infrastructure as code - See `README.md` "Principles" section for developer philosophy -## Ralph Workflow +## Ralph Simple Workflow Ralph is an autonomous development loop for implementing GitHub issue specs. ### Commands -- `mask ralph setup` - Create required labels (run once before first use) -- `mask ralph spec [issue_number]` - Interactive spec refinement (creates new issue if no number provided) -- `mask ralph ready ` - Mark a spec as ready for implementation -- `mask ralph loop ` - Run autonomous loop on a ready spec -- `mask ralph backlog` - Review open issues for duplicates, overlaps, and implementation status -- `mask ralph pull-request [pull_request_number]` - Process pull request review feedback interactively +- `mask ralph simple setup` - Create required labels (run once before first use) +- `mask ralph simple spec [issue_number]` - Interactive spec refinement (creates new issue if no number provided) +- `mask ralph simple ready ` - Mark a spec as ready for implementation +- `mask ralph simple loop ` - Run autonomous loop on a ready spec +- `mask ralph simple backlog` - Review open issues for duplicates, overlaps, and implementation status +- `mask ralph simple pull-request [pull_request_number]` - Process pull request review feedback interactively ### Labels @@ -75,9 +75,9 @@ Ralph is an autonomous development loop for implementing GitHub issue specs. ### Workflow -1. 
Create or refine spec: `mask ralph spec` or `mask ralph spec ` -2. When spec is complete, mark as ready: `mask ralph ready ` -3. Run autonomous loop: `mask ralph loop ` +1. Create or refine spec: `mask ralph simple spec` or `mask ralph simple spec ` +2. When spec is complete, mark as ready: `mask ralph simple ready ` +3. Run autonomous loop: `mask ralph simple loop ` 4. Loop assigns the issue and resulting pull request to the current GitHub user 5. Loop creates pull request with `Closes #` on completion 6. Pull request merge auto-closes issue @@ -113,3 +113,171 @@ by merging similar learnings and removing entries that have been incorporated in **Root cause:** Spec said "commit is the verification gate" but didn't explicitly say to always attempt commit after implementing. **Fix:** Added explicit "Commit-as-Verification" section requiring commit attempt after every implementation. + + +## Ralph Marketplace Workflow + +The marketplace is an advanced Ralph workflow where multiple bidders compete to provide the best solution. + +### Architecture + +**Actors:** +- **Broker** - Orchestrates competition, evaluates proposals, implements winner +- **Bidders (3)** - Submit lightweight proposals, compete for selection. Each bidder has deep expertise across languages (Rust, Python), tools (Axum, FastAPI, Polars), infrastructure (Pulumi, AWS), and risk assessment. 
+ +**Agent Definitions:** +- Agent system prompts in `.claude/agents/` +- Runtime state in `.ralph/` +- Orchestration code in `tools/ralph_marketplace_*.py` + +### Commands + +- `mask ralph marketplace setup` - Initialize marketplace state and bidder configurations +- `mask ralph marketplace spec [issue_number]` - Interactive spec refinement (creates new issue if no number provided) +- `mask ralph marketplace ready ` - Mark a spec as ready for implementation +- `mask ralph marketplace loop ` - Run marketplace competition on a ready spec +- `mask ralph marketplace backlog` - Review open issues for duplicates, overlaps, and implementation status +- `mask ralph marketplace pull-request [pull_request_number]` - Process pull request review feedback interactively +- `mask ralph marketplace status` - Show bidder weights, efficiency, recent rounds +- `mask ralph marketplace reset` - Reset bidder weights to equal (erases learning history) + +### Workflow + +1. Issue marked "ready" → `mask ralph marketplace loop ` +2. Broker spawned, reads spec, extracts requirements +3. Broker spawns 3 bidders in parallel (identities hidden) +4. Bidders apply domain expertise directly, submit lightweight proposals +5. Broker scores proposals on 5 dimensions (spec alignment, technical quality, innovation, risk, efficiency) +6. Broker ranks proposals, selects top scorer (tie-break: earlier timestamp wins) +7. Broker implements ONLY top proposal +8. Broker runs comprehensive checks (format, lint, type-check, dead-code, complex, tests individually) +9. **Success:** Update weights (+), check completeness or rotate context after logical groupings +10. **Failure:** Replan round (all bidders see failure, submit new proposals) +11. 
Repeat until complete or max iterations → pull request creation or attention-needed + +### Budget Model + +**Fixed pool with efficiency rewards:** +- Total pool = 10 iterations × number of bidders (default: 30) +- Allocation = (bidder_weight × bidder_efficiency) / sum(all bidder scores) × total_pool +- Zero-sum competition: high performers take budget from low performers +- Mathematical guarantee: allocations always sum to exactly total_pool + +### Weight Updates + +Immediate updates after each round: + +**Initial Round:** +- Ranked #1, implementation succeeds: **+0.10** +- Ranked #1, implementation fails: **-0.15** +- Ranked #2+, tried after #1 failed, succeeds: **+0.08** +- Ranked #2+, tried after #1 failed, also fails: **-0.18** +- Ranked but not tried (another succeeded): **-0.02** + +**Replan Round:** +- New proposal succeeds: **+0.12** (bonus for learning from failure) +- Failed again: **-0.20** (heavy penalty) +- Resubmitted same proposal: **-0.05** (not adapting) + +**Accuracy Bonus:** +- If absolute difference between proposal and implementation score ≤ 0.15: **+0.05** bonus +- Rewards accurate prediction (applies only to success outcomes) + +**Constraints:** +- Weights normalized to sum to 1.0 after each update +- Min weight: 0.05 (maintains diversity) +- Max weight: 0.60 (prevents monopoly) +- Constraints enforced iteratively to maintain normalized sum + +### Scoring Dimensions + +Proposals and implementations scored on 5 unified dimensions: + +1. **Spec Alignment (32%)** - Checkbox coverage, component coverage, implicit requirements +2. **Technical Quality (22%)** - Pattern conformance, code quality checks pass +3. **Innovation (15%)** - Novel approach, elegance, simplicity +4. **Risk (21%)** - Files affected, breaking changes, test coverage +5. **Efficiency (10%)** - Estimated vs. 
actual complexity, diff size + +### State Management + +**Append-only event log pattern:** +- Events stored in `.ralph/events/` as immutable JSON files +- State computed from events (cached in `.ralph/marketplace.json`) +- Handles concurrency: different branches add different events, merge cleanly +- Source of truth: event log (tracked in git) +- Cache: regenerated automatically when stale (gitignored) + +**Event schema:** + +```json +{ + "timestamp": "2026-01-29T10:15:00Z", + "issue_number": 123, + "bot_id": "bidder_2", + "outcome": "success", + "proposal_score": 0.87, + "implementation_score": 0.85, + "accuracy": 0.98, + "weight_delta": 0.15, + "iteration_count": 3, + "metrics": { ... } +} +``` + +### Replan Rounds + +Triggered when top proposal's implementation fails: + +1. Post failure context to all bidders (failed proposal, error details) +2. Failed bidder MUST submit new proposal (cannot resubmit) +3. Other bidders CAN resubmit or submit new proposals +4. Return to evaluation phase with new proposals +5. 
If replan fails → human intervention (attention-needed label) + +### Learning Over Time + +**Efficiency tracking:** +- Efficiency = implementations_succeeded / (succeeded + failed) +- High efficiency → larger budget allocation +- Low efficiency → reduced budget + +**Weight evolution:** +- Successful bidders gain weight over time +- Failed predictions lose weight +- Accurate predictions get bonus + +**Long-term outcome:** +- Bidders specialize based on success patterns +- High performers dominate budget allocation +- Maintains minimum diversity (5% min weight) + +### Future Enhancements + +See `proposal_followups.md` for detailed future considerations: +- Meta-broker for dynamic weight tuning (Phase 2) +- Post-merge health tracking (Phase 3) +- Bidder hobbies and token rewards (Phase 4) +- External system access for bidders (Phase 5) +- Multi-user concurrency improvements (Phase 6) + +### Comparison to Standard Ralph Loop + +**Standard Ralph (`mask ralph simple loop`):** +- Single agent implements entire issue +- Iterative: plan → implement → commit → repeat +- Max 10 iterations +- Context rotation after logical groupings +- No competition, no learning + +**Marketplace Ralph (`mask ralph marketplace loop`):** +- 3 bidders compete with proposals +- Broker selects and implements best proposal +- Budget allocated by weight × efficiency +- Bidders learn and improve over time +- Zero-sum competition for resources +- Context rotation after logical groupings + +**When to use:** +- **Standard:** Simple issues, single approach obvious, quick iteration +- **Marketplace:** Complex issues, multiple approaches possible, quality-critical, learning important diff --git a/maskfile.md b/maskfile.md index 25a025d8a..04da493aa 100644 --- a/maskfile.md +++ b/maskfile.md @@ -446,6 +446,24 @@ uvx vulture \ echo "Dead code check completed" ``` +#### complexity + +> Check Python code cyclomatic complexity + +```bash +set -euo pipefail + +echo "Running cyclomatic complexity analysis" + 
+radon cc \ + --min C \ + --show-complexity \ + --exclude '.flox,.venv,target' \ + . + +echo "Complexity check completed" +``` + #### lint > Run comprehensive Python code quality checks @@ -507,6 +525,8 @@ mask development python type-check mask development python dead-code +mask development python complexity + mask development python test echo "Python development checks completed successfully" @@ -771,7 +791,11 @@ claude mcp list > Ralph autonomous development workflow -### setup +### simple + +> Standard Ralph workflow for single-agent implementation + +#### setup > Create required labels for Ralph workflow @@ -810,7 +834,7 @@ done echo "Setup complete" ``` -### spec [issue_number] +#### spec [issue_number] > Build or refine a spec through interactive conversation @@ -855,13 +879,19 @@ YOUR ROLE: 1. Probe the user with questions to refine this spec 2. Ask about: problem clarity, requirements completeness, performance, security, testing, edge cases, dependencies 3. When decisions are made, update the issue incrementally using: gh issue edit ${issue_number} --body \"...\" -4. Keep Open Questions section updated as questions are resolved -5. 
Move resolved questions to Decisions section with rationale + +TEMPLATE STRUCTURE: +The issue uses a template with these sections: +- ## Context: Describe the bug, feature, or task +- ## Changes: Provide solutions/recommendations as bullet points, action items as checkboxes IMPORTANT: +- Populate the template sections according to their comment instructions +- APPEND (do not replace) a \"## Ralph Context\" section for Ralph-specific information +- Under \"## Ralph Context\", use h3 (###) or lower for subsections +- Suggested Ralph subsections: ### Open Questions, ### Decisions, ### Specification Details - You do NOT add the 'ready' label - the human decides when the spec is complete - Update the issue body incrementally as decisions are made -- Use the spec template format (Problem, Requirements, Open Questions, Decisions, Specification) - Be thorough but conversational Start by reviewing the current spec and asking clarifying questions." @@ -871,10 +901,10 @@ claude --system-prompt "$system_prompt" echo "" echo "Spec refinement session ended" echo "When ready, mark the spec as ready:" -echo "mask ralph ready ${issue_number}" +echo "mask ralph simple ready ${issue_number}" ``` -### ready (issue_number) +#### ready (issue_number) > Mark a spec as ready for implementation @@ -894,10 +924,10 @@ fi gh issue edit "${issue_number}" --add-label "ready" --remove-label "in-refinement" echo "Issue #${issue_number} marked as ready" -echo "Run the loop with: mask ralph loop ${issue_number}" +echo "Run the loop with: mask ralph simple loop ${issue_number}" ``` -### loop (issue_number) +#### loop (issue_number) > Run autonomous loop on a ready spec @@ -1073,7 +1103,7 @@ ${git_status} ${recent_commits} \`\`\` -Retry with: \`mask ralph loop ${issue_number}\`" 2>/dev/null || true +Retry with: \`mask ralph simple loop ${issue_number}\`" 2>/dev/null || true exit 1 } @@ -1111,19 +1141,22 @@ while [ $iteration -le $max_iterations ]; do echo "Creating pull request" pr_body=$(cat 
<COMPLETE EOF @@ -1186,7 +1219,7 @@ ${modified_files:-none} 1. Fix the blocking issue 2. Update the spec if needed 3. Delete the branch: \`git branch -D ${branch_name} && git push origin --delete ${branch_name}\` -4. Re-add the \`ready\` label and run \`mask ralph loop ${issue_number}\` again +4. Re-add the \`ready\` label and run \`mask ralph simple loop ${issue_number}\` again EOF ) @@ -1198,7 +1231,7 @@ trap - EXIT exit 1 ``` -### backlog +#### backlog > Review open issues for duplicates, overlaps, and implementation status @@ -1221,7 +1254,7 @@ if [ -z "$existing_issue" ]; then tracking_body=$(cat <<'TRACKING_TEMPLATE' # Backlog Review -This issue tracks periodic backlog review reports generated by `mask ralph backlog`. +This issue tracks periodic backlog review reports generated by `mask ralph simple backlog`. Each comment contains an analysis of open issues looking for: - Potential duplicates or overlapping issues @@ -1230,7 +1263,7 @@ Each comment contains an analysis of open issues looking for: Staleness is handled separately by the stale workflow action. -Run `mask ralph backlog` to generate a new report. +Run `mask ralph simple backlog` to generate a new report. 
TRACKING_TEMPLATE ) @@ -1306,7 +1339,7 @@ echo "Backlog review complete" echo "Report posted to: https://github.com/$(gh repo view --json nameWithOwner -q .nameWithOwner)/issues/${existing_issue}" ``` -### pull-request [pull_request_number] +#### pull-request [pull_request_number] > Process pull request review feedback interactively @@ -1326,7 +1359,7 @@ else pr_num=$(gh pr view --json number --jq '.number' 2>/dev/null || echo "") if [ -z "$pr_num" ]; then echo "Error: No pull request found for current branch" - echo "Use: mask ralph pull-request " + echo "Use: mask ralph simple pull-request " exit 1 fi echo "Found pull request #${pr_num}" @@ -1563,3 +1596,679 @@ echo "" echo "Pull request review complete" echo "View pull request: https://github.com/${repo_info}/pull/${pr_num}" ``` + +### marketplace + +> Marketplace-based autonomous development loop with competing smart bots + +#### setup + +> Initialize marketplace state and bot configurations + +```bash +set -euo pipefail + +source "${MASKFILE_DIR}/tools/ralph_preflight.sh" +ralph_preflight --jq + +echo "Initializing Ralph marketplace" + +uv run python tools/ralph_marketplace_orchestrator.py setup + +echo "Marketplace initialized successfully" +``` + +#### loop (issue_number) + +> Run marketplace competition loop on a ready spec + +```bash +set -euo pipefail + +source "${MASKFILE_DIR}/tools/ralph_preflight.sh" +ralph_preflight --claude --jq + +max_iterations="${RALPH_MAX_ITERATIONS:-10}" + +echo "Starting Ralph marketplace loop for issue #${issue_number}" + +echo "Running pre-flight checks" + +if [ -n "$(git status --porcelain)" ]; then + echo "Error: Working directory has uncommitted changes" + echo "Commit or stash changes before running ralph marketplace loop" + exit 1 +fi +echo "Working directory is clean" + +default_branch=$(git remote show origin | grep 'HEAD branch' | cut -d' ' -f5) +current_branch=$(git rev-parse --abbrev-ref HEAD) +if [ "$current_branch" != "$default_branch" ]; then + echo "Error: Not 
on default branch ${default_branch} (currently on: ${current_branch})" + echo "Run: git checkout ${default_branch}" + exit 1 +fi +echo "On default branch (${default_branch})" + +echo "Pulling latest ${default_branch}" +if ! git pull --ff-only origin "$default_branch"; then + echo "Error: Could not pull latest ${default_branch}" + echo "Resolve conflicts or check network/auth" + exit 1 +fi +echo "${default_branch} is up to date" + +if ! labels=$(gh issue view "${issue_number}" --json labels --jq '.labels[].name'); then + echo "Error: Could not fetch issue #${issue_number}" + echo "Check network connectivity and issue existence" + exit 1 +fi +if ! echo "$labels" | grep -q "^ready$"; then + echo "Error: Issue #${issue_number} does not have 'ready' label" + echo "Current labels: ${labels:-none}" + exit 1 +fi +echo "Issue has 'ready' label" + +issue_title=$(gh issue view "${issue_number}" --json title --jq '.title') +short_desc=$(echo "$issue_title" | tr '[:upper:]' '[:lower:]' | tr ' ' '-' | tr -cd 'a-z0-9-' | cut -c1-30) +branch_name="ralph/${issue_number}-${short_desc}" + +if git show-ref --verify --quiet "refs/heads/${branch_name}" 2>/dev/null; then + echo "Error: Local branch '${branch_name}' already exists" + echo "Delete with: git branch -d ${branch_name}" + exit 1 +fi + +if git ls-remote --heads origin "${branch_name}" 2>/dev/null | grep -q .; then + echo "Error: Remote branch '${branch_name}' already exists" + echo "Delete with: git push origin --delete ${branch_name}" + exit 1 +fi +echo "Branch '${branch_name}' does not exist" + +echo "Pre-flight checks passed" + +echo "Creating branch: ${branch_name}" +git checkout -b "${branch_name}" + +echo "Updating labels: removing 'ready', adding 'in-progress' and 'ralph'" +gh issue edit "${issue_number}" --remove-label "ready" --add-label "in-progress" --add-label "ralph" + +current_user=$(gh api user --jq '.login' 2>/dev/null || echo "") +if [ -n "$current_user" ]; then + echo "Assigning issue to ${current_user}" + gh 
issue edit "${issue_number}" --add-assignee "${current_user}" 2>/dev/null || echo "Warning: Could not assign issue" +fi + +cleanup_on_error() { + local exit_code=$? + if [ $exit_code -ne 0 ]; then + echo "" + echo "Error: Marketplace loop failed unexpectedly (exit code: $exit_code)" + + local git_status=$(git status --short 2>/dev/null | head -20 || echo "unavailable") + local recent_commits=$(git log --oneline -5 2>/dev/null || echo "unavailable") + + gh issue edit "${issue_number}" --remove-label "in-progress" --remove-label "ralph" --add-label "attention-needed" 2>/dev/null || true + gh issue comment "${issue_number}" --body "## Ralph Marketplace Loop Error + +The marketplace loop exited unexpectedly with code $exit_code. + +**Branch:** \`${branch_name}\` + +### Git Status +\`\`\` +${git_status} +\`\`\` + +### Recent Commits +\`\`\` +${recent_commits} +\`\`\` + +Check the terminal output for full details." 2>/dev/null || true + fi +} +trap cleanup_on_error EXIT + +echo "Loading marketplace state" +state_json=$(uv run python -c " +from tools.ralph_marketplace_state import MarketplaceStateManager +from tools.ralph_marketplace_budget import allocate_budgets +import json + +manager = MarketplaceStateManager() +state = manager.load_state() +config = manager.load_config() +allocations = allocate_budgets(state) + +output = { + 'num_bots': config['num_bots'], + 'total_budget': state.total_budget_pool, + 'allocations': allocations +} +print(json.dumps(output)) +") + +num_bots=$(echo "$state_json" | jq -r '.num_bots') +total_budget=$(echo "$state_json" | jq -r '.total_budget') +allocations=$(echo "$state_json" | jq -c '.allocations') + +echo "Marketplace state loaded: ${num_bots} bidders, ${total_budget} total iterations" + +echo "Loading broker agent" +broker_prompt_path=".claude/agents/ralph_broker.md" +if [ ! 
-f "$broker_prompt_path" ]; then + echo "Error: Broker agent not found at ${broker_prompt_path}" + exit 1 +fi + +broker_prompt=$(cat "$broker_prompt_path") +broker_prompt="${broker_prompt//\{\{issue_number\}\}/${issue_number}}" +broker_prompt="${broker_prompt//\{\{num_bots\}\}/${num_bots}}" +broker_prompt="${broker_prompt//\{\{bot_budgets\}\}/${allocations}}" +broker_prompt="${broker_prompt//\{\{total_budget\}\}/${total_budget}}" + +echo "Broker agent loaded and configured" + +stream_text='select(.type == "assistant").message.content[]? | select(.type == "text").text // empty | gsub("\n"; "\r\n") | . + "\r\n\n"' +final_result='select(.type == "result").result // empty' + +tmpfile=$(mktemp) + +cleanup_and_fail() { + rm -f "$tmpfile" + echo "" + echo "Unexpected error - cleaning up" + + local git_status=$(git status --short 2>/dev/null | head -20 || echo "unavailable") + local recent_commits=$(git log --oneline -5 2>/dev/null || echo "unavailable") + + gh issue edit "${issue_number}" --remove-label "in-progress" --remove-label "ralph" --add-label "attention-needed" 2>/dev/null || true + gh issue comment "${issue_number}" --body "## Ralph Marketplace Loop Error + +The marketplace loop exited unexpectedly. Branch: \`${branch_name}\` + +### Git Status +\`\`\` +${git_status} +\`\`\` + +### Recent Commits +\`\`\` +${recent_commits} +\`\`\` + +Retry with: \`mask ralph marketplace loop ${issue_number}\`" 2>/dev/null || true + exit 1 +} + +trap "rm -f $tmpfile" EXIT +trap cleanup_and_fail ERR + +iteration=1 +while [ $iteration -le $max_iterations ]; do + echo "" + echo "Marketplace Round ${iteration}/${max_iterations}" + + spec=$(gh issue view "${issue_number}" --json body --jq '.body') + + claude \ + --print \ + --output-format stream-json \ + --system-prompt "${broker_prompt}" \ + --dangerously-skip-permissions \ + "Current spec state:\n\n${spec}\n\nBegin marketplace round ${iteration}." 
\ + | grep --line-buffered '^{' \ + | tee "$tmpfile" \ + | jq --unbuffered -rj "$stream_text" + + result=$(jq -r "$final_result" "$tmpfile") + + if [[ "$result" == *"COMPLETE"* ]]; then + echo "" + echo "Marketplace complete after ${iteration} round(s)" + + echo "Updating labels: removing 'in-progress' and 'ralph'" + gh issue edit "${issue_number}" --remove-label "in-progress" --remove-label "ralph" + + echo "Pushing branch" + git push -u origin "${branch_name}" + + echo "Creating pull request" + pr_body=$(cat <COMPLETE +EOF +) + pr_url=$(gh pr create \ + --title "${issue_title}" \ + --body "$pr_body" \ + --assignee "${current_user:-}") + + echo "Pull request created: ${pr_url}" + echo "Issue will auto-close on merge" + trap - EXIT + exit 0 + fi + + iteration=$((iteration + 1)) +done + +echo "" +echo "Max iterations reached (${max_iterations})" + +echo "Pushing branch for review" +branch_pushed="false" +if git push -u origin "${branch_name}" 2>/dev/null; then + echo "Branch pushed successfully" + branch_pushed="true" +else + echo "Warning: Could not push branch (progress is local only)" +fi + +gh issue edit "${issue_number}" --remove-label "in-progress" --remove-label "ralph" --add-label "attention-needed" + +modified_files=$(git diff --name-only "origin/${default_branch}" 2>/dev/null || echo "none") + +if [ "$branch_pushed" = "true" ]; then + branch_info="**Branch:** [\`${branch_name}\`](../../tree/${branch_name}) (pushed to remote)" +else + branch_info="**Branch:** \`${branch_name}\` (local only - push failed)" +fi + +failure_comment=$(cat < Show marketplace state including bot weights, efficiency, and recent rounds + +```bash +set -euo pipefail + +echo "Fetching Ralph marketplace status" + +uv run python tools/ralph_marketplace_orchestrator.py status +``` + +#### reset + +> Reset all bot weights to equal (use with caution) + +```bash +set -euo pipefail + +echo "Resetting Ralph marketplace to equal bot weights" +echo "" +echo "WARNING: This will:" +echo " - 
Remove all event history"
+echo "  - Reset bot weights to equal"
+echo "  - Clear cached state"
+echo "  - Keep configuration unchanged"
+echo ""
+
+if [ -t 0 ]; then
+    read -r -p "Are you sure? This will erase learning history. [y/N]: " confirm < /dev/tty
+    if [ "$confirm" != "y" ] && [ "$confirm" != "Y" ]; then
+        echo "Aborted"
+        exit 0
+    fi
+else
+    echo "Error: Non-interactive environment detected; cannot confirm destructive reset."
+    echo "Please run this command in an interactive terminal to proceed."
+    exit 1
+fi
+
+uv run python tools/ralph_marketplace_orchestrator.py reset
+
+echo ""
+echo "Marketplace reset complete"
+echo "Run 'mask ralph marketplace status' to verify"
+```
+
+#### spec [issue_number]
+
+> Build or refine a spec through interactive conversation
+
+```bash
+set -euo pipefail
+
+source "${MASKFILE_DIR}/tools/ralph_preflight.sh"
+ralph_preflight --claude --jq
+
+echo "Starting Ralph marketplace spec refinement"
+
+if [ -z "${issue_number:-}" ]; then
+    echo "Creating new spec issue"
+
+    # NOTE: `gh issue create` has no --json flag; it prints the new issue URL.
+    # Capture the URL and take its trailing path segment as the issue number.
+    issue_url=$(gh issue create \
+        --template "ISSUE_TEMPLATE.md" \
+        --title "New Spec: [TITLE]" \
+        --label "in-refinement" \
+        --label "feature")
+
+    issue_number="${issue_url##*/}"
+
+    echo "Created issue #${issue_number}"
+    echo "Opening issue in browser"
+    gh issue view "${issue_number}" --web &
+fi
+
+echo "Refining issue #${issue_number}"
+
+issue_title=$(gh issue view "${issue_number}" --json title --jq '.title')
+issue_body=$(gh issue view "${issue_number}" --json body --jq '.body')
+
+system_prompt="You are helping refine a technical specification in GitHub issue #${issue_number}.
+
+CURRENT SPEC:
+Title: ${issue_title}
+
+${issue_body}
+
+YOUR ROLE:
+1. Probe the user with questions to refine this spec
+2. Ask about: problem clarity, requirements completeness, performance, security, testing, edge cases, dependencies
+3. 
When decisions are made, update the issue incrementally using: gh issue edit ${issue_number} --body \"...\" + +TEMPLATE STRUCTURE: +The issue uses a template with these sections: +- ## Context: Describe the bug, feature, or task +- ## Changes: Provide solutions/recommendations as bullet points, action items as checkboxes + +IMPORTANT: +- Populate the template sections according to their comment instructions +- APPEND (do not replace) a \"## Ralph Context\" section for Ralph-specific information +- Under \"## Ralph Context\", use h3 (###) or lower for subsections +- Suggested Ralph subsections: ### Open Questions, ### Decisions, ### Specification Details +- You do NOT add the 'ready' label - the human decides when the spec is complete +- Update the issue body incrementally as decisions are made +- Be thorough but conversational + +Start by reviewing the current spec and asking clarifying questions." + +claude --system-prompt "$system_prompt" + +echo "" +echo "Spec refinement session ended" +echo "When ready, mark the spec as ready:" +echo "mask ralph marketplace ready ${issue_number}" +``` + +#### ready (issue_number) + +> Mark a spec as ready for implementation + +```bash +set -euo pipefail + +source "${MASKFILE_DIR}/tools/ralph_preflight.sh" +ralph_preflight + +echo "Marking issue #${issue_number} as ready" + +if ! 
gh issue view "${issue_number}" &> /dev/null; then + echo "Error: Issue #${issue_number} not found" + exit 1 +fi + +gh issue edit "${issue_number}" --add-label "ready" --remove-label "in-refinement" + +echo "Issue #${issue_number} marked as ready" +echo "Run the marketplace loop with: mask ralph marketplace loop ${issue_number}" +``` + +#### backlog + +> Review open issues for duplicates, overlaps, and implementation status + +```bash +set -euo pipefail + +source "${MASKFILE_DIR}/tools/ralph_preflight.sh" +ralph_preflight --claude --jq + +echo "Starting Ralph marketplace backlog review" + +tracking_issue_title="Backlog Review (Marketplace)" + +echo "Checking for existing tracking issue" +existing_issue=$(gh issue list --search "\"${tracking_issue_title}\" in:title" --state open --json number --jq '.[0].number // empty') + +if [ -z "$existing_issue" ]; then + echo "Creating tracking issue: ${tracking_issue_title}" + + tracking_body=$(cat <<'TRACKING_TEMPLATE' +# Backlog Review (Marketplace) + +This issue tracks periodic backlog review reports generated by `mask ralph marketplace backlog`. + +Each comment contains an analysis of open issues looking for: +- Potential duplicates or overlapping issues +- Issues that may already be implemented +- Consolidation opportunities + +Staleness is handled separately by the stale workflow action. + +Run `mask ralph marketplace backlog` to generate a new report. 
+TRACKING_TEMPLATE
+)
+
+    # NOTE: `gh issue create` does not support --json/-q; it prints the new
+    # issue URL. Parse the issue number from the trailing URL path segment.
+    issue_url=$(gh issue create \
+        --title "${tracking_issue_title}" \
+        --body "$tracking_body" \
+        --label "backlog-review")
+
+    existing_issue="${issue_url##*/}"
+
+    echo "Created tracking issue #${existing_issue}"
+else
+    echo "Found existing tracking issue #${existing_issue}"
+fi
+
+echo "Fetching open issues"
+issues_json=$(gh issue list --state open --limit 500 --json number,title,body,labels,updatedAt,createdAt)
+issue_count=$(echo "$issues_json" | jq 'length')
+echo "Found ${issue_count} open issues"
+
+echo "Analyzing backlog with marketplace broker"
+
+today=$(date +%Y-%m-%d)
+
+system_prompt="You are analyzing a GitHub issue backlog for consolidation opportunities.
+
+TODAY'S DATE: ${today}
+TRACKING ISSUE: #${existing_issue} (do NOT include this in analysis)
+
+ANALYSIS TASKS:
+1. DUPLICATES: Find issues with similar titles/descriptions that might be duplicates
+2. OVERLAPS: Find issues that cover related functionality and could be consolidated
+3. IMPLEMENTED: Search the codebase for keywords that suggest an issue might already be done
+
+OUTPUT FORMAT:
+Generate a markdown report following this exact structure:
+
+## Backlog Review (Marketplace) - ${today}
+
+### Potential Duplicates
+
+
+### Potentially Implemented
+
+
+### Consolidation Suggestions
+
+
+### Summary
+
+
+IMPORTANT:
+- Be conservative with duplicate detection - only flag clear matches
+- For 'potentially implemented', actually search the codebase using Grep/Glob
+- Exclude the tracking issue #${existing_issue} from all analysis
+- Use high/medium/low confidence levels
+- Keep the report concise and actionable"
+
+report=$(claude \
+    --print \
+    --dangerously-skip-permissions \
+    --system-prompt "${system_prompt}" \
+    "Analyze this issue backlog and generate a report:
+
+${issues_json}
+
+Search the codebase as needed to check if issues might already be implemented.")
+
+echo "Posting report to tracking issue #${existing_issue}"
+gh issue comment 
"${existing_issue}" --body "${report}" + +echo "" +echo "Backlog review complete" +echo "Report posted to: https://github.com/$(gh repo view --json nameWithOwner -q .nameWithOwner)/issues/${existing_issue}" +``` + +#### pull-request [pull_request_number] + +> Process pull request review feedback interactively + +```bash +set -euo pipefail + +source "${MASKFILE_DIR}/tools/ralph_preflight.sh" +ralph_preflight --claude --jq + +echo "Starting Ralph marketplace pull request review" + +if [ -n "${pull_request_number:-}" ]; then + pr_num="$pull_request_number" + echo "Using pull request #${pr_num}" +else + echo "Auto-detecting pull request from current branch" + pr_num=$(gh pr view --json number --jq '.number' 2>/dev/null || echo "") + if [ -z "$pr_num" ]; then + echo "Error: No pull request found for current branch" + echo "Use: mask ralph marketplace pull-request " + exit 1 + fi + echo "Found pull request #${pr_num}" +fi + +repo_info=$(gh repo view --json nameWithOwner --jq '.nameWithOwner') +owner=$(echo "$repo_info" | cut -d'/' -f1) +repo=$(echo "$repo_info" | cut -d'/' -f2) + +echo "Fetching review comments" + +review_threads=$(gh api graphql -f query=' +query($owner: String!, $repo: String!, $pr: Int!) 
{ + repository(owner: $owner, name: $repo) { + pullRequest(number: $pr) { + reviewThreads(first: 100) { + nodes { + id + isResolved + path + line + comments(first: 10) { + nodes { + id + body + author { login } + createdAt + } + } + } + } + } + } +}' -f owner="$owner" -f repo="$repo" -F pr="$pr_num") + +unresolved_threads=$(echo "$review_threads" | jq '[.data.repository.pullRequest.reviewThreads.nodes[] | select(.isResolved == false)]') +thread_count=$(echo "$unresolved_threads" | jq 'length') + +if [ "$thread_count" -eq 0 ]; then + echo "No unresolved review threads found" + echo "Pull request #${pr_num} looks good" + exit 0 +fi + +echo "Found ${thread_count} unresolved review thread(s)" + +system_prompt="You are processing review feedback for pull request #${pr_num}. + +REVIEW THREADS: +${unresolved_threads} + +YOUR ROLE: +1. Review each unresolved thread +2. Read the relevant code using the file paths in the threads +3. Address the feedback by making code changes +4. After making changes, test them with pre-commit hooks +5. Commit changes with descriptive messages +6. Resolve threads using: gh api graphql -f query='mutation { resolveReviewThread(input: {threadId: \"\"}) { clientMutationId } }' + +WORKFLOW: +1. Read the code mentioned in each thread +2. Understand the feedback +3. Make necessary changes +4. Test changes (mask development python/rust all) +5. Commit changes +6. Resolve the thread + +Be thorough and address all feedback points." 
+ +claude --system-prompt "$system_prompt" --dangerously-skip-permissions + +echo "" +echo "Pull request review session complete" +echo "View pull request: https://github.com/${owner}/${repo}/pull/${pr_num}" +``` diff --git a/tools/pyproject.toml b/tools/pyproject.toml index 26c68dbf0..cafbd4460 100644 --- a/tools/pyproject.toml +++ b/tools/pyproject.toml @@ -4,3 +4,13 @@ version = "0.1.0" description = "Project tools and scripts" requires-python = "==3.12.10" dependencies = ["boto3>=1.40.74", "massive>=2.0.2"] + +[tool.ruff] +lint.ignore = [ + "T201", # print statements are appropriate for CLI tools + "PTH123", # open() vs Path.open() is stylistic preference + "C901", # complexity warnings acceptable for orchestration logic + "PLR0912", # too many branches acceptable for state computation + "PLR0911", # too many returns acceptable for decision trees + "PLR2004", # magic values acceptable for CLI argument counts +] diff --git a/tools/ralph_marketplace_budget.py b/tools/ralph_marketplace_budget.py new file mode 100644 index 000000000..a25b7b5d6 --- /dev/null +++ b/tools/ralph_marketplace_budget.py @@ -0,0 +1,134 @@ +"""Ralph marketplace budget allocation. + +Handles iteration budget allocation across bidders based on weights and efficiency. +""" + +from ralph_marketplace_state import MarketplaceState + + +def allocate_budgets(state: MarketplaceState) -> dict[str, int]: + """Allocate iteration budgets to bots using fixed pool with efficiency rewards. + + The total budget pool is fixed at (num_bots * base_budget_per_bot). + Allocation is based on combined score: weight * efficiency. + This creates zero-sum competition where high performers take from low performers. 
+ + Args: + state: Current marketplace state + + Returns: + Dictionary mapping bot_id to allocated iteration budget + """ + total_budget = state.total_budget_pool + + # Calculate combined scores (weight * efficiency) + combined_scores = {} + for bot_id, bot in state.bots.items(): + combined_scores[bot_id] = bot.weight * bot.efficiency + + # Normalize to sum to 1.0 + total_combined = sum(combined_scores.values()) + if total_combined == 0: + # All bots have zero score, distribute equally + equal_budget = total_budget / len(state.bots) + return {bot_id: int(equal_budget) for bot_id in state.bots} + + normalized_scores = { + bot_id: score / total_combined for bot_id, score in combined_scores.items() + } + + # Allocate proportionally + allocations = { + bot_id: total_budget * normalized_scores[bot_id] for bot_id in state.bots + } + + # Round to integers while maintaining total sum + integer_allocations = {} + + # First pass: floor all allocations (without minimum enforcement yet) + for bot_id, allocation in allocations.items(): + integer_allocations[bot_id] = int(allocation) + + # Calculate remaining budget to distribute + total_allocated = sum(integer_allocations.values()) + remaining = total_budget - total_allocated + + # Distribute remaining iterations to highest-scoring bots + # Sort by fractional part descending (who "deserves" rounding up most) + fractional_parts = [ + (bot_id, allocations[bot_id] - integer_allocations[bot_id]) + for bot_id in allocations + ] + sorted_by_fraction = sorted(fractional_parts, key=lambda x: x[1], reverse=True) + + # Give remaining iterations based on fractional parts + for i in range(remaining): + bot_id = sorted_by_fraction[i % len(sorted_by_fraction)][0] + integer_allocations[bot_id] += 1 + + # Enforce minimum of 1 iteration per bot while maintaining zero-sum + # Identify bots with 0 allocations + zero_bots = [bot_id for bot_id, alloc in integer_allocations.items() if alloc == 0] + + if zero_bots: + # Need to reallocate from 
high-scoring bots to ensure minimum + # Sort by score to take from highest scorers + sorted_bots = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True) + + for zero_bot in zero_bots: + # Find a bot with >1 allocation to take from + for donor_bot_id, _ in sorted_bots: + if integer_allocations[donor_bot_id] > 1: + integer_allocations[donor_bot_id] -= 1 + integer_allocations[zero_bot] += 1 + break + else: + # No bot has >1, cannot enforce minimum without breaking zero-sum + # Give 1 to this bot anyway (will slightly exceed budget) + integer_allocations[zero_bot] = 1 + + return integer_allocations + + +def format_budget_allocation( + state: MarketplaceState, allocations: dict[str, int] +) -> str: + """Format budget allocation for display. + + Args: + state: Current marketplace state + allocations: Budget allocations from allocate_budgets() + + Returns: + Formatted string showing allocation details + """ + lines = [ + "Budget Allocation", + "=" * 60, + f"Total Pool: {state.total_budget_pool} iterations", + "", + f"{'Bot ID':<15} {'Weight':>8} {'Efficiency':>10} {'Budget':>10}", + "-" * 60, + ] + + # Sort by allocation descending + sorted_bots = sorted(allocations.items(), key=lambda x: x[1], reverse=True) + + for bot_id, budget in sorted_bots: + bot = state.bots[bot_id] + lines.append( + f"{bot_id:<15} {bot.weight:>8.3f} {bot.efficiency:>10.2%} {budget:>10}" + ) + + # Verify total + total_allocated = sum(allocations.values()) + lines.append("-" * 60) + lines.append(f"{'Total Allocated':<15} {' ' * 18} {total_allocated:>10}") + + if total_allocated != state.total_budget_pool: + lines.append( + f"WARNING: Total allocated ({total_allocated}) " + f"!= pool ({state.total_budget_pool})" + ) + + return "\n".join(lines) diff --git a/tools/ralph_marketplace_orchestrator.py b/tools/ralph_marketplace_orchestrator.py new file mode 100644 index 000000000..8f5505f31 --- /dev/null +++ b/tools/ralph_marketplace_orchestrator.py @@ -0,0 +1,150 @@ +"""Ralph marketplace 
orchestrator. + +Main entry point for marketplace commands: setup, status, reset. +""" + +import sys +from pathlib import Path + +from ralph_marketplace_budget import allocate_budgets, format_budget_allocation +from ralph_marketplace_state import MarketplaceStateManager + + +def setup_marketplace() -> None: + """Initialize marketplace state and configuration.""" + print("Initializing Ralph marketplace") + + ralph_dir = Path(".ralph") + state_manager = MarketplaceStateManager(ralph_dir) + + # Create configuration + config = state_manager.load_config() + state_manager.save_config(config) + + print("\nConfiguration:") + print(f" Number of bots: {config['num_bots']}") + print(f" Base budget per bot: {config['base_budget_per_bot']}") + print(f" Total budget pool: {config['num_bots'] * config['base_budget_per_bot']}") + print("\nScoring weights:") + for dimension, weight in config["scoring_weights"].items(): + print(f" {dimension}: {weight:.2f}") + print("\nWeight constraints:") + print(f" Min: {config['weight_constraints']['min']:.2f}") + print(f" Max: {config['weight_constraints']['max']:.2f}") + + # Initialize state + state = state_manager.load_state() + print(f"\nInitialized {len(state.bots)} bidders with equal weights:") + for bot_id, bot in state.bots.items(): + print(f" {bot_id}: weight={bot.weight:.3f}, efficiency={bot.efficiency:.2%}") + + print(f"\nMarketplace initialized at {ralph_dir}/") + print("Use 'mask ralph marketplace status' to view current state") + + +def display_marketplace_status() -> None: + """Display current marketplace state.""" + ralph_dir = Path(".ralph") + state_manager = MarketplaceStateManager(ralph_dir) + + try: + state = state_manager.load_state() + except FileNotFoundError: + print("Error: Marketplace not initialized") + print("Run: mask ralph marketplace setup") + sys.exit(1) + + print("Ralph Marketplace Status") + print("=" * 80) + print(f"Last Updated: {state.last_updated}") + print(f"Rounds Completed: {state.rounds_completed}") + 
print(f"Total Budget Pool: {state.total_budget_pool} iterations") + print() + + # Bot statistics + print("Bidder Statistics") + print("-" * 80) + print( + f"{'Bot ID':<15} {'Weight':>8} {'Efficiency':>10} {'Succeeded':>10} {'Failed':>8} {'Accuracy':>10}" + ) + print("-" * 80) + + # Sort by weight descending + sorted_bots = sorted(state.bots.items(), key=lambda x: x[1].weight, reverse=True) + + for bot_id, bot in sorted_bots: + print( + f"{bot_id:<15} {bot.weight:>8.3f} {bot.efficiency:>10.2%} " + f"{bot.implementations_succeeded:>10} {bot.implementations_failed:>8} " + f"{bot.average_accuracy:>10.2%}" + ) + + print() + + # Budget allocation + allocations = allocate_budgets(state) + print(format_budget_allocation(state, allocations)) + print() + + # Recent events + events = state_manager.load_events() + if events: + print("Recent Events (last 5)") + print("-" * 80) + for event in events[-5:]: + timestamp = event["timestamp"][:19] # Trim to seconds + bot_id = event["bot_id"] + outcome = event["outcome"] + weight_delta = event.get("weight_delta", 0.0) + print(f"{timestamp} {bot_id:<15} {outcome:<20} (Δw: {weight_delta:+.3f})") + else: + print("No events recorded yet") + + print() + + +def reset_marketplace() -> None: + """Reset marketplace to initial state.""" + ralph_dir = Path(".ralph") + state_manager = MarketplaceStateManager(ralph_dir) + + print("Resetting marketplace state") + print("This will:") + print(" - Remove all event history") + print(" - Reset bidder weights to equal") + print(" - Clear cached state") + print(" - Keep configuration unchanged") + print() + + state_manager.reset_state() + + print("Marketplace reset complete") + print("All bidders now have equal weights") + print("Run 'mask ralph marketplace status' to verify") + + +def main() -> None: + """Main entry point for marketplace orchestrator.""" + if len(sys.argv) < 2: + print("Usage: python ralph_marketplace_orchestrator.py [args]") + print("Commands:") + print(" setup - Initialize 
marketplace") + print(" status - Show marketplace state") + print(" reset - Reset marketplace to initial state") + sys.exit(1) + + command = sys.argv[1] + + if command == "setup": + setup_marketplace() + elif command == "status": + display_marketplace_status() + elif command == "reset": + reset_marketplace() + else: + print(f"Unknown command: {command}") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/tools/ralph_marketplace_state.py b/tools/ralph_marketplace_state.py new file mode 100644 index 000000000..d9369bf95 --- /dev/null +++ b/tools/ralph_marketplace_state.py @@ -0,0 +1,387 @@ +"""Ralph marketplace state management. + +Handles loading, saving, and computing marketplace state from append-only event log. +""" + +import json +from dataclasses import dataclass +from datetime import UTC, datetime +from pathlib import Path +from typing import Any + + +@dataclass +class BotState: + """State for a single bidder.""" + + bot_id: str + weight: float + efficiency: float + proposals_submitted: int + implementations_succeeded: int + implementations_failed: int + total_iterations_used: int + average_accuracy: float + + +@dataclass +class MarketplaceState: + """Complete marketplace state.""" + + bots: dict[str, BotState] + total_budget_pool: int + rounds_completed: int + last_updated: str + + +class MarketplaceStateManager: + """Manages marketplace state persistence and computation.""" + + def __init__(self, ralph_dir: Path = Path(".ralph")) -> None: + """Initialize state manager. + + Args: + ralph_dir: Directory containing marketplace state and events + """ + self.ralph_dir = ralph_dir + self.events_dir = ralph_dir / "events" + self.state_file = ralph_dir / "marketplace.json" + self.version_file = ralph_dir / ".state_version" + self.config_file = ralph_dir / "config.json" + + # Ensure directories exist + self.events_dir.mkdir(parents=True, exist_ok=True) + + def load_config(self) -> dict[str, Any]: + """Load marketplace configuration. 
+ + Returns: + Configuration dictionary + """ + if not self.config_file.exists(): + # Default configuration + return { + "num_bots": 3, + "base_budget_per_bot": 10, + "scoring_weights": { + "spec_alignment": 0.32, + "technical_quality": 0.22, + "innovation": 0.15, + "risk": 0.21, + "efficiency": 0.10, + }, + "weight_constraints": {"min": 0.05, "max": 0.60}, + } + + with open(self.config_file, encoding="utf-8") as f: + return json.load(f) + + def save_config(self, config: dict[str, Any]) -> None: + """Save marketplace configuration. + + Args: + config: Configuration dictionary + """ + with open(self.config_file, "w", encoding="utf-8") as f: + json.dump(config, f, indent=2) + + def load_events(self) -> list[dict[str, Any]]: + """Load all events from event log, sorted by timestamp. + + Returns: + List of event dictionaries + """ + events = [] + if not self.events_dir.exists(): + return events + + for event_file in self.events_dir.glob("*.json"): + try: + with open(event_file, encoding="utf-8") as f: + event = json.load(f) + events.append(event) + except json.JSONDecodeError as e: + # Log warning and skip corrupted file + print(f"Warning: Skipping corrupted event file {event_file.name}: {e}") + continue + + # Sort by timestamp + events.sort(key=lambda e: e["timestamp"]) + return events + + def compute_state_from_events( + self, events: list[dict[str, Any]], config: dict[str, Any] + ) -> MarketplaceState: + """Compute current state from event log. 
+ + Args: + events: List of events + config: Configuration dictionary + + Returns: + Computed marketplace state + """ + num_bots = config["num_bots"] + + # Initialize bot states with equal weights + bots = {} + for i in range(1, num_bots + 1): + bot_id = f"bidder_{i}" + bots[bot_id] = BotState( + bot_id=bot_id, + weight=1.0 / num_bots, # Equal initial weights + efficiency=1.0, # Start at perfect efficiency + proposals_submitted=0, + implementations_succeeded=0, + implementations_failed=0, + total_iterations_used=0, + average_accuracy=0.0, + ) + + # Track accuracy counts per bot for O(n) calculation + accuracy_counts = {bot_id: 0 for bot_id in bots} + + # Apply events to update state + for event in events: + bot_id = event["bot_id"] + outcome = event["outcome"] + weight_delta = event.get("weight_delta", 0.0) + + if bot_id not in bots: + print( + f"Warning: Event for unknown bot '{bot_id}' " + f"(outcome: {outcome}, weight_delta: {weight_delta})" + ) + continue + + bot = bots[bot_id] + + # Update based on outcome + # Check for success outcomes (ends with _success) + if isinstance(outcome, str) and ( + outcome.endswith("_success") or outcome == "success" + ): + bot.implementations_succeeded += 1 + # Check for failure outcomes (contains "failure" or "failed") + elif isinstance(outcome, str) and ( + "failure" in outcome or "failed" in outcome + ): + bot.implementations_failed += 1 + + # Update weight + bot.weight += weight_delta + + # Update iteration count + bot.total_iterations_used += event.get("iteration_count", 0) + + # Update accuracy tracking with O(n) running average + if "accuracy" in event: + accuracy_counts[bot_id] += 1 + count = accuracy_counts[bot_id] + # Update running average: new_avg = old_avg * (n-1)/n + new_value/n + bot.average_accuracy = ( + bot.average_accuracy * (count - 1) + event["accuracy"] + ) / count + + # Normalize weights to sum to 1.0 + total_weight = sum(bot.weight for bot in bots.values()) + if total_weight > 0: + for bot in 
bots.values(): + bot.weight = bot.weight / total_weight + + # Apply weight constraints with iterative adjustment to maintain sum=1.0 + min_weight = config["weight_constraints"]["min"] + max_weight = config["weight_constraints"]["max"] + + # Iteratively enforce constraints while maintaining normalized sum + max_iterations = 10 + for _ in range(max_iterations): + # Apply constraints + clamped = {} + excess = 0.0 + deficit = 0.0 + + for bot_id, bot in bots.items(): + original = bot.weight + clamped[bot_id] = max(min_weight, min(max_weight, original)) + + if clamped[bot_id] > original: + deficit += clamped[bot_id] - original + elif clamped[bot_id] < original: + excess += original - clamped[bot_id] + + # If no changes needed, we're done + if excess == 0 and deficit == 0: + break + + # Distribute excess to bots that need it (below min) or have room (below max) + if deficit > 0: + # Find bots that can absorb the deficit (not at max constraint) + can_absorb = [ + (bot_id, bot) + for bot_id, bot in bots.items() + if clamped[bot_id] < max_weight + ] + + if can_absorb: + # Distribute proportionally among bots that can absorb + absorb_weights = {bot_id: bot.weight for bot_id, bot in can_absorb} + total_absorb = sum(absorb_weights.values()) + + if total_absorb > 0: + for bot_id, _ in can_absorb: + proportion = absorb_weights[bot_id] / total_absorb + clamped[bot_id] = min( + max_weight, clamped[bot_id] - deficit * proportion + ) + + # Apply clamped values + for bot_id, bot in bots.items(): + bot.weight = clamped[bot_id] + + # Check if sum is close enough to 1.0 + total = sum(bot.weight for bot in bots.values()) + if abs(total - 1.0) < 1e-10: + break + + # Final normalization to ensure exact sum of 1.0 + total_weight = sum(bot.weight for bot in bots.values()) + if total_weight > 0: + for bot in bots.values(): + bot.weight = bot.weight / total_weight + + # Compute efficiency + for bot in bots.values(): + total_attempts = bot.implementations_succeeded + bot.implementations_failed + 
if total_attempts > 0: + bot.efficiency = bot.implementations_succeeded / total_attempts + else: + bot.efficiency = 1.0 # No attempts yet + + return MarketplaceState( + bots=bots, + total_budget_pool=config["num_bots"] * config["base_budget_per_bot"], + rounds_completed=len(events), + last_updated=datetime.now(UTC).isoformat(), + ) + + def load_state(self) -> MarketplaceState: + """Load marketplace state, recomputing from events if needed. + + Returns: + Current marketplace state + """ + config = self.load_config() + events = self.load_events() + + # Check if cached state is current + cached_version = 0 + if self.version_file.exists(): + cached_version = int(self.version_file.read_text().strip()) + + if self.state_file.exists() and cached_version == len(events): + # Cache is current, load it + with open(self.state_file, encoding="utf-8") as f: + state_dict = json.load(f) + bots = { + bot_id: BotState(**bot_data) + for bot_id, bot_data in state_dict["bots"].items() + } + return MarketplaceState( + bots=bots, + total_budget_pool=state_dict["total_budget_pool"], + rounds_completed=state_dict["rounds_completed"], + last_updated=state_dict["last_updated"], + ) + + # Recompute from events + state = self.compute_state_from_events(events, config) + + # Cache the computed state + self.save_state(state) + + return state + + def save_state(self, state: MarketplaceState) -> None: + """Save marketplace state to cache. 
+ + Args: + state: Marketplace state to save + """ + state_dict = { + "bots": { + bot_id: { + "bot_id": bot.bot_id, + "weight": bot.weight, + "efficiency": bot.efficiency, + "proposals_submitted": bot.proposals_submitted, + "implementations_succeeded": bot.implementations_succeeded, + "implementations_failed": bot.implementations_failed, + "total_iterations_used": bot.total_iterations_used, + "average_accuracy": bot.average_accuracy, + } + for bot_id, bot in state.bots.items() + }, + "total_budget_pool": state.total_budget_pool, + "rounds_completed": state.rounds_completed, + "last_updated": state.last_updated, + } + + with open(self.state_file, "w", encoding="utf-8") as f: + json.dump(state_dict, f, indent=2) + + # Update version file + events = self.load_events() + self.version_file.write_text(str(len(events))) + + def record_event(self, event: dict[str, Any]) -> None: + """Record a new event to the event log. + + Args: + event: Event dictionary containing outcome details + """ + if "timestamp" not in event: + event["timestamp"] = datetime.now(UTC).isoformat() + + # Create event filename + timestamp_str = event["timestamp"].replace(":", "-").replace(".", "-") + bot_id = event["bot_id"] + outcome = event["outcome"] + filename = f"{timestamp_str}-{bot_id}-{outcome}.json" + + # Write event file + event_file = self.events_dir / filename + with open(event_file, "w", encoding="utf-8") as f: + json.dump(event, f, indent=2) + + def reset_state(self) -> None: + """Reset marketplace state to initial conditions. + + Removes all events and cached state, keeping configuration. 
+ """ + # Remove all event files + if self.events_dir.exists(): + for event_file in self.events_dir.glob("*.json"): + event_file.unlink() + + # Remove cached state + if self.state_file.exists(): + self.state_file.unlink() + + # Remove version file + if self.version_file.exists(): + self.version_file.unlink() + + def get_bot_history(self, bot_id: str, last_n: int = 10) -> list[dict[str, Any]]: + """Get recent history for a specific bot. + + Args: + bot_id: Bot identifier + last_n: Number of recent events to return + + Returns: + List of recent events for the bot + """ + events = self.load_events() + bot_events = [e for e in events if e["bot_id"] == bot_id] + return bot_events[-last_n:] diff --git a/tools/ralph_marketplace_weights.py b/tools/ralph_marketplace_weights.py new file mode 100644 index 000000000..4bf4a4030 --- /dev/null +++ b/tools/ralph_marketplace_weights.py @@ -0,0 +1,163 @@ +"""Ralph marketplace weight update calculations. + +Handles weight adjustments based on proposal and implementation outcomes. 
+""" + +from typing import Literal + + +OutcomeType = Literal[ + "ranked_first_success", + "ranked_first_failure", + "ranked_second_plus_success", + "ranked_second_plus_failure", + "ranked_not_tried", + "replan_new_success", + "replan_failed_again", + "replan_resubmitted_same", +] + + +WEIGHT_DELTAS = { + "ranked_first_success": 0.10, # Proposal ranked #1, implemented successfully + "ranked_first_failure": -0.15, # Proposal ranked #1, implementation failed + "ranked_second_plus_success": 0.08, # Ranked #2+, succeeded after #1 failed + "ranked_second_plus_failure": -0.18, # Ranked #2+, tried after #1 failed, also failed + "ranked_not_tried": -0.02, # Ranked but not tried (another succeeded) + "replan_new_success": 0.12, # Replan with new proposal succeeded + "replan_failed_again": -0.20, # Replan failed again + "replan_resubmitted_same": -0.05, # Replan but resubmitted same proposal +} + + +ACCURACY_BONUS_THRESHOLD = 0.15 # Accuracy within this range gets bonus +ACCURACY_BONUS = 0.05 # Bonus for accurate prediction + + +def calculate_weight_delta( + outcome: OutcomeType, accuracy: float | None = None +) -> float: + """Calculate weight delta for a given outcome. + + Args: + outcome: Type of outcome + accuracy: Proposal accuracy (difference between proposal and implementation + scores). Only applicable for success outcomes + + Returns: + Weight delta to apply + """ + base_delta = WEIGHT_DELTAS[outcome] + + # Add accuracy bonus for successful outcomes + if ( + accuracy is not None + and outcome + in ["ranked_first_success", "ranked_second_plus_success", "replan_new_success"] + and abs(accuracy) <= ACCURACY_BONUS_THRESHOLD + ): + return base_delta + ACCURACY_BONUS + + return base_delta + + +def determine_outcome_type( + bot_id: str, + rankings: list[tuple[str, float]], + implementation_result: str, + *, + is_replan: bool = False, + resubmitted_same: bool = False, + was_implemented: bool = False, +) -> OutcomeType: + """Determine outcome type for weight calculation. 
+ + Args: + bot_id: Bot identifier + rankings: List of (bot_id, score) tuples, sorted by rank + implementation_result: "success" or "failure" + is_replan: Whether this is a replan round + resubmitted_same: Whether bot resubmitted same proposal (replan only) + was_implemented: Whether this bot's proposal was actually implemented + + Returns: + Outcome type for weight calculation + """ + # Find bot's rank + bot_rank = None + for i, (ranked_bot_id, _) in enumerate(rankings, start=1): + if ranked_bot_id == bot_id: + bot_rank = i + break + + if bot_rank is None: + message = f"Bot {bot_id} not found in rankings" + raise ValueError(message) + + # Replan scenarios + if is_replan: + if resubmitted_same: + return "replan_resubmitted_same" + if implementation_result == "success": + return "replan_new_success" + return "replan_failed_again" + + # Initial round scenarios + if bot_rank == 1: + # Top-ranked bot + if implementation_result == "success": + return "ranked_first_success" + return "ranked_first_failure" + + # Lower-ranked bots + if was_implemented: + # This bot was actually tried (because higher ranks failed) + if implementation_result == "success": + return "ranked_second_plus_success" + return "ranked_second_plus_failure" + + # Not tried (another bot succeeded) + return "ranked_not_tried" + + +def format_weight_update_summary( + bot_id: str, + old_weight: float, + new_weight: float, + outcome: OutcomeType, + delta: float, +) -> str: + """Format weight update summary for display. 
+ + Args: + bot_id: Bot identifier + old_weight: Weight before update + new_weight: Weight after update + outcome: Outcome type + delta: Weight delta applied + + Returns: + Formatted summary string + """ + outcome_descriptions = { + "ranked_first_success": "Ranked #1, implementation succeeded", + "ranked_first_failure": "Ranked #1, implementation failed", + "ranked_second_plus_success": "Ranked #2+, succeeded after higher rank failed", + "ranked_second_plus_failure": "Ranked #2+, tried after higher rank failed, also failed", + "ranked_not_tried": "Ranked but not tried (another succeeded)", + "replan_new_success": "Replan with new proposal succeeded", + "replan_failed_again": "Replan failed again", + "replan_resubmitted_same": "Replan resubmitted same proposal", + } + + lines = [ + f"Weight Update: {bot_id}", + "-" * 40, + f"Outcome: {outcome_descriptions[outcome]}", + f"Old Weight: {old_weight:.3f}", + f"Delta: {delta:+.3f}", + f"New Weight: {new_weight:.3f}", + f"Change: {((new_weight - old_weight) / old_weight * 100):+.1f}%", + ] + + return "\n".join(lines)