diff --git a/.agents/scripts/comfy-cli-helper.sh b/.agents/scripts/comfy-cli-helper.sh index f2311fef5..3b5eb7ce1 100755 --- a/.agents/scripts/comfy-cli-helper.sh +++ b/.agents/scripts/comfy-cli-helper.sh @@ -111,6 +111,10 @@ cmd_setup() { while [[ $# -gt 0 ]]; do case "$1" in --path) + if [[ $# -lt 2 ]]; then + log_error "--path requires a value" + return 1 + fi path="$2" shift 2 ;; @@ -146,10 +150,18 @@ cmd_launch() { while [[ $# -gt 0 ]]; do case "$1" in --port) + if [[ $# -lt 2 ]]; then + log_error "--port requires a value" + return 1 + fi port="$2" shift 2 ;; --listen) + if [[ $# -lt 2 ]]; then + log_error "--listen requires a value" + return 1 + fi listen="$2" shift 2 ;; @@ -239,6 +251,10 @@ cmd_snapshot_save() { while [[ $# -gt 0 ]]; do case "$1" in --output) + if [[ $# -lt 2 ]]; then + log_error "--output requires a value" + return 1 + fi output="$2" shift 2 ;; diff --git a/.agents/scripts/eeat-score-helper.sh b/.agents/scripts/eeat-score-helper.sh index 92abe030f..5427f8339 100755 --- a/.agents/scripts/eeat-score-helper.sh +++ b/.agents/scripts/eeat-score-helper.sh @@ -810,7 +810,7 @@ do_analyze() { local urls=() if [[ "$input_file" == *.json ]]; then # JSON format - extract URLs with status 200 - mapfile -t urls < <(jq -r '.[] | select(.status_code == 200) | .url' "$input_file" 2>/dev/null || echo "") + mapfile -t urls < <(jq -r '.[] | select(.status_code == 200) | .url' "$input_file" 2>/dev/null) elif [[ "$input_file" == *.csv ]]; then # CSV format - extract URLs from first column where status is 200 mapfile -t urls < <(tail -n +2 "$input_file" | awk -F',' '$2 == "200" || $2 == 200 {gsub(/"/, "", $1); print $1}') diff --git a/.agents/scripts/email-batch-convert-helper.sh b/.agents/scripts/email-batch-convert-helper.sh index 4ddcfc3fe..f65f91733 100755 --- a/.agents/scripts/email-batch-convert-helper.sh +++ b/.agents/scripts/email-batch-convert-helper.sh @@ -98,6 +98,11 @@ cmd_convert() { fi done < <(find "$input_dir" -type f \( -name "*.eml" -o -name "*.msg" \) -print0 2>/dev/null) + if [[ "$count" -gt 0 && "$success" -eq 0 ]]; then + print_error "Conversion complete: ${success}/${count} succeeded, ${failed} failed" + print_error "All conversions failed" + return 1 + fi print_success "Conversion complete: ${success}/${count} succeeded, ${failed} failed" return 0 } diff --git a/.agents/scripts/email-thread-reconstruction.py b/.agents/scripts/email-thread-reconstruction.py index b607c0145..e4f8bbe72 100755 --- a/.agents/scripts/email-thread-reconstruction.py +++ b/.agents/scripts/email-thread-reconstruction.py @@ -24,39 +24,39 @@ def parse_frontmatter(md_file): """Extract YAML frontmatter from a markdown file. - + Returns dict of metadata, or None if no frontmatter found. """ - with open(md_file, 'r', encoding='utf-8') as f: + with open(md_file, "r", encoding="utf-8") as f: content = f.read() - + # Check for YAML frontmatter delimiters - if not content.startswith('---\n'): + if not content.startswith("---\n"): return None - + # Find the closing delimiter - end_match = re.search(r'\n---\n', content[4:]) + end_match = re.search(r"\n---\n", content[4:]) if not end_match: return None - - frontmatter_text = content[4:4 + end_match.start()] - + + frontmatter_text = content[4 : 4 + end_match.start()] + # Parse YAML-like frontmatter (simple key: value pairs) metadata = {} - for line in frontmatter_text.split('\n'): - if ':' not in line: + for line in frontmatter_text.split("\n"): + if ":" not in line: continue # Handle simple key: value (not nested structures) - if line.startswith(' '): + if line.startswith(" "): continue # Skip nested items for now - key, _, value = line.partition(':') + key, _, value = line.partition(":") key = key.strip() value = value.strip() # Remove quotes if present if value.startswith('"') and value.endswith('"'): value = value[1:-1] metadata[key] = value - + return metadata @@ -64,13 +64,13 @@ def _format_field(key, value): """Format a YAML frontmatter field as 'key: value' string.""" if isinstance(value, str): return f'{key}: "{value}"' - return f'{key}: {value}' + return f"{key}: {value}" def _find_insert_point(lines): """Find insertion point for new fields (after tokens_estimate or at end).""" for i, line in enumerate(lines): - if line.startswith('tokens_estimate:'): + if line.startswith("tokens_estimate:"): return i + 1 return len(lines) @@ -78,7 +78,7 @@ def _find_insert_point(lines): def _update_existing_field(lines, key, value): """Update an existing field in frontmatter lines. Returns True if found.""" for i, line in enumerate(lines): - if line.startswith(f'{key}:'): + if line.startswith(f"{key}:"): lines[i] = _format_field(key, value) return True return False @@ -86,181 +86,183 @@ def _update_existing_field(lines, key, value): def update_frontmatter(md_file, new_fields): """Update frontmatter in a markdown file with new fields. - + Adds or updates fields in the YAML frontmatter section. """ - with open(md_file, 'r', encoding='utf-8') as f: + with open(md_file, "r", encoding="utf-8") as f: content = f.read() - - if not content.startswith('---\n'): + + if not content.startswith("---\n"): return False - + # Find the closing delimiter - end_match = re.search(r'\n---\n', content[4:]) + end_match = re.search(r"\n---\n", content[4:]) if not end_match: return False - + frontmatter_end = 4 + end_match.start() + 5 # +5 for '\n---\n' - frontmatter_text = content[4:4 + end_match.start()] + frontmatter_text = content[4 : 4 + end_match.start()] body = content[frontmatter_end:] - - lines = frontmatter_text.split('\n') + + lines = frontmatter_text.split("\n") insert_idx = _find_insert_point(lines) - + # Update existing fields or collect new ones new_lines = [] for key, value in new_fields.items(): if not _update_existing_field(lines, key, value): new_lines.append(_format_field(key, value)) - + # Insert new lines at the insertion point if new_lines: lines = lines[:insert_idx] + new_lines + lines[insert_idx:] - + # Rebuild frontmatter - new_frontmatter = '---\n' + '\n'.join(lines) + '\n---\n' + new_frontmatter = "---\n" + "\n".join(lines) + "\n---\n" new_content = new_frontmatter + body - - with open(md_file, 'w', encoding='utf-8') as f: + + with open(md_file, "w", encoding="utf-8") as f: f.write(new_content) - + return True def build_thread_graph(emails): """Build a thread graph from email metadata. - + Args: emails: list of dicts with 'file', 'message_id', 'in_reply_to', 'date_sent' - + Returns: dict mapping thread_id (root message_id) to list of emails in thread order """ # Build lookup maps by_message_id = {} for email in emails: - msg_id = email.get('message_id', '').strip() + msg_id = email.get("message_id", "").strip() if msg_id: by_message_id[msg_id] = email - + # Build parent-child relationships children = defaultdict(list) roots = [] - + for email in emails: - msg_id = email.get('message_id', '').strip() - in_reply_to = email.get('in_reply_to', '').strip() - + msg_id = email.get("message_id", "").strip() + in_reply_to = email.get("in_reply_to", "").strip() + if not msg_id: # No message_id, treat as standalone roots.append(email) continue - + if not in_reply_to or in_reply_to not in by_message_id: # Root message (no parent or parent not in dataset) roots.append(email) else: # Reply to another message children[in_reply_to].append(email) - + # Build threads by traversing from roots threads = {} - + def traverse(email, thread_list, position=0): """Recursively traverse thread tree.""" - email['thread_position'] = position + email["thread_position"] = position thread_list.append(email) - - msg_id = email.get('message_id', '') + + msg_id = email.get("message_id", "") if msg_id in children: # Sort children by date sorted_children = sorted( - children[msg_id], - key=lambda e: e.get('date_sent', '') + children[msg_id], key=lambda e: e.get("date_sent", "") ) for child in sorted_children: traverse(child, thread_list, position + 1) - + for root in roots: thread_list = [] traverse(root, thread_list) - + # Thread ID is the root message_id (or file path if no message_id) - thread_id = root.get('message_id', '') or root['file'] - + thread_id = root.get("message_id", "") or root["file"] + # Set thread_length for all emails in thread for email in thread_list: - email['thread_length'] = len(thread_list) - email['thread_id'] = thread_id - + email["thread_length"] = len(thread_list) + email["thread_id"] = thread_id + threads[thread_id] = thread_list - + return threads def generate_thread_index(threads, output_file): """Generate a thread index file listing all emails by thread. - + Format: # Email Threads Index - + ## Thread: ( messages) Thread ID: - + 1. []() - - 2. []() - - ... """ - lines = ['# Email Threads Index', ''] - lines.append(f'Generated: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}') - lines.append(f'Total threads: {len(threads)}') - lines.append('') - + lines = ["# Email Threads Index", ""] + lines.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + lines.append(f"Total threads: {len(threads)}") + lines.append("") + # Sort threads by date of first message sorted_threads = sorted( threads.items(), - key=lambda t: t[1][0].get('date_sent', '') if t[1] else '', - reverse=True # Most recent first + key=lambda t: t[1][0].get("date_sent", "") if t[1] else "", + reverse=True, # Most recent first ) - + for thread_id, emails in sorted_threads: if not emails: continue - + root = emails[0] - subject = root.get('subject', 'No Subject') - thread_length = root.get('thread_length', len(emails)) - - lines.append(f'## Thread: {subject} ({thread_length} messages)') - lines.append(f'Thread ID: `{thread_id}`') - lines.append('') - + subject = root.get("subject", "No Subject") + thread_length = root.get("thread_length", len(emails)) + + msg_word = "message" if thread_length == 1 else "messages" + lines.append(f"## Thread: {subject} ({thread_length} {msg_word})") + lines.append(f"Thread ID: `{thread_id}`") + lines.append("") + for i, email in enumerate(emails, 1): - file_path = Path(email['file']).name - email_subject = email.get('subject', 'No Subject') - from_addr = email.get('from', 'Unknown') - date_sent = email.get('date_sent', 'Unknown') - position = email.get('thread_position', i - 1) - + file_path = Path(email["file"]).name + email_subject = email.get("subject", "No Subject") + from_addr = email.get("from", "Unknown") + date_sent = email.get("date_sent", "Unknown") + position = email.get("thread_position", i - 1) + # Indent replies - indent = ' ' * position - lines.append(f'{indent}{i}. [{email_subject}]({file_path}) - {from_addr} - {date_sent}') - - lines.append('') - - with open(output_file, 'w', encoding='utf-8') as f: - f.write('\n'.join(lines)) - + indent = " " * position + lines.append( + f"{indent}{i}. [{email_subject}]({file_path}) - {from_addr} - {date_sent}" + ) + + lines.append("") + + with open(output_file, "w", encoding="utf-8") as f: + f.write("\n".join(lines)) + return output_file def reconstruct_threads(directory, output_index=None): """Reconstruct email threads from a directory of converted emails. - + Args: directory: path to directory containing .md email files output_index: path to thread index file (default: directory/thread-index.md) - + Returns: dict with 'threads', 'updated_count', 'index_file' """ @@ -268,69 +270,75 @@ def reconstruct_threads(directory, output_index=None): if not dir_path.is_dir(): print(f"ERROR: Directory not found: {directory}", file=sys.stderr) sys.exit(1) - + # Find all .md files - md_files = list(dir_path.glob('*.md')) + md_files = list(dir_path.glob("*.md")) if not md_files: print(f"WARNING: No .md files found in {directory}", file=sys.stderr) - return {'threads': {}, 'updated_count': 0, 'index_file': None} - + return {"threads": {}, "updated_count": 0, "index_file": None} + # Parse frontmatter from all files emails = [] for md_file in md_files: metadata = parse_frontmatter(md_file) if metadata: - metadata['file'] = str(md_file) + metadata["file"] = str(md_file) emails.append(metadata) - + if not emails: - print(f"WARNING: No emails with frontmatter found in {directory}", file=sys.stderr) - return {'threads': {}, 'updated_count': 0, 'index_file': None} - + print( + f"WARNING: No emails with frontmatter found in {directory}", file=sys.stderr + ) + return {"threads": {}, "updated_count": 0, "index_file": None} + # Build thread graph threads = build_thread_graph(emails) - + # Update frontmatter in all files updated_count = 0 for _tid, thread_emails in threads.items(): for email in thread_emails: new_fields = { - 'thread_id': email['thread_id'], - 'thread_position': email['thread_position'], - 'thread_length': email['thread_length'], + "thread_id": email["thread_id"], + "thread_position": email["thread_position"], + "thread_length": email["thread_length"], } - if update_frontmatter(email['file'], new_fields): + if update_frontmatter(email["file"], new_fields): updated_count += 1 - + # Generate thread index if output_index is None: - output_index = dir_path / 'thread-index.md' - + output_index = dir_path / "thread-index.md" + index_file = generate_thread_index(threads, output_index) - + return { - 'threads': threads, - 'updated_count': updated_count, - 'index_file': str(index_file) + "threads": threads, + "updated_count": updated_count, + "index_file": str(index_file), } def main(): parser = argparse.ArgumentParser( - description='Reconstruct email conversation threads from message-id chains' + description="Reconstruct email conversation threads from message-id chains" + ) + parser.add_argument("directory", help="Directory containing .md email files") + parser.add_argument( + "--output", + "-o", + help="Output thread index file (default: directory/thread-index.md)", ) - parser.add_argument('directory', help='Directory containing .md email files') - parser.add_argument('--output', '-o', help='Output thread index file (default: directory/thread-index.md)') - + args = parser.parse_args() - + result = reconstruct_threads(args.directory, args.output) - + print(f"Processed {result['updated_count']} emails") print(f"Found {len(result['threads'])} threads") - if result['index_file']: + if result["index_file"]: print(f"Thread index: {result['index_file']}") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/.agents/scripts/model-registry-helper.sh b/.agents/scripts/model-registry-helper.sh index 1a02cc838..be313f085 100755 --- a/.agents/scripts/model-registry-helper.sh +++ b/.agents/scripts/model-registry-helper.sh @@ -491,9 +491,6 @@ sync_providers() { " 2>/dev/null || echo "") # If recent successful sync, skip - if [[ -n "$opencode_synced" && "$opencode_synced" != "0" ]]; then - return 0 - fi if [[ -n "$opencode_synced" && "$opencode_synced" -gt 0 ]]; then print_info "Skipping direct API probing (OpenCode sync already discovered $opencode_synced models)" return 0 diff --git a/.agents/scripts/ocr-receipt-helper.sh b/.agents/scripts/ocr-receipt-helper.sh index c2a339c87..cb1663f8f 100755 --- a/.agents/scripts/ocr-receipt-helper.sh +++ b/.agents/scripts/ocr-receipt-helper.sh @@ -42,347 +42,347 @@ readonly DEFAULT_VAT_RATE="20" # Ensure workspace exists ensure_workspace() { - mkdir -p "$WORKSPACE_DIR" 2>/dev/null || true - return 0 + mkdir -p "$WORKSPACE_DIR" 2>/dev/null || true + return 0 } # Check if Ollama is running and GLM-OCR model is available check_ollama() { - if ! command -v ollama &>/dev/null; then - print_error "Ollama is not installed. Run: brew install ollama" - return 1 - fi + if ! command -v ollama &>/dev/null; then + print_error "Ollama is not installed. Run: brew install ollama" + return 1 + fi - if ! ollama list 2>/dev/null | grep -q "${OCR_MODEL}"; then - print_error "GLM-OCR model not found. Run: ocr-receipt-helper.sh install" - return 1 - fi + if ! ollama list 2>/dev/null | grep -q "${OCR_MODEL}"; then + print_error "GLM-OCR model not found. Run: ocr-receipt-helper.sh install" + return 1 + fi - return 0 + return 0 } # Check if document-extraction venv is available check_extraction_venv() { - if [[ -d "${VENV_DIR}/bin" ]]; then - return 0 - fi - print_error "Document extraction venv not found at ${VENV_DIR}" - print_info "Run: document-extraction-helper.sh install --core" - return 1 + if [[ -d "${VENV_DIR}/bin" ]]; then + return 0 + fi + print_error "Document extraction venv not found at ${VENV_DIR}" + print_info "Run: document-extraction-helper.sh install --core" + return 1 } # Activate the document-extraction venv activate_venv() { - if [[ -d "${VENV_DIR}/bin" ]]; then - source "${VENV_DIR}/bin/activate" - return 0 - fi - return 1 + if [[ -d "${VENV_DIR}/bin" ]]; then + source "${VENV_DIR}/bin/activate" + return 0 + fi + return 1 } # Detect file type (image vs PDF vs document) detect_file_type() { - local file="$1" - local ext="${file##*.}" - ext="$(echo "$ext" | tr '[:upper:]' '[:lower:]')" - - case "$ext" in - png|jpg|jpeg|tiff|bmp|webp|heic) - echo "image" - ;; - pdf) - echo "pdf" - ;; - docx|xlsx|pptx|html|htm) - echo "document" - ;; - *) - echo "unknown" - ;; - esac - return 0 + local file="$1" + local ext="${file##*.}" + ext="$(echo "$ext" | tr '[:upper:]' '[:lower:]')" + + case "$ext" in + png | jpg | jpeg | tiff | bmp | webp | heic) + echo "image" + ;; + pdf) + echo "pdf" + ;; + docx | xlsx | pptx | html | htm) + echo "document" + ;; + *) + echo "unknown" + ;; + esac + return 0 } # Auto-detect document type (invoice vs receipt) from OCR text detect_document_type() { - local text="$1" - local lower_text - lower_text="$(echo "$text" | tr '[:upper:]' '[:lower:]')" - - # Invoice indicators: invoice number, due date, payment terms, PO number - local invoice_score=0 - if echo "$lower_text" | grep -qE "invoice\s*(no|number|#|:)"; then - invoice_score=$((invoice_score + 3)) - fi - if echo "$lower_text" | grep -qE "due\s*date|payment\s*terms|net\s*[0-9]+"; then - invoice_score=$((invoice_score + 2)) - fi - if echo "$lower_text" | grep -qE "purchase\s*order|p\.?o\.?\s*(no|number|#)"; then - invoice_score=$((invoice_score + 2)) - fi - if echo "$lower_text" | grep -qE "bill\s*to|ship\s*to|remit\s*to"; then - invoice_score=$((invoice_score + 1)) - fi - - # Receipt indicators: receipt, cash, card, change, thank you - local receipt_score=0 - if echo "$lower_text" | grep -qE "receipt|till|register"; then - receipt_score=$((receipt_score + 3)) - fi - if echo "$lower_text" | grep -qE "cash|card|visa|mastercard|amex|contactless|chip"; then - receipt_score=$((receipt_score + 2)) - fi - if echo "$lower_text" | grep -qE "change\s*due|thank\s*you|have\s*a\s*nice"; then - receipt_score=$((receipt_score + 2)) - fi - if echo "$lower_text" | grep -qE "subtotal|sub\s*total"; then - receipt_score=$((receipt_score + 1)) - fi - - if [[ "$invoice_score" -gt "$receipt_score" ]]; then - echo "invoice" - elif [[ "$receipt_score" -gt "$invoice_score" ]]; then - echo "receipt" - else - # Default to invoice (more structured, safer assumption) - echo "invoice" - fi - return 0 + local text="$1" + local lower_text + lower_text="$(echo "$text" | tr '[:upper:]' '[:lower:]')" + + # Invoice indicators: invoice number, due date, payment terms, PO number + local invoice_score=0 + if echo "$lower_text" | grep -qE "invoice\s*(no|number|#|:)"; then + invoice_score=$((invoice_score + 3)) + fi + if echo "$lower_text" | grep -qE "due\s*date|payment\s*terms|net\s*[0-9]+"; then + invoice_score=$((invoice_score + 2)) + fi + if echo "$lower_text" | grep -qE "purchase\s*order|p\.?o\.?\s*(no|number|#)"; then + invoice_score=$((invoice_score + 2)) + fi + if echo "$lower_text" | grep -qE "bill\s*to|ship\s*to|remit\s*to"; then + invoice_score=$((invoice_score + 1)) + fi + + # Receipt indicators: receipt, cash, card, change, thank you + local receipt_score=0 + if echo "$lower_text" | grep -qE "receipt|till|register"; then + receipt_score=$((receipt_score + 3)) + fi + if echo "$lower_text" | grep -qE "cash|card|visa|mastercard|amex|contactless|chip"; then + receipt_score=$((receipt_score + 2)) + fi + if echo "$lower_text" | grep -qE "change\s*due|thank\s*you|have\s*a\s*nice"; then + receipt_score=$((receipt_score + 2)) + fi + if echo "$lower_text" | grep -qE "subtotal|sub\s*total"; then + receipt_score=$((receipt_score + 1)) + fi + + if [[ "$invoice_score" -gt "$receipt_score" ]]; then + echo "invoice" + elif [[ "$receipt_score" -gt "$invoice_score" ]]; then + echo "receipt" + else + # Default to invoice (more structured, safer assumption) + echo "invoice" + fi + return 0 } # OCR scan using GLM-OCR via Ollama cmd_scan() { - local input_file="$1" - local output_format="${2:-text}" - - validate_file_exists "$input_file" "Input file" || return 1 - check_ollama || return 1 - ensure_workspace - - local file_type - file_type="$(detect_file_type "$input_file")" - - local basename - basename="$(basename "$input_file" | sed 's/\.[^.]*$//')" - - case "$file_type" in - image) - print_info "OCR scanning image: ${input_file}" - local ocr_text - ocr_text="$(ollama run "$OCR_MODEL" "Extract all text from this receipt or invoice exactly as written. Include all amounts, dates, item descriptions, and totals." --images "$input_file" 2>/dev/null)" || { - print_error "OCR scan failed" - return 1 - } - ;; - pdf) - print_info "OCR scanning PDF: ${input_file} (converting pages to images first)" - if ! command -v magick &>/dev/null && ! command -v convert &>/dev/null; then - print_error "ImageMagick required for PDF OCR. Install: brew install imagemagick" - return 1 - fi - - local tmp_dir - tmp_dir="$(mktemp -d)" - # Use magick (ImageMagick 7) or convert (ImageMagick 6) - if command -v magick &>/dev/null; then - magick -density 300 "$input_file" -quality 90 "${tmp_dir}/page-%03d.png" 2>/dev/null || { - print_error "PDF to image conversion failed" - rm -rf "$tmp_dir" - return 1 - } - else - convert -density 300 "$input_file" -quality 90 "${tmp_dir}/page-%03d.png" 2>/dev/null || { - print_error "PDF to image conversion failed" - rm -rf "$tmp_dir" - return 1 - } - fi - - local ocr_text="" - local page_num=0 - for page in "${tmp_dir}"/page-*.png; do - [[ -f "$page" ]] || continue - page_num=$((page_num + 1)) - print_info " OCR page ${page_num}..." - local page_text - page_text="$(ollama run "$OCR_MODEL" "Extract all text from this receipt or invoice exactly as written." --images "$page" 2>/dev/null)" || continue - if [[ -n "$ocr_text" ]]; then - ocr_text="${ocr_text}\n\n--- Page ${page_num} ---\n\n${page_text}" - else - ocr_text="$page_text" - fi - done - rm -rf "$tmp_dir" - - if [[ -z "$ocr_text" ]]; then - print_error "No text extracted from PDF" - return 1 - fi - ;; - document) - print_info "For document files, use: document-extraction-helper.sh extract ${input_file}" - print_info "Or use: ocr-receipt-helper.sh extract ${input_file} (structured extraction)" - return 1 - ;; - *) - print_error "Unsupported file type: ${input_file}" - return 1 - ;; - esac - - # Output - case "$output_format" in - text) - echo "$ocr_text" - ;; - json) - local doc_type - doc_type="$(detect_document_type "$ocr_text")" - local output_file="${WORKSPACE_DIR}/${basename}-ocr.json" - printf '{\n "source_file": "%s",\n "detected_type": "%s",\n "ocr_text": %s\n}\n' \ - "$input_file" "$doc_type" "$(echo "$ocr_text" | python3 -c 'import json,sys; print(json.dumps(sys.stdin.read()))')" \ - > "$output_file" - print_success "OCR output: ${output_file}" - echo "$ocr_text" - ;; - markdown) - local doc_type - doc_type="$(detect_document_type "$ocr_text")" - echo "# OCR Scan: ${basename}" - echo "" - echo "**Source**: ${input_file}" - echo "**Detected type**: ${doc_type}" - echo "" - echo "## Extracted Text" - echo "" - echo "$ocr_text" - ;; - esac - - return 0 + local input_file="$1" + local output_format="${2:-text}" + + validate_file_exists "$input_file" "Input file" || return 1 + check_ollama || return 1 + ensure_workspace + + local file_type + file_type="$(detect_file_type "$input_file")" + + local basename + basename="$(basename "$input_file" | sed 's/\.[^.]*$//')" + + case "$file_type" in + image) + print_info "OCR scanning image: ${input_file}" + local ocr_text + ocr_text="$(ollama run "$OCR_MODEL" "Extract all text from this receipt or invoice exactly as written. Include all amounts, dates, item descriptions, and totals." --images "$input_file" 2>/dev/null)" || { + print_error "OCR scan failed" + return 1 + } + ;; + pdf) + print_info "OCR scanning PDF: ${input_file} (converting pages to images first)" + if ! command -v magick &>/dev/null && ! command -v convert &>/dev/null; then + print_error "ImageMagick required for PDF OCR. Install: brew install imagemagick" + return 1 + fi + + local tmp_dir + tmp_dir="$(mktemp -d)" + # Use magick (ImageMagick 7) or convert (ImageMagick 6) + if command -v magick &>/dev/null; then + magick -density 300 "$input_file" -quality 90 "${tmp_dir}/page-%03d.png" 2>/dev/null || { + print_error "PDF to image conversion failed" + rm -rf "$tmp_dir" + return 1 + } + else + convert -density 300 "$input_file" -quality 90 "${tmp_dir}/page-%03d.png" 2>/dev/null || { + print_error "PDF to image conversion failed" + rm -rf "$tmp_dir" + return 1 + } + fi + + local ocr_text="" + local page_num=0 + for page in "${tmp_dir}"/page-*.png; do + [[ -f "$page" ]] || continue + page_num=$((page_num + 1)) + print_info " OCR page ${page_num}..." + local page_text + page_text="$(ollama run "$OCR_MODEL" "Extract all text from this receipt or invoice exactly as written." --images "$page" 2>/dev/null)" || continue + if [[ -n "$ocr_text" ]]; then + ocr_text="${ocr_text}\n\n--- Page ${page_num} ---\n\n${page_text}" + else + ocr_text="$page_text" + fi + done + rm -rf "$tmp_dir" + + if [[ -z "$ocr_text" ]]; then + print_error "No text extracted from PDF" + return 1 + fi + ;; + document) + print_info "For document files, use: document-extraction-helper.sh extract ${input_file}" + print_info "Or use: ocr-receipt-helper.sh extract ${input_file} (structured extraction)" + return 1 + ;; + *) + print_error "Unsupported file type: ${input_file}" + return 1 + ;; + esac + + # Output + case "$output_format" in + text) + echo "$ocr_text" + ;; + json) + local doc_type + doc_type="$(detect_document_type "$ocr_text")" + local output_file="${WORKSPACE_DIR}/${basename}-ocr.json" + printf '{\n "source_file": "%s",\n "detected_type": "%s",\n "ocr_text": %s\n}\n' \ + "$input_file" "$doc_type" "$(echo "$ocr_text" | python3 -c 'import json,sys; print(json.dumps(sys.stdin.read()))')" \ + >"$output_file" + print_success "OCR output: ${output_file}" + echo "$ocr_text" + ;; + markdown) + local doc_type + doc_type="$(detect_document_type "$ocr_text")" + echo "# OCR Scan: ${basename}" + echo "" + echo "**Source**: ${input_file}" + echo "**Detected type**: ${doc_type}" + echo "" + echo "## Extracted Text" + echo "" + echo "$ocr_text" + ;; + esac + + return 0 } # Structured extraction using Docling + ExtractThinker cmd_extract() { - local input_file="$1" - local doc_type="${2:-auto}" - local privacy="${3:-local}" - local output_format="${4:-json}" - - validate_file_exists "$input_file" "Input file" || return 1 - ensure_workspace - - local file_type - file_type="$(detect_file_type "$input_file")" - - # For images, use GLM-OCR first then parse the text with LLM - if [[ "$file_type" == "image" ]]; then - check_ollama || return 1 - print_info "Step 1/2: OCR scanning image..." - local ocr_text - ocr_text="$(ollama run "$OCR_MODEL" "Extract all text from this receipt or invoice exactly as written. Include all amounts, dates, item descriptions, and totals." --images "$input_file" 2>/dev/null)" || { - print_error "OCR scan failed" - return 1 - } - - # Auto-detect type if needed - if [[ "$doc_type" == "auto" ]]; then - doc_type="$(detect_document_type "$ocr_text")" - print_info "Auto-detected document type: ${doc_type}" - fi - - print_info "Step 2/2: Extracting structured data from OCR text..." - extract_from_text "$ocr_text" "$doc_type" "$privacy" "$input_file" - return $? - fi - - # For PDFs and documents, use document-extraction-helper.sh if available - if [[ "$file_type" == "pdf" ]] || [[ "$file_type" == "document" ]]; then - # Auto-detect type: do a quick OCR scan first - if [[ "$doc_type" == "auto" ]]; then - if [[ "$file_type" == "pdf" ]] && check_ollama 2>/dev/null; then - # Quick OCR of first page for type detection - local tmp_dir - tmp_dir="$(mktemp -d)" - if command -v magick &>/dev/null; then - magick -density 150 "${input_file}[0]" -quality 80 "${tmp_dir}/page-000.png" 2>/dev/null - elif command -v convert &>/dev/null; then - convert -density 150 "${input_file}[0]" -quality 80 "${tmp_dir}/page-000.png" 2>/dev/null - fi - if [[ -f "${tmp_dir}/page-000.png" ]]; then - local quick_text - quick_text="$(ollama run "$OCR_MODEL" "Extract all text" --images "${tmp_dir}/page-000.png" 2>/dev/null)" || true - if [[ -n "${quick_text:-}" ]]; then - doc_type="$(detect_document_type "$quick_text")" - print_info "Auto-detected document type: ${doc_type}" - fi - fi - rm -rf "$tmp_dir" - fi - # Fallback to invoice if detection failed - if [[ "$doc_type" == "auto" ]]; then - doc_type="invoice" - print_info "Defaulting to document type: invoice" - fi - fi - - # Use document-extraction-helper.sh for structured extraction - local schema_name="$doc_type" - if [[ -x "${SCRIPT_DIR}/document-extraction-helper.sh" ]]; then - print_info "Extracting structured data via document-extraction-helper.sh..." - "${SCRIPT_DIR}/document-extraction-helper.sh" extract "$input_file" \ - --schema "$schema_name" --privacy "$privacy" --output "$output_format" - return $? - else - print_error "document-extraction-helper.sh not found" - print_info "Falling back to OCR scan..." - cmd_scan "$input_file" "$output_format" - return $? - fi - fi - - print_error "Unsupported file type for extraction" - return 1 + local input_file="$1" + local doc_type="${2:-auto}" + local privacy="${3:-local}" + local output_format="${4:-json}" + + validate_file_exists "$input_file" "Input file" || return 1 + ensure_workspace + + local file_type + file_type="$(detect_file_type "$input_file")" + + # For images, use GLM-OCR first then parse the text with LLM + if [[ "$file_type" == "image" ]]; then + check_ollama || return 1 + print_info "Step 1/2: OCR scanning image..." + local ocr_text + ocr_text="$(ollama run "$OCR_MODEL" "Extract all text from this receipt or invoice exactly as written. Include all amounts, dates, item descriptions, and totals." --images "$input_file" 2>/dev/null)" || { + print_error "OCR scan failed" + return 1 + } + + # Auto-detect type if needed + if [[ "$doc_type" == "auto" ]]; then + doc_type="$(detect_document_type "$ocr_text")" + print_info "Auto-detected document type: ${doc_type}" + fi + + print_info "Step 2/2: Extracting structured data from OCR text..." + extract_from_text "$ocr_text" "$doc_type" "$privacy" "$input_file" + return $? + fi + + # For PDFs and documents, use document-extraction-helper.sh if available + if [[ "$file_type" == "pdf" ]] || [[ "$file_type" == "document" ]]; then + # Auto-detect type: do a quick OCR scan first + if [[ "$doc_type" == "auto" ]]; then + if [[ "$file_type" == "pdf" ]] && check_ollama 2>/dev/null; then + # Quick OCR of first page for type detection + local tmp_dir + tmp_dir="$(mktemp -d)" + if command -v magick &>/dev/null; then + magick -density 150 "${input_file}[0]" -quality 80 "${tmp_dir}/page-000.png" 2>/dev/null + elif command -v convert &>/dev/null; then + convert -density 150 "${input_file}[0]" -quality 80 "${tmp_dir}/page-000.png" 2>/dev/null + fi + if [[ -f "${tmp_dir}/page-000.png" ]]; then + local quick_text + quick_text="$(ollama run "$OCR_MODEL" "Extract all text" --images "${tmp_dir}/page-000.png" 2>/dev/null)" || true + if [[ -n "${quick_text:-}" ]]; then + doc_type="$(detect_document_type "$quick_text")" + print_info "Auto-detected document type: ${doc_type}" + fi + fi + rm -rf "$tmp_dir" + fi + # Fallback to invoice if detection failed + if [[ "$doc_type" == "auto" ]]; then + doc_type="invoice" + print_info "Defaulting to document type: invoice" + fi + fi + + # Use document-extraction-helper.sh for structured extraction + local schema_name="$doc_type" + if [[ -x "${SCRIPT_DIR}/document-extraction-helper.sh" ]]; then + print_info "Extracting structured data via document-extraction-helper.sh..." + "${SCRIPT_DIR}/document-extraction-helper.sh" extract "$input_file" \ + --schema "$schema_name" --privacy "$privacy" --output "$output_format" + return $? + else + print_error "document-extraction-helper.sh not found" + print_info "Falling back to OCR scan..." + cmd_scan "$input_file" "$output_format" + return $? + fi + fi + + print_error "Unsupported file type for extraction" + return 1 } # Extract structured data from OCR text using an LLM extract_from_text() { - local ocr_text="$1" - local doc_type="$2" - local privacy="$3" - local source_file="${4:-unknown}" - - local basename - basename="$(basename "$source_file" | sed 's/\.[^.]*$//')" - local output_file="${WORKSPACE_DIR}/${basename}-extracted.json" - - # Determine LLM backend - local llm_model - case "$privacy" in - local|none) - if command -v ollama &>/dev/null; then - llm_model="llama3.2" - else - print_error "Ollama required for local privacy mode" - return 1 - fi - ;; - edge) - llm_model="cloudflare" - ;; - cloud) - llm_model="cloud" - ;; - *) - print_error "Unknown privacy mode: ${privacy}" - return 1 - ;; - esac - - # Build extraction prompt based on document type (aligned with Pydantic schemas) - local extraction_prompt - if [[ "$doc_type" == "invoice" ]]; then - extraction_prompt="Extract the following fields from this invoice text as JSON. Use null for missing fields. + local ocr_text="$1" + local doc_type="$2" + local privacy="$3" + local source_file="${4:-unknown}" + + local basename + basename="$(basename "$source_file" | sed 's/\.[^.]*$//')" + local output_file="${WORKSPACE_DIR}/${basename}-extracted.json" + + # Determine LLM backend + local llm_model + case "$privacy" in + local | none) + if command -v ollama &>/dev/null; then + llm_model="llama3.2" + else + print_error "Ollama required for local privacy mode" + return 1 + fi + ;; + edge) + llm_model="cloudflare" + ;; + cloud) + llm_model="cloud" + ;; + *) + print_error "Unknown privacy mode: ${privacy}" + return 1 + ;; + esac + + # Build extraction prompt based on document type (aligned with Pydantic schemas) + local extraction_prompt + if [[ "$doc_type" == "invoice" ]]; then + extraction_prompt="Extract the following fields from this invoice text as JSON. Use null for missing fields. All dates must be in YYYY-MM-DD format. All amounts must be numbers (not strings). Fields: @@ -405,8 +405,8 @@ Return ONLY valid JSON, no explanation. Invoice text: ${ocr_text}" - else - extraction_prompt="Extract the following fields from this receipt text as JSON. Use null for missing fields. + else + extraction_prompt="Extract the following fields from this receipt text as JSON. Use null for missing fields. All dates must be in YYYY-MM-DD format. All amounts must be numbers (not strings). Fields: @@ -428,170 +428,170 @@ Return ONLY valid JSON, no explanation. Receipt text: ${ocr_text}" - fi - - print_info "Parsing ${doc_type} with LLM (privacy: ${privacy})..." - - local extracted_json - if [[ "$llm_model" == "llama3.2" ]]; then - extracted_json="$(echo "$extraction_prompt" | ollama run llama3.2 2>/dev/null)" || { - print_error "LLM extraction failed" - return 1 - } - elif [[ "$llm_model" == "cloudflare" ]] || [[ "$llm_model" == "cloud" ]]; then - # For edge/cloud modes, fall back to document-extraction-helper.sh - # which handles API key management - print_warning "Edge/cloud extraction requires document-extraction-helper.sh" - print_info "Falling back to local Ollama extraction..." - if command -v ollama &>/dev/null; then - extracted_json="$(echo "$extraction_prompt" | ollama run llama3.2 2>/dev/null)" || { - print_error "LLM extraction failed" - return 1 - } - else - print_error "No LLM backend available" - return 1 - fi - fi - - # Clean up the JSON (strip markdown code fences if present) - extracted_json="$(echo "$extracted_json" | sed -n '/^[{[]/,/^[}\]]/p')" - - # Validate JSON - if ! echo "$extracted_json" | python3 -m json.tool > /dev/null 2>&1; then - print_warning "LLM returned invalid JSON. Saving raw output." - local raw_file="${WORKSPACE_DIR}/${basename}-raw.txt" - echo "$extracted_json" > "$raw_file" - print_info "Raw output saved to: ${raw_file}" - # Attempt to wrap in a basic structure - printf '{\n "source_file": "%s",\n "document_type": "%s",\n "extraction_status": "partial",\n "raw_text": %s\n}\n' \ - "$source_file" "$doc_type" "$(echo "$ocr_text" | python3 -c 'import json,sys; print(json.dumps(sys.stdin.read()))')" \ - > "$output_file" - else - # Write raw extraction first - local raw_output_file="${WORKSPACE_DIR}/${basename}-raw-extracted.json" - printf '{\n "source_file": "%s",\n "document_type": "%s",\n "extraction_status": "complete",\n "data": %s\n}\n' \ - "$source_file" "$doc_type" "$extracted_json" \ - > "$raw_output_file" - - # Run validation pipeline if available - if [[ -f "$PIPELINE_PY" ]]; then - local pipeline_type - if [[ "$doc_type" == "invoice" ]]; then - pipeline_type="purchase_invoice" - else - pipeline_type="expense_receipt" - fi - print_info "Running validation pipeline..." - if python3 "$PIPELINE_PY" validate "$raw_output_file" --type "$pipeline_type" > "$output_file" 2>/dev/null; then - print_info "Validation complete" - else - # Validation ran but flagged issues (exit code 2) - output is still valid - if [[ $? -eq 2 ]]; then - print_warning "Extraction requires manual review (see validation.warnings)" - else - # Validation failed entirely, use raw output - print_warning "Validation pipeline failed, using raw extraction" - cp "$raw_output_file" "$output_file" - fi - fi - else - cp "$raw_output_file" "$output_file" - fi - fi - - # Display the extracted data - python3 -m json.tool "$output_file" 2>/dev/null || cat "$output_file" - print_success "Extracted data saved to: ${output_file}" - return 0 + fi + + print_info "Parsing ${doc_type} with LLM (privacy: ${privacy})..." + + local extracted_json + if [[ "$llm_model" == "llama3.2" ]]; then + extracted_json="$(echo "$extraction_prompt" | ollama run llama3.2 2>/dev/null)" || { + print_error "LLM extraction failed" + return 1 + } + elif [[ "$llm_model" == "cloudflare" ]] || [[ "$llm_model" == "cloud" ]]; then + # For edge/cloud modes, fall back to document-extraction-helper.sh + # which handles API key management + print_warning "Edge/cloud extraction requires document-extraction-helper.sh" + print_info "Falling back to local Ollama extraction..." + if command -v ollama &>/dev/null; then + extracted_json="$(echo "$extraction_prompt" | ollama run llama3.2 2>/dev/null)" || { + print_error "LLM extraction failed" + return 1 + } + else + print_error "No LLM backend available" + return 1 + fi + fi + + # Clean up the JSON (strip markdown code fences if present) + extracted_json="$(echo "$extracted_json" | sed -n '/^[{[]/,/^[}\]]/p')" + + # Validate JSON + if ! echo "$extracted_json" | python3 -m json.tool >/dev/null 2>&1; then + print_warning "LLM returned invalid JSON. Saving raw output." + local raw_file="${WORKSPACE_DIR}/${basename}-raw.txt" + echo "$extracted_json" >"$raw_file" + print_info "Raw output saved to: ${raw_file}" + # Attempt to wrap in a basic structure + printf '{\n "source_file": "%s",\n "document_type": "%s",\n "extraction_status": "partial",\n "raw_text": %s\n}\n' \ + "$source_file" "$doc_type" "$(echo "$ocr_text" | python3 -c 'import json,sys; print(json.dumps(sys.stdin.read()))')" \ + >"$output_file" + else + # Write raw extraction first + local raw_output_file="${WORKSPACE_DIR}/${basename}-raw-extracted.json" + printf '{\n "source_file": "%s",\n "document_type": "%s",\n "extraction_status": "complete",\n "data": %s\n}\n' \ + "$source_file" "$doc_type" "$extracted_json" \ + >"$raw_output_file" + + # Run validation pipeline if available + if [[ -f "$PIPELINE_PY" ]]; then + local pipeline_type + if [[ "$doc_type" == "invoice" ]]; then + pipeline_type="purchase_invoice" + else + pipeline_type="expense_receipt" + fi + print_info "Running validation pipeline..." + local validate_rc=0 + python3 "$PIPELINE_PY" validate "$raw_output_file" --type "$pipeline_type" >"$output_file" || validate_rc=$? + if [[ "$validate_rc" -eq 0 ]]; then + print_info "Validation complete" + elif [[ "$validate_rc" -eq 2 ]]; then + # Validation ran but flagged issues (exit code 2) - output is still valid + print_warning "Extraction requires manual review (see validation.warnings)" + else + # Validation failed entirely, use raw output + print_warning "Validation pipeline failed, using raw extraction" + cp "$raw_output_file" "$output_file" + fi + else + cp "$raw_output_file" "$output_file" + fi + fi + + # Display the extracted data + python3 -m json.tool "$output_file" 2>/dev/null || cat "$output_file" + print_success "Extracted data saved to: ${output_file}" + return 0 } # Batch process a directory cmd_batch() { - local input_dir="$1" - local doc_type="${2:-auto}" - local privacy="${3:-local}" - - if [[ ! -d "$input_dir" ]]; then - print_error "Directory not found: ${input_dir}" - return 1 - fi - - ensure_workspace - - local count=0 - local failed=0 - local supported_extensions="png jpg jpeg tiff bmp webp heic pdf" - - print_info "Batch processing receipts/invoices from: ${input_dir}" - echo "" - - for file in "${input_dir}"/*; do - [[ -f "$file" ]] || continue - - local ext="${file##*.}" - ext="$(echo "$ext" | tr '[:upper:]' '[:lower:]')" - - # Check if extension is supported - local supported=0 - for supported_ext in $supported_extensions; do - if [[ "$ext" == "$supported_ext" ]]; then - supported=1 - break - fi - done - - if [[ "$supported" -eq 0 ]]; then - continue - fi - - echo "---" - print_info "Processing: $(basename "$file")" - if cmd_extract "$file" "$doc_type" "$privacy" "json"; then - count=$((count + 1)) - else - failed=$((failed + 1)) - fi - echo "" - done - - echo "===" - print_success "Batch complete: ${count} succeeded, ${failed} failed" - print_info "Output directory: ${WORKSPACE_DIR}" - return 0 + local input_dir="$1" + local doc_type="${2:-auto}" + local privacy="${3:-local}" + + if [[ ! -d "$input_dir" ]]; then + print_error "Directory not found: ${input_dir}" + return 1 + fi + + ensure_workspace + + local count=0 + local failed=0 + local supported_extensions="png jpg jpeg tiff bmp webp heic pdf" + + print_info "Batch processing receipts/invoices from: ${input_dir}" + echo "" + + for file in "${input_dir}"/*; do + [[ -f "$file" ]] || continue + + local ext="${file##*.}" + ext="$(echo "$ext" | tr '[:upper:]' '[:lower:]')" + + # Check if extension is supported + local supported=0 + for supported_ext in $supported_extensions; do + if [[ "$ext" == "$supported_ext" ]]; then + supported=1 + break + fi + done + + if [[ "$supported" -eq 0 ]]; then + continue + fi + + echo "---" + print_info "Processing: $(basename "$file")" + if cmd_extract "$file" "$doc_type" "$privacy" "json"; then + count=$((count + 1)) + else + failed=$((failed + 1)) + fi + echo "" + done + + echo "===" + print_success "Batch complete: ${count} succeeded, ${failed} failed" + print_info "Output directory: ${WORKSPACE_DIR}" + return 0 } # Preview what would be sent to QuickFile (dry run) cmd_preview() { - local input_file="$1" - local doc_type="${2:-auto}" - local privacy="${3:-local}" - local supplier_override="${4:-}" - local nominal_code="${5:-${DEFAULT_NOMINAL_CODE}}" - local currency="${6:-${DEFAULT_CURRENCY}}" - local vat_rate="${7:-${DEFAULT_VAT_RATE}}" - - validate_file_exists "$input_file" "Input file" || return 1 - ensure_workspace - - # Extract data first - print_info "Extracting data for QuickFile preview..." - cmd_extract "$input_file" "$doc_type" "$privacy" "json" || return 1 - - local basename - basename="$(basename "$input_file" | sed 's/\.[^.]*$//')" - local extracted_file="${WORKSPACE_DIR}/${basename}-extracted.json" - - if [[ ! -f "$extracted_file" ]]; then - print_error "Extraction output not found" - return 1 - fi - - # Build QuickFile purchase invoice preview - print_info "QuickFile Purchase Invoice Preview:" - echo "" - - python3 -c " + local input_file="$1" + local doc_type="${2:-auto}" + local privacy="${3:-local}" + local supplier_override="${4:-}" + local nominal_code="${5:-${DEFAULT_NOMINAL_CODE}}" + local currency="${6:-${DEFAULT_CURRENCY}}" + local vat_rate="${7:-${DEFAULT_VAT_RATE}}" + + validate_file_exists "$input_file" "Input file" || return 1 + ensure_workspace + + # Extract data first + print_info "Extracting data for QuickFile preview..." + cmd_extract "$input_file" "$doc_type" "$privacy" "json" || return 1 + + local basename + basename="$(basename "$input_file" | sed 's/\.[^.]*$//')" + local extracted_file="${WORKSPACE_DIR}/${basename}-extracted.json" + + if [[ ! -f "$extracted_file" ]]; then + print_error "Extraction output not found" + return 1 + fi + + # Build QuickFile purchase invoice preview + print_info "QuickFile Purchase Invoice Preview:" + echo "" + + python3 -c " import json import sys @@ -663,46 +663,46 @@ print() print(' QuickFile API call: quickfile_purchase_create') print(' Supplier lookup: quickfile_supplier_search -> quickfile_supplier_create (if new)') " 2>/dev/null || { - print_error "Preview generation failed" - return 1 - } + print_error "Preview generation failed" + return 1 + } - print_info "To create this purchase invoice, run:" - echo " ocr-receipt-helper.sh quickfile ${input_file}" - return 0 + print_info "To create this purchase invoice, run:" + echo " ocr-receipt-helper.sh quickfile ${input_file}" + return 0 } # Extract and create QuickFile purchase invoice cmd_quickfile() { - local input_file="$1" - local doc_type="${2:-auto}" - local privacy="${3:-local}" - local supplier_override="${4:-}" - local nominal_code="${5:-${DEFAULT_NOMINAL_CODE}}" - local currency="${6:-${DEFAULT_CURRENCY}}" - local vat_rate="${7:-${DEFAULT_VAT_RATE}}" - - validate_file_exists "$input_file" "Input file" || return 1 - ensure_workspace - - # Extract data first - print_info "Step 1/3: Extracting receipt/invoice data..." - cmd_extract "$input_file" "$doc_type" "$privacy" "json" || return 1 - - local basename - basename="$(basename "$input_file" | sed 's/\.[^.]*$//')" - local extracted_file="${WORKSPACE_DIR}/${basename}-extracted.json" - - if [[ ! -f "$extracted_file" ]]; then - print_error "Extraction output not found" - return 1 - fi - - # Generate QuickFile-ready JSON - print_info "Step 2/3: Preparing QuickFile purchase invoice..." - local qf_file="${WORKSPACE_DIR}/${basename}-quickfile.json" - - python3 -c " + local input_file="$1" + local doc_type="${2:-auto}" + local privacy="${3:-local}" + local supplier_override="${4:-}" + local nominal_code="${5:-${DEFAULT_NOMINAL_CODE}}" + local currency="${6:-${DEFAULT_CURRENCY}}" + local vat_rate="${7:-${DEFAULT_VAT_RATE}}" + + validate_file_exists "$input_file" "Input file" || return 1 + ensure_workspace + + # Extract data first + print_info "Step 1/3: Extracting receipt/invoice data..." + cmd_extract "$input_file" "$doc_type" "$privacy" "json" || return 1 + + local basename + basename="$(basename "$input_file" | sed 's/\.[^.]*$//')" + local extracted_file="${WORKSPACE_DIR}/${basename}-extracted.json" + + if [[ ! -f "$extracted_file" ]]; then + print_error "Extraction output not found" + return 1 + fi + + # Generate QuickFile-ready JSON + print_info "Step 2/3: Preparing QuickFile purchase invoice..." + local qf_file="${WORKSPACE_DIR}/${basename}-quickfile.json" + + python3 -c " import json import sys from datetime import datetime @@ -795,384 +795,405 @@ with open('${qf_file}', 'w') as f: print(json.dumps(qf_data, indent=2)) " 2>/dev/null || { - print_error "QuickFile data preparation failed" - return 1 - } - - print_success "QuickFile-ready data saved to: ${qf_file}" - echo "" - - # Step 3: Generate MCP recording instructions via quickfile-helper.sh - local qf_helper="${SCRIPT_DIR}/quickfile-helper.sh" - if [[ -x "$qf_helper" ]]; then - local record_cmd="record-purchase" - if [[ "$doc_type" == "receipt" ]]; then - record_cmd="record-expense" - fi - print_info "Step 3/3: Generating QuickFile MCP recording instructions..." - "$qf_helper" "$record_cmd" "$qf_file" --nominal "$nominal_code" --auto-supplier || { - print_warning "quickfile-helper.sh failed, showing manual instructions" - echo "" - echo " Prompt the AI with:" - echo " \"Read ${qf_file} and use quickfile_supplier_search to find or" - echo " quickfile_supplier_create to create the supplier, then use" - echo " quickfile_purchase_create to record this purchase invoice.\"" - } - else - print_info "Step 3/3: To create the purchase invoice in QuickFile, use the AI assistant:" - echo "" - echo " Prompt the AI with:" - echo " \"Read ${qf_file} and use quickfile_supplier_search to find or" - echo " quickfile_supplier_create to create the supplier, then use" - echo " quickfile_purchase_create to record this purchase invoice.\"" - echo "" - print_info "Or install quickfile-helper.sh for automated MCP instructions." - fi - return 0 + print_error "QuickFile data preparation failed" + return 1 + } + + print_success "QuickFile-ready data saved to: ${qf_file}" + echo "" + + # Step 3: Generate MCP recording instructions via quickfile-helper.sh + local qf_helper="${SCRIPT_DIR}/quickfile-helper.sh" + if [[ -x "$qf_helper" ]]; then + local record_cmd="record-purchase" + if [[ "$doc_type" == "receipt" ]]; then + record_cmd="record-expense" + fi + print_info "Step 3/3: Generating QuickFile MCP recording instructions..." + "$qf_helper" "$record_cmd" "$qf_file" --nominal "$nominal_code" --auto-supplier || { + print_warning "quickfile-helper.sh failed, showing manual instructions" + echo "" + echo " Prompt the AI with:" + echo " \"Read ${qf_file} and use quickfile_supplier_search to find or" + echo " quickfile_supplier_create to create the supplier, then use" + echo " quickfile_purchase_create to record this purchase invoice.\"" + } + else + print_info "Step 3/3: To create the purchase invoice in QuickFile, use the AI assistant:" + echo "" + echo " Prompt the AI with:" + echo " \"Read ${qf_file} and use quickfile_supplier_search to find or" + echo " quickfile_supplier_create to create the supplier, then use" + echo " quickfile_purchase_create to record this purchase invoice.\"" + echo "" + print_info "Or install quickfile-helper.sh for automated MCP instructions." + fi + return 0 } # Check component status cmd_status() { - echo "OCR Receipt Pipeline - Component Status" - echo "========================================" - echo "" - - # Ollama - echo "OCR Engine:" - if command -v ollama &>/dev/null; then - echo " ollama: installed" - if ollama list 2>/dev/null | grep -q "${OCR_MODEL}"; then - echo " glm-ocr model: available" - else - echo " glm-ocr model: not pulled (run: ollama pull glm-ocr)" - fi - if ollama list 2>/dev/null | grep -q "llama3"; then - echo " llama3.2: available (for structured extraction)" - else - echo " llama3.2: not pulled (run: ollama pull llama3.2)" - fi - else - echo " ollama: not installed (run: brew install ollama)" - fi - - # ImageMagick (for PDF) - echo "" - echo "PDF Support:" - if command -v magick &>/dev/null; then - echo " imagemagick: installed (v7)" - elif command -v convert &>/dev/null; then - echo " imagemagick: installed (v6)" - else - echo " imagemagick: not installed (run: brew install imagemagick)" - fi - - # Validation pipeline - echo "" - echo "Validation Pipeline:" - if [[ -f "$PIPELINE_PY" ]]; then - echo " extraction_pipeline.py: available" - else - echo " extraction_pipeline.py: not found" - fi - if python3 -c "import pydantic" 2>/dev/null; then - echo " pydantic: installed" - else - echo " pydantic: not installed (run: pip install pydantic>=2.0)" - fi - - # Document extraction venv - echo "" - echo "Structured Extraction:" - if [[ -d "${VENV_DIR}/bin" ]]; then - echo " python venv: ${VENV_DIR}" - if "${VENV_DIR}/bin/python3" -c "import docling" 2>/dev/null; then - echo " docling: installed" - else - echo " docling: not installed" - fi - if "${VENV_DIR}/bin/python3" -c "import extract_thinker" 2>/dev/null; then - echo " extract-thinker: installed" - else - echo " extract-thinker: not installed" - fi - else - echo " python venv: not created" - echo " (run: document-extraction-helper.sh install --core)" - fi - - # QuickFile MCP - echo "" - echo "QuickFile Integration:" - if [[ -f "${HOME}/Git/quickfile-mcp/dist/index.js" ]]; then - echo " quickfile-mcp: installed" - else - echo " quickfile-mcp: not found (optional - for purchase invoice creation)" - fi - if [[ -f "${HOME}/.config/.quickfile-mcp/credentials.json" ]]; then - echo " credentials: configured" - else - echo " credentials: not configured" - fi - - # Workspace - echo "" - echo "Workspace:" - echo " output dir: ${WORKSPACE_DIR}" - if [[ -d "$WORKSPACE_DIR" ]]; then - local file_count - file_count="$(find "$WORKSPACE_DIR" -type f 2>/dev/null | wc -l | tr -d ' ')" - echo " files: ${file_count}" - else - echo " files: (not created yet)" - fi - - return 0 + echo "OCR Receipt Pipeline - Component Status" + echo "========================================" + echo "" + + # Ollama + echo "OCR Engine:" + if command -v ollama &>/dev/null; then + echo " ollama: installed" + if ollama list 2>/dev/null | grep -q "${OCR_MODEL}"; then + echo " glm-ocr model: available" + else + echo " glm-ocr model: not pulled (run: ollama pull glm-ocr)" + fi + if ollama list 2>/dev/null | grep -q "llama3"; then + echo " llama3.2: available (for structured extraction)" + else + echo " llama3.2: not pulled (run: ollama pull llama3.2)" + fi + else + echo " ollama: not installed (run: brew install ollama)" + fi + + # ImageMagick (for PDF) + echo "" + echo "PDF Support:" + if command -v magick &>/dev/null; then + echo " imagemagick: installed (v7)" + elif command -v convert &>/dev/null; then + echo " imagemagick: installed (v6)" + else + echo " imagemagick: not installed (run: brew install imagemagick)" + fi + + # Validation pipeline + echo "" + echo "Validation Pipeline:" + if [[ -f "$PIPELINE_PY" ]]; then + echo " extraction_pipeline.py: available" + else + echo " extraction_pipeline.py: not found" + fi + if python3 -c "import pydantic" 2>/dev/null; then + echo " pydantic: installed" + else + echo " pydantic: not installed (run: pip install pydantic>=2.0)" + fi + + # Document extraction venv + echo "" + echo "Structured Extraction:" + if [[ -d "${VENV_DIR}/bin" ]]; then + echo " python venv: ${VENV_DIR}" + if "${VENV_DIR}/bin/python3" -c "import docling" 2>/dev/null; then + echo " docling: installed" + else + echo " docling: not installed" + fi + if "${VENV_DIR}/bin/python3" -c "import extract_thinker" 2>/dev/null; then + echo " extract-thinker: installed" + else + echo " extract-thinker: not installed" + fi + else + echo " python venv: not created" + echo " (run: document-extraction-helper.sh install --core)" + fi + + # QuickFile MCP + echo "" + echo "QuickFile Integration:" + if [[ -f "${HOME}/Git/quickfile-mcp/dist/index.js" ]]; then + echo " quickfile-mcp: installed" + else + echo " quickfile-mcp: not found (optional - for purchase invoice creation)" + fi + if [[ -f "${HOME}/.config/.quickfile-mcp/credentials.json" ]]; then + echo " credentials: configured" + else + echo " credentials: not configured" + fi + + # Workspace + echo "" + echo "Workspace:" + echo " output dir: ${WORKSPACE_DIR}" + if [[ -d "$WORKSPACE_DIR" ]]; then + local file_count + file_count="$(find "$WORKSPACE_DIR" -type f 2>/dev/null | wc -l | tr -d ' ')" + echo " files: ${file_count}" + else + echo " files: (not created yet)" + fi + + return 0 } # Install OCR dependencies cmd_install() { - print_info "Installing OCR receipt pipeline dependencies..." - echo "" - - # Ollama - if command -v ollama &>/dev/null; then - print_success "Ollama already installed" - else - print_info "Installing Ollama..." - if command -v brew &>/dev/null; then - brew install ollama || { - print_error "Ollama installation failed" - return 1 - } - else - print_error "Homebrew not found. Install Ollama manually: https://ollama.com/" - return 1 - fi - fi - - # GLM-OCR model - if ollama list 2>/dev/null | grep -q "${OCR_MODEL}"; then - print_success "GLM-OCR model already available" - else - print_info "Pulling GLM-OCR model (~2GB)..." - ollama pull "$OCR_MODEL" || { - print_error "Failed to pull GLM-OCR model" - return 1 - } - print_success "GLM-OCR model installed" - fi - - # llama3.2 for structured extraction - if ollama list 2>/dev/null | grep -q "llama3"; then - print_success "llama3.2 model already available" - else - print_info "Pulling llama3.2 model (for structured extraction)..." - ollama pull llama3.2 || { - print_warning "Failed to pull llama3.2. Structured extraction may be limited." - } - fi - - # ImageMagick for PDF support - if command -v magick &>/dev/null || command -v convert &>/dev/null; then - print_success "ImageMagick already installed" - else - print_info "Installing ImageMagick (for PDF support)..." - if command -v brew &>/dev/null; then - brew install imagemagick || print_warning "ImageMagick installation failed. PDF OCR will not work." - else - print_warning "Install ImageMagick manually for PDF support" - fi - fi - - # Document extraction (optional, handled by document-extraction-helper.sh) - echo "" - print_info "For structured extraction with Pydantic schemas, also run:" - echo " document-extraction-helper.sh install --core" - echo "" - print_success "OCR receipt pipeline installation complete" - return 0 + print_info "Installing OCR receipt pipeline dependencies..." + echo "" + + # Ollama + if command -v ollama &>/dev/null; then + print_success "Ollama already installed" + else + print_info "Installing Ollama..." + if command -v brew &>/dev/null; then + brew install ollama || { + print_error "Ollama installation failed" + return 1 + } + else + print_error "Homebrew not found. Install Ollama manually: https://ollama.com/" + return 1 + fi + fi + + # GLM-OCR model + if ollama list 2>/dev/null | grep -q "${OCR_MODEL}"; then + print_success "GLM-OCR model already available" + else + print_info "Pulling GLM-OCR model (~2GB)..." + ollama pull "$OCR_MODEL" || { + print_error "Failed to pull GLM-OCR model" + return 1 + } + print_success "GLM-OCR model installed" + fi + + # llama3.2 for structured extraction + if ollama list 2>/dev/null | grep -q "llama3"; then + print_success "llama3.2 model already available" + else + print_info "Pulling llama3.2 model (for structured extraction)..." + ollama pull llama3.2 || { + print_warning "Failed to pull llama3.2. Structured extraction may be limited." + } + fi + + # ImageMagick for PDF support + if command -v magick &>/dev/null || command -v convert &>/dev/null; then + print_success "ImageMagick already installed" + else + print_info "Installing ImageMagick (for PDF support)..." + if command -v brew &>/dev/null; then + brew install imagemagick || print_warning "ImageMagick installation failed. PDF OCR will not work." + else + print_warning "Install ImageMagick manually for PDF support" + fi + fi + + # Document extraction (optional, handled by document-extraction-helper.sh) + echo "" + print_info "For structured extraction with Pydantic schemas, also run:" + echo " document-extraction-helper.sh install --core" + echo "" + print_success "OCR receipt pipeline installation complete" + return 0 } # Show help cmd_help() { - echo "OCR Receipt/Invoice Extraction Helper - AI DevOps Framework" - echo "" - echo "${HELP_LABEL_USAGE}" - echo " ocr-receipt-helper.sh [options]" - echo "" - echo "${HELP_LABEL_COMMANDS}" - echo " scan Quick OCR text extraction (GLM-OCR)" - echo " extract Structured extraction with validation pipeline" - echo " validate Validate extracted JSON (VAT, dates, confidence)" - echo " batch Batch process directory of receipts/invoices" - echo " quickfile Extract and prepare QuickFile purchase invoice" - echo " preview Dry run - show what would be sent to QuickFile" - echo " status Check installed components" - echo " install Install OCR dependencies" - echo " help Show this help" - echo "" - echo "${HELP_LABEL_OPTIONS}" - echo " --type Document type (default: auto-detect)" - echo " --privacy local, edge, cloud, none (default: local)" - echo " --output json, text, markdown (default: json)" - echo " --supplier Override supplier name for QuickFile" - echo " --nominal QuickFile nominal code (default: 7901)" - echo " --currency Currency code (default: GBP)" - echo " --vat-rate VAT rate percentage (default: 20)" - echo "" - echo "Pipeline:" - echo " 1. scan - Raw OCR text extraction (GLM-OCR via Ollama, local)" - echo " 2. extract - Structured extraction + validation (auto-detect type)" - echo " 3. validate - VAT arithmetic, date checks, confidence scoring" - echo " 4. preview - Show QuickFile purchase invoice preview (dry run)" - echo " 5. quickfile - Generate QuickFile-ready JSON + MCP recording instructions" - echo "" - echo " For recording in QuickFile, also see: quickfile-helper.sh" - echo "" - echo "${HELP_LABEL_EXAMPLES}" - echo " ocr-receipt-helper.sh scan receipt.jpg" - echo " ocr-receipt-helper.sh extract invoice.pdf --type invoice --privacy local" - echo " ocr-receipt-helper.sh batch ~/Documents/receipts/" - echo " ocr-receipt-helper.sh preview receipt.png --supplier 'Amazon UK'" - echo " ocr-receipt-helper.sh quickfile invoice.pdf --nominal 7502 --currency GBP" - echo " ocr-receipt-helper.sh status" - echo " ocr-receipt-helper.sh install" - echo "" - echo "Related:" - echo " document-extraction-helper.sh - General document extraction" - echo " tools/accounts/receipt-ocr.md - Subagent documentation" - echo " tools/ocr/glm-ocr.md - GLM-OCR model reference" - echo " services/accounting/quickfile.md - QuickFile MCP integration" - return 0 + echo "OCR Receipt/Invoice Extraction Helper - AI DevOps Framework" + echo "" + echo "${HELP_LABEL_USAGE}" + echo " ocr-receipt-helper.sh [options]" + echo "" + echo "${HELP_LABEL_COMMANDS}" + echo " scan Quick OCR text extraction (GLM-OCR)" + echo " extract Structured extraction with validation pipeline" + echo " validate Validate extracted JSON (VAT, dates, confidence)" + echo " batch Batch process directory of receipts/invoices" + echo " quickfile Extract and prepare QuickFile purchase invoice" + echo " preview Dry run - show what would be sent to QuickFile" + echo " status Check installed components" + echo " install Install OCR dependencies" + echo " help Show this help" + echo "" + echo "${HELP_LABEL_OPTIONS}" + echo " --type Document type (default: auto-detect)" + echo " --privacy local, edge, cloud, none (default: local)" + echo " --output json, text, markdown (default: json)" + echo " --supplier Override supplier name for QuickFile" + echo " --nominal QuickFile nominal code (default: 7901)" + echo " --currency Currency code (default: GBP)" + echo " --vat-rate VAT rate percentage (default: 20)" + echo "" + echo "Pipeline:" + echo " 1. scan - Raw OCR text extraction (GLM-OCR via Ollama, local)" + echo " 2. extract - Structured extraction + validation (auto-detect type)" + echo " 3. validate - VAT arithmetic, date checks, confidence scoring" + echo " 4. preview - Show QuickFile purchase invoice preview (dry run)" + echo " 5. quickfile - Generate QuickFile-ready JSON + MCP recording instructions" + echo "" + echo " For recording in QuickFile, also see: quickfile-helper.sh" + echo "" + echo "${HELP_LABEL_EXAMPLES}" + echo " ocr-receipt-helper.sh scan receipt.jpg" + echo " ocr-receipt-helper.sh extract invoice.pdf --type invoice --privacy local" + echo " ocr-receipt-helper.sh batch ~/Documents/receipts/" + echo " ocr-receipt-helper.sh preview receipt.png --supplier 'Amazon UK'" + echo " ocr-receipt-helper.sh quickfile invoice.pdf --nominal 7502 --currency GBP" + echo " ocr-receipt-helper.sh status" + echo " ocr-receipt-helper.sh install" + echo "" + echo "Related:" + echo " document-extraction-helper.sh - General document extraction" + echo " tools/accounts/receipt-ocr.md - Subagent documentation" + echo " tools/ocr/glm-ocr.md - GLM-OCR model reference" + echo " services/accounting/quickfile.md - QuickFile MCP integration" + return 0 } # Parse command-line arguments parse_args() { - local command="${1:-help}" - shift || true - - # Parse named options - local file="" - local doc_type="auto" - local privacy="local" - local output_format="json" - local supplier="" - local nominal_code="${DEFAULT_NOMINAL_CODE}" - local currency="${DEFAULT_CURRENCY}" - local vat_rate="${DEFAULT_VAT_RATE}" - - # First positional arg after command is the file/dir - if [[ $# -gt 0 ]] && [[ ! "$1" =~ ^-- ]]; then - file="$1" - shift || true - fi - - while [[ $# -gt 0 ]]; do - case "$1" in - --type) - doc_type="${2:-auto}" - shift 2 || { print_error "Missing value for --type"; return 1; } - ;; - --privacy) - privacy="${2:-local}" - shift 2 || { print_error "Missing value for --privacy"; return 1; } - ;; - --output) - output_format="${2:-json}" - shift 2 || { print_error "Missing value for --output"; return 1; } - ;; - --supplier) - supplier="${2:-}" - shift 2 || { print_error "Missing value for --supplier"; return 1; } - ;; - --nominal) - nominal_code="${2:-${DEFAULT_NOMINAL_CODE}}" - shift 2 || { print_error "Missing value for --nominal"; return 1; } - ;; - --currency) - currency="${2:-${DEFAULT_CURRENCY}}" - shift 2 || { print_error "Missing value for --currency"; return 1; } - ;; - --vat-rate) - vat_rate="${2:-${DEFAULT_VAT_RATE}}" - shift 2 || { print_error "Missing value for --vat-rate"; return 1; } - ;; - *) - print_warning "Unknown option: $1" - shift - ;; - esac - done - - case "$command" in - scan) - if [[ -z "$file" ]]; then - print_error "${ERROR_INPUT_FILE_REQUIRED}" - return 1 - fi - cmd_scan "$file" "$output_format" - ;; - extract) - if [[ -z "$file" ]]; then - print_error "${ERROR_INPUT_FILE_REQUIRED}" - return 1 - fi - cmd_extract "$file" "$doc_type" "$privacy" "$output_format" - ;; - validate) - if [[ -z "$file" ]]; then - print_error "${ERROR_INPUT_FILE_REQUIRED}" - return 1 - fi - if [[ -f "$PIPELINE_PY" ]]; then - local pipeline_type="auto" - if [[ "$doc_type" == "invoice" ]]; then - pipeline_type="purchase_invoice" - elif [[ "$doc_type" == "receipt" ]]; then - pipeline_type="expense_receipt" - fi - python3 "$PIPELINE_PY" validate "$file" --type "$pipeline_type" - else - print_error "Validation pipeline not found: ${PIPELINE_PY}" - return 1 - fi - ;; - batch) - if [[ -z "$file" ]]; then - print_error "Input directory is required" - return 1 - fi - cmd_batch "$file" "$doc_type" "$privacy" - ;; - quickfile|qf) - if [[ -z "$file" ]]; then - print_error "${ERROR_INPUT_FILE_REQUIRED}" - return 1 - fi - cmd_quickfile "$file" "$doc_type" "$privacy" "$supplier" "$nominal_code" "$currency" "$vat_rate" - ;; - preview) - if [[ -z "$file" ]]; then - print_error "${ERROR_INPUT_FILE_REQUIRED}" - return 1 - fi - cmd_preview "$file" "$doc_type" "$privacy" "$supplier" "$nominal_code" "$currency" "$vat_rate" - ;; - status) - cmd_status - ;; - install) - cmd_install - ;; - help|--help|-h) - cmd_help - ;; - *) - print_error "${ERROR_UNKNOWN_COMMAND}: ${command}" - cmd_help - return 1 - ;; - esac + local command="${1:-help}" + shift || true + + # Parse named options + local file="" + local doc_type="auto" + local privacy="local" + local output_format="json" + local supplier="" + local nominal_code="${DEFAULT_NOMINAL_CODE}" + local currency="${DEFAULT_CURRENCY}" + local vat_rate="${DEFAULT_VAT_RATE}" + + # First positional arg after command is the file/dir + if [[ $# -gt 0 ]] && [[ ! "$1" =~ ^-- ]]; then + file="$1" + shift || true + fi + + while [[ $# -gt 0 ]]; do + case "$1" in + --type) + doc_type="${2:-auto}" + shift 2 || { + print_error "Missing value for --type" + return 1 + } + ;; + --privacy) + privacy="${2:-local}" + shift 2 || { + print_error "Missing value for --privacy" + return 1 + } + ;; + --output) + output_format="${2:-json}" + shift 2 || { + print_error "Missing value for --output" + return 1 + } + ;; + --supplier) + supplier="${2:-}" + shift 2 || { + print_error "Missing value for --supplier" + return 1 + } + ;; + --nominal) + nominal_code="${2:-${DEFAULT_NOMINAL_CODE}}" + shift 2 || { + print_error "Missing value for --nominal" + return 1 + } + ;; + --currency) + currency="${2:-${DEFAULT_CURRENCY}}" + shift 2 || { + print_error "Missing value for --currency" + return 1 + } + ;; + --vat-rate) + vat_rate="${2:-${DEFAULT_VAT_RATE}}" + shift 2 || { + print_error "Missing value for --vat-rate" + return 1 + } + ;; + *) + print_warning "Unknown option: $1" + shift + ;; + esac + done + + case "$command" in + scan) + if [[ -z "$file" ]]; then + print_error "${ERROR_INPUT_FILE_REQUIRED}" + return 1 + fi + cmd_scan "$file" "$output_format" + ;; + extract) + if [[ -z "$file" ]]; then + print_error "${ERROR_INPUT_FILE_REQUIRED}" + return 1 + fi + cmd_extract "$file" "$doc_type" "$privacy" "$output_format" + ;; + validate) + if [[ -z "$file" ]]; then + print_error "${ERROR_INPUT_FILE_REQUIRED}" + return 1 + fi + if [[ -f "$PIPELINE_PY" ]]; then + local pipeline_type="auto" + if [[ "$doc_type" == "invoice" ]]; then + pipeline_type="purchase_invoice" + elif [[ "$doc_type" == "receipt" ]]; then + pipeline_type="expense_receipt" + fi + python3 "$PIPELINE_PY" validate "$file" --type "$pipeline_type" + else + print_error "Validation pipeline not found: ${PIPELINE_PY}" + return 1 + fi + ;; + batch) + if [[ -z "$file" ]]; then + print_error "Input directory is required" + return 1 + fi + cmd_batch "$file" "$doc_type" "$privacy" + ;; + quickfile | qf) + if [[ -z "$file" ]]; then + print_error "${ERROR_INPUT_FILE_REQUIRED}" + return 1 + fi + cmd_quickfile "$file" "$doc_type" "$privacy" "$supplier" "$nominal_code" "$currency" "$vat_rate" + ;; + preview) + if [[ -z "$file" ]]; then + print_error "${ERROR_INPUT_FILE_REQUIRED}" + return 1 + fi + cmd_preview "$file" "$doc_type" "$privacy" "$supplier" "$nominal_code" "$currency" "$vat_rate" + ;; + status) + cmd_status + ;; + install) + cmd_install + ;; + help | --help | -h) + cmd_help + ;; + *) + print_error "${ERROR_UNKNOWN_COMMAND}: ${command}" + cmd_help + return 1 + ;; + esac } # Main entry point diff --git a/.agents/scripts/package.json b/.agents/scripts/package.json index 40d91c2f3..3673d9d1c 100644 --- a/.agents/scripts/package.json +++ b/.agents/scripts/package.json @@ -2,7 +2,7 @@ "name": "scripts", "version": "1.0.0", "description": "", - "main": "ahrefs-mcp-wrapper.js", + "main": "wappalyzer-detect.mjs", "directories": { "test": "tests" }, diff --git a/.agents/scripts/real-video-enhancer-helper.sh b/.agents/scripts/real-video-enhancer-helper.sh index 95936b7d4..28a042b9b 100755 --- a/.agents/scripts/real-video-enhancer-helper.sh +++ b/.agents/scripts/real-video-enhancer-helper.sh @@ -656,6 +656,7 @@ cmd_enhance() { # Create temporary files for pipeline local temp_dir temp_dir=$(mktemp -d) + trap 'rm -rf -- "$temp_dir"' RETURN local temp_upscaled="${temp_dir}/upscaled.mp4" local temp_interpolated="${temp_dir}/interpolated.mp4" diff --git a/.agents/scripts/session-miner-pulse.sh b/.agents/scripts/session-miner-pulse.sh index 955cc6b5b..7ca229844 100755 --- a/.agents/scripts/session-miner-pulse.sh +++ b/.agents/scripts/session-miner-pulse.sh @@ -19,7 +19,9 @@ set -euo pipefail # --- Configuration --- -SCRIPT_DIR="$(cd "${BASH_SOURCE[0]%/*}" && pwd)" +_smp_dir="${BASH_SOURCE[0]%/*}" +[[ "$_smp_dir" == "${BASH_SOURCE[0]}" ]] && _smp_dir="." +SCRIPT_DIR="$(cd "$_smp_dir" && pwd)" MINER_DIR="${HOME}/.aidevops/.agent-workspace/work/session-miner" # Shipped with aidevops; copied to workspace on first run EXTRACTOR_SRC="${SCRIPT_DIR}/session-miner/extract.py" diff --git a/.agents/scripts/system-cleanup.sh b/.agents/scripts/system-cleanup.sh index 5b40c8587..3b1094ed4 100755 --- a/.agents/scripts/system-cleanup.sh +++ b/.agents/scripts/system-cleanup.sh @@ -2,7 +2,7 @@ # shellcheck disable=SC2034,SC2317 # System Cleanup & Maintenance Script -# +# # Performs garbage collection, removes cruft, and maintains system hygiene # for the AI DevOps Framework. Includes lock file protection and 90-day logging. # @@ -25,7 +25,7 @@ readonly LOG_DIR="$HOME/.agents/logs" readonly LOG_FILE="${LOG_DIR}/operations.log" readonly LOCK_FILE="/tmp/aidevops-${SCRIPT_NAME}.lock" readonly TMP_DIR="$HOME/.agents/tmp" -readonly AGENT_DIR="$HOME/.agent" +readonly AGENT_DIR="$HOME/.agents" readonly PROJECT_DIR="$HOME/git/aidevops" readonly RETENTION_DAYS_LOGS=90 readonly RETENTION_DAYS_TMP=7 @@ -38,80 +38,80 @@ DRY_RUN=true # ----------------------------------------------------------------------------- setup_logging() { - # Create log directory if it doesn't exist - if [[ ! -d "$LOG_DIR" ]]; then - mkdir -p "$LOG_DIR" - fi - return 0 + # Create log directory if it doesn't exist + if [[ ! -d "$LOG_DIR" ]]; then + mkdir -p "$LOG_DIR" + fi + return 0 } log() { - local level="$1" - local message="$2" - local timestamp - timestamp=$(date "+%Y-%m-%dT%H:%M:%S%z") - - # Console output - local color="$NC" - case "$level" in - "INFO") color="$GREEN" ;; - "WARN") color="$YELLOW" ;; - "ERROR") color="$RED" ;; - "DEBUG") color="$BLUE" ;; - *) color="$NC" ;; - esac - - echo -e "${color}[${level}] ${message}${NC}" - - # File output (append) - if [[ -d "$LOG_DIR" ]]; then - echo "${timestamp} [${level}] ${message}" >> "$LOG_FILE" - fi - - return 0 + local level="$1" + local message="$2" + local timestamp + timestamp=$(date "+%Y-%m-%dT%H:%M:%S%z") + + # Console output + local color="$NC" + case "$level" in + "INFO") color="$GREEN" ;; + "WARN") color="$YELLOW" ;; + "ERROR") color="$RED" ;; + "DEBUG") color="$BLUE" ;; + *) color="$NC" ;; + esac + + echo -e "${color}[${level}] ${message}${NC}" + + # File output (append) + if [[ -d "$LOG_DIR" ]]; then + echo "${timestamp} [${level}] ${message}" >>"$LOG_FILE" + fi + + return 0 } rotate_logs() { - log "INFO" "Checking log retention policy (${RETENTION_DAYS_LOGS} days)..." - - if [[ ! -f "$LOG_FILE" ]]; then - return 0 - fi - - # Use a temporary file to filter logs - local temp_log="${LOG_FILE}.tmp" - local cutoff_date - - # Calculate cutoff date timestamp for comparison (cross-platform compatible approximation) - # Note: Precise date math in bash across OS versions is tricky. - # Here we'll rely on finding lines that don't match old dates if possible, - # or simply use find to remove archived log files if we were rotating files. - # Since we are appending to a single file, we'll inspect the file content. - - # For this implementation, we will archive the log file if it gets too large - # or just rely on the user to not have massive logs. - # A simpler robust approach for a single file is difficult without external tools. - # Let's stick to the requirement: "keeps 90 days of records". - - # We will use a simple grep strategy assuming ISO dates: YYYY-MM-DD - # Current date minus 90 days - if date -v -90d > /dev/null 2>&1; then - # BSD/macOS date - cutoff_date=$(date -v -${RETENTION_DAYS_LOGS}d +%Y-%m-%d) - else - # GNU date - cutoff_date=$(date -d "${RETENTION_DAYS_LOGS} days ago" +%Y-%m-%d) - fi - - log "DEBUG" "Pruning logs older than $cutoff_date" - - # Filter the log file: Keep lines where date >= cutoff_date - # This is a string comparison which works for ISO 8601 dates - awk -v cutoff="$cutoff_date" '$1 >= cutoff' "$LOG_FILE" > "$temp_log" - - mv "$temp_log" "$LOG_FILE" - - return 0 + log "INFO" "Checking log retention policy (${RETENTION_DAYS_LOGS} days)..." + + if [[ ! -f "$LOG_FILE" ]]; then + return 0 + fi + + # Use a temporary file to filter logs + local temp_log="${LOG_FILE}.tmp" + local cutoff_date + + # Calculate cutoff date timestamp for comparison (cross-platform compatible approximation) + # Note: Precise date math in bash across OS versions is tricky. + # Here we'll rely on finding lines that don't match old dates if possible, + # or simply use find to remove archived log files if we were rotating files. + # Since we are appending to a single file, we'll inspect the file content. + + # For this implementation, we will archive the log file if it gets too large + # or just rely on the user to not have massive logs. + # A simpler robust approach for a single file is difficult without external tools. + # Let's stick to the requirement: "keeps 90 days of records". + + # We will use a simple grep strategy assuming ISO dates: YYYY-MM-DD + # Current date minus 90 days + if date -v -90d >/dev/null 2>&1; then + # BSD/macOS date + cutoff_date=$(date -v -${RETENTION_DAYS_LOGS}d +%Y-%m-%d) + else + # GNU date + cutoff_date=$(date -d "${RETENTION_DAYS_LOGS} days ago" +%Y-%m-%d) + fi + + log "DEBUG" "Pruning logs older than $cutoff_date" + + # Filter the log file: Keep lines where date >= cutoff_date + # This is a string comparison which works for ISO 8601 dates + awk -v cutoff="$cutoff_date" '$1 >= cutoff' "$LOG_FILE" >"$temp_log" + + mv "$temp_log" "$LOG_FILE" + + return 0 } # ----------------------------------------------------------------------------- @@ -119,40 +119,40 @@ rotate_logs() { # ----------------------------------------------------------------------------- acquire_lock() { - if [[ -f "$LOCK_FILE" ]]; then - # Check if process is still running - local pid - pid=$(cat "$LOCK_FILE") - if ps -p "$pid" > /dev/null 2>&1; then - log "ERROR" "Script is already running (PID: $pid). Lock file exists at $LOCK_FILE" - return 1 - else - log "WARN" "Found stale lock file from PID $pid. Removing..." - rm -f "$LOCK_FILE" - fi - fi - - echo $$ > "$LOCK_FILE" - return 0 + if [[ -f "$LOCK_FILE" ]]; then + # Check if process is still running + local pid + pid=$(cat "$LOCK_FILE") + if ps -p "$pid" >/dev/null 2>&1; then + log "ERROR" "Script is already running (PID: $pid). Lock file exists at $LOCK_FILE" + return 1 + else + log "WARN" "Found stale lock file from PID $pid. Removing..." + rm -f "$LOCK_FILE" + fi + fi + + echo $$ >"$LOCK_FILE" + return 0 } release_lock() { - if [[ -f "$LOCK_FILE" ]]; then - rm -f "$LOCK_FILE" - fi - return 0 + if [[ -f "$LOCK_FILE" ]]; then + rm -f "$LOCK_FILE" + fi + return 0 } cleanup_exit() { - local exit_code=$? - release_lock - if [[ $exit_code -eq 0 ]]; then - log "INFO" "Cleanup completed successfully" - else - log "ERROR" "Cleanup finished with error (Code: $exit_code)" - fi - exit "$exit_code" - return 0 + local exit_code=$? + release_lock + if [[ $exit_code -eq 0 ]]; then + log "INFO" "Cleanup completed successfully" + else + log "ERROR" "Cleanup finished with error (Code: $exit_code)" + fi + exit "$exit_code" + return 0 } # ----------------------------------------------------------------------------- @@ -160,105 +160,105 @@ cleanup_exit() { # ----------------------------------------------------------------------------- cleanup_directory() { - local dir="$1" - local pattern="$2" - local days="${3:-0}" # 0 means ignore age - local desc="$4" - - if [[ ! -d "$dir" ]]; then - log "DEBUG" "Directory not found, skipping: $dir" - return 0 - fi - - log "INFO" "Scanning $desc ($dir)..." - - # Build find arguments as an array (safe, no eval needed) - local find_args=("$dir" "-name" "$pattern") - - # Add depth limit to avoid scanning entire system if path is wrong - find_args+=("-maxdepth" "4") - - # Add age filter if specified - if [[ "$days" -gt 0 ]]; then - find_args+=("-mtime" "+${days}") - fi - - # Exclude common directories - find_args+=("-not" "-path" "*/.git/*" "-not" "-path" "*/node_modules/*") - - # Execute find with safe array expansion - local files_found - files_found=$(find "${find_args[@]}" 2>/dev/null || echo "") - - if [[ -z "$files_found" ]]; then - log "DEBUG" "No matching files found for $desc" - return 0 - fi - - # Process matches - local count=0 - while IFS= read -r file; do - if [[ -z "$file" ]]; then continue; fi - - if [[ "$DRY_RUN" == "true" ]]; then - log "INFO" "[DRY-RUN] Would delete: $file" - else - if rm -f "$file"; then - log "INFO" "Deleted: $file" - else - log "ERROR" "Failed to delete: $file" - fi - fi - count=$((count + 1)) - done <<< "$files_found" - - if [[ "$count" -gt 0 ]]; then - log "INFO" "Processed $count files in $desc" - fi - - return 0 + local dir="$1" + local pattern="$2" + local days="${3:-0}" # 0 means ignore age + local desc="$4" + + if [[ ! -d "$dir" ]]; then + log "DEBUG" "Directory not found, skipping: $dir" + return 0 + fi + + log "INFO" "Scanning $desc ($dir)..." + + # Build find arguments as an array (safe, no eval needed) + local find_args=("$dir" "-name" "$pattern") + + # Add depth limit to avoid scanning entire system if path is wrong + find_args+=("-maxdepth" "4") + + # Add age filter if specified + if [[ "$days" -gt 0 ]]; then + find_args+=("-mtime" "+${days}") + fi + + # Exclude common directories + find_args+=("-not" "-path" "*/.git/*" "-not" "-path" "*/node_modules/*") + + # Execute find with safe array expansion + local files_found + files_found=$(find "${find_args[@]}" 2>/dev/null || echo "") + + if [[ -z "$files_found" ]]; then + log "DEBUG" "No matching files found for $desc" + return 0 + fi + + # Process matches + local count=0 + while IFS= read -r file; do + if [[ -z "$file" ]]; then continue; fi + + if [[ "$DRY_RUN" == "true" ]]; then + log "INFO" "[DRY-RUN] Would delete: $file" + else + if rm -f "$file"; then + log "INFO" "Deleted: $file" + else + log "ERROR" "Failed to delete: $file" + fi + fi + count=$((count + 1)) + done <<<"$files_found" + + if [[ "$count" -gt 0 ]]; then + log "INFO" "Processed $count files in $desc" + fi + + return 0 } cleanup_tmp_dir() { - # Special handling for tmp directory to clean everything older than X days - # Not just specific patterns - - if [[ ! -d "$TMP_DIR" ]]; then return 0; fi - - log "INFO" "Cleaning temporary directory ($TMP_DIR) - items older than $RETENTION_DAYS_TMP days..." - - # Build find arguments as an array (safe, no eval needed) - # Using -mindepth 1 to not delete the dir itself - local find_args=("$TMP_DIR" "-mindepth" "1" "-mtime" "+${RETENTION_DAYS_TMP}") - - # Exclude README.md - find_args+=("-not" "-name" "README.md") - - # Execute find with safe array expansion - local items - items=$(find "${find_args[@]}" 2>/dev/null || echo "") - - if [[ -z "$items" ]]; then - log "DEBUG" "No old temporary items found" - return 0 - fi - - while IFS= read -r item; do - if [[ -z "$item" ]]; then continue; fi - - if [[ "$DRY_RUN" == "true" ]]; then - log "INFO" "[DRY-RUN] Would delete: $item" - else - # Use rm -rf for directories - if rm -rf "$item"; then - log "INFO" "Deleted: $item" - else - log "ERROR" "Failed to delete: $item" - fi - fi - done <<< "$items" - - return 0 + # Special handling for tmp directory to clean everything older than X days + # Not just specific patterns + + if [[ ! -d "$TMP_DIR" ]]; then return 0; fi + + log "INFO" "Cleaning temporary directory ($TMP_DIR) - items older than $RETENTION_DAYS_TMP days..." + + # Build find arguments as an array (safe, no eval needed) + # Using -mindepth 1 to not delete the dir itself + local find_args=("$TMP_DIR" "-mindepth" "1" "-mtime" "+${RETENTION_DAYS_TMP}") + + # Exclude README.md + find_args+=("-not" "-name" "README.md") + + # Execute find with safe array expansion + local items + items=$(find "${find_args[@]}" 2>/dev/null || echo "") + + if [[ -z "$items" ]]; then + log "DEBUG" "No old temporary items found" + return 0 + fi + + while IFS= read -r item; do + if [[ -z "$item" ]]; then continue; fi + + if [[ "$DRY_RUN" == "true" ]]; then + log "INFO" "[DRY-RUN] Would delete: $item" + else + # Use rm -rf for directories + if rm -rf "$item"; then + log "INFO" "Deleted: $item" + else + log "ERROR" "Failed to delete: $item" + fi + fi + done <<<"$items" + + return 0 } # ----------------------------------------------------------------------------- @@ -266,73 +266,73 @@ cleanup_tmp_dir() { # ----------------------------------------------------------------------------- show_help() { - echo "Usage: $0 [options]" - echo - echo "Options:" - echo " --force Execute deletions (disable dry-run)" - echo " --dry-run Simulate deletions (default)" - echo " --help Show this help message" - echo - return 0 + echo "Usage: $0 [options]" + echo + echo "Options:" + echo " --force Execute deletions (disable dry-run)" + echo " --dry-run Simulate deletions (default)" + echo " --help Show this help message" + echo + return 0 } main() { - # Parse arguments - while [[ $# -gt 0 ]]; do - case "$1" in - --force) - DRY_RUN=false - shift - ;; - --dry-run) - DRY_RUN=true - shift - ;; - --help|-h) - show_help - exit 0 - ;; - *) - log "ERROR" "Unknown option: $1" - show_help - exit 1 - ;; - esac - done - - setup_logging - - # Trap signals for cleanup - trap cleanup_exit INT TERM EXIT - - log "INFO" "Starting System Cleanup (Dry Run: $DRY_RUN)" - - if ! acquire_lock; then - exit 1 - fi - - # 1. Log Rotation - rotate_logs - - # 2. Clean Agent Directory Cruft - cleanup_directory "$AGENT_DIR" ".DS_Store" 0 "Agent Directory System Files" - cleanup_directory "$AGENT_DIR" "*.backup.*" 0 "Agent Directory Backups" - cleanup_directory "$AGENT_DIR" "*.bak" 0 "Agent Directory Bak Files" - - # 3. Clean Project Directory Cruft - cleanup_directory "$PROJECT_DIR" ".DS_Store" 0 "Project Directory System Files" - cleanup_directory "$PROJECT_DIR" "*.backup" 0 "Project Directory Backups" - cleanup_directory "$PROJECT_DIR" "*.bak" 0 "Project Directory Bak Files" - cleanup_directory "$PROJECT_DIR" "*~" 0 "Project Directory Swap Files" - - # 4. Clean Temporary Directory (Age-based) - cleanup_tmp_dir - - # 5. Clean Stale Lock Files (globally in /tmp related to this project) - # Be careful here, only target our specific locks - cleanup_directory "/tmp" "aidevops-*.lock" 1 "Stale Lock Files (>24h)" - - return 0 + # Parse arguments + while [[ $# -gt 0 ]]; do + case "$1" in + --force) + DRY_RUN=false + shift + ;; + --dry-run) + DRY_RUN=true + shift + ;; + --help | -h) + show_help + exit 0 + ;; + *) + log "ERROR" "Unknown option: $1" + show_help + exit 1 + ;; + esac + done + + setup_logging + + # Trap signals for cleanup + trap cleanup_exit INT TERM EXIT + + log "INFO" "Starting System Cleanup (Dry Run: $DRY_RUN)" + + if ! acquire_lock; then + exit 1 + fi + + # 1. Log Rotation + rotate_logs + + # 2. Clean Agent Directory Cruft + cleanup_directory "$AGENT_DIR" ".DS_Store" 0 "Agent Directory System Files" + cleanup_directory "$AGENT_DIR" "*.backup.*" 0 "Agent Directory Backups" + cleanup_directory "$AGENT_DIR" "*.bak" 0 "Agent Directory Bak Files" + + # 3. Clean Project Directory Cruft + cleanup_directory "$PROJECT_DIR" ".DS_Store" 0 "Project Directory System Files" + cleanup_directory "$PROJECT_DIR" "*.backup" 0 "Project Directory Backups" + cleanup_directory "$PROJECT_DIR" "*.bak" 0 "Project Directory Bak Files" + cleanup_directory "$PROJECT_DIR" "*~" 0 "Project Directory Swap Files" + + # 4. Clean Temporary Directory (Age-based) + cleanup_tmp_dir + + # 5. Clean Stale Lock Files (globally in /tmp related to this project) + # Be careful here, only target our specific locks + cleanup_directory "/tmp" "aidevops-*.lock" 1 "Stale Lock Files (>24h)" + + return 0 } main "$@" diff --git a/.agents/scripts/ttsr-rule-loader.sh b/.agents/scripts/ttsr-rule-loader.sh index 27df16310..d1bb14943 100755 --- a/.agents/scripts/ttsr-rule-loader.sh +++ b/.agents/scripts/ttsr-rule-loader.sh @@ -45,7 +45,7 @@ SCRIPT_NAME="ttsr-rule-loader" # Default rules directory: relative to repo root (one level up from scripts/) DEFAULT_RULES_DIR="${SCRIPT_DIR}/../rules" -DEFAULT_STATE_FILE="/tmp/ttsr-state-$$" +DEFAULT_STATE_FILE="/tmp/ttsr-state-${PPID:-$$}" # ============================================================================= # Utility Functions @@ -366,9 +366,16 @@ cmd_list() { if [[ "$format" == "json" ]]; then [[ "$first" -eq 1 ]] && first=0 || printf ',\n' - printf ' {"id":"%s","trigger":"%s","severity":"%s","repeat_policy":"%s","gap_turns":%s,"enabled":%s,"tags":"%s","file":"%s"}' \ - "$rule_id" "$rule_trigger" "$rule_severity" "$rule_repeat_policy" \ - "$rule_gap_turns" "$rule_enabled" "$rule_tags" "$rule_file" + printf ' %s' "$(jq -c -n \ + --arg id "$rule_id" \ + --arg trigger "$rule_trigger" \ + --arg severity "$rule_severity" \ + --arg repeat_policy "$rule_repeat_policy" \ + --argjson gap_turns "$rule_gap_turns" \ + --argjson enabled "$rule_enabled" \ + --arg tags "$rule_tags" \ + --arg file "$rule_file" \ + '{id: $id, trigger: $trigger, severity: $severity, repeat_policy: $repeat_policy, gap_turns: $gap_turns, enabled: $enabled, tags: $tags, file: $file}')" else # Truncate trigger for display local display_trigger="$rule_trigger" diff --git a/.agents/scripts/verify-brief.sh b/.agents/scripts/verify-brief.sh index d2dd59106..2d453ae12 100755 --- a/.agents/scripts/verify-brief.sh +++ b/.agents/scripts/verify-brief.sh @@ -78,8 +78,11 @@ parse_args() { arg="$1" case "$arg" in --repo-path) - val="$2" - REPO_PATH="$val" + if [[ $# -lt 2 || "$2" == -* ]]; then + log_fail "Missing value for --repo-path" + return 2 + fi + REPO_PATH="$2" shift 2 ;; --json) diff --git a/.agents/scripts/wappalyzer-helper.sh b/.agents/scripts/wappalyzer-helper.sh index 5762282df..574710bdb 100755 --- a/.agents/scripts/wappalyzer-helper.sh +++ b/.agents/scripts/wappalyzer-helper.sh @@ -100,7 +100,7 @@ install_deps() { wappalyzer_detect() { local url="$1" local output_file="${2:-}" - local wrapper_script="$SCRIPT_DIR/wappalyzer-detect.js" + local wrapper_script="$SCRIPT_DIR/wappalyzer-detect.mjs" if ! check_wappalyzer; then print_error "Wappalyzer not installed. Run: $0 install" @@ -112,6 +112,7 @@ wappalyzer_detect() { # Run Wappalyzer wrapper script local temp_output temp_output=$(mktemp) + trap 'rm -f -- "$temp_output"' RETURN local global_modules global_modules="$(npm root -g)" diff --git a/.agents/scripts/watercrawl-helper.sh b/.agents/scripts/watercrawl-helper.sh index ab7fddeab..2d0d4d3b4 100755 --- a/.agents/scripts/watercrawl-helper.sh +++ b/.agents/scripts/watercrawl-helper.sh @@ -39,7 +39,7 @@ set -euo pipefail # Source shared constants (provides sed_inplace and other utilities) SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" || exit -source "$SCRIPT_DIR/shared-constants.sh" 2>/dev/null || true +source "$SCRIPT_DIR/shared-constants.sh" || true # Fallback if shared-constants.sh not loaded if ! declare -f ensure_credentials_file &>/dev/null; then diff --git a/tests/test-supervisor-globals.sh b/tests/test-supervisor-globals.sh index b336ff3ed..d5a89b0cb 100755 --- a/tests/test-supervisor-globals.sh +++ b/tests/test-supervisor-globals.sh @@ -24,10 +24,12 @@ FAIL=0 pass() { PASS=$((PASS + 1)) echo " PASS: $1" + return 0 } fail() { FAIL=$((FAIL + 1)) echo " FAIL: $1" + return 0 } echo "=== Supervisor Globals Test ===" @@ -69,12 +71,14 @@ required_globals=( "VALID_TRANSITIONS" ) -all_files="$SUPERVISOR $(echo "$REPO_DIR/.agents/scripts/supervisor/"*.sh)" +all_files=("$SUPERVISOR") +for f in "$REPO_DIR/.agents/scripts/supervisor/"*.sh; do + [[ -e "$f" ]] && all_files+=("$f") +done for var in "${required_globals[@]}"; do # Check if the variable is assigned (not just referenced) in any file # Handles: VAR=, readonly VAR=, readonly -a VAR=( - # shellcheck disable=SC2086 - if grep -qE "^[[:space:]]*(readonly( -a)? )?${var}=" $all_files 2>/dev/null; then + if grep -qE "^[[:space:]]*(readonly( -a)? )?${var}=" "${all_files[@]}" 2>/dev/null; then pass "$var is defined" else fail "$var is NOT defined in any supervisor file" @@ -91,8 +95,7 @@ for var in SUPERVISOR_DIR SUPERVISOR_DB SUPERVISOR_LOG PULSE_LOCK_DIR PULSE_LOCK # Check if used in modules if grep -rq "\$${var}\b\|\${${var}}" "$module_dir/" 2>/dev/null; then # Check if defined in monolith or _common.sh - # shellcheck disable=SC2086 - if ! grep -q "^[[:space:]]*\(readonly \)\{0,1\}${var}=" $all_files 2>/dev/null; then + if ! grep -qE "^[[:space:]]*(readonly( -a)? )?${var}=" "${all_files[@]}" 2>/dev/null; then fail "$var used in modules but not defined anywhere" missing_count=$((missing_count + 1)) fi