-
Notifications
You must be signed in to change notification settings - Fork 3
Open
Description
Summary
Document comprehensive remote log analysis patterns for debugging cluster job failures.
Debug Pattern 1: Systematic Remote Log Analysis
From `check_remote_logs.py` - complete job directory inspection:
def check_remote_logs():
    """Print directory listings and log contents for recent remote jobs.

    Lists the five most recently modified entries under
    ``/tmp/clustrix_host/`` and, for each one, prints its directory
    listing, any ``*result*.pkl`` files, and the full content of every
    ``*.log``, ``*.out`` and ``*.err`` file found beneath it.

    Relies on a module-level ``ssh_client`` whose ``exec_command`` returns a
    ``(stdin, stdout, stderr)`` triple of file-like objects.

    Returns:
        None. All findings are written to standard output.
    """
    import shlex

    # Find recent job directories (newest first).
    _, stdout, _ = ssh_client.exec_command('ls -t /tmp/clustrix_host/ | head -5')
    listing = stdout.read().decode().strip()
    # "".split("\n") yields [""]; without this filter an empty or missing
    # base directory would make the loop scan /tmp/clustrix_host/ itself.
    recent_jobs = [name for name in listing.split('\n') if name.strip()]

    for job in recent_jobs:
        job_path = f"/tmp/clustrix_host/{job}"
        # Quote once so names containing spaces or shell metacharacters are
        # passed to the remote shell as a single argument.
        quoted_path = shlex.quote(job_path)

        # Check job directory contents
        _, stdout, _ = ssh_client.exec_command(f'ls -la {quoted_path}/')
        job_contents = stdout.read().decode().strip()
        print(f"Contents of {job_path}:")
        print(job_contents)

        # Look for result/error files (the glob stays outside the quotes so
        # the remote shell still expands it).
        _, stdout, _ = ssh_client.exec_command(
            f'ls -la {quoted_path}/*result*.pkl 2>/dev/null'
        )
        result_files = stdout.read().decode().strip()
        if result_files:
            print(f"Result files in {job_path}:")
            print(result_files)

        # Find and examine log files
        _, stdout, _ = ssh_client.exec_command(
            f'find {quoted_path} -name "*.log" -o -name "*.out" -o -name "*.err"'
        )
        log_files = stdout.read().decode().strip()
        for log_file in log_files.split('\n'):
            if not log_file.strip():
                continue
            _, stdout, _ = ssh_client.exec_command(f'cat {shlex.quote(log_file)}')
            log_content = stdout.read().decode().strip()
            print(f"Content of {log_file}:")
            print(log_content)
Debug Pattern 2: Virtual Environment Verification
# Locate the job's clustrix_venv2* directories and probe the first one found.
find_venv_cmd = f'find {job_path} -name "clustrix_venv2*" -type d'
stdin, stdout, stderr = ssh_client.exec_command(find_venv_cmd)
venv2_dirs = stdout.read().decode().strip()
if venv2_dirs:
    # Only the first line of the find output is used.
    venv2_dir = venv2_dirs.partition('\n')[0].strip()
    python_path = f"{venv2_dir}/bin/python"

    # Ask the venv interpreter for its version; stderr is merged into stdout
    # so a failure message ends up in python_version as well.
    version_cmd = f'{python_path} --version 2>&1'
    stdin, stdout, stderr = ssh_client.exec_command(version_cmd)
    python_version = stdout.read().decode().strip()

    # Try importing torch inside the venv; an ImportError traceback is
    # captured in torch_check for the same reason.
    torch_cmd = f'{python_path} -c "import torch; print(torch.__version__)" 2>&1'
    stdin, stdout, stderr = ssh_client.exec_command(torch_cmd)
    torch_check = stdout.read().decode().strip()
Debug Pattern 3: SLURM Job Result Analysis
From `check_slurm_results.py` - structured result file examination:
import shlex

# Look for recent result files
stdin, stdout, stderr = ssh_client.exec_command(f"find {result_dir} -name 'result_*_job*.json' | sort")
# Drop blank entries: when nothing matches, the output is "" and
# "".split("\n") is [""], which would run `cat` with no file argument
# (reading stdin instead of a result file).
recent_files = [
    path for path in stdout.read().decode().strip().split('\n') if path.strip()
]
for result_file in recent_files:
    # Quote the path so file names with spaces or shell metacharacters are
    # passed to the remote shell as one argument.
    stdin, stdout, stderr = ssh_client.exec_command(f"cat {shlex.quote(result_file)}")
    content = stdout.read().decode()
    try:
        result_data = json.loads(content)
    except json.JSONDecodeError:
        # Only a parse failure falls back to raw output; a bare `except:`
        # would also swallow KeyboardInterrupt and SystemExit.
        print(content)  # Raw output if not JSON
    else:
        print(json.dumps(result_data, indent=2))
Debug Pattern 4: Live Environment Testing
From `check_tensor01_status.py` - test actual cluster environment:
@cluster(cleanup_on_success=False)
def check_remote_environment():
    """Run a small Python probe in a subprocess and return what it reported.

    The probe prints ``KEY:value`` lines for the interpreter version and
    path, the ``CUDA_VISIBLE_DEVICES`` environment variable (``NOT_SET`` when
    absent), and, if ``torch`` can be imported, its version, CUDA
    availability and device count. A failure while importing or querying
    torch is reported as a ``TORCH_ERROR:`` line instead of aborting.

    Returns:
        dict: ``"output"`` (probe stdout), ``"error"`` (probe stderr) and
        ``"code"`` (probe exit status).

    Raises:
        subprocess.TimeoutExpired: if the probe runs longer than 60 seconds.
    """
    # The script is passed verbatim to `python -c`, so it must start at
    # column 0. The environment lookup uses double quotes inside the
    # single-quoted f-string: reusing the same quote character there is a
    # SyntaxError on interpreters older than Python 3.12, which would make
    # the whole probe fail before printing anything.
    probe_script = """
import sys, os
print(f'PYTHON_VERSION:{sys.version}')
print(f'PYTHON_PATH:{sys.executable}')
print(f'CUDA_VISIBLE_DEVICES:{os.environ.get("CUDA_VISIBLE_DEVICES", "NOT_SET")}')
try:
    import torch
    print(f'TORCH_VERSION:{torch.__version__}')
    print(f'CUDA_AVAILABLE:{torch.cuda.is_available()}')
    print(f'DEVICE_COUNT:{torch.cuda.device_count()}')
except Exception as e:
    print(f'TORCH_ERROR:{e}')
"""
    result = subprocess.run(
        ["python", "-c", probe_script],
        capture_output=True,
        text=True,
        timeout=60,
    )
    return {"output": result.stdout, "error": result.stderr, "code": result.returncode}
Debugging Value
These patterns enable:
- Systematic analysis of failed cluster jobs
- Verification of virtual environment setup
- Inspection of execution logs and error files
- Live testing of cluster environment capabilities
- Structured examination of job results
Source: Repository cleanup Issue #72
Metadata
Metadata
Assignees
Labels
No labels