Skip to content

ATLAS Phase 6B: Memory Persistence and Lifecycle Management #15184

@Nero7991

Description

@Nero7991

Summary

Implement persistent ATLAS memory management with automatic save/load functionality, enabling ATLAS memory state to persist across server restarts and sessions. This system provides seamless memory continuity for long-running applications.

Background

ATLAS memory contains valuable learned context patterns that should persist across sessions. The memory persistence system needs to:

  • Automatically save ATLAS memory state on graceful shutdown
  • Load existing memory on startup if available
  • Support manual save/load operations
  • Handle memory file versioning and compatibility
  • Provide robust error handling and corruption recovery

Implementation Requirements

1. Memory File Format Specification

ATLAS Memory File Structure

// ATLAS memory file header.
// Fixed-size record at the start of every memory file. The save/load routines
// in this document fwrite()/fread() this struct verbatim, so the field order
// and widths below are the on-disk layout.
// NOTE(review): writing the raw struct ties the file to the writer's byte
// order and struct padding — confirm against the portability requirement.
struct atlas_memory_file_header {
    char magic[8];              // "ATLASMEM" (exactly 8 bytes, no NUL terminator stored)
    uint32_t version;           // File format version (ATLAS_MEMORY_FILE_VERSION at save time)
    uint32_t checksum;          // CRC32 checksum of content; patched in after the layer data is written
    uint64_t timestamp;         // Creation timestamp (value of get_current_timestamp() at save time)
    uint32_t model_hash;        // Hash of source model; stored on save, not compared by the load routine shown
    uint32_t config_hash;       // Hash of ATLAS configuration; stored on save, not compared by the load routine shown
    
    // Memory dimensions (copied from the context on save; the load routine
    // shown compares num_layers, memory_dim and window_size, not feature_dim)
    uint32_t num_layers;
    uint32_t memory_dim;
    uint32_t window_size;
    uint32_t feature_dim;
    
    // Memory state info (running statistics restored into ctx->stats on load)
    uint64_t total_tokens_processed;
    uint32_t memory_updates_count;
    float last_omega_loss;
    
    uint32_t reserved[16];      // Future extensions; zero-filled on save
};

// Memory data layout.
// Describes the logical contents of a memory file: the header followed by one
// record per layer. The pointer members name variable-length arrays (shapes in
// the trailing comments); a pointer value itself is a process-local address and
// cannot be written to disk, so the arrays have to be serialised one by one.
// NOTE(review): input_dim, key_dim, value_dim, total_params, polynomial_degree
// and cache_size appear in the shapes but are not header fields — confirm
// where a reader is expected to obtain them.
// NOTE(review): save_layer_memory as shown writes only w1, the window keys and
// (when enabled) the Muon momentum; the remaining arrays below are not yet
// serialised there.
struct atlas_memory_file_content {
    struct atlas_memory_file_header header;
    
    // Per-layer memory modules (num_layers entries)
    struct atlas_layer_memory {
        float* w1_weights;      // [memory_dim, input_dim]
        float* b1_bias;         // [memory_dim]
        float* w2_weights;      // [input_dim, memory_dim]
        float* b2_bias;         // [input_dim]
        float* residual_weights; // [input_dim, input_dim]
        
        // Sliding window state
        float* window_keys;     // [window_size, key_dim]
        float* window_values;   // [window_size, value_dim]
        uint32_t* window_positions; // [window_size]
        uint32_t window_head;
        uint32_t window_fill;
        
        // Optimizer state
        float* muon_momentum;   // [total_params]
        float* muon_hessian_approx; // [total_params, total_params] — quadratic in total_params
        
        // Feature mapping state
        float* polynomial_coeffs; // [polynomial_degree + 1]
        float* feature_cache_keys; // [cache_size, input_dim]
        float* feature_cache_values; // [cache_size, feature_dim]
        uint32_t feature_cache_size;
    } layers[];
};

2. Core Memory Persistence API

Save Operations

// Save ATLAS memory to file.
//
// Writes the file header (with a zero checksum placeholder), then every layer
// via save_layer_memory(), then computes the checksum and patches it into the
// header in place.
//
// ctx:      context whose dimensions, statistics and layers are serialised.
// filename: destination path; an existing file is truncated.
// Returns ATLAS_SUCCESS, or ATLAS_ERROR_FILE_WRITE if the file cannot be
// opened or any write, flush, seek or close fails.
int llama_atlas_save_memory(
    const struct atlas_context* ctx,
    const char* filename
) {
    FILE* file = fopen(filename, "wb");
    if (!file) {
        LOG_ERROR("Failed to open %s for writing\n", filename);
        return ATLAS_ERROR_FILE_WRITE;
    }
    
    // Prepare header; zero-initialisation also clears checksum and reserved[]
    struct atlas_memory_file_header header = {0};
    memcpy(header.magic, "ATLASMEM", 8);
    header.version = ATLAS_MEMORY_FILE_VERSION;
    header.timestamp = get_current_timestamp();
    header.model_hash = compute_model_hash(ctx);
    header.config_hash = compute_config_hash(ctx);
    
    // Memory dimensions
    header.num_layers = ctx->num_layers;
    header.memory_dim = ctx->memory_dim;
    header.window_size = ctx->window_size;
    header.feature_dim = ctx->feature_dim;
    
    // Memory statistics
    header.total_tokens_processed = ctx->stats.total_tokens_processed;
    header.memory_updates_count = ctx->stats.memory_updates_count;
    header.last_omega_loss = ctx->stats.last_omega_loss;
    
    // Write header
    if (fwrite(&header, sizeof(header), 1, file) != 1) {
        fclose(file);
        return ATLAS_ERROR_FILE_WRITE;
    }
    
    // Write layer data
    for (int layer = 0; layer < ctx->num_layers; layer++) {
        if (save_layer_memory(file, &ctx->layers[layer]) != 0) {
            fclose(file);
            return ATLAS_ERROR_FILE_WRITE;
        }
    }
    
    // compute_file_checksum() re-reads the file by name, so everything still
    // sitting in the stdio buffer must reach the file first; otherwise the
    // checksum would cover a truncated file.
    if (fflush(file) != 0) {
        fclose(file);
        return ATLAS_ERROR_FILE_WRITE;
    }
    
    // Compute and update checksum
    // NOTE(review): the checksum field is zero while it is computed here but
    // holds the final value when the loader recomputes it; this only matches
    // if compute_file_checksum() skips that field — confirm.
    uint32_t checksum = compute_file_checksum(filename);
    if (fseek(file, offsetof(struct atlas_memory_file_header, checksum), SEEK_SET) != 0 ||
        fwrite(&checksum, sizeof(checksum), 1, file) != 1) {
        fclose(file);
        return ATLAS_ERROR_FILE_WRITE;
    }
    
    // fclose() performs the final flush; a failure here means data was lost
    if (fclose(file) != 0) {
        return ATLAS_ERROR_FILE_WRITE;
    }
    
    LOG_INFO("ATLAS memory saved to %s\n", filename);
    return ATLAS_SUCCESS;
}

// Save layer-specific memory data.
//
// Writes, in order and as raw floats: the w1 weight matrix, the sliding-window
// keys and — only when layer->muon.enabled is set — the Muon momentum buffer.
// Returns ATLAS_SUCCESS, or ATLAS_ERROR_FILE_WRITE on the first short write.
//
// NOTE(review): the other per-layer buffers listed in the file layout (biases,
// w2, residual weights, window values/positions, feature cache) are not
// written here yet.
static int save_layer_memory(FILE* file, const struct atlas_layer_context* layer) {
    // Element counts are computed in size_t so the products cannot wrap in a
    // narrower integer type before being compared with fwrite()'s size_t result.
    const size_t w1_count = (size_t)layer->memory_dim * (size_t)layer->input_dim;
    const size_t key_count = (size_t)layer->window_size * (size_t)layer->key_dim;
    const size_t param_count = (size_t)layer->total_params;
    
    // Save memory module weights
    if (fwrite(layer->memory.w1, sizeof(float), w1_count, file) != w1_count) {
        return ATLAS_ERROR_FILE_WRITE;
    }
    
    // Save sliding window state
    if (fwrite(layer->omega.window.keys, sizeof(float), key_count, file) != key_count) {
        return ATLAS_ERROR_FILE_WRITE;
    }
    
    // Save optimizer state (this section is absent from the file when disabled)
    if (layer->muon.enabled) {
        if (fwrite(layer->muon.momentum, sizeof(float), param_count, file) != param_count) {
            return ATLAS_ERROR_FILE_WRITE;
        }
    }
    
    return ATLAS_SUCCESS;
}

Load Operations

// Load ATLAS memory from file.
//
// Reads and validates the header (magic, version, dimensions, checksum), then
// reads every layer via load_layer_memory() and restores the running
// statistics into ctx->stats.
//
// ctx:      context to populate; its num_layers, memory_dim and window_size
//           must match the values stored in the file.
// filename: path of the memory file.
// Returns ATLAS_SUCCESS, or one of ATLAS_ERROR_FILE_NOT_FOUND,
// ATLAS_ERROR_FILE_READ, ATLAS_ERROR_INVALID_FORMAT,
// ATLAS_ERROR_VERSION_MISMATCH, ATLAS_ERROR_INCOMPATIBLE,
// ATLAS_ERROR_CORRUPTED.
//
// NOTE(review): if a layer fails to load part-way through, earlier layers in
// ctx have already been overwritten — confirm callers treat ctx as invalid.
int llama_atlas_load_memory(
    struct atlas_context* ctx,
    const char* filename
) {
    FILE* file = fopen(filename, "rb");
    if (!file) {
        LOG_WARNING("ATLAS memory file %s not found\n", filename);
        return ATLAS_ERROR_FILE_NOT_FOUND;
    }
    
    // Read header
    struct atlas_memory_file_header header;
    if (fread(&header, sizeof(header), 1, file) != 1) {
        fclose(file);
        return ATLAS_ERROR_FILE_READ;
    }
    
    // Validate header
    if (memcmp(header.magic, "ATLASMEM", 8) != 0) {
        LOG_ERROR("Invalid ATLAS memory file format\n");
        fclose(file);
        return ATLAS_ERROR_INVALID_FORMAT;
    }
    
    if (header.version != ATLAS_MEMORY_FILE_VERSION) {
        // version is uint32_t: print it as unsigned rather than with %d
        LOG_WARNING("ATLAS memory file version mismatch: %u vs %u\n",
                   (unsigned)header.version, (unsigned)ATLAS_MEMORY_FILE_VERSION);
        // Attempt version migration if possible
        if (!migrate_memory_version(&header, file)) {
            fclose(file);
            return ATLAS_ERROR_VERSION_MISMATCH;
        }
    }
    
    // Verify compatibility
    // NOTE(review): feature_dim, model_hash and config_hash are stored in the
    // header but not compared here — confirm whether they should be.
    if (header.num_layers != ctx->num_layers ||
        header.memory_dim != ctx->memory_dim ||
        header.window_size != ctx->window_size) {
        LOG_ERROR("ATLAS memory file incompatible with current configuration\n");
        fclose(file);
        return ATLAS_ERROR_INCOMPATIBLE;
    }
    
    // Verify checksum
    uint32_t computed_checksum = compute_file_checksum(filename);
    if (computed_checksum != header.checksum) {
        LOG_ERROR("ATLAS memory file corrupted (checksum mismatch)\n");
        fclose(file);
        return ATLAS_ERROR_CORRUPTED;
    }
    
    // Load layer data
    for (int layer = 0; layer < ctx->num_layers; layer++) {
        if (load_layer_memory(file, &ctx->layers[layer]) != 0) {
            fclose(file);
            return ATLAS_ERROR_FILE_READ;
        }
    }
    
    // Restore statistics
    ctx->stats.total_tokens_processed = header.total_tokens_processed;
    ctx->stats.memory_updates_count = header.memory_updates_count;
    ctx->stats.last_omega_loss = header.last_omega_loss;
    
    fclose(file);
    // total_tokens_processed is uint64_t; %lu is only 32 bits wide on some
    // platforms, so widen explicitly and print with %llu
    LOG_INFO("ATLAS memory loaded from %s (%llu tokens processed)\n", 
             filename, (unsigned long long)header.total_tokens_processed);
    return ATLAS_SUCCESS;
}

3. Automatic Memory Management

Graceful Shutdown Handler

// Shutdown flag set from the signal handler and polled by the rest of the
// program. volatile sig_atomic_t is the type the C standard guarantees can be
// written safely from an asynchronous signal handler.
// NOTE(review): this does not by itself synchronise with other threads; an
// atomic flag would be needed for a strict cross-thread guarantee.
static volatile sig_atomic_t atlas_shutdown_requested = 0;

// Signal handler for graceful shutdown.
// Only records the request. Logging is deliberately not done here because
// stdio-based logging is not async-signal-safe; whoever observes the flag
// should log and perform the actual shutdown.
void atlas_signal_handler(int signal) {
    (void)signal; // the signal number is not needed to record the request
    atlas_shutdown_requested = 1;
}

// Register shutdown handlers.
//
// Stores ctx for the exit handler, installs atlas_signal_handler for SIGINT
// and SIGTERM, and registers atlas_atexit_handler with atexit().
// Calling this again only replaces the stored context and re-installs the
// signal handlers; the atexit handler is registered at most once so the
// memory is not saved several times at exit.
void llama_atlas_register_shutdown_handlers(struct atlas_context* ctx) {
    static bool atexit_registered = false;
    
    // Publish the context first so a handler that fires immediately after
    // registration already sees it
    global_atlas_context = ctx;
    
    // Register signal handlers
    if (signal(SIGINT, atlas_signal_handler) == SIG_ERR) {
        LOG_WARNING("Failed to register ATLAS SIGINT handler\n");
    }
    if (signal(SIGTERM, atlas_signal_handler) == SIG_ERR) {
        LOG_WARNING("Failed to register ATLAS SIGTERM handler\n");
    }
    
    // Register atexit handler (once)
    if (!atexit_registered) {
        if (atexit(atlas_atexit_handler) == 0) {
            atexit_registered = true;
        } else {
            LOG_WARNING("Failed to register ATLAS atexit handler\n");
        }
    }
}

// Graceful shutdown implementation.
//
// Saves the ATLAS memory once, if ctx is non-NULL and auto-save is enabled.
// Destination, in order of preference:
//   1. ctx->current_memory_file, when set — the same file the auto-save
//      thread writes, so a custom memory file keeps receiving the updates;
//   2. "<model name without extension>_memory.atlas";
//   3. "atlas_memory_<timestamp>.atlas" when no model filename is known.
// The result is only logged; nothing is returned to the caller.
void llama_atlas_graceful_shutdown(struct atlas_context* ctx) {
    if (!ctx || !ctx->auto_save_enabled) return;
    
    LOG_INFO("Performing graceful ATLAS memory save...\n");
    
    // Choose the destination filename
    char memory_filename[512];
    if (ctx->current_memory_file[0] != '\0') {
        snprintf(memory_filename, sizeof(memory_filename),
                "%s", ctx->current_memory_file);
    } else if (ctx->model_filename) {
        snprintf(memory_filename, sizeof(memory_filename),
                "%s_memory.atlas", 
                get_filename_without_extension(ctx->model_filename));
    } else {
        // Widen explicitly: %lu does not match a 64-bit timestamp everywhere
        snprintf(memory_filename, sizeof(memory_filename),
                "atlas_memory_%llu.atlas",
                (unsigned long long)get_current_timestamp());
    }
    
    // Save memory state
    int result = llama_atlas_save_memory(ctx, memory_filename);
    if (result == ATLAS_SUCCESS) {
        LOG_INFO("ATLAS memory successfully saved to %s\n", memory_filename);
    } else {
        LOG_ERROR("Failed to save ATLAS memory: %d\n", result);
    }
}

// Process-exit hook: runs the graceful shutdown for the globally stored
// context, and does nothing when no context is stored.
void atlas_atexit_handler(void) {
    if (!global_atlas_context) {
        return;
    }

    llama_atlas_graceful_shutdown(global_atlas_context);
}

Automatic Load on Startup

// Auto-load memory during initialization.
//
// ctx:                context to load into.
// model_filename:     model path used to derive the default memory filename
//                     ("<name without extension>_memory.atlas"); may be NULL
//                     when custom_memory_file is given.
// custom_memory_file: explicit memory file; takes precedence when non-empty.
//
// Returns ATLAS_SUCCESS when the memory was loaded or when no memory file
// exists yet (fresh start), ATLAS_ERROR_NO_FILENAME when neither filename is
// available, otherwise the error code from llama_atlas_load_memory().
// On a successful load the file is recorded in ctx->current_memory_file.
int llama_atlas_auto_load_memory(
    struct atlas_context* ctx,
    const char* model_filename,
    const char* custom_memory_file
) {
    char memory_filename[512];
    
    // Determine memory file to load
    if (custom_memory_file && custom_memory_file[0] != '\0') {
        // snprintf always NUL-terminates; strncpy into this uninitialised
        // buffer left it unterminated for paths of 511 bytes or more
        snprintf(memory_filename, sizeof(memory_filename),
                "%s", custom_memory_file);
    } else if (model_filename) {
        // Generate default memory filename
        snprintf(memory_filename, sizeof(memory_filename),
                "%s_memory.atlas", 
                get_filename_without_extension(model_filename));
    } else {
        LOG_WARNING("No model filename available for auto-load\n");
        return ATLAS_ERROR_NO_FILENAME;
    }
    
    // Check if memory file exists
    if (!file_exists(memory_filename)) {
        LOG_INFO("No existing ATLAS memory file found, starting with fresh memory\n");
        return ATLAS_SUCCESS; // Not an error
    }
    
    // Attempt to load
    int result = llama_atlas_load_memory(ctx, memory_filename);
    if (result == ATLAS_SUCCESS) {
        LOG_INFO("Successfully auto-loaded ATLAS memory from %s\n", memory_filename);
        ctx->memory_loaded_from_file = true;
        // Terminated copy, for the same reason as above
        snprintf(ctx->current_memory_file, sizeof(ctx->current_memory_file),
                "%s", memory_filename);
    } else {
        LOG_WARNING("Failed to auto-load ATLAS memory from %s: %d\n", 
                   memory_filename, result);
        // Continue with fresh memory; the error code is still reported
    }
    
    return result;
}

4. Periodic Auto-Save

Background Save Thread

// Periodic save configuration.
// Read by the background auto-save thread on every polling pass; any one of
// the time, token or loss conditions below is enough to trigger a save.
struct atlas_autosave_config {
    bool enabled;               // Thread keeps running only while this is true
    int interval_seconds;       // Save interval (e.g., 300 = 5 minutes), compared with wall-clock seconds since the last save
    int token_threshold;        // Save after N tokens processed since the last save
    float loss_change_threshold; // Minimum drop in loss, relative to the last save, that triggers a save
    bool save_on_improvement;   // Enables the loss-based trigger; the thread only reacts to the loss getting lower
};

// Background auto-save thread
void* atlas_autosave_thread(void* arg) {
    struct atlas_context* ctx = (struct atlas_context*)arg;
    time_t last_save_time = time(NULL);
    uint64_t last_save_tokens = ctx->stats.total_tokens_processed;
    float last_save_loss = ctx->stats.last_omega_loss;
    
    while (!atlas_shutdown_requested && ctx->autosave.enabled) {
        time_t current_time = time(NULL);
        uint64_t current_tokens = ctx->stats.total_tokens_processed;
        float current_loss = ctx->stats.last_omega_loss;
        
        bool should_save = false;
        
        // Time-based saving
        if (current_time - last_save_time >= ctx->autosave.interval_seconds) {
            should_save = true;
            LOG_DEBUG("Auto-save triggered by time interval\n");
        }
        
        // Token-based saving
        if (current_tokens - last_save_tokens >= ctx->autosave.token_threshold) {
            should_save = true;
            LOG_DEBUG("Auto-save triggered by token threshold\n");
        }
        
        // Loss improvement saving
        if (ctx->autosave.save_on_improvement && 
            current_loss < last_save_loss - ctx->autosave.loss_change_threshold) {
            should_save = true;
            LOG_DEBUG("Auto-save triggered by loss improvement\n");
        }
        
        if (should_save && ctx->current_memory_file[0] != '\0') {
            // Create backup of current file
            char backup_filename[512];
            snprintf(backup_filename, sizeof(backup_filename),
                    "%s.backup", ctx->current_memory_file);
            
            if (file_exists(ctx->current_memory_file)) {
                rename(ctx->current_memory_file, backup_filename);
            }
            
            // Save current state
            int result = llama_atlas_save_memory(ctx, ctx->current_memory_file);
            if (result == ATLAS_SUCCESS) {
                // Remove backup on successful save
                if (file_exists(backup_filename)) {
                    unlink(backup_filename);
                }
                
                last_save_time = current_time;
                last_save_tokens = current_tokens;
                last_save_loss = current_loss;
                
                LOG_DEBUG("Auto-save completed successfully\n");
            } else {
                // Restore backup on failure
                if (file_exists(backup_filename)) {
                    rename(backup_filename, ctx->current_memory_file);
                }
                
                LOG_ERROR("Auto-save failed: %d\n", result);
            }
        }
        
        // Sleep for check interval (e.g., 10 seconds)
        sleep(10);
    }
    
    return NULL;
}

5. Memory File Management Utilities

File Operations and Validation

// Validate memory file integrity
int llama_atlas_validate_memory_file(const char* filename) {
    struct atlas_memory_file_header header;
    
    FILE* file = fopen(filename, "rb");
    if (!file) return ATLAS_ERROR_FILE_NOT_FOUND;
    
    // Read header
    if (fread(&header, sizeof(header), 1, file) != 1) {
        fclose(file);
        return ATLAS_ERROR_FILE_READ;
    }
    fclose(file);
    
    // Validate magic
    if (memcmp(header.magic, "ATLASMEM", 8) != 0) {
        return ATLAS_ERROR_INVALID_FORMAT;
    }
    
    // Validate checksum
    uint32_t computed_checksum = compute_file_checksum(filename);
    if (computed_checksum != header.checksum) {
        return ATLAS_ERROR_CORRUPTED;
    }
    
    return ATLAS_SUCCESS;
}

// Get memory file information
int llama_atlas_get_memory_file_info(
    const char* filename,
    struct atlas_memory_file_info* info
) {
    struct atlas_memory_file_header header;
    
    FILE* file = fopen(filename, "rb");
    if (!file) return ATLAS_ERROR_FILE_NOT_FOUND;
    
    if (fread(&header, sizeof(header), 1, file) != 1) {
        fclose(file);
        return ATLAS_ERROR_FILE_READ;
    }
    fclose(file);
    
    // Fill info structure
    info->version = header.version;
    info->timestamp = header.timestamp;
    info->num_layers = header.num_layers;
    info->memory_dim = header.memory_dim;
    info->window_size = header.window_size;
    info->total_tokens_processed = header.total_tokens_processed;
    info->memory_updates_count = header.memory_updates_count;
    info->last_omega_loss = header.last_omega_loss;
    
    return ATLAS_SUCCESS;
}

// Repair a corrupted memory file (placeholder).
// No recovery is performed yet: the function logs a warning and reports
// ATLAS_ERROR_NOT_IMPLEMENTED for every input. Planned strategies are
// restoring from a backup, fixing checksum issues and migrating older versions.
int llama_atlas_repair_memory_file(const char* filename) {
    (void)filename; // unused until recovery logic exists

    LOG_WARNING("Memory file repair not yet implemented\n");

    return ATLAS_ERROR_NOT_IMPLEMENTED;
}

Testing Requirements

Unit Tests

  • File format: Save/load cycle preserves all memory state
  • Checksum validation: Corrupted files properly detected
  • Version compatibility: Handle version mismatches gracefully
  • Error handling: Robust failure recovery
  • File operations: Handle filesystem errors properly

Integration Tests

  • Server integration: Memory persists across server restarts
  • Concurrent access: Multiple processes don't corrupt memory files
  • Auto-save: Periodic saving works correctly
  • Signal handling: Graceful shutdown saves memory
  • API integration: Manual save/load via HTTP endpoints

Stress Tests

  • Large memory: Handle memory files >1GB
  • Frequent saves: Auto-save every minute for 24 hours
  • Corruption recovery: Handle various corruption scenarios
  • Disk space: Graceful handling of disk full conditions
  • File system: Work across different file systems (ext4, NTFS, APFS)

Implementation Files

Core Implementation

  • src/atlas/atlas-memory-persistence.h - Memory persistence API
  • src/atlas/atlas-memory-persistence.cpp - Core save/load implementation
  • src/atlas/atlas-file-format.h - File format definitions
  • src/atlas/atlas-autosave.cpp - Automatic save functionality

Utilities

  • src/atlas/atlas-file-utils.cpp - File operation utilities
  • src/atlas/atlas-checksum.cpp - Checksum and validation
  • src/atlas/atlas-migration.cpp - Version migration support

Server Integration

  • examples/server/server-atlas-persistence.cpp - Server memory management
  • examples/server/atlas-memory-endpoints.cpp - HTTP endpoints for memory ops

Tools

  • tools/atlas-memory-tool.cpp - Command-line memory file utility
  • tools/atlas-memory-viewer.cpp - Memory file inspection tool

Test Files

  • tests/atlas/test-memory-persistence.cpp - Core persistence testing
  • tests/atlas/test-memory-corruption.cpp - Corruption handling tests
  • tests/atlas/test-autosave.cpp - Auto-save functionality tests

Success Criteria

Functional Requirements

  • Memory state completely preserved across sessions
  • Automatic save/load works reliably
  • File corruption detected and handled gracefully
  • Manual save/load operations work via API
  • Memory files are portable across systems

Performance Requirements

  • Save operation completes in <5 seconds for typical memory sizes
  • Load operation completes in <2 seconds
  • Auto-save doesn't impact inference performance
  • Memory file size <100MB for typical configurations

Reliability Requirements

  • Zero data loss during graceful shutdown
  • Corruption detection accuracy >99.9%
  • Auto-save failure rate <0.1%
  • Recovery from backup successful >95% of cases

Command Line Usage

# Basic usage with auto-save/load
./llama-server -m model.gguf --atlas --atlas-auto-memory

# Specify custom memory file
./llama-server -m model.gguf --atlas --atlas-memory-file custom_memory.atlas

# Disable auto-save (manual only)
./llama-server -m model.gguf --atlas --atlas-no-auto-save

# Configure auto-save interval
./llama-server -m model.gguf --atlas --atlas-autosave-interval 600

# Memory file utilities
./atlas-memory-tool --validate model_memory.atlas
./atlas-memory-tool --info model_memory.atlas
./atlas-memory-tool --repair corrupted_memory.atlas

Dependencies

  • Issues #3-10: All ATLAS components and server integration
  • File I/O libraries and filesystem support
  • Signal handling (POSIX signals)
  • Threading support for auto-save
  • Checksum/hashing library (CRC32, SHA256)

Estimated Effort

2 weeks for experienced systems programmer with file format design experience

References

  • Binary file format design best practices
  • Database transaction and recovery principles
  • POSIX signal handling documentation
  • File system reliability and atomic operations

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions