ggml-org · doctorjei · Apr 12, 2026 · Apr 12, 2026 · Apr 12, 2026 · Apr 17, 2026
@@ -574,6 +574,21 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
     }
 
+    // --hugepages compatibility checks. PR #1 covers weights on the --mmap path only;
+    // --no-mmap and --direct-io both route around the hugetlb branch, so surface the
+    // conflict to the user rather than silently ignoring the request. Linux-only.
+    if (params.use_hugepages) {
+#ifndef __linux__
+        throw std::invalid_argument("error: --hugepages is Linux only.\n");
+#endif
+        if (!params.use_mmap) {
+            throw std::invalid_argument("error: --hugepages requires --mmap. Coverage for --no-mmap is added in a follow-up PR. Drop --no-mmap to use --hugepages.\n");
+        }
+        if (params.use_direct_io) {
+            throw std::invalid_argument("error: --hugepages and --direct-io are incompatible. Drop one.\n");
+        }
+    }
+
     // handle model and download
     if (!skip_model_download) {
         auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
@@ -2225,6 +2240,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.use_mmap = value;
         }
     ).set_env("LLAMA_ARG_MMAP"));
+    add_opt(common_arg(
+        {"--hugepages"},
+        "back model weights with anonymous hugetlb 2 MiB pages (Linux only).\n"
+        "reserve the pool first with e.g. `sysctl -w vm.nr_hugepages=N` — no reboot required.\n"
+        "pinned in RAM, bypasses swap and page cache; primary win is vmemmap reclamation via HVO",
+        [](common_params & params) {
+            params.use_hugepages = true;
+        }
+    ).set_env("LLAMA_ARG_HUGEPAGES"));
     add_opt(common_arg(
         {"-dio", "--direct-io"},
         {"-ndio", "--no-direct-io"},

@@ -1421,6 +1421,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.tensor_split    = params.tensor_split;
     mparams.use_mmap        = params.use_mmap;
     mparams.use_direct_io   = params.use_direct_io;
+    mparams.use_hugepages   = params.use_hugepages;
     mparams.use_mlock       = params.use_mlock;
     mparams.check_tensors   = params.check_tensors;
     mparams.use_extra_bufts = !params.no_extra_bufts;

@@ -534,6 +534,7 @@ struct common_params {
     bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
     bool use_mmap          = true;  // enable mmap to use filesystem cache
     bool use_direct_io     = false; // read from disk without buffering
+    bool use_hugepages     = false; // back weight mappings with anonymous hugetlb pages (Linux only)
     bool use_mlock         = false; // use mlock to keep model in memory
     bool verbose_prompt    = false; // print prompt tokens before generation
     bool display_prompt    = true;  // print prompt before generation

@@ -606,6 +606,8 @@ ggml_backend_cuda_context::~ggml_backend_cuda_context() {
 struct ggml_backend_cuda_buffer_context {
     int device;
     void * dev_ptr = nullptr;
+    void * host_ptr = nullptr;
+    bool   owned    = true;
     std::string name;
 
     ggml_backend_cuda_buffer_context(int device, void * dev_ptr) :
@@ -614,7 +616,12 @@ struct ggml_backend_cuda_buffer_context {
     }
 
     ~ggml_backend_cuda_buffer_context() {
-        CUDA_CHECK(cudaFree(dev_ptr));
+        if (owned) {
+            CUDA_CHECK(cudaFree(dev_ptr));
+        } else {
+            // host_ptr was registered via cudaHostRegister; dev_ptr aliases it and is not ours to free.
+            CUDA_CHECK(cudaHostUnregister(host_ptr));
+        }
     }
 };
 
@@ -645,10 +652,16 @@ static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer
         const size_t original_size = ggml_nbytes(tensor);
         const size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
 
-        if (padded_size > original_size) {
+        if (padded_size > original_size && ctx->owned) {
             ggml_cuda_set_device(ctx->device);
             CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
         }
+        // For externally-owned buffers (buffer_from_host_ptr), the memset is
+        // skipped: hipMemset through a hipHostGetDevicePointer-derived address
+        // is unsupported on ROCm integrated GPUs, and the padding region may
+        // extend past the registered host range for the final tensor. GGUF
+        // files zero-pad between tensors by convention, so the padding bytes
+        // in the mmap'd region are already zero.
     }
     return GGML_STATUS_SUCCESS;
 }
@@ -4653,10 +4666,24 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
     bool events = true;
 #endif
 
+    // buffer_from_host_ptr is currently enabled only on HIP integrated GPUs
+    // (validated on Strix Halo / ROCm 7.2.0). NVIDIA Jetson reports
+    // prop.integrated == 1 too and may benefit from the same path, but it
+    // has not been tested on that platform; #15034's cuda_host-buffer
+    // corruption is in a different code path (see ggml-cuda.cu:243) and
+    // is not expected to apply here, but validation is required before
+    // extending beyond HIP.
+    bool buffer_from_host_ptr = false;
+#if defined(GGML_USE_HIP)
+    cudaDeviceProp prop;
+    CUDA_CHECK(cudaGetDeviceProperties(&prop, ctx->device));
+    buffer_from_host_ptr = prop.integrated > 0;
+#endif
+
     props->caps = {
         /* .async                 = */ true,
         /* .host_buffer           = */ host_buffer,
-        /* .buffer_from_host_ptr  = */ false,
+        /* .buffer_from_host_ptr  = */ buffer_from_host_ptr,
         /* .events                = */ events,
     };
 }
@@ -4677,6 +4704,53 @@ static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_host_buffer_type(
     return ggml_backend_cuda_host_buffer_type();
 }
 
+#if defined(GGML_USE_HIP)
+// Exposes caller-provided host memory [ptr, ptr + size) as a backend buffer: registers the range with
+// cudaHostRegister and stores the pointer from cudaHostGetDevicePointer as dev_ptr. Returns nullptr on failure.
+// HIP-only for now; see the capability flag in get_props(). TODO: extend to CUDA / Jetson after validating that #15034's corruption mode does not apply here.
+static ggml_backend_buffer_t ggml_backend_cuda_device_buffer_from_host_ptr(
+        ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+    GGML_UNUSED(max_tensor_size);
+
+    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+    ggml_cuda_set_device(ctx->device);
+    // Devices that do not report prop.integrated are rejected with nullptr.
+    cudaDeviceProp prop;
+    CUDA_CHECK(cudaGetDeviceProperties(&prop, ctx->device));
+    if (prop.integrated <= 0) {
+        return nullptr;
+    }
+
+    // ReadOnly is intentionally not set. NOTE(review): ggml_backend_cuda_buffer_init_tensor
+    // skips its padding cudaMemset for non-owned buffers, so that path no longer writes here;
+    // confirm no other device-side writes exist before adding cudaHostRegisterReadOnly.
+    cudaError_t err = cudaHostRegister(ptr, size,
+        cudaHostRegisterPortable | cudaHostRegisterMapped);
+    if (err != cudaSuccess) {
+        (void)cudaGetLastError(); // reset the runtime's last-error state so later calls do not see this failure
+        GGML_LOG_ERROR("%s: cudaHostRegister failed: %s\n", __func__, cudaGetErrorString(err));
+        return nullptr;
+    }
+    // The range was registered as Mapped: fetch the device-side address for it.
+    void * dev_ptr = nullptr;
+    err = cudaHostGetDevicePointer(&dev_ptr, ptr, 0);
+    if (err != cudaSuccess) {
+        (void)cudaGetLastError();
+        cudaHostUnregister(ptr); // best-effort rollback of the registration above; its result is ignored
+        GGML_LOG_ERROR("%s: cudaHostGetDevicePointer failed: %s\n", __func__, cudaGetErrorString(err));
+        return nullptr;
+    }
+    // owned = false makes the context destructor call cudaHostUnregister(host_ptr) instead of cudaFree(dev_ptr).
+    ggml_backend_cuda_buffer_context * buf_ctx = new ggml_backend_cuda_buffer_context(ctx->device, dev_ptr);
+    buf_ctx->host_ptr = ptr;
+    buf_ctx->owned    = false;
+
+    return ggml_backend_buffer_init(
+        ggml_backend_cuda_device_get_buffer_type(dev),
+        ggml_backend_cuda_buffer_interface, buf_ctx, size);
+}
+#endif // GGML_USE_HIP
+
 // TODO: move these functions here
 static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
     ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
@@ -5126,7 +5200,11 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
     /* .init_backend            = */ ggml_backend_cuda_device_init_backend,
     /* .get_buffer_type         = */ ggml_backend_cuda_device_get_buffer_type,
     /* .get_host_buffer_type    = */ ggml_backend_cuda_device_get_host_buffer_type,
+#if defined(GGML_USE_HIP)
+    /* .buffer_from_host_ptr    = */ ggml_backend_cuda_device_buffer_from_host_ptr,
+#else
     /* .buffer_from_host_ptr    = */ NULL,
+#endif
     /* .supports_op             = */ ggml_backend_cuda_device_supports_op,
     /* .supports_buft           = */ ggml_backend_cuda_device_supports_buft,
     /* .offload_op              = */ ggml_backend_cuda_device_offload_op,

@@ -74,7 +74,9 @@
 #define cudaGetDeviceProperties hipGetDeviceProperties
 #define cudaGetErrorString hipGetErrorString
 #define cudaGetLastError hipGetLastError
+#define cudaHostGetDevicePointer hipHostGetDevicePointer
 #define cudaHostRegister hipHostRegister
+#define cudaHostRegisterMapped hipHostRegisterMapped
 #define cudaHostRegisterPortable hipHostRegisterPortable
 #define cudaHostRegisterReadOnly hipHostRegisterReadOnly
 #define cudaHostUnregister hipHostUnregister

diff --git a/include/llama.h b/include/llama.h
@@ -314,6 +314,7 @@ extern "C" {
         bool vocab_only;      // only load the vocabulary, no weights
         bool use_mmap;        // use mmap if possible
         bool use_direct_io;   // use direct io, takes precedence over use_mmap when supported
+        bool use_hugepages;   // back model memory with anonymous hugetlb pages (Linux only)
         bool use_mlock;       // force system to keep model in RAM
         bool check_tensors;   // validate model tensor data
         bool use_extra_bufts; // use extra buffer types (used for weight repacking)

@@ -40,6 +40,20 @@
 #include <TargetConditionals.h>
 #endif
 
+#ifdef __linux__
+// Older libc headers may miss these; the fallbacks below are the asm-generic Linux UAPI values.
+#ifndef MAP_HUGETLB
+#define MAP_HUGETLB 0x40000
+#endif
+#ifndef MAP_HUGE_2MB
+#define MAP_HUGE_2MB (21 << 26) // log2(2 MiB) << MAP_HUGE_SHIFT; 26 is spelled out because headers lacking MAP_HUGE_2MB may lack MAP_HUGE_SHIFT too
+#endif
+#endif
+
+// 2 MiB hugepage size, used for both the hugetlb mmap length and the
+// unmap_fragment alignment granularity when the mapping is hugetlb-backed.
+static constexpr size_t LLAMA_HUGE_PAGE_SIZE = 2ull * 1024 * 1024;
+
 // TODO: consider moving to llama-impl.h if needed in more places
 #if defined(_WIN32)
 static std::string llama_format_win_err(DWORD err) {
@@ -434,9 +448,38 @@ struct llama_mmap::impl {
 #ifdef _POSIX_MAPPED_FILES
     std::vector<std::pair<size_t, size_t>> mapped_fragments;
 
-    impl(struct llama_file * file, size_t prefetch, bool numa) {
+    impl(struct llama_file * file, size_t prefetch, bool numa, bool hugetlb) {
         size = file->size();
         int fd = file->file_id();
+#ifdef __linux__
+        if (hugetlb) {
+            // Anonymous hugetlb mapping rounded up to 2 MiB. PROT_WRITE lets
+            // load_all_data pread the file in (downgraded to PROT_READ after);
+            // MAP_POPULATE surfaces pool-exhaustion here, not mid-load.
+            mmap_size = (size + LLAMA_HUGE_PAGE_SIZE - 1) & ~(LLAMA_HUGE_PAGE_SIZE - 1);
+            addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+                        MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_2MB | MAP_POPULATE, -1, 0);
+            if (addr == MAP_FAILED) {
+                int saved = errno;
+                if (saved == ENOMEM) {
+                    size_t need = mmap_size / LLAMA_HUGE_PAGE_SIZE;
+                    long   have = -1;
+                    if (FILE * f = std::fopen("/sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages", "r")) {
+                        if (std::fscanf(f, "%ld", &have) != 1) { have = -1; }
+                        std::fclose(f);
+                    }
+                    throw std::runtime_error(format("hugetlb mmap failed: need %zu free 2 MiB pages, pool has %ld. "
+                                                    "Try: sudo sysctl -w vm.nr_hugepages=%zu", need, have, need));
+                }
+                throw std::runtime_error(format("hugetlb mmap failed: %s", strerror(saved)));
+            }
+            is_hugetlb_ = true;
+            mapped_fragments.emplace_back(0, mmap_size);
+            return;
+        }
+#else
+        (void) hugetlb;
+#endif
         int flags = MAP_SHARED;
         if (numa) { prefetch = 0; }
 #ifdef __linux__
@@ -480,7 +523,9 @@ struct llama_mmap::impl {
     }
 
     void unmap_fragment(size_t first, size_t last) {
-        int page_size = sysconf(_SC_PAGESIZE);
+        // Hugetlb munmaps must be 2 MiB-aligned; the file-backed path uses
+        // the kernel base page size as before.
+        size_t page_size = is_hugetlb_ ? LLAMA_HUGE_PAGE_SIZE : (size_t) sysconf(_SC_PAGESIZE);
         align_range(&first, &last, page_size);
         size_t len = last - first;
 
@@ -525,8 +570,9 @@ struct llama_mmap::impl {
 #elif defined(_WIN32)
     HANDLE hMapping = nullptr;
 
-    impl(struct llama_file * file, size_t prefetch, bool numa) {
+    impl(struct llama_file * file, size_t prefetch, bool numa, bool hugetlb) {
         GGML_UNUSED(numa);
+        GGML_UNUSED(hugetlb);
 
         size = file->size();
 
@@ -589,10 +635,11 @@ struct llama_mmap::impl {
         }
     }
 #else
-    impl(struct llama_file * file, size_t prefetch, bool numa) {
+    impl(struct llama_file * file, size_t prefetch, bool numa, bool hugetlb) {
         GGML_UNUSED(file);
         GGML_UNUSED(prefetch);
         GGML_UNUSED(numa);
+        GGML_UNUSED(hugetlb); // ignored like the other arguments: this fallback always throws
 
         throw std::runtime_error("mmap not supported");
     }
@@ -605,15 +652,21 @@ struct llama_mmap::impl {
     }
 #endif
 
-    void * addr;
-    size_t size;
+    void * addr = nullptr;
+    size_t size = 0;
+    // Hugetlb: physical mapping length (file size rounded up to 2 MiB) used
+    // by munmap; `size` continues to report the underlying file length.
+    size_t mmap_size = 0;
+    bool   is_hugetlb_ = false;
 };
 
-llama_mmap::llama_mmap(struct llama_file * file, size_t prefetch, bool numa) : pimpl(std::make_unique<impl>(file, prefetch, numa)) {}
+llama_mmap::llama_mmap(struct llama_file * file, size_t prefetch, bool numa, bool hugetlb) : pimpl(std::make_unique<impl>(file, prefetch, numa, hugetlb)) {}
 llama_mmap::~llama_mmap() = default;
 
 size_t llama_mmap::size() const { return pimpl->size; }
+size_t llama_mmap::mmap_size() const { return pimpl->mmap_size ? pimpl->mmap_size : pimpl->size; } // mmap_size == 0 (its default) means "not set": report the file size instead
 void * llama_mmap::addr() const { return pimpl->addr; }
+bool   llama_mmap::is_hugetlb() const { return pimpl->is_hugetlb_; }
 
 void llama_mmap::unmap_fragment(size_t first, size_t last) { pimpl->unmap_fragment(first, last); }
 

@@ -42,11 +42,13 @@ struct llama_file {
 
 struct llama_mmap {
     llama_mmap(const llama_mmap &) = delete;
-    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false);
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false, bool hugetlb = false);
     ~llama_mmap();
 
     size_t size() const;
+    size_t mmap_size() const; // physical mapping length (>= size() when hugetlb-rounded)
     void * addr() const;
+    bool   is_hugetlb() const;
 
     void unmap_fragment(size_t first, size_t last);