Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -574,6 +574,21 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
}

// --hugepages compatibility checks. PR #1 covers weights on the --mmap path only;
// --no-mmap and --direct-io both bypass the hugetlb branch, so report the
// conflict to the user instead of silently ignoring the request. Linux-only.
if (params.use_hugepages) {
#ifndef __linux__
throw std::invalid_argument("error: --hugepages is Linux only.\n");
#endif
if (!params.use_mmap) {
throw std::invalid_argument("error: --hugepages requires --mmap. Coverage for --no-mmap is added in a follow-up PR. Drop --no-mmap to use --hugepages.\n");
}
if (params.use_direct_io) {
throw std::invalid_argument("error: --hugepages and --direct-io are incompatible. Drop one.\n");
}
}

// handle model and download
if (!skip_model_download) {
auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
Expand Down Expand Up @@ -2225,6 +2240,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.use_mmap = value;
}
).set_env("LLAMA_ARG_MMAP"));
// Registers the argument-less --hugepages flag and ties it to the
// LLAMA_ARG_HUGEPAGES name through set_env. The handler only records the
// request by setting params.use_hugepages; it performs no validation itself.
add_opt(common_arg(
{"--hugepages"},
"back model weights with anonymous hugetlb 2 MiB pages (Linux only).\n"
"reserve the pool first with e.g. `sysctl -w vm.nr_hugepages=N` — no reboot required.\n"
"pinned in RAM, bypasses swap and page cache; primary win is vmemmap reclamation via HVO",
[](common_params & params) {
params.use_hugepages = true;
}
).set_env("LLAMA_ARG_HUGEPAGES"));
add_opt(common_arg(
{"-dio", "--direct-io"},
{"-ndio", "--no-direct-io"},
Expand Down
1 change: 1 addition & 0 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1421,6 +1421,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
mparams.tensor_split = params.tensor_split;
mparams.use_mmap = params.use_mmap;
mparams.use_direct_io = params.use_direct_io;
mparams.use_hugepages = params.use_hugepages;
mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors;
mparams.use_extra_bufts = !params.no_extra_bufts;
Expand Down
1 change: 1 addition & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -534,6 +534,7 @@ struct common_params {
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool use_mmap = true; // enable mmap to use filesystem cache
bool use_direct_io = false; // read from disk without buffering
bool use_hugepages = false; // back weight mappings with anonymous hugetlb pages (Linux only)
bool use_mlock = false; // use mlock to keep model in memory
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
Expand Down
84 changes: 81 additions & 3 deletions ggml/src/ggml-cuda/ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -606,6 +606,8 @@ ggml_backend_cuda_context::~ggml_backend_cuda_context() {
struct ggml_backend_cuda_buffer_context {
int device;
void * dev_ptr = nullptr;
void * host_ptr = nullptr;
bool owned = true;
std::string name;

ggml_backend_cuda_buffer_context(int device, void * dev_ptr) :
Expand All @@ -614,7 +616,12 @@ struct ggml_backend_cuda_buffer_context {
}

~ggml_backend_cuda_buffer_context() {
CUDA_CHECK(cudaFree(dev_ptr));
if (owned) {
CUDA_CHECK(cudaFree(dev_ptr));
} else {
// host_ptr was registered via cudaHostRegister; dev_ptr aliases it and is not ours to free.
CUDA_CHECK(cudaHostUnregister(host_ptr));
}
}
};

Expand Down Expand Up @@ -645,10 +652,16 @@ static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer
const size_t original_size = ggml_nbytes(tensor);
const size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);

if (padded_size > original_size) {
if (padded_size > original_size && ctx->owned) {
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
}
// For externally-owned buffers (buffer_from_host_ptr), the memset is
// skipped: hipMemset through a hipHostGetDevicePointer-derived address
// is unsupported on ROCm integrated GPUs, and the padding region may
// extend past the registered host range for the final tensor. GGUF
// files zero-pad between tensors by convention, so the padding bytes
// in the mmap'd region are already zero.
}
return GGML_STATUS_SUCCESS;
}
Expand Down Expand Up @@ -4653,10 +4666,24 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
bool events = true;
#endif

// buffer_from_host_ptr is currently enabled only on HIP integrated GPUs
// (validated on Strix Halo / ROCm 7.2.0). NVIDIA Jetson reports
// prop.integrated == 1 too and may benefit from the same path, but it
// has not been tested on that platform; #15034's cuda_host-buffer
// corruption is in a different code path (see ggml-cuda.cu:243) and
// is not expected to apply here, but validation is required before
// extending beyond HIP.
bool buffer_from_host_ptr = false;
#if defined(GGML_USE_HIP)
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, ctx->device));
buffer_from_host_ptr = prop.integrated > 0;
#endif

props->caps = {
/* .async = */ true,
/* .host_buffer = */ host_buffer,
/* .buffer_from_host_ptr = */ false,
/* .buffer_from_host_ptr = */ buffer_from_host_ptr,
/* .events = */ events,
};
}
Expand All @@ -4677,6 +4704,53 @@ static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_host_buffer_type(
return ggml_backend_cuda_host_buffer_type();
}

#if defined(GGML_USE_HIP)
// HIP-only for now; see comment at the capability flag in get_props().
// TODO: extend to CUDA / Jetson after validating that #15034's corruption
// mode does not apply to this code path.
// Wraps caller-provided host memory in a backend buffer without copying it.
//
// The range [ptr, ptr + size) is registered with cudaHostRegister and the
// device-side address reported by cudaHostGetDevicePointer becomes the
// buffer context's dev_ptr. The context is marked as not owned, so its
// destructor calls cudaHostUnregister(host_ptr) instead of cudaFree(dev_ptr).
//
//   dev             - device whose context supplies the device index
//   ptr, size       - host range to register
//   max_tensor_size - unused
//
// Returns nullptr when the device does not report itself as integrated, or
// when either runtime call fails (the failure is logged first); otherwise
// returns the newly created buffer.
static ggml_backend_buffer_t ggml_backend_cuda_device_buffer_from_host_ptr(
        ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
    GGML_UNUSED(max_tensor_size);

    auto * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
    ggml_cuda_set_device(dev_ctx->device);

    // Re-check the integrated property here even though the capability flag
    // in get_props() is derived from the same field.
    cudaDeviceProp device_props;
    CUDA_CHECK(cudaGetDeviceProperties(&device_props, dev_ctx->device));
    const bool is_integrated = device_props.integrated > 0;
    if (!is_integrated) {
        return nullptr;
    }

    // cudaHostRegisterReadOnly is deliberately not part of the flags.
    // NOTE(review): ggml_backend_cuda_buffer_init_tensor skips its padding
    // memset for non-owned buffers, so confirm which remaining writer still
    // needs the registration to be writable.
    const cudaError_t register_status =
        cudaHostRegister(ptr, size, cudaHostRegisterPortable | cudaHostRegisterMapped);
    if (register_status != cudaSuccess) {
        // Consume the runtime's last-error state before reporting the failure.
        (void)cudaGetLastError();
        GGML_LOG_ERROR("%s: cudaHostRegister failed: %s\n", __func__, cudaGetErrorString(register_status));
        return nullptr;
    }

    void * mapped_ptr = nullptr;
    const cudaError_t lookup_status = cudaHostGetDevicePointer(&mapped_ptr, ptr, 0);
    if (lookup_status != cudaSuccess) {
        (void)cudaGetLastError();
        // Undo the registration so the host range is not left registered.
        cudaHostUnregister(ptr);
        GGML_LOG_ERROR("%s: cudaHostGetDevicePointer failed: %s\n", __func__, cudaGetErrorString(lookup_status));
        return nullptr;
    }

    // Non-owning context: dev_ptr aliases the registered host range.
    ggml_backend_cuda_buffer_context * host_backed_ctx =
        new ggml_backend_cuda_buffer_context(dev_ctx->device, mapped_ptr);
    host_backed_ctx->host_ptr = ptr;
    host_backed_ctx->owned    = false;

    ggml_backend_buffer_type_t buft = ggml_backend_cuda_device_get_buffer_type(dev);
    return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, host_backed_ctx, size);
}
#endif // GGML_USE_HIP

// TODO: move these functions here
static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
Expand Down Expand Up @@ -5126,7 +5200,11 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
/* .init_backend = */ ggml_backend_cuda_device_init_backend,
/* .get_buffer_type = */ ggml_backend_cuda_device_get_buffer_type,
/* .get_host_buffer_type = */ ggml_backend_cuda_device_get_host_buffer_type,
#if defined(GGML_USE_HIP)
/* .buffer_from_host_ptr = */ ggml_backend_cuda_device_buffer_from_host_ptr,
#else
/* .buffer_from_host_ptr = */ NULL,
#endif
/* .supports_op = */ ggml_backend_cuda_device_supports_op,
/* .supports_buft = */ ggml_backend_cuda_device_supports_buft,
/* .offload_op = */ ggml_backend_cuda_device_offload_op,
Expand Down
2 changes: 2 additions & 0 deletions ggml/src/ggml-cuda/vendors/hip.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,9 @@
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaHostGetDevicePointer hipHostGetDevicePointer
#define cudaHostRegister hipHostRegister
#define cudaHostRegisterMapped hipHostRegisterMapped
#define cudaHostRegisterPortable hipHostRegisterPortable
#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
#define cudaHostUnregister hipHostUnregister
Expand Down
1 change: 1 addition & 0 deletions include/llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,7 @@ extern "C" {
bool vocab_only; // only load the vocabulary, no weights
bool use_mmap; // use mmap if possible
bool use_direct_io; // use direct io, takes precedence over use_mmap when supported
bool use_hugepages; // back model memory with anonymous hugetlb pages (Linux only)
bool use_mlock; // force system to keep model in RAM
bool check_tensors; // validate model tensor data
bool use_extra_bufts; // use extra buffer types (used for weight repacking)
Expand Down
67 changes: 60 additions & 7 deletions src/llama-mmap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,20 @@
#include <TargetConditionals.h>
#endif

#ifdef __linux__
// Fallbacks for C library headers that predate the hugetlb mmap flags; the
// kernel itself has supported them for years.
//
// MAP_HUGE_2MB is expressed in terms of MAP_HUGE_SHIFT, and headers old enough
// to lack the former lack the latter as well, so the shift needs its own
// fallback or the MAP_HUGE_2MB definition below would not compile when used.
//
// NOTE(review): 0x40000 is the generic Linux value of MAP_HUGETLB; a few
// architectures use a different value. Confirm before relying on this
// fallback on such targets.
#ifndef MAP_HUGETLB
#define MAP_HUGETLB 0x40000
#endif
#ifndef MAP_HUGE_SHIFT
#define MAP_HUGE_SHIFT 26
#endif
#ifndef MAP_HUGE_2MB
// log2(2 MiB) == 21, encoded in the page-size bits of the mmap flags.
#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT)
#endif
#endif

// 2 MiB hugepage size, used for both the hugetlb mmap length and the
// unmap_fragment alignment granularity when the mapping is hugetlb-backed.
static constexpr size_t LLAMA_HUGE_PAGE_SIZE = 2ull * 1024 * 1024;

// The hugetlb length is rounded up with a bit mask, which is only correct for
// a power-of-two page size.
static_assert((LLAMA_HUGE_PAGE_SIZE & (LLAMA_HUGE_PAGE_SIZE - 1)) == 0,
              "LLAMA_HUGE_PAGE_SIZE must be a power of two");

// TODO: consider moving to llama-impl.h if needed in more places
#if defined(_WIN32)
static std::string llama_format_win_err(DWORD err) {
Expand Down Expand Up @@ -434,9 +448,38 @@ struct llama_mmap::impl {
#ifdef _POSIX_MAPPED_FILES
std::vector<std::pair<size_t, size_t>> mapped_fragments;

impl(struct llama_file * file, size_t prefetch, bool numa) {
impl(struct llama_file * file, size_t prefetch, bool numa, bool hugetlb) {
size = file->size();
int fd = file->file_id();
#ifdef __linux__
if (hugetlb) {
// Anonymous hugetlb mapping rounded up to 2 MiB. PROT_WRITE lets
// load_all_data pread the file in (downgraded to PROT_READ after);
// MAP_POPULATE surfaces pool-exhaustion here, not mid-load.
mmap_size = (size + LLAMA_HUGE_PAGE_SIZE - 1) & ~(LLAMA_HUGE_PAGE_SIZE - 1);
addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_2MB | MAP_POPULATE, -1, 0);
if (addr == MAP_FAILED) {
int saved = errno;
if (saved == ENOMEM) {
size_t need = mmap_size / LLAMA_HUGE_PAGE_SIZE;
long have = -1;
if (FILE * f = std::fopen("/sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages", "r")) {
if (std::fscanf(f, "%ld", &have) != 1) { have = -1; }
std::fclose(f);
}
throw std::runtime_error(format("hugetlb mmap failed: need %zu free 2 MiB pages, pool has %ld. "
"Try: sudo sysctl -w vm.nr_hugepages=%zu", need, have, need));
}
throw std::runtime_error(format("hugetlb mmap failed: %s", strerror(saved)));
}
is_hugetlb_ = true;
mapped_fragments.emplace_back(0, mmap_size);
return;
}
#else
(void) hugetlb;
#endif
int flags = MAP_SHARED;
if (numa) { prefetch = 0; }
#ifdef __linux__
Expand Down Expand Up @@ -480,7 +523,9 @@ struct llama_mmap::impl {
}

void unmap_fragment(size_t first, size_t last) {
int page_size = sysconf(_SC_PAGESIZE);
// Hugetlb munmaps must be 2 MiB-aligned; the file-backed path uses
// the kernel base page size as before.
size_t page_size = is_hugetlb_ ? LLAMA_HUGE_PAGE_SIZE : (size_t) sysconf(_SC_PAGESIZE);
align_range(&first, &last, page_size);
size_t len = last - first;

Expand Down Expand Up @@ -525,8 +570,9 @@ struct llama_mmap::impl {
#elif defined(_WIN32)
HANDLE hMapping = nullptr;

impl(struct llama_file * file, size_t prefetch, bool numa) {
impl(struct llama_file * file, size_t prefetch, bool numa, bool hugetlb) {
GGML_UNUSED(numa);
GGML_UNUSED(hugetlb);

size = file->size();

Expand Down Expand Up @@ -589,10 +635,11 @@ struct llama_mmap::impl {
}
}
#else
impl(struct llama_file * file, size_t prefetch, bool numa) {
// Fallback constructor for builds where neither the POSIX nor the Windows
// mapping branch is compiled in: every argument is ignored and construction
// always fails by throwing std::runtime_error.
impl(struct llama_file * file, size_t prefetch, bool numa, bool hugetlb) {
    // Mark every parameter as intentionally unused.
    GGML_UNUSED(hugetlb);
    GGML_UNUSED(numa);
    GGML_UNUSED(prefetch);
    GGML_UNUSED(file);

    throw std::runtime_error("mmap not supported");
}
Expand All @@ -605,15 +652,21 @@ struct llama_mmap::impl {
}
#endif

void * addr;
size_t size;
void * addr = nullptr;
size_t size = 0;
// Hugetlb: physical mapping length (file size rounded up to 2 MiB) used
// by munmap; `size` continues to report the underlying file length.
size_t mmap_size = 0;
bool is_hugetlb_ = false;
};

llama_mmap::llama_mmap(struct llama_file * file, size_t prefetch, bool numa) : pimpl(std::make_unique<impl>(file, prefetch, numa)) {}
// Public llama_mmap members: each one forwards to the pimpl object.

// Builds the implementation object, passing all four arguments through unchanged.
llama_mmap::llama_mmap(struct llama_file * file, size_t prefetch, bool numa, bool hugetlb)
    : pimpl(std::make_unique<impl>(file, prefetch, numa, hugetlb)) {
}

llama_mmap::~llama_mmap() = default;

// Value of impl::size (assigned from file->size() in the constructors shown).
size_t llama_mmap::size() const {
    return pimpl->size;
}

// impl::mmap_size when it is non-zero, otherwise the same value as size().
size_t llama_mmap::mmap_size() const {
    if (pimpl->mmap_size != 0) {
        return pimpl->mmap_size;
    }
    return pimpl->size;
}

// Address stored in impl::addr.
void * llama_mmap::addr() const {
    return pimpl->addr;
}

// Value of impl::is_hugetlb_.
bool llama_mmap::is_hugetlb() const {
    return pimpl->is_hugetlb_;
}

// Forwards the [first, last) request to impl::unmap_fragment.
void llama_mmap::unmap_fragment(size_t first, size_t last) {
    pimpl->unmap_fragment(first, last);
}

Expand Down
4 changes: 3 additions & 1 deletion src/llama-mmap.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,13 @@ struct llama_file {

struct llama_mmap {
llama_mmap(const llama_mmap &) = delete;
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false);
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false, bool hugetlb = false);
~llama_mmap();

size_t size() const;
size_t mmap_size() const; // physical mapping length (>= size() when hugetlb-rounded)
void * addr() const;
bool is_hugetlb() const;

void unmap_fragment(size_t first, size_t last);

Expand Down
Loading