Add support for unified memory allocations #2116

Merged (4 commits) on Feb 28, 2024
CHANGELOG.md (4 additions, 0 deletions)
@@ -12,6 +12,10 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
* The prelude definition of `filter` is now more memory efficient,
particularly when the output is much smaller than the input. (#2109)

* New configuration for GPU backends:
`futhark_context_config_set_unified_memory`, also available on
executables as ``--unified-memory``.

### Removed

### Changed
docs/c-api.rst (13 additions, 0 deletions)
@@ -514,6 +514,19 @@ The following API functions are available when using the ``opencl``,
with :c:func:`futhark_context_config_set_platform`, only the
devices from matching platforms are considered.

.. c:function:: void futhark_context_config_set_unified_memory(struct futhark_context_config* cfg, int flag);

Use "unified" memory for GPU arrays. This means arrays are located
in memory that is also accessible from the CPU. The details depends
on the backend and hardware in use. The following values are
supported:

* 0: never use managed memory.

* 1: always use managed memory.

* 2: use managed memory if the device claims to support it (the
default).

Exotic
~~~~~~
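As a quick orientation for the documentation above, here is a minimal sketch of how an application linking against a Futhark-generated library might enable unified memory before creating the context. The header name `prog.h` is hypothetical (it depends on the program name given to `futhark cuda --library`); the `futhark_context_*` calls are the standard Futhark C API plus the setter added in this PR.

```c
// Minimal sketch (not part of this PR): force unified memory on via the new
// configuration call before the context is created.
#include <stdio.h>
#include <stdlib.h>
#include "prog.h"  // hypothetical header generated by `futhark cuda --library prog.fut`

int main(void) {
  struct futhark_context_config *cfg = futhark_context_config_new();

  // 0 = never, 1 = always, 2 = use it if the device claims support (default).
  futhark_context_config_set_unified_memory(cfg, 1);

  struct futhark_context *ctx = futhark_context_new(cfg);
  char *err = futhark_context_get_error(ctx);
  if (err != NULL) {
    fprintf(stderr, "context setup failed: %s", err);
    free(err);
    return EXIT_FAILURE;
  }

  // ... call entry points here; GPU allocations now go through the
  // managed/unified path ...

  futhark_context_free(ctx);
  futhark_context_config_free(cfg);
  return EXIT_SUCCESS;
}
```

Compiled executables expose the same knob as the `--unified-memory` option (taking the same 0/1/2 argument), as wired up in the GPU.hs change further down.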
rts/c/backends/cuda.h (28 additions, 1 deletion)
@@ -94,6 +94,8 @@ struct futhark_context_config {
char* preferred_device;
int preferred_device_num;

int unified_memory;

char* dump_ptx_to;
char* load_ptx_from;

@@ -121,6 +123,8 @@ static void backend_context_config_setup(struct futhark_context_config *cfg) {
cfg->dump_ptx_to = NULL;
cfg->load_ptx_from = NULL;

cfg->unified_memory = 2;

cfg->default_block_size = 256;
cfg->default_grid_size = 0; // Set properly later.
cfg->default_tile_size = 32;
@@ -186,6 +190,10 @@ void futhark_context_config_load_ptx_from(struct futhark_context_config *cfg, co
cfg->load_ptx_from = strdup(path);
}

void futhark_context_config_set_unified_memory(struct futhark_context_config* cfg, int flag) {
cfg->unified_memory = flag;
}

void futhark_context_config_set_default_thread_block_size(struct futhark_context_config *cfg, int size) {
cfg->default_block_size = size;
cfg->default_block_size_changed = 1;
@@ -830,6 +838,18 @@ int backend_context_setup(struct futhark_context* ctx) {

free_list_init(&ctx->gpu_free_list);

if (ctx->cfg->unified_memory == 2) {
ctx->cfg->unified_memory = device_query(ctx->dev, MANAGED_MEMORY);
}

if (ctx->cfg->logging) {
if (ctx->cfg->unified_memory) {
fprintf(ctx->log, "Using managed memory\n");
} else {
fprintf(ctx->log, "Using unmanaged memory\n");
}
}

// MAX_SHARED_MEMORY_PER_BLOCK gives bogus numbers (48KiB); probably
// for backwards compatibility. Add _OPTIN and you seem to get the
// right number.
@@ -1082,10 +1102,17 @@ static int gpu_launch_kernel(struct futhark_context* ctx,
}

static int gpu_alloc_actual(struct futhark_context *ctx, size_t size, gpu_mem *mem_out) {
-  CUresult res = cuMemAlloc(mem_out, size);
+  CUresult res;
+  if (ctx->cfg->unified_memory) {
+    res = cuMemAllocManaged(mem_out, size, CU_MEM_ATTACH_GLOBAL);
+  } else {
+    res = cuMemAlloc(mem_out, size);
+  }

if (res == CUDA_ERROR_OUT_OF_MEMORY) {
return FUTHARK_OUT_OF_MEMORY;
}

CUDA_SUCCEED_OR_RETURN(res);
return FUTHARK_SUCCESS;
}
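For readers unfamiliar with the CUDA driver API, the practical difference between the two branches in `gpu_alloc_actual` is that a `cuMemAllocManaged` allocation is directly dereferenceable from the host. The standalone sketch below (illustration only, not code from this PR) shows that property; error handling is reduced to bare return codes.

```c
// Illustration only: a cuMemAllocManaged allocation, as used by the unified
// path above, can be touched from the host without an explicit copy.
#include <cuda.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  CUdevice dev;
  CUcontext cuctx;
  CUdeviceptr buf;
  const size_t n = 1024;

  if (cuInit(0) != CUDA_SUCCESS ||
      cuDeviceGet(&dev, 0) != CUDA_SUCCESS ||
      cuCtxCreate(&cuctx, 0, dev) != CUDA_SUCCESS)
    return 1;

  // CU_MEM_ATTACH_GLOBAL: accessible from any stream on any device; this is
  // also the flag the backend passes above.
  if (cuMemAllocManaged(&buf, n * sizeof(float), CU_MEM_ATTACH_GLOBAL) != CUDA_SUCCESS)
    return 1;

  // With cuMemAlloc this cast would be invalid; with managed memory the
  // device pointer doubles as a host pointer.
  float *host_view = (float *)(uintptr_t)buf;
  for (size_t i = 0; i < n; i++)
    host_view[i] = (float)i;
  printf("buf[10] = %.1f\n", host_view[10]);

  cuMemFree(buf);
  cuCtxDestroy(cuctx);
  return 0;
}
```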
rts/c/backends/hip.h (28 additions, 1 deletion)
@@ -89,6 +89,8 @@ struct futhark_context_config {
int num_build_opts;
char* *build_opts;

int unified_memory;

char* preferred_device;
int preferred_device_num;

@@ -111,6 +113,8 @@ static void backend_context_config_setup(struct futhark_context_config *cfg) {
cfg->preferred_device = strdup("");
cfg->program = strconcat(gpu_program);

cfg->unified_memory = 2;

cfg->default_block_size = 256;
cfg->default_grid_size = 0; // Set properly later.
cfg->default_tile_size = 32;
@@ -166,6 +170,10 @@ void futhark_context_config_set_program(struct futhark_context_config *cfg, cons
cfg->program = strdup(s);
}

void futhark_context_config_set_unified_memory(struct futhark_context_config* cfg, int flag) {
cfg->unified_memory = flag;
}

void futhark_context_config_set_default_thread_block_size(struct futhark_context_config *cfg, int size) {
cfg->default_block_size = size;
cfg->default_block_size_changed = 1;
@@ -686,6 +694,18 @@ int backend_context_setup(struct futhark_context* ctx) {

free_list_init(&ctx->gpu_free_list);

if (ctx->cfg->unified_memory == 2) {
ctx->cfg->unified_memory = device_query(ctx->dev, hipDeviceAttributeManagedMemory);
}

if (ctx->cfg->logging) {
if (ctx->cfg->unified_memory) {
fprintf(ctx->log, "Using managed memory\n");
} else {
fprintf(ctx->log, "Using unmanaged memory\n");
}
}

ctx->max_shared_memory = device_query(ctx->dev, hipDeviceAttributeMaxSharedMemoryPerBlock);
ctx->max_thread_block_size = device_query(ctx->dev, hipDeviceAttributeMaxThreadsPerBlock);
ctx->max_grid_size = device_query(ctx->dev, hipDeviceAttributeMaxGridDimX);
@@ -938,7 +958,14 @@ static int gpu_launch_kernel(struct futhark_context* ctx,
}

static int gpu_alloc_actual(struct futhark_context *ctx, size_t size, gpu_mem *mem_out) {
-  hipError_t res = hipMalloc(mem_out, size);
+  hipError_t res;
+
+  if (ctx->cfg->unified_memory) {
+    res = hipMallocManaged(mem_out, size, hipMemAttachGlobal);
+  } else {
+    res = hipMalloc(mem_out, size);
+  }

if (res == hipErrorOutOfMemory) {
return FUTHARK_OUT_OF_MEMORY;
}
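The `unified_memory == 2` default resolves itself at context setup by asking the device whether it advertises managed-memory support (the `device_query` call above). A minimal standalone sketch of the same check against the HIP runtime API, compiled with `hipcc` (again an illustration, not PR code):

```c
// Illustration of the capability probe behind the default setting (2):
// only use hipMallocManaged if the device reports managed-memory support.
#include <hip/hip_runtime.h>
#include <stdio.h>

int main(void) {
  int supports_managed = 0;
  if (hipDeviceGetAttribute(&supports_managed,
                            hipDeviceAttributeManagedMemory, 0) != hipSuccess) {
    fprintf(stderr, "could not query device 0\n");
    return 1;
  }
  printf("device 0 %s managed memory\n",
         supports_managed ? "supports" : "does not support");

  if (supports_managed) {
    void *buf = NULL;
    // Same call and flag the backend uses on the unified path.
    if (hipMallocManaged(&buf, 1024, hipMemAttachGlobal) == hipSuccess) {
      ((char *)buf)[0] = 42;  // host-side access is legal for managed memory
      hipFree(buf);
    }
  }
  return 0;
}
```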
rts/c/backends/opencl.h (8 additions, 0 deletions)
@@ -136,6 +136,8 @@ struct futhark_context_config {
char* preferred_device;
int ignore_blacklist;

int unified_memory;

char* dump_binary_to;
char* load_binary_from;

@@ -166,6 +168,8 @@ static void backend_context_config_setup(struct futhark_context_config* cfg) {
cfg->load_binary_from = NULL;
cfg->program = strconcat(gpu_program);

cfg->unified_memory = 2;

// The following are dummy sizes that mean the concrete defaults
// will be set during initialisation via hardware-inspection-based
// heuristics.
@@ -432,6 +436,10 @@ void futhark_context_config_load_binary_from(struct futhark_context_config *cfg,
cfg->load_binary_from = strdup(path);
}

void futhark_context_config_set_unified_memory(struct futhark_context_config* cfg, int flag) {
cfg->unified_memory = flag;
}

void futhark_context_config_set_default_thread_block_size(struct futhark_context_config *cfg, int size) {
cfg->default_group_size = size;
cfg->default_group_size_changed = 1;
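Note that the OpenCL hunks in this diff only add the configuration plumbing (the `unified_memory` field, its default of 2, and the setter); the allocation-side handling is not visible in this excerpt, so no claim is made here about how the OpenCL backend implements it. Purely as background on what "unified" allocation usually means in OpenCL terms, shared virtual memory (OpenCL 2.0+) provides a comparable host/device-addressable buffer:

```c
// Generic OpenCL >= 2.0 shared-virtual-memory sketch; background only,
// not taken from the Futhark OpenCL backend.
#include <CL/cl.h>
#include <stdio.h>

int main(void) {
  cl_platform_id platform;
  cl_device_id device;
  cl_int err;

  if (clGetPlatformIDs(1, &platform, NULL) != CL_SUCCESS ||
      clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL) != CL_SUCCESS)
    return 1;

  cl_context ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
  if (err != CL_SUCCESS)
    return 1;

  // A coarse-grained SVM buffer is addressable by both host and device,
  // conceptually similar to CUDA/HIP managed memory.  Host access should
  // strictly be bracketed by clEnqueueSVMMap/clEnqueueSVMUnmap.
  float *buf = clSVMAlloc(ctx, CL_MEM_READ_WRITE, 1024 * sizeof(float), 0);
  if (buf != NULL) {
    printf("allocated a 4 KiB SVM buffer\n");
    clSVMFree(ctx, buf);
  }

  clReleaseContext(ctx);
  return 0;
}
```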
src/Futhark/CodeGen/Backends/GPU.hs (8 additions, 0 deletions)
@@ -370,6 +370,13 @@ gpuOptions =
optionArgument = RequiredArgument "INT",
optionDescription = "The default parallelism threshold.",
optionAction = [C.cstm|futhark_context_config_set_default_threshold(cfg, atoi(optarg));|]
},
Option
{ optionLongName = "unified-memory",
optionShortName = Nothing,
optionArgument = RequiredArgument "INT",
optionDescription = "Whether to use unified memory",
optionAction = [C.cstm|futhark_context_config_set_unified_memory(cfg, atoi(optarg));|]
}
]

@@ -462,3 +469,4 @@ generateGPUBoilerplate gpu_program macros backendH kernels types failures = do
GC.headerDecl GC.InitDecl [C.cedecl|void futhark_context_config_set_default_tile_size(struct futhark_context_config *cfg, int size);|]
GC.headerDecl GC.InitDecl [C.cedecl|void futhark_context_config_set_default_reg_tile_size(struct futhark_context_config *cfg, int size);|]
GC.headerDecl GC.InitDecl [C.cedecl|void futhark_context_config_set_default_threshold(struct futhark_context_config *cfg, int size);|]
GC.headerDecl GC.InitDecl [C.cedecl|void futhark_context_config_set_unified_memory(struct futhark_context_config* cfg, int flag);|]