Add support for unified memory allocations #2116

Merged (4 commits) on Feb 28, 2024
CHANGELOG.md (4 additions, 0 deletions)
@@ -12,6 +12,10 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
* The prelude definition of `filter` is now more memory efficient,
particularly when the output is much smaller than the input. (#2109)

* New configuration for GPU backends:
`futhark_context_config_set_unified_memory`, also available on
executables as ``--unified-memory``.

### Removed

### Changed
docs/c-api.rst (13 additions, 0 deletions)
@@ -514,6 +514,19 @@ The following API functions are available when using the ``opencl``,
with :c:func:`futhark_context_config_set_platform`, only the
devices from matching platforms are considered.

.. c:function:: void futhark_context_config_set_unified_memory(struct futhark_context_config* cfg, int flag);

Use "unified" memory for GPU arrays. This means arrays are located
in memory that is also accessible from the CPU. The details depends
on the backend and hardware in use. The following values are
supported:

* 0: never use managed memory.

* 1: always use managed memory.

* 2: use managed memory if the device claims to support it (the
default).

Exotic
~~~~~~
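As a quick orientation for the documentation above, here is a minimal sketch of how an application linking against a Futhark-generated library might enable unified memory before creating the context. The header name `prog.h` is hypothetical (it depends on the program name given to `futhark cuda --library`); the `futhark_context_*` calls are the standard Futhark C API plus the setter added in this PR.

```c
// Minimal sketch (not part of this PR): force unified memory on via the new
// configuration call before the context is created.
#include <stdio.h>
#include <stdlib.h>
#include "prog.h"  // hypothetical header generated by `futhark cuda --library prog.fut`

int main(void) {
  struct futhark_context_config *cfg = futhark_context_config_new();

  // 0 = never, 1 = always, 2 = use it if the device claims support (default).
  futhark_context_config_set_unified_memory(cfg, 1);

  struct futhark_context *ctx = futhark_context_new(cfg);
  char *err = futhark_context_get_error(ctx);
  if (err != NULL) {
    fprintf(stderr, "context setup failed: %s", err);
    free(err);
    return EXIT_FAILURE;
  }

  // ... call entry points here; GPU allocations now go through the
  // managed/unified path ...

  futhark_context_free(ctx);
  futhark_context_config_free(cfg);
  return EXIT_SUCCESS;
}
```

Compiled executables expose the same knob as the `--unified-memory` option (taking the same 0/1/2 argument), as wired up in the GPU.hs change further down.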
rts/c/backends/cuda.h (28 additions, 1 deletion)
@@ -94,6 +94,8 @@ struct futhark_context_config {
char* preferred_device;
int preferred_device_num;

int unified_memory;

char* dump_ptx_to;
char* load_ptx_from;

@@ -121,6 +123,8 @@ static void backend_context_config_setup(struct futhark_context_config *cfg) {
cfg->dump_ptx_to = NULL;
cfg->load_ptx_from = NULL;

cfg->unified_memory = 2;

cfg->default_block_size = 256;
cfg->default_grid_size = 0; // Set properly later.
cfg->default_tile_size = 32;
@@ -186,6 +190,10 @@ void futhark_context_config_load_ptx_from(struct futhark_context_config *cfg, co
cfg->load_ptx_from = strdup(path);
}

void futhark_context_config_set_unified_memory(struct futhark_context_config* cfg, int flag) {
cfg->unified_memory = flag;
}

void futhark_context_config_set_default_thread_block_size(struct futhark_context_config *cfg, int size) {
cfg->default_block_size = size;
cfg->default_block_size_changed = 1;
@@ -830,6 +838,18 @@ int backend_context_setup(struct futhark_context* ctx) {

free_list_init(&ctx->gpu_free_list);

if (ctx->cfg->unified_memory == 2) {
ctx->cfg->unified_memory = device_query(ctx->dev, MANAGED_MEMORY);
}

if (ctx->cfg->logging) {
if (ctx->cfg->unified_memory) {
fprintf(ctx->log, "Using managed memory\n");
} else {
fprintf(ctx->log, "Using unmanaged memory\n");
}
}

// MAX_SHARED_MEMORY_PER_BLOCK gives bogus numbers (48KiB); probably
// for backwards compatibility. Add _OPTIN and you seem to get the
// right number.
@@ -1082,10 +1102,17 @@ static int gpu_launch_kernel(struct futhark_context* ctx,
}

static int gpu_alloc_actual(struct futhark_context *ctx, size_t size, gpu_mem *mem_out) {
-  CUresult res = cuMemAlloc(mem_out, size);
+  CUresult res;
+  if (ctx->cfg->unified_memory) {
+    res = cuMemAllocManaged(mem_out, size, CU_MEM_ATTACH_GLOBAL);
+  } else {
+    res = cuMemAlloc(mem_out, size);
+  }

if (res == CUDA_ERROR_OUT_OF_MEMORY) {
return FUTHARK_OUT_OF_MEMORY;
}

CUDA_SUCCEED_OR_RETURN(res);
return FUTHARK_SUCCESS;
}
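For readers unfamiliar with the CUDA driver API, the practical difference between the two branches in `gpu_alloc_actual` is that a `cuMemAllocManaged` allocation is directly dereferenceable from the host. The standalone sketch below (illustration only, not code from this PR) shows that property; error handling is reduced to bare return codes.

```c
// Illustration only: a cuMemAllocManaged allocation, as used by the unified
// path above, can be touched from the host without an explicit copy.
#include <cuda.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  CUdevice dev;
  CUcontext cuctx;
  CUdeviceptr buf;
  const size_t n = 1024;

  if (cuInit(0) != CUDA_SUCCESS ||
      cuDeviceGet(&dev, 0) != CUDA_SUCCESS ||
      cuCtxCreate(&cuctx, 0, dev) != CUDA_SUCCESS)
    return 1;

  // CU_MEM_ATTACH_GLOBAL: accessible from any stream on any device; this is
  // also the flag the backend passes above.
  if (cuMemAllocManaged(&buf, n * sizeof(float), CU_MEM_ATTACH_GLOBAL) != CUDA_SUCCESS)
    return 1;

  // With cuMemAlloc this cast would be invalid; with managed memory the
  // device pointer doubles as a host pointer.
  float *host_view = (float *)(uintptr_t)buf;
  for (size_t i = 0; i < n; i++)
    host_view[i] = (float)i;
  printf("buf[10] = %.1f\n", host_view[10]);

  cuMemFree(buf);
  cuCtxDestroy(cuctx);
  return 0;
}
```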
rts/c/backends/hip.h (28 additions, 1 deletion)
@@ -89,6 +89,8 @@ struct futhark_context_config {
int num_build_opts;
char* *build_opts;

int unified_memory;

char* preferred_device;
int preferred_device_num;

@@ -111,6 +113,8 @@ static void backend_context_config_setup(struct futhark_context_config *cfg) {
cfg->preferred_device = strdup("");
cfg->program = strconcat(gpu_program);

cfg->unified_memory = 2;

cfg->default_block_size = 256;
cfg->default_grid_size = 0; // Set properly later.
cfg->default_tile_size = 32;
@@ -166,6 +170,10 @@ void futhark_context_config_set_program(struct futhark_context_config *cfg, cons
cfg->program = strdup(s);
}

void futhark_context_config_set_unified_memory(struct futhark_context_config* cfg, int flag) {
cfg->unified_memory = flag;
}

void futhark_context_config_set_default_thread_block_size(struct futhark_context_config *cfg, int size) {
cfg->default_block_size = size;
cfg->default_block_size_changed = 1;
@@ -686,6 +694,18 @@ int backend_context_setup(struct futhark_context* ctx) {

free_list_init(&ctx->gpu_free_list);

if (ctx->cfg->unified_memory == 2) {
ctx->cfg->unified_memory = device_query(ctx->dev, hipDeviceAttributeManagedMemory);
}

if (ctx->cfg->logging) {
if (ctx->cfg->unified_memory) {
fprintf(ctx->log, "Using managed memory\n");
} else {
fprintf(ctx->log, "Using unmanaged memory\n");
}
}

ctx->max_shared_memory = device_query(ctx->dev, hipDeviceAttributeMaxSharedMemoryPerBlock);
ctx->max_thread_block_size = device_query(ctx->dev, hipDeviceAttributeMaxThreadsPerBlock);
ctx->max_grid_size = device_query(ctx->dev, hipDeviceAttributeMaxGridDimX);
@@ -938,7 +958,14 @@ static int gpu_launch_kernel(struct futhark_context* ctx,
}

static int gpu_alloc_actual(struct futhark_context *ctx, size_t size, gpu_mem *mem_out) {
-  hipError_t res = hipMalloc(mem_out, size);
+  hipError_t res;
+
+  if (ctx->cfg->unified_memory) {
+    res = hipMallocManaged(mem_out, size, hipMemAttachGlobal);
+  } else {
+    res = hipMalloc(mem_out, size);
+  }

if (res == hipErrorOutOfMemory) {
return FUTHARK_OUT_OF_MEMORY;
}
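The `unified_memory == 2` default resolves itself at context setup by asking the device whether it advertises managed-memory support (the `device_query` call above). A minimal standalone sketch of the same check against the HIP runtime API, compiled with `hipcc` (again an illustration, not PR code):

```c
// Illustration of the capability probe behind the default setting (2):
// only use hipMallocManaged if the device reports managed-memory support.
#include <hip/hip_runtime.h>
#include <stdio.h>

int main(void) {
  int supports_managed = 0;
  if (hipDeviceGetAttribute(&supports_managed,
                            hipDeviceAttributeManagedMemory, 0) != hipSuccess) {
    fprintf(stderr, "could not query device 0\n");
    return 1;
  }
  printf("device 0 %s managed memory\n",
         supports_managed ? "supports" : "does not support");

  if (supports_managed) {
    void *buf = NULL;
    // Same call and flag the backend uses on the unified path.
    if (hipMallocManaged(&buf, 1024, hipMemAttachGlobal) == hipSuccess) {
      ((char *)buf)[0] = 42;  // host-side access is legal for managed memory
      hipFree(buf);
    }
  }
  return 0;
}
```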
rts/c/backends/opencl.h (8 additions, 0 deletions)
@@ -136,6 +136,8 @@ struct futhark_context_config {
char* preferred_device;
int ignore_blacklist;

int unified_memory;

char* dump_binary_to;
char* load_binary_from;

@@ -166,6 +168,8 @@ static void backend_context_config_setup(struct futhark_context_config* cfg) {
cfg->load_binary_from = NULL;
cfg->program = strconcat(gpu_program);

cfg->unified_memory = 2;

// The following are dummy sizes that mean the concrete defaults
// will be set during initialisation via hardware-inspection-based
// heuristics.
@@ -432,6 +436,10 @@ void futhark_context_config_load_binary_from(struct futhark_context_config *cfg,
cfg->load_binary_from = strdup(path);
}

void futhark_context_config_set_unified_memory(struct futhark_context_config* cfg, int flag) {
cfg->unified_memory = flag;
}

void futhark_context_config_set_default_thread_block_size(struct futhark_context_config *cfg, int size) {
cfg->default_group_size = size;
cfg->default_group_size_changed = 1;
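Note that the OpenCL hunks in this diff only add the configuration plumbing (the `unified_memory` field, its default of 2, and the setter); the allocation-side handling is not visible in this excerpt, so no claim is made here about how the OpenCL backend implements it. Purely as background on what "unified" allocation usually means in OpenCL terms, shared virtual memory (OpenCL 2.0+) provides a comparable host/device-addressable buffer:

```c
// Generic OpenCL >= 2.0 shared-virtual-memory sketch; background only,
// not taken from the Futhark OpenCL backend.
#include <CL/cl.h>
#include <stdio.h>

int main(void) {
  cl_platform_id platform;
  cl_device_id device;
  cl_int err;

  if (clGetPlatformIDs(1, &platform, NULL) != CL_SUCCESS ||
      clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL) != CL_SUCCESS)
    return 1;

  cl_context ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
  if (err != CL_SUCCESS)
    return 1;

  // A coarse-grained SVM buffer is addressable by both host and device,
  // conceptually similar to CUDA/HIP managed memory.  Host access should
  // strictly be bracketed by clEnqueueSVMMap/clEnqueueSVMUnmap.
  float *buf = clSVMAlloc(ctx, CL_MEM_READ_WRITE, 1024 * sizeof(float), 0);
  if (buf != NULL) {
    printf("allocated a 4 KiB SVM buffer\n");
    clSVMFree(ctx, buf);
  }

  clReleaseContext(ctx);
  return 0;
}
```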
src/Futhark/CodeGen/Backends/GPU.hs (8 additions, 0 deletions)
@@ -370,6 +370,13 @@ gpuOptions =
optionArgument = RequiredArgument "INT",
optionDescription = "The default parallelism threshold.",
optionAction = [C.cstm|futhark_context_config_set_default_threshold(cfg, atoi(optarg));|]
},
Option
{ optionLongName = "unified-memory",
optionShortName = Nothing,
optionArgument = RequiredArgument "INT",
optionDescription = "Whether to use unified memory",
optionAction = [C.cstm|futhark_context_config_set_unified_memory(cfg, atoi(optarg));|]
}
]

@@ -462,3 +469,4 @@ generateGPUBoilerplate gpu_program macros backendH kernels types failures = do
GC.headerDecl GC.InitDecl [C.cedecl|void futhark_context_config_set_default_tile_size(struct futhark_context_config *cfg, int size);|]
GC.headerDecl GC.InitDecl [C.cedecl|void futhark_context_config_set_default_reg_tile_size(struct futhark_context_config *cfg, int size);|]
GC.headerDecl GC.InitDecl [C.cedecl|void futhark_context_config_set_default_threshold(struct futhark_context_config *cfg, int size);|]
GC.headerDecl GC.InitDecl [C.cedecl|void futhark_context_config_set_unified_memory(struct futhark_context_config* cfg, int flag);|]