
ROCm changes on uvm. #1142

Closed · wants to merge 59 commits into from

Changes from all commits · 59 commits
59267a9
Hipification of fbgemm for AMD GPUs/CPUs (#4)
jithunnair-amd Jan 25, 2022
a223936
Use SHEFL_SYNC_MACRO to replace __shefl() and __shefl_sync()
liligwu Jan 26, 2022
4610075
Merge pull request #6 from ROCmSoftwarePlatform/rocm4.3/develop
liligwu Jan 26, 2022
a506c52
Change the hipify dependency to hipify_torch (#7)
liligwu Jan 31, 2022
f596bde
IFU, merge from upstream commit c6df576 to main. (#8)
liligwu Feb 14, 2022
0cfb792
Enable `split_table_batched_embeddings_test.py` (#10)
liligwu Mar 2, 2022
f13af44
*Enable use_cache. *Enable split_embedding_inference_converter_test.p…
liligwu Mar 7, 2022
25e5b71
Skip use_cpu.
liligwu Mar 7, 2022
dcbe19f
Enable test_nbit_cache_pipeline and test_cache_miss_counter.
liligwu Mar 7, 2022
fda048e
Enable quantize_ops_test.py
liligwu Mar 7, 2022
00abba1
Merge branch 'main' into use_cache_enabled
liligwu Mar 7, 2022
cf307b6
Remove @skipIfRocm for test_nbit_cache_pipeline and test_cache_miss_c…
liligwu Mar 7, 2022
2d66ea8
*Uncondition use_cache in split_table_batched_embeddings_test.py *Rem…
liligwu Mar 7, 2022
958679b
Merge pull request #11 from ROCmSoftwarePlatform/use_cache_enabled
amathews-amd Mar 8, 2022
e642a48
Fix backward tests and test_cache_pipeline in split_table_batched_emb…
liligwu Mar 8, 2022
d0d294a
A minor change of removing a commented line.
liligwu Mar 8, 2022
146f2df
Remove skipIfRocm import in split_table_batched_embeddings_test.py.
liligwu Mar 8, 2022
eb0cf36
Merge pull request #12 from ROCmSoftwarePlatform/fix_backward
amathews-amd Mar 8, 2022
0c86f2b
*Removed post_hipify logic in setup.py. *Removed two headerfiles that…
liligwu Mar 11, 2022
6e7f13e
Merge pull request #16 from ROCmSoftwarePlatform/remove_post_hipify
amathews-amd Mar 11, 2022
edd3306
Pointing hipify_torch to the newer commit.
liligwu Mar 14, 2022
9a45f4a
Merge pull request #17 from ROCmSoftwarePlatform/pointing_hipify_torc…
amathews-amd Mar 14, 2022
309a3a1
Fixing #include <ATen/CUDAGeneratorImpl.h> by defining NEW_GENERATOR_…
liligwu Mar 16, 2022
358eaf5
Disabling all use_cpu in the tests. (#20)
liligwu Mar 16, 2022
3a915a8
Change py3.8 syntax to py3.7 syntax (#18)
pruthvistony Mar 16, 2022
40928ba
Match upstream setup (#21)
liligwu Mar 31, 2022
69abf78
Enable merge_pooled_embeddings op. in ROCm (#15)
reza-amd Apr 1, 2022
5c0096e
Merge remote-tracking branch 'upstream/main' into IFU-main-2022-04-07
liligwu Apr 14, 2022
bfac874
Fixing test_lxu_cache_lookup in AMD devices where warp_siize=64
liligwu Apr 14, 2022
1cf7e84
* Enabling the specificationn of hip architecture by using PYTORCH_RO…
liligwu Apr 15, 2022
5b33287
*Fixing the unit tests in sparse_ops_test.py. *Fixing the path of Ato…
liligwu Apr 19, 2022
2c514c5
Merge pull request #23 from ROCmSoftwarePlatform/IFU-main-2022-04-07
pruthvistony Apr 19, 2022
0d5a012
Enable use_cpu in the tests.
liligwu Apr 20, 2022
ae14a47
Merge remote-tracking branch 'upstream/main' into IFU-main-2022-04-20
liligwu Apr 20, 2022
1718605
*Taking @skipIfRocm back in the test_utils.py. *Fixing cublasGemmStri…
liligwu Apr 20, 2022
bc902a3
Cleaning up the code.
liligwu Apr 20, 2022
0d95948
Merge pull request #24 from ROCmSoftwarePlatform/IFU-main-2022-04-20
pruthvistony Apr 21, 2022
9a5a33b
Enabling cuda (#25)
liligwu Apr 21, 2022
6490dbc
Enabling cuda (#25)
liligwu Apr 21, 2022
77627ae
Merge branch 'main' of https://github.com/ROCmSoftwarePlatform/FBGEMM…
liligwu Apr 22, 2022
18b48e9
Merge remote-tracking branch 'upstream/main' into IFU-main-2022-05-02
liligwu May 2, 2022
99a70e1
Merge pull request #2 from ROCmSoftwarePlatform/IFU-main-2022-05-02
liligwu May 4, 2022
fed56ff
Merge branch 'main' into rocm_changes
liligwu May 4, 2022
4b39a70
Merge branch 'upstream_main' into rocm_changes
liligwu May 5, 2022
785afb8
Removing building and testing bash scripts.
liligwu May 5, 2022
bbd0ad1
* Addressing the comments in PR review ROCm changes #1102. * Reoganiz…
liligwu May 9, 2022
9db83d8
Minor changes that minimize the difference to upstream.
liligwu May 9, 2022
eabd0a8
A minor change on a blank line.
liligwu May 9, 2022
2038008
Fixing indentation and commented code in CMakeList.txt
liligwu May 10, 2022
0202078
Removing build script.
liligwu May 10, 2022
9cf8856
Addressing the second batch of comments of https://github.com/pytorch…
liligwu May 11, 2022
b885322
* Removing the condition on c++ standard * An indentation correction
liligwu May 12, 2022
0e3dfdb
* Changing the logic of detecting GPU vender, making CUDA as default.…
liligwu May 13, 2022
1f926e9
Merge remote-tracking branch 'upstream/main' into IFU-2022-05-23
liligwu May 24, 2022
adefcc0
fix enum macro to avoid missing symbols
jeffdaily May 24, 2022
b96bd9a
- Changing detection of ROCm to /opt/rocm. - Skipping 4 unit tests fo…
liligwu May 26, 2022
3a1c2a3
Cherry-pick 33c5e061e7aa47b8efbcb7dee83580b3844f6d67
amathews-amd May 23, 2022
f664267
Resolve the conflict in quantize_ops_benchmark.py
liligwu May 26, 2022
ebb0154
work around Python API for enum_utils.h.
liligwu Jun 8, 2022
2 changes: 1 addition & 1 deletion fbgemm_gpu/CMakeLists.txt
@@ -13,7 +13,7 @@ endif()
option(USE_CUDA "Use CUDA" ON)
option(USE_ROCM "Use ROCm" OFF)

if((EXISTS "/bin/hipcc") AND NOT (EXISTS "/bin/nvcc"))
if((EXISTS "/opt/rocm/") AND NOT (EXISTS "/bin/nvcc"))
message("AMD GPU detected.")
set(USE_CUDA OFF)
set(USE_ROCM ON)
46 changes: 35 additions & 11 deletions fbgemm_gpu/bench/quantize_ops_benchmark.py
@@ -33,17 +33,7 @@ def cli() -> None:
pass


@cli.command()
@click.option("--flush-gpu-cache-size-mb", default=0)
@click.option("--iters", default=100)
@click.option("--warmup-runs", default=2)
@settings(max_examples=10, deadline=None)
# pyre-ignore
@given(
num_columns=st.sampled_from([2**n for n in range(4, 10)]),
num_rows=st.sampled_from([2**n for n in range(4, 10)]),
)
def bench(
def bench_impl(
flush_gpu_cache_size_mb: int,
iters: int,
num_columns: int,
@@ -138,6 +128,40 @@ def bench(
logging.info(f"{k} time per iter: {t_time * 1.0e6:.0f}us")


@settings(max_examples=10, deadline=None)
# pyre-ignore
@given(
num_columns=st.sampled_from([2 ** n for n in range(4, 10)]),
num_rows=st.sampled_from([2 ** n for n in range(4, 10)]),
)
def bench_spectrum(
flush_gpu_cache_size_mb: int,
iters: int,
num_columns: int,
num_rows: int,
warmup_runs: int,
) -> None:
bench_impl(flush_gpu_cache_size_mb=flush_gpu_cache_size_mb, iters=iters, num_columns=num_columns, num_rows=num_rows, warmup_runs=warmup_runs)

@cli.command()
@click.option("--flush-gpu-cache-size-mb", default=0)
@click.option("--iters", default=100)
@click.option("--num-columns", default=-1)
@click.option("--num-rows", default=-1)
@click.option("--warmup-runs", default=2)
def bench(
flush_gpu_cache_size_mb: int,
iters: int,
num_columns: int,
num_rows: int,
warmup_runs: int,
) -> None:
if num_columns == -1 or num_rows == -1:
bench_spectrum(flush_gpu_cache_size_mb=flush_gpu_cache_size_mb, iters=iters, warmup_runs=warmup_runs)
else:
bench_impl(flush_gpu_cache_size_mb=flush_gpu_cache_size_mb, iters=iters, num_columns=num_columns, num_rows=num_rows, warmup_runs=warmup_runs)


Contributor:

Sorry, I missed this in the first review pass. I might just be missing the context, but why do we need to update the quantization test?

Reply:

Our discussions require testing specific values of num_rows and num_columns that fall outside the standard spectrum sampled here. The old implementation of this file accepted num_rows and num_columns as arguments; this brings some of that functionality back.
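For example, an invocation along the lines of `python quantize_ops_benchmark.py bench --num-rows 4096 --num-columns 256` (exact script path assumed) would benchmark one explicit shape, while omitting both flags falls back to the hypothesis-sampled spectrum in `bench_spectrum`.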

@cli.command()
@click.option("--flush-gpu-cache-size-mb", default=0)
@click.option("--iters", default=100)
21 changes: 14 additions & 7 deletions fbgemm_gpu/include/fbgemm_gpu/enum_utils.h
@@ -15,30 +15,37 @@ namespace fbgemm_gpu {

#define FBGEMM_GPU_ENUM_CREATE_TAG(module_name) \
struct fbgemm_gpu_enum_tag_##module_name {}; \
extern template enum_registration<struct fbgemm_gpu_enum_tag_##module_name>* \
template <> enum_registration<struct fbgemm_gpu_enum_tag_##module_name>* \
enum_registration< \
struct fbgemm_gpu_enum_tag_##module_name>::registration_list;
struct fbgemm_gpu_enum_tag_##module_name>::registration_list; \
extern template class enum_registration< \
struct fbgemm_gpu_enum_tag_##module_name>;
liligwu marked this conversation as resolved.

#define FBGEMM_GPU_ENUM_TAG(module_name) \
struct fbgemm_gpu_enum_tag_##module_name

#define FBGEMM_GPU_ENUM_GLOGAL(module_name) \
template class enum_registration<FBGEMM_GPU_ENUM_TAG(module_name)>; \
template <> \
enum_registration<FBGEMM_GPU_ENUM_TAG(module_name)>* \
enum_registration<FBGEMM_GPU_ENUM_TAG(module_name)>::registration_list = \
nullptr;

#define FBGEMM_GPU_ENUM_REGISTER_START(module_name, enum_name) \
enum_registration<FBGEMM_GPU_ENUM_TAG(module_name)> fbgemm_fpu_enum_reg_ ## enum_name( \
#enum_name,
// To work around (escape from) hipify_torch, the names of the identifiers
// are decomposed into `prefix` and `enum_name`.
#define FBGEMM_GPU_ENUM_REGISTER_START(module_name, prefix, enum_name) \
enum_registration<FBGEMM_GPU_ENUM_TAG(module_name)> fbgemm_fpu_enum_reg_ \
## prefix ## enum_name( #prefix #enum_name,

#define FBGEMM_GPU_ENUM_REGISTER_END );

#define FBGEMM_GPU_ENUM_OP(module_name, op_name) \
#op_name "() -> ((str, (str, int)[])[])", \
TORCH_FN(enum_query <FBGEMM_GPU_ENUM_TAG(module_name)>)
#define FBGEMM_GPU_ENUM_ITEM(x) \
{ #x, x }
// To work around (escape from) hipify_torch, the names of the identifiers
// are decomposed into `x` and `y`. `z` is supposed to be hipified.
#define FBGEMM_GPU_ENUM_ITEM(x, y, z) \
{ #x #y, z }

using enum_item = std::tuple<std::string, int64_t>;

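To make the hipify workaround above concrete, here is a minimal, self-contained C++ sketch of the idea; the names ENUM_ITEM, Item, and FakeMemoryAdvise are illustrative stand-ins, not part of fbgemm_gpu. The string half of each entry is pasted together from two fragments that hipify_torch does not recognize as a CUDA token, while the value argument remains a single identifier that hipify is still free to translate on ROCm.

// Minimal sketch of the name-splitting trick (illustrative only, not the
// fbgemm_gpu code): hipify_torch rewrites whole CUDA tokens such as
// cudaMemoryAdvise or cudaMemAdviseSetReadMostly, so the string name is built
// from two halves while the value stays a real, hipifiable identifier.
#include <cstdint>
#include <cstdio>

// Stand-in enumerator; on ROCm, hipify would rewrite the third macro argument
// below to the corresponding hipMemAdvise* enumerator instead.
enum FakeMemoryAdvise : int64_t { cudaMemAdviseSetReadMostly = 1 };

struct Item {
  const char* name;
  int64_t value;
};

// #x #y concatenates two string literals ("cudaMem" "AdviseSetReadMostly"),
// so the full CUDA name never appears as one token in the source.
#define ENUM_ITEM(x, y, z) { #x #y, static_cast<int64_t>(z) }

int main() {
  Item item = ENUM_ITEM(cudaMem, AdviseSetReadMostly, cudaMemAdviseSetReadMostly);
  std::printf("%s = %lld\n", item.name, static_cast<long long>(item.value));
  return 0;
}

Under that assumption, the registration in cumem_utils.cu below keeps exposing entries named with the CUDA spelling (for example "cudaMemAdviseSetReadMostly") on both back ends, while the enumerator value comes from either the CUDA constant or its hipified HIP counterpart.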
14 changes: 7 additions & 7 deletions fbgemm_gpu/src/cumem_utils.cu
@@ -357,13 +357,13 @@ Tensor uvm_to_cpu_clone(Tensor t) {

FBGEMM_GPU_ENUM_GLOGAL(uvm)

FBGEMM_GPU_ENUM_REGISTER_START(uvm, cudaMemoryAdvise){
FBGEMM_GPU_ENUM_ITEM(cudaMemAdviseSetReadMostly),
FBGEMM_GPU_ENUM_ITEM(cudaMemAdviseUnsetReadMostly),
FBGEMM_GPU_ENUM_ITEM(cudaMemAdviseSetPreferredLocation),
FBGEMM_GPU_ENUM_ITEM(cudaMemAdviseUnsetPreferredLocation),
FBGEMM_GPU_ENUM_ITEM(cudaMemAdviseSetAccessedBy),
FBGEMM_GPU_ENUM_ITEM(cudaMemAdviseUnsetAccessedBy),
FBGEMM_GPU_ENUM_REGISTER_START(uvm, cudaMemory, Advise){
FBGEMM_GPU_ENUM_ITEM(cudaMem, AdviseSetReadMostly, cudaMemAdviseSetReadMostly),
FBGEMM_GPU_ENUM_ITEM(cudaMem, AdviseUnsetReadMostly, cudaMemAdviseUnsetReadMostly),
FBGEMM_GPU_ENUM_ITEM(cudaMem, AdviseSetPreferredLocation, cudaMemAdviseSetPreferredLocation),
FBGEMM_GPU_ENUM_ITEM(cudaMem, AdviseUnsetPreferredLocation, cudaMemAdviseUnsetPreferredLocation),
FBGEMM_GPU_ENUM_ITEM(cudaMem, AdviseSetAccessedBy, cudaMemAdviseSetAccessedBy),
FBGEMM_GPU_ENUM_ITEM(cudaMem, AdviseUnsetAccessedBy, cudaMemAdviseUnsetAccessedBy),
} FBGEMM_GPU_ENUM_REGISTER_END

} // namespace fbgemm_gpu
8 changes: 0 additions & 8 deletions fbgemm_gpu/src/cumem_utils_host.cpp
@@ -39,11 +39,7 @@ TORCH_LIBRARY_FRAGMENT(fb, m) {
TORCH_FN(uvm_mem_advice_dont_fork));

m.def("uvm_to_cpu_clone(Tensor t) -> Tensor", TORCH_FN(uvm_to_cpu_clone));

#ifndef __HIP_PLATFORM_HCC__
// FIXME: some advanced "cudaMemAdvise" flags are not supported by HIP.
m.def(FBGEMM_GPU_ENUM_OP(uvm, fbgemm_gpu_uvm_enum_query));
#endif
}

TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
@@ -69,11 +65,7 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
TORCH_FN(uvm_mem_advice_dont_fork));

m.def("uvm_to_cpu_clone(Tensor t) -> Tensor", TORCH_FN(uvm_to_cpu_clone));

#ifndef __HIP_PLATFORM_HCC__
// FIXME: some advanced "cudaMemAdvise" flags are not supported by HIP.
m.def(FBGEMM_GPU_ENUM_OP(uvm, fbgemm_gpu_uvm_enum_query));
#endif
}

} // namespace fbgemm_gpu
6 changes: 5 additions & 1 deletion fbgemm_gpu/test/uvm_test.py
@@ -19,7 +19,7 @@

if open_source:
# pyre-ignore[21]
from test_utils import gpu_available, gpu_unavailable
from test_utils import gpu_available, gpu_unavailable, skipIfRocm
else:
from fbgemm_gpu.test.test_utils import gpu_available, gpu_unavailable

@@ -80,6 +80,7 @@ def test_enum(self) -> None:
# pyre-ignore[16]
assert cudaMemoryAdvise.cudaMemAdviseSetAccessedBy.value == 5

@skipIfRocm
Contributor:

Why are all these unit tests skipped?

Contributor Author (@liligwu, May 26, 2022):

It complains "out of memory" on some of the ROCm devices. We're investigating it; uvm_test.py is not on the list of the SOW.

The issue is that a large amount of memory (>1 TB) was allocated in the tests. We are checking whether this is valid.

Contributor:

Is there any progress here?

Contributor Author:

Sorry, not yet. I've submitted a JIRA ticket internally, but it may take some time.

Contributor:

I got it. Thanks for your updates.

@unittest.skipIf(*gpu_unavailable)
@given(
sizes=st.lists(
@@ -123,6 +124,7 @@ def test_cudaMemPrefetchAsync(self, sizes: List[int], vanilla: bool) -> None:

torch.cuda.synchronize(torch.device("cuda:0"))

@skipIfRocm
@unittest.skipIf(*gpu_unavailable or torch.cuda.device_count() < 2)
@given(
sizes=st.lists(
@@ -154,6 +156,7 @@ def test_uvm_to_device(self, sizes: List[int], vanilla: bool) -> None:
assert torch.ops.fbgemm.uvm_storage(second_t)
assert second_t.device == device_prototype.device

@skipIfRocm
@unittest.skipIf(*gpu_unavailable)
@given(
sizes=st.lists(
@@ -183,6 +186,7 @@ def test_uvm_slice(self, sizes: List[int], vanilla: bool) -> None:
assert torch.ops.fbgemm.is_uvm_tensor(uvm_slice)
assert torch.ops.fbgemm.uvm_storage(cpu_slice)

@skipIfRocm
@unittest.skipIf(*gpu_unavailable)
@given(
sizes=st.lists(