ROCm · ScXfjiang · Jan 2, 2025 · Oct 25, 2024 · Nov 21, 2024 · Nov 22, 2024
diff --git a/tensorflow/compiler/xla/service/computation_placer.cc b/tensorflow/compiler/xla/service/computation_placer.cc
@@ -163,7 +163,7 @@ StatusOr<DeviceAssignment> ComputationPlacer::AssignDevices(
     ComputationPlacerCreationFunction creation_function) {
   absl::MutexLock lock(&ComputationPlacer::platform_computation_placer_mutex_);
   auto* computation_placers = GetPlatformComputationPlacers();
-  CHECK(computation_placers->find(platform_id) == computation_placers->end());
+  // Re-registration CHECK disabled: creation_function is now overwritten.
   (*computation_placers)[platform_id].creation_function = creation_function;
 }
 

diff --git a/tensorflow/compiler/xla/stream_executor/BUILD b/tensorflow/compiler/xla/stream_executor/BUILD
@@ -450,6 +450,7 @@ tsl_gpu_library(
         ":temporary_memory_manager",
         ":timer",
         "//tensorflow/compiler/xla/stream_executor/platform",
+        "//tensorflow/compiler/xla/stream_executor/gpu:gpu_blas_lt_gemm_runner",
         "//tensorflow/tsl/platform:env",
         "//tensorflow/tsl/platform:errors",
         "//tensorflow/tsl/platform:logging",

diff --git a/tensorflow/compiler/xla/stream_executor/gpu/BUILD b/tensorflow/compiler/xla/stream_executor/gpu/BUILD
@@ -78,6 +78,7 @@ cc_library(
         #"//tensorflow/core/platform:env",
         "//tensorflow/tsl/util:env_var",
         "@com_google_absl//absl/types:any",
+        "//tensorflow/compiler/xla:debug_options_flags",
     ]),
 )
 
@@ -87,13 +88,14 @@ cc_library(
     srcs = if_gpu_is_configured(["gpu_blas_lt_gemm_runner.cc"]),
     hdrs = if_gpu_is_configured(["gpu_blas_lt_gemm_runner.h"]),
     deps = if_gpu_is_configured([
-        "//tensorflow/core:autotuning_proto_cc",
-	"//tensorflow/core:autotune_results_proto_cc",	        
-	"//tensorflow/compiler/xla:xla_proto",		 
-	"//tensorflow/compiler/xla/stream_executor:scratch_allocator",			        
-	"//tensorflow/compiler/xla/service/gpu:autotuner_util",
-    "//tensorflow/compiler/xla:debug_options_flags",
-	":gpu_blas_lt",
+        "//tensorflow/core/protobuf:autotuning_proto_cc",
+        "//tensorflow/compiler/xla:autotune_results_proto_cc",
+        "//tensorflow/compiler/xla:xla_proto_cc",
+        "//tensorflow/compiler/xla:xla_data_proto_cc",
+        "//tensorflow/compiler/xla/stream_executor:scratch_allocator",
+        "//tensorflow/compiler/xla/service/gpu:autotuner_util",
+        "//tensorflow/compiler/xla:debug_options_flags",
+        ":gpu_blas_lt",
     ]),
 )
 

diff --git a/tensorflow/compiler/xla/stream_executor/gpu/gpu_blas_lt.cc b/tensorflow/compiler/xla/stream_executor/gpu/gpu_blas_lt.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/stream_executor/stream_executor.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/tsl/util/env_var.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 
 namespace stream_executor {
 
@@ -31,6 +32,13 @@ using blas::ComputationType;
 using blas::DataType;
 using xla::PrimitiveType;
 
+// Returns the xla_gpu_enable_cublaslt debug option; read once, then cached.
+bool GpuBlasLtEnabled() {
+  static const bool kEnabled =
+      xla::GetDebugOptionsFromFlags().xla_gpu_enable_cublaslt();
+  return kEnabled;
+}
+
 namespace {
 
 bool TF32_Enabled() {

diff --git a/tensorflow/compiler/xla/stream_executor/gpu/gpu_blas_lt.h b/tensorflow/compiler/xla/stream_executor/gpu/gpu_blas_lt.h
@@ -34,6 +34,8 @@ namespace stream_executor {
 
 namespace gpu {
 
+bool GpuBlasLtEnabled();  // Value of the xla_gpu_enable_cublaslt debug option.
+
 xla::StatusOr<blas::DataType> AsBlasDataType(xla::PrimitiveType dtype);
 
 xla::StatusOr<blas::ComputationType> GetBlasComputationType(