ROCm · jayfurmanek · Jul 11, 2024 · Aug 5, 2024 · Aug 29, 2024 · Sep 5, 2024
diff --git a/build_whl.sh b/build_whl.sh
@@ -13,18 +13,33 @@ TF_PKG_LOC=/tmp/tensorflow_pkg
 rm -f $TF_PKG_LOC/tensorflow*.whl
 
 # First positional argument (if any) specifies the ROCM_INSTALL_DIR
-ROCM_INSTALL_DIR=/opt/rocm-6.1.0
-if [[ -n $1 ]]; then
+ROCM_INSTALL_DIR=/opt/rocm-6.1.1
+if [ "$#" -ge 1 ]; then
     ROCM_INSTALL_DIR=$1
 fi
 export ROCM_TOOLKIT_PATH=$ROCM_INSTALL_DIR
+export USE_BAZEL_VERSION=0.26.1
 
-yes "" | TF_NEED_ROCM=1 ROCM_TOOLKIT_PATH=${ROCM_INSTALL_DIR} PYTHON_BIN_PATH=/usr/bin/python3 PYTHON_BIN_PATH=/usr/bin/python3 ./configure
-pip3 uninstall -y tensorflow || true
-bazel build -c opt --copt -g --strip=never --copt=-mavx --copt=-mavx2 --config=rocm --copt=-Wno-invalid-constexpr //tensorflow:libtensorflow_cc.so
-bazel build -c opt --copt -g --strip=never --copt=-mavx --copt=-mavx2 --config=rocm  //tensorflow:libtensorflow_framework.so
-bazel build --config=opt --config=rocm //tensorflow/tools/pip_package:build_pip_package --verbose_failures  &&
-bazel-bin/tensorflow/tools/pip_package/build_pip_package $TF_PKG_LOC
+#--explain=explanation.log --verbose_explanations \
 
-echo y| pip uninstall tensorflow
-pip install $TF_PKG_LOC/tensorflow-1.15.5-cp37-cp37m-linux_x86_64.whl
+
+#yes "" | TF_NEED_ROCM=1 ROCM_TOOLKIT_PATH=${ROCM_INSTALL_DIR} PYTHON_BIN_PATH=/usr/bin/python3 PYTHON_BIN_PATH=/usr/bin/python3 ./configure
+#pip3 uninstall -y tensorflow || true
+bazel build -c opt --copt -g --strip=never --copt=-mavx --copt=-mavx2 --config=rocm \
+            --copt -Wno-invalid-constexpr \
+            --copt -Wno-deprecated-builtins \
+            --copt -Wno-unused-command-line-argument \
+            --copt -Wno-unused-but-set-variable \
+            --copt -Wno-final-dtor-non-final-class \
+            --copt -Wno-deprecated-declarations \
+            --copt -Wno-sign-compare \
+            --copt -Wno-bitwise-instead-of-logical \
+            --action_env=TF_ROCM_AMDGPU_TARGETS=gfx942 \
+            //tensorflow:libtensorflow_cc.so \
+            //tensorflow:libtensorflow_framework.so
+
+#bazel build --config=opt --config=rocm //tensorflow/tools/pip_package:build_pip_package --verbose_failures  &&
+#bazel-bin/tensorflow/tools/pip_package/build_pip_package $TF_PKG_LOC
+
+#echo y| pip uninstall tensorflow
+#pip install  ~/tensorflow_pkg/tensorflow-1.15.5-cp37-cp37m-linux_x86_64.whl
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc
@@ -367,7 +367,7 @@ Status XlaCompilationCache::CompileImpl(
   int64 current_request_count = ++entry->request_count;
   if (!entry->compiled) {
     VLOG(0) << "Compilation cache entry hit: " << entry->compiled << " " 
-            << " signature: " << signature.HumanString() << " with request count "
+            << " signature: ?" << " with request count "
             << current_request_count << " and compile threshold "
             << compile_threshold.value_or(0) << " shape info="
             << inputs_shape_info.get();

diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc
@@ -59,6 +59,10 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
 
   opts.set_xla_allow_excess_precision(true);
   opts.set_xla_force_host_platform_device_count(1);
+  opts.set_xla_gpu_autotune_level(4);
+  opts.set_xla_gpu_autotune_gemm_rtol(0.1f);
+  opts.set_xla_gpu_redzone_padding_bytes(8 * 1024 * 1024);
+
   return opts;
 }
 
@@ -122,6 +126,14 @@ static void AllocateFlags() {
     };
   };
 
+  auto float_setter_for =
+      [](void (DebugOptions::*member_setter)(float)) {
+        return [member_setter](float value) {
+          (flag_values->*member_setter)(value);
+          return true;
+        };
+      };
+
   auto string_setter_for =
       [](void (DebugOptions::*member_setter)(const string& value)) {
         return [member_setter](const string& value) {
@@ -534,6 +546,47 @@ static void AllocateFlags() {
                        flag_values->xla_gpu_algorithm_blacklist_path(),
                        "An AlgorithmBlacklist text proto file as a blacklist "
                        "of convolutions to avoid to use."),
+      tensorflow::Flag(
+          "xla_gpu_autotune_level",
+          int32_setter_for(&DebugOptions::set_xla_gpu_autotune_level),
+          flag_values->xla_gpu_autotune_level(),
+          "Set GEMM and Convolution auto-tuning level. 0 = off; 1 = on; 2 = "
+          "on+init; 3 = on+init+reinit; 4 = on+init+reinit+check; "
+          "5 = on+init+reinit+check and skip WRONG_RESULT solutions. See also "
+          "the related flag xla_gpu_autotune_gemm_rtol. Remark that, setting the "
+          "level to 5 only makes sense if you are sure that the reference (first "
+          "in the list) solution is numerically CORRECT. Otherwise, the autotuner "
+          "might discard many other correct solutions based on the failed "
+          "BufferComparator test."),
+      tensorflow::Flag(
+      "xla_gpu_dump_autotune_results_to",
+      string_setter_for(&DebugOptions::set_xla_gpu_dump_autotune_results_to),
+        flag_values->xla_gpu_dump_autotune_results_to(),
+        "File to write autotune results to. It will be a binary file unless the "
+        "name ends with .txt or .textproto. Warning: The results are written at "
+        "every compilation, possibly multiple times per process. This only works "
+        "on CUDA. In tests, the TEST_UNDECLARED_OUTPUTS_DIR prefix can be used "
+        "to write to their output directory."),
+      tensorflow::Flag(
+        "xla_gpu_load_autotune_results_from",
+        string_setter_for(&DebugOptions::set_xla_gpu_load_autotune_results_from),
+        flag_values->xla_gpu_load_autotune_results_from(),
+        "File to load autotune results from. It will be considered a binary file "
+        "unless the name ends with .txt or .textproto. It will be loaded at most "
+        "once per process. This only works on CUDA. In tests, the TEST_WORKSPACE "
+        "prefix can be used to load files from their data dependencies."),
+      tensorflow::Flag(
+        "xla_gpu_autotune_gemm_rtol",
+        float_setter_for(&DebugOptions::set_xla_gpu_autotune_gemm_rtol),
+        flag_values->xla_gpu_autotune_gemm_rtol(),
+        "Relative precision for comparing GEMM solutions vs the reference one"),
+      tensorflow::Flag(
+        "xla_gpu_redzone_padding_bytes",
+        int32_setter_for(&DebugOptions::set_xla_gpu_redzone_padding_bytes),
+        flag_values->xla_gpu_redzone_padding_bytes(),
+        "Amount of padding the redzone allocator will put on one side of each "
+        "buffer it allocates. (So the buffer's total size will be increased by "
+        "2x this value.)"),
   });
   ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", *flag_objects);
 }

diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -11,6 +11,11 @@ load(
     "tf_cc_test",
     "tf_copts",
     "tf_cuda_library",
+    "tf_kernel_library",
+)
+load(
+    "//tensorflow/stream_executor:build_defs.bzl",
+    "if_gpu_is_configured",
 )
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
@@ -444,6 +449,7 @@ cc_library(
         "fft_thunk.cc",
         "for_thunk.cc",
         "gemm_thunk.cc",
+        "gpublas_lt_matmul_thunk.cc",
         "gpu_executable.cc",
         "infeed_thunk.cc",
         "kernel_thunk.cc",
@@ -468,6 +474,7 @@ cc_library(
         "fft_thunk.h",
         "for_thunk.h",
         "gemm_thunk.h",
+        "gpublas_lt_matmul_thunk.h",
         "gpu_executable.h",
         "infeed_thunk.h",
         "kernel_thunk.h",
@@ -491,12 +498,14 @@ cc_library(
         ":hlo_execution_profiler",
         ":infeed_manager",
         ":ir_emission_utils",
+        ":matmul_utils",
         ":nccl_all_reduce_thunk",  # fixdeps: keep
         ":outfeed_manager",
         ":partition_assignment",
         ":stream_assignment",
         ":stream_executor_util",
         ":thunk",
+        ":autotuner_util",
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:refcounting_hash_map",
@@ -524,6 +533,7 @@ cc_library(
         "//tensorflow/core/profiler/lib:traceme",
         "//tensorflow/stream_executor",
         "//tensorflow/stream_executor:blas",
+        "//tensorflow/stream_executor/gpu:gpu_blas_lt",
         "//tensorflow/stream_executor:device_memory",
         "//tensorflow/stream_executor:kernel",
         "//tensorflow/stream_executor/gpu:gpu_stream",
@@ -570,22 +580,43 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "matmul_utils",
+    srcs = ["matmul_utils.cc"],
+    hdrs = ["matmul_utils.h"],
+    deps = [
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:window_util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service/gpu:backend_configs",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/stream_executor:blas",
+        "//tensorflow/stream_executor/gpu:gpu_blas_lt",
+        "@com_google_absl//absl/types:any",
+    ]
+)
+
 cc_library(
     name = "gemm_rewriter",
     srcs = ["gemm_rewriter.cc"],
     hdrs = ["gemm_rewriter.h"],
     deps = [
         ":backend_configs",
         ":ir_emission_utils",
+        ":matmul_utils",
+        ":gpu_types",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_casting_utils",
         "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/compiler/xla/service:pattern_matcher",
+        "//tensorflow/stream_executor/gpu:gpu_blas_lt",
         "//tensorflow/core:lib",
         "//tensorflow/stream_executor/lib",
         "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:any",
     ],
 )
 
@@ -600,6 +631,9 @@ cc_library(
         ":gpu_executable",
         ":ir_emission_utils",
         ":stream_executor_util",
+        ":matmul_utils",
+        ":autotuner_compile_util",
+        ":autotuner_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:hlo",
@@ -612,11 +646,47 @@ cc_library(
         "//tensorflow/stream_executor:blas",
         "//tensorflow/stream_executor:device_memory",
         "//tensorflow/stream_executor:device_memory_allocator",
-        "//tensorflow/stream_executor:redzone_allocator",
+        "//tensorflow/stream_executor/gpu:redzone_allocator",
         "@com_google_absl//absl/types:optional",
     ],
 )
 
+
+cc_library(
+    name = "autotuner_util",
+    srcs = if_gpu_is_configured(["autotuner_util.cc"]),
+    hdrs = if_gpu_is_configured(["autotuner_util.h"]),
+    deps = if_gpu_is_configured([
+        ":stream_executor_util",
+        "//tensorflow/core:autotuning_proto_cc",
+        "//tensorflow/core:autotune_results_proto_cc",
+         "//tensorflow/compiler/xla:shape_util",
+         "//tensorflow/compiler/xla:status_macros",
+         "//tensorflow/compiler/xla:types",
+         "//tensorflow/compiler/xla:util",
+         "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor/gpu:redzone_allocator",
+    ]),
+)
+
+# We need a separate target, as runtime executable cannot depend on compilation
+# pipeline.
+cc_library(
+    name = "autotuner_compile_util",
+    srcs = if_gpu_is_configured(["autotuner_compile_util.cc"]),
+    hdrs = if_gpu_is_configured(["autotuner_compile_util.h"]),
+    deps = if_gpu_is_configured([
+        ":autotuner_util",
+        ":ir_emission_utils",
+        "//tensorflow/compiler/xla/service:shaped_buffer",
+        "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor/gpu:gpu_stream_header",
+        "//tensorflow/stream_executor/gpu:redzone_allocator",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:util",
+    ])
+)
+
 cc_library(
     name = "gpu_conv_algorithm_picker",
     srcs = ["gpu_conv_algorithm_picker.cc"],
@@ -643,7 +713,7 @@ cc_library(
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core/util/proto:proto_utils",
         "//tensorflow/stream_executor:device_memory_allocator",
-        "//tensorflow/stream_executor:redzone_allocator",
+        "//tensorflow/stream_executor/gpu:redzone_allocator",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -1029,6 +1099,7 @@ cc_library(
         ":stream_executor_util",
         ":target_constants",
         ":variadic_op_splitter",
+        ":autotuner_util",
         "//tensorflow/compiler/xla:protobuf_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
@@ -1186,6 +1257,7 @@ cc_library(
         ":gemm_rewriter",
         ":gpu_compiler",
         ":gpu_conv_algorithm_picker",
+        ":gemm_algorithm_picker",
         "//tensorflow/core/platform:rocm_rocdl_path",
     ],
 )
@@ -1350,6 +1422,8 @@ cc_library(
     hdrs = ["stream_executor_util.h"],
     copts = tf_copts(),
     deps = [
+        "//tensorflow/core:autotuning_proto_cc",
+        "//tensorflow/core/util/proto:proto_utils",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
@@ -1385,15 +1459,39 @@ tf_cc_test(
     ],
 )
 
-cc_library(
+
+tf_kernel_library(
+    name = "image_ops_kernels",
+    srcs = [
+        "kernels/bipartite_match_op.cc",
+        "kernels/image_ops.cc",
+        "kernels/image_ops.h",
+        "kernels/segmentation_ops.cc",
+        "kernels/segmentation_ops.h",
+    ],
+    gpu_srcs = [
+        "kernels/image_ops_gpu.cu.cc",
+        "kernels/image_ops.h",
+    ],
+    deps = [
+        ":image_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//third_party/eigen3",
+    ],
+    alwayslink = 1,
+)
+
+tf_kernel_library(
     name = "buffer_comparator",
-    srcs = [] + if_cuda_is_configured([
-        "buffer_comparator_cuda.cc",
-    ]) + if_rocm_is_configured([
-        "buffer_comparator_rocm.cc",
+    srcs = if_gpu_is_configured(["buffer_comparator.cc",
+                                 "buffer_comparator.h"]),
+    gpu_srcs = if_gpu_is_configured(["buffer_comparator.cu.cc"]),
+    copts = if_cuda_is_configured(["-DGOOGLE_CUDA=1"]) + if_rocm_is_configured([
+        "-DTENSORFLOW_USE_ROCM=1",
     ]),
-    hdrs = ["buffer_comparator.h"],
     deps = [
+        ":partition_assignment",
         ":stream_executor_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:util",

diff --git a/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/gemm_rewriter.h"
 #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h"
 #include "tensorflow/compiler/xla/service/gpu/target_constants.h"
+#include "tensorflow/compiler/xla/service/gpu/gemm_algorithm_picker.h"
 #include "tensorflow/compiler/xla/service/hlo_constant_folding.h"
 #include "tensorflow/compiler/xla/service/hlo_cse.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
@@ -83,7 +84,8 @@ Status AMDGPUCompiler::OptimizeHloConvolutionCanonicalization(
 }
 
 Status AMDGPUCompiler::OptimizeHloPostLayoutAssignment(
-    HloModule* hlo_module, se::StreamExecutor* stream_exec,
+    HloModule* hlo_module, const AutotuneConfig& cfg, 
+    se::StreamExecutor* stream_exec,
     se::DeviceMemoryAllocator* device_allocator) {
   HloPassPipeline pipeline("post-layout_assignment");
   pipeline.AddInvariantChecker<HloVerifier>(
@@ -98,9 +100,12 @@ Status AMDGPUCompiler::OptimizeHloPostLayoutAssignment(
   pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(options);
 
   // Rewrite GEMMs into custom calls.
-  pipeline.AddPass<GemmRewriter>();
+  pipeline.AddPass<GemmRewriter>(GetGpuVersion(stream_exec));
 
   pipeline.AddPass<GpuConvAlgorithmPicker>(stream_exec, device_allocator);
+
+  pipeline.AddPass<GemmAlgorithmPicker>(cfg);
+
   // Clean up new_tuple described above.
   pipeline.AddPass<TupleSimplifier>();