Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

R1.15 rocm61 albm add asan hipblaslt #2672

Open
wants to merge 6 commits into
base: r1.15-rocm61-albm-add-asan
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 25 additions & 10 deletions build_whl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,33 @@ TF_PKG_LOC=/tmp/tensorflow_pkg
rm -f $TF_PKG_LOC/tensorflow*.whl

# First positional argument (if any) specifies the ROCM_INSTALL_DIR
ROCM_INSTALL_DIR=/opt/rocm-6.1.0
if [[ -n $1 ]]; then
ROCM_INSTALL_DIR=/opt/rocm-6.1.1
if [ "$#" -ge 1 ]; then
ROCM_INSTALL_DIR=$1
fi
export ROCM_TOOLKIT_PATH=$ROCM_INSTALL_DIR
export USE_BAZEL_VERSION=0.26.1

yes "" | TF_NEED_ROCM=1 ROCM_TOOLKIT_PATH=${ROCM_INSTALL_DIR} PYTHON_BIN_PATH=/usr/bin/python3 PYTHON_BIN_PATH=/usr/bin/python3 ./configure
pip3 uninstall -y tensorflow || true
bazel build -c opt --copt -g --strip=never --copt=-mavx --copt=-mavx2 --config=rocm --copt=-Wno-invalid-constexpr //tensorflow:libtensorflow_cc.so
bazel build -c opt --copt -g --strip=never --copt=-mavx --copt=-mavx2 --config=rocm //tensorflow:libtensorflow_framework.so
bazel build --config=opt --config=rocm //tensorflow/tools/pip_package:build_pip_package --verbose_failures &&
bazel-bin/tensorflow/tools/pip_package/build_pip_package $TF_PKG_LOC
#--explain=explanation.log --verbose_explanations \

echo y| pip uninstall tensorflow
pip install $TF_PKG_LOC/tensorflow-1.15.5-cp37-cp37m-linux_x86_64.whl

#yes "" | TF_NEED_ROCM=1 ROCM_TOOLKIT_PATH=${ROCM_INSTALL_DIR} PYTHON_BIN_PATH=/usr/bin/python3 PYTHON_BIN_PATH=/usr/bin/python3 ./configure
#pip3 uninstall -y tensorflow || true
bazel build -c opt --copt -g --strip=never --copt=-mavx --copt=-mavx2 --config=rocm \
--copt -Wno-invalid-constexpr \
--copt -Wno-deprecated-builtins \
--copt -Wno-unused-command-line-argument \
--copt -Wno-unused-but-set-variable \
--copt -Wno-final-dtor-non-final-class \
--copt -Wno-deprecated-declarations \
--copt -Wno-sign-compare \
--copt -Wno-bitwise-instead-of-logical \
--action_env=TF_ROCM_AMDGPU_TARGETS=gfx942 \
//tensorflow:libtensorflow_cc.so \
//tensorflow:libtensorflow_framework.so

#bazel build --config=opt --config=rocm //tensorflow/tools/pip_package:build_pip_package --verbose_failures &&
#bazel-bin/tensorflow/tools/pip_package/build_pip_package $TF_PKG_LOC

#echo y| pip uninstall tensorflow
#pip install ~/tensorflow_pkg/tensorflow-1.15.5-cp37-cp37m-linux_x86_64.whl
2 changes: 1 addition & 1 deletion tensorflow/compiler/jit/xla_compilation_cache.cc
Original file line number Diff line number Diff line change
Expand Up @@ -367,7 +367,7 @@ Status XlaCompilationCache::CompileImpl(
int64 current_request_count = ++entry->request_count;
if (!entry->compiled) {
VLOG(0) << "Compilation cache entry hit: " << entry->compiled << " "
<< " signature: " << signature.HumanString() << " with request count "
<< " signature: ?" << " with request count "
<< current_request_count << " and compile threshold "
<< compile_threshold.value_or(0) << " shape info="
<< inputs_shape_info.get();
Expand Down
53 changes: 53 additions & 0 deletions tensorflow/compiler/xla/debug_options_flags.cc
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {

opts.set_xla_allow_excess_precision(true);
opts.set_xla_force_host_platform_device_count(1);
opts.set_xla_gpu_autotune_level(4);
opts.set_xla_gpu_autotune_gemm_rtol(0.1f);
opts.set_xla_gpu_redzone_padding_bytes(8 * 1024 * 1024);

return opts;
}

Expand Down Expand Up @@ -122,6 +126,14 @@ static void AllocateFlags() {
};
};

auto float_setter_for =
[](void (DebugOptions::*member_setter)(float)) {
return [member_setter](float value) {
(flag_values->*member_setter)(value);
return true;
};
};

auto string_setter_for =
[](void (DebugOptions::*member_setter)(const string& value)) {
return [member_setter](const string& value) {
Expand Down Expand Up @@ -534,6 +546,47 @@ static void AllocateFlags() {
flag_values->xla_gpu_algorithm_blacklist_path(),
"An AlgorithmBlacklist text proto file as a blacklist "
"of convolutions to avoid to use."),
tensorflow::Flag(
"xla_gpu_autotune_level",
int32_setter_for(&DebugOptions::set_xla_gpu_autotune_level),
flag_values->xla_gpu_autotune_level(),
"Set GEMM and Convolution auto-tuning level. 0 = off; 1 = on; 2 = "
"on+init; 3 = on+init+reinit; 4 = on+init+reinit+check; "
"5 = on+init+reinit+check and skip WRONG_RESULT solutions. See also "
"the related flag xla_gpu_autotune_gemm_rtol. Remark that, setting the "
"level to 5 only makes sense if you are sure that the reference (first "
"in the list) solution is numerically CORRECT. Otherwise, the autotuner "
"might discard many other correct solutions based on the failed "
"BufferComparator test."),
tensorflow::Flag(
"xla_gpu_dump_autotune_results_to",
string_setter_for(&DebugOptions::set_xla_gpu_dump_autotune_results_to),
flag_values->xla_gpu_dump_autotune_results_to(),
"File to write autotune results to. It will be a binary file unless the "
"name ends with .txt or .textproto. Warning: The results are written at "
"every compilation, possibly multiple times per process. This only works "
"on CUDA. In tests, the TEST_UNDECLARED_OUTPUTS_DIR prefix can be used "
"to write to their output directory."),
tensorflow::Flag(
"xla_gpu_load_autotune_results_from",
string_setter_for(&DebugOptions::set_xla_gpu_load_autotune_results_from),
flag_values->xla_gpu_load_autotune_results_from(),
"File to load autotune results from. It will be considered a binary file "
"unless the name ends with .txt or .textproto. It will be loaded at most "
"once per process. This only works on CUDA. In tests, the TEST_WORKSPACE "
"prefix can be used to load files from their data dependencies."),
tensorflow::Flag(
"xla_gpu_autotune_gemm_rtol",
float_setter_for(&DebugOptions::set_xla_gpu_autotune_gemm_rtol),
flag_values->xla_gpu_autotune_gemm_rtol(),
"Relative precision for comparing GEMM solutions vs the reference one"),
tensorflow::Flag(
"xla_gpu_redzone_padding_bytes",
int32_setter_for(&DebugOptions::set_xla_gpu_redzone_padding_bytes),
flag_values->xla_gpu_redzone_padding_bytes(),
"Amount of padding the redzone allocator will put on one side of each "
"buffer it allocates. (So the buffer's total size will be increased by "
"2x this value.)"),
});
ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", *flag_objects);
}
Expand Down
114 changes: 106 additions & 8 deletions tensorflow/compiler/xla/service/gpu/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@ load(
"tf_cc_test",
"tf_copts",
"tf_cuda_library",
"tf_kernel_library",
)
load(
"//tensorflow/stream_executor:build_defs.bzl",
"if_gpu_is_configured",
)
load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
Expand Down Expand Up @@ -444,6 +449,7 @@ cc_library(
"fft_thunk.cc",
"for_thunk.cc",
"gemm_thunk.cc",
"gpublas_lt_matmul_thunk.cc",
"gpu_executable.cc",
"infeed_thunk.cc",
"kernel_thunk.cc",
Expand All @@ -468,6 +474,7 @@ cc_library(
"fft_thunk.h",
"for_thunk.h",
"gemm_thunk.h",
"gpublas_lt_matmul_thunk.h",
"gpu_executable.h",
"infeed_thunk.h",
"kernel_thunk.h",
Expand All @@ -491,12 +498,14 @@ cc_library(
":hlo_execution_profiler",
":infeed_manager",
":ir_emission_utils",
":matmul_utils",
":nccl_all_reduce_thunk", # fixdeps: keep
":outfeed_manager",
":partition_assignment",
":stream_assignment",
":stream_executor_util",
":thunk",
":autotuner_util",
"//tensorflow/compiler/xla:array2d",
"//tensorflow/compiler/xla:literal",
"//tensorflow/compiler/xla:refcounting_hash_map",
Expand Down Expand Up @@ -524,6 +533,7 @@ cc_library(
"//tensorflow/core/profiler/lib:traceme",
"//tensorflow/stream_executor",
"//tensorflow/stream_executor:blas",
"//tensorflow/stream_executor/gpu:gpu_blas_lt",
"//tensorflow/stream_executor:device_memory",
"//tensorflow/stream_executor:kernel",
"//tensorflow/stream_executor/gpu:gpu_stream",
Expand Down Expand Up @@ -570,22 +580,43 @@ cc_library(
],
)

cc_library(
name = "matmul_utils",
srcs = ["matmul_utils.cc"],
hdrs = ["matmul_utils.h"],
deps = [
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla:window_util",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/service/gpu:backend_configs",
"//tensorflow/compiler/xla/service:hlo",
"//tensorflow/stream_executor:blas",
"//tensorflow/stream_executor/gpu:gpu_blas_lt",
"@com_google_absl//absl/types:any",
]
)

cc_library(
name = "gemm_rewriter",
srcs = ["gemm_rewriter.cc"],
hdrs = ["gemm_rewriter.h"],
deps = [
":backend_configs",
":ir_emission_utils",
":matmul_utils",
":gpu_types",
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:statusor",
"//tensorflow/compiler/xla/service:hlo",
"//tensorflow/compiler/xla/service:hlo_casting_utils",
"//tensorflow/compiler/xla/service:hlo_pass",
"//tensorflow/compiler/xla/service:pattern_matcher",
"//tensorflow/stream_executor/gpu:gpu_blas_lt",
"//tensorflow/core:lib",
"//tensorflow/stream_executor/lib",
"@com_google_absl//absl/types:optional",
"@com_google_absl//absl/types:any",
],
)

Expand All @@ -600,6 +631,9 @@ cc_library(
":gpu_executable",
":ir_emission_utils",
":stream_executor_util",
":matmul_utils",
":autotuner_compile_util",
":autotuner_util",
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla/service:hlo",
Expand All @@ -612,11 +646,47 @@ cc_library(
"//tensorflow/stream_executor:blas",
"//tensorflow/stream_executor:device_memory",
"//tensorflow/stream_executor:device_memory_allocator",
"//tensorflow/stream_executor:redzone_allocator",
"//tensorflow/stream_executor/gpu:redzone_allocator",
"@com_google_absl//absl/types:optional",
],
)


cc_library(
name = "autotuner_util",
srcs = if_gpu_is_configured(["autotuner_util.cc"]),
hdrs = if_gpu_is_configured(["autotuner_util.h"]),
deps = if_gpu_is_configured([
":stream_executor_util",
"//tensorflow/core:autotuning_proto_cc",
"//tensorflow/core:autotune_results_proto_cc",
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:types",
"//tensorflow/compiler/xla:util",
"//tensorflow/stream_executor",
"//tensorflow/stream_executor/gpu:redzone_allocator",
]),
)

# We need a separate target, as runtime executable cannot depend on compilation
# pipeline.
cc_library(
name = "autotuner_compile_util",
srcs = if_gpu_is_configured(["autotuner_compile_util.cc"]),
hdrs = if_gpu_is_configured(["autotuner_compile_util.h"]),
deps = if_gpu_is_configured([
":autotuner_util",
":ir_emission_utils",
"//tensorflow/compiler/xla/service:shaped_buffer",
"//tensorflow/stream_executor",
"//tensorflow/stream_executor/gpu:gpu_stream_header",
"//tensorflow/stream_executor/gpu:redzone_allocator",
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla:util",
])
)

cc_library(
name = "gpu_conv_algorithm_picker",
srcs = ["gpu_conv_algorithm_picker.cc"],
Expand All @@ -643,7 +713,7 @@ cc_library(
"//tensorflow/core:stream_executor_no_cuda",
"//tensorflow/core/util/proto:proto_utils",
"//tensorflow/stream_executor:device_memory_allocator",
"//tensorflow/stream_executor:redzone_allocator",
"//tensorflow/stream_executor/gpu:redzone_allocator",
"@com_google_absl//absl/algorithm:container",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/strings:str_format",
Expand Down Expand Up @@ -1029,6 +1099,7 @@ cc_library(
":stream_executor_util",
":target_constants",
":variadic_op_splitter",
":autotuner_util",
"//tensorflow/compiler/xla:protobuf_util",
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:statusor",
Expand Down Expand Up @@ -1186,6 +1257,7 @@ cc_library(
":gemm_rewriter",
":gpu_compiler",
":gpu_conv_algorithm_picker",
":gemm_algorithm_picker",
"//tensorflow/core/platform:rocm_rocdl_path",
],
)
Expand Down Expand Up @@ -1350,6 +1422,8 @@ cc_library(
hdrs = ["stream_executor_util.h"],
copts = tf_copts(),
deps = [
"//tensorflow/core:autotuning_proto_cc",
"//tensorflow/core/util/proto:proto_utils",
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla:statusor",
"//tensorflow/compiler/xla:types",
Expand Down Expand Up @@ -1385,15 +1459,39 @@ tf_cc_test(
],
)

cc_library(

tf_kernel_library(
name = "image_ops_kernels",
srcs = [
"kernels/bipartite_match_op.cc",
"kernels/image_ops.cc",
"kernels/image_ops.h",
"kernels/segmentation_ops.cc",
"kernels/segmentation_ops.h",
],
gpu_srcs = [
"kernels/image_ops_gpu.cu.cc",
"kernels/image_ops.h",
],
deps = [
":image_ops_op_lib",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//third_party/eigen3",
],
alwayslink = 1,
)

tf_kernel_library(
name = "buffer_comparator",
srcs = [] + if_cuda_is_configured([
"buffer_comparator_cuda.cc",
]) + if_rocm_is_configured([
"buffer_comparator_rocm.cc",
srcs = if_gpu_is_configured(["buffer_comparator.cc",
"buffer_comparator.h"]),
gpu_srcs = if_gpu_is_configured(["buffer_comparator.cu.cc"]),
copts = if_cuda_is_configured(["-DGOOGLE_CUDA=1"]) + if_rocm_is_configured([
"-DTENSORFLOW_USE_ROCM=1",
]),
hdrs = ["buffer_comparator.h"],
deps = [
":partition_assignment",
":stream_executor_util",
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:util",
Expand Down
9 changes: 7 additions & 2 deletions tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/gpu/gemm_rewriter.h"
#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h"
#include "tensorflow/compiler/xla/service/gpu/target_constants.h"
#include "tensorflow/compiler/xla/service/gpu/gemm_algorithm_picker.h"
#include "tensorflow/compiler/xla/service/hlo_constant_folding.h"
#include "tensorflow/compiler/xla/service/hlo_cse.h"
#include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
Expand Down Expand Up @@ -83,7 +84,8 @@ Status AMDGPUCompiler::OptimizeHloConvolutionCanonicalization(
}

Status AMDGPUCompiler::OptimizeHloPostLayoutAssignment(
HloModule* hlo_module, se::StreamExecutor* stream_exec,
HloModule* hlo_module, const AutotuneConfig& cfg,
se::StreamExecutor* stream_exec,
se::DeviceMemoryAllocator* device_allocator) {
HloPassPipeline pipeline("post-layout_assignment");
pipeline.AddInvariantChecker<HloVerifier>(
Expand All @@ -98,9 +100,12 @@ Status AMDGPUCompiler::OptimizeHloPostLayoutAssignment(
pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(options);

// Rewrite GEMMs into custom calls.
pipeline.AddPass<GemmRewriter>();
pipeline.AddPass<GemmRewriter>(GetGpuVersion(stream_exec));

pipeline.AddPass<GpuConvAlgorithmPicker>(stream_exec, device_allocator);

pipeline.AddPass<GemmAlgorithmPicker>(cfg);

// Clean up new_tuple described above.
pipeline.AddPass<TupleSimplifier>();

Expand Down
Loading