Skip to content

Commit 2b11f19

Browse files
metaflowGoogle-ML-Automation
authored andcommitted
[XLA] fail if the change bit reported by a pass does not agree with HLO hash comparison
Usually we trust that pass truthfully reports if it did update the HLO. Shockingly there are cases, e.g. algsimp, that wrongly report "unchanged" pass result. Having a flag makes it easier to detect existing and new cases of such bugs. PiperOrigin-RevId: 738733528
1 parent 4ee2886 commit 2b11f19

File tree

4 files changed

+58
-3
lines changed

4 files changed

+58
-3
lines changed

xla/debug_options_flags.cc

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,8 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
337337
opts.set_xla_hlo_pass_fix_detect_cycles(false);
338338
opts.set_xla_gpu_experimental_enable_sync_collective_combining(false);
339339
opts.set_xla_allow_get_default_platform(true);
340+
opts.set_xla_unsupported_crash_on_hlo_pass_silent_hlo_change(false);
341+
opts.set_xla_unsupported_crash_on_hlo_pass_noop_change(false);
340342
return opts;
341343
}
342344

@@ -2311,7 +2313,22 @@ void MakeDebugOptionsFlags(std::vector<tsl::Flag>* flag_list,
23112313
"If non empty will interpret this variable as a path for performance "
23122314
"tables for collectives. Expects `xla.gpu.DeviceHloInstructionProfiles` "
23132315
"proto."));
2314-
} // NOLINT(readability/fn_size)1
2316+
flag_list->push_back(tsl::Flag(
2317+
"xla_unsupported_crash_on_hlo_pass_silent_hlo_change",
2318+
bool_setter_for(
2319+
&DebugOptions::
2320+
set_xla_unsupported_crash_on_hlo_pass_silent_hlo_change),
2321+
debug_options->xla_unsupported_crash_on_hlo_pass_silent_hlo_change(),
2322+
"Crash if a pass reports that it did not change the HLO but in fact it "
2323+
"did."));
2324+
flag_list->push_back(tsl::Flag(
2325+
"xla_unsupported_crash_on_hlo_pass_noop_change",
2326+
bool_setter_for(
2327+
&DebugOptions::set_xla_unsupported_crash_on_hlo_pass_noop_change),
2328+
debug_options->xla_unsupported_crash_on_hlo_pass_noop_change(),
2329+
"Crash if a pass reports that it did change the HLO but in fact it "
2330+
"did not."));
2331+
} // NOLINT(readability/fn_size)
23152332

23162333
// Allocates flag_values and flag_objects; this function must not be called more
23172334
// than once - its call done via call_once.

xla/hlo/pass/BUILD

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ cc_library(
8989
"//xla:util",
9090
"//xla:xla_proto_cc",
9191
"//xla/hlo/ir:hlo",
92+
"//xla/hlo/ir:hlo_module_group",
9293
"//xla/service:compilation_stats",
9394
"//xla/service:dump",
9495
"//xla/service:hlo_graph_dumper",
@@ -102,6 +103,7 @@ cc_library(
102103
"@com_google_absl//absl/status:statusor",
103104
"@com_google_absl//absl/strings",
104105
"@com_google_absl//absl/strings:str_format",
106+
"@com_google_absl//absl/types:optional",
105107
"@tsl//tsl/profiler/lib:scoped_annotation",
106108
"@tsl//tsl/profiler/lib:traceme",
107109
],

xla/hlo/pass/hlo_pass_pipeline.cc

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ limitations under the License.
1515

1616
#include "xla/hlo/pass/hlo_pass_pipeline.h"
1717

18+
#include <cstddef>
19+
#include <optional>
1820
#include <string>
1921
#include <vector>
2022

@@ -24,6 +26,8 @@ limitations under the License.
2426
#include "absl/strings/str_format.h"
2527
#include "absl/strings/str_join.h"
2628
#include "absl/strings/string_view.h"
29+
#include "xla/hlo/ir/hlo_module_group.h"
30+
#include "xla/hlo/pass/hlo_pass_interface.h"
2731
#include "xla/service/dump.h"
2832
#include "xla/service/hlo_graph_dumper.h"
2933
#include "xla/service/hlo_proto_util.h"
@@ -169,6 +173,9 @@ absl::StatusOr<bool> HloPassPipeline::RunPassesInternal(
169173
/*module_changed=*/false);
170174

171175
bool changed = false;
176+
bool verify_pass_changed_report =
177+
debug_options.xla_unsupported_crash_on_hlo_pass_silent_hlo_change() ||
178+
debug_options.xla_unsupported_crash_on_hlo_pass_noop_change();
172179
for (int i = 0; i < passes.size(); i++) {
173180
HloPassInterface* pass = passes[i];
174181
std::string pass_name = std::string(pass->name());
@@ -178,7 +185,11 @@ absl::StatusOr<bool> HloPassPipeline::RunPassesInternal(
178185
pass_name, hlo->name(), UniqueId(*hlo));
179186
}};
180187
VLOG(1) << " HLO pass " << pass_name;
181-
VLOG(2) << " Module hash " << absl::HashOf(*hlo);
188+
std::optional<size_t> hash_before = std::nullopt;
189+
if (verify_pass_changed_report || VLOG_IS_ON(2)) {
190+
hash_before = absl::HashOf(*hlo);
191+
VLOG(2) << " Module hash " << hash_before.value();
192+
}
182193
tsl::profiler::TraceMe traceme(pass->name());
183194
if (!pass->IsPassPipeline()) {
184195
compilation_stats_->StartPass(pass_name);
@@ -190,6 +201,27 @@ absl::StatusOr<bool> HloPassPipeline::RunPassesInternal(
190201
pass_name, absl::StatusCodeToString(status.code()));
191202
}
192203
TF_ASSIGN_OR_RETURN(bool pass_changed, status_or_changed);
204+
if (verify_pass_changed_report) {
205+
size_t hash_after = absl::HashOf(*hlo);
206+
// Fail if pass changed HLO but has reported that it didn't.
207+
if (!pass_changed && hash_after != hash_before &&
208+
debug_options.xla_unsupported_crash_on_hlo_pass_silent_hlo_change()) {
209+
LOG(FATAL) << absl::StrFormat(
210+
"Pass '%s' in pipeline '%s' reported that it did not change the "
211+
"HLO but the hash of HLO was changed from %d to %d. HLO text "
212+
"after:\n%s",
213+
pass_name, pipeline_name, hash_before.value(), hash_after,
214+
hlo->ToString());
215+
}
216+
// Fail if pass did not change HLO but has reported that it did.
217+
if (pass_changed && hash_after == hash_before &&
218+
debug_options.xla_unsupported_crash_on_hlo_pass_noop_change()) {
219+
LOG(FATAL) << absl::StrFormat(
220+
"Pass '%s' in pipeline '%s' reported that it changed the HLO but "
221+
"the hash of HLO was not updated. HLO text after:\n%s",
222+
pass_name, pipeline_name, hlo->ToString());
223+
}
224+
}
193225
if (!dump_regex.empty() && (pass_changed || dump_regex != ".*")) {
194226
MaybeDumpHloAndSaveFilenames(*hlo,
195227
/*after_pass_name=*/pass_name,

xla/xla.proto

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,10 @@ message DebugOptions {
125125
bool xla_hlo_pass_fix_detect_cycles = 370;
126126
// Crash if HloPassFix can not converge after a fixed number of iterations.
127127
bool xla_unsupported_crash_on_hlo_pass_fix_max_iterations = 363;
128+
// Crash if a pass reports that it changes the HLO but in fact it did not.
129+
bool xla_unsupported_crash_on_hlo_pass_noop_change = 379;
130+
// Crash if a pass reports that it did not change the HLO but in fact it did.
131+
bool xla_unsupported_crash_on_hlo_pass_silent_hlo_change = 380;
128132
// go/keep-sorted end
129133

130134
reserved 346; // xla_experimental_exec_time_optimization_effort
@@ -1189,7 +1193,7 @@ message DebugOptions {
11891193
// Note: when adding a new flag, please add it to one of the hardware-specific
11901194
// or hardware-agnostic sections at the top of this proto message.
11911195

1192-
// Next id: 379
1196+
// Next id: 381
11931197

11941198
// Extra options to pass to the compilation backend (e.g. LLVM); specific
11951199
// interpretation of these values is left to the backend.

0 commit comments

Comments
 (0)