fix: formatting cleanup, include reordering, and pass the device id to HostIrLower::lower

nsarka · nsarka · commit 3ef189d75afc · 2025-03-25T17:43:47.000Z
diff --git a/csrc/fusion_segmenter.h b/csrc/fusion_segmenter.h
@@ -50,10 +50,8 @@ struct SegmentedEdge {
   }
 };
 
-
 std::vector<Expr*> groupExprPrintSorting(const std::vector<Expr*>& exprs);
 
-
 std::ostream& operator<<(std::ostream& os, const SegmentedEdge* edge);
 
 //! Groups together expressions which create a segmented group
diff --git a/csrc/host_ir/container.cpp b/csrc/host_ir/container.cpp
@@ -6,6 +6,7 @@
  */
 // clang-format on
 
+#include <fusion_segmenter.h>
 #include <host_ir/container.h>
 #include <host_ir/host_ir.h>
 #include <ir/builder.h>
@@ -15,7 +16,6 @@
 #include <kernel_ir.h>
 #include <ops/all_ops.h>
 #include <runtime/executor.h>
-#include <fusion_segmenter.h>
 
 namespace nvfuser {
 
@@ -55,8 +55,7 @@ void HostIrContainer::setKernelExecutor(
   kernel_executors_.at(index) = std::move(ke);
 }
 
-void HostIrContainer::sortExprs()
-{
+void HostIrContainer::sortExprs() {
   this->top_level_exprs_ = groupExprPrintSorting(this->top_level_exprs_);
 }
 
diff --git a/csrc/runtime/fusion_kernel_runtime.cpp b/csrc/runtime/fusion_kernel_runtime.cpp
@@ -10,8 +10,11 @@
 #include <fusion.h>
 #include <fusion_profiler.h>
 #include <fusion_segmenter.h>
+#include <host_ir/lower.h>
 #include <instrumentation.h>
 #include <ir/base_nodes.h>
+#include <multidevice/communication.h>
+#include <multidevice/utils.h>
 #include <preseg_passes/pre_segmenter.h>
 #include <python_frontend/fusion_definition.h>
 #include <python_frontend/translation.h>
@@ -21,9 +24,6 @@
 #include <scheduler/heuristic.h>
 #include <serde/fusion_cache_generated.h>
 #include <type.h>
-#include <host_ir/lower.h>
-#include <multidevice/communication.h>
-#include <multidevice/utils.h>
 
 #include <c10/cuda/CUDAGuard.h>
 
@@ -450,34 +450,37 @@ void FusionKernelRuntime::compileFusionParallel(KernelArgumentHolder args) {
         hic->pushBackTopLevelExprs(launch_kernel);
       } else {
         const bool is_resharding = std::any_of(
-            group_to_run->exprs().begin(), group_to_run->exprs().end(), [](auto expr) {
-              return isResharding(expr);
-            });
+            group_to_run->exprs().begin(),
+            group_to_run->exprs().end(),
+            [](auto expr) { return isResharding(expr); });
         if (is_resharding) {
+          auto deviceid = Communicator::getInstance().deviceId();
           NVF_ERROR(
               group_to_run->exprs().size() == 1,
               "Communication segments must contain only one Expr");
           HostIrLower lower;
           for (auto* expr :
-              lower.lower(ir_cloner.clone(group_to_run->exprs().at(0)))) {
+               lower.lower(ir_cloner.clone(group_to_run->exprs().at(0)), deviceid)) {
             // Allocate the recv buffers of communications
             if (expr->isA<Communication>()) {
               auto* communication = expr->as<Communication>();
               TensorView* tv = communication->out();
-              if (tv->getDeviceMesh().has(Communicator::getInstance().deviceId())) {
+              if (tv->getDeviceMesh().has(deviceid)) {
                 auto* allocate =
                     IrBuilder::create<kir::Allocate>(tv, MemoryType::Global);
                 hic->pushBackTopLevelExprs(allocate);
               }
             }
             hic->pushBackTopLevelExprs(expr);
             if (expr->isA<Communication>()) {
-              auto wait = IrBuilder::create<hir::Wait>(expr->as<Communication>());
+              auto wait =
+                  IrBuilder::create<hir::Wait>(expr->as<Communication>());
               hic->pushBackTopLevelExprs(wait);
             }
           }
         } else {
-          // push back segment's exprs into the container as top level expressions
+          // push back segment's exprs into the container as top level
+          // expressions
           for (auto* expr : group_to_run->exprs()) {
             auto cloned_expr = ir_cloner.clone(expr);
             hic->pushBackTopLevelExprs(cloned_expr);
@@ -491,7 +494,7 @@ void FusionKernelRuntime::compileFusionParallel(KernelArgumentHolder args) {
     for (const Val* out : segmented_fusion_->outputs()) {
       hic->addOutput(ir_cloner.clone(out));
     }
-    
+
     hic->sortExprs();
   }
 

Original file line number	Diff line number	Diff line change
`@@ -50,10 +50,8 @@ struct SegmentedEdge {`
`50`	`50`	`}`
`51`	`51`	`};`
`52`	`52`
`53`		`-`
`54`	`53`	`std::vector<Expr*> groupExprPrintSorting(const std::vector<Expr*>& exprs);`
`55`	`54`
`56`		`-`
`57`	`55`	`std::ostream& operator<<(std::ostream& os, const SegmentedEdge* edge);`
`58`	`56`
`59`	`57`	`//! Groups together expressions which create a segmented group`