diff --git a/csrc/scheduler/pointwise.cpp b/csrc/scheduler/pointwise.cpp index 862a6a336a2..a9d72e29540 100644 --- a/csrc/scheduler/pointwise.cpp +++ b/csrc/scheduler/pointwise.cpp @@ -1040,6 +1040,14 @@ void schedulePointwise(Fusion* fusion, const PointwiseParams* pparams) { auto output = entry.second; inner_most_tensors.erase(output); } + // IndexSelectOp reads its lookup tv directly, without a cache. Because the + // pointwise scheduler doesn't use ParallelType::Unroll, we need to exclude the + // consumers of such fusion inputs (the IndexSelectOp outputs) from inlineMost. + // This allows us to aggregate the allocation of the manual unroll ID and its inner ID. + for (auto idx_sel : ir_utils::getOpsOfType(fusion)) { + inner_most_tensors.erase(idx_sel->output(0)->as()); + } + inlineMost(inner_most_tensors); scheduler_utils::promoteProducerMemoryTypes(fusion, cached_inputs);