Commit 6ab06c5

MasterJH5574 authored and junrushao committed
[TIR] ThreadAllreduce warp-level primitive support with multi-warp (apache#15327)
This PR enhances the implementation of the LowerThreadAllreduce pass. Prior to this PR, for the CUDA backend we leveraged warp-level primitives only when

* the reducing threads form a sub-warp (i.e., of size 16, 8, 4, or 2), or
* the number of reducing threads is less than 32 and equals the reduction extent.

Under these requirements, reductions over a large number of threads (e.g., 128, 256, or more) produced inefficient generated code.

This PR improves the LowerThreadAllreduce pass so that it now generates more efficient CUDA code in such cases, whenever the number of reducing threads is a multiple of the warp size, with the help of warp-level primitives. Specifically, we first reduce the 32 elements within each warp and store each warp's result in shared memory. We then trigger a second round of warp-level primitive reduction within the first warp to obtain the final reduction result.

Besides using warp-level primitives, this also reduces the required shared memory: even when reducing over 1024 threads, we now only need shared memory of size 32, compared with 1024 prior to this PR.

Tests are added to ensure correctness.
1 parent 5eb420a · commit 6ab06c5
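To illustrate the two-stage scheme described in the commit message, below is a minimal hand-written CUDA sketch of a block-wide sum allreduce. This is an illustrative assumption, not the code emitted by LowerThreadAllreduce: the kernel name, the use of a sum reduction, the data layout, and the host harness are all hypothetical; the sketch only assumes that the number of reducing threads (blockDim.x) is a multiple of the warp size (32).

#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical illustration of the two-stage warp-level allreduce:
// stage 1 reduces 32 values inside each warp with shuffles, stage 2
// reduces the per-warp partials inside the first warp. Shared memory
// holds one partial per warp, so even 1024 reducing threads need only
// 32 shared-memory slots.
__global__ void block_allreduce_sum(const float* in, float* out) {
  __shared__ float warp_partials[32];   // at most 32 warps per block
  __shared__ float block_result;

  const int tid = threadIdx.x;
  const int lane = tid & 31;
  const int warp = tid >> 5;
  const int num_warps = blockDim.x >> 5;

  float val = in[blockIdx.x * blockDim.x + tid];

  // Stage 1: reduce 32 elements within each warp using warp shuffles.
  for (int offset = 16; offset > 0; offset >>= 1) {
    val += __shfl_down_sync(0xffffffff, val, offset);
  }
  if (lane == 0) warp_partials[warp] = val;  // one partial per warp
  __syncthreads();

  // Stage 2: the first warp reduces the per-warp partials.
  if (warp == 0) {
    val = (lane < num_warps) ? warp_partials[lane] : 0.0f;
    for (int offset = 16; offset > 0; offset >>= 1) {
      val += __shfl_down_sync(0xffffffff, val, offset);
    }
    if (lane == 0) block_result = val;
  }
  __syncthreads();

  // block_result is now visible to every thread (allreduce semantics);
  // here only thread 0 writes it back.
  if (tid == 0) out[blockIdx.x] = block_result;
}

int main() {
  const int threads = 256, blocks = 1, n = threads * blocks;
  float *in, *out;
  cudaMallocManaged(&in, n * sizeof(float));
  cudaMallocManaged(&out, blocks * sizeof(float));
  for (int i = 0; i < n; ++i) in[i] = 1.0f;
  block_allreduce_sum<<<blocks, threads>>>(in, out);
  cudaDeviceSynchronize();
  printf("sum = %f\n", out[0]);  // expect 256.0
  cudaFree(in);
  cudaFree(out);
  return 0;
}

With this layout the shared-memory footprint is one element per warp (at most 32 entries), which matches the size reduction described in the commit message.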

File tree

4 files changed: +613 −137 lines

python/tvm/tir/op.py

Lines changed: 1 addition & 1 deletion
@@ -616,7 +616,7 @@ def tvm_storage_sync(storage_scope):
     call : PrimExpr
         The call expression.
     """
-    return call_intrin("handle", "tir.tvm_storage_sync", storage_scope)
+    return call_intrin("int32", "tir.tvm_storage_sync", storage_scope)
 
 
 def tvm_warp_shuffle(mask, value, warp_id, width, warp_size):

src/te/operation/cross_thread_reduction.cc

Lines changed: 7 additions & 6 deletions
@@ -181,22 +181,23 @@ Stmt MakeCrossThreadReduction(const ComputeOpNode* self, const Stage& stage,
     freduce_args.push_back(dummy_load);
   }
 
+  // Checks for the thread.
+  std::vector<PrimExpr> output_preds;
+  if (stage->store_predicate.defined()) {
+    output_preds.emplace_back(stage->store_predicate);
+  }
+
   for (IterVar iv : stage->leaf_iter_vars) {
     if (iv->iter_type == kCommReduce) {
       auto it = stage->iter_var_attrs.find(iv);
       if (it != stage->iter_var_attrs.end() && (*it).second->bind_thread.defined()) {
         IterVar tv = (*it).second->bind_thread;
         freduce_args.push_back(tv->var);
+        output_preds.push_back(tv->var == make_const(tv->var->dtype, 0));
       }
     }
   }
 
-  // Checks for the thread.
-  std::vector<PrimExpr> output_preds;
-  if (stage->store_predicate.defined()) {
-    output_preds.emplace_back(stage->store_predicate);
-  }
-
   // Apply the existing input predicate if any.
   output_preds.push_back(input_pred);
