csrc/apis/attention.hpp (12 additions, 10 deletions)

@@ -53,11 +53,10 @@ static void fp8_gemm_nt_skip_head_mid(const std::pair<torch::Tensor, torch::Tens
         return;
 
     // Transform SFA and SFB into compute-required layout
-    if (not recipe.has_value())
-        recipe = get_default_recipe(a.second.scalar_type(), b.second.scalar_type());
-    DG_HOST_ASSERT(recipe.value() == std::make_tuple(1, 1, 128) or recipe.value() == std::make_tuple(1, 128, 128));
-    const auto& sfa = layout::transform_sf_into_required_layout(a.second, m, k, recipe.value(), std::nullopt, true, disable_ue8m0_cast);
-    const auto& sfb = layout::transform_sf_into_required_layout(b.second, n, k, recipe.value(), std::nullopt, false, disable_ue8m0_cast);
+    const auto& [sfa, sfb, gran_k_a, gran_k_b] = layout::transform_sf_pair_into_required_layout(
+        a.second, b.second, m, n, k, recipe, std::nullopt, std::nullopt,
+        std::nullopt, std::nullopt, disable_ue8m0_cast);
+    DG_HOST_ASSERT(gran_k_a == 128 and gran_k_b == 128);
 
     // Dispatch into different implements
     const auto& arch_major = device_runtime->get_arch_major();
@@ -66,7 +65,9 @@ static void fp8_gemm_nt_skip_head_mid(const std::pair<torch::Tensor, torch::Tens
         const auto& major_sfb = get_major_type_ab(sfb);
         sm90_fp8_gemm_1d2d(a.first, sfa, b.first, sfb, std::nullopt, d, m, n, k, major_a, major_b, major_sfb, compiled_dims, epilogue_type);
     } else if (arch_major == 10 and sfa.scalar_type() == torch::kInt) {
-        sm100_fp8_gemm_1d1d(a.first, sfa, b.first, sfb, std::nullopt, d, m, n, k, major_a, major_b, compiled_dims, epilogue_type);
+        // NOTES: Only granularity 128 and FP8 are exposed in the API
+        sm100_fp8_fp4_gemm_1d1d(a.first, sfa, b.first, sfb, std::nullopt, d, m, n, k,
+                                128, 128, major_a, major_b, compiled_dims, epilogue_type);
     } else {
         DG_HOST_UNREACHABLE("Unsupported architecture or scaling factor types");
     }
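
The hunks above fold the two per-tensor transform calls and the recipe defaulting into a single pair-wise helper that also returns each operand's K granularity. A minimal sketch of what such a helper could look like, reconstructed only from the removed lines; the name `transform_sf_pair_sketch`, the exact parameter list, and the granularity handling are assumptions, not the library's actual implementation:

// Hedged sketch built from the removed per-tensor code path above;
// only transform_sf_into_required_layout and get_default_recipe are real.
static std::tuple<torch::Tensor, torch::Tensor, int, int> transform_sf_pair_sketch(
        const torch::Tensor& raw_sfa, const torch::Tensor& raw_sfb,
        const int& m, const int& n, const int& k,
        std::optional<std::tuple<int, int, int>> recipe,
        const std::optional<int>& batch_size,
        const bool& disable_ue8m0_cast) {
    // Default the recipe from the scaling-factor dtypes, as the old code did
    if (not recipe.has_value())
        recipe = get_default_recipe(raw_sfa.scalar_type(), raw_sfb.scalar_type());

    // Both supported recipes, (1, 1, 128) and (1, 128, 128), share K granularity 128
    const auto& gran_k = std::get<2>(recipe.value());

    // Reuse the old per-tensor transform for each operand
    const auto& sfa = layout::transform_sf_into_required_layout(raw_sfa, m, k, recipe.value(), batch_size, true, disable_ue8m0_cast);
    const auto& sfb = layout::transform_sf_into_required_layout(raw_sfb, n, k, recipe.value(), batch_size, false, disable_ue8m0_cast);
    return {sfa, sfb, gran_k, gran_k};
}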
@@ -229,8 +230,8 @@ static torch::Tensor fp8_paged_mqa_logits(const torch::Tensor& q,
     );
 
     // Allocate output
-    constexpr int num_math_warp_groups = 4;
-    const auto& aligned_max_context_len = align(max_context_len, num_math_warp_groups * block_kv);
+    constexpr int split_kv = 256;
+    const auto& aligned_max_context_len = align(max_context_len, split_kv);
     auto logits = torch::empty({batch_size * next_n, aligned_max_context_len}, q.options().dtype(torch::kFloat));
     logits = logits.slice(-1, 0, max_context_len);
 
@@ -239,7 +240,7 @@ static torch::Tensor fp8_paged_mqa_logits(const torch::Tensor& q,
     if (arch_major == 9 or arch_major == 10) {
         smxx_fp8_paged_mqa_logits(q, kv_cache, kv_cache_scales, weights, context_lens, logits, block_table, schedule_meta,
                                   batch_size, next_n, num_heads, head_dim, num_kv_blocks, block_kv, is_context_lens_2d,
-                                  kv_cache_stride_bytes, aligned_max_context_len, block_table_stride, num_sms, num_math_warp_groups);
+                                  kv_cache_stride_bytes, aligned_max_context_len, block_table_stride, num_sms, split_kv);
     } else {
         DG_HOST_UNREACHABLE("Unsupported architecture");
     }
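
The output padding now uses a fixed `split_kv` of 256 columns instead of `num_math_warp_groups * block_kv`, and the same constant is forwarded to the kernel. A self-contained sketch of the assumed round-up behavior of `align` (only the call shape comes from the hunk; the definition below is an assumption):

#include <cassert>

// Assumed round-up helper matching the align(x, n) calls above
constexpr int align(int x, int n) {
    return (x + n - 1) / n * n;
}

int main() {
    constexpr int split_kv = 256;
    // e.g. a 1000-token max context is padded to 1024 logits columns,
    // then sliced back to 1000 right after allocation
    static_assert(align(1000, split_kv) == 1024);
    assert(align(256, split_kv) == 256);
    return 0;
}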
@@ -251,7 +252,8 @@ static torch::Tensor fp8_paged_mqa_logits(const torch::Tensor& q,
     }
     return logits;
 }
-#endif
+
+#endif
 
 static void register_apis(pybind11::module_& m) {
 #if DG_FP8_COMPATIBLE and DG_TENSORMAP_COMPATIBLE
csrc/apis/einsum.hpp (9 additions, 7 deletions)

@@ -139,7 +139,7 @@ static void fp8_bmm(const torch::Tensor& a, const torch::Tensor& sfa,
                     const torch::Tensor& b, const torch::Tensor& sfb,
                     const torch::Tensor& d,
                     const std::optional<torch::Tensor>& c,
-                    const std::tuple<int, int, int>& recipe,
+                    std::optional<std::tuple<int, int, int>> recipe,
                     const std::string& compiled_dims) {
     // Shape must be `[B, M, K] @ [B, N, K].T`
     const auto& major_a = a.stride(-1) == 1 ? cute::UMMA::Major::K : cute::UMMA::Major::MN;
@@ -163,15 +163,16 @@ static void fp8_bmm(const torch::Tensor& a, const torch::Tensor& sfa,
         return;
 
     // Transform scaling factors
-    const auto& transformed_sfa = layout::transform_sf_into_required_layout(sfa, m, k, recipe, batch_size, true, false);
-    const auto& transformed_sfb = layout::transform_sf_into_required_layout(sfb, n, k, recipe, batch_size, false, false);
+    const auto& [transformed_sfa, transformed_sfb, gran_k_a, gran_k_b] = layout::transform_sf_pair_into_required_layout(
+        sfa, sfb, m, n, k, recipe, std::nullopt, std::nullopt, batch_size, batch_size, false);
 
     // Dispatch implementation
-    const auto& arch_major = device_runtime->get_arch_major();
+    const auto arch_major = device_runtime->get_arch_major();
     if (arch_major == 10) {
         sm100_fp8_bmm(a, transformed_sfa, b, transformed_sfb, c, d, batch_size, m, n, k, major_a, major_b, compiled_dims);
     } else {
-        DG_HOST_UNREACHABLE("Unsupported architecture");
+        const auto& major_sfb = get_major_type_ab(sfb);
+        sm90_fp8_bmm(a, transformed_sfa, b, transformed_sfb, c, d, batch_size, m, n, k, major_a, major_b, major_sfb, compiled_dims);
     }
 }
 
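With `recipe` now `std::optional`, callers may pass `std::nullopt` and rely on dtype-based defaulting, and the SM90 path no longer aborts. A hedged usage sketch; the shapes, FP8 dtype, and scaling-factor layout below are illustrative assumptions, not the documented API:

// Batched [B, M, K] @ [B, N, K].T, per the shape comment in fp8_bmm
const int B = 4, M = 256, N = 512, K = 7168;
const auto& opts = torch::TensorOptions().device(torch::kCUDA);
const auto& a = torch::randn({B, M, K}, opts).to(torch::kFloat8_e4m3fn);
const auto& b = torch::randn({B, N, K}, opts).to(torch::kFloat8_e4m3fn);
// Assumed (1, 128, 128)-style scaling factors: one per 1x128 tile of A, 128x128 tile of B
const auto& sfa = torch::ones({B, M, K / 128}, opts.dtype(torch::kFloat));
const auto& sfb = torch::ones({B, N / 128, K / 128}, opts.dtype(torch::kFloat));
auto d = torch::empty({B, M, N}, opts.dtype(torch::kBFloat16));

// std::nullopt for `c` (no accumulation) and for `recipe` (use dtype defaults)
fp8_bmm(a, sfa, b, sfb, d, std::nullopt, std::nullopt, "nk");
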
@@ -182,6 +183,7 @@ static void fp8_einsum(const std::string& expr,
                        const std::optional<torch::Tensor>& c,
                        const std::tuple<int, int, int>& recipe) {
     // Some hardcoded Einstein sum kernels
+    const auto arch_major = device_runtime->get_arch_major();
     if (expr == "bhr,hdr->bhd") {
         // Permute dims to satisfy the order of (batch_size, m, n, k)
         // (batch_size, m, n, k): (h, b, d, r)
@@ -190,7 +192,7 @@ static void fp8_einsum(const std::string& expr,
         const auto& perm_d = d.permute({1, 0, 2});
         const auto& perm_c = c.has_value() ? std::make_optional(c.value().permute({1, 0, 2})) : std::nullopt;
         fp8_bmm(perm_a, perm_sfa, b.first, b.second, perm_d, perm_c, recipe, "nk");
-    } else if (expr == "bhd,hdr->bhr") {
+    } else if (expr == "bhd,hdr->bhr" and arch_major == 10) {
         // (batch_size, m, n, k): (h, b, r, d)
         const auto& perm_a = a.first.permute({1, 0, 2});
         const auto& perm_sfa = a.second.permute({1, 0, 2});
@@ -199,7 +201,7 @@ static void fp8_einsum(const std::string& expr,
         const auto& perm_d = d.permute({1, 0, 2});
         const auto& perm_c = c.has_value() ? std::make_optional(c.value().permute({1, 0, 2})) : std::nullopt;
         fp8_bmm(perm_a, perm_sfa, perm_b, perm_sfb, perm_d, perm_c, recipe, "nk");
-    } else if (expr == "bhd,bhr->hdr") {
+    } else if (expr == "bhd,bhr->hdr" and arch_major == 10) {
         // (batch_size, m, n, k): (h, d, r, b)
         const auto& perm_a = a.first.permute({1, 2, 0});
         const auto& perm_sfa = a.second.permute({1, 2, 0});
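
Only `bhr,hdr->bhd` stays available on both architectures; the other two contractions are now gated to SM100 (`arch_major == 10`). The (batch_size, m, n, k) comments can be checked independently of the library; a standalone sketch with hypothetical shapes verifying the `bhr,hdr->bhd` mapping used above:

#include <torch/torch.h>
#include <iostream>

int main() {
    const int b = 2, h = 3, r = 16, d = 8;
    const auto& A = torch::randn({b, h, r});
    const auto& B = torch::randn({h, d, r});

    // Reference result straight from the einsum expression
    const auto& ref = torch::einsum("bhr,hdr->bhd", {A, B});

    // Batched [B, M, K] @ [B, N, K].T with (batch_size, m, n, k) = (h, b, d, r):
    // permute A from [b, h, r] to [h, b, r], multiply, and permute back
    const auto& bmm = torch::matmul(A.permute({1, 0, 2}), B.transpose(-1, -2)).permute({1, 0, 2});

    std::cout << torch::allclose(ref, bmm) << std::endl;  // expect 1
    return 0;
}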