4 changes: 2 additions & 2 deletions ggml/src/CMakeLists.txt
@@ -258,8 +258,8 @@ set (GGML_HEADERS_IQK iqk/iqk_config.h)
if (GGML_IQK_MUL_MAT)
message(STATUS "Using optimized iqk matrix multiplications")
add_compile_definitions(GGML_USE_IQK_MULMAT)
set(GGML_SOURCES_IQK_MM iqk/iqk_mul_mat.cpp)
set(GGML_HEADERS_IQK_MM iqk/iqk_mul_mat.h)
set(GGML_SOURCES_IQK_MM iqk/iqk_mul_mat.cpp iqk/iqk_flash_attn.cpp)
set(GGML_HEADERS_IQK_MM iqk/iqk_mul_mat.h iqk/iqk_flash_impl.h)
if (GGML_IQK_FA_ALL_QUANTS)
message(STATUS "Including all IQK FA kernels")
add_compile_definitions(GGML_IQK_FA_ALL_QUANTS)
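
A note on the build wiring above: with GGML_IQK_MUL_MAT enabled, the flash-attention code now lives in its own translation unit (iqk/iqk_flash_attn.cpp, internal interface in iqk/iqk_flash_impl.h) instead of being compiled into iqk_mul_mat.cpp alone. GGML_IQK_FA_ALL_QUANTS additionally pulls in FA kernels for the full set of quantization types; assuming the usual CMake option convention, it would be switched on at configure time with -DGGML_IQK_FA_ALL_QUANTS=ON.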
112 changes: 72 additions & 40 deletions ggml/src/ggml.c
@@ -17870,46 +17870,57 @@ static void ggml_compute_forward_flash_attn_ext_f16(
}

#if GGML_USE_IQK_MULMAT
if (max_bias <= 0.0f && q->type == GGML_TYPE_F32 && mask && mask->type == GGML_TYPE_F16) {
//if (ith == 0) printf("k: %ld x %ld x %ld, q: %ld x %ld x %ld, v: %ld x %ld x %ld mask: %ld x %ld x %ld\n",
// k->ne[0], k->ne[1], k->ne[2], q->ne[0], q->ne[1], q->ne[2], v->ne[0], v->ne[1], v->ne[2], mask->ne[0], mask->ne[1], mask->ne[2]);
// I keep changing my mind about the best strategy for splitting the threads when processing
// multiple heads. This is my current thinking; the commented-out code below was the previous one.
int ntg = nth/simple_gcd(neq2*neq3, nth);
int64_t neq1g = (neq1 + ntg - 1)/ntg;
//int64_t work_per_slice = D*nek1*neq1;
//int ntg = 1;
//
// When neq1 is large, it is better to have more than one thread process one (iq2,iq3) matrix.
// But we also want each thread to process the same number of rows, so neq1 must be a multiple of
// the number of threads processing the (iq2, iq3) matrix.
//
//if (neq1 >= 8*nth) {
// if (nth%8 == 0 && neq1%8 == 0 && work_per_slice >= (1 << 23)) ntg = 8;
// else if (nth%4 == 0 && neq1%4 == 0 && work_per_slice >= (1 << 21)) ntg = 4;
// else if (nth%2 == 0 && neq1%2 == 0 && work_per_slice >= (1 << 19)) ntg = 2;
//}
int counter = 0;
for (int64_t iq3 = 0; iq3 < neq3; iq3++) {
for (int64_t iq2 = 0; iq2 < neq2; iq2++) {
if (counter++ % (nth/ntg) == ith/ntg) {
int iq1 = (ith%ntg)*neq1g;
int this_neq1 = MIN(neq1g, neq1-iq1);
if (!iqk_flash_attn_noalibi(k->type, v->type,
Dk, Dv, this_neq1, nek1, q->nb[1], k->nb[1], v->nb[1], mask->nb[1], ne1*nb1/sizeof(float),
(const float *)((const char *)q->data + iq2*q->nb[2] + iq3*q->nb[3] + iq1*q->nb[1]),
(const void *)((const char *)k->data + iq2/rk2*k->nb[2] + iq3/rk3*k->nb[3]),
(const void *)((const char *)v->data + iq2/rv2*v->nb[2] + iq3/rv3*v->nb[3]),
(const void *)((const char *)mask->data + iq1*mask->nb[1]),
scale, softcap,
(float *)((char *) dst->data + (iq3*ne2*ne1 + iq2 + iq1*ne1)*nb1))) goto IQK_Flash_Attn_NotAvailable;
}
}
}
return;
IQK_Flash_Attn_NotAvailable:;
printf("iqk_flash was rejected\n");
}
if (iqk_flash_attn_noalibi(q->type, mask->type, max_bias,
q->ne[3], q->ne[2], q->nb[3], q->nb[2],
k->ne[3], k->ne[2], k->nb[3], k->nb[2],
v->ne[3], v->ne[2], v->nb[3], v->nb[2],
dst->ne[2], dst->ne[1], dst->nb[1],
k->type, v->type,
Dk, Dv, neq1, nek1, q->nb[1], k->nb[1], v->nb[1], mask->nb[1],
q->data, k->data, v->data, mask->data,
scale, softcap, (float *)dst->data,
params->wdata, (barrier_t)ggml_barrier, (void *)params->shared, ith, nth)) return;

// if (max_bias <= 0.0f && q->type == GGML_TYPE_F32 && mask && mask->type == GGML_TYPE_F16) {
// //if (ith == 0) printf("k: %ld x %ld x %ld, q: %ld x %ld x %ld, v: %ld x %ld x %ld mask: %ld x %ld x %ld\n",
// // k->ne[0], k->ne[1], k->ne[2], q->ne[0], q->ne[1], q->ne[2], v->ne[0], v->ne[1], v->ne[2], mask->ne[0], mask->ne[1], mask->ne[2]);
// // I keep changing my mind about the best strategy for splitting the threads when processing
// // multiple heads. This is my current thinking; the commented-out code below was the previous one.
// int ntg = nth/simple_gcd(neq2*neq3, nth);
// int64_t neq1g = (neq1 + ntg - 1)/ntg;
// //int64_t work_per_slice = D*nek1*neq1;
// //int ntg = 1;
// //
// // When neq1 is large, it is better to have more than one thread process one (iq2,iq3) matrix.
// // But we also want each thread to process the same number of rows, so neq1 must be a multiple of
// // the number of threads processing the (iq2, iq3) matrix.
// //
// //if (neq1 >= 8*nth) {
// // if (nth%8 == 0 && neq1%8 == 0 && work_per_slice >= (1 << 23)) ntg = 8;
// // else if (nth%4 == 0 && neq1%4 == 0 && work_per_slice >= (1 << 21)) ntg = 4;
// // else if (nth%2 == 0 && neq1%2 == 0 && work_per_slice >= (1 << 19)) ntg = 2;
// //}
// int counter = 0;
// for (int64_t iq3 = 0; iq3 < neq3; iq3++) {
// for (int64_t iq2 = 0; iq2 < neq2; iq2++) {
// if (counter++ % (nth/ntg) == ith/ntg) {
// int iq1 = (ith%ntg)*neq1g;
// int this_neq1 = MIN(neq1g, neq1-iq1);
// if (!iqk_flash_attn_noalibi(k->type, v->type,
// Dk, Dv, this_neq1, nek1, q->nb[1], k->nb[1], v->nb[1], mask->nb[1], ne1*nb1/sizeof(float),
// (const float *)((const char *)q->data + iq2*q->nb[2] + iq3*q->nb[3] + iq1*q->nb[1]),
// (const void *)((const char *)k->data + iq2/rk2*k->nb[2] + iq3/rk3*k->nb[3]),
// (const void *)((const char *)v->data + iq2/rv2*v->nb[2] + iq3/rv3*v->nb[3]),
// (const void *)((const char *)mask->data + iq1*mask->nb[1]),
// scale, softcap,
// (float *)((char *) dst->data + (iq3*ne2*ne1 + iq2 + iq1*ne1)*nb1))) goto IQK_Flash_Attn_NotAvailable;
// }
// }
// }
// return;
//IQK_Flash_Attn_NotAvailable:;
// printf("iqk_flash was rejected\n");
// }
#endif

const uint32_t n_head = neq2;
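
The substantive change in this hunk: the per-head thread-splitting logic (preserved above as the commented-out block) moves out of ggml.c and into the iqk library. The new entry point receives the full 4D shapes and strides of q/k/v/mask plus the per-op scratch buffer and a barrier callback, so it can choose its own work distribution and synchronize threads internally; if it returns false, execution falls through to the generic F16 path below. A declaration sketch, inferred purely from the call site above (the authoritative prototype lives in the iqk headers and may differ in exact types):

    bool iqk_flash_attn_noalibi(int type_q, int type_mask, float max_bias,
            int64_t neq3, int64_t neq2, size_t nbq3, size_t nbq2,  // Q batch/head counts and strides
            int64_t nek3, int64_t nek2, size_t nbk3, size_t nbk2,  // K
            int64_t nev3, int64_t nev2, size_t nbv3, size_t nbv2,  // V
            int64_t ne2,  int64_t ne1,  size_t nb1,                // dst head count, rows, row stride
            int type_k, int type_v, int Dk, int Dv,                // K/V types and head sizes
            int nq1, int nk1, size_t nbq1, size_t nbk1, size_t nbv1, size_t nbm1,
            const void * q, const void * k, const void * v, const void * mask,
            float scale, float softcap, float * qkv,
            void * work_buffer, barrier_t barrier, void * barrier_data,
            int ith, int nth);                                     // thread index and count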
@@ -21534,6 +21545,27 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads)
const int64_t D = MAX(Dk, Dv);

cur = 3*sizeof(float)*D*n_tasks; // 3x head size/thread
#if GGML_USE_IQK_MULMAT
const struct ggml_tensor * q = node->src[0];
const struct ggml_tensor * k = node->src[1];
if (q->ne[1] == 1 && q->ne[3] == 1 && q->ne[2]/k->ne[2] > 1 && n_tasks > 1 && k->ne[1]/32 > 1) {
int nstep_k = k->ne[1]/32;
int gcd_k = simple_gcd(nstep_k, n_tasks);
if (gcd_k > 1) {
int nth_k = n_tasks/gcd_k;
int rk2 = q->ne[2]/k->ne[2];
if (rk2%nth_k == 0) {
size_t size = (Dv + 16)*rk2/nth_k*sizeof(float)*n_tasks;
if (ggml_is_quantized(k->type)) {
enum ggml_type vec_dot_type = type_traits[k->type].vec_dot_type;
size_t row_size = ggml_row_size(vec_dot_type, q->ne[0]);
size += q->ne[2]*row_size;
}
cur = MAX(cur, size);
}
}
}
#endif
} break;
case GGML_OP_FLASH_ATTN_BACK:
{
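To make the new planning branch in ggml_graph_plan concrete, a worked example with assumed (purely illustrative) values: single-token decode (q->ne[1] == 1, q->ne[3] == 1) with 32 Q heads over 4 KV heads (rk2 = 32/4 = 8), a K cache of 4096 tokens, and n_tasks = 16. Then nstep_k = 4096/32 = 128, gcd_k = gcd(128, 16) = 16 > 1, nth_k = 16/16 = 1, and rk2 % nth_k == 0, so with Dv = 128 the reserved scratch becomes (128 + 16)*8*sizeof(float)*16 = 73728 bytes instead of the default 3*sizeof(float)*128*16 = 24576. The extra 16 floats per row presumably hold per-slice softmax bookkeeping (running max/sum), and for quantized K the plan adds one quantized row per Q head for the converted activations.
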
138 changes: 138 additions & 0 deletions ggml/src/iqk/iqk_common.h
@@ -0,0 +1,138 @@
// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp fenc=utf-8 :vi
//
//
// Copyright (C) 2024 Iwan Kawrakow
// MIT license
// SPDX-License-Identifier: MIT
//

#include "iqk_config.h"

#if defined IQK_IMPLEMENT

#include <cstring>
#include <type_traits>
#include <vector>

#include "ggml-impl.h"
#include "ggml-quants.h"
#include "iqk_mul_mat.h"
#include "iqk_quantize.h"

#define GGML_COMMON_IMPL_C
#include "ggml-common.h"

#define FA_TIMING 0

#include <utility>
#include <array>
#if FA_TIMING
#include <chrono>
#include <mutex>
struct Perf {
using TimePoint = std::chrono::time_point<std::chrono::high_resolution_clock>;
std::array<double, 5> times = {};
std::mutex mutex;
bool report;
static auto cur_time() { return std::chrono::high_resolution_clock::now(); }
inline void accum(int what, const TimePoint& t1) {
auto t2 = cur_time();
auto dt = delta(t1, t2);
std::lock_guard<std::mutex> lock(mutex);
times[what] += dt;
}
inline void accum_nolock(int what, const TimePoint& t1) {
auto t2 = cur_time();
auto dt = delta(t1, t2);
times[what] += dt;
}
inline void add(const Perf& other) {
std::lock_guard<std::mutex> lock(mutex);
for (int i = 0; i < int(times.size()); ++i) times[i] += other.times[i];
}
Perf(bool r) : report(r) {}
~Perf() {
if (report) {
double tot = 0;
for (auto& t : times) tot += t;
if (!tot) return;
printf("======================= Timing: %g ms in total\n", tot);
for (int i = 0; i < int(times.size()); ++i) {
if (times[i]) {
printf("%d: %g ms -> %g%c\n", i, times[i], 100*times[i]/tot, '%');
}
}
}
}
static Perf& instance() {
static Perf p(true);
return p;
}
static double delta(const TimePoint& t1, const TimePoint& t2) {
return 1e-6*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
}
};
#endif
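
// Perf above is a compile-time-optional profiler: each slot of `times`
// accumulates milliseconds, and the static instance(true) prints a
// percentage breakdown from its destructor at program exit.
// A minimal usage sketch, assuming FA_TIMING is set to 1:
//
//     auto t1 = Perf::cur_time();
//     // ... section to be timed ...
//     Perf::instance().accum(0, t1);   // thread-safe: takes the internal mutex
//
// Per-thread instances constructed with Perf(false) can use accum_nolock()
// and be merged into the reporting instance later with add().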

#ifdef __AVX2__
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
#endif

namespace {

typedef struct {
int32_t i1;
int32_t i2;
} mmid_row_mapping;

struct DataInfo {
float * s;
const char * cy;
size_t bs;
size_t by;
int cur_y = 0;
int ne11;
const mmid_row_mapping * row_mapping = nullptr;
size_t bs2 = 0;

inline const char * src1_row(int iy) const {
if (!row_mapping) return cy + (cur_y + iy)*by;
int i11 = row_mapping[cur_y + iy].i1 % ne11;
int i12 = row_mapping[cur_y + iy].i2;
return cy + (i11 + i12*ne11)*by;
}

inline void store(int ix, int iy, float result) const {
*(dst_row(iy) + ix) = result;
}
#ifdef __AVX__
inline void store(int ix, int iy, __m128 result) const {
_mm_storeu_ps(dst_row(iy) + ix, result);
}
inline void store(int ix, int iy, __m256 result) const {
_mm256_storeu_ps(dst_row(iy) + ix, result);
}
#endif
#ifdef __AVX512F__
inline void store(int ix, int iy, __m512 result) const {
_mm512_storeu_ps(dst_row(iy) + ix, result);
}
#endif
#ifdef __ARM_NEON
inline void store(int ix, int iy, float32x4_t result) const {
vst1q_f32(dst_row(iy) + ix, result);
}
#endif
inline float * dst_row(int iy) const {
if (!row_mapping) return s + (cur_y + iy)*bs;
int i12 = row_mapping[cur_y + iy].i2;
int i1 = row_mapping[cur_y + iy].i1;
int i2 = i12;
return s + i1*bs + i2*bs2;
}
};

typedef void (*mul_mat_t)(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x);
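
// DataInfo bundles everything a kernel needs to address the activations and
// the destination: src1_row()/dst_row() resolve either a plain strided layout
// or, when row_mapping is set (the mul_mat_id / MoE case), an indirection
// through the (i1, i2) row mappings, and the store() overloads let one kernel
// body write scalar, AVX, AVX-512, or NEON results. A hypothetical kernel
// matching mul_mat_t, for illustration only (real kernels operate on
// quantized blocks):
//
//     static void mul_mat_f32_example(int n, const void * vx, size_t bx,
//                                     const DataInfo& info, int nrc_x) {
//         const float * y = (const float *)info.src1_row(0);   // first activation row
//         for (int ix = 0; ix < nrc_x; ++ix) {                 // nrc_x rows of x, stride bx bytes
//             const float * x = (const float *)((const char *)vx + ix*bx);
//             float sum = 0;
//             for (int j = 0; j < n; ++j) sum += x[j]*y[j];
//             info.store(ix, 0, sum);                          // scalar overload; SIMD overloads above
//         }
//     }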

#endif