Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "llama.h"

#include <algorithm>
#include <chrono>
#include <cinttypes>
#include <climits>
#include <cmath>
Expand Down
287 changes: 271 additions & 16 deletions convert_hf_to_gguf.py

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions ggml/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,8 @@ set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
# toolchain for vulkan-shaders-gen
set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")

option(GGML_TMAC "ggml: use TMAC" OFF)

# extra artifacts
option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
Expand All @@ -217,6 +219,9 @@ option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
#

set(CMAKE_C_STANDARD 11)
if (GGML_TMAC)
# T-MAC sources need a newer C standard than the project default of C11.
# NOTE(review): presumably for C17 library/atomics cleanups — TODO confirm the
# exact feature that fails under C11 and document it here.
set(CMAKE_C_STANDARD 17)
endif()
set(CMAKE_C_STANDARD_REQUIRED true)

set(CMAKE_CXX_STANDARD 17)
Expand Down
4 changes: 4 additions & 0 deletions ggml/include/ggml-cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ extern "C" {
GGML_BACKEND_API int ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
GGML_BACKEND_API void ggml_threadpool_atomic_store_explicit(struct ggml_threadpool * threadpool, int value);
GGML_BACKEND_API int ggml_threadpool_atomic_fetch_add_explicit(struct ggml_threadpool * threadpool, int value);

// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
Expand Down Expand Up @@ -120,6 +122,8 @@ extern "C" {

GGML_BACKEND_API void ggml_cpu_init(void);

GGML_BACKEND_API void ggml_cpu_tmac_init(const char * fname);

//
// CPU backend
//
Expand Down
11 changes: 10 additions & 1 deletion ggml/include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -388,7 +388,16 @@ extern "C" {
// GGML_TYPE_IQ4_NL_4_4 = 36,
// GGML_TYPE_IQ4_NL_4_8 = 37,
// GGML_TYPE_IQ4_NL_8_8 = 38,
GGML_TYPE_COUNT = 39,
GGML_TYPE_TMAC_BN_0 = 39,
GGML_TYPE_TMAC_W2G64_0 = 40,
GGML_TYPE_TMAC_W2G64_1 = 41,
GGML_TYPE_TMAC_W2G128_0 = 42,
GGML_TYPE_TMAC_W2G128_1 = 43,
GGML_TYPE_TMAC_W4G64_0 = 44,
GGML_TYPE_TMAC_W4G64_1 = 45,
GGML_TYPE_TMAC_W4G128_0 = 46,
GGML_TYPE_TMAC_W4G128_1 = 47,
GGML_TYPE_COUNT = 48,
};

// precision
Expand Down
24 changes: 24 additions & 0 deletions ggml/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,7 @@ add_library(ggml-base
ggml.c
ggml-alloc.c
ggml-backend.cpp
ggml-common.h
ggml-opt.cpp
ggml-threading.cpp
ggml-threading.h
Expand All @@ -211,6 +212,29 @@ endif()
add_library(ggml
ggml-backend-reg.cpp)

# T-MAC sources are compiled as part of the CPU backend instead of ggml-base;
# see the GGML_TMAC handling in ggml-cpu/CMakeLists.txt.

target_link_libraries(ggml PUBLIC ggml-base)

if (CMAKE_SYSTEM_NAME MATCHES "Linux")
Expand Down
61 changes: 60 additions & 1 deletion ggml/src/ggml-cpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
ggml-cpu/amx/amx.h
ggml-cpu/amx/mmq.cpp
ggml-cpu/amx/mmq.h
ggml-cpu/tmac/tmac.cpp
ggml-cpu/tmac/tmac.h
ggml-cpu/tmac/lut_mul_mat.cpp
ggml-cpu/tmac/lut_mul_mat.h
ggml-cpu/tmac/lut_ctor.cpp
ggml-cpu/tmac/lut_ctor.h
ggml-cpu/tmac/tbl.cpp
ggml-cpu/tmac/tbl.h
ggml-cpu/ggml-cpu-impl.h
ggml-cpu/common.h
ggml-cpu/binary-ops.h
Expand Down Expand Up @@ -72,6 +80,36 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
ggml-cpu/llamafile/sgemm.h)
endif()

if (GGML_TMAC)
# Enable the T-MAC look-up-table kernels and export the include dir
# (PUBLIC so dependents can find the tmac headers).
target_compile_definitions(${GGML_CPU_NAME} PUBLIC GGML_USE_TMAC)
target_include_directories(${GGML_CPU_NAME} PUBLIC ggml-cpu/tmac)

# T-MAC builds only with Clang for both C and C++; fail fast otherwise.
# Variable names are used unquoted: if() dereferences them itself, and the
# ${...} form breaks when the compiler ID is empty.
if ((NOT CMAKE_C_COMPILER_ID MATCHES "Clang") OR
    (NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang"))
message(FATAL_ERROR "Clang is required for T-MAC compilation")
endif()

# Optional: re-chunk work distribution inside the T-MAC kernels.
if (GGML_TMAC_RECHUNK)
target_compile_definitions(${GGML_CPU_NAME} PRIVATE TMAC_RECHUNK)
endif()
endif()

if (GGML_CPU_HBM)
find_library(memkind memkind REQUIRED)

Expand Down Expand Up @@ -145,6 +183,12 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH})
endif()
endif()
if (GGML_TMAC)
# ARM Windows with LLVM clang GNU interface.
# T-MAC kernels require the fullfp16 extension (Armv8.2-A or later).
# Spelled "armv8.2-a" (dashed): GCC accepts only the dashed form and it is
# the spelling already used elsewhere in this file; Clang accepts both.
# TODO: replace with a check_cxx_source_compiles probe.
list(APPEND ARCH_FLAGS -march=armv8.2-a+fp16)
endif()

# show enabled features
if (CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
Expand Down Expand Up @@ -181,7 +225,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
if (GGML_NATIVE)
include(ggml-cpu/cmake/FindSIMD.cmake)
endif ()
if (GGML_AVX512)
# Can't use GGML_AVX512 with T-MAC and Clang for MSVC
# with error: conflicting types for '_m_prefetchw
if (GGML_AVX512 AND (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang") AND (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang"))
list(APPEND ARCH_FLAGS /arch:AVX512)
# /arch:AVX512 includes: __AVX512F__, __AVX512CD__, __AVX512BW__, __AVX512DQ__, and __AVX512VL__
# MSVC has no compile-time flags enabling specific
Expand Down Expand Up @@ -323,6 +369,19 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
list(APPEND ARCH_FLAGS -mcpu=${GGML_CPU_POWERPC_CPUTYPE})
endif()
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64" AND GGML_TMAC)
# T-MAC needs fullfp16; pick the highest arch level the compiler supports.
# TODO: simplify this logic through check_cxx_source_compiles or presets.
# Probe for the i8mm matmul intrinsic. NOTE(review): the original probe
# called vmlaq_f32 on int8x16_t arguments, which never exercises i8mm;
# vmmlaq_s32 is the intrinsic this check is meant to detect.
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
# Device with armv8.7a+ cpu, e.g., WSL on Surface Laptop 7
# (based on arm64-windows-llvm.cmake).
list(APPEND ARCH_FLAGS -march=armv8.7-a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only)
# NOTE(review): hand-defining a compiler-reserved feature macro mirrors
# arm64-windows-llvm.cmake, but relies on the flags above enabling i8mm.
add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
else ()
# Armv8.2-A devices, e.g. Jetson AGX Orin, Raspberry Pi 5.
# Dashed spelling for consistency with the branch above and GCC support.
list(APPEND ARCH_FLAGS -march=armv8.2-a+fp16)
endif ()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
message(STATUS "loongarch64 detected")

Expand Down
62 changes: 61 additions & 1 deletion ggml/src/ggml-cpu/ggml-cpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@
#include "llamafile/sgemm.h"
#endif

#ifdef GGML_USE_TMAC
#include "tmac.h"
#endif

#if defined(_MSC_VER)
// disable "possible loss of data" to avoid hundreds of casts
// we should just be careful :)
Expand Down Expand Up @@ -373,7 +377,51 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
};
// T-MAC quantized weight types. Each entry registers only an f32 vec_dot
// fallback (vec_dot_type = GGML_TYPE_F32, nrows = 1); the real mat-mul path
// for these types is presumably handled by the T-MAC LUT kernels elsewhere —
// TODO(review): confirm these fallbacks are never hit on the hot path.
[GGML_TYPE_TMAC_BN_0] = {
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
.vec_dot_type = GGML_TYPE_F32,
.nrows = 1,
},
[GGML_TYPE_TMAC_W2G64_0] = {
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
.vec_dot_type = GGML_TYPE_F32,
.nrows = 1,
},
[GGML_TYPE_TMAC_W2G64_1] = {
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
.vec_dot_type = GGML_TYPE_F32,
.nrows = 1,
},
[GGML_TYPE_TMAC_W2G128_0] = {
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
.vec_dot_type = GGML_TYPE_F32,
.nrows = 1,
},
[GGML_TYPE_TMAC_W2G128_1] = {
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
.vec_dot_type = GGML_TYPE_F32,
.nrows = 1,
},
[GGML_TYPE_TMAC_W4G64_0] = {
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
.vec_dot_type = GGML_TYPE_F32,
.nrows = 1,
},
[GGML_TYPE_TMAC_W4G64_1] = {
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
.vec_dot_type = GGML_TYPE_F32,
.nrows = 1,
},
[GGML_TYPE_TMAC_W4G128_0] = {
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
.vec_dot_type = GGML_TYPE_F32,
.nrows = 1,
},
[GGML_TYPE_TMAC_W4G128_1] = {
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
.vec_dot_type = GGML_TYPE_F32,
.nrows = 1,
},};

const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
return &type_traits_cpu[type];
Expand Down Expand Up @@ -2639,6 +2687,14 @@ void ggml_threadpool_resume(struct ggml_threadpool * threadpool) {
#endif
}

// Relaxed atomic store to the threadpool's current_chunk counter.
// Exposed in the public API so external code (e.g. the T-MAC kernels added in
// this change) can reset chunked work distribution without access to the
// otherwise-opaque struct ggml_threadpool layout.
void ggml_threadpool_atomic_store_explicit(struct ggml_threadpool * threadpool, int value) {
atomic_store_explicit(&threadpool->current_chunk, value, memory_order_relaxed);
}

// Relaxed atomic fetch-add on the threadpool's current_chunk counter.
// Returns the counter's value *before* the addition (fetch-add semantics),
// letting callers claim the next work chunk. Relaxed ordering: the counter is
// used only for chunk claiming, with no other data published through it.
int ggml_threadpool_atomic_fetch_add_explicit(struct ggml_threadpool * threadpool, int value) {
return (int)atomic_fetch_add_explicit(&threadpool->current_chunk, value, memory_order_relaxed);
}

struct ggml_cplan ggml_graph_plan(
const struct ggml_cgraph * cgraph,
int n_threads,
Expand Down Expand Up @@ -3406,6 +3462,10 @@ void ggml_cpu_init(void) {
ggml_init_arm_arch_features();
#endif

#ifdef GGML_USE_TMAC
ggml_tmac_init();
#endif

is_first_call = false;
}

Expand Down
7 changes: 7 additions & 0 deletions ggml/src/ggml-cpu/ggml-cpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "ggml-cpu-traits.h"
#include "ggml-impl.h"
#include "amx/amx.h"
#include "tmac/tmac.h"

#include <cctype>
#include <string>
Expand Down Expand Up @@ -43,6 +44,12 @@ std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type
}
#endif

#ifdef GGML_USE_TMAC
if (ggml_backend_tmac_buffer_type()) {
bufts.push_back(ggml_backend_tmac_buffer_type());
}
#endif

#ifdef GGML_USE_CPU_KLEIDIAI
if (ggml_backend_cpu_kleidiai_buffer_type()) {
bufts.push_back(ggml_backend_cpu_kleidiai_buffer_type());
Expand Down
9 changes: 9 additions & 0 deletions ggml/src/ggml-cpu/ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4978,6 +4978,15 @@ void ggml_compute_forward_clamp(
case GGML_TYPE_I32:
case GGML_TYPE_I64:
case GGML_TYPE_F64:
case GGML_TYPE_TMAC_BN_0:
case GGML_TYPE_TMAC_W2G64_0:
case GGML_TYPE_TMAC_W2G64_1:
case GGML_TYPE_TMAC_W2G128_0:
case GGML_TYPE_TMAC_W2G128_1:
case GGML_TYPE_TMAC_W4G64_0:
case GGML_TYPE_TMAC_W4G64_1:
case GGML_TYPE_TMAC_W4G128_0:
case GGML_TYPE_TMAC_W4G128_1:
case GGML_TYPE_COUNT:
{
GGML_ABORT("fatal error");
Expand Down
Loading