8 changes: 4 additions & 4 deletions .github/workflows/build.yml
@@ -274,13 +274,13 @@ jobs:
path: |
llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip

-windows-latest-cmake-cublas:
+windows-latest-cmake-cuda:
runs-on: windows-latest

strategy:
matrix:
cuda: ['12.1.0', '11.7.1']
-build: ['cublas']
+build: ['cuda']

steps:
- name: Clone
@@ -299,7 +299,7 @@ jobs:
run: |
mkdir build
cd build
-cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
+cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON
cmake --build . --config Release

- name: Get commit hash
@@ -361,7 +361,7 @@ jobs:
- macOS-latest-make
- macOS-latest-cmake
- windows-latest-cmake
-  - windows-latest-cmake-cublas
+  - windows-latest-cmake-cuda

steps:
- name: Download artifacts
10 changes: 5 additions & 5 deletions CMakeLists.txt
@@ -67,7 +67,7 @@ endif()
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF)
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
-option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
+option(LLAMA_CUDA "llama: use CUDA" OFF)
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
@@ -238,18 +238,18 @@ if (LLAMA_K_QUANTS)
endif()
endif()

-if (LLAMA_CUBLAS)
+if (LLAMA_CUDA)
cmake_minimum_required(VERSION 3.17)

find_package(CUDAToolkit)
if (CUDAToolkit_FOUND)
message(STATUS "cuBLAS found")
message(STATUS "CUDA found")

enable_language(CUDA)

set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)

-add_compile_definitions(GGML_USE_CUBLAS)
+add_compile_definitions(GGML_USE_CUDA)
if (LLAMA_CUDA_FORCE_DMMV)
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
endif()
@@ -279,7 +279,7 @@ if (LLAMA_CUBLAS)
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")

else()
message(WARNING "cuBLAS not found")
message(WARNING "CUDA not found")
endif()
endif()
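For orientation, a minimal sketch (not part of this diff) of how downstream sources key CUDA-specific code off the renamed compile definition set above; the build_has_cuda helper is hypothetical, while GGML_USE_CUDA and ggml-cuda.h come from this change:

```cpp
// Sketch only: GGML_USE_CUDA replaces the old GGML_USE_CUBLAS guard.
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"   // CUDA backend declarations built from ggml-cuda.cu
#endif

#include <cstdio>

// Hypothetical helper; the real guarded call sites live in ggml.c and llama.cpp.
static bool build_has_cuda() {
#ifdef GGML_USE_CUDA
    return true;    // configured with -DLLAMA_CUDA=ON (CMake) or LLAMA_CUDA=1 (make)
#else
    return false;   // CPU-only or non-CUDA build
#endif
}

int main() {
    std::printf("CUDA enabled at compile time: %s\n", build_has_cuda() ? "yes" : "no");
    return 0;
}
```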

29 changes: 23 additions & 6 deletions Makefile
@@ -55,6 +55,12 @@ else
CXXFLAGS += -DNDEBUG
endif

+ifdef LLAMA_SANITIZE
+CFLAGS += -g -fsanitize=$(LLAMA_SANITIZE) -fno-omit-frame-pointer
+CXXFLAGS += -g -fsanitize=$(LLAMA_SANITIZE) -fno-omit-frame-pointer
+LDFLAGS += -g -fsanitize=$(LLAMA_SANITIZE)
+endif

ifdef LLAMA_SERVER_VERBOSE
CXXFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
endif
@@ -157,13 +163,16 @@ ifdef LLAMA_BLIS
LDFLAGS += -lblis -L/usr/local/lib
endif # LLAMA_BLIS

-ifdef LLAMA_CUBLAS
-CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
-CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+ifdef LLAMA_CUDA
+CFLAGS += -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+CXXFLAGS += -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
OBJS += ggml-cuda.o
NVCC = nvcc
NVCCFLAGS = --forward-unknown-to-host-compiler
+ifdef LLAMA_DEBUG
+NVCCFLAGS += -lineinfo
Collaborator comment on the -lineinfo addition:
Depending on what the intended use of -lineinfo is, this could be added unconditionally (I don't think it makes a difference for performance). When I use Nsight Compute I typically use it without LLAMA_DEBUG, since that also affects compiler optimizations.

+endif # LLAMA_DEBUG
ifdef CUDA_DOCKER_ARCH
NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
else
@@ -192,10 +201,9 @@ ifdef LLAMA_CUDA_KQUANTS_ITER
else
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
endif

-ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml-cuda-kern.h ggml-cuda-quant.h
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
-endif # LLAMA_CUBLAS
+endif # LLAMA_CUDA

ifdef LLAMA_CLBLAST
CFLAGS += -DGGML_USE_CLBLAST
@@ -262,6 +270,9 @@ $(info I CXXFLAGS: $(CXXFLAGS))
$(info I LDFLAGS: $(LDFLAGS))
$(info I CC: $(CCV))
$(info I CXX: $(CXXV))
+ifdef LLAMA_CUDA
+$(info I NVCC: $(NVCCV))
+endif # LLAMA_CUDA
$(info )

#
@@ -271,6 +282,12 @@ $(info )
ggml.o: ggml.c ggml.h ggml-cuda.h
$(CC) $(CFLAGS) -c $< -o $@

+# temporary, probably will be added to ggml.c
+ggml-backend.o: ggml-backend.c ggml-backend.h ggml.h
+$(CC) $(CFLAGS) -c $< -o $@
+
+OBJS += ggml-backend.o

llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
$(CXX) $(CXXFLAGS) -c $< -o $@

4 changes: 3 additions & 1 deletion examples/simple/simple.cpp
@@ -66,7 +66,7 @@ int main(int argc, char ** argv)
// Init LLM :
//---------------------------------

-llama_init_backend(params.numa);
+llama_backend_init(params.numa);

llama_model * model;
llama_context * ctx;
@@ -173,6 +173,8 @@ int main(int argc, char ** argv)
llama_free( ctx );
llama_free_model( model );

+llama_backend_free();

return 0;
}
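
For reference, a minimal sketch of the renamed backend lifecycle shown in this hunk, assuming the loading calls from llama.h at this revision (llama_load_model_from_file, llama_new_context_with_model); the model path and the elided evaluation step are placeholders:

```cpp
#include "llama.h"

int main() {
    // Renamed from llama_init_backend(); call once before any other llama_* call.
    llama_backend_init(false /* numa */);

    llama_context_params params = llama_context_default_params();

    // Placeholder path; error handling kept minimal for the sketch.
    llama_model * model = llama_load_model_from_file("./model.bin", params);
    if (model == NULL) {
        llama_backend_free();
        return 1;
    }

    llama_context * ctx = llama_new_context_with_model(model, params);

    // ... tokenize and evaluate as simple.cpp does ...

    llama_free(ctx);
    llama_free_model(model);

    // New counterpart to llama_backend_init(); release global backend state last.
    llama_backend_free();
    return 0;
}
```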
