26 changes: 24 additions & 2 deletions Makefile
@@ -33,6 +33,7 @@ endif
# Compile flags
#

BUILD_TYPE?=
# keep standard at C11 and C++11
CFLAGS = -I./llama.cpp -I. -O3 -DNDEBUG -std=c11 -fPIC
CXXFLAGS = -I./llama.cpp -I. -I./llama.cpp/examples -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
@@ -120,6 +121,18 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
endif

ifeq ($(BUILD_TYPE),cublas)
EXTRA_LIBS=
CMAKE_ARGS+="-DLLAMA_CUBLAS=ON"
EXTRA_TARGETS+=llama.cpp/ggml-cuda.o
endif

ifeq ($(BUILD_TYPE),openblas)
EXTRA_LIBS=
CMAKE_ARGS+="-DLLAMA_OPENBLAS=ON"
EXTRA_TARGETS+=
endif

#
# Print build information
#
@@ -131,14 +144,23 @@ $(info I UNAME_M: $(UNAME_M))
$(info I CFLAGS: $(CFLAGS))
$(info I CXXFLAGS: $(CXXFLAGS))
$(info I LDFLAGS: $(LDFLAGS))
$(info I BUILD_TYPE: $(BUILD_TYPE))
$(info I CMAKE_ARGS: $(CMAKE_ARGS))
$(info I EXTRA_TARGETS: $(EXTRA_TARGETS))
$(info I CC: $(CCV))
$(info I CXX: $(CXXV))
$(info )

# Use this if you want to set the default behavior


llama.cpp/ggml.o:
mkdir -p build
cd build && cmake ../llama.cpp $(CMAKE_ARGS) && make VERBOSE=1 ggml && cp -rf CMakeFiles/ggml.dir/ggml.c.o ../llama.cpp/ggml.o

llama.cpp/ggml-cuda.o: llama.cpp/ggml.o
cd build && cp -rf CMakeFiles/ggml.dir/ggml-cuda.cu.o ../llama.cpp/ggml-cuda.o

llama.cpp/llama.o:
$(MAKE) -C llama.cpp llama.o

@@ -148,8 +170,8 @@ llama.cpp/common.o:
binding.o: llama.cpp/ggml.o llama.cpp/llama.o llama.cpp/common.o
$(CXX) $(CXXFLAGS) -I./llama.cpp -I./llama.cpp/examples binding.cpp -o binding.o -c $(LDFLAGS)

libbinding.a: binding.o
ar src libbinding.a llama.cpp/ggml.o llama.cpp/common.o llama.cpp/llama.o binding.o
libbinding.a: binding.o $(EXTRA_TARGETS)
ar src libbinding.a llama.cpp/ggml.o $(EXTRA_TARGETS) llama.cpp/common.o llama.cpp/llama.o binding.o

generic-llama.cpp/ggml.o:
$(MAKE) -C llama.cpp ggml.o
18 changes: 18 additions & 0 deletions README.md
@@ -31,6 +31,24 @@ Now you can run the example with:
LIBRARY_PATH=$PWD C_INCLUDE_PATH=$PWD go run ./examples -m "/model/path/here" -t 14
```

## OpenBLAS acceleration

To build and run with OpenBLAS, for example:

```
BUILD_TYPE=openblas make libbinding.a
CGO_LDFLAGS="-lopenblas" LIBRARY_PATH=$PWD C_INCLUDE_PATH=$PWD go run -tags openblas ./examples -m "/model/path/here" -t 14
```

## GPU acceleration (cuBLAS)

To build with cuBLAS:

```
BUILD_TYPE=cublas make libbinding.a
CGO_LDFLAGS="-lcublas -lcudart -L/usr/local/cuda/lib64/" LIBRARY_PATH=$PWD C_INCLUDE_PATH=$PWD go run ./examples -m "/model/path/here" -t 14
```
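
Once the library is built with `BUILD_TYPE=cublas`, layers can be offloaded to the GPU from Go through the new `SetGPULayers` model option. A minimal sketch, assuming the import path from the documentation link below; the model path and layer count are placeholders:

```go
package main

import (
	"fmt"

	llama "github.com/go-skynet/go-llama.cpp"
)

func main() {
	// Load the model and offload 32 layers to the GPU
	// (placeholder path and layer count).
	l, err := llama.New("/model/path/here", llama.SetGPULayers(32))
	if err != nil {
		panic(err)
	}
	fmt.Println("model loaded with GPU offloading enabled")
	_ = l
}
```

Run it with the same `CGO_LDFLAGS`, `LIBRARY_PATH` and `C_INCLUDE_PATH` settings as the `go run` command above.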

Enjoy!

The documentation is available [here](https://pkg.go.dev/github.com/go-skynet/go-llama.cpp) and the full example code is [here](https://github.com/go-skynet/go-llama.cpp/blob/master/examples/main.go).
4 changes: 3 additions & 1 deletion binding.cpp
@@ -369,7 +369,7 @@ void* llama_allocate_params(const char *prompt, int seed, int threads, int token
}


void* load_model(const char *fname, int n_ctx, int n_parts, int n_seed, bool memory_f16, bool mlock, bool embeddings) {
void* load_model(const char *fname, int n_ctx, int n_parts, int n_seed, bool memory_f16, bool mlock, bool embeddings, int n_gpu_layers) {
// load the model
auto lparams = llama_context_default_params();

@@ -379,6 +379,8 @@ void* load_model(const char *fname, int n_ctx, int n_parts, int n_seed, bool mem
lparams.f16_kv = memory_f16;
lparams.embedding = embeddings;
lparams.use_mlock = mlock;
lparams.n_gpu_layers = n_gpu_layers;

void* res = nullptr;
try {
res = llama_init_from_file(fname, lparams);
2 changes: 1 addition & 1 deletion binding.h
@@ -8,7 +8,7 @@ extern "C" {

extern unsigned char tokenCallback(void *, char *);

void* load_model(const char *fname, int n_ctx, int n_parts, int n_seed, bool memory_f16, bool mlock, bool embeddings);
void* load_model(const char *fname, int n_ctx, int n_parts, int n_seed, bool memory_f16, bool mlock, bool embeddings, int n_gpu);

int get_embeddings(void* params_ptr, void* state_pr, float * res_embeddings);

2 changes: 1 addition & 1 deletion llama.cpp
2 changes: 1 addition & 1 deletion llama.go
@@ -22,7 +22,7 @@ type LLama struct {
func New(model string, opts ...ModelOption) (*LLama, error) {
mo := NewModelOptions(opts...)
modelPath := C.CString(model)
result := C.load_model(modelPath, C.int(mo.ContextSize), C.int(mo.Parts), C.int(mo.Seed), C.bool(mo.F16Memory), C.bool(mo.MLock), C.bool(mo.Embeddings))
result := C.load_model(modelPath, C.int(mo.ContextSize), C.int(mo.Parts), C.int(mo.Seed), C.bool(mo.F16Memory), C.bool(mo.MLock), C.bool(mo.Embeddings), C.int(mo.NGPULayers))
if result == nil {
return nil, fmt.Errorf("failed loading model")
}
9 changes: 9 additions & 0 deletions llama_cublas.go
@@ -0,0 +1,9 @@
//go:build cublas
// +build cublas

package llama

/*
#cgo LDFLAGS: -lcublas -lcudart -L/usr/local/cuda/lib64/
*/
import "C"
9 changes: 9 additions & 0 deletions llama_openblas.go
@@ -0,0 +1,9 @@
//go:build openblas
// +build openblas

package llama

/*
#cgo LDFLAGS: -lopenblas
*/
import "C"
8 changes: 8 additions & 0 deletions options.go
@@ -7,6 +7,7 @@ type ModelOptions struct {
F16Memory bool
MLock bool
Embeddings bool
NGPULayers int
}

type PredictOptions struct {
@@ -113,6 +114,13 @@ var IgnoreEOS PredictOption = func(p *PredictOptions) {
p.IgnoreEOS = true
}

// SetGPULayers sets the number of model layers to offload to the GPU.
func SetGPULayers(n int) ModelOption {
return func(p *ModelOptions) {
p.NGPULayers = n
}
}

// SetTokenCallback sets the callback that is invoked for each token produced during prediction.
func SetTokenCallback(fn func(string) bool) PredictOption {
return func(p *PredictOptions) {