26 changes: 24 additions & 2 deletions Makefile
@@ -33,6 +33,7 @@ endif
# Compile flags
#

BUILD_TYPE?=
# keep standard at C11 and C++11
CFLAGS = -I./llama.cpp -I. -O3 -DNDEBUG -std=c11 -fPIC
CXXFLAGS = -I./llama.cpp -I. -I./llama.cpp/examples -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
@@ -120,6 +121,18 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
endif

ifeq ($(BUILD_TYPE),cublas)
EXTRA_LIBS=
CMAKE_ARGS+="-DLLAMA_CUBLAS=ON"
EXTRA_TARGETS+=llama.cpp/ggml-cuda.o
endif

ifeq ($(BUILD_TYPE),openblas)
EXTRA_LIBS=
CMAKE_ARGS+="-DLLAMA_OPENBLAS=ON"
EXTRA_TARGETS+=
endif

#
# Print build information
#
@@ -131,14 +144,23 @@ $(info I UNAME_M: $(UNAME_M))
$(info I CFLAGS: $(CFLAGS))
$(info I CXXFLAGS: $(CXXFLAGS))
$(info I LDFLAGS: $(LDFLAGS))
$(info I BUILD_TYPE: $(BUILD_TYPE))
$(info I CMAKE_ARGS: $(CMAKE_ARGS))
$(info I EXTRA_TARGETS: $(EXTRA_TARGETS))
$(info I CC: $(CCV))
$(info I CXX: $(CXXV))
$(info )

# Use this if you want to set the default behavior


llama.cpp/ggml.o:
mkdir -p build
cd build && cmake ../llama.cpp $(CMAKE_ARGS) && make VERBOSE=1 ggml && cp -rf CMakeFiles/ggml.dir/ggml.c.o ../llama.cpp/ggml.o

llama.cpp/ggml-cuda.o: llama.cpp/ggml.o
cd build && cp -rf CMakeFiles/ggml.dir/ggml-cuda.cu.o ../llama.cpp/ggml-cuda.o

llama.cpp/llama.o:
$(MAKE) -C llama.cpp llama.o

@@ -148,8 +170,8 @@ llama.cpp/common.o:
binding.o: llama.cpp/ggml.o llama.cpp/llama.o llama.cpp/common.o
$(CXX) $(CXXFLAGS) -I./llama.cpp -I./llama.cpp/examples binding.cpp -o binding.o -c $(LDFLAGS)

libbinding.a: binding.o
ar src libbinding.a llama.cpp/ggml.o llama.cpp/common.o llama.cpp/llama.o binding.o
libbinding.a: binding.o $(EXTRA_TARGETS)
ar src libbinding.a llama.cpp/ggml.o $(EXTRA_TARGETS) llama.cpp/common.o llama.cpp/llama.o binding.o

generic-llama.cpp/ggml.o:
$(MAKE) -C llama.cpp ggml.o
18 changes: 18 additions & 0 deletions README.md
@@ -31,6 +31,24 @@ Now you can run the example with:
LIBRARY_PATH=$PWD C_INCLUDE_PATH=$PWD go run ./examples -m "/model/path/here" -t 14
```

## OpenBLAS acceleration

To build and run with OpenBLAS, for example:

```
BUILD_TYPE=openblas make libbinding.a
CGO_LDFLAGS="-lopenblas" LIBRARY_PATH=$PWD C_INCLUDE_PATH=$PWD go run -tags openblas ./examples -m "/model/path/here" -t 14
```

## GPU acceleration (cuBLAS)

To build with cuBLAS:

```
BUILD_TYPE=cublas make libbinding.a
CGO_LDFLAGS="-lcublas -lcudart -L/usr/local/cuda/lib64/" LIBRARY_PATH=$PWD C_INCLUDE_PATH=$PWD go run ./examples -m "/model/path/here" -t 14
```
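
Once the library is built with `BUILD_TYPE=cublas`, layers can be offloaded to the GPU from Go through the new `SetGPULayers` model option. A minimal sketch, assuming the import path from the documentation link below; the model path and layer count are placeholders:

```go
package main

import (
	"fmt"

	llama "github.com/go-skynet/go-llama.cpp"
)

func main() {
	// Load the model and offload 32 layers to the GPU
	// (placeholder path and layer count).
	l, err := llama.New("/model/path/here", llama.SetGPULayers(32))
	if err != nil {
		panic(err)
	}
	fmt.Println("model loaded with GPU offloading enabled")
	_ = l
}
```

Run it with the same `CGO_LDFLAGS`, `LIBRARY_PATH` and `C_INCLUDE_PATH` settings as the `go run` command above.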

Enjoy!

The documentation is available [here](https://pkg.go.dev/github.com/go-skynet/go-llama.cpp) and the full example code is [here](https://github.com/go-skynet/go-llama.cpp/blob/master/examples/main.go).
4 changes: 3 additions & 1 deletion binding.cpp
@@ -369,7 +369,7 @@ void* llama_allocate_params(const char *prompt, int seed, int threads, int token
}


void* load_model(const char *fname, int n_ctx, int n_parts, int n_seed, bool memory_f16, bool mlock, bool embeddings) {
void* load_model(const char *fname, int n_ctx, int n_parts, int n_seed, bool memory_f16, bool mlock, bool embeddings, int n_gpu_layers) {
// load the model
auto lparams = llama_context_default_params();

@@ -379,6 +379,8 @@ void* load_model(const char *fname, int n_ctx, int n_parts, int n_seed, bool mem
lparams.f16_kv = memory_f16;
lparams.embedding = embeddings;
lparams.use_mlock = mlock;
lparams.n_gpu_layers = n_gpu_layers;

void* res = nullptr;
try {
res = llama_init_from_file(fname, lparams);
2 changes: 1 addition & 1 deletion binding.h
@@ -8,7 +8,7 @@ extern "C" {

extern unsigned char tokenCallback(void *, char *);

void* load_model(const char *fname, int n_ctx, int n_parts, int n_seed, bool memory_f16, bool mlock, bool embeddings);
void* load_model(const char *fname, int n_ctx, int n_parts, int n_seed, bool memory_f16, bool mlock, bool embeddings, int n_gpu);

int get_embeddings(void* params_ptr, void* state_pr, float * res_embeddings);

2 changes: 1 addition & 1 deletion llama.cpp
2 changes: 1 addition & 1 deletion llama.go
@@ -22,7 +22,7 @@ type LLama struct {
func New(model string, opts ...ModelOption) (*LLama, error) {
mo := NewModelOptions(opts...)
modelPath := C.CString(model)
result := C.load_model(modelPath, C.int(mo.ContextSize), C.int(mo.Parts), C.int(mo.Seed), C.bool(mo.F16Memory), C.bool(mo.MLock), C.bool(mo.Embeddings))
result := C.load_model(modelPath, C.int(mo.ContextSize), C.int(mo.Parts), C.int(mo.Seed), C.bool(mo.F16Memory), C.bool(mo.MLock), C.bool(mo.Embeddings), C.int(mo.NGPULayers))
if result == nil {
return nil, fmt.Errorf("failed loading model")
}
9 changes: 9 additions & 0 deletions llama_cublas.go
@@ -0,0 +1,9 @@
//go:build cublas
// +build cublas

package llama

/*
#cgo LDFLAGS: -lcublas -lcudart -L/usr/local/cuda/lib64/
*/
import "C"
9 changes: 9 additions & 0 deletions llama_openblas.go
@@ -0,0 +1,9 @@
//go:build openblas
// +build openblas

package llama

/*
#cgo LDFLAGS: -lopenblas
*/
import "C"
8 changes: 8 additions & 0 deletions options.go
@@ -7,6 +7,7 @@ type ModelOptions struct {
F16Memory bool
MLock bool
Embeddings bool
NGPULayers int
}

type PredictOptions struct {
@@ -113,6 +114,13 @@ var IgnoreEOS PredictOption = func(p *PredictOptions) {
p.IgnoreEOS = true
}

// SetGPULayers sets the number of model layers to offload to the GPU.
func SetGPULayers(n int) ModelOption {
return func(p *ModelOptions) {
p.NGPULayers = n
}
}

// SetTokenCallback sets the callback that is invoked for each token produced during prediction.
func SetTokenCallback(fn func(string) bool) PredictOption {
return func(p *PredictOptions) {