
Commit 8937ec5

Merge branch 'master' into gg/flash-attn

ggml-ci

2 parents: 751591d + 3fe847b


57 files changed: 1,583 additions and 8,252 deletions

.github/workflows/bench.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -32,7 +32,7 @@ on:
     - cron: '04 2 * * *'
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}-${{ github.event.inputs.sha }}
+  group: ${{ github.workflow }}-${{ github.ref || github.run_id }}-${{ github.event.inputs.sha }}
   cancel-in-progress: true
 
 jobs:
```

.github/workflows/build.yml

Lines changed: 33 additions & 0 deletions
```diff
@@ -32,6 +32,8 @@ jobs:
       - name: Clone
         id: checkout
         uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
 
       - name: Dependencies
         id: depends
@@ -88,6 +90,8 @@ jobs:
       - name: Clone
         id: checkout
         uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
 
       - name: Dependencies
         id: depends
@@ -206,6 +210,8 @@ jobs:
       - name: Clone
         id: checkout
         uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
 
       - name: Dependencies
         id: depends
@@ -238,6 +244,33 @@ jobs:
           ./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
           ./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
 
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          cp LICENSE ./build/bin/
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
+          name: llama-bin-ubuntu-x64.zip
+
 # ubuntu-latest-cmake-sanitizer:
 #   runs-on: ubuntu-latest
 #
```

.github/workflows/server.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -23,7 +23,7 @@ on:
     - cron: '2 4 * * *'
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  group: ${{ github.workflow }}-${{ github.ref || github.run_id }}
   cancel-in-progress: true
 
 jobs:
```

.gitignore

Lines changed: 4 additions & 0 deletions
```diff
@@ -34,6 +34,7 @@ lcov-report/
 gcovr-report/
 
 build*
+!build.zig
 cmake-build-*
 out/
 tmp/
@@ -100,6 +101,9 @@ qnt-*.txt
 perf-*.txt
 
 examples/jeopardy/results.txt
+examples/server/*.html.hpp
+examples/server/*.js.hpp
+examples/server/*.mjs.hpp
 
 poetry.lock
 poetry.toml
```

CMakeLists.txt

Lines changed: 5 additions & 11 deletions
```diff
@@ -43,17 +43,11 @@ else()
     set(LLAMA_METAL_DEFAULT OFF)
 endif()
 
-# TODO: fix this for Android CI
-#       https://github.com/ggerganov/llama.cpp/pull/6716#issuecomment-2061509191
-#if (CMAKE_SYSTEM_NAME MATCHES "ANDROID")
-#    set(LLAMA_LLAMAFILE_DEFAULT OFF)
-#else()
-#    set(LLAMA_LLAMAFILE_DEFAULT ON)
-#endif()
-
-# TODO: temporary disable until MoE is fixed
-#       https://github.com/ggerganov/llama.cpp/pull/6716
-set(LLAMA_LLAMAFILE_DEFAULT OFF)
+if (CMAKE_SYSTEM_NAME MATCHES "ANDROID")
+    set(LLAMA_LLAMAFILE_DEFAULT OFF)
+else()
+    set(LLAMA_LLAMAFILE_DEFAULT ON)
+endif()
 
 # general
 option(BUILD_SHARED_LIBS "build shared libraries" OFF)
```

Makefile

Lines changed: 11 additions & 6 deletions
```diff
@@ -384,10 +384,6 @@ ifdef LLAMA_OPENBLAS
     MK_LDFLAGS += $(shell pkg-config --libs openblas)
 endif # LLAMA_OPENBLAS
 
-# TODO: temporary disable until MoE is fixed
-#       https://github.com/ggerganov/llama.cpp/pull/6716
-LLAMA_NO_LLAMAFILE := 1
-
 ifndef LLAMA_NO_LLAMAFILE
     MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
     OBJS += sgemm.o
@@ -699,7 +695,7 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
 llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
 COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
 
 common.o: common/common.cpp $(COMMON_H_DEPS)
@@ -800,10 +796,19 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
+# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
+examples/server/%.hpp: examples/server/public/% Makefile
+	@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
+		echo "unsigned char $${NAME}[] = {" && \
+		cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
+		echo "};" && \
+		echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
+	) > $@
+
 gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
```
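For reference, each generated header embeds one web asset as a byte array plus a length, with `-` and `.` in the file name rewritten to `_` to form the symbol name. A generated `examples/server/index.html.hpp` would look roughly like the sketch below; the byte values and length are illustrative, and the commented-out `set_content` call is only a hypothetical example of how server code might serve the embedded asset:

```cpp
// Illustrative shape of a generated examples/server/index.html.hpp.
// A real header contains one 0xNN entry per byte of the source file.
unsigned char index_html[] = {
    0x3c, 0x21, 0x64, 0x6f, 0x63, // e.g. the bytes of "<!doc..."
};
unsigned int index_html_len = 5; // matches the number of bytes above

// Hypothetical usage from C++ server code: serve the asset from memory so the
// binary carries no runtime dependency on the public/ directory, e.g. with
// cpp-httplib: res.set_content(reinterpret_cast<const char *>(index_html),
//                              index_html_len, "text/html");
```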

README-sycl.md

Lines changed: 12 additions & 8 deletions
````diff
@@ -229,12 +229,12 @@ source /opt/intel/oneapi/setvars.sh
 # Build LLAMA with MKL BLAS acceleration for intel GPU
 mkdir -p build && cd build
 
-# Option 1: Use FP16 for better performance in long-prompt inference
-#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
-
-# Option 2: Use FP32 by default
+# Option 1: Use FP32 (recommended for better performance in most cases)
 cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 
+# Option 2: Use FP16
+cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+
 #build all binary
 cmake --build . --config Release -j -v
 ```
@@ -250,12 +250,12 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
 # Build LLAMA with Nvidia BLAS acceleration through SYCL
 mkdir -p build && cd build
 
-# Option 1: Use FP16 for better performance in long-prompt inference
-cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
-
-# Option 2: Use FP32 by default
+# Option 1: Use FP32 (recommended for better performance in most cases)
 cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 
+# Option 2: Use FP16
+cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+
 #build all binary
 cmake --build . --config Release -j -v
 
@@ -416,6 +416,10 @@ mkdir -p build
 cd build
 @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
 
+# Option 1: Use FP32 (recommended for better performance in most cases)
+cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
+
+# Option 2: Use FP16
 cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
 
 make -j
````

README.md

Lines changed: 9 additions & 2 deletions
````diff
@@ -10,6 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 
 ### Recent API changes
 
+- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
 - [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
 - [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
 - [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017
@@ -95,7 +96,7 @@ Typically finetunes of the base models below are supported as well.
 - [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
 - [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
 - [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct)
-- [X] Falcon
+- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
 - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
 - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
@@ -549,7 +550,7 @@ Building the program with BLAS support may lead to some performance improvements
   OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
 
   You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
-    - For Ubuntu or Debian, the packages `opencl-headers`, `ocl-icd` may be needed.
+    - For Ubuntu, Debian, and Fedora the packages `opencl-headers`, `ocl-icd` may be needed.
 
     - For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page.
@@ -574,6 +575,12 @@ Building the program with BLAS support may lead to some performance improvements
 
   Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages.
 
+  Linux packaging:
+    Fedora Linux:
+    ```bash
+    sudo dnf install clblast
+    ```
+
   Alternatively, they may be built from source.
 
 - <details>
````

build.zig

Lines changed: 29 additions & 0 deletions
```diff
@@ -140,4 +140,33 @@ pub fn build(b: *std.build.Builder) !void {
     if (server.target.isWindows()) {
         server.linkSystemLibrary("ws2_32");
     }
+
+    const server_assets = [_][]const u8{ "index.html", "index.js", "completion.js", "json-schema-to-grammar.mjs" };
+    for (server_assets) |asset| {
+        const input_path = b.fmt("examples/server/public/{s}", .{asset});
+        const output_path = b.fmt("examples/server/{s}.hpp", .{asset});
+
+        // Portable equivalent of `b.addSystemCommand(&.{ "xxd", "-n", asset, "-i", input_path, output_path })`:
+
+        const input = try std.fs.cwd().readFileAlloc(b.allocator, input_path, std.math.maxInt(usize));
+        defer b.allocator.free(input);
+
+        var buf = std.ArrayList(u8).init(b.allocator);
+        defer buf.deinit();
+
+        for (input) |byte| {
+            try std.fmt.format(buf.writer(), "0x{X:0>2}, ", .{byte});
+        }
+
+        var name = try std.mem.replaceOwned(u8, b.allocator, asset, "-", "_");
+        defer b.allocator.free(name);
+        std.mem.replaceScalar(u8, name, '.', '_');
+
+        try std.fs.cwd().writeFile(output_path, b.fmt(
+            "unsigned char {s}[] = {{{s}}};\nunsigned int {s}_len = {d};\n",
+            .{ name, buf.items, name, input.len },
+        ));
+
+        std.debug.print("Dumped hex of \"{s}\" ({s}) to {s}\n", .{ input_path, name, output_path });
+    }
 }
```

common/common.cpp

Lines changed: 7 additions & 5 deletions
```diff
@@ -108,7 +108,7 @@ int32_t get_num_physical_cores() {
     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
 }
 
-#if defined(__x86_64__) && defined(__linux__)
+#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
 #include <pthread.h>
 
 static void cpuid(unsigned leaf, unsigned subleaf,
@@ -162,7 +162,7 @@ static int count_math_cpus(int cpu_count) {
  * Returns number of CPUs on system that are useful for math.
  */
 int get_math_cpu_count() {
-#if defined(__x86_64__) && defined(__linux__)
+#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
     int cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
     if (cpu_count < 1) {
         return get_num_physical_cores();
@@ -242,7 +242,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
             invalid_param = true;
             return true;
         }
+        // This is temporary, in the future the sampling state will be moved fully to llama_sampling_context.
         params.seed = std::stoul(argv[i]);
+        sparams.seed = std::stoul(argv[i]);
         return true;
     }
     if (arg == "-t" || arg == "--threads") {
@@ -2332,12 +2334,12 @@ std::vector<llama_token> llama_tokenize(
     return result;
 }
 
-std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
```
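The new `special` argument is forwarded to the underlying C API call, which returns the negative of the required size when the initial 8-byte buffer is too small (hence the resize-and-retry above). A minimal usage sketch of the updated common wrapper, assuming an already-initialized `llama_context * ctx` and the helpers declared in `common.h`:

```cpp
// Minimal sketch: print each token's text using the new `special` parameter.
// Assumes ctx was created with llama_new_context_with_model() and that
// common.h (which declares this llama_token_to_piece overload) is available.
#include "common.h"

#include <cstdio>
#include <string>
#include <vector>

static void dump_tokens(llama_context * ctx, const std::vector<llama_token> & tokens) {
    for (const llama_token tok : tokens) {
        // special = true renders control tokens (BOS, EOS, ...) as text;
        // special = false keeps the previous behavior of suppressing them.
        const std::string piece = llama_token_to_piece(ctx, tok, /*special =*/ true);
        printf("%6d -> '%s'\n", tok, piece.c_str());
    }
}
```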
