Skip to content

Commit 9dd7e77

Browse files
committed
Merge branch 'master' into xsn/llama_batch_remove_compat
2 parents 4be7ecf + afd9909 commit 9dd7e77

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

50 files changed

+8914
-4616
lines changed

CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,10 @@ if (NOT DEFINED GGML_LLAMAFILE)
8888
set(GGML_LLAMAFILE_DEFAULT ON)
8989
endif()
9090

91+
if (NOT DEFINED GGML_AMX)
92+
set(GGML_AMX ON)
93+
endif()
94+
9195
if (NOT DEFINED GGML_CUDA_GRAPHS)
9296
set(GGML_CUDA_GRAPHS_DEFAULT ON)
9397
endif()

Makefile

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -93,11 +93,6 @@ GGML_METAL := 1
9393
DEPRECATE_WARNING := 1
9494
endif
9595

96-
ifdef LLAMA_OPENMP
97-
GGML_OPENMP := 1
98-
DEPRECATE_WARNING := 1
99-
endif
100-
10196
ifdef LLAMA_RPC
10297
GGML_RPC := 1
10398
DEPRECATE_WARNING := 1
@@ -584,6 +579,11 @@ ifndef GGML_NO_LLAMAFILE
584579
OBJ_GGML += ggml/src/llamafile/sgemm.o
585580
endif
586581

582+
ifndef GGML_NO_AMX
583+
MK_CPPFLAGS += -DGGML_USE_AMX
584+
OBJ_GGML += ggml/src/ggml-amx.o ggml/src/ggml-amx/mmq.o
585+
endif
586+
587587
ifdef GGML_RPC
588588
MK_CPPFLAGS += -DGGML_USE_RPC
589589
OBJ_GGML += ggml/src/ggml-rpc.o
@@ -1087,6 +1087,19 @@ ggml/src/llamafile/sgemm.o: \
10871087
$(CXX) $(CXXFLAGS) -c $< -o $@
10881088
endif # GGML_NO_LLAMAFILE
10891089

1090+
ifndef GGML_NO_AMX
1091+
ggml/src/ggml-amx.o: \
1092+
ggml/src/ggml-amx.cpp \
1093+
ggml/include/ggml-amx.h
1094+
$(CXX) $(CXXFLAGS) -c $< -o $@
1095+
1096+
ggml/src/ggml-amx/mmq.o: \
1097+
ggml/src/ggml-amx/mmq.cpp \
1098+
ggml/src/ggml-amx/mmq.h \
1099+
ggml/include/ggml.h
1100+
$(CXX) $(CXXFLAGS) -c $< -o $@
1101+
endif
1102+
10901103
ifdef GGML_RPC
10911104
ggml/src/ggml-rpc.o: \
10921105
ggml/src/ggml-rpc.cpp \
@@ -1238,6 +1251,7 @@ clean:
12381251
rm -vrf ggml/src/ggml-metal-embed.metal
12391252
rm -vrf ggml/src/ggml-cuda/*.o
12401253
rm -vrf ggml/src/ggml-cuda/template-instances/*.o
1254+
rm -vrf ggml/src/ggml-amx/*.o
12411255
rm -rvf $(BUILD_TARGETS)
12421256
rm -rvf $(TEST_TARGETS)
12431257
rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ variety of hardware - locally and in the cloud.
2929

3030
- Plain C/C++ implementation without any dependencies
3131
- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
32-
- AVX, AVX2 and AVX512 support for x86 architectures
32+
- AVX, AVX2, AVX512 and AMX support for x86 architectures
3333
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
3434
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
3535
- Vulkan and SYCL backend support
@@ -130,6 +130,8 @@ Typically finetunes of the base models below are supported as well.
130130
- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
131131
- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
132132
- Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
133+
- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
134+
- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
133135

134136
**UI:**
135137

common/arg.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -947,6 +947,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
947947
params.sparams.tfs_z = std::stof(value);
948948
}
949949
).set_sparam());
950+
add_opt(common_arg(
951+
{"--xtc-probability"}, "N",
952+
string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability),
953+
[](common_params & params, const std::string & value) {
954+
params.sparams.xtc_probability = std::stof(value);
955+
}
956+
).set_sparam());
957+
add_opt(common_arg(
958+
{"--xtc-threshold"}, "N",
959+
string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_threshold),
960+
[](common_params & params, const std::string & value) {
961+
params.sparams.xtc_threshold = std::stof(value);
962+
}
963+
).set_sparam());
950964
add_opt(common_arg(
951965
{"--typical"}, "N",
952966
string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p),
@@ -1788,6 +1802,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
17881802
params.n_threads_http = value;
17891803
}
17901804
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
1805+
add_opt(common_arg(
1806+
{"--cache-reuse"}, "N",
1807+
string_format("min chunk size to attempt reusing from the cache via KV shifting (default: %d)", params.n_cache_reuse),
1808+
[](common_params & params, int value) {
1809+
params.n_cache_reuse = value;
1810+
}
1811+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE"));
17911812
add_opt(common_arg(
17921813
{"--metrics"},
17931814
string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),

common/common.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2104,6 +2104,8 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons
21042104
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
21052105
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
21062106
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
2107+
fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability);
2108+
fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold);
21072109
fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
21082110
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
21092111
fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");

common/common.h

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,8 @@ enum common_sampler_type {
9090
COMMON_SAMPLER_TYPE_TFS_Z = 4,
9191
COMMON_SAMPLER_TYPE_TYPICAL_P = 5,
9292
COMMON_SAMPLER_TYPE_TEMPERATURE = 6,
93+
COMMON_SAMPLER_TYPE_XTC = 7,
94+
COMMON_SAMPLER_TYPE_INFILL = 8,
9395
};
9496

9597
// dimensionality reduction methods, used by cvector-generator
@@ -108,6 +110,8 @@ struct common_sampler_params {
108110
int32_t top_k = 40; // <= 0 to use vocab size
109111
float top_p = 0.95f; // 1.0 = disabled
110112
float min_p = 0.05f; // 0.0 = disabled
113+
float xtc_probability = 0.00f; // 0.0 = disabled
114+
float xtc_threshold = 0.10f; // > 0.5 disables XTC
111115
float tfs_z = 1.00f; // 1.0 = disabled
112116
float typ_p = 1.00f; // typical_p, 1.0 = disabled
113117
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
@@ -124,13 +128,15 @@ struct common_sampler_params {
124128
bool ignore_eos = false;
125129
bool no_perf = false; // disable performance metrics
126130

131+
127132
std::vector<enum common_sampler_type> samplers = {
128133
COMMON_SAMPLER_TYPE_TOP_K,
129134
COMMON_SAMPLER_TYPE_TFS_Z,
130135
COMMON_SAMPLER_TYPE_TYPICAL_P,
131136
COMMON_SAMPLER_TYPE_TOP_P,
132137
COMMON_SAMPLER_TYPE_MIN_P,
133-
COMMON_SAMPLER_TYPE_TEMPERATURE
138+
COMMON_SAMPLER_TYPE_XTC,
139+
COMMON_SAMPLER_TYPE_TEMPERATURE,
134140
};
135141

136142
std::string grammar; // optional BNF-like grammar to constrain sampling
@@ -277,7 +283,8 @@ struct common_params {
277283
int32_t port = 8080; // server listens on this network port
278284
int32_t timeout_read = 600; // http read timeout in seconds
279285
int32_t timeout_write = timeout_read; // http write timeout in seconds
280-
int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
286+
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
287+
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
281288

282289
std::string hostname = "127.0.0.1";
283290
std::string public_path = ""; // NOLINT

common/json-schema-to-grammar.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -611,7 +611,7 @@ class SchemaConverter {
611611
}
612612
return join_seq();
613613
};
614-
return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
614+
return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space");
615615
}
616616

617617
/*

common/sampling.cpp

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -130,10 +130,10 @@ std::string common_sampler_params::print() const {
130130

131131
snprintf(result, sizeof(result),
132132
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
133-
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
133+
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
134134
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
135135
penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
136-
top_k, tfs_z, top_p, min_p, typ_p, temp,
136+
top_k, tfs_z, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
137137
mirostat, mirostat_eta, mirostat_tau);
138138

139139
return std::string(result);
@@ -184,6 +184,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
184184
case COMMON_SAMPLER_TYPE_MIN_P:
185185
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
186186
break;
187+
case COMMON_SAMPLER_TYPE_XTC:
188+
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
189+
break;
187190
case COMMON_SAMPLER_TYPE_TFS_Z:
188191
llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
189192
break;
@@ -193,6 +196,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
193196
case COMMON_SAMPLER_TYPE_TEMPERATURE:
194197
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
195198
break;
199+
case COMMON_SAMPLER_TYPE_INFILL:
200+
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model));
201+
break;
196202
default:
197203
GGML_ASSERT(false && "unknown sampler type");
198204
}
@@ -372,6 +378,8 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
372378
case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
373379
case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
374380
case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
381+
case COMMON_SAMPLER_TYPE_XTC: return 'x';
382+
case COMMON_SAMPLER_TYPE_INFILL: return 'i';
375383
default : return '?';
376384
}
377385
}
@@ -384,6 +392,8 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
384392
case COMMON_SAMPLER_TYPE_TOP_P: return "top_p";
385393
case COMMON_SAMPLER_TYPE_MIN_P: return "min_p";
386394
case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
395+
case COMMON_SAMPLER_TYPE_XTC: return "xtc";
396+
case COMMON_SAMPLER_TYPE_INFILL: return "infill";
387397
default : return "";
388398
}
389399
}
@@ -396,6 +406,8 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
396406
{ "min_p", COMMON_SAMPLER_TYPE_MIN_P },
397407
{ "tfs_z", COMMON_SAMPLER_TYPE_TFS_Z },
398408
{ "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
409+
{ "xtc", COMMON_SAMPLER_TYPE_XTC },
410+
{ "infill", COMMON_SAMPLER_TYPE_INFILL },
399411
};
400412

401413
// since samplers names are written multiple ways
@@ -441,7 +453,9 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
441453
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P },
442454
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P },
443455
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
444-
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE }
456+
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
457+
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
458+
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL },
445459
};
446460

447461
std::vector<common_sampler_type> samplers;

examples/json_schema_to_grammar.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -540,7 +540,7 @@ def join_seq():
540540
return self._add_rule(
541541
name,
542542
to_rule(transform()) if self._raw_pattern \
543-
else "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space")
543+
else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space")
544544

545545

546546
def _resolve_ref(self, ref):

examples/llama-bench/llama-bench.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ static std::string get_gpu_info() {
151151
int count = ggml_backend_sycl_get_device_count();
152152
for (int i = 0; i < count; i++) {
153153
char buf[128];
154-
ggml_sycl_get_device_description(i, buf, sizeof(buf));
154+
ggml_backend_sycl_get_device_description(i, buf, sizeof(buf));
155155
id += buf;
156156
if (i < count - 1) {
157157
id += "/";

0 commit comments

Comments
 (0)