diff --git a/.gitignore b/.gitignore
index 8b5f45a2d04..67ea6f5fffb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -45,6 +45,7 @@ models-mnt
 /baby-llama
 /beam-search
 /save-load-state
+/fill-in-middle
 build-info.h
 arm_neon.h
 compile_commands.json
diff --git a/Makefile b/Makefile
index 23f050c0d7a..bd9750aaa76 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search tests/test-c.o
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search fill-in-middle tests/test-c.o

 # Binaries only useful for tests
 TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1
@@ -67,8 +67,8 @@ OPT = -Ofast
 else
 OPT = -O3
 endif
-CFLAGS   = -I.            $(OPT) -std=c11   -fPIC
-CXXFLAGS = -I. -I./common $(OPT) -std=c++11 -fPIC
+CFLAGS   = -I.            $(OPT) -std=c11   -fPIC -g -fsanitize=address
+CXXFLAGS = -I. -I./common $(OPT) -std=c++11 -fPIC -g -fsanitize=address
 LDFLAGS  =

 ifdef LLAMA_DEBUG
@@ -475,6 +475,9 @@ baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o $(OBJS)
 beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

+fill-in-middle: examples/fill-in-middle/FIM.c ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
 BUILD_TARGETS += metal
 endif
diff --git a/examples/fill-in-middle/CMakeLists.txt b/examples/fill-in-middle/CMakeLists.txt
new file mode 100644
index 00000000000..9150d7c1c58
--- /dev/null
+++ b/examples/fill-in-middle/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET FIM)
+add_executable(${TARGET} FIM.c)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/fill-in-middle/FIM.c b/examples/fill-in-middle/FIM.c
new file mode 100644
index 00000000000..7263477d253
--- /dev/null
+++ b/examples/fill-in-middle/FIM.c
@@ -0,0 +1,192 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <float.h> // FLT_MAX, used for the argmax over the logits
+#include "../../llama.h"
+
+/*
+The FIM (Fill-In-Middle) objective is useful for generating text conditioned on a prefix and a suffix.
+For a quick summary of what's going on here, see issue #2818.
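+
+The prompt is assembled from the CodeLlama FIM special tokens in one of two
+orders (this is exactly what codellama_fill_in_middle() below does):
+
+  PSM (default):    <PRE> {prefix} <SUF> {suffix} <MID>
+  SPM (spm = true): <SUF> {suffix} <PRE> {prefix} <MID>
+
+The model then generates the "middle" until it emits <EOT>.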
+*/
+
+static inline struct llama_context*
+codellama_create_fim_context(const char* model_path, const char** error_message) {
+    struct llama_context_params params = llama_context_default_params();
+    params.use_mlock = 1;
+    struct llama_model* model = llama_load_model_from_file(model_path, params);
+    if (!model) {
+        *error_message = "Failed to load model.";
+        return NULL;
+    }
+
+    struct llama_context* context = llama_new_context_with_model(model, params);
+    if (!context) {
+        *error_message = "Failed to create context.";
+        llama_free_model(model);
+        return NULL;
+    }
+
+    return context;
+}
+
+static inline char*
+codellama_fill_in_middle(struct llama_context* ctx, const char* prefix, const char* suffix, size_t n_max_tokens, int n_threads, bool spm, const char** error_message) {
+
+    // Worst case, every input byte becomes one token; plus the three special tokens.
+    size_t combined_len = strlen(prefix) + strlen(suffix) + 3;
+    size_t initial_size = sizeof(llama_token) * combined_len;
+    llama_token* tokens = (llama_token*)malloc(initial_size);
+    llama_token* tokens_end = tokens;
+    if (!tokens) {
+        *error_message = "Failed to allocate memory for tokens.";
+        return NULL;
+    }
+
+    // Append first part of the prompt.
+    *tokens_end++ = spm ? llama_token_suffix(ctx) : llama_token_prefix(ctx);
+    int num_tokens = llama_tokenize(ctx, spm ? suffix : prefix, tokens_end, (int)(combined_len - (size_t)(tokens_end - tokens)), 0);
+    if (num_tokens < 0) {
+        *error_message = "Failed to tokenize the prompt.";
+        free(tokens);
+        return NULL;
+    }
+    tokens_end += num_tokens;
+
+    // Append second part of the prompt.
+    *tokens_end++ = spm ? llama_token_prefix(ctx) : llama_token_suffix(ctx);
+    num_tokens = llama_tokenize(ctx, spm ? prefix : suffix, tokens_end, (int)(combined_len - (size_t)(tokens_end - tokens)), 0);
+    if (num_tokens < 0) {
+        *error_message = "Failed to tokenize the prompt.";
+        free(tokens);
+        return NULL;
+    }
+    tokens_end += num_tokens;
+
+    // Append the middle token; everything generated from here on fills the gap.
+    *tokens_end++ = llama_token_middle(ctx);
+
+    // Grow to accommodate the prompt and the max amount of generated tokens.
+    size_t prompt_len = (size_t)(tokens_end - tokens);
+    size_t min_len = prompt_len + n_max_tokens;
+    if (min_len > combined_len) {
+        llama_token* new_tokens = (llama_token*)realloc(tokens, sizeof(llama_token) * min_len);
+        if (!new_tokens) {
+            *error_message = "Failed to allocate memory for tokens.";
+            free(tokens);
+            return NULL;
+        }
+        tokens = new_tokens;
+    }
+
+    // Evaluate the LM on the whole prompt in one batch.
+    if (llama_eval(ctx, tokens, (int)prompt_len, 0, n_threads)) {
+        *error_message = "Failed to evaluate the prompt.";
+        free(tokens);
+        return NULL;
+    }
+
+    // Generate until n_max_tokens tokens or the <EOT> token is produced. The
+    // llama_eval() call above already yielded the logits for the first new
+    // token, so each iteration samples first and evaluates the sampled token
+    // afterwards.
+    llama_token* generated_tokens = tokens + prompt_len;
+    size_t num_generated_tokens = 0;
+    int vocab_size = llama_n_vocab(ctx);
+    for (size_t i = 0; i < n_max_tokens; i++) {
+        float* logits = llama_get_logits(ctx);
+
+        // From the logits, select the most likely token (greedy/argmax decoding).
+        float highest_logit = -FLT_MAX;
+        llama_token likeliest_token = 0;
+        for (llama_token token_id = 0; token_id < vocab_size; token_id++) {
+            if (logits[token_id] > highest_logit) {
+                highest_logit = logits[token_id];
+                likeliest_token = token_id;
+            }
+        }
+
+        // Stop at <EOT> without adding it to the output.
+        if (likeliest_token == llama_token_eot(ctx)) {
+            break;
+        }
+
+        // Append the token, so it's there for subsequent evaluations.
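+        // (The context's KV cache keeps everything evaluated so far, so the
+        // llama_eval() call at the bottom of this loop only pays for this one
+        // new token. The plain argmax above could also be swapped for the
+        // llama_sample_* helpers in llama.h to get temperature / top-k sampling.)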
+        generated_tokens[num_generated_tokens++] = likeliest_token;
+
+        // Translate the token to a string and print it as it is generated.
+        char cs[32] = {0};
+        int token_length = llama_token_to_piece(ctx, likeliest_token, cs, (int)sizeof(cs) - 1);
+        if (token_length > 0) {
+            printf("%s\n", cs);
+        }
+
+        // Evaluate the LM on the new token so the next iteration's logits reflect it.
+        if (llama_eval(ctx, &generated_tokens[num_generated_tokens - 1], 1, (int)(prompt_len + num_generated_tokens - 1), n_threads)) {
+            *error_message = "Failed to evaluate a generated token.";
+            free(tokens);
+            return NULL;
+        }
+    }
+
+    // Allocate memory for the final result.
+    size_t result_length = 0;
+    size_t result_capacity = 4096;
+    char* result = (char*)malloc(result_capacity);
+    if (!result) {
+        *error_message = "Failed to allocate memory for result.";
+        free(tokens);
+        return NULL;
+    }
+
+    // Translate tokens to a string, growing the allocation whenever it's too small.
+    for (size_t i = 0; i < num_generated_tokens; i++) {
+        // Leave one byte of headroom for the terminating NUL.
+        int appended = llama_token_to_piece(ctx, generated_tokens[i], result + result_length, (int)(result_capacity - result_length - 1));
+        if (appended < 0) {
+            size_t new_capacity = result_capacity * 2;
+            char* new_result = (char*)realloc(result, new_capacity);
+            if (!new_result) {
+                *error_message = "Failed to allocate memory for result.";
+                free(tokens);
+                free(result);
+                return NULL;
+            }
+            result = new_result;
+            result_capacity = new_capacity;
+            i--; // retry the same token with the larger buffer
+            continue;
+        }
+        result_length += appended;
+    }
+    result[result_length] = '\0';
+
+    free(tokens);
+    *error_message = NULL;
+    return result;
+}
+
+int main(int argc, char** argv) {
+    if (argc != 6) {
+        fprintf(stderr, "Usage: %s <model> <prefix> <suffix> <n_max_tokens> <n_threads>\n", argv[0]);
+        return 1;
+    }
+
+    char* model = argv[1];
+    char* prefix = argv[2];
+    char* suffix = argv[3];
+    size_t n_max_tokens = atoi(argv[4]) > 0 ? (size_t)atoi(argv[4]) : 64;
+    int n_threads = atoi(argv[5]);
+    bool spm = false;
+    const char* error_message = NULL;
+
+    puts("Loading the model. This could take quite a while...");
+    struct llama_context* ctx = codellama_create_fim_context(model, &error_message);
+    if (error_message) {
+        fprintf(stderr, "Error: %s\n", error_message);
+        return 1;
+    }
+
+    puts("Model loaded. Generating text...");
+    char* result = codellama_fill_in_middle(ctx, prefix, suffix, n_max_tokens, n_threads, spm, &error_message);
+    if (error_message) {
+        fprintf(stderr, "Error: %s\n", error_message);
+        return 1;
+    }
+
+    puts("Generated text:");
+    printf("%s%s%s\n", prefix, result, suffix);
+
+    free(result);
+    llama_free(ctx);
+    return 0;
+}
diff --git a/examples/fill-in-middle/README.md b/examples/fill-in-middle/README.md
new file mode 100644
index 00000000000..98f81e043fa
--- /dev/null
+++ b/examples/fill-in-middle/README.md
@@ -0,0 +1,35 @@
+# fill-in-middle
+
+The FIM (Fill-In-Middle) objective is useful for generating text conditioned on both a prefix and a suffix.
+This example uses codellama to do exactly that.
+
+For a quick summary of what's going on here, see issue #2818, and/or read [the FIM paper](https://arxiv.org/abs/2207.14255).
+
+```
+Usage: ./fill-in-middle <model> <prefix> <suffix> <n_max_tokens> <n_threads>
+```
+```sh
+./fill-in-middle \
+    CodeLlama-34B-GGUF/codellama-34b.Q4_K_S.gguf \
+    $'def add(a, b):\n' \
+    $'\n' \
+    64 \
+    4
+```
+
+With prefix:
+```py
+def add(a, b):
+
+```
+
+And a newline as suffix:
+```py
+
+```
+
+We can expect it to generate something like:
+```py
+    return a + b
+```
diff --git a/llama.cpp b/llama.cpp
index 3114d331116..1ec7e9dd4b5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -964,6 +964,13 @@ struct llama_vocab {
     id linefeed_id = 13;

+    // codellama FIM special tokens
+    // TODO: load these from the vocabulary.
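+    // The hard-coded ids below match the CodeLlama tokenizer, which places
+    // <PRE> at 32007, <SUF> at 32008, <MID> at 32009 and <EOT> at 32010;
+    // other models may place these tokens elsewhere, or lack them entirely.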
+    id special_prefix_id = 32007;
+    id special_middle_id = 32009;
+    id special_suffix_id = 32008;
+    id special_eot_id    = 32010;
+
     int find_bpe_rank(std::string token_left, std::string token_right) const {
         replace_all(token_left, " ",  "\u0120");
         replace_all(token_left, "\n", "\u010A");
@@ -6132,6 +6139,22 @@ llama_token llama_token_nl(const struct llama_context * ctx) {
     return ctx->model.vocab.linefeed_id;
 }

+llama_token llama_token_prefix(const struct llama_context * ctx) {
+    return ctx->model.vocab.special_prefix_id;
+}
+
+llama_token llama_token_middle(const struct llama_context * ctx) {
+    return ctx->model.vocab.special_middle_id;
+}
+
+llama_token llama_token_suffix(const struct llama_context * ctx) {
+    return ctx->model.vocab.special_suffix_id;
+}
+
+llama_token llama_token_eot(const struct llama_context * ctx) {
+    return ctx->model.vocab.special_eot_id;
+}
+
 int llama_tokenize(
         struct llama_context * ctx,
         const char * text,
diff --git a/llama.h b/llama.h
index 422f28527a0..1d1cfb692e0 100644
--- a/llama.h
+++ b/llama.h
@@ -361,6 +361,12 @@ extern "C" {
     LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx);  // end-of-sentence
     LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx);  // next-line

+    // codellama FIM tokens
+    LLAMA_API llama_token llama_token_prefix(const struct llama_context * ctx); // Beginning of FIM prefix
+    LLAMA_API llama_token llama_token_middle(const struct llama_context * ctx); // Beginning of FIM middle
+    LLAMA_API llama_token llama_token_suffix(const struct llama_context * ctx); // Beginning of FIM suffix
+    LLAMA_API llama_token llama_token_eot   (const struct llama_context * ctx); // End of FIM middle
+
     //
     // Tokenization
     //