diff --git a/.gitignore b/.gitignore
index 95afccc..a05c0f3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,6 @@
+ai/build/
+
 *.xml
 test/carbon
+ai/testgen
+ai/*.diff
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..6c7d5cc
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,12 @@
+[submodule "ai/vendor/highway"]
+	path = ai/vendor/highway
+	url = https://github.com/google/highway
+[submodule "ai/vendor/sentencepiece"]
+	path = ai/vendor/sentencepiece
+	url = https://github.com/google/sentencepiece
+[submodule "ai/vendor/gemma.cpp"]
+	path = ai/vendor/gemma.cpp
+	url = https://github.com/google/gemma.cpp
+[submodule "ai/model"]
+	path = ai/model
+	url = https://huggingface.co/sparky-game/carbon
diff --git a/ai/GNUmakefile b/ai/GNUmakefile
new file mode 100644
index 0000000..418daac
--- /dev/null
+++ b/ai/GNUmakefile
@@ -0,0 +1,184 @@
+# Build script for `testgen`, the BSD Carbon AI unit test generator.
+#
+# Compiles the vendored Highway, SentencePiece and gemma.cpp sources into
+# per-library object directories below build/, merges the split model weights
+# into one checksum-verified blob and links everything into ./testgen.
+
+# Delete a target whose recipe failed, so that a half-written file (notably the
+# merged weights blob) is never mistaken for an up-to-date one on the next run.
+.DELETE_ON_ERROR:
+
+# Labels for the pretty-printed build output.
+PPO_MKDIR = MKDIR
+PPO_CLEAN = CLEAN
+PPO_MERGE = MERGE
+PPO_HASH = HASH
+PPO_GEN = GEN
+PPO_CXX = CXX
+PPO_LD = LD
+
+CXX = clang++
+SHA256SUM = sha256sum -c
+OPTIMIZATIONS = -pipe -O3
+# -MD emits a .d dependency file next to every object; -MP adds a phony target
+# per header so that deleting or renaming a header does not break the build.
+DEPFLAGS = -MD -MP
+
+HIGHWAY_HDR_DIR = vendor/highway
+HIGHWAY_SRC_DIR = $(HIGHWAY_HDR_DIR)/hwy
+HIGHWAY_BUILD_DIR = build/highway
+HIGHWAY_SRCS := $(wildcard $(HIGHWAY_SRC_DIR)/contrib/sort/vqsort*.cc) \
+  $(addsuffix .cc, $(addprefix $(HIGHWAY_SRC_DIR)/, abort aligned_allocator nanobenchmark per_target print targets timer)) \
+  $(HIGHWAY_SRC_DIR)/contrib/thread_pool/topology.cc
+# Every object lands directly in the build directory, whatever subdirectory
+# its source came from (the pattern rules below rely on this flat layout).
+HIGHWAY_OBJS := $(addprefix $(HIGHWAY_BUILD_DIR)/, $(notdir $(HIGHWAY_SRCS:.cc=.o)))
+HIGHWAY_CPPFLAGS = -isystem $(HIGHWAY_HDR_DIR)
+HIGHWAY_CXXFLAGS = -std=c++11 $(OPTIMIZATIONS)
+
+SENTENCEPIECE_HDR_DIR = vendor/sentencepiece
+SENTENCEPIECE_SRC_DIR = $(SENTENCEPIECE_HDR_DIR)/src
+SENTENCEPIECE_BUILD_DIR = build/sentencepiece
+SENTENCEPIECE_SRCS := $(wildcard $(SENTENCEPIECE_HDR_DIR)/third_party/protobuf-lite/*.cc) \
+  $(SENTENCEPIECE_HDR_DIR)/third_party/absl/flags/flag.cc \
+  $(addsuffix .pb.cc, $(addprefix $(SENTENCEPIECE_SRC_DIR)/builtin_pb/, sentencepiece sentencepiece_model)) \
+  $(addsuffix .cc, $(addprefix $(SENTENCEPIECE_SRC_DIR)/, bpe_model char_model error filesystem model_factory model_interface normalizer sentencepiece_processor unigram_model util word_model))
+SENTENCEPIECE_OBJS := $(addprefix $(SENTENCEPIECE_BUILD_DIR)/, $(notdir $(SENTENCEPIECE_SRCS:.cc=.o)))
+SENTENCEPIECE_CPPFLAGS = -D HAVE_PTHREAD -isystem $(SENTENCEPIECE_HDR_DIR) -isystem $(SENTENCEPIECE_SRC_DIR)/builtin_pb -isystem $(SENTENCEPIECE_HDR_DIR)/third_party/protobuf-lite
+SENTENCEPIECE_CXXFLAGS = -std=c++17 $(OPTIMIZATIONS)
+SENTENCEPIECE_VERSION := $(shell cat $(SENTENCEPIECE_HDR_DIR)/VERSION.txt)
+# config.h is generated from the upstream config.h.in template (see its rule).
+SENTENCEPIECE_CONFIG = $(SENTENCEPIECE_HDR_DIR)/config.h
+
+GEMMACPP_HDR_DIR = vendor/gemma.cpp
+GEMMACPP_SRC_DIR = $(GEMMACPP_HDR_DIR)
+GEMMACPP_BUILD_DIR = build/gemma.cpp
+GEMMACPP_SRCS := $(addsuffix .cc, $(addprefix $(GEMMACPP_SRC_DIR)/backprop/, backward forward optimizer)) \
+  $(addsuffix .cc, $(addprefix $(GEMMACPP_SRC_DIR)/compression/, blob_store io_win io)) \
+  $(addsuffix .cc, $(addprefix $(GEMMACPP_SRC_DIR)/evals/, benchmark_helper cross_entropy)) \
+  $(wildcard $(GEMMACPP_SRC_DIR)/gemma/instantiations/*.cc) \
+  $(addsuffix .cc, $(addprefix $(GEMMACPP_SRC_DIR)/gemma/, common gemma kv_cache tokenizer weights))
+GEMMACPP_OBJS := $(addprefix $(GEMMACPP_BUILD_DIR)/, $(notdir $(GEMMACPP_SRCS:.cc=.o)))
+GEMMACPP_CPPFLAGS = -isystem $(GEMMACPP_HDR_DIR) -isystem $(HIGHWAY_HDR_DIR) -isystem $(SENTENCEPIECE_HDR_DIR)
+GEMMACPP_CXXFLAGS = -std=c++17 $(OPTIMIZATIONS)
+
+TESTGEN_BUILD_DIR = build
+# Sorted explicitly: the parts must be concatenated in order, and $(wildcard)
+# does not return sorted results on every GNU Make release.
+TESTGEN_WEIGHTS_IN := $(sort $(wildcard model/weights.sbs.*))
+TESTGEN_WEIGHTS_OUT = $(TESTGEN_BUILD_DIR)/weights.sbs
+TESTGEN_WEIGHTS_SUM = model/weights.sha256
+TESTGEN_SRCS = testgen.cc
+TESTGEN_OBJS = $(patsubst %.cc, $(TESTGEN_BUILD_DIR)/%.o, $(TESTGEN_SRCS))
+TESTGEN_CPPFLAGS = -isystem $(HIGHWAY_HDR_DIR) -isystem $(GEMMACPP_HDR_DIR)
+TESTGEN_CXXFLAGS = -std=c++20 -Wall -Wextra -Wpedantic -Werror $(OPTIMIZATIONS)
+TESTGEN_LDFLAGS = -static $(OPTIMIZATIONS)
+TESTGEN_OUT = testgen
+
+DIRS_OUT = $(TESTGEN_BUILD_DIR) $(HIGHWAY_BUILD_DIR) $(SENTENCEPIECE_BUILD_DIR) $(GEMMACPP_BUILD_DIR)
+
+# $(call compile,<LIB>) compiles $< into $@ using <LIB>_CPPFLAGS and
+# <LIB>_CXXFLAGS; shared by every object rule below.
+define compile
+@echo " $(PPO_CXX) $@"
+@$(CXX) $($(1)_CPPFLAGS) $($(1)_CXXFLAGS) $(DEPFLAGS) -c $< -o $@
+endef
+
+.PHONY: all clean mrproper
+
+all: $(TESTGEN_WEIGHTS_OUT) $(TESTGEN_OUT)
+	@:
+
+$(DIRS_OUT):
+	@echo " $(PPO_MKDIR) $@"
+	@mkdir -p $@
+
+# Concatenate the weight parts and verify the result against the published
+# checksum. The blob is rebuilt from scratch on every run of this recipe so a
+# partial file left by an interrupted merge can never be appended to.
+$(TESTGEN_WEIGHTS_OUT): $(TESTGEN_WEIGHTS_IN) $(TESTGEN_WEIGHTS_SUM) | $(TESTGEN_BUILD_DIR)
+	$(if $(strip $(TESTGEN_WEIGHTS_IN)),,$(error no model/weights.sbs.* parts found; is the ai/model submodule checked out?))
+	@rm -f $@
+	@for i in $(TESTGEN_WEIGHTS_IN); do \
+	  echo " $(PPO_MERGE) $$i >> $@"; \
+	  cat $$i >> $@ || exit 1; \
+	done
+	@echo " $(PPO_HASH) $@"
+	@$(SHA256SUM) $(TESTGEN_WEIGHTS_SUM) >/dev/null
+
+$(TESTGEN_OUT): $(HIGHWAY_OBJS) $(SENTENCEPIECE_OBJS) $(GEMMACPP_OBJS) $(TESTGEN_OBJS)
+	@echo " $(PPO_LD) $@"
+	@$(CXX) $^ $(TESTGEN_LDFLAGS) -o $@
+
+# Output directories are order-only prerequisites: they must exist before an
+# object is written (also under `make -j` or when a single target is requested)
+# but their timestamps must not trigger rebuilds.
+$(HIGHWAY_BUILD_DIR)/%.o: $(HIGHWAY_SRC_DIR)/%.cc | $(HIGHWAY_BUILD_DIR)
+	$(call compile,HIGHWAY)
+
+$(HIGHWAY_BUILD_DIR)/%.o: $(HIGHWAY_SRC_DIR)/contrib/sort/%.cc | $(HIGHWAY_BUILD_DIR)
+	$(call compile,HIGHWAY)
+
+$(HIGHWAY_BUILD_DIR)/%.o: $(HIGHWAY_SRC_DIR)/contrib/thread_pool/%.cc | $(HIGHWAY_BUILD_DIR)
+	$(call compile,HIGHWAY)
+
+# Instantiate config.h from the upstream template. It is written next to the
+# template because the vendored sources are compiled with that directory on
+# their include path; `make mrproper` removes it again.
+$(SENTENCEPIECE_CONFIG): $(SENTENCEPIECE_CONFIG).in $(SENTENCEPIECE_HDR_DIR)/VERSION.txt
+	@echo " $(PPO_GEN) $@"
+	@sed -e 's/@PROJECT_VERSION@/$(SENTENCEPIECE_VERSION)/' -e 's/@PROJECT_NAME@/sentencepiece/' $< > $@
+
+# Every SentencePiece object may include config.h, so it has to exist before
+# any of them is compiled (the generated .d files track later changes).
+$(SENTENCEPIECE_OBJS): | $(SENTENCEPIECE_CONFIG)
+
+$(SENTENCEPIECE_BUILD_DIR)/%.o: $(SENTENCEPIECE_SRC_DIR)/%.cc | $(SENTENCEPIECE_BUILD_DIR)
+	$(call compile,SENTENCEPIECE)
+
+$(SENTENCEPIECE_BUILD_DIR)/%.o: $(SENTENCEPIECE_SRC_DIR)/builtin_pb/%.cc | $(SENTENCEPIECE_BUILD_DIR)
+	$(call compile,SENTENCEPIECE)
+
+$(SENTENCEPIECE_BUILD_DIR)/%.o: $(SENTENCEPIECE_HDR_DIR)/third_party/protobuf-lite/%.cc | $(SENTENCEPIECE_BUILD_DIR)
+	$(call compile,SENTENCEPIECE)
+
+$(SENTENCEPIECE_BUILD_DIR)/%.o: $(SENTENCEPIECE_HDR_DIR)/third_party/absl/flags/%.cc | $(SENTENCEPIECE_BUILD_DIR)
+	$(call compile,SENTENCEPIECE)
+
+$(GEMMACPP_BUILD_DIR)/%.o: $(GEMMACPP_SRC_DIR)/gemma/%.cc | $(GEMMACPP_BUILD_DIR)
+	$(call compile,GEMMACPP)
+
+$(GEMMACPP_BUILD_DIR)/%.o: $(GEMMACPP_SRC_DIR)/gemma/instantiations/%.cc | $(GEMMACPP_BUILD_DIR)
+	$(call compile,GEMMACPP)
+
+$(GEMMACPP_BUILD_DIR)/%.o: $(GEMMACPP_SRC_DIR)/evals/%.cc | $(GEMMACPP_BUILD_DIR)
+	$(call compile,GEMMACPP)
+
+$(GEMMACPP_BUILD_DIR)/%.o: $(GEMMACPP_SRC_DIR)/backprop/%.cc | $(GEMMACPP_BUILD_DIR)
+	$(call compile,GEMMACPP)
+
+$(GEMMACPP_BUILD_DIR)/%.o: $(GEMMACPP_SRC_DIR)/compression/%.cc | $(GEMMACPP_BUILD_DIR)
+	$(call compile,GEMMACPP)
+
+$(TESTGEN_BUILD_DIR)/%.o: %.cc | $(TESTGEN_BUILD_DIR)
+	$(call compile,TESTGEN)
+
+-include $(TESTGEN_BUILD_DIR)/*.d
+-include $(HIGHWAY_BUILD_DIR)/*.d
+-include $(GEMMACPP_BUILD_DIR)/*.d
+-include $(SENTENCEPIECE_BUILD_DIR)/*.d
+
+clean:
+	@if [ -d $(TESTGEN_BUILD_DIR) ]; then \
+	  echo " $(PPO_CLEAN) $(TESTGEN_BUILD_DIR)"; \
+	  rm -r $(TESTGEN_BUILD_DIR); \
+	fi
+
+# Besides the build tree, drop everything else this makefile generated.
+mrproper: clean
+	@for f in $(TESTGEN_OUT) $(SENTENCEPIECE_CONFIG); do \
+	  if [ -e $$f ]; then \
+	    echo " $(PPO_CLEAN) $$f"; \
+	    rm $$f; \
+	  fi; \
+	done
diff --git a/ai/model b/ai/model
new file mode 160000
index 0000000..fd286b2
--- /dev/null
+++ b/ai/model
@@ -0,0 +1 @@
+Subproject commit fd286b2a49aaf9bd45a2365d28d350e8b439f1aa
diff --git a/ai/testgen.cc b/ai/testgen.cc
new file mode 100644
index 0000000..a4d84ab
--- /dev/null
+++ b/ai/testgen.cc
@@ -0,0 +1,232 @@
+// testgen: asks a local Gemma model to extend an existing C/C++ unit test
+// translation unit and writes the proposed changes to `testgen.diff`.
+//
+// usage: testgen [-t <num_threads>] --src <src_file> --test <test_file>
+
+#include <charconv>
+#include <cstdio>
+#include <cstdlib>
+#include <exception>
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+#include <random>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <system_error>
+#include <vector>
+
+#include "vendor/gemma.cpp/util/app.h"
+#include "vendor/gemma.cpp/util/args.h"
+#include "vendor/gemma.cpp/gemma/gemma.h"
+#include "vendor/gemma.cpp/evals/benchmark_helper.h"
+
+#define CARBON_AI_ABORT_ERROR(err) HWY_ABORT("%s:%u :: %s", __FILE__, __LINE__, (err))
+#define CARBON_AI_ABORT_USAGE HWY_ABORT("usage: %s [-t <num_threads>] --src <src_file> --test <test_file>", argv[0])
+
+// Scratch file the generated code is written to before it is compiled/diffed.
+static constexpr auto tmp_file_path { "tmp.cc" };
+
+static constexpr auto system_prompt {
+  "Strictly follow the following instructions and rules:\n"
+  "- Just write the function that has been requested, no main, no examples, no nonsense.\n"
+  "- When finished writing the function, do not repeat it, just write it once.\n"
+  "- Do not explain anything, just write the code requested.\n"
+  "- Do not use any markdown formatting at all, just plain text.\n"
+  "- Only write the plain text code with no additional formatting.\n"
+  "- Always use 2 space indenting, no tabs.\n"
+  "- Do not write multiple blocks of code, just one.\n"
+  "- Do not use any third-party dependency, just built-in features.\n"
+  "- Do not remove any original content, always extend it while maintaining original code.\n"
+  "Take all of these instructions into consideration while performing as accurate as possible the following requests.\n"
+};
+static constexpr auto test_code_prompt {
+  "Here is a C/C++ unit test translation unit:"
+};
+static constexpr auto src_code_prompt {
+  "And this is the C/C++ translation unit that it tests:"
+};
+static constexpr auto request_prompt {
+  "Write an extended version of the unit test translation unit that includes additional unit tests that will increase the test coverage of the code under test.\n"
+};
+
+// Returns the contents of `filepath` wrapped in a ``` fenced block.
+// Throws std::runtime_error if the file cannot be opened for reading.
+static inline std::string load_file_contents(const std::string &filepath) {
+  std::ifstream ifs { filepath };
+  if (not ifs) throw std::runtime_error { "unable to open file for reading (`" + filepath + "`)" };
+  // Read the body on its own: inserting an empty streambuf sets failbit, which
+  // would otherwise swallow the closing fence for an empty file.
+  std::ostringstream contents;
+  contents << ifs.rdbuf();
+  return "```\n" + contents.str() + "\n```\n";
+}
+
+// Writes `contents` to `filepath`, replacing whatever the file held before.
+// Throws std::runtime_error if the file cannot be opened for writing.
+static inline void write_file(const std::string &filepath, const std::string &contents) {
+  std::ofstream ofs { filepath };
+  if (not ofs) throw std::runtime_error { "unable to open/create file for writing (`" + filepath + "`)" };
+  ofs << contents;
+}
+
+// Quotes `arg` so that a POSIX shell passes it on as one literal word.
+static inline std::string shell_quote(const std::string &arg) {
+  std::string quoted { "'" };
+  for (const char c : arg) {
+    if (c == '\'') quoted += "'\\''";
+    else quoted += c;
+  }
+  quoted += "'";
+  return quoted;
+}
+
+// Parses `arg` as a strictly positive decimal thread count.
+// Returns 0 when `arg` is not a valid count.
+static inline int parse_num_threads(const std::string &arg) {
+  int value {0};
+  const char *last { arg.data() + arg.size() };
+  const auto [ptr, ec] { std::from_chars(arg.data(), last, value) };
+  if (ec != std::errc {} or ptr != last or value <= 0) return 0;
+  return value;
+}
+
+// Builds the full prompt (instructions, test code, source code, request) and
+// returns its token ids with the BOS token prepended.
+static inline std::vector<int> tokenize(const gcpp::GemmaTokenizer &tokenizer, const std::string &test_code, const std::string &src_code) {
+  std::string prompt {
+    std::string(system_prompt) + "\n" +
+    test_code_prompt + "\n" +
+    test_code + "\n" +
+    src_code_prompt + "\n" +
+    src_code + "\n" +
+    request_prompt + "\n"
+  };
+  std::vector<int> tokens;
+  HWY_ASSERT(tokenizer.Encode(prompt, &tokens));
+  tokens.insert(tokens.begin(), gcpp::BOS_ID);
+  return tokens;
+}
+
+// Drops every line of `ss` that contains a ``` fence marker, in place.
+static inline void preprocess_output(std::stringstream &ss) {
+  std::string line;
+  std::vector<std::string> lines;
+  while (std::getline(ss, line)) lines.emplace_back(line);
+  ss.str("");
+  ss.clear();
+  for (const auto &i : lines) {
+    if (i.find("```") == std::string::npos) ss << i << '\n';
+  }
+}
+
+// Reports whether the code held in `ss` compiles, by running clang++ on a
+// scratch copy of it. The scratch file is removed again before returning.
+static inline bool it_builds(const std::stringstream &ss) {
+  write_file(tmp_file_path, ss.str());
+  const std::string cmd { std::string("clang++ -S ") + tmp_file_path + " -o /dev/null >/dev/null 2>&1" };
+  const int result { std::system(cmd.c_str()) };
+  std::remove(tmp_file_path);
+  return result == 0;
+}
+
+int main(int argc, char **argv) try {
+  gcpp::LoaderArgs loader(argc, argv);
+  loader.tokenizer = "model/tokenizer.spm";
+  loader.weights = "build/weights.sbs";
+  loader.model_type_str = "7b-it";
+  loader.weight_type_str = "sfp";
+
+  gcpp::InferenceArgs inference(argc, argv);
+  inference.max_tokens = gcpp::kSeqLen;
+  inference.max_generated_tokens = 1024;
+  inference.temperature = 0.2f;
+  inference.deterministic = false;
+  inference.multiturn = false;
+
+  // `-t <num_threads>` is optional and, when given, must come first.
+  const bool has_threads_flag { argc >= 3 and argv[1] == std::string("-t") };
+  const int first_arg { has_threads_flag ? 3 : 1 };
+  if (argc != first_arg + 4) CARBON_AI_ABORT_USAGE;
+  if (argv[first_arg] != std::string("--src")) CARBON_AI_ABORT_USAGE;
+  if (argv[first_arg + 2] != std::string("--test")) CARBON_AI_ABORT_USAGE;
+  const std::string src_file { argv[first_arg + 1] };
+  const std::string test_file { argv[first_arg + 3] };
+  if (not std::filesystem::exists(src_file)) CARBON_AI_ABORT_USAGE;
+  if (not std::filesystem::exists(test_file)) CARBON_AI_ABORT_USAGE;
+
+  gcpp::AppArgs app { argc, argv };
+  app.num_threads = 1;
+  if (has_threads_flag) {
+    const int num_threads { parse_num_threads(argv[2]) };
+    if (num_threads == 0) CARBON_AI_ABORT_USAGE;
+    app.num_threads = num_threads;
+  }
+  else std::cout << "WARNING: using 1 thread for inference by default.\n" << std::endl;
+  hwy::ThreadPool thread_pool(app.num_threads);
+  if (app.num_threads > 10) gcpp::PinWorkersToCores(thread_pool);
+
+  if (const char* err { loader.Validate() }) CARBON_AI_ABORT_ERROR(err);
+  if (const char* err { inference.Validate() }) CARBON_AI_ABORT_ERROR(err);
+
+  std::string src_file_contents { load_file_contents(src_file) };
+  std::string test_file_contents { load_file_contents(test_file) };
+
+  gcpp::Gemma model { gcpp::CreateGemma(loader, thread_pool) };
+  gcpp::KVCache kv_cache { gcpp::KVCache::Create(model.Info().model) };
+  gcpp::TimingInfo timings;
+  std::random_device rand_dev;
+  std::mt19937 rand_gen { rand_dev() };
+
+  std::vector<int> prompt { tokenize(model.Tokenizer(), test_file_contents, src_file_contents) };
+  size_t prompt_size { prompt.size() };
+  size_t pos {0};
+  std::stringstream buf;
+  // Token callback handed to Generate(): prints a dot for each of the first
+  // `prompt_size` calls (and for EOS), and echoes/records every other token.
+  auto stream_token = [prompt_size, &pos, &buf, &model](int token, float) {
+    ++pos;
+    if (pos > prompt_size and token != gcpp::EOS_ID) {
+      if (pos == prompt_size + 1) std::cout << std::endl << std::flush;
+      std::string tok;
+      HWY_ASSERT(model.Tokenizer().Decode(std::vector<int>{token}, &tok));
+      buf << tok;
+      std::cout << tok << std::flush;
+    }
+    else std::cout << "." << std::flush;
+    return true;
+  };
+
+  gcpp::RuntimeConfig runtime_conf {
+    .max_tokens = inference.max_tokens,
+    .max_generated_tokens = inference.max_generated_tokens,
+    .temperature = inference.temperature,
+    .gen = &rand_gen,
+    .stream_token = stream_token
+  };
+
+  model.Generate(runtime_conf, prompt, 0, kv_cache, timings);
+
+  std::cout << "\n\nStats:" << std::endl;
+  std::cout << " prefill_tok_sec: " << timings.prefill_tok_sec << std::endl;
+  std::cout << " gen_tok_sec: " << timings.gen_tok_sec << std::endl;
+  std::cout << " time_to_first_token: " << timings.time_to_first_token << std::endl;
+
+  preprocess_output(buf);
+  if (it_builds(buf)) std::cout << "\nIt builds (:D)" << std::endl;
+  else std::cout << "\nIt doesn't build (D:)" << std::endl;
+
+  write_file(tmp_file_path, buf.str());
+  // The test path comes from the command line, so it is quoted for the shell;
+  // `--` keeps a path that starts with `-` from being read as a diff option.
+  const std::string diff_cmd { "diff -u -- " + shell_quote(test_file) + " " + tmp_file_path + " > testgen.diff" };
+  // diff exits with status 1 when the files differ, so the result is ignored.
+  [[maybe_unused]] int result { std::system(diff_cmd.c_str()) };
+  std::remove(tmp_file_path);
+  std::cout << "\nOutput from BSD Carbon AI written to: `testgen.diff`" << std::endl;
+}
+catch (const std::exception &e) {
+  std::cerr << "ERROR: " << e.what() << std::endl;
+  return EXIT_FAILURE;
+}
diff --git a/ai/vendor/gemma.cpp b/ai/vendor/gemma.cpp
new file mode 160000
index 0000000..960ff4b
--- /dev/null
+++ b/ai/vendor/gemma.cpp
@@ -0,0 +1 @@
+Subproject commit 960ff4b4ec583d77ecad7a14c5149012240cc7e0
diff --git a/ai/vendor/highway b/ai/vendor/highway
new file mode 160000
index 0000000..1cf089d
--- /dev/null
+++ b/ai/vendor/highway
@@ -0,0 +1 @@
+Subproject commit 1cf089d07c8fb5531cea04b505ac8d152581f401
diff --git a/ai/vendor/sentencepiece b/ai/vendor/sentencepiece
new file mode 160000
index 0000000..2de10cb
--- /dev/null
+++ b/ai/vendor/sentencepiece
@@ -0,0 +1 @@
+Subproject commit 2de10cb30e982b980125d4713236dd2b29cc5f0c