diff --git a/README.md b/README.md
index ec4e2cc5..04f41fdc 100644
--- a/README.md
+++ b/README.md
@@ -26,14 +26,7 @@
 gcc -O3 -o run run.c -lm
 ./run out/model.bin
 ```
 
-You'll notice that this just streams the raw tokens. (See [performance](#performance) for compile flags that can significantly speed this up). Unless you can read those directly, you'll want to translate them into text. For now sadly we have to run this C code through a simple wrapper that does the translation (see the file, it's just 30 lines):
-
-```bash
-pip install sentencepiece
-python run_wrap.py
-```
-
-You'll see text stream. On my M1 MacBook Air this runs at ~100 tokens/s, not bad for super naive fp32 single-threaded C code. Sample output:
+You'll see the sampled text stream by. On my M1 MacBook Air this runs at ~100 tokens/s, not bad for super naive fp32 single-threaded C code. See [performance](#performance) for compile flags that can significantly speed this up. Sample output:
 
 *Once upon a time, there was a boy named Timmy. Timmy loved to play sports with his friends. He was very good at throwing and catching balls. One day, Timmy's mom gave him a new shirt to wear to a party. Timmy thought it was impressive and asked his mom to explain what a shirt could be for. "A shirt is like a special suit for a basketball game," his mom said. Timmy was happy to hear that and put on his new shirt. He felt like a soldier going to the army and shouting. From that day on, Timmy wore his new shirt every time he played sports with his friends at the party. Once upon a time, there was a little girl named Lily. She loved to play outside with her friends. One day, Lily and her friend Emma were playing with a ball. Emma threw the ball too hard and it hit Lily's face. Lily felt embarrassed and didn't want to play anymore. Emma asked Lily what was wrong, and Lily told her about her memory. Emma told Lily that she was embarrassed because she had thrown the ball too hard. Lily felt bad
@@ -74,12 +67,6 @@ You can now run it simply as
 ./run out/model.bin
 ```
 
-But note that this only emits the SentencePiece tokens. To decode the tokens into text too, run this script through a simple wrapper:
-
-```bash
-python run_wrap.py
-```
-
 Watch the tokens stream by, fun! We can also run the PyTorch inference script for comparison (to run, add [model.ckpt](https://drive.google.com/file/d/1SM0rMxzy7babB-v4MfTg1GFqOCgWar5w/view?usp=share_link) to /out if you haven't already):
 
 ```bash
@@ -124,8 +111,7 @@ Also, I saw someone report higher throughput replacing `gcc` with `clang`.
 
 ## unsorted todos
 
-- why SentencePiece can't iteratively decode properly?
-- would love to delete run_wrap.py and just directly use C code to string
+- why is there a leading space in the C sampling code when we `./run`?
 - todo multiquery support? doesn't seem as useful for smaller models that run on CPU (?)
 - todo support inferencing beyond max_seq_len steps, have to think through the kv cache
 - why is MFU so low (~10%) on my A100 40GB for training?
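Aside: the wrapper can go away because, in the diffs below, the SentencePiece vocab is exported to a flat binary file, `tokenizer.bin`, that `run.c` loads at startup. The format is simply, for each of the `vocab_size` token ids in order: a 4-byte little-endian length, then that many UTF-8 bytes. A minimal Python sketch of a reader for this format (the helper name `read_tokenizer_bin` and the default of 32000, Llama 2's vocab size, are mine, not part of the diff):

```python
import struct

def read_tokenizer_bin(path="tokenizer.bin", vocab_size=32000):
    """Read back the token strings: one (uint32 length, utf-8 bytes) record per id."""
    vocab = []
    with open(path, "rb") as f:
        for _ in range(vocab_size):
            (n,) = struct.unpack("<I", f.read(4))  # little-endian length prefix
            vocab.append(f.read(n).decode("utf-8"))  # the token's text
    return vocab
```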
diff --git a/run.c b/run.c
index dc5cd9ce..a479bc1c 100644
--- a/run.c
+++ b/run.c
@@ -378,7 +378,6 @@ int argmax(float* v, int n) {
 // ----------------------------------------------------------------------------
 
 int main(int argc, char *argv[]) {
-    setbuf(stdout, NULL); // disable stdout buffering
 
     // poor man's C argparse
     char *checkpoint = NULL;
@@ -412,6 +411,25 @@ int main(int argc, char *argv[]) {
     }
     fread(&config, sizeof(Config), 1, file);
 
+    // init the Tokenizer: read back the vocab strings written by tokenizer.py
+    char** vocab = (char**)malloc(config.vocab_size * sizeof(char*));
+    {
+        FILE *file = fopen("tokenizer.bin", "rb");
+        if (!file) {
+            printf("Unable to open the tokenizer file tokenizer.bin! Run "
+                "python tokenizer.py to convert tokenizer.model -> tokenizer.bin\n");
+            return 1;
+        }
+        int len;
+        for (int i = 0; i < config.vocab_size; i++) {
+            fread(&len, sizeof(int), 1, file);
+            vocab[i] = (char *)malloc(len + 1);
+            fread(vocab[i], len, 1, file);
+            vocab[i][len] = '\0'; // add the string terminating token
+        }
+        fclose(file);
+    }
+
     // create and init the Transformer
     TransformerWeights weights;
     malloc_weights(&weights, &config);
@@ -421,8 +439,9 @@ int main(int argc, char *argv[]) {
     // create and init the application RunState
     RunState state;
     malloc_run_state(&state, &config);
-
+    // start a timer so we can report achieved tok/s at the end
+    clock_t start = clock();
     int next;
     int token = 1; // 1 = BOS token in Llama-2 sentencepiece
     int pos = 0;
@@ -443,14 +462,24 @@ int main(int argc, char *argv[]) {
             // we now want to sample from this distribution to get the next token
             next = sample(state.logits, config.vocab_size);
         }
-        printf("%d\n", next);
+        printf("%s", vocab[next]);
+        fflush(stdout);
         // advance forward
         token = next;
         pos++;
     }
+    printf("\n");
+
+    // report our achieved tok/s
+    clock_t end = clock();
+    double elapsed = (double)(end - start) / CLOCKS_PER_SEC;
+    printf("achieved tok/s: %f\n", config.seq_len / elapsed);
+
     // memory cleanup
     free_run_state(&state);
     free_weights(&weights);
+    for (int i = 0; i < config.vocab_size; i++) { free(vocab[i]); }
+    free(vocab);
 
     return 0;
 }
diff --git a/run_wrap.py b/run_wrap.py
deleted file mode 100644
index c1b7a720..00000000
--- a/run_wrap.py
+++ /dev/null
@@ -1,33 +0,0 @@
-"""
-wrapper around run.c
-mostly deals with the sentencepiece encoding/decoding
-C code does all the transformer inference of the individual tokens
-"""
-
-from tokenizer import Tokenizer
-import subprocess
-import time
-
-# specify your command
-command = ["./run", "out/model.bin"]
-
-# Start the process
-proc = subprocess.Popen(command, stdout=subprocess.PIPE)
-enc = Tokenizer()
-
-t0 = time.time()
-tokens = []
-last = ''
-for line in proc.stdout:
-    token = int(line.decode('utf-8').strip())
-    dec = enc.decode(tokens + [token])
-    chunk = dec[len(last):]
-    print(chunk, end='',flush=True)
-    tokens.append(token)
-    last = dec
-t1 = time.time()
-# seeking help: how can we do streaming inference in sentencepiece properly?
-# or even delete sentencepiece entirely?
-
-print(f"\nachieved tok/s: {len(tokens) / (t1 - t0)}")
-proc.wait()
diff --git a/tokenizer.bin b/tokenizer.bin
new file mode 100644
index 00000000..18a803f9
Binary files /dev/null and b/tokenizer.bin differ
diff --git a/tokenizer.py b/tokenizer.py
index 765b30d8..a147239a 100644
--- a/tokenizer.py
+++ b/tokenizer.py
@@ -9,6 +9,7 @@
 from sentencepiece import SentencePieceProcessor
 
 TOKENIZER_MODEL = "tokenizer.model" # the llama sentencepiece tokenizer model
+TOKENIZER_BIN = "tokenizer.bin" # binary version of the tokenizer for inference in C
 
 class Tokenizer:
     def __init__(self):
@@ -36,3 +37,29 @@ def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
 
     def decode(self, t: List[int]) -> str:
         return self.sp_model.decode(t)
+
+    def export(self):
+        tokens = []
+        for i in range(self.n_words):
+
+            # decode the token and light postprocessing
+            t = self.sp_model.id_to_piece(i)
+            if i == self.bos_id:
+                t = '\n<s>\n'
+            elif i == self.eos_id:
+                t = '\n</s>\n'
+            elif len(t) == 6 and t.startswith('<0x') and t.endswith('>'):
+                t = chr(int(t[3:5], 16)) # e.g. make '<0x01>' into '\x01'
+            t = t.replace('▁', ' ') # sentencepiece uses this as the whitespace
+
+            tokens.append(t)
+
+        with open(TOKENIZER_BIN, 'wb') as f:
+            for token in tokens:
+                bytes = token.encode('utf-8')
+                f.write((len(bytes)).to_bytes(4, 'little')) # write length of bytes
+                f.write(bytes) # write token bytes
+
+if __name__ == "__main__":
+    t = Tokenizer()
+    t.export()
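To make the postprocessing in `export()` concrete: regular pieces have sentencepiece's `▁` word-boundary marker rewritten to a plain space (which is why `run.c` can simply print `vocab[next]` for each sampled id), and byte-fallback pieces like `<0x0A>` become the single raw byte they stand for. A standalone sketch of that per-piece logic, for illustration only (BOS/EOS special-casing omitted):

```python
# mirrors the per-piece postprocessing inside Tokenizer.export() above
def postprocess(piece: str) -> str:
    if len(piece) == 6 and piece.startswith('<0x') and piece.endswith('>'):
        return chr(int(piece[3:5], 16))  # byte-fallback piece -> the raw byte itself
    return piece.replace('▁', ' ')  # word-boundary marker -> plain space

assert postprocess('▁hello') == ' hello'
assert postprocess('<0x0A>') == '\n'  # the newline byte token
```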