{"payload":{"feedbackUrl":"https://github.com/orgs/community/discussions/53140","repo":{"id":689773665,"defaultBranch":"main","name":"llamafile","ownerLogin":"Mozilla-Ocho","currentUserCanPush":false,"isFork":false,"isEmpty":false,"createdAt":"2023-09-10T21:12:32.000Z","ownerAvatar":"https://avatars.githubusercontent.com/u/117940224?v=4","public":true,"private":false,"isOrgOwned":true},"refInfo":{"name":"","listCacheKey":"v0:1724312794.0","currentOid":""},"activityList":{"items":[{"before":"c44664b4b251b65d03d9c47919e0fa6b9d63c520","after":"98eff09ecc472f133c2c8cc6a6258f18899f37a0","ref":"refs/heads/main","pushedAt":"2024-08-24T14:16:43.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"jart","name":"Justine Tunney","path":"/jart","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/49262?s=80&v=4"},"commit":{"message":"Quantize TriLM models using Q2_K_S (#552)","shortMessageHtmlLink":"Quantize TriLM models using Q2_K_S (#552)"}},{"before":"53d1990a1616416c59ff92bb9838cca01182b881","after":"9662d430ef023b0e4c4c64e1e1db274ad43e693f","ref":"refs/heads/fp8","pushedAt":"2024-08-24T08:19:21.000Z","pushType":"force_push","commitsCount":0,"pusher":{"login":"jart","name":"Justine Tunney","path":"/jart","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/49262?s=80&v=4"},"commit":{"message":"Fix something","shortMessageHtmlLink":"Fix something"}},{"before":"1e608265334937ee790efb858be04c6b0f777bdb","after":"53d1990a1616416c59ff92bb9838cca01182b881","ref":"refs/heads/fp8","pushedAt":"2024-08-24T08:19:01.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"jart","name":"Justine Tunney","path":"/jart","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/49262?s=80&v=4"},"commit":{"message":"Fix something","shortMessageHtmlLink":"Fix something"}},{"before":"2eac9ad8a8a74e3dd15d698ce6522a68fbe8b303","after":"1e608265334937ee790efb858be04c6b0f777bdb","ref":"refs/heads/fp8","pushedAt":"2024-08-24T07:32:34.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"jart","name":"Justine Tunney","path":"/jart","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/49262?s=80&v=4"},"commit":{"message":"Add AVX2 impl","shortMessageHtmlLink":"Add AVX2 impl"}},{"before":"b6a38c14bc3be79b3d7fa2fb48f2a008e69378f9","after":"2eac9ad8a8a74e3dd15d698ce6522a68fbe8b303","ref":"refs/heads/fp8","pushedAt":"2024-08-24T07:12:16.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"jart","name":"Justine Tunney","path":"/jart","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/49262?s=80&v=4"},"commit":{"message":"Provide avx512f fallback","shortMessageHtmlLink":"Provide avx512f fallback"}},{"before":"78571d8b588417ecf45543cbebd2d25ad424c238","after":"b6a38c14bc3be79b3d7fa2fb48f2a008e69378f9","ref":"refs/heads/fp8","pushedAt":"2024-08-24T06:58:16.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"jart","name":"Justine Tunney","path":"/jart","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/49262?s=80&v=4"},"commit":{"message":"Introduce better fp8 to fp32 conversion\n\nSee https://github.com/Mozilla-Ocho/llamafile/discussions/549","shortMessageHtmlLink":"Introduce better fp8 to fp32 conversion"}},{"before":"13d8706962496822bd3befb78a0e8638ee9c5ffc","after":"78571d8b588417ecf45543cbebd2d25ad424c238","ref":"refs/heads/fp8","pushedAt":"2024-08-24T06:21:05.000Z","pushType":"force_push","commitsCount":0,"pusher":{"login":"jart","name":"Justine Tunney","path":"/jart","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/49262?s=80&v=4"},"commit":{"message":"Introduce bf16 transcribing code","shortMessageHtmlLink":"Introduce bf16 transcribing code"}},{"before":"52d042ff0c2ad470ecdc73f8a8d4827afb22a086","after":"13d8706962496822bd3befb78a0e8638ee9c5ffc","ref":"refs/heads/fp8","pushedAt":"2024-08-24T05:30:59.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"jart","name":"Justine Tunney","path":"/jart","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/49262?s=80&v=4"},"commit":{"message":"Introduce bf16 transcribing code","shortMessageHtmlLink":"Introduce bf16 transcribing code"}},{"before":"2e4de879166f61a2094481d677fee60d017a1dfd","after":"52d042ff0c2ad470ecdc73f8a8d4827afb22a086","ref":"refs/heads/fp8","pushedAt":"2024-08-24T04:29:53.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"jart","name":"Justine Tunney","path":"/jart","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/49262?s=80&v=4"},"commit":{"message":"Boost fp8 prefill performance by 33 percent\n\n- F8 goes 127 tok/sec on my znver4 threadripper\n- F16 goes 258 tok/sec on my znver4 threadripper\n- BF16 goes 381 tok/sec on my znver4 threadripper","shortMessageHtmlLink":"Boost fp8 prefill performance by 33 percent"}},{"before":"ca063eaf4c38c3108fa21d69612b95f157821ecd","after":"2e4de879166f61a2094481d677fee60d017a1dfd","ref":"refs/heads/fp8","pushedAt":"2024-08-23T02:56:27.000Z","pushType":"force_push","commitsCount":0,"pusher":{"login":"jart","name":"Justine Tunney","path":"/jart","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/49262?s=80&v=4"},"commit":{"message":"Clean up fp8 code","shortMessageHtmlLink":"Clean up fp8 code"}},{"before":"c10a65c6e29f8355089be62f072c66397e0927e1","after":"ca063eaf4c38c3108fa21d69612b95f157821ecd","ref":"refs/heads/fp8","pushedAt":"2024-08-23T02:56:19.000Z","pushType":"push","commitsCount":2,"pusher":{"login":"jart","name":"Justine Tunney","path":"/jart","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/49262?s=80&v=4"},"commit":{"message":"Clean up fp8 code","shortMessageHtmlLink":"Clean up fp8 code"}},{"before":"b81b5906686b94c70c1809eac6aada0c7469e8d9","after":"c10a65c6e29f8355089be62f072c66397e0927e1","ref":"refs/heads/fp8","pushedAt":"2024-08-23T02:03:09.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"jart","name":"Justine Tunney","path":"/jart","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/49262?s=80&v=4"},"commit":{"message":"Add vectorized fp8 code for avx512","shortMessageHtmlLink":"Add vectorized fp8 code for avx512"}},{"before":"42fa4226502132f456329d1b87a032fa82739375","after":"b81b5906686b94c70c1809eac6aada0c7469e8d9","ref":"refs/heads/fp8","pushedAt":"2024-08-22T18:20:21.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"jart","name":"Justine Tunney","path":"/jart","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/49262?s=80&v=4"},"commit":{"message":"Address review comment","shortMessageHtmlLink":"Address review comment"}},{"before":null,"after":"42fa4226502132f456329d1b87a032fa82739375","ref":"refs/heads/fp8","pushedAt":"2024-08-22T07:46:34.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"jart","name":"Justine Tunney","path":"/jart","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/49262?s=80&v=4"},"commit":{"message":"Add support for FP8 (E4M3)\n\nSee https://github.com/Mozilla-Ocho/llamafile/discussions/549","shortMessageHtmlLink":"Add support for FP8 (E4M3)"}},{"before":"2f1d558b14c5fef3f4cb43dee4e3fd6aebe95b07","after":"01cdfbd9cc2ff8842ae10020ee2b339e5bc58c9c","ref":"refs/heads/llama-matmul","pushedAt":"2024-08-22T00:38:22.000Z","pushType":"force_push","commitsCount":0,"pusher":{"login":"jart","name":"Justine Tunney","path":"/jart","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/49262?s=80&v=4"},"commit":{"message":"Try broken experiment with C","shortMessageHtmlLink":"Try broken experiment with C"}},{"before":"d6a505c03b7852e24b676dfd138afd9ebac3b6ad","after":"2f1d558b14c5fef3f4cb43dee4e3fd6aebe95b07","ref":"refs/heads/llama-matmul","pushedAt":"2024-08-21T20:39:09.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"jart","name":"Justine Tunney","path":"/jart","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/49262?s=80&v=4"},"commit":{"message":"Try broken experiment with C","shortMessageHtmlLink":"Try broken experiment with C"}},{"before":"6287b601e50169c31a4aa45900ccd008ffda0e38","after":"c44664b4b251b65d03d9c47919e0fa6b9d63c520","ref":"refs/heads/main","pushedAt":"2024-08-21T03:41:33.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"jart","name":"Justine Tunney","path":"/jart","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/49262?s=80&v=4"},"commit":{"message":"Always favor fp16 arithmetic in tinyBLAS\n\nIt was assumed earlier that upcasting would help precision. However this\nwasn't the case, according to levenshtein distance in whisperfile output\nwhich tells me this change makes things objectively better in noticeable\nways. So we now avoid the fp16 conversion, when the ISA is available. It\nshould be perfectly safe and accurate, even for large sums, since we now\nhave the ruler reduction divide and conquer approach, in tinyBLAS::gemm.","shortMessageHtmlLink":"Always favor fp16 arithmetic in tinyBLAS"}},{"before":"6230c840fe1517e820fdf99c57260274d1e10e27","after":"d6a505c03b7852e24b676dfd138afd9ebac3b6ad","ref":"refs/heads/llama-matmul","pushedAt":"2024-08-20T03:08:12.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"jart","name":"Justine Tunney","path":"/jart","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/49262?s=80&v=4"},"commit":{"message":"Get new algorithm fully working\n\nI'm reasonably confident this implementation is solid. It now supports\nFP32, FP16, and BF16. I even found some new optimizations. Sadly it is\nstill going half the speed of tinyBLAS on my workstation. I tried with\nbig models and big prompts. It might just be that it's using AVX2, and\ntinyBLAS is using AVX512. Still, I would have liked to see the speedup\nthat the microbenchmarks reported. I wish I understood, why this isn't\nmeeting expectations so far.","shortMessageHtmlLink":"Get new algorithm fully working"}},{"before":"8a5a7242c51a2142a6e3a0522a52d50e0d3bc723","after":"6287b601e50169c31a4aa45900ccd008ffda0e38","ref":"refs/heads/main","pushedAt":"2024-08-19T23:18:45.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"jart","name":"Justine Tunney","path":"/jart","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/49262?s=80&v=4"},"commit":{"message":"Update markdown documentation on audio conversion","shortMessageHtmlLink":"Update markdown documentation on audio conversion"}},{"before":"dc99002cb55abaddff39f24faccb6857c3bafc59","after":"8a5a7242c51a2142a6e3a0522a52d50e0d3bc723","ref":"refs/heads/main","pushedAt":"2024-08-19T23:13:49.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"jart","name":"Justine Tunney","path":"/jart","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/49262?s=80&v=4"},"commit":{"message":"Improve whisperfile flag handling slightly","shortMessageHtmlLink":"Improve whisperfile flag handling slightly"}},{"before":"dab2317f6b2d64e6bf6b1e5d95fe102af1886d01","after":"dc99002cb55abaddff39f24faccb6857c3bafc59","ref":"refs/heads/main","pushedAt":"2024-08-19T23:11:51.000Z","pushType":"push","commitsCount":3,"pusher":{"login":"jart","name":"Justine Tunney","path":"/jart","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/49262?s=80&v=4"},"commit":{"message":"Convert audio files (mp3/flac/ogg) to 16khz wav\n\nIt's no longer necessary to run sox or ffmpeg beforehand, when using the\nwhisperfile command. If you're audio file isn't in the preferred format,\nit'll be converted for you automatically using the embedded audio tools.","shortMessageHtmlLink":"Convert audio files (mp3/flac/ogg) to 16khz wav"}},{"before":"c8b483f9912435967bb96dc643fb36f5a14d4456","after":"dab2317f6b2d64e6bf6b1e5d95fe102af1886d01","ref":"refs/heads/main","pushedAt":"2024-08-19T20:37:17.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"jart","name":"Justine Tunney","path":"/jart","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/49262?s=80&v=4"},"commit":{"message":"Fix bug in whisperfile man page.","shortMessageHtmlLink":"Fix bug in whisperfile man page."}},{"before":"b17ccd1ebaf534b63ee1dcb727dd2af479502339","after":"c8b483f9912435967bb96dc643fb36f5a14d4456","ref":"refs/heads/main","pushedAt":"2024-08-19T20:06:47.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"jart","name":"Justine Tunney","path":"/jart","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/49262?s=80&v=4"},"commit":{"message":"Mention --no-prints in whisperfile man page\n\nFixes #544","shortMessageHtmlLink":"Mention --no-prints in whisperfile man page"}},{"before":null,"after":"6230c840fe1517e820fdf99c57260274d1e10e27","ref":"refs/heads/llama-matmul","pushedAt":"2024-08-18T23:00:55.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"jart","name":"Justine Tunney","path":"/jart","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/49262?s=80&v=4"},"commit":{"message":"Check-in llama matmul experiment\n\nThis code is the result of a collaboration between Aman Salykov and\nJustine Tunney. Aman managed to figure out a simple, elegant way to\nimplement the BLIS paper, which is something I spent months trying.\nHowever I could never get it to work nearly as well as this code.\n\nBoth BLIS and MKL design their microkernels to use the broadcast\ninstruction to load elements from the B matrix. This doesn't reduce the\nnumber of memory load operations compared to tinyBLAS, but it does\nreduce the number of bytes that need to be transferred. For example,\nwith BF16, that should mean we can just load a 16-byte value whenever we\nneed to fetch data from B rather than needing to pull in a whole vector.\n\nIn order for the broadcast trick to work, you have parallelize across\nthe K dimension, which is tricky. That's because you need to load tiles\nfrom both the A and B matrices before calling the microkernel, and they\nneed to be rearranged, on \"packed\", in a special way.\n\nWhat impresses me about Aman's technique is that his code appears to be\ncapable of parallelizing dot products on the K dimension across threads\nwithout using atomics or locks. It only needs two specially placed CUDA\nstyle syncthreads() directives, which impose an order on memory ops.\n\nSee https://salykova.github.io/matmul-cpu\nSee https://github.com/salykova/matmul.c","shortMessageHtmlLink":"Check-in llama matmul experiment"}},{"before":"65745b0da84043dcd3737dcc7e2dacfd6b4453b3","after":"b17ccd1ebaf534b63ee1dcb727dd2af479502339","ref":"refs/heads/main","pushedAt":"2024-08-18T17:11:51.000Z","pushType":"push","commitsCount":4,"pusher":{"login":"jart","name":"Justine Tunney","path":"/jart","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/49262?s=80&v=4"},"commit":{"message":"Release llamafile v0.8.13","shortMessageHtmlLink":"Release llamafile v0.8.13"}},{"before":"202e554509fe8108fb85c2c5dc8ddf7e6ad98d3e","after":"65745b0da84043dcd3737dcc7e2dacfd6b4453b3","ref":"refs/heads/main","pushedAt":"2024-08-18T14:35:18.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"jart","name":"Justine Tunney","path":"/jart","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/49262?s=80&v=4"},"commit":{"message":"Fix undefined behavior in llava clip","shortMessageHtmlLink":"Fix undefined behavior in llava clip"}},{"before":"b1c170330546f1929e84b6a4e57c01854e277447","after":"202e554509fe8108fb85c2c5dc8ddf7e6ad98d3e","ref":"refs/heads/main","pushedAt":"2024-08-18T04:23:57.000Z","pushType":"push","commitsCount":3,"pusher":{"login":"jart","name":"Justine Tunney","path":"/jart","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/49262?s=80&v=4"},"commit":{"message":"Write a lot of documentation and man pages","shortMessageHtmlLink":"Write a lot of documentation and man pages"}},{"before":"1ca6e4292c812033babc3fd80ab3dd72be76a77e","after":"b1c170330546f1929e84b6a4e57c01854e277447","ref":"refs/heads/main","pushedAt":"2024-08-17T18:00:51.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"jart","name":"Justine Tunney","path":"/jart","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/49262?s=80&v=4"},"commit":{"message":"Sync more sources and fix regressions","shortMessageHtmlLink":"Sync more sources and fix regressions"}},{"before":"d0b5e8f2ac1df20c72ba177cfd208885ae1afc6d","after":"1ca6e4292c812033babc3fd80ab3dd72be76a77e","ref":"refs/heads/main","pushedAt":"2024-08-16T22:55:00.000Z","pushType":"push","commitsCount":2,"pusher":{"login":"jart","name":"Justine Tunney","path":"/jart","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/49262?s=80&v=4"},"commit":{"message":"Sync with upstream projects","shortMessageHtmlLink":"Sync with upstream projects"}},{"before":"bc7d85cec72f19d8134afb73290ddfbd15ccb122","after":"d0b5e8f2ac1df20c72ba177cfd208885ae1afc6d","ref":"refs/heads/main","pushedAt":"2024-08-16T20:04:37.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"jart","name":"Justine Tunney","path":"/jart","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/49262?s=80&v=4"},"commit":{"message":"Upgrade to Cosmopolitan v3.7.1","shortMessageHtmlLink":"Upgrade to Cosmopolitan v3.7.1"}}],"hasNextPage":true,"hasPreviousPage":false,"activityType":"all","actor":null,"timePeriod":"all","sort":"DESC","perPage":30,"cursor":"djE6ks8AAAAEoxUhygA","startCursor":null,"endCursor":null}},"title":"Activity ยท Mozilla-Ocho/llamafile"}