Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions examples/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
}
} else if (arg == "-tt" || arg == "--token_test") {
params.token_test = get_next_arg(i, argc, argv, arg, params);
} else if (arg == "--perplexity") {
params.perplexity = true;
}
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
Expand Down Expand Up @@ -112,6 +114,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty);
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
fprintf(stderr, "\n");
}
Expand Down Expand Up @@ -765,6 +768,22 @@ float similarity(const std::string & s0, const std::string & s1) {
return 1.0f - (dist / std::max(s0.size(), s1.size()));
}

// Convert raw logits into a probability distribution using the numerically
// stable softmax: probs[i] = exp(logits[i] - max) / sum_j exp(logits[j] - max).
// Subtracting the maximum logit keeps expf from overflowing for large inputs.
// Returns an empty vector for empty input (the original read logits[0]
// unconditionally, which is undefined behavior on an empty vector).
std::vector<float> softmax(const std::vector<float> & logits) {
    std::vector<float> probs(logits.size());
    if (logits.empty()) {
        return probs;
    }
    // Stability shift: the largest logit maps to exp(0) = 1.
    const float max_logit = *std::max_element(logits.begin(), logits.end());
    // Accumulate in double to reduce rounding error over large vocabularies.
    double sum_exp = 0.0;
    for (size_t i = 0; i < logits.size(); i++) {
        const float exp_logit = expf(logits[i] - max_logit);
        sum_exp += exp_logit;
        probs[i] = exp_logit;
    }
    // Normalize so the probabilities sum to 1.
    for (size_t i = 0; i < probs.size(); i++) {
        probs[i] /= sum_exp;
    }
    return probs;
}

bool sam_params_parse(int argc, char ** argv, sam_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
Expand Down
5 changes: 5 additions & 0 deletions examples/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ struct gpt_params {
int32_t interactive_port = -1;

int32_t n_gpu_layers = 0;

bool perplexity = false;
};

bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
Expand Down Expand Up @@ -158,6 +160,9 @@ bool vad_simple(
// compute similarity between two strings using Levenshtein distance
float similarity(const std::string & s0, const std::string & s1);

// softmax for ppl
std::vector<float> softmax(const std::vector<float> & logits);

//
// SAM argument parsing
//
Expand Down
20 changes: 19 additions & 1 deletion examples/gpt-2/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -803,7 +803,14 @@ int main(int argc, char ** argv) {
// tokenize the prompt
std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);

params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
if (params.perplexity) {
// NOTE(xcsong): We only calculate perplexity on prompt (n_predict = 0).
// To get logits, we need to process prompt token-by-token (n_batch = 1).
params.n_predict = 0;
params.n_batch = 1;
} else {
params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
}

printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size());
Expand All @@ -816,6 +823,7 @@ int main(int argc, char ** argv) {
// this reduces the memory usage during inference, at the cost of a bit of speed at the beginning
std::vector<gpt_vocab::id> embd;

double nll = 0.0; // for perplexity
for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
// predict
if (embd.size() > 0) {
Expand All @@ -827,6 +835,11 @@ int main(int argc, char ** argv) {
}

t_predict_us += ggml_time_us() - t_start_us;

if (params.perplexity) {
const float prob = softmax(logits)[embd_inp[i]];
nll += -std::log(prob);
}
}

n_past += embd.size();
Expand Down Expand Up @@ -875,6 +888,11 @@ int main(int argc, char ** argv) {
}
}

if (params.perplexity) {
// perplexity is e^(average negative log-likelihood)
printf("\n\nperplexity:%.8lf\n", std::exp(nll / (embd_inp.size() - 1)));
}

// report timing
{
const int64_t t_main_end_us = ggml_time_us();
Expand Down
16 changes: 0 additions & 16 deletions examples/mpt/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -681,22 +681,6 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
return true;
}

// Numerically stable softmax: shift every logit by the largest one before
// exponentiating so that expf never overflows, then normalize so the
// outputs form a probability distribution that sums to 1.
std::vector<float> softmax(const std::vector<float> & logits) {
    // Find the stability shift (the maximum logit).
    float shift = logits[0];
    for (size_t i = 1; i < logits.size(); i++) {
        shift = std::max(shift, logits[i]);
    }
    std::vector<float> result(logits.size());
    // Accumulate the partition sum in double for better precision.
    double total = 0.0;
    for (size_t i = 0; i < logits.size(); i++) {
        result[i] = expf(logits[i] - shift);
        total += result[i];
    }
    // Scale each exponentiated value by the partition sum.
    for (float & p : result) {
        p /= total;
    }
    return result;
}

int perplexity(const mpt_params & params) {
ggml_time_init();

Expand Down