From 2b29783f9a3c9a791805f6a51d1dbbc68847c3c0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kacper=20=C5=81ukawski?=
Date: Thu, 8 Aug 2024 17:03:05 +0200
Subject: [PATCH 1/4] [fix] quantization of token embeddings

---
 sentence_transformers/quantization.py | 24 ++++++++++++++++++------
 tests/test_compute_embeddings.py      | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/sentence_transformers/quantization.py b/sentence_transformers/quantization.py
index 37402cae7..de2e9e932 100644
--- a/sentence_transformers/quantization.py
+++ b/sentence_transformers/quantization.py
@@ -394,9 +394,15 @@ def quantize_embeddings(
     Returns:
         Quantized embeddings with the specified precision
     """
+    outputs, lengths = None, None
     if isinstance(embeddings, Tensor):
         embeddings = embeddings.cpu().numpy()
+        embeddings = np.concatenate(embeddings)
     elif isinstance(embeddings, list):
+        if not isinstance(embeddings[0], list) and len(embeddings[0].shape) == 2:
+            # It will happen when we request token_embeddings
+            lengths = [embedding.shape[0] for embedding in embeddings]
+            embeddings = np.concatenate(embeddings)
         if isinstance(embeddings[0], Tensor):
             embeddings = [embedding.cpu().numpy() for embedding in embeddings]
         embeddings = np.array(embeddings)
@@ -404,7 +410,7 @@ def quantize_embeddings(
         raise Exception("Embeddings to quantize must be float rather than int8 or uint8.")
 
     if precision == "float32":
-        return embeddings.astype(np.float32)
+        outputs = embeddings.astype(np.float32)
 
     if precision.endswith("int8"):
         # Either use the 1. provided ranges, 2. the calibration dataset or 3. the provided embeddings
@@ -423,14 +429,20 @@ def quantize_embeddings(
         steps = (ranges[1, :] - ranges[0, :]) / 255
 
         if precision == "uint8":
-            return ((embeddings - starts) / steps).astype(np.uint8)
+            outputs = ((embeddings - starts) / steps).astype(np.uint8)
         elif precision == "int8":
-            return ((embeddings - starts) / steps - 128).astype(np.int8)
+            outputs = ((embeddings - starts) / steps - 128).astype(np.int8)
 
     if precision == "binary":
-        return (np.packbits(embeddings > 0).reshape(embeddings.shape[0], -1) - 128).astype(np.int8)
+        outputs = (np.packbits(embeddings > 0).reshape(embeddings.shape[0], -1) - 128).astype(np.int8)
 
     if precision == "ubinary":
-        return np.packbits(embeddings > 0).reshape(embeddings.shape[0], -1)
+        outputs = np.packbits(embeddings > 0).reshape(embeddings.shape[0], -1)
 
-    raise ValueError(f"Precision {precision} is not supported")
+    if outputs is None:
+        raise ValueError(f"Precision {precision} is not supported")
+
+    if lengths is not None:
+        outputs = np.split(outputs, np.cumsum(lengths)[:-1])
+
+    return outputs
diff --git a/tests/test_compute_embeddings.py b/tests/test_compute_embeddings.py
index 5b0bf6aaa..d58c697e1 100644
--- a/tests/test_compute_embeddings.py
+++ b/tests/test_compute_embeddings.py
@@ -4,7 +4,10 @@
 
 from __future__ import annotations
 
+from typing import Literal
+
 import numpy as np
+import pytest
 
 from sentence_transformers import SentenceTransformer
 
@@ -84,3 +87,34 @@ def test_encode_tuple_sentences(paraphrase_distilroberta_base_v1_model: Sentence
     )
     assert emb.shape == (3, 768)
     assert abs(np.sum(emb) - 32.14627) < 0.002
+
+
+@pytest.mark.parametrize("precision", ("float32", "int8", "uint8"))
+def test_encode_token_embeddings_int_precision(
+    paraphrase_distilroberta_base_v1_model: SentenceTransformer,
+    precision: Literal["float32", "int8", "uint8", "binary", "ubinary"]
+) -> None:
+    model = paraphrase_distilroberta_base_v1_model
+    # Single sentence
+    emb = model.encode("Hello Word, a test sentence", output_value="token_embeddings", precision="uint8")
+    assert emb.shape == (8, 768)
+
+    # Single sentence as list
+    emb = model.encode(["Hello Word, a test sentence"], output_value="token_embeddings", precision="uint8")
+    assert isinstance(emb, list)
+    assert emb[0].shape == (8, 768)
+
+    # Sentence list
+    emb = model.encode(
+        [
+            "Hello Word, a test sentence",
+            "Here comes another sentence",
+            "My final sentence",
+        ],
+        output_value="token_embeddings",
+        precision=precision,
+    )
+    assert isinstance(emb, list)
+    assert emb[0].shape == (8, 768)
+    assert emb[1].shape == (6, 768)
+    assert emb[2].shape == (5, 768)
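
The round trip at the heart of this patch is np.concatenate followed by
np.split: token embeddings arrive as one 2D array per sentence, are flattened
into a single array so the existing per-dimension quantization code can run
unchanged, and are then split back using the recorded lengths. A minimal
sketch of that mechanism, using made-up token counts instead of real model
output:

    import numpy as np

    # Three "sentences" with 8, 6 and 5 token embeddings of dimension 4.
    lengths = [8, 6, 5]
    token_embeddings = [np.random.rand(n, 4).astype(np.float32) for n in lengths]

    # quantize_embeddings first concatenates them into one (19, 4) array ...
    flat = np.concatenate(token_embeddings)

    # ... and, after quantizing, restores the per-sentence grouping:
    # np.cumsum(lengths)[:-1] == [8, 14] are the split points.
    restored = np.split(flat, np.cumsum(lengths)[:-1])
    assert [r.shape for r in restored] == [(8, 4), (6, 4), (5, 4)]
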
From 5e4a505b1178e7a7c1f7a20c9827f9ea5cbb0ad8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kacper=20=C5=81ukawski?=
Date: Fri, 9 Aug 2024 12:45:38 +0200
Subject: [PATCH 2/4] Fix handling tensors first

---
 sentence_transformers/quantization.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sentence_transformers/quantization.py b/sentence_transformers/quantization.py
index de2e9e932..342400d42 100644
--- a/sentence_transformers/quantization.py
+++ b/sentence_transformers/quantization.py
@@ -399,12 +399,12 @@ def quantize_embeddings(
         embeddings = embeddings.cpu().numpy()
         embeddings = np.concatenate(embeddings)
     elif isinstance(embeddings, list):
+        if isinstance(embeddings[0], Tensor):
+            embeddings = [embedding.cpu().numpy() for embedding in embeddings]
         if not isinstance(embeddings[0], list) and len(embeddings[0].shape) == 2:
             # It will happen when we request token_embeddings
             lengths = [embedding.shape[0] for embedding in embeddings]
             embeddings = np.concatenate(embeddings)
-        if isinstance(embeddings[0], Tensor):
-            embeddings = [embedding.cpu().numpy() for embedding in embeddings]
         embeddings = np.array(embeddings)
     if embeddings.dtype in (np.uint8, np.int8):
         raise Exception("Embeddings to quantize must be float rather than int8 or uint8.")
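
The practical reason the tensor-to-numpy conversion has to run first is that
np.concatenate cannot consume CUDA tensors directly, so the original ordering
only worked for CPU tensors. A small sketch of the corrected ordering,
assuming torch is installed (the shapes are made up for illustration):

    import numpy as np
    import torch
    from torch import Tensor

    # Token embeddings for two sentences, as they might come off a model.
    embeddings = [torch.rand(8, 4), torch.rand(6, 4)]

    # Convert to numpy first, as the fix does; for tensors on a CUDA device,
    # np.concatenate would otherwise raise before the lengths are recorded.
    if isinstance(embeddings[0], Tensor):
        embeddings = [e.cpu().numpy() for e in embeddings]

    lengths = [e.shape[0] for e in embeddings]
    flat = np.concatenate(embeddings)  # plain (14, 4) float32 ndarray
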
From decbe81482e6f5cc973faa3d3e865e0c44684eaa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kacper=20=C5=81ukawski?=
Date: Fri, 9 Aug 2024 12:48:59 +0200
Subject: [PATCH 3/4] Remove float32 from tests

---
 tests/test_compute_embeddings.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_compute_embeddings.py b/tests/test_compute_embeddings.py
index d58c697e1..d6af77b99 100644
--- a/tests/test_compute_embeddings.py
+++ b/tests/test_compute_embeddings.py
@@ -89,7 +89,7 @@ def test_encode_tuple_sentences(paraphrase_distilroberta_base_v1_model: Sentence
     assert abs(np.sum(emb) - 32.14627) < 0.002
 
 
-@pytest.mark.parametrize("precision", ("float32", "int8", "uint8"))
+@pytest.mark.parametrize("precision", ("int8", "uint8"))
 def test_encode_token_embeddings_int_precision(
     paraphrase_distilroberta_base_v1_model: SentenceTransformer,
     precision: Literal["float32", "int8", "uint8", "binary", "ubinary"]

From 38186dfad72e06e3dafa2972f5041e1a68c0a1af Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kacper=20=C5=81ukawski?=
Date: Fri, 30 Aug 2024 10:52:58 +0200
Subject: [PATCH 4/4] Implement quantization in case output_value=None

---
 sentence_transformers/SentenceTransformer.py | 18 +++-
 tests/test_compute_embeddings.py             | 93 +++++++++++++++++++-
 2 files changed, 108 insertions(+), 3 deletions(-)

diff --git a/sentence_transformers/SentenceTransformer.py b/sentence_transformers/SentenceTransformer.py
index c2d476782..b38c11d0b 100644
--- a/sentence_transformers/SentenceTransformer.py
+++ b/sentence_transformers/SentenceTransformer.py
@@ -614,7 +614,23 @@ def encode(
         all_embeddings = [all_embeddings[idx] for idx in np.argsort(length_sorted_idx)]
 
         if precision and precision != "float32":
-            all_embeddings = quantize_embeddings(all_embeddings, precision=precision)
+            if output_value:
+                all_embeddings = quantize_embeddings(all_embeddings, precision=precision)
+            else:
+                # output_value=None means we want to get both token and sentence embeddings.
+                # The value of all_embeddings is now a list of dictionaries. We temporarily
+                # build a list of token embeddings and sentence embeddings separately, quantize
+                # them, and then recombine them into a list of dictionaries.
+                combined_embeddings = []
+                for emb in all_embeddings:
+                    combined_embeddings.append(emb["token_embeddings"])
+                    combined_embeddings.append(emb["sentence_embedding"].reshape(1, -1))
+                combined_embeddings = quantize_embeddings(combined_embeddings, precision=precision)
+
+                # Reconstruct the list of dictionaries with quantized embeddings
+                for i, emb in enumerate(all_embeddings):
+                    emb["token_embeddings"] = combined_embeddings[2 * i]
+                    emb["sentence_embedding"] = combined_embeddings[2 * i + 1].reshape(-1)
 
         if convert_to_tensor:
             if len(all_embeddings):
diff --git a/tests/test_compute_embeddings.py b/tests/test_compute_embeddings.py
index d6af77b99..4f2eccac1 100644
--- a/tests/test_compute_embeddings.py
+++ b/tests/test_compute_embeddings.py
@@ -89,6 +89,38 @@ def test_encode_tuple_sentences(paraphrase_distilroberta_base_v1_model: Sentence
     assert abs(np.sum(emb) - 32.14627) < 0.002
 
 
+@pytest.mark.parametrize("precision", ("int8", "uint8"))
+def test_encode_sentence_embedding_int_precision(
+    paraphrase_distilroberta_base_v1_model: SentenceTransformer,
+    precision: Literal["float32", "int8", "uint8", "binary", "ubinary"]
+) -> None:
+    model = paraphrase_distilroberta_base_v1_model
+    # Single sentence
+    emb = model.encode("Hello Word, a test sentence", output_value="sentence_embedding", precision=precision)
+    assert emb.shape == (768,)
+    assert emb.dtype == np.dtype(precision)
+
+    # Single sentence as list
+    emb = model.encode(["Hello Word, a test sentence"], output_value="sentence_embedding", precision=precision)
+    assert isinstance(emb, np.ndarray)
+    assert emb.shape == (1, 768)
+    assert emb.dtype == np.dtype(precision)
+
+    # Sentence list
+    emb = model.encode(
+        [
+            "Hello Word, a test sentence",
+            "Here comes another sentence",
+            "My final sentence",
+        ],
+        output_value="sentence_embedding",
+        precision=precision,
+    )
+    assert isinstance(emb, np.ndarray)
+    assert emb.shape == (3, 768)
+    assert emb.dtype == np.dtype(precision)
+
+
 @pytest.mark.parametrize("precision", ("int8", "uint8"))
 def test_encode_token_embeddings_int_precision(
     paraphrase_distilroberta_base_v1_model: SentenceTransformer,
@@ -96,13 +128,15 @@ def test_encode_token_embeddings_int_precision(
 ) -> None:
     model = paraphrase_distilroberta_base_v1_model
     # Single sentence
-    emb = model.encode("Hello Word, a test sentence", output_value="token_embeddings", precision="uint8")
+    emb = model.encode("Hello Word, a test sentence", output_value="token_embeddings", precision=precision)
     assert emb.shape == (8, 768)
+    assert emb.dtype == np.dtype(precision)
 
     # Single sentence as list
-    emb = model.encode(["Hello Word, a test sentence"], output_value="token_embeddings", precision="uint8")
+    emb = model.encode(["Hello Word, a test sentence"], output_value="token_embeddings", precision=precision)
     assert isinstance(emb, list)
     assert emb[0].shape == (8, 768)
+    assert emb[0].dtype == np.dtype(precision)
 
     # Sentence list
     emb = model.encode(
@@ -116,5 +150,60 @@ def test_encode_token_embeddings_int_precision(
     )
     assert isinstance(emb, list)
     assert emb[0].shape == (8, 768)
+    assert emb[0].dtype == np.dtype(precision)
     assert emb[1].shape == (6, 768)
+    assert emb[1].dtype == np.dtype(precision)
     assert emb[2].shape == (5, 768)
+    assert emb[2].dtype == np.dtype(precision)
+
+
+@pytest.mark.parametrize("precision", ("int8", "uint8"))
+def test_encode_output_value_none_int_precision(
+    paraphrase_distilroberta_base_v1_model: SentenceTransformer,
+    precision: Literal["float32", "int8", "uint8", "binary", "ubinary"]
+) -> None:
+    model = paraphrase_distilroberta_base_v1_model
+    # Single sentence
+    emb = model.encode("Hello Word, a test sentence", output_value=None, precision=precision)
+    assert isinstance(emb, dict)
+    assert emb["sentence_embedding"].shape == (768,)
+    assert emb["sentence_embedding"].dtype == np.dtype(precision)
+    assert emb["token_embeddings"].shape == (8, 768)
+    assert emb["token_embeddings"].dtype == np.dtype(precision)
+
+    # Single sentence as list
+    emb = model.encode(["Hello Word, a test sentence"], output_value=None, precision=precision)
+    assert isinstance(emb, list)
+    assert isinstance(emb[0], dict)
+    assert emb[0]["sentence_embedding"].shape == (768,)
+    assert emb[0]["sentence_embedding"].dtype == np.dtype(precision)
+    assert emb[0]["token_embeddings"].shape == (8, 768)
+    assert emb[0]["token_embeddings"].dtype == np.dtype(precision)
+
+    # Sentence list
+    emb = model.encode(
+        [
+            "Hello Word, a test sentence",
+            "Here comes another sentence",
+            "My final sentence",
+        ],
+        output_value=None,
+        precision=precision,
+    )
+    assert isinstance(emb, list)
+    assert all(isinstance(e, dict) for e in emb)
+
+    assert emb[0]["sentence_embedding"].shape == (768,)
+    assert emb[0]["sentence_embedding"].dtype == np.dtype(precision)
+    assert emb[0]["token_embeddings"].shape == (8, 768)
+    assert emb[0]["token_embeddings"].dtype == np.dtype(precision)
+
+    assert emb[1]["sentence_embedding"].shape == (768,)
+    assert emb[1]["sentence_embedding"].dtype == np.dtype(precision)
+    assert emb[1]["token_embeddings"].shape == (6, 768)
+    assert emb[1]["token_embeddings"].dtype == np.dtype(precision)
+
+    assert emb[2]["sentence_embedding"].shape == (768,)
+    assert emb[2]["sentence_embedding"].dtype == np.dtype(precision)
+    assert emb[2]["token_embeddings"].shape == (5, 768)
+    assert emb[2]["token_embeddings"].dtype == np.dtype(precision)
\ No newline at end of file
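
Taken together, the series makes the following end-to-end usage work. This is
a sketch mirroring the new tests, not part of the patch itself; the checkpoint
name matches the paraphrase_distilroberta_base_v1_model fixture, and any
SentenceTransformer checkpoint should behave the same way:

    import numpy as np
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("paraphrase-distilroberta-base-v1")

    # Quantized token embeddings: one uint8 array per sentence, with the
    # per-sentence token counts preserved.
    token_embs = model.encode(
        ["Hello Word, a test sentence", "Here comes another sentence"],
        output_value="token_embeddings",
        precision="uint8",
    )
    assert isinstance(token_embs, list)
    assert all(e.dtype == np.uint8 for e in token_embs)

    # output_value=None: token and sentence embeddings in one dict, both quantized.
    both = model.encode("Hello Word, a test sentence", output_value=None, precision="uint8")
    assert both["sentence_embedding"].dtype == np.uint8
    assert both["token_embeddings"].dtype == np.uint8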