89 changes: 88 additions & 1 deletion mteb/models/voyage_models.py
@@ -16,6 +16,12 @@
# synthetic data
}

# Dtypes missing from this mapping are passed through to the API unchanged
VOYAGE_DTYPE_TRANSLATION = {
"float32": "float",
"bf16": "float",
}
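# A minimal sketch (illustrative, not part of the diff): using the raw value
# as dict.get's default is what lets unmapped dtypes ("int8", "uint8",
# "binary", "ubinary", or None) pass through to the Voyage API unchanged.
assert VOYAGE_DTYPE_TRANSLATION.get("float32", "float32") == "float"
assert VOYAGE_DTYPE_TRANSLATION.get("int8", "int8") == "int8"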

# Total token limits per model based on VoyageAI documentation
VOYAGE_TOTAL_TOKEN_LIMITS = {
"voyage-3.5-lite": 1_000_000,
@@ -95,6 +101,7 @@ def __init__(
max_tpm: int = 1_000_000,
max_tokens: int | None = None,
model_prompts: dict[str, str] | None = None,
output_dtype: str | None = None,
**kwargs,
) -> None:
requires_package(self, "voyageai", model_name, "pip install 'mteb[voyageai]'")
@@ -106,6 +113,7 @@
self._max_tpm = max_tpm
self._max_tokens = max_tokens
self.model_prompts = self.validate_task_to_prompt_name(model_prompts)
self.output_dtype = output_dtype
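# Accepted values, per the Voyage quantization docs referenced below:
# "float", "int8", "uint8", "binary", "ubinary"; "float32"/"bf16" are
# translated to "float" above, and None defers to the API default.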

def _calculate_default_batch_size(self) -> int:
"""Calculate the default batch size based on total token limit and context length.
@@ -143,6 +151,10 @@ def _batched_encode(
) -> np.ndarray:
embeddings, index = [], 0

output_dtype = VOYAGE_DTYPE_TRANSLATION.get(
self.output_dtype, self.output_dtype
)

while index < len(sentences):
batch, batch_tokens = [], 0
while (
@@ -164,10 +176,31 @@
texts=batch,
model=self._model_name,
input_type=input_type,
output_dtype=output_dtype,
).embeddings
)

return np.array(embeddings)
embeddings_array = np.array(embeddings)

if output_dtype == "binary":
# Unpack bit-packed embeddings: each byte contains 8 embedding values
unpacked_embeddings = []
for embedding in embeddings_array:
# Convert bytes to bits and unpack
unpacked = []
for byte_val in embedding:
# Extract 8 bits from each byte (LSB first)
for bit_pos in range(8):
bit_val = (byte_val >> bit_pos) & 1
# Map bit 1 -> +1.0 and bit 0 -> -1.0 (signed binary convention)
unpacked.append(1.0 if bit_val else -1.0)
unpacked_embeddings.append(unpacked)
embeddings_array = np.array(unpacked_embeddings, dtype=np.float32)
elif output_dtype != "float":
# Cast int8/uint8 (or the default float) embeddings to float32
embeddings_array = embeddings_array.astype(np.float32)

return embeddings_array
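# A minimal sketch (illustrative, not part of the diff): the per-byte loop
# above is equivalent to NumPy's vectorized bit unpacking. The helper name is
# hypothetical; `packed` is assumed to be the raw int8 matrix the API returns
# for output_dtype="binary".
def _unpack_binary_reference(packed: np.ndarray) -> np.ndarray:
    # Expand each byte into 8 bits, least-significant bit first.
    bits = np.unpackbits(packed.view(np.uint8), axis=1, bitorder="little")
    # Map bit 1 -> +1.0 and bit 0 -> -1.0, matching the loop above.
    return np.where(bits == 1, 1.0, -1.0).astype(np.float32)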


model_prompts = {
@@ -201,6 +234,60 @@ def _batched_encode(
public_training_data=None,
)

voyage_3_5_int8 = ModelMeta(
name="voyageai/voyage-3.5 (output_dtype=int8)",
revision="1",
release_date="2025-01-21",
languages=None, # supported languages not specified
loader=partial(
VoyageWrapper,
model_name="voyage-3.5",
model_prompts=model_prompts,
output_dtype="int8",
),
max_tokens=32000,
embed_dim=1024,
open_weights=False,
n_parameters=None,
memory_usage_mb=None,
license=None,
reference="https://docs.voyageai.com/docs/flexible-dimensions-and-quantization",
similarity_fn_name="cosine",
framework=["API"],
use_instructions=True,
training_datasets=VOYAGE_TRAINING_DATA,
public_training_code=None,
public_training_data=None,
adapted_from="voyageai/voyage-3.5",
)

voyage_3_5_binary = ModelMeta(
name="voyageai/voyage-3.5 (output_dtype=binary)",
revision="1",
release_date="2025-01-21",
languages=None, # supported languages not specified
loader=partial(
VoyageWrapper,
model_name="voyage-3.5",
model_prompts=model_prompts,
output_dtype="binary",
),
max_tokens=32000,
embed_dim=1024, # Same as original after unpacking from bits
open_weights=False,
n_parameters=None,
memory_usage_mb=None,
license=None,
reference="https://docs.voyageai.com/docs/flexible-dimensions-and-quantization",
similarity_fn_name="cosine",
framework=["API"],
use_instructions=True,
training_datasets=VOYAGE_TRAINING_DATA,
public_training_code=None,
public_training_data=None,
adapted_from="voyageai/voyage-3.5",
)
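# Hypothetical usage (not part of the diff; assumes a configured Voyage API
# key and that the meta is registered under the name given above):
#
#     import mteb
#     model = mteb.get_model("voyageai/voyage-3.5 (output_dtype=binary)")
#     embeddings = model.encode(["example sentence"], task_name="STS12")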

voyage_large_2_instruct = ModelMeta(
name="voyageai/voyage-large-2-instruct",
revision="1",