Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 19 additions & 5 deletions mteb/models/openai_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,10 @@ class OpenAIWrapper(Wrapper):
def __init__(
self,
model_name: str,
*,
max_tokens: int,
embed_dim: int,
tokenizer_name: str = "cl100k_base", # since all models use this tokenizer now
embed_dim: int | None = None,
**kwargs,
) -> None:
"""Wrapper for OpenAIs embedding API.
Expand Down Expand Up @@ -54,17 +55,23 @@ def truncate_text_tokens(self, text):
return self._encoding.decode(truncated_sentence)

def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray:
requires_package(self, "openai", "Openai text embedding")

print("Starting OpenAIWrapper encode method")
from openai import NotGiven

empty_mask = [not s.strip() for s in sentences]
if self._model_name == "text-embedding-ada-002" and self._embed_dim is not None:
logger.warning(
"Reducing embedding size available only for text-embedding-3-* models"
)

if any(empty_mask):
logger.warning(
"Empty strings detected - encoding non-empty only, returing zero vectors for empty strings."
)

non_empty_sentences = [s for s, e in zip(sentences, empty_mask) if not e]
trimmed_sentences = []
for sentence in sentences:
for sentence in non_empty_sentences:
encoded_sentence = self._encoding.encode(sentence)
if len(encoded_sentence) > self._max_tokens:
truncated_sentence = self.truncate_text_tokens(sentence)
Expand Down Expand Up @@ -118,7 +125,11 @@ def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray:
)
all_embeddings.extend(self._to_numpy(response))

return np.array(all_embeddings)
all_embeddings = np.array(all_embeddings)
final_embeddings = np.zeros((len(sentences), self._embed_dim), dtype=np.float32)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this already fills in the empty texts.

You can then simply fill in the text_embeddings using:

mask = [i for i, t in enumerate(text) if t.strip()]
final_emb[mask, :] = text_emb

Here is a small sample to show the idea:

import numpy as np

matrix1 = np.zeros((5, 4))
matrix2 = np.random.rand(3, 4)
matrix1[[0, 2, 4],:] = matrix2

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Didn't get this one. The current implementation first creates a whole vector of zeros, and then only sets the text embeddings at the positions where non-empty text is present. I think you are suggesting the same thing.

non_empty_idxs = [i for i, e in enumerate(empty_mask) if not e]
final_embeddings[non_empty_idxs] = all_embeddings
return final_embeddings

def _to_numpy(self, embedding_response) -> np.ndarray:
return np.array([e.embedding for e in embedding_response.data])
Expand All @@ -134,6 +145,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray:
model_name="text-embedding-3-small",
tokenizer_name="cl100k_base",
max_tokens=8191,
embed_dim=1536,
),
max_tokens=8191,
embed_dim=1536,
Expand All @@ -159,6 +171,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray:
model_name="text-embedding-3-large",
tokenizer_name="cl100k_base",
max_tokens=8191,
embed_dim=3072,
),
max_tokens=8191,
embed_dim=3072,
Expand All @@ -184,6 +197,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray:
model_name="text-embedding-ada-002",
tokenizer_name="cl100k_base",
max_tokens=8191,
embed_dim=1536,
),
reference="https://openai.com/index/new-and-improved-embedding-model/",
max_tokens=8191,
Expand Down