diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py index 8588104081..dce63fe9ea 100644 --- a/mteb/models/openai_models.py +++ b/mteb/models/openai_models.py @@ -18,9 +18,10 @@ class OpenAIWrapper(Wrapper): def __init__( self, model_name: str, + *, max_tokens: int, + embed_dim: int, tokenizer_name: str = "cl100k_base", # since all models use this tokenizer now - embed_dim: int | None = None, **kwargs, ) -> None: """Wrapper for OpenAIs embedding API. @@ -54,17 +55,23 @@ def truncate_text_tokens(self, text): return self._encoding.decode(truncated_sentence) def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray: -        requires_package(self, "openai", "Openai text embedding") - +        requires_package(self, "openai", "Openai text embedding")         from openai import NotGiven +         empty_mask = [not s.strip() for s in sentences] if self._model_name == "text-embedding-ada-002" and self._embed_dim is not None: logger.warning( "Reducing embedding size available only for text-embedding-3-* models" ) + if any(empty_mask): + logger.warning( + "Empty strings detected - encoding non-empty only, returning zero vectors for empty strings." 
+ ) + + non_empty_sentences = [s for s, e in zip(sentences, empty_mask) if not e] trimmed_sentences = [] - for sentence in sentences: + for sentence in non_empty_sentences: encoded_sentence = self._encoding.encode(sentence) if len(encoded_sentence) > self._max_tokens: truncated_sentence = self.truncate_text_tokens(sentence) @@ -118,7 +125,11 @@ def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray: ) all_embeddings.extend(self._to_numpy(response)) - return np.array(all_embeddings) + all_embeddings = np.array(all_embeddings) + final_embeddings = np.zeros((len(sentences), self._embed_dim), dtype=np.float32) + non_empty_idxs = [i for i, e in enumerate(empty_mask) if not e] + final_embeddings[non_empty_idxs] = all_embeddings + return final_embeddings def _to_numpy(self, embedding_response) -> np.ndarray: return np.array([e.embedding for e in embedding_response.data]) @@ -134,6 +145,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray: model_name="text-embedding-3-small", tokenizer_name="cl100k_base", max_tokens=8191, + embed_dim=1536, ), max_tokens=8191, embed_dim=1536, @@ -159,6 +171,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray: model_name="text-embedding-3-large", tokenizer_name="cl100k_base", max_tokens=8191, + embed_dim=3072, ), max_tokens=8191, embed_dim=3072, @@ -184,6 +197,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray: model_name="text-embedding-ada-002", tokenizer_name="cl100k_base", max_tokens=8191, + embed_dim=1536, ), reference="https://openai.com/index/new-and-improved-embedding-model/", max_tokens=8191,