Removed redundant and potentially error-causing validation for single-doc OpenAI embedding #3819

Closed · wants to merge 17 commits
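The guard removed by this PR compared `len(text)`, a character count, against `self.embedding_ctx_length`, which is a limit expressed in tokens, so the two sides of the comparison were never measured in the same unit. A minimal sketch of that mismatch (not part of the diff; it assumes `tiktoken` is installed and uses `text-embedding-ada-002` purely as an example model):

import tiktoken

encoding = tiktoken.encoding_for_model("text-embedding-ada-002")
text = "hello world " * 1000

# len(text) counts characters, but embedding_ctx_length is a token limit,
# so the removed `len(text) > self.embedding_ctx_length` check compared
# incompatible quantities.
print(len(text))                   # 12000 characters
print(len(encoding.encode(text)))  # tokens: far fewer than characters

Because the units differ, routing every input through the length-safe path is simpler and safer than guessing from character length, which is what the diff below does.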
105 changes: 48 additions & 57 deletions langchain/embeddings/openai.py
@@ -153,73 +153,64 @@ def _get_len_safe_embeddings(
         embeddings: List[List[float]] = [[] for _ in range(len(texts))]
         try:
             import tiktoken
-
-            tokens = []
-            indices = []
-            encoding = tiktoken.model.encoding_for_model(self.model)
-            for i, text in enumerate(texts):
-                # replace newlines, which can negatively affect performance.
-                text = text.replace("\n", " ")
-                token = encoding.encode(
-                    text,
-                    allowed_special=self.allowed_special,
-                    disallowed_special=self.disallowed_special,
-                )
-                for j in range(0, len(token), self.embedding_ctx_length):
-                    tokens += [token[j : j + self.embedding_ctx_length]]
-                    indices += [i]
-
-            batched_embeddings = []
-            _chunk_size = chunk_size or self.chunk_size
-            for i in range(0, len(tokens), _chunk_size):
-                response = embed_with_retry(
-                    self,
-                    input=tokens[i : i + _chunk_size],
-                    engine=self.deployment,
-                )
-                batched_embeddings += [r["embedding"] for r in response["data"]]
-
-            results: List[List[List[float]]] = [[] for _ in range(len(texts))]
-            num_tokens_in_batch: List[List[int]] = [[] for _ in range(len(texts))]
-            for i in range(len(indices)):
-                results[indices[i]].append(batched_embeddings[i])
-                num_tokens_in_batch[indices[i]].append(len(tokens[i]))
-
-            for i in range(len(texts)):
-                _result = results[i]
-                if len(_result) == 0:
-                    average = embed_with_retry(self, input="", engine=self.deployment)[
-                        "data"
-                    ][0]["embedding"]
-                else:
-                    average = np.average(
-                        _result, axis=0, weights=num_tokens_in_batch[i]
-                    )
-                embeddings[i] = (average / np.linalg.norm(average)).tolist()
-
-            return embeddings
-
         except ImportError:
             raise ValueError(
                 "Could not import tiktoken python package. "
-                "This is needed in order to for OpenAIEmbeddings. "
+                "This is needed for OpenAIEmbeddings. "
                 "Please install it with `pip install tiktoken`."
             )

-    def _embedding_func(self, text: str, *, engine: str) -> List[float]:
-        """Call out to OpenAI's embedding endpoint."""
-        # handle large input text
-        if len(text) > self.embedding_ctx_length:
-            return self._get_len_safe_embeddings([text], engine=engine)[0]
-        else:
-            # replace newlines, which can negatively affect performance.
-            text = text.replace("\n", " ")
-            return embed_with_retry(self, input=[text], engine=engine)["data"][0][
-                "embedding"
-            ]
+        tokens = []
+        indices = []
+        encoding = tiktoken.encoding_for_model(self.model)
+        for i, text in enumerate(texts):
+            # replace newlines, which can negatively affect performance.
+            text = text.replace("\n", " ")
+            token = encoding.encode(
+                text,
+                allowed_special=self.allowed_special,
+                disallowed_special=self.disallowed_special,
+            )
+            for j in range(0, len(token), self.embedding_ctx_length):
+                tokens += [token[j : j + self.embedding_ctx_length]]
+                indices += [i]
+
+        batched_embeddings = []
+        _chunk_size = chunk_size or self.chunk_size
+        for i in range(0, len(tokens), _chunk_size):
+            response = embed_with_retry(
+                self,
+                input=tokens[i : i + _chunk_size],
+                engine=self.deployment,
+            )
+            batched_embeddings += [r["embedding"] for r in response["data"]]
+
+        results: List[List[List[float]]] = [[] for _ in range(len(texts))]
+        num_tokens_in_batch: List[List[int]] = [[] for _ in range(len(texts))]
+        for i in range(len(indices)):
+            results[indices[i]].append(batched_embeddings[i])
+            num_tokens_in_batch[indices[i]].append(len(tokens[i]))
+
+        for i in range(len(texts)):
+            _result = results[i]
+            if len(_result) == 0:
+                average = embed_with_retry(self, input="", engine=self.deployment)[
+                    "data"
+                ][0]["embedding"]
+            else:
+                average = np.average(_result, axis=0, weights=num_tokens_in_batch[i])
+            embeddings[i] = (average / np.linalg.norm(average)).tolist()
+
+        return embeddings
+
+    def _embedding_func(self, text: str, *, engine: str) -> List[float]:
+        """Call out to OpenAI's embedding endpoint."""
+        # NOTE: to keep things simple, we assume the list may contain texts longer
+        # than the maximum context and use length-safe embedding function.
+        return self._get_len_safe_embeddings([text], engine=engine)[0]

     def embed_documents(
-        self, texts: List[str], chunk_size: Optional[int] = 0
+        self, texts: List[str], chunk_size: Optional[int] = None
     ) -> List[List[float]]:
         """Call out to OpenAI's embedding endpoint for embedding search docs.
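The length-safe path that now handles every input splits each text's token sequence into `embedding_ctx_length`-sized chunks, embeds each chunk separately, and then combines the chunk embeddings with a token-count-weighted average followed by L2 normalization. A self-contained sketch of that combining step, using made-up 3-dimensional vectors in place of real chunk embeddings:

import numpy as np

# Made-up stand-ins for the embeddings of two chunks of one long document.
chunk_embeddings = [
    [0.1, 0.2, 0.3],  # chunk 1
    [0.4, 0.1, 0.0],  # chunk 2
]
num_tokens = [8191, 523]  # tokens per chunk, used as averaging weights

# Longer chunks contribute proportionally more to the document embedding.
average = np.average(chunk_embeddings, axis=0, weights=num_tokens)

# Normalize to unit length, as _get_len_safe_embeddings does before returning.
embedding = (average / np.linalg.norm(average)).tolist()
print(embedding)

Weighting by token count means each chunk's influence on the final vector is proportional to how much of the document it covers, and the L2 normalization keeps results comparable under cosine similarity.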
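At the usage level, the effect of the change is that short and long inputs now take the same code path. A hypothetical snippet (not from this PR; it assumes OPENAI_API_KEY is set in the environment):

from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

# A single short document previously went through the raw endpoint path;
# after this change it is routed through the length-safe path like any
# other input.
doc_vectors = embeddings.embed_documents(["a short document"])
query_vector = embeddings.embed_query("a short query")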