
Commit 354e0c6

clean up vectorizer interface
1 parent 1169949 commit 354e0c6

3 files changed: +83 additions, -55 deletions


redisvl/vectorize/base.py

Lines changed: 6 additions & 1 deletion
@@ -1,5 +1,5 @@
 from typing import Callable, Dict, List, Optional
-
+from redisvl.utils.utils import array_to_buffer
 
 class BaseVectorizer:
     def __init__(self, model: str, dims: int, api_config: Optional[Dict] = None):
@@ -51,3 +51,8 @@ def batchify(self, seq: list, size: int, preprocess: Optional[Callable] = None):
                 yield [preprocess(chunk) for chunk in seq[pos : pos + size]]
             else:
                 yield seq[pos : pos + size]
+
+    def _process_embedding(self, embedding: List[float], as_buffer: bool):
+        if as_buffer:
+            return array_to_buffer(embedding)
+        return embedding
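
The hoisted _process_embedding helper relies on array_to_buffer from redisvl.utils.utils, which this commit imports but does not define. A minimal sketch of how such a helper plausibly behaves, assuming the common float32 packing used for Redis vector fields (the numpy-based body below is an assumption, not code from this commit):

# Assumption: array_to_buffer (redisvl/utils/utils.py, not shown in this
# commit) packs a list of floats into a float32 byte string -- the layout
# Redis vector similarity fields typically expect.
import numpy as np

def array_to_buffer(array) -> bytes:
    # Cast to float32 and serialize the raw bytes.
    return np.array(array, dtype=np.float32).tobytes()

# The hoisted BaseVectorizer._process_embedding then either passes the
# plain Python list through or converts it to this byte form:
embedding = [0.1, 0.2, 0.3]
print(array_to_buffer(embedding))  # b'...' ready to store in a Redis hash

Centralizing this branch in BaseVectorizer removes the duplicated as_buffer handling that each subclass previously carried.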
redisvl/vectorize/text/huggingface.py

Lines changed: 38 additions & 15 deletions
@@ -1,10 +1,10 @@
 from typing import Callable, Dict, List, Optional
 
 from redisvl.vectorize.base import BaseVectorizer
-from redisvl.utils.utils import array_to_buffer
 
 
 class HFTextVectorizer(BaseVectorizer):
+    # TODO - add docstring
     def __init__(self, model: str, api_config: Optional[Dict] = None):
         # TODO set dims based on model
         dims = 768
@@ -20,30 +20,53 @@ def __init__(self, model: str, api_config: Optional[Dict] = None):
 
     def embed(
         self,
-        emb_input: str,
+        text: str,
         preprocess: Optional[Callable] = None,
-        as_buffer: Optional[bool] = False
+        as_buffer: Optional[float] = False
     ) -> List[float]:
+        """Embed a chunk of text using the Hugging Face sentence transformer.
+
+        Args:
+            text (str): Chunk of text to embed.
+            preprocess (Optional[Callable], optional): Optional preprocessing callable to
+                perform before vectorization. Defaults to None.
+            as_buffer (Optional[float], optional): Whether to convert the raw embedding
+                to a byte string. Defaults to False.
+
+        Returns:
+            List[float]: Embedding.
+        """
         if preprocess:
-            emb_input = preprocess(emb_input)
-        embedding = self._model_client.encode([emb_input])[0]
-        embedding = embedding.tolist()
-        if as_buffer:
-            return array_to_buffer(embedding)
-        return embedding
+            text = preprocess(text)
+        embedding = self._model_client.encode([text])[0]
+        return self._process_embedding(embedding.tolist(), as_buffer)
 
     def embed_many(
         self,
-        inputs: List[str],
+        texts: List[str],
         preprocess: Optional[Callable] = None,
-        chunk_size: int = 1000,
+        batch_size: int = 1000,
         as_buffer: Optional[float] = None
     ) -> List[List[float]]:
-        embeddings = []
-        for batch in self.batchify(inputs, chunk_size, preprocess):
+        """Asynchronously embed many chunks of texts using the Hugging Face sentence
+        transformer.
+
+        Args:
+            texts (List[str]): List of text chunks to embed.
+            preprocess (Optional[Callable], optional): Optional preprocessing callable to
+                perform before vectorization. Defaults to None.
+            batch_size (int, optional): Batch size of texts to use when creating
+                embeddings. Defaults to 10.
+            as_buffer (Optional[float], optional): Whether to convert the raw embedding
+                to a byte string. Defaults to False.
+
+        Returns:
+            List[List[float]]: List of embeddings.
+        """
+        embeddings: List = []
+        for batch in self.batchify(texts, batch_size, preprocess):
             batch_embeddings = self._model_client.encode(batch)
             embeddings.extend([
-                array_to_buffer(embedding.tolist()) if as_buffer else embedding.tolist()
-                for embedding in batch_embeddings
+                self._process_embedding(embedding.tolist(), as_buffer) for embedding in batch_embeddings
             ])
         return embeddings
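
With the renamed parameters (text/texts, batch_size), the Hugging Face vectorizer now mirrors the OpenAI one. A hypothetical usage sketch; the import path, model name, and preprocess callable are illustrative assumptions, not taken from this commit:

# Hypothetical usage of the refactored HF interface.
from redisvl.vectorize.text.huggingface import HFTextVectorizer

hf = HFTextVectorizer(model="sentence-transformers/all-mpnet-base-v2")

# Single chunk -> plain Python list of floats.
vector = hf.embed("redis is an in-memory data store")

# Same call with as_buffer=True -> float32 byte string via _process_embedding.
buffer = hf.embed("redis is an in-memory data store", as_buffer=True)

# Batched: batchify applies preprocess to each chunk before encoding.
vectors = hf.embed_many(
    ["First chunk.", "Second chunk."],
    preprocess=str.lower,
    batch_size=2,
)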

redisvl/vectorize/text/openai.py

Lines changed: 39 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from typing import Callable, Dict, List, Optional
22

33
from redisvl.vectorize.base import BaseVectorizer
4-
from redisvl.utils.utils import array_to_buffer
4+
55

66
class OpenAITextVectorizer(BaseVectorizer):
77
# TODO - add docstring
@@ -19,32 +19,29 @@ def __init__(self, model: str, api_config: Optional[Dict] = None):
1919
openai.api_key = api_config.get("api_key", None)
2020
self._model_client = openai.Embedding
2121

22-
def _process_embedding(self, embedding: List[float], as_buffer: bool):
23-
if as_buffer:
24-
return array_to_buffer(embedding)
25-
return embedding
26-
2722
def embed_many(
2823
self,
29-
inputs: List[str],
24+
texts: List[str],
3025
preprocess: Optional[Callable] = None,
3126
batch_size: Optional[int] = 10,
3227
as_buffer: Optional[float] = False
3328
) -> List[List[float]]:
3429
"""Embed many chunks of texts using the OpenAI API.
3530
3631
Args:
37-
inputs (List[str]): List of text chunks to embed.
32+
texts (List[str]): List of text chunks to embed.
3833
preprocess (Optional[Callable], optional): Optional preprocessing callable to
3934
perform before vectorization. Defaults to None.
40-
batch_size (int, optional): Batch size of texts to use when creating embeddings. Defaults to 10.
41-
as_buffer (Optional[float], optional): Whether to convert the raw embedding to a byte string. Defaults to False.
35+
batch_size (int, optional): Batch size of texts to use when creating
36+
embeddings. Defaults to 10.
37+
as_buffer (Optional[float], optional): Whether to convert the raw embedding
38+
to a byte string. Defaults to False.
4239
4340
Returns:
44-
List[List[float]]: _description_
41+
List[List[float]]: List of embeddings.
4542
"""
4643
embeddings: List = []
47-
for batch in self.batchify(inputs, batch_size, preprocess):
44+
for batch in self.batchify(texts, batch_size, preprocess):
4845
response = self._model_client.create(input=batch, engine=self._model)
4946
embeddings += [
5047
self._process_embedding(r["embedding"], as_buffer) for r in response["data"]
@@ -53,49 +50,50 @@ def embed_many(
5350

5451
def embed(
5552
self,
56-
inputs: List[str],
53+
text: str,
5754
preprocess: Optional[Callable] = None,
58-
batch_size: Optional[int] = 10,
5955
as_buffer: Optional[float] = False
6056
) -> List[float]:
61-
"""Embed chunks of texts using the OpenAI API.
57+
"""Embed a chunk of text using the OpenAI API.
6258
6359
Args:
64-
inputs (List[str]): List of text chunks to embed.
60+
text (str): Chunk of text to embed.
6561
preprocess (Optional[Callable], optional): Optional preprocessing callable to
6662
perform before vectorization. Defaults to None.
67-
batch_size (int, optional): Batch size of texts to use when creating embeddings. Defaults to 10.
68-
as_buffer (Optional[float], optional): Whether to convert the raw embedding to a byte string. Defaults to False.
63+
as_buffer (Optional[float], optional): Whether to convert the raw embedding
64+
to a byte string. Defaults to False.
6965
7066
Returns:
71-
List[List[float]]: _description_
67+
List[float]: Embedding.
7268
"""
7369
if preprocess:
74-
emb_input = preprocess(emb_input)
75-
result = self._model_client.create(input=[emb_input], engine=self._model)
70+
text = preprocess(text)
71+
result = self._model_client.create(input=[text], engine=self._model)
7672
return self._process_embedding(result["data"][0]["embedding"], as_buffer)
7773

78-
7974
async def aembed_many(
8075
self,
81-
inputs: List[str],
76+
texts: List[str],
8277
preprocess: Optional[Callable] = None,
83-
chunk_size: int = 1000,
78+
batch_size: int = 1000,
8479
as_buffer: Optional[bool] = False
8580
) -> List[List[float]]:
86-
"""_summary_
81+
"""Asynchronously embed many chunks of texts using the OpenAI API.
8782
8883
Args:
89-
inputs (List[str]): _description_
90-
preprocess (Optional[Callable], optional): _description_. Defaults to None.
91-
chunk_size (int, optional): _description_. Defaults to 1000.
92-
as_buffer (Optional[bool], optional): _description_. Defaults to False.
84+
texts (List[str]): List of text chunks to embed.
85+
preprocess (Optional[Callable], optional): Optional preprocessing callable to
86+
perform before vectorization. Defaults to None.
87+
batch_size (int, optional): Batch size of texts to use when creating
88+
embeddings. Defaults to 10.
89+
as_buffer (Optional[float], optional): Whether to convert the raw embedding
90+
to a byte string. Defaults to False.
9391
9492
Returns:
95-
List[List[float]]: _description_
93+
List[List[float]]: List of embeddings.
9694
"""
9795
embeddings: List = []
98-
for batch in self.batchify(inputs, chunk_size, preprocess):
96+
for batch in self.batchify(texts, batch_size, preprocess):
9997
response = await self._model_client.acreate(input=batch, engine=self._model)
10098
embeddings += [
10199
self._process_embedding(r["embedding"], as_buffer) for r in response["data"]
@@ -104,21 +102,23 @@ async def aembed_many(
104102

105103
async def aembed(
106104
self,
107-
emb_input: str,
105+
text: str,
108106
preprocess: Optional[Callable] = None,
109107
as_buffer: Optional[bool] = False
110108
) -> List[float]:
111-
"""_summary_
109+
"""Asynchronously embed a chunk of text using the OpenAI API.
112110
113111
Args:
114-
emb_input (str): _description_
115-
preprocess (Optional[Callable], optional): _description_. Defaults to None.
116-
as_buffer (Optional[bool], optional): _description_. Defaults to False.
112+
text (str): Chunk of text to embed.
113+
preprocess (Optional[Callable], optional): Optional preprocessing callable to
114+
perform before vectorization. Defaults to None.
115+
as_buffer (Optional[float], optional): Whether to convert the raw embedding
116+
to a byte string. Defaults to False.
117117
118118
Returns:
119-
List[float]: _description_
119+
List[float]: Embedding.
120120
"""
121121
if preprocess:
122-
emb_input = preprocess(emb_input)
123-
result = await self._model_client.acreate(input=[emb_input], engine=self._model)
122+
text = preprocess(text)
123+
result = await self._model_client.acreate(input=[text], engine=self._model)
124124
return self._process_embedding(result["data"][0]["embedding"], as_buffer)
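
The async path gets the same treatment: aembed and aembed_many now take text/texts and batch_size and await openai.Embedding.acreate. A hypothetical end-to-end sketch; the model name and API key below are placeholders, not values from this commit:

# Hypothetical async usage of the refactored OpenAI vectorizer.
import asyncio

from redisvl.vectorize.text.openai import OpenAITextVectorizer

async def main():
    oai = OpenAITextVectorizer(
        model="text-embedding-ada-002",        # illustrative model name
        api_config={"api_key": "sk-..."},      # placeholder key
    )
    # Single chunk, awaited.
    vector = await oai.aembed("redis is an in-memory data store")
    # Batched, returned as float32 byte strings via _process_embedding.
    vectors = await oai.aembed_many(
        ["First chunk.", "Second chunk."],
        batch_size=2,
        as_buffer=True,
    )

asyncio.run(main())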
