11from typing import Callable , Dict , List , Optional
22
33from redisvl .vectorize .base import BaseVectorizer
4- from redisvl . utils . utils import array_to_buffer
4+
55
66class OpenAITextVectorizer (BaseVectorizer ):
77 # TODO - add docstring
@@ -19,32 +19,29 @@ def __init__(self, model: str, api_config: Optional[Dict] = None):
1919 openai .api_key = api_config .get ("api_key" , None )
2020 self ._model_client = openai .Embedding
2121
22- def _process_embedding (self , embedding : List [float ], as_buffer : bool ):
23- if as_buffer :
24- return array_to_buffer (embedding )
25- return embedding
26-
2722 def embed_many (
2823 self ,
29- inputs : List [str ],
24+ texts : List [str ],
3025 preprocess : Optional [Callable ] = None ,
3126 batch_size : Optional [int ] = 10 ,
3227 as_buffer : Optional [float ] = False
3328 ) -> List [List [float ]]:
3429 """Embed many chunks of texts using the OpenAI API.
3530
3631 Args:
37- inputs (List[str]): List of text chunks to embed.
32+ texts (List[str]): List of text chunks to embed.
3833 preprocess (Optional[Callable], optional): Optional preprocessing callable to
3934 perform before vectorization. Defaults to None.
40- batch_size (int, optional): Batch size of texts to use when creating embeddings. Defaults to 10.
41- as_buffer (Optional[float], optional): Whether to convert the raw embedding to a byte string. Defaults to False.
35+ batch_size (int, optional): Batch size of texts to use when creating
36+ embeddings. Defaults to 10.
37+ as_buffer (Optional[float], optional): Whether to convert the raw embedding
38+ to a byte string. Defaults to False.
4239
4340 Returns:
44- List[List[float]]: _description_
41+ List[List[float]]: List of embeddings.
4542 """
4643 embeddings : List = []
47- for batch in self .batchify (inputs , batch_size , preprocess ):
44+ for batch in self .batchify (texts , batch_size , preprocess ):
4845 response = self ._model_client .create (input = batch , engine = self ._model )
4946 embeddings += [
5047 self ._process_embedding (r ["embedding" ], as_buffer ) for r in response ["data" ]
@@ -53,49 +50,50 @@ def embed_many(
5350
5451 def embed (
5552 self ,
56- inputs : List [ str ] ,
53+ text : str ,
5754 preprocess : Optional [Callable ] = None ,
58- batch_size : Optional [int ] = 10 ,
5955 as_buffer : Optional [float ] = False
6056 ) -> List [float ]:
61- """Embed chunks of texts using the OpenAI API.
57+ """Embed a chunk of text using the OpenAI API.
6258
6359 Args:
64- inputs (List[ str] ): List of text chunks to embed.
60+ text ( str): Chunk of text to embed.
6561 preprocess (Optional[Callable], optional): Optional preprocessing callable to
6662 perform before vectorization. Defaults to None.
67- batch_size (int , optional): Batch size of texts to use when creating embeddings. Defaults to 10.
68- as_buffer (Optional[float], optional): Whether to convert the raw embedding to a byte string. Defaults to False.
63+ as_buffer (Optional[float] , optional): Whether to convert the raw embedding
64+ to a byte string. Defaults to False.
6965
7066 Returns:
71- List[List[ float]]: _description_
67+ List[float]: Embedding.
7268 """
7369 if preprocess :
74- emb_input = preprocess (emb_input )
75- result = self ._model_client .create (input = [emb_input ], engine = self ._model )
70+ text = preprocess (text )
71+ result = self ._model_client .create (input = [text ], engine = self ._model )
7672 return self ._process_embedding (result ["data" ][0 ]["embedding" ], as_buffer )
7773
78-
7974 async def aembed_many (
8075 self ,
81- inputs : List [str ],
76+ texts : List [str ],
8277 preprocess : Optional [Callable ] = None ,
83- chunk_size : int = 1000 ,
78+ batch_size : int = 1000 ,
8479 as_buffer : Optional [bool ] = False
8580 ) -> List [List [float ]]:
86- """_summary_
81+ """Asynchronously embed many chunks of texts using the OpenAI API.
8782
8883 Args:
89- inputs (List[str]): _description_
90- preprocess (Optional[Callable], optional): _description_. Defaults to None.
91- chunk_size (int, optional): _description_. Defaults to 1000.
92- as_buffer (Optional[bool], optional): _description_. Defaults to False.
84+ texts (List[str]): List of text chunks to embed.
85+ preprocess (Optional[Callable], optional): Optional preprocessing callable to
86+ perform before vectorization. Defaults to None.
87+ batch_size (int, optional): Batch size of texts to use when creating
88+ embeddings. Defaults to 1000.
89+ as_buffer (Optional[bool], optional): Whether to convert the raw embedding
90+ to a byte string. Defaults to False.
9391
9492 Returns:
95- List[List[float]]: _description_
93+ List[List[float]]: List of embeddings.
9694 """
9795 embeddings : List = []
98- for batch in self .batchify (inputs , chunk_size , preprocess ):
96+ for batch in self .batchify (texts , batch_size , preprocess ):
9997 response = await self ._model_client .acreate (input = batch , engine = self ._model )
10098 embeddings += [
10199 self ._process_embedding (r ["embedding" ], as_buffer ) for r in response ["data" ]
@@ -104,21 +102,23 @@ async def aembed_many(
104102
105103 async def aembed (
106104 self ,
107- emb_input : str ,
105+ text : str ,
108106 preprocess : Optional [Callable ] = None ,
109107 as_buffer : Optional [bool ] = False
110108 ) -> List [float ]:
111- """_summary_
109+ """Asynchronously embed a chunk of text using the OpenAI API.
112110
113111 Args:
114- emb_input (str): _description_
115- preprocess (Optional[Callable], optional): _description_. Defaults to None.
116- as_buffer (Optional[bool], optional): _description_. Defaults to False.
112+ text (str): Chunk of text to embed.
113+ preprocess (Optional[Callable], optional): Optional preprocessing callable to
114+ perform before vectorization. Defaults to None.
115+ as_buffer (Optional[bool], optional): Whether to convert the raw embedding
116+ to a byte string. Defaults to False.
117117
118118 Returns:
119- List[float]: _description_
119+ List[float]: Embedding.
120120 """
121121 if preprocess :
122- emb_input = preprocess (emb_input )
123- result = await self ._model_client .acreate (input = [emb_input ], engine = self ._model )
122+ text = preprocess (text )
123+ result = await self ._model_client .acreate (input = [text ], engine = self ._model )
124124 return self ._process_embedding (result ["data" ][0 ]["embedding" ], as_buffer )
0 commit comments