
Commit 354e0c6

clean up vectorizer interface
1 parent 1169949 commit 354e0c6

3 files changed: +83 additions, -55 deletions


redisvl/vectorize/base.py

Lines changed: 6 additions & 1 deletion
@@ -1,5 +1,5 @@
 from typing import Callable, Dict, List, Optional
-
+from redisvl.utils.utils import array_to_buffer
 
 class BaseVectorizer:
     def __init__(self, model: str, dims: int, api_config: Optional[Dict] = None):
@@ -51,3 +51,8 @@ def batchify(self, seq: list, size: int, preprocess: Optional[Callable] = None):
                 yield [preprocess(chunk) for chunk in seq[pos : pos + size]]
             else:
                 yield seq[pos : pos + size]
+
+    def _process_embedding(self, embedding: List[float], as_buffer: bool):
+        if as_buffer:
+            return array_to_buffer(embedding)
+        return embedding
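
The hoisted _process_embedding helper relies on array_to_buffer from redisvl.utils.utils, which this commit imports but does not define. A minimal sketch of how such a helper plausibly behaves, assuming the common float32 packing used for Redis vector fields (the numpy-based body below is an assumption, not code from this commit):

# Assumption: array_to_buffer (redisvl/utils/utils.py, not shown in this
# commit) packs a list of floats into a float32 byte string -- the layout
# Redis vector similarity fields typically expect.
import numpy as np

def array_to_buffer(array) -> bytes:
    # Cast to float32 and serialize the raw bytes.
    return np.array(array, dtype=np.float32).tobytes()

# The hoisted BaseVectorizer._process_embedding then either passes the
# plain Python list through or converts it to this byte form:
embedding = [0.1, 0.2, 0.3]
print(array_to_buffer(embedding))  # b'...' ready to store in a Redis hash

Centralizing this branch in BaseVectorizer removes the duplicated as_buffer handling that each subclass previously carried.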
redisvl/vectorize/text/huggingface.py

Lines changed: 38 additions & 15 deletions
@@ -1,10 +1,10 @@
 from typing import Callable, Dict, List, Optional
 
 from redisvl.vectorize.base import BaseVectorizer
-from redisvl.utils.utils import array_to_buffer
 
 
 class HFTextVectorizer(BaseVectorizer):
+    # TODO - add docstring
     def __init__(self, model: str, api_config: Optional[Dict] = None):
         # TODO set dims based on model
         dims = 768
@@ -20,30 +20,53 @@ def __init__(self, model: str, api_config: Optional[Dict] = None):
 
     def embed(
         self,
-        emb_input: str,
+        text: str,
         preprocess: Optional[Callable] = None,
-        as_buffer: Optional[bool] = False
+        as_buffer: Optional[float] = False
     ) -> List[float]:
+        """Embed a chunk of text using the Hugging Face sentence transformer.
+
+        Args:
+            text (str): Chunk of text to embed.
+            preprocess (Optional[Callable], optional): Optional preprocessing callable to
+                perform before vectorization. Defaults to None.
+            as_buffer (Optional[float], optional): Whether to convert the raw embedding
+                to a byte string. Defaults to False.
+
+        Returns:
+            List[float]: Embedding.
+        """
         if preprocess:
-            emb_input = preprocess(emb_input)
-        embedding = self._model_client.encode([emb_input])[0]
-        embedding = embedding.tolist()
-        if as_buffer:
-            return array_to_buffer(embedding)
-        return embedding
+            text = preprocess(text)
+        embedding = self._model_client.encode([text])[0]
+        return self._process_embedding(embedding.tolist(), as_buffer)
 
     def embed_many(
         self,
-        inputs: List[str],
+        texts: List[str],
         preprocess: Optional[Callable] = None,
-        chunk_size: int = 1000,
+        batch_size: int = 1000,
         as_buffer: Optional[float] = None
     ) -> List[List[float]]:
-        embeddings = []
-        for batch in self.batchify(inputs, chunk_size, preprocess):
+        """Asynchronously embed many chunks of texts using the Hugging Face sentence
+        transformer.
+
+        Args:
+            texts (List[str]): List of text chunks to embed.
+            preprocess (Optional[Callable], optional): Optional preprocessing callable to
+                perform before vectorization. Defaults to None.
+            batch_size (int, optional): Batch size of texts to use when creating
+                embeddings. Defaults to 10.
+            as_buffer (Optional[float], optional): Whether to convert the raw embedding
+                to a byte string. Defaults to False.
+
+        Returns:
+            List[List[float]]: List of embeddings.
+        """
+        embeddings: List = []
+        for batch in self.batchify(texts, batch_size, preprocess):
             batch_embeddings = self._model_client.encode(batch)
             embeddings.extend([
-                array_to_buffer(embedding.tolist()) if as_buffer else embedding.tolist()
-                for embedding in batch_embeddings
+                self._process_embedding(embedding.tolist(), as_buffer) for embedding in batch_embeddings
             ])
         return embeddings
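
With the renamed parameters (text/texts, batch_size), the Hugging Face vectorizer now mirrors the OpenAI one. A hypothetical usage sketch; the import path, model name, and preprocess callable are illustrative assumptions, not taken from this commit:

# Hypothetical usage of the refactored HF interface.
from redisvl.vectorize.text.huggingface import HFTextVectorizer

hf = HFTextVectorizer(model="sentence-transformers/all-mpnet-base-v2")

# Single chunk -> plain Python list of floats.
vector = hf.embed("redis is an in-memory data store")

# Same call with as_buffer=True -> float32 byte string via _process_embedding.
buffer = hf.embed("redis is an in-memory data store", as_buffer=True)

# Batched: batchify applies preprocess to each chunk before encoding.
vectors = hf.embed_many(
    ["First chunk.", "Second chunk."],
    preprocess=str.lower,
    batch_size=2,
)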

redisvl/vectorize/text/openai.py

Lines changed: 39 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from typing import Callable, Dict, List, Optional
22

33
from redisvl.vectorize.base import BaseVectorizer
4-
from redisvl.utils.utils import array_to_buffer
4+
55

66
class OpenAITextVectorizer(BaseVectorizer):
77
# TODO - add docstring
@@ -19,32 +19,29 @@ def __init__(self, model: str, api_config: Optional[Dict] = None):
1919
openai.api_key = api_config.get("api_key", None)
2020
self._model_client = openai.Embedding
2121

22-
def _process_embedding(self, embedding: List[float], as_buffer: bool):
23-
if as_buffer:
24-
return array_to_buffer(embedding)
25-
return embedding
26-
2722
def embed_many(
2823
self,
29-
inputs: List[str],
24+
texts: List[str],
3025
preprocess: Optional[Callable] = None,
3126
batch_size: Optional[int] = 10,
3227
as_buffer: Optional[float] = False
3328
) -> List[List[float]]:
3429
"""Embed many chunks of texts using the OpenAI API.
3530
3631
Args:
37-
inputs (List[str]): List of text chunks to embed.
32+
texts (List[str]): List of text chunks to embed.
3833
preprocess (Optional[Callable], optional): Optional preprocessing callable to
3934
perform before vectorization. Defaults to None.
40-
batch_size (int, optional): Batch size of texts to use when creating embeddings. Defaults to 10.
41-
as_buffer (Optional[float], optional): Whether to convert the raw embedding to a byte string. Defaults to False.
35+
batch_size (int, optional): Batch size of texts to use when creating
36+
embeddings. Defaults to 10.
37+
as_buffer (Optional[float], optional): Whether to convert the raw embedding
38+
to a byte string. Defaults to False.
4239
4340
Returns:
44-
List[List[float]]: _description_
41+
List[List[float]]: List of embeddings.
4542
"""
4643
embeddings: List = []
47-
for batch in self.batchify(inputs, batch_size, preprocess):
44+
for batch in self.batchify(texts, batch_size, preprocess):
4845
response = self._model_client.create(input=batch, engine=self._model)
4946
embeddings += [
5047
self._process_embedding(r["embedding"], as_buffer) for r in response["data"]
@@ -53,49 +50,50 @@ def embed_many(
5350

5451
def embed(
5552
self,
56-
inputs: List[str],
53+
text: str,
5754
preprocess: Optional[Callable] = None,
58-
batch_size: Optional[int] = 10,
5955
as_buffer: Optional[float] = False
6056
) -> List[float]:
61-
"""Embed chunks of texts using the OpenAI API.
57+
"""Embed a chunk of text using the OpenAI API.
6258
6359
Args:
64-
inputs (List[str]): List of text chunks to embed.
60+
text (str): Chunk of text to embed.
6561
preprocess (Optional[Callable], optional): Optional preprocessing callable to
6662
perform before vectorization. Defaults to None.
67-
batch_size (int, optional): Batch size of texts to use when creating embeddings. Defaults to 10.
68-
as_buffer (Optional[float], optional): Whether to convert the raw embedding to a byte string. Defaults to False.
63+
as_buffer (Optional[float], optional): Whether to convert the raw embedding
64+
to a byte string. Defaults to False.
6965
7066
Returns:
71-
List[List[float]]: _description_
67+
List[float]: Embedding.
7268
"""
7369
if preprocess:
74-
emb_input = preprocess(emb_input)
75-
result = self._model_client.create(input=[emb_input], engine=self._model)
70+
text = preprocess(text)
71+
result = self._model_client.create(input=[text], engine=self._model)
7672
return self._process_embedding(result["data"][0]["embedding"], as_buffer)
7773

78-
7974
async def aembed_many(
8075
self,
81-
inputs: List[str],
76+
texts: List[str],
8277
preprocess: Optional[Callable] = None,
83-
chunk_size: int = 1000,
78+
batch_size: int = 1000,
8479
as_buffer: Optional[bool] = False
8580
) -> List[List[float]]:
86-
"""_summary_
81+
"""Asynchronously embed many chunks of texts using the OpenAI API.
8782
8883
Args:
89-
inputs (List[str]): _description_
90-
preprocess (Optional[Callable], optional): _description_. Defaults to None.
91-
chunk_size (int, optional): _description_. Defaults to 1000.
92-
as_buffer (Optional[bool], optional): _description_. Defaults to False.
84+
texts (List[str]): List of text chunks to embed.
85+
preprocess (Optional[Callable], optional): Optional preprocessing callable to
86+
perform before vectorization. Defaults to None.
87+
batch_size (int, optional): Batch size of texts to use when creating
88+
embeddings. Defaults to 10.
89+
as_buffer (Optional[float], optional): Whether to convert the raw embedding
90+
to a byte string. Defaults to False.
9391
9492
Returns:
95-
List[List[float]]: _description_
93+
List[List[float]]: List of embeddings.
9694
"""
9795
embeddings: List = []
98-
for batch in self.batchify(inputs, chunk_size, preprocess):
96+
for batch in self.batchify(texts, batch_size, preprocess):
9997
response = await self._model_client.acreate(input=batch, engine=self._model)
10098
embeddings += [
10199
self._process_embedding(r["embedding"], as_buffer) for r in response["data"]
@@ -104,21 +102,23 @@ async def aembed_many(
104102

105103
async def aembed(
106104
self,
107-
emb_input: str,
105+
text: str,
108106
preprocess: Optional[Callable] = None,
109107
as_buffer: Optional[bool] = False
110108
) -> List[float]:
111-
"""_summary_
109+
"""Asynchronously embed a chunk of text using the OpenAI API.
112110
113111
Args:
114-
emb_input (str): _description_
115-
preprocess (Optional[Callable], optional): _description_. Defaults to None.
116-
as_buffer (Optional[bool], optional): _description_. Defaults to False.
112+
text (str): Chunk of text to embed.
113+
preprocess (Optional[Callable], optional): Optional preprocessing callable to
114+
perform before vectorization. Defaults to None.
115+
as_buffer (Optional[float], optional): Whether to convert the raw embedding
116+
to a byte string. Defaults to False.
117117
118118
Returns:
119-
List[float]: _description_
119+
List[float]: Embedding.
120120
"""
121121
if preprocess:
122-
emb_input = preprocess(emb_input)
123-
result = await self._model_client.acreate(input=[emb_input], engine=self._model)
122+
text = preprocess(text)
123+
result = await self._model_client.acreate(input=[text], engine=self._model)
124124
return self._process_embedding(result["data"][0]["embedding"], as_buffer)
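
The async path gets the same treatment: aembed and aembed_many now take text/texts and batch_size and await openai.Embedding.acreate. A hypothetical end-to-end sketch; the model name and API key below are placeholders, not values from this commit:

# Hypothetical async usage of the refactored OpenAI vectorizer.
import asyncio

from redisvl.vectorize.text.openai import OpenAITextVectorizer

async def main():
    oai = OpenAITextVectorizer(
        model="text-embedding-ada-002",        # illustrative model name
        api_config={"api_key": "sk-..."},      # placeholder key
    )
    # Single chunk, awaited.
    vector = await oai.aembed("redis is an in-memory data store")
    # Batched, returned as float32 byte strings via _process_embedding.
    vectors = await oai.aembed_many(
        ["First chunk.", "Second chunk."],
        batch_size=2,
        as_buffer=True,
    )

asyncio.run(main())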
