Harrison/tf embeddings (#817)

Co-authored-by: Ryohei Kuroki <[email protected]>
langchain-ai · Jan 31, 2023 · 7b4882a · 7b4882a
1 parent 5d4b6e4
commit 7b4882a
Show file tree

Hide file tree

Showing 6 changed files with 1,020 additions and 301 deletions.
diff --git a/docs/modules/utils/combine_docs_examples/embeddings.ipynb b/docs/modules/utils/combine_docs_examples/embeddings.ipynb
@@ -77,7 +77,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "id": "42f76e43",
    "metadata": {},
@@ -138,7 +137,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "id": "ed47bb62",
    "metadata": {},
@@ -196,11 +194,79 @@
    "source": [
     "doc_result = embeddings.embed_documents([text])"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fff4734f",
+   "metadata": {},
+   "source": [
+    "## TensorflowHub\n",
+    "Let's load the TensorflowHub Embedding class."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "f822104b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.embeddings import TensorflowHubEmbeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "bac84e46",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023-01-30 23:53:01.652176: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA\n",
+      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+      "2023-01-30 23:53:34.362802: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA\n",
+      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
+     ]
+    }
+   ],
+   "source": [
+    "embeddings = TensorflowHubEmbeddings()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "4790d770",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text = \"This is a test document.\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "f556dcdb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "query_result = embeddings.embed_query(text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "90f0db94",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "cohere",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -214,7 +280,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.8"
+   "version": "3.10.9"
   },
   "vscode": {
    "interpreter": {

diff --git a/langchain/embeddings/__init__.py b/langchain/embeddings/__init__.py
@@ -6,6 +6,7 @@
 from langchain.embeddings.huggingface import HuggingFaceEmbeddings
 from langchain.embeddings.huggingface_hub import HuggingFaceHubEmbeddings
 from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.embeddings.tensorflow_hub import TensorflowHubEmbeddings
 
 logger = logging.getLogger(__name__)
 
@@ -14,6 +15,7 @@
     "HuggingFaceEmbeddings",
     "CohereEmbeddings",
     "HuggingFaceHubEmbeddings",
+    "TensorflowHubEmbeddings",
 ]
 
 

diff --git a/langchain/embeddings/tensorflow_hub.py b/langchain/embeddings/tensorflow_hub.py
@@ -0,0 +1,82 @@
+"""Wrapper around TensorflowHub embedding models."""
+from typing import Any, List
+
+from pydantic import BaseModel, Extra
+
+from langchain.embeddings.base import Embeddings
+
+DEFAULT_MODEL_URL = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
+
+
+class TensorflowHubEmbeddings(BaseModel, Embeddings):
+    """Wrapper around tensorflow_hub embedding models.
+
+    To use, you should have the ``tensorflow_hub`` and ``tensorflow_text``
+    python packages installed.
+
+    Example:
+        .. code-block:: python
+
+            from langchain.embeddings import TensorflowHubEmbeddings
+            url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
+            tf = TensorflowHubEmbeddings(model_url=url)
+    """
+
+    embed: Any  #: :meta private:
+    model_url: str = DEFAULT_MODEL_URL
+    """URL of the tensorflow_hub model to load."""
+
+    def __init__(self, **kwargs: Any):
+        """Import tensorflow_hub and tensorflow_text, then load ``model_url``.
+
+        Raises:
+            ValueError: If an ImportError is raised while importing the
+                packages or loading the model.
+        """
+        super().__init__(**kwargs)
+        try:
+            import tensorflow_hub
+            # NOTE(review): unused name; presumably imported for its side effects.
+            import tensorflow_text  # noqa
+
+            self.embed = tensorflow_hub.load(self.model_url)
+        except ImportError as e:
+            raise ValueError(
+                "Could not import some python packages. Please install them."
+            ) from e
+
+    class Config:
+        """Configuration for this pydantic object."""
+
+        extra = Extra.forbid
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        """Compute doc embeddings using a TensorflowHub embedding model.
+
+        Args:
+            texts: The list of texts to embed.
+
+        Returns:
+            List of embeddings, one for each text.
+        """
+        if not texts:
+            # Nothing to embed; skip the model call entirely.
+            return []
+        texts = [text.replace("\n", " ") for text in texts]
+        embeddings = self.embed(texts).numpy()
+        return embeddings.tolist()
+
+    def embed_query(self, text: str) -> List[float]:
+        """Compute query embeddings using a TensorflowHub embedding model.
+
+        Args:
+            text: The text to embed.
+
+        Returns:
+            Embeddings for the text.
+        """
+        text = text.replace("\n", " ")
+        # Call the model the same way embed_documents does: wrap the query in
+        # a one-element list and take the first (only) row of the result.
+        embedding = self.embed([text]).numpy()[0]
+        return embedding.tolist()