Skip to content

Commit

Permalink
Harrison/tf embeddings (#817)
Browse files Browse the repository at this point in the history
Co-authored-by: Ryohei Kuroki <[email protected]>
  • Loading branch information
hwchase17 and yakigac authored Jan 31, 2023
1 parent 5d4b6e4 commit 7b4882a
Show file tree
Hide file tree
Showing 6 changed files with 1,020 additions and 301 deletions.
74 changes: 70 additions & 4 deletions docs/modules/utils/combine_docs_examples/embeddings.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "42f76e43",
"metadata": {},
Expand Down Expand Up @@ -138,7 +137,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "ed47bb62",
"metadata": {},
Expand Down Expand Up @@ -196,11 +194,79 @@
"source": [
"doc_result = embeddings.embed_documents([text])"
]
},
{
"cell_type": "markdown",
"id": "fff4734f",
"metadata": {},
"source": [
"## TensorflowHub\n",
"Let's load the TensorflowHub Embedding class."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "f822104b",
"metadata": {},
"outputs": [],
"source": [
"from langchain.embeddings import TensorflowHubEmbeddings"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "bac84e46",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2023-01-30 23:53:01.652176: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
"To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"2023-01-30 23:53:34.362802: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
"To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
]
}
],
"source": [
"embeddings = TensorflowHubEmbeddings()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "4790d770",
"metadata": {},
"outputs": [],
"source": [
"text = \"This is a test document.\""
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "f556dcdb",
"metadata": {},
"outputs": [],
"source": [
"query_result = embeddings.embed_query(text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "90f0db94",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "cohere",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
Expand All @@ -214,7 +280,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
"version": "3.10.9"
},
"vscode": {
"interpreter": {
Expand Down
2 changes: 2 additions & 0 deletions langchain/embeddings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.embeddings.huggingface_hub import HuggingFaceHubEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.tensorflow_hub import TensorflowHubEmbeddings

logger = logging.getLogger(__name__)

Expand All @@ -14,6 +15,7 @@
"HuggingFaceEmbeddings",
"CohereEmbeddings",
"HuggingFaceHubEmbeddings",
"TensorflowHubEmbeddings",
]


Expand Down
70 changes: 70 additions & 0 deletions langchain/embeddings/tensorflow_hub.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""Wrapper around TensorflowHub embedding models."""
from typing import Any, List

from pydantic import BaseModel, Extra

from langchain.embeddings.base import Embeddings

DEFAULT_MODEL_URL = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"


class TensorflowHubEmbeddings(BaseModel, Embeddings):
"""Wrapper around tensorflow_hub embedding models.
To use, you should have the ``tensorflow_text`` python package installed.
Example:
.. code-block:: python
from langchain.embeddings import TensorflowHubEmbeddings
url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
tf = TensorflowHubEmbeddings(model_url=url)
"""

embed: Any #: :meta private:
model_url: str = DEFAULT_MODEL_URL
"""Model name to use."""

def __init__(self, **kwargs: Any):
"""Initialize the tensorflow_hub and tensorflow_text."""
super().__init__(**kwargs)
try:
import tensorflow_hub
import tensorflow_text # noqa

self.embed = tensorflow_hub.load(self.model_url)
except ImportError as e:
raise ValueError(
"Could not import some python packages." "Please install them."
) from e

class Config:
"""Configuration for this pydantic object."""

extra = Extra.forbid

def embed_documents(self, texts: List[str]) -> List[List[float]]:
"""Compute doc embeddings using a TensorflowHub embedding model.
Args:
texts: The list of texts to embed.
Returns:
List of embeddings, one for each text.
"""
texts = list(map(lambda x: x.replace("\n", " "), texts))
embeddings = self.embed(texts).numpy()
return embeddings.tolist()

def embed_query(self, text: str) -> List[float]:
"""Compute query embeddings using a TensorflowHub embedding model.
Args:
text: The text to embed.
Returns:
Embeddings for the text.
"""
text = text.replace("\n", " ")
embedding = self.embed(text).numpy()[0]
return embedding.tolist()
Loading

0 comments on commit 7b4882a

Please sign in to comment.