Unstructured-IO · MthwRobinson · May 24, 2024 · May 19, 2024 · May 19, 2024 · May 21, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,10 +1,11 @@
-## 0.13.8-dev12
+## 0.13.8-dev13
 
 ### Enhancements
 
 * **Skip unnecessary element sorting in `partition_pdf()`**. Skip element sorting when determining whether embedded text can be extracted.
 * **Faster evaluation** Support for concurrent processing of documents during evaluation
 * **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy.
+* **Add VoyageAI embedder.** Adds the `VoyageAIEmbeddingEncoder` to support embedding via Voyage AI.
 
 ### Features
 

diff --git a/docs/source/core/embedding.rst b/docs/source/core/embedding.rst
@@ -226,4 +226,55 @@ workload identity, etc…)
 
     [print(e.embeddings, e) for e in elements]
     print(query_embedding, query)
-    print(embedding_encoder.is_unit_vector(), embedding_encoder.num_of_dimensions())
+    print(embedding_encoder.is_unit_vector(), embedding_encoder.num_of_dimensions())
+
+``VoyageAIEmbeddingEncoder``
+----------------------------
+
+The ``VoyageAIEmbeddingEncoder`` class connects to the Voyage AI API to obtain embeddings for pieces of text.
+
+``embed_documents`` will receive a list of Elements, and return an updated list which
+includes the ``embeddings`` attribute for each Element.
+
+``embed_query`` will receive a query as a string, and return a list of floats which is the
+embedding vector for the given query string.
+
+``num_of_dimensions`` is a metadata property that denotes the number of dimensions in any
+embedding vector obtained via this class.
+
+``is_unit_vector`` is a metadata property that denotes if embedding vectors obtained via
+this class are unit vectors.
+
+The following code block shows an example of how to use ``VoyageAIEmbeddingEncoder``. You will
+see the updated elements list (with the ``embeddings`` attribute included for each element),
+the embedding vector for the query string, and some metadata properties about the embedding model.
+
+To use Voyage AI, you will need to pass a Voyage AI API key (obtained from https://dash.voyageai.com/)
+as the ``api_key`` parameter.
+
+The ``model_name`` parameter is mandatory; see the available models
+at https://docs.voyageai.com/docs/embeddings
+
+.. code:: python
+
+    import os
+
+    from unstructured.documents.elements import Text
+    from unstructured.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder
+
+    embedding_encoder = VoyageAIEmbeddingEncoder(
+        config=VoyageAIEmbeddingConfig(
+            api_key=os.environ["VOYAGE_API_KEY"],
+            model_name="voyage-law-2"
+        )
+    )
+    elements = embedding_encoder.embed_documents(
+        elements=[Text("This is sentence 1"), Text("This is sentence 2")],
+    )
+
+    query = "This is the query"
+    query_embedding = embedding_encoder.embed_query(query=query)
+
+    [print(e, e.embeddings) for e in elements]
+    print(query, query_embedding)
+    print(embedding_encoder.is_unit_vector, embedding_encoder.num_of_dimensions)
diff --git a/examples/embed/example_voyageai.py b/examples/embed/example_voyageai.py
@@ -0,0 +1,27 @@
+import os
+
+from unstructured.documents.elements import Text
+from unstructured.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder
+
+# To use Voyage AI, pass your Voyage AI API key (obtained from https://dash.voyageai.com/)
+# as the `api_key` parameter. The `model_name` parameter is mandatory; see the available
+# models at https://docs.voyageai.com/docs/embeddings
+
+embedding_encoder = VoyageAIEmbeddingEncoder(
+    config=VoyageAIEmbeddingConfig(
+        api_key=os.environ["VOYAGE_API_KEY"],
+        model_name="voyage-law-2",
+    )
+)
+elements = embedding_encoder.embed_documents(
+    elements=[Text("This is sentence 1"), Text("This is sentence 2")],
+)
+
+query = "This is the query"
+query_embedding = embedding_encoder.embed_query(query=query)
+
+# Show each element with its embedding, then the query embedding and encoder metadata.
+for element in elements:
+    print(element, element.embeddings)
+print(query, query_embedding)
+print(embedding_encoder.is_unit_vector, embedding_encoder.num_of_dimensions)
diff --git a/requirements/ingest/embed-voyageai.in b/requirements/ingest/embed-voyageai.in
@@ -0,0 +1,4 @@
+-c ../deps/constraints.txt
+-c ../base.txt
+langchain
+langchain-voyageai
diff --git a/requirements/ingest/embed-voyageai.txt b/requirements/ingest/embed-voyageai.txt
@@ -0,0 +1,140 @@
+#
+# This file is autogenerated by pip-compile with Python 3.11
+# by the following command:
+#
+#    pip-compile ./ingest/embed-voyageai.in
+#
+aiohttp==3.9.5
+    # via
+    #   langchain
+    #   langchain-community
+    #   voyageai
+aiolimiter==1.1.0
+    # via voyageai
+aiosignal==1.3.1
+    # via aiohttp
+annotated-types==0.6.0
+    # via pydantic
+attrs==23.2.0
+    # via aiohttp
+certifi==2024.2.2
+    # via
+    #   -c ./ingest/../base.txt
+    #   -c ./ingest/../deps/constraints.txt
+    #   requests
+charset-normalizer==3.3.2
+    # via
+    #   -c ./ingest/../base.txt
+    #   requests
+dataclasses-json==0.6.6
+    # via
+    #   -c ./ingest/../base.txt
+    #   langchain
+    #   langchain-community
+frozenlist==1.4.1
+    # via
+    #   aiohttp
+    #   aiosignal
+idna==3.7
+    # via
+    #   -c ./ingest/../base.txt
+    #   requests
+    #   yarl
+jsonpatch==1.33
+    # via langchain-core
+jsonpointer==2.4
+    # via jsonpatch
+langchain==0.1.20
+    # via -r ./ingest/embed-voyageai.in
+langchain-community==0.0.38
+    # via langchain
+langchain-core==0.1.52
+    # via
+    #   langchain
+    #   langchain-community
+    #   langchain-text-splitters
+    #   langchain-voyageai
+langchain-text-splitters==0.0.1
+    # via langchain
+langchain-voyageai==0.1.1
+    # via -r ./ingest/embed-voyageai.in
+langsmith==0.1.57
+    # via
+    #   langchain
+    #   langchain-community
+    #   langchain-core
+marshmallow==3.21.2
+    # via
+    #   -c ./ingest/../base.txt
+    #   dataclasses-json
+multidict==6.0.5
+    # via
+    #   aiohttp
+    #   yarl
+mypy-extensions==1.0.0
+    # via
+    #   -c ./ingest/../base.txt
+    #   typing-inspect
+numpy==1.26.4
+    # via
+    #   -c ./ingest/../base.txt
+    #   langchain
+    #   langchain-community
+    #   voyageai
+orjson==3.10.3
+    # via langsmith
+packaging==23.2
+    # via
+    #   -c ./ingest/../base.txt
+    #   -c ./ingest/../deps/constraints.txt
+    #   langchain-core
+    #   marshmallow
+pydantic==2.7.1
+    # via
+    #   langchain
+    #   langchain-core
+    #   langsmith
+pydantic-core==2.18.2
+    # via pydantic
+pyyaml==6.0.1
+    # via
+    #   langchain
+    #   langchain-community
+    #   langchain-core
+requests==2.31.0
+    # via
+    #   -c ./ingest/../base.txt
+    #   langchain
+    #   langchain-community
+    #   langsmith
+    #   voyageai
+sqlalchemy==2.0.30
+    # via
+    #   langchain
+    #   langchain-community
+tenacity==8.3.0
+    # via
+    #   langchain
+    #   langchain-community
+    #   langchain-core
+    #   voyageai
+typing-extensions==4.11.0
+    # via
+    #   -c ./ingest/../base.txt
+    #   pydantic
+    #   pydantic-core
+    #   sqlalchemy
+    #   typing-inspect
+typing-inspect==0.9.0
+    # via
+    #   -c ./ingest/../base.txt
+    #   dataclasses-json
+urllib3==1.26.18
+    # via
+    #   -c ./ingest/../base.txt
+    #   -c ./ingest/../deps/constraints.txt
+    #   requests
+voyageai==0.2.2
+    # via langchain-voyageai
+yarl==1.9.4
+    # via aiohttp
diff --git a/setup.py b/setup.py
@@ -171,6 +171,7 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List
         "embed-huggingface": load_requirements("requirements/ingest/embed-huggingface.in"),
         "embed-octoai": load_requirements("requirements/ingest/embed-octoai.in"),
         "embed-vertexai": load_requirements("requirements/ingest/embed-vertexai.in"),
+        "embed-voyageai": load_requirements("requirements/ingest/embed-voyageai.in"),
         "openai": load_requirements("requirements/ingest/embed-openai.in"),
         "bedrock": load_requirements("requirements/ingest/embed-aws-bedrock.in"),
         "databricks-volumes": load_requirements("requirements/ingest/databricks-volumes.in"),

diff --git a/test_unstructured/embed/test_voyageai.py b/test_unstructured/embed/test_voyageai.py
@@ -0,0 +1,19 @@
+from unstructured.documents.elements import Text
+from unstructured.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder
+
+
+def test_embed_documents_does_not_break_element_to_dict(mocker):
+    """Elements returned by ``embed_documents`` still serialize their text via ``to_dict()``."""
+    sentences = ["This is sentence 1", "This is sentence 2"]
+
+    # Stand-in client whose embed_documents returns two fake embeddings; create_client hands it out.
+    fake_client = mocker.MagicMock()
+    fake_client.embed_documents.return_value = [1, 2]
+    mocker.patch.object(VoyageAIEmbeddingEncoder, "create_client", return_value=fake_client)
+
+    config = VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-law-2")
+    encoder = VoyageAIEmbeddingEncoder(config=config)
+    embedded = encoder.embed_documents(elements=[Text(s) for s in sentences])
+
+    assert len(embedded) == 2
+    assert [e.to_dict()["text"] for e in embedded] == sentences