Unstructured-IO · MthwRobinson · May 24, 2024 · May 19, 2024 · May 19, 2024 · May 21, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -57,11 +57,14 @@
 
 * **Turn table extraction for PDFs and images off by default**. Reverting the default behavior for table extraction to "off" for PDFs and images. A number of users didn't realize we made the change and were impacted by slower processing times due to the extra model call for table extraction.
 
+## 0.13.8-dev13
+
 ### Enhancements
 
 * **Skip unnecessary element sorting in `partition_pdf()`**. Skip element sorting when determining whether embedded text can be extracted.
 * **Faster evaluation** Support for concurrent processing of documents during evaluation
 * **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy.
+* **Add VoyageAI embedder.** Adds `VoyageAIEmbeddingEncoder` to support generating embeddings via Voyage AI.
 * **Add GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR** configuration parameteres to control temporary storage.
 
 ### Features

diff --git a/examples/embed/example_voyageai.py b/examples/embed/example_voyageai.py
@@ -0,0 +1,25 @@
+import os
+
+from unstructured.documents.elements import Text
+from unstructured.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder
+
+# To use Voyage AI you will need to pass
+# Voyage AI API Key (obtained from https://dash.voyageai.com/)
+# as the ``api_key`` parameter.
+#
+# The ``model_name`` parameter is mandatory, please check the available models
+# at https://docs.voyageai.com/docs/embeddings
+
+embedding_encoder = VoyageAIEmbeddingEncoder(
+    config=VoyageAIEmbeddingConfig(api_key=os.environ["VOYAGE_API_KEY"], model_name="voyage-law-2")
+)
+elements = embedding_encoder.embed_documents(
+    elements=[Text("This is sentence 1"), Text("This is sentence 2")],
+)
+
+query = "This is the query"
+query_embedding = embedding_encoder.embed_query(query=query)
+
+[print(e, e.embeddings) for e in elements]
+print(query, query_embedding)
+print(embedding_encoder.is_unit_vector, embedding_encoder.num_of_dimensions)
diff --git a/requirements/ingest/embed-voyageai.in b/requirements/ingest/embed-voyageai.in
@@ -0,0 +1,4 @@
+-c ../deps/constraints.txt
+-c ../base.txt
+langchain
+langchain-voyageai
diff --git a/requirements/ingest/embed-voyageai.txt b/requirements/ingest/embed-voyageai.txt
@@ -0,0 +1,115 @@
+#
+# This file is autogenerated by pip-compile with Python 3.9
+# by the following command:
+#
+#    pip-compile ./ingest/embed-voyageai.in
+#
+aiohttp==3.9.5
+    # via
+    #   langchain
+    #   voyageai
+aiolimiter==1.1.0
+    # via voyageai
+aiosignal==1.3.1
+    # via aiohttp
+annotated-types==0.7.0
+    # via pydantic
+async-timeout==4.0.3
+    # via
+    #   aiohttp
+    #   langchain
+attrs==23.2.0
+    # via aiohttp
+certifi==2024.2.2
+    # via
+    #   -c ./ingest/../base.txt
+    #   -c ./ingest/../deps/constraints.txt
+    #   requests
+charset-normalizer==3.3.2
+    # via
+    #   -c ./ingest/../base.txt
+    #   requests
+frozenlist==1.4.1
+    # via
+    #   aiohttp
+    #   aiosignal
+idna==3.7
+    # via
+    #   -c ./ingest/../base.txt
+    #   requests
+    #   yarl
+jsonpatch==1.33
+    # via langchain-core
+jsonpointer==2.4
+    # via jsonpatch
+langchain==0.2.1
+    # via -r ./ingest/embed-voyageai.in
+langchain-core==0.2.1
+    # via
+    #   langchain
+    #   langchain-text-splitters
+    #   langchain-voyageai
+langchain-text-splitters==0.2.0
+    # via langchain
+langchain-voyageai==0.1.1
+    # via -r ./ingest/embed-voyageai.in
+langsmith==0.1.62
+    # via
+    #   langchain
+    #   langchain-core
+multidict==6.0.5
+    # via
+    #   aiohttp
+    #   yarl
+numpy==1.26.4
+    # via
+    #   -c ./ingest/../base.txt
+    #   -c ./ingest/../deps/constraints.txt
+    #   langchain
+    #   voyageai
+orjson==3.10.3
+    # via langsmith
+packaging==23.2
+    # via
+    #   -c ./ingest/../base.txt
+    #   -c ./ingest/../deps/constraints.txt
+    #   langchain-core
+pydantic==2.7.1
+    # via
+    #   langchain
+    #   langchain-core
+    #   langsmith
+pydantic-core==2.18.2
+    # via pydantic
+pyyaml==6.0.1
+    # via
+    #   langchain
+    #   langchain-core
+requests==2.32.2
+    # via
+    #   -c ./ingest/../base.txt
+    #   langchain
+    #   langsmith
+    #   voyageai
+sqlalchemy==2.0.30
+    # via langchain
+tenacity==8.3.0
+    # via
+    #   langchain
+    #   langchain-core
+    #   voyageai
+typing-extensions==4.11.0
+    # via
+    #   -c ./ingest/../base.txt
+    #   pydantic
+    #   pydantic-core
+    #   sqlalchemy
+urllib3==1.26.18
+    # via
+    #   -c ./ingest/../base.txt
+    #   -c ./ingest/../deps/constraints.txt
+    #   requests
+voyageai==0.2.2
+    # via langchain-voyageai
+yarl==1.9.4
+    # via aiohttp
diff --git a/setup.py b/setup.py
@@ -171,6 +171,7 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List
         "embed-huggingface": load_requirements("requirements/ingest/embed-huggingface.in"),
         "embed-octoai": load_requirements("requirements/ingest/embed-octoai.in"),
         "embed-vertexai": load_requirements("requirements/ingest/embed-vertexai.in"),
+        "embed-voyageai": load_requirements("requirements/ingest/embed-voyageai.in"),
         "openai": load_requirements("requirements/ingest/embed-openai.in"),
         "bedrock": load_requirements("requirements/ingest/embed-aws-bedrock.in"),
         "databricks-volumes": load_requirements("requirements/ingest/databricks-volumes.in"),

diff --git a/test_unstructured/embed/test_voyageai.py b/test_unstructured/embed/test_voyageai.py
@@ -0,0 +1,21 @@
+from unstructured.documents.elements import Text
+from unstructured.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder
+
+
+def test_embed_documents_does_not_break_element_to_dict(mocker):
+    # Mocked client with the desired behavior for embed_documents
+    mock_client = mocker.MagicMock()
+    mock_client.embed_documents.return_value = [1, 2]
+
+    # Mock create_client to return our mock_client
+    mocker.patch.object(VoyageAIEmbeddingEncoder, "create_client", return_value=mock_client)
+
+    encoder = VoyageAIEmbeddingEncoder(
+        config=VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-law-2")
+    )
+    elements = encoder.embed_documents(
+        elements=[Text("This is sentence 1"), Text("This is sentence 2")],
+    )
+    assert len(elements) == 2
+    assert elements[0].to_dict()["text"] == "This is sentence 1"
+    assert elements[1].to_dict()["text"] == "This is sentence 2"