Skip to content

Commit

Permalink
feat: add VoyageAI embeddings (#3069) (#3099)
Browse files Browse the repository at this point in the history
Original PR was #3069. Merged in to a feature branch to fix dependency
and linting issues. Application code changes from the original PR were
already reviewed and approved.

------------
Original PR description:
Adding VoyageAI embeddings 
Voyage AI’s embedding models and rerankers are state-of-the-art in
retrieval accuracy.

---------

Co-authored-by: fzowl <160063452+fzowl@users.noreply.github.com>
Co-authored-by: Liuhong99 <39693953+Liuhong99@users.noreply.github.com>
3 people authored May 24, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
1 parent 32df4ee commit 6b400b4
Showing 41 changed files with 20,601 additions and 56 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
## 0.14.3-dev4
## 0.14.3-dev5

### Enhancements

* **Move `category` field from Text class to Element class.**
* **`partition_docx()` now supports pluggable picture sub-partitioners.** A subpartitioner that accepts a DOCX `Paragraph` and generates elements is now supported. This allows adding a custom sub-partitioner that extracts images and applies OCR or summarization for the image.
* **Add VoyageAI embedder.** Adds VoyageAI embeddings to support embedding via Voyage AI.

### Features

25 changes: 25 additions & 0 deletions examples/embed/example_voyageai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import os

from unstructured.documents.elements import Text
from unstructured.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder

# To use Voyage AI, pass your Voyage AI API key
# (obtained from https://dash.voyageai.com/)
# as the ``api_key`` parameter.
#
# The ``model_name`` parameter is mandatory; see the available models
# at https://docs.voyageai.com/docs/embeddings

# Build the encoder. The API key is read from the VOYAGE_API_KEY environment
# variable; indexing os.environ raises KeyError here if it is not set.
embedding_encoder = VoyageAIEmbeddingEncoder(
    config=VoyageAIEmbeddingConfig(
        api_key=os.environ["VOYAGE_API_KEY"],
        model_name="voyage-law-2",
    )
)

# Embed two sample Text elements; the returned elements are read back below
# through their ``embeddings`` attribute.
elements = embedding_encoder.embed_documents(
    elements=[Text("This is sentence 1"), Text("This is sentence 2")],
)

# Embed a free-form query string with the same encoder.
query = "This is the query"
query_embedding = embedding_encoder.embed_query(query=query)

# Print each element next to its embedding. A plain loop is used because the
# prints are side effects; a list comprehension would build a throwaway list.
for element in elements:
    print(element, element.embeddings)
print(query, query_embedding)
print(embedding_encoder.is_unit_vector, embedding_encoder.num_of_dimensions)
2 changes: 1 addition & 1 deletion requirements/base.txt
Original file line number Diff line number Diff line change
@@ -86,7 +86,7 @@ tabulate==0.9.0
# via -r ./base.in
tqdm==4.66.4
# via nltk
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -r ./base.in
# emoji
5 changes: 4 additions & 1 deletion requirements/deps/constraints.txt
Original file line number Diff line number Diff line change
@@ -57,7 +57,10 @@ unstructured-client<=0.18.0

fsspec==2024.5.0

# python 3.12 support
# python 3.12 support
numpy>=1.26.0
wrapt>=1.14.0


# NOTE(robinson): for compatibility with voyage embeddings
langsmith==0.1.62
6 changes: 3 additions & 3 deletions requirements/dev.txt
Original file line number Diff line number Diff line change
@@ -151,7 +151,7 @@ jsonschema-specifications==2023.12.1
# jsonschema
jupyter==1.0.0
# via -r ./dev.in
jupyter-client==8.6.1
jupyter-client==8.6.2
# via
# ipykernel
# jupyter-console
@@ -185,7 +185,7 @@ jupyter-server==2.14.0
# notebook-shim
jupyter-server-terminals==0.5.3
# via jupyter-server
jupyterlab==4.2.0
jupyterlab==4.2.1
# via notebook
jupyterlab-pygments==0.3.0
# via nbconvert
@@ -392,7 +392,7 @@ traitlets==5.14.3
# qtconsole
types-python-dateutil==2.9.0.20240316
# via arrow
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./base.txt
# -c ./test.txt
2 changes: 1 addition & 1 deletion requirements/extra-docx.txt
Original file line number Diff line number Diff line change
@@ -12,7 +12,7 @@ python-docx==1.1.2
# via
# -c ././deps/constraints.txt
# -r ./extra-docx.in
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./base.txt
# python-docx
2 changes: 1 addition & 1 deletion requirements/extra-odt.txt
Original file line number Diff line number Diff line change
@@ -14,7 +14,7 @@ python-docx==1.1.2
# via
# -c ././deps/constraints.txt
# -r ./extra-odt.in
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./base.txt
# python-docx
6 changes: 3 additions & 3 deletions requirements/extra-paddleocr.txt
Original file line number Diff line number Diff line change
@@ -8,7 +8,7 @@ attrdict==2.0.1
# via unstructured-paddleocr
babel==2.15.0
# via flask-babel
bce-python-sdk==0.9.10
bce-python-sdk==0.9.11
# via visualdl
blinker==1.8.2
# via flask
@@ -45,7 +45,7 @@ flask==3.0.3
# visualdl
flask-babel==4.0.0
# via visualdl
fonttools==4.51.0
fonttools==4.52.1
# via matplotlib
future==1.0.0
# via bce-python-sdk
@@ -200,7 +200,7 @@ six==1.16.0
# imgaug
# python-dateutil
# visualdl
tifffile==2024.5.10
tifffile==2024.5.22
# via scikit-image
tqdm==4.66.4
# via
6 changes: 3 additions & 3 deletions requirements/extra-pdf-image.txt
Original file line number Diff line number Diff line change
@@ -39,7 +39,7 @@ filelock==3.14.0
# transformers
flatbuffers==24.3.25
# via onnxruntime
fonttools==4.51.0
fonttools==4.52.1
# via matplotlib
fsspec==2024.5.0
# via
@@ -118,7 +118,7 @@ numpy==1.26.4
# transformers
omegaconf==2.3.0
# via effdet
onnx==1.16.0
onnx==1.16.1
# via
# -r ./extra-pdf-image.in
# unstructured-inference
@@ -278,7 +278,7 @@ tqdm==4.66.4
# transformers
transformers==4.41.1
# via unstructured-inference
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./base.txt
# huggingface-hub
2 changes: 1 addition & 1 deletion requirements/huggingface.txt
Original file line number Diff line number Diff line change
@@ -102,7 +102,7 @@ tqdm==4.66.4
# transformers
transformers==4.41.1
# via -r ./huggingface.in
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./base.txt
# huggingface-hub
2 changes: 1 addition & 1 deletion requirements/ingest/airtable.txt
Original file line number Diff line number Diff line change
@@ -31,7 +31,7 @@ requests==2.32.2
# via
# -c ./ingest/../base.txt
# pyairtable
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./ingest/../base.txt
# pyairtable
2 changes: 1 addition & 1 deletion requirements/ingest/azure-cognitive-search.txt
Original file line number Diff line number Diff line change
@@ -34,7 +34,7 @@ six==1.16.0
# -c ./ingest/../base.txt
# azure-core
# isodate
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./ingest/../base.txt
# azure-core
2 changes: 1 addition & 1 deletion requirements/ingest/azure.txt
Original file line number Diff line number Diff line change
@@ -93,7 +93,7 @@ six==1.16.0
# -c ./ingest/../base.txt
# azure-core
# isodate
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./ingest/../base.txt
# azure-core
2 changes: 1 addition & 1 deletion requirements/ingest/chroma.txt
Original file line number Diff line number Diff line change
@@ -198,7 +198,7 @@ typer==0.9.0
# via
# -r ./ingest/chroma.in
# chromadb
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./ingest/../base.txt
# chromadb
2 changes: 1 addition & 1 deletion requirements/ingest/databricks-volumes.txt
Original file line number Diff line number Diff line change
@@ -15,7 +15,7 @@ charset-normalizer==3.3.2
# via
# -c ./ingest/../base.txt
# requests
databricks-sdk==0.27.1
databricks-sdk==0.28.0
# via -r ./ingest/databricks-volumes.in
google-auth==2.29.0
# via databricks-sdk
2 changes: 1 addition & 1 deletion requirements/ingest/elasticsearch.txt
Original file line number Diff line number Diff line change
@@ -11,7 +11,7 @@ certifi==2024.2.2
# elastic-transport
elastic-transport==8.13.0
# via elasticsearch
elasticsearch==8.13.1
elasticsearch==8.13.2
# via -r ./ingest/elasticsearch.in
urllib3==1.26.18
# via
10 changes: 5 additions & 5 deletions requirements/ingest/embed-aws-bedrock.txt
Original file line number Diff line number Diff line change
@@ -37,7 +37,6 @@ charset-normalizer==3.3.2
dataclasses-json==0.6.6
# via
# -c ./ingest/../base.txt
# langchain
# langchain-community
frozenlist==1.4.1
# via
@@ -56,9 +55,9 @@ jsonpatch==1.33
# via langchain-core
jsonpointer==2.4
# via jsonpatch
langchain==0.2.0
langchain==0.2.1
# via langchain-community
langchain-community==0.2.0
langchain-community==0.2.1
# via -r ./ingest/embed-aws-bedrock.in
langchain-core==0.2.1
# via
@@ -67,8 +66,9 @@ langchain-core==0.2.1
# langchain-text-splitters
langchain-text-splitters==0.2.0
# via langchain
langsmith==0.1.61
langsmith==0.1.62
# via
# -c ./ingest/../deps/constraints.txt
# langchain
# langchain-community
# langchain-core
@@ -135,7 +135,7 @@ tenacity==8.3.0
# langchain
# langchain-community
# langchain-core
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./ingest/../base.txt
# pydantic
10 changes: 5 additions & 5 deletions requirements/ingest/embed-huggingface.txt
Original file line number Diff line number Diff line change
@@ -30,7 +30,6 @@ charset-normalizer==3.3.2
dataclasses-json==0.6.6
# via
# -c ./ingest/../base.txt
# langchain
# langchain-community
filelock==3.14.0
# via
@@ -68,9 +67,9 @@ jsonpatch==1.33
# via langchain-core
jsonpointer==2.4
# via jsonpatch
langchain==0.2.0
langchain==0.2.1
# via langchain-community
langchain-community==0.2.0
langchain-community==0.2.1
# via -r ./ingest/embed-huggingface.in
langchain-core==0.2.1
# via
@@ -79,8 +78,9 @@ langchain-core==0.2.1
# langchain-text-splitters
langchain-text-splitters==0.2.0
# via langchain
langsmith==0.1.61
langsmith==0.1.62
# via
# -c ./ingest/../deps/constraints.txt
# langchain
# langchain-community
# langchain-core
@@ -188,7 +188,7 @@ tqdm==4.66.4
# transformers
transformers==4.41.1
# via sentence-transformers
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./ingest/../base.txt
# huggingface-hub
4 changes: 2 additions & 2 deletions requirements/ingest/embed-octoai.txt
Original file line number Diff line number Diff line change
@@ -38,7 +38,7 @@ idna==3.7
# anyio
# httpx
# requests
openai==1.30.1
openai==1.30.3
# via -r ./ingest/embed-octoai.in
pydantic==2.7.1
# via openai
@@ -63,7 +63,7 @@ tqdm==4.66.4
# via
# -c ./ingest/../base.txt
# openai
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./ingest/../base.txt
# openai
12 changes: 6 additions & 6 deletions requirements/ingest/embed-openai.txt
Original file line number Diff line number Diff line change
@@ -37,7 +37,6 @@ charset-normalizer==3.3.2
dataclasses-json==0.6.6
# via
# -c ./ingest/../base.txt
# langchain
# langchain-community
distro==1.9.0
# via openai
@@ -64,9 +63,9 @@ jsonpatch==1.33
# via langchain-core
jsonpointer==2.4
# via jsonpatch
langchain==0.2.0
langchain==0.2.1
# via langchain-community
langchain-community==0.2.0
langchain-community==0.2.1
# via -r ./ingest/embed-openai.in
langchain-core==0.2.1
# via
@@ -75,8 +74,9 @@ langchain-core==0.2.1
# langchain-text-splitters
langchain-text-splitters==0.2.0
# via langchain
langsmith==0.1.61
langsmith==0.1.62
# via
# -c ./ingest/../deps/constraints.txt
# langchain
# langchain-community
# langchain-core
@@ -98,7 +98,7 @@ numpy==1.26.4
# -c ./ingest/../deps/constraints.txt
# langchain
# langchain-community
openai==1.30.1
openai==1.30.3
# via -r ./ingest/embed-openai.in
orjson==3.10.3
# via langsmith
@@ -152,7 +152,7 @@ tqdm==4.66.4
# via
# -c ./ingest/../base.txt
# openai
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./ingest/../base.txt
# openai
Loading

0 comments on commit 6b400b4

Please sign in to comment.