Merge branch 'langchain-ai:master' into community_chroma_patch

langchain-ai · Nov 4, 2024 · 69de5ea · 69de5ea
2 parents 7b31bfe + ba5cba0
commit 69de5ea
Show file tree

Hide file tree

Showing 80 changed files with 2,556 additions and 5,631 deletions.
diff --git a/.github/scripts/get_min_versions.py b/.github/scripts/get_min_versions.py
@@ -7,12 +7,17 @@
     # for python 3.10 and below, which doesnt have stdlib tomllib
     import tomli as tomllib
 
-from packaging.version import parse as parse_version
 from packaging.specifiers import SpecifierSet
 from packaging.version import Version
 
+
+import requests
+from packaging.version import parse
+from typing import List
+
 import re
 
+
 MIN_VERSION_LIBS = [
     "langchain-core",
     "langchain-community",
@@ -31,29 +36,61 @@
 ]
 
 
-def get_min_version(version: str) -> str:
-    # base regex for x.x.x with cases for rc/post/etc
-    # valid strings: https://peps.python.org/pep-0440/#public-version-identifiers
-    vstring = r"\d+(?:\.\d+){0,2}(?:(?:a|b|rc|\.post|\.dev)\d+)?"
-    # case ^x.x.x
-    _match = re.match(f"^\\^({vstring})$", version)
-    if _match:
-        return _match.group(1)
+def get_pypi_versions(package_name: str) -> List[str]:
+    """
+    Fetch all available versions for a package from PyPI.
+
+    Args:
+        package_name (str): Name of the package
+
+    Returns:
+        List[str]: List of all available versions
 
-    # case >=x.x.x,<y.y.y
-    _match = re.match(f"^>=({vstring}),<({vstring})$", version)
-    if _match:
-        _min = _match.group(1)
-        _max = _match.group(2)
-        assert parse_version(_min) < parse_version(_max)
-        return _min
+    Raises:
+        requests.exceptions.RequestException: If PyPI API request fails
+        KeyError: If package not found or response format unexpected
+    """
+    pypi_url = f"https://pypi.org/pypi/{package_name}/json"
+    response = requests.get(pypi_url)
+    response.raise_for_status()
+    return list(response.json()["releases"].keys())
 
-    # case x.x.x
-    _match = re.match(f"^({vstring})$", version)
-    if _match:
-        return _match.group(1)
 
-    raise ValueError(f"Unrecognized version format: {version}")
+def get_minimum_version(package_name: str, spec_string: str) -> Optional[str]:
+    """
+    Find the minimum published version that satisfies the given constraints.
+
+    Args:
+        package_name (str): Name of the package
+        spec_string (str): Version specification string (e.g., ">=0.2.43,<0.4.0,!=0.3.0")
+
+    Returns:
+        Optional[str]: Minimum compatible version or None if no compatible version found
+    """
+    # rewrite occurrences of ^0.0.z to 0.0.z (can be anywhere in constraint string)
+    spec_string = re.sub(r"\^0\.0\.(\d+)", r"0.0.\1", spec_string)
+    # rewrite occurrences of ^0.y.z to >=0.y.z,<0.y+1 (can be anywhere in constraint string)
+    for y in range(1, 10):
+        spec_string = re.sub(rf"\^0\.{y}\.(\d+)", rf">=0.{y}.\1,<0.{y+1}", spec_string)
+    # rewrite occurrences of ^x.y.z to >=x.y.z,<x+1.0.0 (can be anywhere in constraint string)
+    for x in range(1, 10):
+        spec_string = re.sub(
+            rf"\^{x}\.(\d+)\.(\d+)", rf">={x}.\1.\2,<{x+1}", spec_string
+        )
+
+    spec_set = SpecifierSet(spec_string)
+    all_versions = get_pypi_versions(package_name)
+
+    valid_versions = []
+    for version_str in all_versions:
+        try:
+            version = parse(version_str)
+            if spec_set.contains(version):
+                valid_versions.append(version)
+        except ValueError:
+            continue
+
+    return str(min(valid_versions)) if valid_versions else None
 
 
 def get_min_version_from_toml(
@@ -96,7 +133,7 @@ def get_min_version_from_toml(
                 ][0]["version"]
 
             # Use parse_version to get the minimum supported version from version_string
-            min_version = get_min_version(version_string)
+            min_version = get_minimum_version(lib, version_string)
 
             # Store the minimum version in the min_versions dictionary
             min_versions[lib] = min_version
@@ -112,6 +149,20 @@ def check_python_version(version_string, constraint_string):
     :param constraint_string: A string representing the package's Python version constraints (e.g. ">=3.6, <4.0").
     :return: True if the version matches the constraints, False otherwise.
     """
+
+    # rewrite occurrences of ^0.0.z to 0.0.z (can be anywhere in constraint string)
+    constraint_string = re.sub(r"\^0\.0\.(\d+)", r"0.0.\1", constraint_string)
+    # rewrite occurrences of ^0.y.z to >=0.y.z,<0.y+1.0 (can be anywhere in constraint string)
+    for y in range(1, 10):
+        constraint_string = re.sub(
+            rf"\^0\.{y}\.(\d+)", rf">=0.{y}.\1,<0.{y+1}.0", constraint_string
+        )
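+    # rewrite occurrences of ^x.0.z to >=x.0.z,<x+1.0.0 (can be anywhere in constraint string); NOTE(review): the pattern below only matches a zero minor version, so ^x.y.z with y > 0 is not rewritten here - confirm this is intended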
+    for x in range(1, 10):
+        constraint_string = re.sub(
+            rf"\^{x}\.0\.(\d+)", rf">={x}.0.\1,<{x+1}.0.0", constraint_string
+        )
+
     try:
         version = Version(version_string)
         constraints = SpecifierSet(constraint_string)

diff --git a/.github/workflows/_integration_test.yml b/.github/workflows/_integration_test.yml
@@ -81,7 +81,6 @@ jobs:
           ES_URL: ${{ secrets.ES_URL }}
           ES_CLOUD_ID: ${{ secrets.ES_CLOUD_ID }}
           ES_API_KEY: ${{ secrets.ES_API_KEY }}
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # for airbyte
           MONGODB_ATLAS_URI: ${{ secrets.MONGODB_ATLAS_URI }}
           VOYAGE_API_KEY: ${{ secrets.VOYAGE_API_KEY }}
           COHERE_API_KEY: ${{ secrets.COHERE_API_KEY }}

diff --git a/.github/workflows/_release.yml b/.github/workflows/_release.yml
@@ -95,9 +95,25 @@ jobs:
           PKG_NAME: ${{ needs.build.outputs.pkg-name }}
           VERSION: ${{ needs.build.outputs.version }}
         run: |
-          REGEX="^$PKG_NAME==\\d+\\.\\d+\\.\\d+\$"
-          echo $REGEX
-          PREV_TAG=$(git tag --sort=-creatordate | grep -P $REGEX || true | head -1)
+          PREV_TAG="$PKG_NAME==${VERSION%.*}.$(( ${VERSION##*.} - 1 ))"; [[ "${VERSION##*.}" -eq 0 ]] && PREV_TAG=""
+
+          # backup case if releasing e.g. 0.3.0, looks up last release
+          # note if last release (chronologically) was e.g. 0.1.47 it will get 
+          # that instead of the last 0.2 release
+          if [ -z "$PREV_TAG" ]; then
+            REGEX="^$PKG_NAME==\\d+\\.\\d+\\.\\d+\$"
+            echo $REGEX
+            PREV_TAG=$(git tag --sort=-creatordate | grep -P $REGEX || true | head -1)
+          fi
+
+          # confirm prev-tag actually exists in git repo with git tag
+          GIT_TAG_RESULT=$(git tag -l "$PREV_TAG")
+          if [ -z "$GIT_TAG_RESULT" ]; then
+            echo "Previous tag $PREV_TAG not found in git repo"
+            exit 1
+          fi
+
+
           TAG="${PKG_NAME}==${VERSION}"
           if [ "$TAG" == "$PREV_TAG" ]; then
             echo "No new version to release"
@@ -231,7 +247,7 @@ jobs:
         working-directory: ${{ inputs.working-directory }}
         id: min-version
         run: |
-          poetry run pip install packaging
+          poetry run pip install packaging requests
           python_version="$(poetry run python --version | awk '{print $2}')"
           min_versions="$(poetry run python $GITHUB_WORKSPACE/.github/scripts/get_min_versions.py pyproject.toml release $python_version)"
           echo "min-versions=$min_versions" >> "$GITHUB_OUTPUT"
@@ -289,7 +305,6 @@ jobs:
           ES_URL: ${{ secrets.ES_URL }}
           ES_CLOUD_ID: ${{ secrets.ES_CLOUD_ID }}
           ES_API_KEY: ${{ secrets.ES_API_KEY }}
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # for airbyte
           MONGODB_ATLAS_URI: ${{ secrets.MONGODB_ATLAS_URI }}
           VOYAGE_API_KEY: ${{ secrets.VOYAGE_API_KEY }}
           UPSTAGE_API_KEY: ${{ secrets.UPSTAGE_API_KEY }}

diff --git a/.github/workflows/_test.yml b/.github/workflows/_test.yml
@@ -47,7 +47,7 @@ jobs:
         id: min-version
         shell: bash
         run: |
-          poetry run pip install packaging tomli
+          poetry run pip install packaging tomli requests
           python_version="$(poetry run python --version | awk '{print $2}')"
           min_versions="$(poetry run python $GITHUB_WORKSPACE/.github/scripts/get_min_versions.py pyproject.toml pull_request $python_version)"
           echo "min-versions=$min_versions" >> "$GITHUB_OUTPUT"

diff --git a/.github/workflows/api_doc_build.yml b/.github/workflows/api_doc_build.yml
@@ -72,9 +72,7 @@ jobs:
       - name: Install dependencies
         working-directory: langchain
         run: |
-          
-          # skip airbyte due to pandas dependency issue
-          python -m uv pip install $(ls ./libs/partners | grep -vE "airbyte" | xargs -I {} echo "./libs/partners/{}")
+          python -m uv pip install $(ls ./libs/partners | xargs -I {} echo "./libs/partners/{}")
           python -m uv pip install libs/core libs/langchain libs/text-splitters libs/community libs/experimental
           python -m uv pip install -r docs/api_reference/requirements.txt
 

diff --git a/.github/workflows/check_diffs.yml b/.github/workflows/check_diffs.yml
@@ -31,7 +31,7 @@ jobs:
         uses: Ana06/[email protected]
       - id: set-matrix
         run: |
-          python -m pip install packaging
+          python -m pip install packaging requests
           python .github/scripts/check_diff.py ${{ steps.files.outputs.all }} >> $GITHUB_OUTPUT
     outputs:
       lint: ${{ steps.set-matrix.outputs.lint }}

diff --git a/docs/api_reference/create_api_rst.py b/docs/api_reference/create_api_rst.py
@@ -530,7 +530,6 @@ def _out_file_path(package_name: str) -> Path:
 
 def _build_index(dirs: List[str]) -> None:
     custom_names = {
-        "airbyte": "Airbyte",
         "aws": "AWS",
         "ai21": "AI21",
         "ibm": "IBM",

diff --git a/docs/docs/concepts/messages.mdx b/docs/docs/concepts/messages.mdx
@@ -12,7 +12,7 @@ Each message has a **role** (e.g., "user", "assistant"), **content** (e.g., text
 
 LangChain provides a unified message format that can be used across chat models, allowing users to work with different chat models without worrying about the specific details of the message format used by each model provider.
 
-## What inside a message?
+## What is inside a message?
 
 A message typically consists of the following pieces of information:
 

diff --git a/docs/docs/how_to/document_loader_csv.ipynb b/docs/docs/how_to/document_loader_csv.ipynb
@@ -157,7 +157,7 @@
     "    temp_file_path = temp_file.name\n",
     "\n",
     "loader = CSVLoader(file_path=temp_file_path)\n",
-    "loader.load()\n",
+    "data = loader.load()\n",
     "for record in data[:2]:\n",
     "    print(record)"
    ]

diff --git a/docs/docs/how_to/multi_vector.ipynb b/docs/docs/how_to/multi_vector.ipynb
@@ -207,7 +207,7 @@
    "id": "cdef8339-f9fa-4b3b-955f-ad9dbdf2734f",
    "metadata": {},
    "source": [
-    "The default search type the retriever performs on the vector database is a similarity search. LangChain vector stores also support searching via [Max Marginal Relevance](https://python.langchain.com/api_reference/core/vectorstores/langchain_core.vectorstores.VectorStore.html#langchain_core.vectorstores.VectorStore.max_marginal_relevance_search). This can be controlled via the `search_type` parameter of the retriever:"
+    "The default search type the retriever performs on the vector database is a similarity search. LangChain vector stores also support searching via [Max Marginal Relevance](https://python.langchain.com/api_reference/core/vectorstores/langchain_core.vectorstores.base.VectorStore.html#langchain_core.vectorstores.base.VectorStore.max_marginal_relevance_search). This can be controlled via the `search_type` parameter of the retriever:"
    ]
   },
   {

diff --git a/docs/docs/how_to/qa_chat_history_how_to.ipynb b/docs/docs/how_to/qa_chat_history_how_to.ipynb
@@ -155,7 +155,7 @@
    "id": "15f8ad59-19de-42e3-85a8-3ba95ee0bd43",
    "metadata": {},
    "source": [
-    "For the retriever, we will use [WebBaseLoader](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.web_base.WebBaseLoader.html) to load the content of a web page. Here we instantiate a `InMemoryVectorStore` vectorstore and then use its [.as_retriever](https://python.langchain.com/api_reference/core/vectorstores/langchain_core.vectorstores.VectorStore.html#langchain_core.vectorstores.VectorStore.as_retriever) method to build a retriever that can be incorporated into [LCEL](/docs/concepts/lcel) chains."
+    "For the retriever, we will use [WebBaseLoader](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.web_base.WebBaseLoader.html) to load the content of a web page. Here we instantiate an `InMemoryVectorStore` vectorstore and then use its [.as_retriever](https://python.langchain.com/api_reference/core/vectorstores/langchain_core.vectorstores.base.VectorStore.html#langchain_core.vectorstores.base.VectorStore.as_retriever) method to build a retriever that can be incorporated into [LCEL](/docs/concepts/lcel) chains."
    ]
   },
   {

diff --git a/docs/docs/how_to/vectorstore_retriever.ipynb b/docs/docs/how_to/vectorstore_retriever.ipynb
@@ -28,7 +28,7 @@
     "\n",
     "## Creating a retriever from a vectorstore\n",
     "\n",
-    "You can build a retriever from a vectorstore using its [.as_retriever](https://python.langchain.com/api_reference/core/vectorstores/langchain_core.vectorstores.VectorStore.html#langchain_core.vectorstores.VectorStore.as_retriever) method. Let's walk through an example.\n",
+    "You can build a retriever from a vectorstore using its [.as_retriever](https://python.langchain.com/api_reference/core/vectorstores/langchain_core.vectorstores.base.VectorStore.html#langchain_core.vectorstores.base.VectorStore.as_retriever) method. Let's walk through an example.\n",
     "\n",
     "First we instantiate a vectorstore. We will use an in-memory [FAISS](https://python.langchain.com/api_reference/community/vectorstores/langchain_community.vectorstores.faiss.FAISS.html) vectorstore:"
    ]

diff --git a/docs/docs/integrations/providers/databricks.md b/docs/docs/integrations/providers/databricks.md
@@ -14,31 +14,21 @@ Databricks embraces the LangChain ecosystem in various ways:
 Installation
 ------------
 
-First-party Databricks integrations are available in the langchain-databricks partner package.
+First-party Databricks integrations are now available in the databricks-langchain partner package.
 
 ```
-pip install langchain-databricks
+pip install databricks-langchain
 ```
 
-🚧 Upcoming Package Consolidation Notice
-
-This package (`langchain-databricks`) will soon be consolidated into a new package: `databricks-langchain`. The new package will serve as the primary hub for all Databricks Langchain integrations.
-
-What’s Changing?
-In the coming months, `databricks-langchain` will include all features currently in `langchain-databricks`, as well as additional integrations to provide a unified experience for Databricks users.
-
-What You Need to Know
-For now, continue to use `langchain-databricks` as usual. When `databricks-langchain` is ready, we’ll provide clear migration instructions to make the transition seamless. During the transition period, `langchain-databricks` will remain operational, and updates will be shared here with timelines and guidance.
-
-Thank you for your support as we work toward an improved, streamlined experience!
+The legacy langchain-databricks partner package is still available but will soon be deprecated.
 
 Chat Model
 ----------
 
 `ChatDatabricks` is a Chat Model class to access chat endpoints hosted on Databricks, including state-of-the-art models such as Llama3, Mixtral, and DBRX, as well as your own fine-tuned models.
 
 ```
-from langchain_databricks import ChatDatabricks
+from databricks_langchain import ChatDatabricks
 
 chat_model = ChatDatabricks(endpoint="databricks-meta-llama-3-70b-instruct")
 ```
@@ -69,7 +59,7 @@ Embeddings
 `DatabricksEmbeddings` is an Embeddings class to access text-embedding endpoints hosted on Databricks, including state-of-the-art models such as BGE, as well as your own fine-tuned models.
 
 ```
-from langchain_databricks import DatabricksEmbeddings
+from databricks_langchain import DatabricksEmbeddings
 
 embeddings = DatabricksEmbeddings(endpoint="databricks-bge-large-en")
 ```
@@ -83,7 +73,7 @@ Vector Search
 Databricks Vector Search is a serverless similarity search engine that allows you to store a vector representation of your data, including metadata, in a vector database. With Vector Search, you can create auto-updating vector search indexes from [Delta](https://docs.databricks.com/en/introduction/delta-comparison.html) tables managed by [Unity Catalog](https://www.databricks.com/product/unity-catalog) and query them with a simple API to return the most similar vectors.
 
 ```
-from langchain_databricks.vectorstores import DatabricksVectorSearch
+from databricks_langchain import DatabricksVectorSearch
 
 dvs = DatabricksVectorSearch(
     endpoint="<YOUT_ENDPOINT_NAME>",

diff --git a/docs/docs/integrations/providers/vectara/index.mdx b/docs/docs/integrations/providers/vectara/index.mdx
@@ -4,15 +4,14 @@
 > which is grounded in the data, documents, and knowledge that they have (technically, it is Retrieval-Augmented-Generation-as-a-service).
 
 **Vectara Overview:**
-`Vectara` is RAG-as-a-service, providing all the components of RAG behind an easy-to-use API, including:
+[Vectara](https://vectara.com/) is the trusted AI Assistant and Agent platform which focuses on enterprise readiness for mission-critical applications.
+Vectara serverless RAG-as-a-service provides all the components of RAG behind an easy-to-use API, including:
 1. A way to extract text from files (PDF, PPT, DOCX, etc)
 2. ML-based chunking that provides state of the art performance.
 3. The [Boomerang](https://vectara.com/how-boomerang-takes-retrieval-augmented-generation-to-the-next-level-via-grounded-generation/) embeddings model.
 4. Its own internal vector database where text chunks and embedding vectors are stored.
-5. A query service that automatically encodes the query into embedding, and retrieves the most relevant text segments
-(including support for [Hybrid Search](https://docs.vectara.com/docs/api-reference/search-apis/lexical-matching) and 
-[MMR](https://vectara.com/get-diverse-results-and-comprehensive-summaries-with-vectaras-mmr-reranker/))
-7. An LLM to for creating a [generative summary](https://docs.vectara.com/docs/learn/grounded-generation/grounded-generation-overview), based on the retrieved documents (context), including citations.
+5. A query service that automatically encodes the query into an embedding and retrieves the most relevant text segments, including support for [Hybrid Search](https://docs.vectara.com/docs/api-reference/search-apis/lexical-matching) as well as multiple reranking options such as the [multi-lingual relevance reranker](https://www.vectara.com/blog/deep-dive-into-vectara-multilingual-reranker-v1-state-of-the-art-reranker-across-100-languages), [MMR](https://vectara.com/get-diverse-results-and-comprehensive-summaries-with-vectaras-mmr-reranker/), and the [UDF reranker](https://www.vectara.com/blog/rag-with-user-defined-functions-based-reranking).
+6. An LLM for creating a [generative summary](https://docs.vectara.com/docs/learn/grounded-generation/grounded-generation-overview), based on the retrieved documents (context), including citations.
 
 For more information:
 - [Documentation](https://docs.vectara.com/docs/)
@@ -22,7 +21,7 @@ For more information:
 ## Installation and Setup
 
 To use `Vectara` with LangChain no special installation steps are required. 
-To get started, [sign up](https://vectara.com/integrations/langchain) for a free Vectara account (if you don't already have one), 
+To get started, [sign up](https://vectara.com/integrations/langchain) for a free Vectara trial,
 and follow the [quickstart](https://docs.vectara.com/docs/quickstart) guide to create a corpus and an API key. 
 Once you have these, you can provide them as arguments to the Vectara `vectorstore`, or you can set them as environment variables.