Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[test]: basic test framework #5

Merged
merged 9 commits into from
May 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions .github/workflows/ragstack_lint.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Lints the ragstack test package with mypy (via `make lint`).
name: ragstack-lint

on:
  push:
    branches: [ragstack-main]
    paths:
      # Kept in sync with the pull_request paths below so a direct push to a
      # dependency file cannot skip the lint run.
      - "ragstack/poetry.lock"
      - "ragstack/pyproject.toml"
      - "ragstack/tests/**"
  pull_request:
    paths:
      - "ragstack/poetry.lock"
      - "ragstack/pyproject.toml"
      - "ragstack/tests/**"

env:
  POETRY_VERSION: "1.8.2"

jobs:
  lint:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Quoted so YAML does not parse 3.10 as the float 3.1.
        python-version:
          - "3.10"
          - "3.11"
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }} + Poetry ${{ env.POETRY_VERSION }}
        uses: "./.github/actions/poetry_caching"
        with:
          python-version: ${{ matrix.python-version }}
          poetry-version: ${{ env.POETRY_VERSION }}
          # Keyed on the lockfile so dependency bumps invalidate the cache.
          cache-key: ${{ runner.os }}-poetry-${{ env.POETRY_VERSION }}-${{ hashFiles('ragstack/poetry.lock') }}
      - name: Install Python dependencies
        run: |
          poetry env use ${{ matrix.python-version }}
          poetry install
        working-directory: ragstack/
      - name: Get .mypy_cache to speed up mypy
        uses: actions/cache@v4
        env:
          SEGMENT_DOWNLOAD_TIMEOUT_MIN: "2"
        with:
          path: |
            ./ragstack/.mypy_cache
          key: ${{ runner.os }}-mypy-${{ hashFiles('ragstack/pyproject.toml') }}
          # Fall back to the most recent cache for this OS when pyproject.toml
          # changes, so mypy still starts warm instead of from scratch.
          restore-keys: |
            ${{ runner.os }}-mypy-
      - name: Lint check
        run: |
          make lint
        working-directory: ragstack/
44 changes: 44 additions & 0 deletions .github/workflows/ragstack_test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Runs the ragstack integration test suite (via `make test`) on pull requests
# targeting ragstack-main that touch workflows, dependencies, or tests.
name: ragstack-test

on:
  pull_request:
    branches:
      - ragstack-main
    paths:
      - ".github/workflows/**"
      - "ragstack/poetry.lock"
      - "ragstack/pyproject.toml"
      - "ragstack/tests/**"

env:
  POETRY_VERSION: "1.8.2"

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Quoted so YAML does not parse 3.10 as the float 3.1.
        python-version:
          - "3.10"
          - "3.11"
    env:
      # Live credentials for the integration tests; the *_DEV secrets point
      # at the development AstraDB instance.
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ASTRA_DB_API_ENDPOINT: ${{ secrets.ASTRA_DB_API_ENDPOINT_DEV }}
      ASTRA_DB_APPLICATION_TOKEN: ${{ secrets.ASTRA_DB_APPLICATION_TOKEN_DEV }}
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }} + Poetry ${{ env.POETRY_VERSION }}
        uses: "./.github/actions/poetry_caching"
        with:
          python-version: ${{ matrix.python-version }}
          poetry-version: ${{ env.POETRY_VERSION }}
          # Keyed on the lockfile so dependency bumps invalidate the cache.
          cache-key: ${{ runner.os }}-poetry-${{ env.POETRY_VERSION }}-${{ hashFiles('ragstack/poetry.lock') }}
      - name: Install Python dependencies
        run: |
          poetry env use ${{ matrix.python-version }}
          poetry install
        working-directory: ragstack/
      - name: Run all tests
        run: |
          make test
        working-directory: ragstack/
23 changes: 23 additions & 0 deletions ragstack/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Developer entry points for the ragstack test package.
# fix: codespell / fix_codespell were missing from .PHONY, so a file named
# "codespell" in this directory would have silently skipped the target.
.PHONY: format lint test integ_test codespell fix_codespell

# Currently `test` only runs the integration suite.
test: integ_test

# Extra pytest flags can be passed via `make integ_test args="-k foo"`.
integ_test:
	poetry run pytest tests/integration_tests --instafail $(args)

format:
	poetry run ruff check tests --fix
	poetry run ruff format tests

lint:
	poetry run mypy --namespace-packages -p tests.integration_tests

# Spell-check only; requires the optional "spelling" dependency group.
codespell:
	@poetry install --with spelling
	poetry run codespell --toml pyproject.toml

# Spell-check and write fixes in place.
fix_codespell:
	@poetry install --with spelling
	poetry run codespell --toml pyproject.toml --write
5 changes: 5 additions & 0 deletions ragstack/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# RAGStack Langflow

## Documentation

[DataStax RAGStack Documentation](https://docs.datastax.com/en/ragstack/docs/index.html)
8,158 changes: 8,158 additions & 0 deletions ragstack/poetry.lock

Large diffs are not rendered by default.

58 changes: 58 additions & 0 deletions ragstack/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
[tool.poetry]
name = "ragstack-ai-langflow-core"
# NOTE(review): Poetry normalizes this pre-release to PEP 440 "0.0.1a0".
version = "0.0.1.alpha"
description = "RAGStack Langflow"
license = "BUSL-1.1"
authors = ["DataStax"]
readme = "README.md"
documentation = "https://docs.datastax.com/en/ragstack"


[tool.poetry.dependencies]
python = ">=3.10,<3.12"

[tool.poetry.group.test.dependencies]
# The package under test, installed editable from the repository root.
ragstack-ai-langflow = { path = "../", develop = true }
pytest = "^8.1.0"
pytest-instafail = "^0.5.0"
pytest-sugar = "^1.0.0"
pytest-order = "^1.2.1"

[tool.poetry.group.dev.dependencies]
mypy = "^1.9.0"
ruff = "^0.3.5"

[tool.pytest.ini_options]
minversion = "6.0"
addopts = "-ra"
testpaths = ["tests/integration_tests"]
filterwarnings = ["ignore::DeprecationWarning"]
console_output_style = "progress"
log_cli = true

# Optional group: only installed by the codespell Makefile targets.
[tool.poetry.group.spelling]
optional = true

[tool.poetry.group.spelling.dependencies]
codespell = "^2.2.6"

[tool.codespell]
# fix: "*.pdf" was listed twice in the original skip pattern.
skip = '.git,*.pdf,*.svg,*.yaml,*.ipynb,poetry.lock,*.min.js,*.css,package-lock.json,*.trig'
# Ignore latin etc
ignore-regex = '.*(Stati Uniti|Tense=Pres).*'

[tool.ruff]
line-length = 120

[tool.mypy]
plugins = ["pydantic.mypy"]
follow_imports = "silent"
exclude = "tests/integration_tests/conftest.py"

[[tool.mypy.overrides]]
module = "google.cloud.*"
ignore_missing_imports = true

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
Empty file added ragstack/tests/__init__.py
Empty file.
Empty file.
125 changes: 125 additions & 0 deletions ragstack/tests/integration_tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import os
import logging
from typing import Callable, Optional
import pytest
from pathlib import Path

from langchain_core.embeddings import Embeddings

from astrapy.core.db import AstraDB


def pytest_configure():
    """Expose the integration-test flow-definition paths as ``pytest`` globals."""
    data_dir = Path(__file__).parent.absolute() / "data"

    # Flow that loads a URL and embeds it into AstraDB with OpenAIEmbeddings.
    pytest.EMBEDDING_PATH = data_dir / "embedding.json"
    # Flow that searches AstraDB for documents similar to a query.
    pytest.VECTOR_STORE_SEARCH_PATH = data_dir / "vector_search.json"

    for flow_path in (pytest.EMBEDDING_PATH, pytest.VECTOR_STORE_SEARCH_PATH):
        assert flow_path.exists(), (
            f"File {flow_path} does not exist. "
            f"Available files: {list(data_dir.iterdir())}"
        )


LOGGER = logging.getLogger(__name__)
DIR_PATH = os.path.dirname(os.path.abspath(__file__))


def _load_env() -> None:
dotenv_path = os.path.join(DIR_PATH, os.pardir, ".env")
if os.path.exists(dotenv_path):
from dotenv import load_dotenv

load_dotenv(dotenv_path)


_load_env()


def get_env_var(name: str) -> str:
    """Return the value of environment variable ``name``.

    If the variable is unset or empty, log a warning and skip the current
    test, so integration tests degrade gracefully on machines without the
    required credentials.
    """
    value = os.getenv(name)
    if value:
        return value
    message = f"Missing environment variable: {name}"
    LOGGER.warning(message)
    pytest.skip(message)


@pytest.fixture(scope="session", autouse=True)
def setup_and_teardown():
    """Session-wide fixture that wipes every AstraDB collection before and
    after the test run, so each session starts from an empty database and
    leaves nothing behind.

    Skips the whole session (via get_env_var) when credentials are missing.
    """
    LOGGER.info("Deleting existing collections")
    astra = AstraDB(
        token=get_env_var("ASTRA_DB_APPLICATION_TOKEN"),
        api_endpoint=get_env_var("ASTRA_DB_API_ENDPOINT"),
    )
    # NOTE(review): assumes the response always has a "status" key containing a
    # "collections" list — confirm against the astrapy version in poetry.lock.
    collections = astra.get_collections().get("status").get("collections")
    for c in collections:
        astra.delete_collection(c)

    yield

    # Teardown: remove any collections created during the session.
    LOGGER.info("Cleaning up collections")
    collections = astra.get_collections().get("status").get("collections")
    for c in collections:
        astra.delete_collection(c)


class MockEmbeddings(Embeddings):
    """Deterministic, length-based stand-in for a real embedding model.

    Records the most recent inputs it received so tests can assert exactly
    what was embedded, without needing an API key.
    """

    def __init__(self):
        # Last batch of documents / last query passed in (None until used).
        self.embedded_documents = None
        self.embedded_query = None

    @staticmethod
    def mock_embedding(text: str):
        """Map text to a 3-dimensional vector derived from its length."""
        n = len(text)
        return [n / 2, n / 5, n / 10]

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        self.embedded_documents = texts
        return [self.mock_embedding(item) for item in texts]

    def embed_query(self, text: str) -> list[float]:
        self.embedded_query = text
        return self.mock_embedding(text)


@pytest.fixture
def embedding_flow() -> str:
    """Return the raw JSON of the URL-embedding flow definition."""
    return pytest.EMBEDDING_PATH.read_text()


@pytest.fixture
def vector_store_search_flow() -> str:
    """Return the raw JSON of the vector-search flow definition."""
    return pytest.VECTOR_STORE_SEARCH_PATH.read_text()


@pytest.fixture
def astradb_component() -> Callable:
    """Return a factory that builds an AstraDB vector-store component.

    The factory skips the test (via get_env_var) when AstraDB credentials
    are not configured.
    """
    from langflow.components.vectorstores import AstraDBVectorStoreComponent

    def component_builder(
        collection: str,
        embedding: Optional[Embeddings] = None,
        inputs: Optional[list] = None,
    ):
        # Default to the deterministic mock so no OpenAI key is required,
        # and to an empty input list.
        chosen_embedding = embedding if embedding is not None else MockEmbeddings()
        chosen_inputs = inputs if inputs is not None else []
        return AstraDBVectorStoreComponent().build(
            embedding=chosen_embedding,
            collection_name=collection,
            inputs=chosen_inputs,
            token=get_env_var("ASTRA_DB_APPLICATION_TOKEN"),
            api_endpoint=get_env_var("ASTRA_DB_API_ENDPOINT"),
        )

    return component_builder
1 change: 1 addition & 0 deletions ragstack/tests/integration_tests/data/embedding.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions ragstack/tests/integration_tests/data/vector_search.json

Large diffs are not rendered by default.

86 changes: 86 additions & 0 deletions ragstack/tests/integration_tests/test_astradb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import os
import orjson
import pytest
from typing import Callable

from langchain_core.documents import Document
from langflow.load import run_flow_from_json
from langflow.schema import Record

from astrapy.core.db import AstraDB, AstraDBCollection

# Collection name used by the basic component-build tests.
BASIC_COLLECTION = "test"
# Collection populated by the embedding flow and queried by the search test.
EMBEDDING_FLOW_COLLECTION = "test_embedding_flow"


def test_build_no_inputs(astradb_component: Callable):
    """Building the component with no input records should succeed."""
    astradb_component(collection=BASIC_COLLECTION)


def test_build_with_inputs(astradb_component: Callable):
    """Building the component with a couple of input records should succeed."""
    records = [
        Record.from_document(Document(page_content="test")),
        Record.from_document(Document(page_content="test2")),
    ]
    astradb_component(collection=BASIC_COLLECTION, inputs=records)


@pytest.mark.order(1)
def test_astra_embedding_flow(embedding_flow: str):
    """
    Embeds the contents of a URL into AstraDB.

    Runs the stored flow JSON with credentials injected via tweaks, then
    queries AstraDB directly to verify that documents were written.
    Ordered first so test_astra_search can query the populated collection.
    """
    flow = orjson.loads(embedding_flow)
    # Keys are node IDs and must match the IDs inside data/embedding.json;
    # empty dicts leave a node's configuration untouched.
    TWEAKS = {
        "AstraDB-s9tdG": {
            "token": os.environ["ASTRA_DB_APPLICATION_TOKEN"],
            "api_endpoint": os.environ["ASTRA_DB_API_ENDPOINT"],
            "collection_name": EMBEDDING_FLOW_COLLECTION,
        },
        "SplitText-v9ZHX": {},
        "URL-vWSxt": {},
        "OpenAIEmbeddings-YQwtD": {"openai_api_key": os.environ["OPENAI_API_KEY"]},
    }

    result = run_flow_from_json(flow=flow, input_value="", tweaks=TWEAKS)
    # embedding flow, so no particular output
    assert result is not None

    # however, we can check astradb to see if data was inserted
    astra = AstraDB(
        token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
        api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
    )
    collection: AstraDBCollection = astra.collection(EMBEDDING_FLOW_COLLECTION)
    docs = collection.count_documents()
    assert docs["status"]["count"] > 0


@pytest.mark.order(2)
def test_astra_search(vector_store_search_flow: str):
    """
    Searches AstraDB for the most similar document to a given query.

    Ordered second: depends on test_astra_embedding_flow having populated
    EMBEDDING_FLOW_COLLECTION.
    """
    flow = orjson.loads(vector_store_search_flow)

    # Keys are node IDs and must match the IDs inside data/vector_search.json.
    TWEAKS = {
        "OpenAIEmbeddings-sSuTz": {
            "openai_api_key": os.environ["OPENAI_API_KEY"],
        },
        "AstraDBSearch-avH6c": {
            "token": os.environ["ASTRA_DB_APPLICATION_TOKEN"],
            "api_endpoint": os.environ["ASTRA_DB_API_ENDPOINT"],
            "collection_name": EMBEDDING_FLOW_COLLECTION,
            # The actual search text is supplied here rather than as the
            # flow's input_value below.
            "input_value": "Find 3 steps to upload examples",
        },
    }

    result = run_flow_from_json(
        flow=flow,
        input_value="",  # search text goes via the AstraDBSearch tweak above
        output_component="AstraDBSearch-avH6c",
        tweaks=TWEAKS,
    )
    assert result is not None
    data = result[0].outputs[0]
    assert data is not None
    assert data.component_display_name == "Astra DB Search"
Loading