Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[test]: basic test framework #5

Merged
merged 9 commits into from
May 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions .github/workflows/ragstack_lint.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Lints the ragstack test package with mypy (via `make lint`).
name: ragstack-lint

on:
  push:
    branches: [ragstack-main]
    paths:
      # Kept in sync with the pull_request paths below so a direct push to a
      # dependency file cannot skip the lint run.
      - "ragstack/poetry.lock"
      - "ragstack/pyproject.toml"
      - "ragstack/tests/**"
  pull_request:
    paths:
      - "ragstack/poetry.lock"
      - "ragstack/pyproject.toml"
      - "ragstack/tests/**"

env:
  POETRY_VERSION: "1.8.2"

jobs:
  lint:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Quoted so YAML does not parse 3.10 as the float 3.1.
        python-version:
          - "3.10"
          - "3.11"
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }} + Poetry ${{ env.POETRY_VERSION }}
        uses: "./.github/actions/poetry_caching"
        with:
          python-version: ${{ matrix.python-version }}
          poetry-version: ${{ env.POETRY_VERSION }}
          # Keyed on the lockfile so dependency bumps invalidate the cache.
          cache-key: ${{ runner.os }}-poetry-${{ env.POETRY_VERSION }}-${{ hashFiles('ragstack/poetry.lock') }}
      - name: Install Python dependencies
        run: |
          poetry env use ${{ matrix.python-version }}
          poetry install
        working-directory: ragstack/
      - name: Get .mypy_cache to speed up mypy
        uses: actions/cache@v4
        env:
          SEGMENT_DOWNLOAD_TIMEOUT_MIN: "2"
        with:
          path: |
            ./ragstack/.mypy_cache
          key: ${{ runner.os }}-mypy-${{ hashFiles('ragstack/pyproject.toml') }}
          # Fall back to the most recent cache for this OS when pyproject.toml
          # changes, so mypy still starts warm instead of from scratch.
          restore-keys: |
            ${{ runner.os }}-mypy-
      - name: Lint check
        run: |
          make lint
        working-directory: ragstack/
44 changes: 44 additions & 0 deletions .github/workflows/ragstack_test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Runs the ragstack integration test suite (via `make test`) on pull requests
# targeting ragstack-main that touch workflows, dependencies, or tests.
name: ragstack-test

on:
  pull_request:
    branches:
      - ragstack-main
    paths:
      - ".github/workflows/**"
      - "ragstack/poetry.lock"
      - "ragstack/pyproject.toml"
      - "ragstack/tests/**"

env:
  POETRY_VERSION: "1.8.2"

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Quoted so YAML does not parse 3.10 as the float 3.1.
        python-version:
          - "3.10"
          - "3.11"
    env:
      # Live credentials for the integration tests; the *_DEV secrets point
      # at the development AstraDB instance.
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ASTRA_DB_API_ENDPOINT: ${{ secrets.ASTRA_DB_API_ENDPOINT_DEV }}
      ASTRA_DB_APPLICATION_TOKEN: ${{ secrets.ASTRA_DB_APPLICATION_TOKEN_DEV }}
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }} + Poetry ${{ env.POETRY_VERSION }}
        uses: "./.github/actions/poetry_caching"
        with:
          python-version: ${{ matrix.python-version }}
          poetry-version: ${{ env.POETRY_VERSION }}
          # Keyed on the lockfile so dependency bumps invalidate the cache.
          cache-key: ${{ runner.os }}-poetry-${{ env.POETRY_VERSION }}-${{ hashFiles('ragstack/poetry.lock') }}
      - name: Install Python dependencies
        run: |
          poetry env use ${{ matrix.python-version }}
          poetry install
        working-directory: ragstack/
      - name: Run all tests
        run: |
          make test
        working-directory: ragstack/
23 changes: 23 additions & 0 deletions ragstack/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Developer entry points for the ragstack test package.
# fix: codespell / fix_codespell were missing from .PHONY, so a file named
# "codespell" in this directory would have silently skipped the target.
.PHONY: format lint test integ_test codespell fix_codespell

# Currently `test` only runs the integration suite.
test: integ_test

# Extra pytest flags can be passed via `make integ_test args="-k foo"`.
integ_test:
	poetry run pytest tests/integration_tests --instafail $(args)

format:
	poetry run ruff check tests --fix
	poetry run ruff format tests

lint:
	poetry run mypy --namespace-packages -p tests.integration_tests

# Spell-check only; requires the optional "spelling" dependency group.
codespell:
	@poetry install --with spelling
	poetry run codespell --toml pyproject.toml

# Spell-check and write fixes in place.
fix_codespell:
	@poetry install --with spelling
	poetry run codespell --toml pyproject.toml --write
5 changes: 5 additions & 0 deletions ragstack/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# RAGStack Langflow

## Documentation

[DataStax RAGStack Documentation](https://docs.datastax.com/en/ragstack/docs/index.html)
8,158 changes: 8,158 additions & 0 deletions ragstack/poetry.lock

Large diffs are not rendered by default.

58 changes: 58 additions & 0 deletions ragstack/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
[tool.poetry]
name = "ragstack-ai-langflow-core"
# NOTE(review): Poetry normalizes this pre-release to PEP 440 "0.0.1a0".
version = "0.0.1.alpha"
description = "RAGStack Langflow"
license = "BUSL-1.1"
authors = ["DataStax"]
readme = "README.md"
documentation = "https://docs.datastax.com/en/ragstack"


[tool.poetry.dependencies]
python = ">=3.10,<3.12"

[tool.poetry.group.test.dependencies]
# The package under test, installed editable from the repository root.
ragstack-ai-langflow = { path = "../", develop = true }
pytest = "^8.1.0"
pytest-instafail = "^0.5.0"
pytest-sugar = "^1.0.0"
pytest-order = "^1.2.1"

[tool.poetry.group.dev.dependencies]
mypy = "^1.9.0"
ruff = "^0.3.5"

[tool.pytest.ini_options]
minversion = "6.0"
addopts = "-ra"
testpaths = ["tests/integration_tests"]
filterwarnings = ["ignore::DeprecationWarning"]
console_output_style = "progress"
log_cli = true

# Optional group: only installed by the codespell Makefile targets.
[tool.poetry.group.spelling]
optional = true

[tool.poetry.group.spelling.dependencies]
codespell = "^2.2.6"

[tool.codespell]
# fix: "*.pdf" was listed twice in the original skip pattern.
skip = '.git,*.pdf,*.svg,*.yaml,*.ipynb,poetry.lock,*.min.js,*.css,package-lock.json,*.trig'
# Ignore latin etc
ignore-regex = '.*(Stati Uniti|Tense=Pres).*'

[tool.ruff]
line-length = 120

[tool.mypy]
plugins = ["pydantic.mypy"]
follow_imports = "silent"
exclude = "tests/integration_tests/conftest.py"

[[tool.mypy.overrides]]
module = "google.cloud.*"
ignore_missing_imports = true

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
Empty file added ragstack/tests/__init__.py
Empty file.
Empty file.
125 changes: 125 additions & 0 deletions ragstack/tests/integration_tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import os
import logging
from typing import Callable, Optional
import pytest
from pathlib import Path

from langchain_core.embeddings import Embeddings

from astrapy.core.db import AstraDB


def pytest_configure():
    """Expose the integration-test flow-definition paths as ``pytest`` globals."""
    data_dir = Path(__file__).parent.absolute() / "data"

    # Flow that loads a URL and embeds it into AstraDB with OpenAIEmbeddings.
    pytest.EMBEDDING_PATH = data_dir / "embedding.json"
    # Flow that searches AstraDB for documents similar to a query.
    pytest.VECTOR_STORE_SEARCH_PATH = data_dir / "vector_search.json"

    for flow_path in (pytest.EMBEDDING_PATH, pytest.VECTOR_STORE_SEARCH_PATH):
        assert flow_path.exists(), (
            f"File {flow_path} does not exist. "
            f"Available files: {list(data_dir.iterdir())}"
        )


LOGGER = logging.getLogger(__name__)
DIR_PATH = os.path.dirname(os.path.abspath(__file__))


def _load_env() -> None:
dotenv_path = os.path.join(DIR_PATH, os.pardir, ".env")
if os.path.exists(dotenv_path):
from dotenv import load_dotenv

load_dotenv(dotenv_path)


_load_env()


def get_env_var(name: str) -> str:
    """Return the value of environment variable ``name``.

    If the variable is unset or empty, log a warning and skip the current
    test, so integration tests degrade gracefully on machines without the
    required credentials.
    """
    value = os.getenv(name)
    if value:
        return value
    message = f"Missing environment variable: {name}"
    LOGGER.warning(message)
    pytest.skip(message)


@pytest.fixture(scope="session", autouse=True)
def setup_and_teardown():
    """Session-wide fixture that wipes every AstraDB collection before and
    after the test run, so each session starts from an empty database and
    leaves nothing behind.

    Skips the whole session (via get_env_var) when credentials are missing.
    """
    LOGGER.info("Deleting existing collections")
    astra = AstraDB(
        token=get_env_var("ASTRA_DB_APPLICATION_TOKEN"),
        api_endpoint=get_env_var("ASTRA_DB_API_ENDPOINT"),
    )
    # NOTE(review): assumes the response always has a "status" key containing a
    # "collections" list — confirm against the astrapy version in poetry.lock.
    collections = astra.get_collections().get("status").get("collections")
    for c in collections:
        astra.delete_collection(c)

    yield

    # Teardown: remove any collections created during the session.
    LOGGER.info("Cleaning up collections")
    collections = astra.get_collections().get("status").get("collections")
    for c in collections:
        astra.delete_collection(c)


class MockEmbeddings(Embeddings):
    """Deterministic, length-based stand-in for a real embedding model.

    Records the most recent inputs it received so tests can assert exactly
    what was embedded, without needing an API key.
    """

    def __init__(self):
        # Last batch of documents / last query passed in (None until used).
        self.embedded_documents = None
        self.embedded_query = None

    @staticmethod
    def mock_embedding(text: str):
        """Map text to a 3-dimensional vector derived from its length."""
        n = len(text)
        return [n / 2, n / 5, n / 10]

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        self.embedded_documents = texts
        return [self.mock_embedding(item) for item in texts]

    def embed_query(self, text: str) -> list[float]:
        self.embedded_query = text
        return self.mock_embedding(text)


@pytest.fixture
def embedding_flow() -> str:
    """Return the raw JSON of the URL-embedding flow definition."""
    return pytest.EMBEDDING_PATH.read_text()


@pytest.fixture
def vector_store_search_flow() -> str:
    """Return the raw JSON of the vector-search flow definition."""
    return pytest.VECTOR_STORE_SEARCH_PATH.read_text()


@pytest.fixture
def astradb_component() -> Callable:
    """Return a factory that builds an AstraDB vector-store component.

    The factory skips the test (via get_env_var) when AstraDB credentials
    are not configured.
    """
    from langflow.components.vectorstores import AstraDBVectorStoreComponent

    def component_builder(
        collection: str,
        embedding: Optional[Embeddings] = None,
        inputs: Optional[list] = None,
    ):
        # Default to the deterministic mock so no OpenAI key is required,
        # and to an empty input list.
        chosen_embedding = embedding if embedding is not None else MockEmbeddings()
        chosen_inputs = inputs if inputs is not None else []
        return AstraDBVectorStoreComponent().build(
            embedding=chosen_embedding,
            collection_name=collection,
            inputs=chosen_inputs,
            token=get_env_var("ASTRA_DB_APPLICATION_TOKEN"),
            api_endpoint=get_env_var("ASTRA_DB_API_ENDPOINT"),
        )

    return component_builder
1 change: 1 addition & 0 deletions ragstack/tests/integration_tests/data/embedding.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions ragstack/tests/integration_tests/data/vector_search.json

Large diffs are not rendered by default.

86 changes: 86 additions & 0 deletions ragstack/tests/integration_tests/test_astradb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import os
import orjson
import pytest
from typing import Callable

from langchain_core.documents import Document
from langflow.load import run_flow_from_json
from langflow.schema import Record

from astrapy.core.db import AstraDB, AstraDBCollection

# Collection name used by the basic component-build tests.
BASIC_COLLECTION = "test"
# Collection populated by the embedding flow and queried by the search test.
EMBEDDING_FLOW_COLLECTION = "test_embedding_flow"


def test_build_no_inputs(astradb_component: Callable):
    """Building the component with no input records should succeed."""
    astradb_component(collection=BASIC_COLLECTION)


def test_build_with_inputs(astradb_component: Callable):
    """Building the component with a couple of input records should succeed."""
    records = [
        Record.from_document(Document(page_content="test")),
        Record.from_document(Document(page_content="test2")),
    ]
    astradb_component(collection=BASIC_COLLECTION, inputs=records)


@pytest.mark.order(1)
def test_astra_embedding_flow(embedding_flow: str):
    """
    Embeds the contents of a URL into AstraDB.

    Runs the stored flow JSON with credentials injected via tweaks, then
    queries AstraDB directly to verify that documents were written.
    Ordered first so test_astra_search can query the populated collection.
    """
    flow = orjson.loads(embedding_flow)
    # Keys are node IDs and must match the IDs inside data/embedding.json;
    # empty dicts leave a node's configuration untouched.
    TWEAKS = {
        "AstraDB-s9tdG": {
            "token": os.environ["ASTRA_DB_APPLICATION_TOKEN"],
            "api_endpoint": os.environ["ASTRA_DB_API_ENDPOINT"],
            "collection_name": EMBEDDING_FLOW_COLLECTION,
        },
        "SplitText-v9ZHX": {},
        "URL-vWSxt": {},
        "OpenAIEmbeddings-YQwtD": {"openai_api_key": os.environ["OPENAI_API_KEY"]},
    }

    result = run_flow_from_json(flow=flow, input_value="", tweaks=TWEAKS)
    # embedding flow, so no particular output
    assert result is not None

    # however, we can check astradb to see if data was inserted
    astra = AstraDB(
        token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
        api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
    )
    collection: AstraDBCollection = astra.collection(EMBEDDING_FLOW_COLLECTION)
    docs = collection.count_documents()
    assert docs["status"]["count"] > 0


@pytest.mark.order(2)
def test_astra_search(vector_store_search_flow: str):
    """
    Searches AstraDB for the most similar document to a given query.

    Ordered second: depends on test_astra_embedding_flow having populated
    EMBEDDING_FLOW_COLLECTION.
    """
    flow = orjson.loads(vector_store_search_flow)

    # Keys are node IDs and must match the IDs inside data/vector_search.json.
    TWEAKS = {
        "OpenAIEmbeddings-sSuTz": {
            "openai_api_key": os.environ["OPENAI_API_KEY"],
        },
        "AstraDBSearch-avH6c": {
            "token": os.environ["ASTRA_DB_APPLICATION_TOKEN"],
            "api_endpoint": os.environ["ASTRA_DB_API_ENDPOINT"],
            "collection_name": EMBEDDING_FLOW_COLLECTION,
            # The actual search text is supplied here rather than as the
            # flow's input_value below.
            "input_value": "Find 3 steps to upload examples",
        },
    }

    result = run_flow_from_json(
        flow=flow,
        input_value="",  # search text goes via the AstraDBSearch tweak above
        output_component="AstraDBSearch-avH6c",
        tweaks=TWEAKS,
    )
    assert result is not None
    data = result[0].outputs[0]
    assert data is not None
    assert data.component_display_name == "Astra DB Search"
Loading