Adding some tests to assert generators preserve page number. (#78)
* Adding some tests to assert generators preserve page number.

* Adding generator test.

* Reverting accidental changes.

* Typing changes.

* Adding an initial test for the core generator.

* Adding tests to the vespa generator.

* Moving util function.

* Adding more tests.

* Removing breakpoint.

* Removing breakpoint.

* Removing another breakpoint.

* Adding more tests.

---------

Co-authored-by: Mark <[email protected]>
THOR300 and Mark authored Nov 22, 2023
1 parent e50fc10 commit 191b39e
Showing 10 changed files with 5,062 additions and 7 deletions.
98 changes: 97 additions & 1 deletion poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -24,6 +24,7 @@ pandas = "^1.5.1"
pre-commit = "^2.17.0"
pytest = "^7.1.1"
types-requests = "^2.28.11"
moto = "^4.1.9"

[build-system]
requires = ["poetry-core>=1.0.0"]
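The new moto dev dependency suggests the tests mock AWS S3 rather than hitting a real bucket. A minimal sketch of how that might look, assuming the bucket name and region from the fixtures below; the fixture name and wiring are illustrative, not code from this commit:

import boto3
import pytest
from moto import mock_s3


@pytest.fixture
def mock_s3_client():
    # Hypothetical sketch: moto intercepts boto3 calls, so the test can
    # create a fake "test-bucket" in eu-west-1 without touching real AWS.
    with mock_s3():
        client = boto3.client("s3", region_name="eu-west-1")
        client.create_bucket(
            Bucket="test-bucket",
            CreateBucketConfiguration={"LocationConstraint": "eu-west-1"},
        )
        yield client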
3 changes: 3 additions & 0 deletions src/index/opensearch.py
@@ -507,6 +507,9 @@ def get_text_document_generator(

for task in tasks:
    all_metadata = get_metadata_dict(task)
    # FIXME: This feels wrong here; would it not be better to pass the
    # embeddings in along with the ParserOutput rather than read them in
    # here? It makes testing hard. We could create a new pydantic object?
    embeddings = np.load(str(embedding_dir_as_path / f"{task.document_id}.npy"))

    # Generate text block docs
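The FIXME above suggests bundling the embeddings with the ParserOutput instead of loading them from disk inside the generator. One possible shape for that object, sketched under the assumption that the project uses pydantic v2 (model_validate appears in the tests below); the class name is hypothetical:

import numpy as np
from pydantic import BaseModel, ConfigDict

from cpr_data_access.parser_models import ParserOutput


class ParserOutputWithEmbeddings(BaseModel):
    """Hypothetical pairing of a parser output with its pre-loaded embeddings."""

    # arbitrary_types_allowed lets pydantic hold a raw numpy array field.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    parser_output: ParserOutput
    embeddings: np.ndarray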
65 changes: 65 additions & 0 deletions src/index/test/conftest.py
@@ -0,0 +1,65 @@
import json
from pathlib import Path
from typing import Any

import numpy as np
import pytest
from cloudpathlib import S3Path

from cpr_data_access.parser_models import ParserOutput


def read_local_json_file(file_path: str) -> dict:
    """Read a local json file and return the data."""
    with open(file_path) as json_file:
        return json.load(json_file)


def read_local_npy_file(file_path: str) -> Any:
    """Read a local npy file and return the data."""
    return np.load(file_path)


@pytest.fixture
def s3_bucket_and_region() -> dict:
    return {
        "bucket": "test-bucket",
        "region": "eu-west-1",
    }


@pytest.fixture
def indexer_input_prefix() -> str:
    return "indexer-input"


@pytest.fixture
def embeddings_dir_as_path(
    s3_bucket_and_region,
    indexer_input_prefix,
) -> S3Path:
    return S3Path(f"s3://{s3_bucket_and_region['bucket']}/{indexer_input_prefix}")


@pytest.fixture
def test_document_data() -> tuple[ParserOutput, Any]:
    parser_output_json = read_local_json_file(
        str(Path(__file__).parent / "data" / "CCLW.executive.10002.4495.json")
    )
    parser_output = ParserOutput.model_validate(parser_output_json)

    embeddings = read_local_npy_file(
        str(Path(__file__).parent / "data" / "CCLW.executive.10002.4495.npy")
    )

    return parser_output, embeddings
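A sketch of how the test_document_data fixture might drive the page-number assertion the commit title describes; the test name is hypothetical, the generator call is omitted, and the text_blocks and page_number attributes are assumptions about the cpr_data_access models, not code shown in this excerpt:

def test_generator_preserves_page_number(test_document_data):
    parser_output, _embeddings = test_document_data

    # Assumption: ParserOutput exposes its parsed text blocks, each carrying
    # the page number it came from; the real tests would feed these through
    # the generator and assert the number survives unchanged.
    for block in parser_output.text_blocks or []:
        assert getattr(block, "page_number", None) is not None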
