Skip to content

Commit

Permalink
Restore click & CI (#10)
Browse files Browse the repository at this point in the history
* Restore click

* Clean up
  • Loading branch information
falexwolf authored Mar 18, 2024
1 parent a039513 commit fa3edff
Show file tree
Hide file tree
Showing 6 changed files with 59 additions and 283 deletions.
8 changes: 2 additions & 6 deletions .github/workflows/run.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,15 @@ jobs:

- uses: mamba-org/setup-micromamba@v1
with:
environment-file: environment.yml
environment-file: figure_2_environment.yml
init-shell: >-
bash
cache-environment: true

- name: Get data
run: python setup.py --nrows 256 --ncols 500
shell: bash -el {0}

- name: Set up a test instance to track run
run: lamin init --storage ./testdb
shell: bash -el {0}

- name: Run benchmark
run: python main.py --test
run: python figure_2_iteration_benchmark.py --test
shell: bash -el {0}
9 changes: 6 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
# benchmark
# `arrayloader-benchmarks`

1. Run `setup.py` to download and prepare the data.
2. Run `main.py` to run the benchmark. The results are saved in `results.tsv`. (5 runs, 3 loops each).
This repo contains benchmarking scripts accompanying the blog post: [lamin.ai/blog/arrayloader-benchmarks](https://lamin.ai/blog/arrayloader-benchmarks)

See
[lamin.ai/laminlabs/arrayloader-benchmarks](https://lamin.ai/laminlabs/arrayloader-benchmarks)
for artifacts, benchmarking runs, and additional notebooks.
File renamed without changes.
82 changes: 51 additions & 31 deletions figure_2_iteration_benchmark.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import rich_click as click
import lamindb as ln
import scanpy as sc
import h5py
Expand All @@ -12,32 +13,38 @@


BATCH_SIZE = 128
ln.settings.transform.stem_uid = "r9vQub7PWucj"
ln.settings.transform.version = "1"


def write_data(path: Path, adata: AnnData):
# %%

def convert_adata_to_different_formats(adata: AnnData) -> None:
path: Path = Path.cwd()

# Sparse formats

# HDF5 and ZARR
adata.write_h5ad(path / "adata_benchmark_sparse.h5ad")
adata.write_zarr(path / "adata_benchmark_sparse.zrad")

# %%
# tiledbsoma
tiledbsoma.io.from_h5ad(
(path / "adata_benchmark_sparse.soma").as_posix(),
input_path=(path / "adata_benchmark_sparse.h5ad").as_posix(),
measurement_name="RNA",
)

# %% Dense onwards
adata.X = adata.X.toarray()
# Dense formats

adata.X = adata.X.toarray()
adata.write_h5ad(path / "adata_benchmark_dense.h5ad")
adata.write_zarr(path / "adata_benchmark_dense.zrad")
adata.write_zarr(
path / f"adata_benchmark_dense_chunk_{BATCH_SIZE}.zrad",
chunks=(BATCH_SIZE, adata.X.shape[1]),
)

# %%
# save h5 with dense chunked X, no way to do it with adata.write_h5ad
# Save h5 with dense chunked X; there is no way to do this with adata.write_h5ad
with h5py.File(path / f"adata_dense_chunk_{BATCH_SIZE}.h5", mode="w") as f:
f.create_dataset(
"adata",
Expand All @@ -49,10 +56,10 @@ def write_data(path: Path, adata: AnnData):
labels = adata.obs.cell_states.cat.codes.to_numpy()
f.create_dataset("labels", labels.shape, data=labels)

# %%
df_X_labels = sc.get.obs_df(adata, keys=adata.var_names.to_list() + ["cell_states"])

# %%
# Parquet

# default row groups
df_X_labels.to_parquet(path / "adata_dense.parquet", compression=None)
df_X_labels.to_parquet(
Expand All @@ -61,6 +68,8 @@ def write_data(path: Path, adata: AnnData):
row_group_size=BATCH_SIZE,
)

# tensorstore

sharded_dense_chunk = ts.open(
{
"driver": "zarr3",
Expand Down Expand Up @@ -121,11 +130,11 @@ def write_data(path: Path, adata: AnnData):
sharded_labels[:] = adata.obs["cell_states"].cat.codes.values


def run_benchmarks(path: Path, output: str, epochs: int):
console = rich.get_console()
def run_benchmarks(*, epochs: int) -> None:
main_path: Path = Path.cwd()

paths = {
name: path / filename
name: main_path / filename
for name, filename in {
"h5py_sp": "adata_benchmark_sparse.h5ad",
"soma_sp": "adata_benchmark_sparse.soma",
Expand All @@ -142,9 +151,6 @@ def run_benchmarks(path: Path, output: str, epochs: int):
"zarrV2tensorstore_dense_chunk": f"adata_benchmark_dense_chunk_{BATCH_SIZE}.zrad",
}.items()
}
logger.info("Initializing")

main_path = path

benches = {}
for name, path in paths.items():
Expand All @@ -161,37 +167,51 @@ def run_benchmarks(path: Path, output: str, epochs: int):
...
logger.info("Initialized " + name)

results_filename = "results.tsv"
console = rich.get_console()
for name, bench in benches.items():
console.rule(f"[bold]Running '{name}'", align="left")
with open(main_path / output, "a") as f:
with open(main_path / results_filename, "a") as f:
for i in range(epochs):
time_taken = timeit.Timer(lambda: next(bench)).timeit(1)
f.write(f"{name}\t{i}\t{time_taken}\n")
print(f"Loop {i}: {time_taken:01f}s/epoch")
next(bench)


if __name__ == "__main__":
ln.settings.transform.stem_uid = "r9vQub7PWucj"
ln.settings.transform.version = "1"

@click.command()
@click.option("--test", "is_test", is_flag=True, type=bool, default=False, help="Tell Lamin that we're testing")
def main(is_test: bool = True):

is_production_db = (ln.setup.settings.instance.slug == "laminlabs/arrayloader-benchmarks")
assert is_test != is_production_db, "You're trying to run a test on the production database"
if not is_test:
assert ln.setup.settings.user.handle != "anonymous"

# track script
ln.track()

artifact = ln.Artifact.filter(uid="z3AsAOO39crEioi5kEaG").one()
logger.info("Artifact: {}", artifact)
# load input data
artifact = ln.Artifact.using("laminlabs/arrayloader-benchmarks").filter(uid="z3AsAOO39crEioi5kEaG").one()

# we will save in different formats, so no need to cache
logger.info("Loading data from S3")
with artifact.backed() as store:
adata = store[:, :5000].to_memory()
adata.raw = None
# subset to the first 5000 columns (genes); test runs use only 256 rows and 500 columns
nrows = 256 if is_test else None
ncols = 500 if is_test else 5000

path = Path.cwd()
with artifact.backed() as adata:
adata_subset = adata[:nrows, :ncols].to_memory()
adata_subset.raw = None

write_data(path, adata)
output = "results.tsv"
run_benchmarks(path, output, 4)
# convert data
convert_adata_to_different_formats(adata_subset)

ln.Artifact(output, key=f"cli_runs/{output}").save()
# run benchmarks
run_benchmarks(epochs=4)

# finish run
ln.finish()


if __name__ == "__main__":
main()
93 changes: 0 additions & 93 deletions main.py

This file was deleted.

Loading

0 comments on commit fa3edff

Please sign in to comment.