diff --git a/.gitignore b/.gitignore index aedc8d7c..ac4340c1 100644 --- a/.gitignore +++ b/.gitignore @@ -81,3 +81,4 @@ venv/ # written by setuptools_scm **/_version.py +benchmarks/results/* diff --git a/MANIFEST.in b/MANIFEST.in index c1b54649..1e12c6ea 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,8 +3,14 @@ include README.md exclude .pre-commit-config.yaml recursive-include brainglobe_workflows *.py +recursive-exclude brainglobe_workflows/cellfinder/notebooks *.py recursive-exclude * __pycache__ recursive-exclude * *.py[co] recursive-exclude docs * recursive-exclude tests * + +include *.json +recursive-include benchmarks *.json +recursive-include benchmarks *.py +recursive-exclude benchmarks/results * diff --git a/asv.conf.json b/asv.conf.json new file mode 100644 index 00000000..473e347d --- /dev/null +++ b/asv.conf.json @@ -0,0 +1,194 @@ +{ + // The version of the config file format. Do not change, unless + // you know what you are doing. + "version": 1, + + // The name of the project being benchmarked + "project": "brainglobe_workflows", + + // The project's homepage + "project_url": "https://github.com/brainglobe/brainglobe-workflows", + + // The URL or local path of the source code repository for the + // project being benchmarked + "repo": ".", + + // The Python project's subdirectory in your repo. If missing or + // the empty string, the project is assumed to be located at the root + // of the repository. + // "repo_subdir": "", + + // Customizable commands for building the project. + // See asv.conf.json documentation. 
+ // Build the package using pyproject.toml (PEP 518) + "build_command": [ + "python -m pip install build", + "python -m build", + "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}" + ], + // To build the package using setuptools and a setup.py file, uncomment the following lines + // "build_command": [ + // "python setup.py build", + // "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}" + // ], + + // Customizable commands for installing and uninstalling the project. + // See asv.conf.json documentation. + "install_command": ["in-dir={env_dir} python -mpip install --force-reinstall {wheel_file}"], + "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"], + + // List of branches to benchmark. If not provided, defaults to "master" + // (for git) or "default" (for mercurial). + "branches": ["smg/cellfinder-wf1-benchmark"], // for git + // "branches": ["default"], // for mercurial + + // The DVCS being used. If not set, it will be automatically + // determined from "repo" by looking at the protocol in the URL + // (if remote), or by looking for special directories, such as + // ".git" (if local). + // "dvcs": "git", + + // The tool to use to create environments. May be "conda", + // "virtualenv", "mamba" (above 3.8) + // or other value depending on the plugins in use. + // If missing or the empty string, the tool will be automatically + // determined by looking for tools on the PATH environment + // variable. + "environment_type": "conda", + + // timeout in seconds for installing any dependencies in environment + // defaults to 10 min + //"install_timeout": 600, + + // the base URL to show a commit for the project. + "show_commit_url": "https://github.com/brainglobe/brainglobe-workflows/commit/", + + // The Pythons you'd like to test against. 
If not provided, defaults + // to the current version of Python used to run `asv`. + "pythons": ["3.10"], + + // The list of conda channel names to be searched for benchmark + // dependency packages in the specified order + "conda_channels": ["conda-forge", "defaults"], + + // A conda environment file that is used for environment creation. + // "conda_environment_file": "environment.yml", + + // The matrix of dependencies to test. Each key of the "req" + // requirements dictionary is the name of a package (in PyPI) and + // the values are version numbers. An empty list or empty string + // indicates to just test against the default (latest) + // version. null indicates that the package is not to be + // installed. If the package to be tested is only available from + // PyPI, and the 'environment_type' is conda, then you can prefix + // the package name with 'pip+', and the package will be installed + // via pip (with all the conda available packages installed first, + // followed by the pip installed packages). + // + // The ``env`` and ``env_nobuild`` keys contain the matrix of + // environment variables to pass to build and benchmark commands. + // An environment will be created for every combination of the + // cartesian product of the "env" variables in this matrix. + // Variables in "env_nobuild" will be passed to every environment + // during the benchmark phase, but will not trigger creation of + // new environments. A value of ``null`` means that the variable + // will not be set for the current combination. + // + // "matrix": { + // "req": { + // "numpy": ["1.6", "1.7"], + // "six": ["", null], // test with and without six installed + // "pip+emcee": [""] // emcee is only available for install with pip. + // }, + // "env": {"ENV_VAR_1": ["val1", "val2"]}, + // "env_nobuild": {"ENV_VAR_2": ["val3", null]}, + // }, + + // Combinations of libraries/python versions can be excluded/included + // from the set to test. 
Each entry is a dictionary containing additional + // key-value pairs to include/exclude. + // + // An exclude entry excludes entries where all values match. The + // values are regexps that should match the whole string. + // + // An include entry adds an environment. Only the packages listed + // are installed. The 'python' key is required. The exclude rules + // do not apply to includes. + // + // In addition to package names, the following keys are available: + // + // - python + // Python version, as in the *pythons* variable above. + // - environment_type + // Environment type, as above. + // - sys_platform + // Platform, as in sys.platform. Possible values for the common + // cases: 'linux', 'win32', 'cygwin', 'darwin'. + // - req + // Required packages + // - env + // Environment variables + // - env_nobuild + // Non-build environment variables + // + // "exclude": [ + // {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows + // {"environment_type": "conda", "req": {"six": null}}, // don't run without six on conda + // {"env": {"ENV_VAR_1": "val2"}}, // skip val2 for ENV_VAR_1 + // ], + // + // "include": [ + // // additional env for python2.7 + // {"python": "2.7", "req": {"numpy": "1.8"}, "env_nobuild": {"FOO": "123"}}, + // // additional env if run on windows+conda + // {"sys_platform": "win32", "environment_type": "conda", "python": "2.7", "req": {"libpython": ""}}, + // ], + + // The directory (relative to the current directory) that benchmarks are + // stored in. If not provided, defaults to "benchmarks" + // "benchmark_dir": "benchmarks", + + // The directory (relative to the current directory) to cache the Python + // environments in. If not provided, defaults to "env" + "env_dir": ".asv/env", + + // The directory (relative to the current directory) that raw benchmark + // results are stored in. If not provided, defaults to "results". 
+ "results_dir": "benchmarks/results", + + // The directory (relative to the current directory) that the html tree + // should be written to. If not provided, defaults to "html". + "html_dir": "benchmarks/html", + + // The number of characters to retain in the commit hashes. + // "hash_length": 8, + + // `asv` will cache results of the recent builds in each + // environment, making them faster to install next time. This is + // the number of builds to keep, per environment. + "build_cache_size": 2, + + // The commits after which the regression search in `asv publish` + // should start looking for regressions. Dictionary whose keys are + // regexps matching to benchmark names, and values corresponding to + // the commit (exclusive) after which to start looking for + // regressions. The default is to start from the first commit + // with results. If the commit is `null`, regression detection is + // skipped for the matching benchmark. + // + // "regressions_first_commits": { + // "some_benchmark": "352cdf", // Consider regressions only after this commit + // "another_benchmark": null, // Skip regression detection altogether + // }, + + // The thresholds for relative change in results, after which `asv + // publish` starts reporting regressions. Dictionary of the same + // form as in ``regressions_first_commits``, with values + // indicating the thresholds. If multiple entries match, the + // maximum is taken. If no entry matches, the default is 5%. 
+ // + // "regressions_thresholds": { + // "some_benchmark": 0.01, // Threshold of 1% + // "another_benchmark": 0.5, // Threshold of 50% + // }, +} diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/benchmarks/cellfinder/__init__.py b/benchmarks/cellfinder/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/benchmarks/cellfinder/workflows.py b/benchmarks/cellfinder/workflows.py new file mode 100644 index 00000000..9e7b6ea5 --- /dev/null +++ b/benchmarks/cellfinder/workflows.py @@ -0,0 +1,98 @@ +import shutil + +from brainglobe_utils.IO.cells import save_cells +from cellfinder_core.main import main as cellfinder_run +from cellfinder_core.tools.IO import read_with_dask + +from brainglobe_workflows.cellfinder.cellfinder_main import ( + Workflow, + workflow_from_cellfinder_run, +) + + +class TimeBenchmark: + """ + Base class with sensible options + See https://asv.readthedocs.io/en/stable/benchmarks.html#benchmark-attributes + + The sample_time, number, repeat, and timer attributes can be adjusted in + the setup() routine, which can be useful for parameterized benchmarks + + Other attributes for time benchmarks not specified in this class: + - number: the number of iterations in each sample. If number is specified, + sample_time is ignored. Note that setup and teardown are not run between + iterations: setup runs first, then the timed benchmark routine is called + number times, and after that teardown runs. 
+ - timer: timeit.default_timer by default + + Notes about some of the default attributes for time benchmarks: + - warmup_time: asv will spend this time (in seconds) in calling the + benchmarked function repeatedly, before starting to run the + actual benchmark + + - repeat: when not provided (repeat set to 0): + - if rounds==1 the default is + (min_repeat, max_repeat, max_time) = (1, 10, 20.0), + - if rounds != 1 the default is + (min_repeat, max_repeat, max_time) = (1, 5, 10.0) + + - sample_time: `number` is determined so that each sample takes + approx sample_time=10ms + """ + + timeout = 60 # default: 60 s + version = None # default: None (i.e. hash of source code) + warmup_time = 0.1 # default:0.1; + rounds = 2 # default:2 + repeat = 0 # default: 0 + sample_time = 0.01 # default: 10 ms = 0.01 s; + min_run_count = 2 # default:2 + + @classmethod + def setup(self): + cfg = Workflow() + cfg.setup_parameters() + cfg.setup_input_data() + self.cfg = cfg + + def teardown(self): + shutil.rmtree(self.cfg.install_path) + + +class TimeFullWorkflow(TimeBenchmark): + def time_workflow_from_cellfinder_run(self): + workflow_from_cellfinder_run(self.cfg) + + +class TimeReadInputDask(TimeBenchmark): + def time_read_signal_w_dask(self): + read_with_dask(self.cfg.signal_parent_dir) + + def time_read_background_w_dask(self): + read_with_dask(self.cfg.background_parent_dir) + + +class TimeCellfinderRun(TimeBenchmark): + def setup(self): + TimeBenchmark.setup() + self.signal_array = read_with_dask(self.cfg.signal_parent_dir) + self.background_array = read_with_dask(self.cfg.background_parent_dir) + + def time_cellfinder_run(self): + cellfinder_run( + self.signal_array, self.background_array, self.cfg.voxel_sizes + ) + + +class TimeSaveCells(TimeBenchmark): + def setup(self): + TimeBenchmark.setup() + signal_array = read_with_dask(self.cfg.signal_parent_dir) + background_array = read_with_dask(self.cfg.background_parent_dir) + + self.detected_cells = cellfinder_run( + signal_array, 
background_array, self.cfg.voxel_sizes + ) + + def time_save_cells(self): + save_cells(self.detected_cells, self.cfg.detected_cells_filepath) diff --git a/brainglobe_workflows/__init__.py b/brainglobe_workflows/__init__.py index 28709be9..00081a03 100644 --- a/brainglobe_workflows/__init__.py +++ b/brainglobe_workflows/__init__.py @@ -1,7 +1,7 @@ from importlib.metadata import PackageNotFoundError, version try: - __version__ = version("brainglobe-scripts") + __version__ = version("brainglobe-workflows") except PackageNotFoundError: # package is not installed pass diff --git a/brainglobe_workflows/cellfinder/__init__.py b/brainglobe_workflows/cellfinder/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/brainglobe_workflows/cellfinder/cellfinder_main.py b/brainglobe_workflows/cellfinder/cellfinder_main.py new file mode 100644 index 00000000..928b1786 --- /dev/null +++ b/brainglobe_workflows/cellfinder/cellfinder_main.py @@ -0,0 +1,153 @@ +from pathlib import Path + +import pooch +from brainglobe_utils.IO.cells import save_cells +from cellfinder_core.main import main as cellfinder_run +from cellfinder_core.tools.IO import read_with_dask + +# Input data URL and hash +DATA_URL = "https://gin.g-node.org/BrainGlobe/test-data/raw/master/cellfinder/cellfinder-test-data.zip" +DATA_HASH = "b0ef53b1530e4fa3128fcc0a752d0751909eab129d701f384fc0ea5f138c5914" + +# Local cached directories +CELLFINDER_CACHE_DIR = Path.home() / ".cellfinder_benchmarks" +INPUT_DATA_CACHE_DIR = CELLFINDER_CACHE_DIR / "cellfinder_test_data" +SIGNAL_DATA_PATH = INPUT_DATA_CACHE_DIR / "signal" +BACKGROUND_DATA_PATH = INPUT_DATA_CACHE_DIR / "background" +OUTPUT_DATA_CACHE_DIR = CELLFINDER_CACHE_DIR / "cellfinder_output" + + +class Workflow: + """ + Defines the cellfinder workflow built around running the + cellfinder_core.main.main() function. 
+ + It includes `setup` methods that encapsulate steps which are required + to run the workflow, but that we don't expect to benchmark + (such as defining processing parameters or downloading the test data). + """ + + def setup_parameters(self): + """ + Define input and output data locations and parameters for + preprocessing steps. + + Methods that start with `setup_` will in principle not be benchmarked. + """ + + # cellfinder benchmarks cache directory + self.install_path = CELLFINDER_CACHE_DIR + + # origin of data to download + self.data_url = DATA_URL + self.data_hash = DATA_HASH + + # cached subdirectory to save data to + self.local_path = INPUT_DATA_CACHE_DIR + self.output_path = OUTPUT_DATA_CACHE_DIR + self.output_path.mkdir(parents=True, exist_ok=True) + self.detected_cells_filepath = self.output_path / "detected_cells.xml" + + # preprocessing parameters + self.voxel_sizes = [5, 2, 2] # microns + self.start_plane = 0 + self.end_plane = -1 + self.trained_model = None # if None, it will use a default model + self.model_weights = None + self.model = "resnet50_tv" + self.batch_size = 32 + self.n_free_cpus = 2 + self.network_voxel_sizes = [5, 1, 1] + self.soma_diameter = 16 + self.ball_xy_size = 6 + self.ball_z_size = 15 + self.ball_overlap_fraction = 0.6 + self.log_sigma_size = 0.2 + self.n_sds_above_mean_thresh = 10 + self.soma_spread_factor = 1.4 + self.max_cluster_size = 100000 + self.cube_width = 50 + self.cube_height = 50 + self.cube_depth = 20 + self.network_depth = "50" + + def setup_input_data(self): + """ + Retrieve input data from GIN repository, and add relevant + parent directories and list of files as attributes of the + workflow class. + + Methods that start with `setup_` will in principle not be benchmarked. 
+ """ + + # retrieve data from GIN repository + list_files_archive = pooch.retrieve( + url=self.data_url, + known_hash=self.data_hash, + path=self.install_path, # path to download zip to + progressbar=True, + processor=pooch.Unzip( + extract_dir=self.local_path # path to unzipped dir + ), + ) + + # signal data: parent dir and list of files + self.signal_parent_dir = str(SIGNAL_DATA_PATH) + self.list_signal_files = [ + f + for f in list_files_archive + if f.startswith(self.signal_parent_dir) + ] + + # background data: parent dir and list of files + self.background_parent_dir = str(BACKGROUND_DATA_PATH) + self.list_background_files = [ + f + for f in list_files_archive + if f.startswith(self.background_parent_dir) + ] + + +def workflow_from_cellfinder_run(cfg): + """ + Run workflow based on the cellfinder_core.main.main() + function. + + The steps are: + 1. Read the input signal and background data as two separate + Dask arrays. + 2. Run the main cellfinder pipeline on the input Dask arrays, + with the parameters defined in the input configuration (cfg). + 3. Save the detected cells as an xml file to the location specified in + the input configuration (cfg). + + We plan to time each of the steps in the workflow individually, + as well as the full workflow. 
+ + Parameters + ---------- + cfg : Workflow + a class with the required setup methods and parameters for + the cellfinder workflow + """ + # Read input data as Dask arrays + signal_array = read_with_dask(cfg.signal_parent_dir) + background_array = read_with_dask(cfg.background_parent_dir) + + # Run main analysis using `cellfinder_run` + detected_cells = cellfinder_run( + signal_array, background_array, cfg.voxel_sizes + ) + + # Save results to xml file + save_cells(detected_cells, cfg.detected_cells_filepath) + + +if __name__ == "__main__": + # Run setup steps (these won't be timed) + cfg = Workflow() + cfg.setup_parameters() + cfg.setup_input_data() + + # Run full workflow (this will be timed) + workflow_from_cellfinder_run(cfg) diff --git a/pyproject.toml b/pyproject.toml index ec5c3257..698e6eb1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,11 @@ description = "A place to keep scripts to use as benchmarks, user-examples end-t readme = "README.md" requires-python = ">=3.8.0" dynamic = ["version"] +dependencies = [ + "pooch", + "cellfinder-core" +] + license = {text = "BSD-3-Clause"} @@ -51,11 +56,11 @@ build-backend = "setuptools.build_meta" include-package-data = true [tool.setuptools.packages.find] -include = ["brainglobe_scripts*"] +include = ["brainglobe_workflows*"] exclude = ["tests*"] [tool.pytest.ini_options] -addopts = "--cov=brainglobe_scripts" +addopts = "--cov=brainglobe_workflows" [tool.black] target-version = ['py38', 'py39', 'py310'] @@ -101,5 +106,5 @@ python = extras = dev commands = - pytest -v --color=yes --cov=brainglobe_scripts --cov-report=xml + pytest -v --color=yes --cov=brainglobe_workflows --cov-report=xml """