Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
53 commits
Select commit Hold shift + click to select a range
8bfa73b
cellfinder workflow first draft
sfmig Sep 18, 2023
c317ec8
test data
sfmig Sep 18, 2023
b64d518
rename
sfmig Sep 18, 2023
3925755
Merge branch 'main' into smg/cellfinder-workflow
sfmig Sep 18, 2023
027d675
delete test data from repo
sfmig Sep 20, 2023
23f3a12
rename init
sfmig Sep 20, 2023
10cac14
rename and reduce workflow to main version
sfmig Sep 20, 2023
0a34c72
exclude notebooks from sdist
sfmig Sep 20, 2023
adfc179
precommit fixes
sfmig Sep 20, 2023
cf1ed9d
change brainglobe_scripts references in pyproject.toml
sfmig Sep 21, 2023
113c668
add config class and default config. Combine setup steps into one set…
sfmig Oct 2, 2023
fdedb44
set signal and parent dirs as strings (for read with dask). set list …
sfmig Oct 2, 2023
c9bc5de
add logging info
sfmig Oct 2, 2023
cab653b
add smoke tests for running the workflow with different options for t…
sfmig Oct 2, 2023
8b9c169
add smoketests for setup
sfmig Oct 2, 2023
9ab1e34
add logging message for fetching from GIN
sfmig Oct 2, 2023
39754ea
remove class for tests
sfmig Oct 2, 2023
4959cb6
refactor the input data retrieval
sfmig Oct 2, 2023
e4d7ced
add pooch as dependency
sfmig Oct 2, 2023
07f6772
add cellfinder-core dependency
sfmig Oct 2, 2023
0e5054b
fix typo in Pathlike
sfmig Oct 2, 2023
bece87e
replace caplog level setting with context manager option
sfmig Oct 2, 2023
b3c5d5e
specify logger to try and fix pytest caplog error
sfmig Oct 2, 2023
08c0e1b
cellfinder workflow first draft
sfmig Sep 18, 2023
852ee04
test data
sfmig Sep 18, 2023
25883ef
rename
sfmig Sep 18, 2023
7b97560
delete test data from repo
sfmig Sep 20, 2023
c87f9f2
rename init
sfmig Sep 20, 2023
61bbbeb
rename and reduce workflow to main version
sfmig Sep 20, 2023
aedb87d
exclude notebooks from sdist
sfmig Sep 20, 2023
7b6e882
precommit fixes
sfmig Sep 20, 2023
f1836ff
change brainglobe_scripts references in pyproject.toml
sfmig Sep 21, 2023
16e6312
add config class and default config. Combine setup steps into one set…
sfmig Oct 2, 2023
4de8497
set signal and parent dirs as strings (for read with dask). set list …
sfmig Oct 2, 2023
5844ebf
add logging info
sfmig Oct 2, 2023
9192df3
add logging message for fetching from GIN
sfmig Oct 2, 2023
59d9ca0
refactor the input data retrieval
sfmig Oct 2, 2023
06ec524
add pooch as dependency
sfmig Oct 2, 2023
c85c2d5
add cellfinder-core dependency
sfmig Oct 2, 2023
8e13570
fix typo in Pathlike
sfmig Oct 2, 2023
02b2a84
teardown for env var fixture
sfmig Oct 3, 2023
9d0c6c2
add logger_str as fixture. changed prep_json to check if object is Pa…
sfmig Oct 4, 2023
ff3532e
remove logger.propagate setting to True (it's the default value)
sfmig Oct 4, 2023
c8f22ff
remove some comments and TODOs
sfmig Oct 4, 2023
446d959
timestamp output directory
sfmig Oct 4, 2023
a4bcd51
make default config explicitly a module-level constant. separate defa…
sfmig Oct 4, 2023
ac14d84
remove option to pass a config dict to setup_workflow
sfmig Oct 4, 2023
607dc0a
add description of the script
sfmig Oct 4, 2023
bb66598
remove some stray comments
sfmig Oct 4, 2023
abd9e4c
Merge branch 'smg/cellfinder-workflow-only' into smg/cellfinder-workf…
sfmig Oct 4, 2023
5555fba
fix output_path with timestamp error. added function to generate conf…
sfmig Oct 4, 2023
654ff31
add check for output directory to tests
sfmig Oct 4, 2023
3af3b8a
rename json factory function
sfmig Oct 5, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ include README.md
exclude .pre-commit-config.yaml

recursive-include brainglobe_workflows *.py
recursive-exclude brainglobe_workflows/cellfinder/notebooks *.py

recursive-exclude * __pycache__
recursive-exclude * *.py[co]
Expand Down
2 changes: 1 addition & 1 deletion brainglobe_workflows/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from importlib.metadata import PackageNotFoundError, version

try:
__version__ = version("brainglobe-scripts")
__version__ = version("brainglobe-workflows")
except PackageNotFoundError:
# package is not installed
pass
Empty file.
343 changes: 343 additions & 0 deletions brainglobe_workflows/cellfinder/cellfinder_main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,343 @@
"""A script reproducing the main cellfinder workflow

It checks whether an environment variable called "CELLFINDER_CONFIG_PATH"
exists, pointing to a json file with the required parameters. If the
environment variable does not exist, the default configuration parameters
(generated by make_default_config_dict below) are used.

"""


import datetime
import json
import logging
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Tuple, Union

import pooch
from brainglobe_utils.IO.cells import save_cells
from cellfinder_core.main import main as cellfinder_run
from cellfinder_core.tools.IO import read_with_dask
from cellfinder_core.train.train_yml import depth_type

# Type alias for values that may be given either as a string or as a
# path-like object
Pathlike = Union[str, os.PathLike]

# Module-level logger, named after this module via __name__
# (when the file is run as a script, __name__ is "__main__")
logger = logging.getLogger(__name__)

# Default cache directory, located under the user's home directory.
# It is the default value of setup_workflow's cellfinder_cache_dir argument.
CELLFINDER_CACHE_DIR = Path.home() / ".cellfinder_workflows"


def make_default_config_dict(cellfinder_cache_dir):
    """Generate a config dictionary with the required parameters
    for the workflow

    All the paths in the returned dictionary are built from
    cellfinder_cache_dir:
    - "install_path" is cellfinder_cache_dir itself,
    - "local_path" is its "cellfinder_test_data" subdirectory, which
      contains the "signal" and "background" parent directories,
    - "output_path_basename" is the prefix "cellfinder_output_" under
      cellfinder_cache_dir.

    The dictionary also holds the URL and hash of the test data archive
    on GIN, and the default cellfinder parameters.

    This function only builds the dictionary: it does not download any
    data and does not create any directory.

    Parameters
    ----------
    cellfinder_cache_dir : str or os.PathLike
        directory under which the input data and the output of the
        workflow are located

    Returns
    -------
    dict
        dictionary with the required parameters for the workflow
    """
    # Coerce to Path so that a plain string can also be joined with "/"
    cache_dir = Path(cellfinder_cache_dir)
    test_data_dir = cache_dir / "cellfinder_test_data"

    return {
        "install_path": cache_dir,
        "data_url": "https://gin.g-node.org/BrainGlobe/test-data/raw/master/cellfinder/cellfinder-test-data.zip",
        "data_hash": (
            "b0ef53b1530e4fa3128fcc0a752d0751909eab129d701f384fc0ea5f138c5914"
        ),
        "local_path": test_data_dir,
        # kept as strings, unlike the other paths in this dictionary
        "signal_parent_dir": str(test_data_dir / "signal"),
        "background_parent_dir": str(test_data_dir / "background"),
        "output_path_basename": cache_dir / "cellfinder_output_",
        "detected_cells_filename": "detected_cells.xml",
        "voxel_sizes": [5, 2, 2],  # microns
        "start_plane": 0,
        "end_plane": -1,
        "trained_model": None,  # if None, it will use a default model
        "model_weights": None,
        "model": "resnet50_tv",
        "batch_size": 32,
        "n_free_cpus": 2,
        "network_voxel_sizes": [5, 1, 1],
        "soma_diameter": 16,
        "ball_xy_size": 6,
        "ball_z_size": 15,
        "ball_overlap_fraction": 0.6,
        "log_sigma_size": 0.2,
        "n_sds_above_mean_thresh": 10,
        "soma_spread_factor": 1.4,
        "max_cluster_size": 100000,
        "cube_width": 50,
        "cube_height": 50,
        "cube_depth": 20,
        "network_depth": "50",
    }


@dataclass
class CellfinderConfig:
    """
    Configuration for the cellfinder workflow.

    Holds the input and output data locations, and the parameters for
    the cellfinder run. All fields without a default value must be
    provided on construction. The last three fields default to None and
    are filled in while the workflow is being set up.
    """

    # cellfinder benchmarks cache directory
    # (the data archive is downloaded to this location)
    install_path: Pathlike

    # origin of data to download (if required):
    # URL of the archive and its expected hash
    data_url: Optional[str]
    data_hash: Optional[str]

    # cached subdirectory the downloaded archive is unzipped to
    local_path: Pathlike
    # directories holding the signal and background input data
    signal_parent_dir: str
    background_parent_dir: str
    # prefix of the output directory; a timestamp is appended to it
    output_path_basename: Pathlike
    # name of the file with the detected cells, saved under output_path
    detected_cells_filename: Pathlike

    # preprocessing parameters
    voxel_sizes: Tuple[float, float, float]
    start_plane: int
    end_plane: int
    trained_model: Optional[
        os.PathLike
    ]  # if None, it will use a default model
    model_weights: Optional[os.PathLike]
    model: str
    batch_size: int
    n_free_cpus: int
    network_voxel_sizes: Tuple[int, int, int]
    soma_diameter: int
    ball_xy_size: int
    ball_z_size: int
    ball_overlap_fraction: float
    log_sigma_size: float
    n_sds_above_mean_thresh: int
    soma_spread_factor: float
    max_cluster_size: int
    cube_width: int
    cube_height: int
    cube_depth: int
    network_depth: depth_type

    # lists of signal and background input files
    # (None until the input data is retrieved)
    list_signal_files: Optional[list] = None
    list_background_files: Optional[list] = None
    # timestamped output directory (None until the workflow is set up)
    output_path: Optional[Pathlike] = None


def example_cellfinder_script():
    """Set up the workflow configuration and run the workflow with it."""
    run_workflow_from_cellfinder_run(setup_workflow())


def run_workflow_from_cellfinder_run(cfg):
    """
    Run the workflow built around cellfinder_core.main.main()
    (imported here as cellfinder_run).

    The workflow consists of three steps:
    1. The signal and the background input data are each read with
       read_with_dask, from cfg.signal_parent_dir and
       cfg.background_parent_dir respectively.
    2. The main cellfinder pipeline is run on the two resulting arrays.
    3. The detected cells are saved to the file
       cfg.output_path / cfg.detected_cells_filename.

    The intention is to time each of these steps individually, as well
    as the full workflow.

    NOTE(review): only cfg.voxel_sizes is forwarded to cellfinder_run;
    the remaining parameters in cfg are not passed to it here.

    Parameters
    ----------
    cfg : CellfinderConfig
        a class with the required setup methods and parameters for
        the cellfinder workflow
    """
    # Step 1: read the two input channels
    signal = read_with_dask(cfg.signal_parent_dir)
    background = read_with_dask(cfg.background_parent_dir)

    # Step 2: run the main cellfinder analysis
    cells = cellfinder_run(signal, background, cfg.voxel_sizes)

    # Step 3: write the detected cells to the output file
    cells_file = cfg.output_path / cfg.detected_cells_filename
    save_cells(cells, cells_file)


def setup_workflow(cellfinder_cache_dir=CELLFINDER_CACHE_DIR):
    """Prepare configuration to run workflow

    This includes
    - instantiating the config,
    - checking if the input data exists locally, and fetching from
      GIN repository otherwise,
    - creating a timestamped directory for the output of the workflow if
      it doesn't exist and adding it to the config

    To instantiate the config, we first check if an environment
    variable "CELLFINDER_CONFIG_PATH" pointing to a config json file exists.
    If not, the default config (generated by make_default_config_dict
    for the given cellfinder_cache_dir) is used.

    Parameters
    ----------
    cellfinder_cache_dir : Path, optional
        directory from which the default configuration is built. It is
        only used if the environment variable "CELLFINDER_CONFIG_PATH"
        is not defined. By default, CELLFINDER_CACHE_DIR.

    Returns
    -------
    config : CellfinderConfig
        a class with the required setup methods and parameters for
        the cellfinder workflow
    """

    # Define config
    # if environment variable defined, that prevails
    if "CELLFINDER_CONFIG_PATH" in os.environ:
        input_config_path = Path(os.environ["CELLFINDER_CONFIG_PATH"])
        assert (
            input_config_path.exists()
        ), f"Config file {input_config_path} does not exist"

        # read config into dict
        # (assumes config is json serializable)
        with open(input_config_path) as config_file:
            config_dict = json.load(config_file)

        config = CellfinderConfig(**config_dict)

        logger.info(
            "Configuration retrieved from "
            f'{os.environ["CELLFINDER_CONFIG_PATH"]}'
        )
    # else use the default config, with the cellfinder cache directory provided
    else:
        config = CellfinderConfig(
            **make_default_config_dict(cellfinder_cache_dir)
        )
        logger.info("Using default configuration")

    # Retrieve and add lists of input data to config, unless both the
    # signal and the background lists are already defined.
    # Both lists must be checked: if only one of them is defined, the
    # input data still needs to be retrieved.
    if not (config.list_signal_files and config.list_background_files):
        config = retrieve_input_data(config)

    # Create output directory if it doesn't exist, timestamped
    # (the timestamp is appended to the basename, not added as a subdirectory)
    timestamp_formatted = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path_timestamped = Path(
        str(config.output_path_basename) + timestamp_formatted
    )
    output_path_timestamped.mkdir(parents=True, exist_ok=True)
    # add to config
    config.output_path = output_path_timestamped

    return config


def retrieve_input_data(config):
    """
    Fill in the lists of signal and background input files of the config.

    Which files are listed depends on what is found locally:
    - if the signal and the background directories both exist, the files
      they contain are listed in the relevant config attributes;
    - if only one of the two directories exists, an error message naming
      the missing one is logged, and the config is returned as it is;
    - if none of the two exists, the data is fetched from the GIN
      repository specified in the config. If the URL or the hash are not
      provided, an error message is logged and the config is returned
      as it is.

    Parameters
    ----------
    config : CellfinderConfig
        a dataclass whose attributes are the parameters
        for running cellfinder.

    Returns
    -------
    config : CellfinderConfig
        a dataclass whose attributes are the parameters
        for running cellfinder.
    """
    signal_dir = Path(config.signal_parent_dir)
    background_dir = Path(config.background_parent_dir)
    has_signal = signal_dir.exists()
    has_background = background_dir.exists()

    # Case 1: both directories are available locally
    if has_signal and has_background:
        logger.info("Fetching input data from the local directories")

        config.list_signal_files = [
            entry for entry in signal_dir.iterdir() if entry.is_file()
        ]
        config.list_background_files = [
            entry for entry in background_dir.iterdir() if entry.is_file()
        ]
        return config

    # Case 2: exactly one directory is missing, report which one
    if has_signal or has_background:
        if has_signal:
            logger.error(
                f"The directory {config.background_parent_dir} does not exist"
            )
        else:
            logger.error(
                f"The directory {config.signal_parent_dir} does not exist"
            )
        return config

    # Case 3: no local data, so both the URL and the hash are needed
    if not (config.data_url and config.data_hash):
        logger.error(
            "Input data not found locally, and URL/hash to "
            "GIN repository not provided"
        )
        return config

    # Download the archive and unzip it; pooch returns the list of files
    # extracted from the archive
    unzip_to_local_path = pooch.Unzip(
        extract_dir=config.local_path  # path to unzipped dir
    )
    archive_files = pooch.retrieve(
        url=config.data_url,
        known_hash=config.data_hash,
        path=config.install_path,  # path to download zip to
        progressbar=True,
        processor=unzip_to_local_path,
    )
    logger.info("Fetching input data from the provided GIN repository")

    # the two parent directories are expected to exist after unzipping
    assert Path(config.signal_parent_dir).exists()
    assert Path(config.background_parent_dir).exists()

    # split the extracted files by the parent directory they start with
    config.list_signal_files = [
        filepath
        for filepath in archive_files
        if filepath.startswith(config.signal_parent_dir)
    ]
    config.list_background_files = [
        filepath
        for filepath in archive_files
        if filepath.startswith(config.background_parent_dir)
    ]

    return config


# Run the example workflow only when this file is executed as a script
# (not when it is imported as a module)
if __name__ == "__main__":
    example_cellfinder_script()
11 changes: 7 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@ description = "A place to keep scripts to use as benchmarks, user-examples end-t
readme = "README.md"
requires-python = ">=3.8.0"
dynamic = ["version"]

dependencies = [
"pooch",
"cellfinder-core"
]
license = {text = "BSD-3-Clause"}

classifiers = [
Expand Down Expand Up @@ -51,11 +54,11 @@ build-backend = "setuptools.build_meta"
include-package-data = true

[tool.setuptools.packages.find]
include = ["brainglobe_scripts*"]
include = ["brainglobe_workflows*"]
exclude = ["tests*"]

[tool.pytest.ini_options]
addopts = "--cov=brainglobe_scripts"
addopts = "--cov=brainglobe_workflows"

[tool.black]
target-version = ['py38', 'py39', 'py310']
Expand Down Expand Up @@ -101,5 +104,5 @@ python =
extras =
dev
commands =
pytest -v --color=yes --cov=brainglobe_scripts --cov-report=xml
pytest -v --color=yes --cov=brainglobe_workflows --cov-report=xml
"""
Loading