Add BitSeg NucSeg dataset (#398)

constantinpape · Oct 29, 2024 · 2dd4965 · 2dd4965
1 parent e296039
commit 2dd4965
Show file tree

Hide file tree

Showing 4 changed files with 180 additions and 0 deletions.
diff --git a/scripts/datasets/light_microscopy/check_bitdepth_nucseg.py b/scripts/datasets/light_microscopy/check_bitdepth_nucseg.py
@@ -0,0 +1,24 @@
+import os
+import sys
+
+from torch_em.util.debug import check_loader
+from torch_em.data.datasets import get_bitdepth_nucseg_loader
+
+sys.path.append("..")
+
+
+def check_bitdepth_nucseg():
+    from util import ROOT
+
+    loader = get_bitdepth_nucseg_loader(
+        path=os.path.join(ROOT, "bitdepth_nucseg"),
+        patch_shape=(512, 512),
+        batch_size=1,
+        download=True,
+    )
+
+    check_loader(loader, 8, instance_labels=True)
+
+
+if __name__ == "__main__":
+    check_bitdepth_nucseg()
diff --git a/torch_em/data/datasets/light_microscopy/__init__.py b/torch_em/data/datasets/light_microscopy/__init__.py
@@ -1,4 +1,5 @@
 from .arvidsson import get_arvidsson_loader, get_arvidsson_dataset
+from .bitdepth_nucseg import get_bitdepth_nucseg_loader, get_bitdepth_nucseg_dataset
 from .cartocell import get_cartocell_loader, get_cartocell_dataset
 from .cellpose import get_cellpose_loader, get_cellpose_dataset
 from .cellseg_3d import get_cellseg_3d_loader, get_cellseg_3d_dataset

diff --git a/torch_em/data/datasets/light_microscopy/arvidsson.py b/torch_em/data/datasets/light_microscopy/arvidsson.py
@@ -163,6 +163,7 @@ def get_arvidsson_loader(
 
     Args:
         path: Filepath to a folder where the downloaded data will be saved.
+        batch_size: The batch size for training.
         patch_shape: The patch shape to use for training.
         split: The data split to use. Either 'train', 'val' or 'test'.
         download: Whether to download the data if it is not present.

diff --git a/torch_em/data/datasets/light_microscopy/bitdepth_nucseg.py b/torch_em/data/datasets/light_microscopy/bitdepth_nucseg.py
@@ -0,0 +1,154 @@
+"""The BitDepth NucSeg dataset contains annotations for nucleus segmentation
+in DAPI stained fluorescence microscopy images.
+
+The dataset is located at https://github.com/masih4/BitDepth_NucSeg/
+This dataset is from the publication https://doi.org/10.3390/diagnostics11060967.
+Please cite it if you use this dataset in your research.
+"""
+
+import os
+import shutil
+import subprocess
+from glob import glob
+from natsort import natsorted
+from typing import Union, Tuple, Optional, Literal, List
+
+from torch.utils.data import Dataset, DataLoader
+
+import torch_em
+
+from .. import util
+
+
+URL = "https://github.com/masih4/BitDepth_NucSeg"
+
+
+def _remove_other_files(path):
+    "Remove other files from the git repository"
+    all_files = glob(os.path.join(path, "*"))
+    all_files.extend(glob(os.path.join(path, ".*")))
+    for _file in all_files:
+        if os.path.basename(_file) == "data":
+            continue
+
+        if os.path.isdir(_file):
+            shutil.rmtree(_file)
+        else:
+            os.remove(_file)
+
+
+def get_bitdepth_nucseg_data(path: Union[os.PathLike, str], download: bool = False) -> str:
+    """Download the BitDepth NucSeg dataset for nucleus segmentation.
+
+    Args:
+        path: Filepath to a folder where the downloaded data will be saved.
+        download: Whether to download the data if it is not present.
+
+    Returns:
+        The filepath to the downloaded data.
+    """
+    data_dir = os.path.join(path, "data")
+    if os.path.exists(data_dir):
+        return data_dir
+
+    if not download:
+        raise ValueError("The data directory is not found and download is set to False.")
+
+    # The data is located in a GitHub repository as a zipfile.
+    subprocess.run(["git", "clone", URL, path])
+    # Remove all git files besides the zipfile
+    _remove_other_files(path)
+
+    zip_path = os.path.join(path, "data", "data.zip")
+    util.unzip(zip_path=zip_path, dst=data_dir)
+
+    return data_dir
+
+
+def get_bitdepth_nucseg_paths(
+    path: Union[os.PathLike, str],
+    magnification: Optional[Literal['20x', '40x_air', '40x_oil' '63x_oil']] = None,
+    download: bool = False
+) -> Tuple[List[str], List[str]]:
+    """Get paths to the BitDepth NucSeg data.
+
+    Args:
+        path: Filepath to a folder where the downloaded data will be saved.
+        magnification: The magnification scale for the input images.
+        download: Whether to download the data if it is not present.
+
+    Returns:
+        List of filepaths for the image data.
+        List of filepaths for the label data.
+    """
+    data_dir = get_bitdepth_nucseg_data(path, download)
+
+    if magnification is None:
+        magnification = "*"
+    else:
+        if magnification.find("_") != -1:
+            _splits = magnification.split("_")
+            magnification = f"{_splits[0]} {_splits[1]}"
+
+    raw_paths = natsorted(glob(os.path.join(data_dir, magnification, "images_16bit", "*.tif")))
+    label_paths = natsorted(glob(os.path.join(data_dir, magnification, "label masks", "*.tif")))
+
+    return raw_paths, label_paths
+
+
+def get_bitdepth_nucseg_dataset(
+    path: Union[os.PathLike, str],
+    patch_shape: Tuple[int, int],
+    magnification: Optional[Literal['20x', '40x_air', '40x_oil' '63x_oil']] = None,
+    download: bool = False,
+    **kwargs
+) -> Dataset:
+    """Get the BitDepth NucSeg dataset for nucleus segmentation.
+
+    Args:
+        path: Filepath to a folder where the downloaded data will be saved.
+        patch_shape: The patch shape to use for training.
+        magnification: The magnification scale for the input images.
+        download: Whether to download the data if it is not present.
+        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
+
+    Returns:
+        The segmentation dataset.
+    """
+    raw_paths, label_paths = get_bitdepth_nucseg_paths(path, magnification, download)
+
+    return torch_em.default_segmentation_dataset(
+        raw_paths=raw_paths,
+        raw_key=None,
+        label_paths=label_paths,
+        label_key=None,
+        is_seg_dataset=False,
+        patch_shape=patch_shape,
+        **kwargs
+    )
+
+
+def get_bitdepth_nucseg_loader(
+    path: Union[os.PathLike, str],
+    batch_size: int,
+    patch_shape: Tuple[int, int],
+    magnification: Optional[Literal['20x', '40x_air', '40x_oil' '63x_oil']] = None,
+    download: bool = False,
+    **kwargs
+) -> DataLoader:
+    """Get the BitDepth NucSeg dataloader for nucleus segmentation.
+
+    Args:
+        path: Filepath to a folder where the downloaded data will be saved.
+        batch_size: The batch size for training.
+        patch_shape: The patch shape to use for training.
+        magnification: The magnification scale for the input images.
+        download: Whether to download the data if it is not present.
+        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.
+
+    Returns:
+        The DataLoader.
+    """
+    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
+    dataset = get_bitdepth_nucseg_dataset(path, patch_shape, magnification, download, **ds_kwargs)
+    return torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)