Skip to content
Merged
278 changes: 119 additions & 159 deletions notebooks/Segmentation_QC.ipynb

Large diffs are not rendered by default.

28 changes: 20 additions & 8 deletions src/celldega/pre/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -884,19 +884,18 @@ def add_custom_segmentation(

cbg_custom = pd.read_parquet(Path(path_segmentation_files) / "cell_by_gene_matrix.parquet")

# make sure all genes are present in cbg_custom
meta_gene = pd.read_parquet(Path(path_landscape_files) / "meta_gene.parquet")
missing_cols = meta_gene.index.difference(cbg_custom.columns)
for col in missing_cols:
cbg_custom[col] = 0

make_meta_gene(
cbg=cbg_custom,
path_output=Path(path_landscape_files)
/ f"meta_gene_{segmentation_parameters['segmentation_approach']}.parquet",
)

save_cbg_gene_parquets(
base_path=path_landscape_files,
cbg=cbg_custom,
verbose=True,
segmentation_approach=segmentation_parameters["segmentation_approach"],
)

make_meta_cell_image_coord(
technology=segmentation_parameters["technology"],
path_transformation_matrix=str(
Expand All @@ -912,13 +911,26 @@ def add_custom_segmentation(
image_scale=image_scale,
)

save_cbg_gene_parquets(
base_path=path_landscape_files,
cbg=cbg_custom,
verbose=True,
segmentation_approach=segmentation_parameters["segmentation_approach"],
)

create_cluster_and_meta_cluster(
technology=segmentation_parameters["technology"],
path_landscape_files=path_landscape_files,
segmentation_approach=segmentation_parameters["segmentation_approach"],
)

tree = ET.parse(Path(path_landscape_files) / "pyramid_images" / "bound.dzi")
# Get the first .dzi file in sorted order
dzi_files = sorted((Path(path_landscape_files) / "pyramid_images").glob("*.dzi"))
if not dzi_files:
raise FileNotFoundError("No .dzi files found in pyramid_images.")

# Use the first .dzi file
tree = ET.parse(dzi_files[0])
root = tree.getroot()
width = int(root[0].attrib["Width"])
height = int(root[0].attrib["Height"])
Expand Down
15 changes: 10 additions & 5 deletions src/celldega/pre/boundary_tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,8 @@ def get_cell_polygons(

cells_orig.set_index("cell_id", inplace=True)

cells_orig.rename(columns={"geometry": "geometry_micron"}, inplace=True)

elif technology == "Xenium":
xenium_cells = pd.read_parquet(path_cell_boundaries)
grouped = xenium_cells.groupby("cell_id")[["vertex_x", "vertex_y"]].agg(
Expand All @@ -261,17 +263,20 @@ def get_cell_polygons(
)
cells_orig = gpd.GeoDataFrame(grouped, geometry="geometry")[["geometry"]]

cells_orig.rename(columns={"geometry": "geometry_micron"}, inplace=True)

# Transform geometries
cells_orig["GEOMETRY"] = batch_transform_geometries(
cells_orig["geometry"], transformation_matrix, image_scale
cells_orig["geometry_micron"], transformation_matrix, image_scale
)

# Convert transformed geometries to polygons and calculate centroids
cells_orig["polygon"] = cells_orig["GEOMETRY"].apply(lambda x: Polygon(x[0]))
gdf_cells = gpd.GeoDataFrame(geometry=cells_orig["polygon"])
gdf_cells["GEOMETRY"] = cells_orig["GEOMETRY"]

return gdf_cells
# Specify the columns to include
columns_to_include = ["geometry_micron", "GEOMETRY"]

return gpd.GeoDataFrame(cells_orig[columns_to_include], geometry=cells_orig["polygon"])


def make_cell_boundary_tiles(
Expand Down Expand Up @@ -330,7 +335,7 @@ def make_cell_boundary_tiles(

# Convert string index to integer index
cell_str_to_int_mapping = _get_name_mapping(
path_output.replace("/cell_segmentation", ""), # get the path of landscape files
str(Path(path_output).parent), # get the path of landscape files
layer="boundary",
segmentation=path_output.split("cell_segmentation_")[
1
Expand Down
19 changes: 15 additions & 4 deletions src/celldega/pre/run_pre_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import argparse
from pathlib import Path
import shutil

import pandas as pd

Expand All @@ -26,16 +27,18 @@ def _create_directories(directories):

def create_dummy_clusters(path_landscape_files, cbg):
_create_directories([f"{path_landscape_files}/cell_clusters"])
meta_cluster = pd.DataFrame(index=["0"], columns=["color", "count"])
meta_cluster.loc["0", "color"] = "#1f77b4"
meta_cluster.loc["0", "count"] = 1000
meta_cluster.to_parquet(f"{path_landscape_files}/cell_clusters/meta_cluster.parquet")

inst_index = [str(x) for x in cbg.index.tolist()]
meta_cell = pd.DataFrame(index=inst_index)
meta_cell["cluster"] = "0"
meta_cell.index = meta_cell.index.astype(str)
meta_cell.to_parquet(f"{path_landscape_files}/cell_clusters/cluster.parquet")

meta_cluster = pd.DataFrame(index=["0"], columns=["color", "count"])
meta_cluster.loc["0", "color"] = "#1f77b4"
meta_cluster.loc["0", "count"] = len(meta_cell)
meta_cluster.to_parquet(f"{path_landscape_files}/cell_clusters/meta_cluster.parquet")


def _determine_technology(data_dir):
"""
Expand Down Expand Up @@ -144,12 +147,20 @@ def main(
# Setup file paths
paths = _setup_preprocessing_paths(technology, path_landscape_files, data_dir)

# Save transformation matrices

if technology == "Xenium":
# Unzip compressed files in Xenium data folder
dega.pre._xenium_unzipper(str(data_dir))
# Write transform file
dega.pre.write_xenium_transform(str(data_dir), path_landscape_files)

elif technology == "MERSCOPE":
source_path = Path(paths["transformation_matrix"])

# Copy the file to the destination directory, keeping the same filename
shutil.copy(source_path, Path(path_landscape_files) / "micron_to_image_transform.csv")

# Check required files for preprocessing
dega.pre._check_required_files(technology, str(data_dir))

Expand Down
114 changes: 74 additions & 40 deletions src/celldega/qc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@
import seaborn as sns
from shapely.geometry import MultiPolygon

from ..pre import write_xenium_transform
from ..pre.boundary_tile import get_cell_polygons
from ..pre.landscape import read_cbg_mtx


def get_largest_polygon(geometry): # TODO: underscore
def _get_largest_polygon(geometry):
"""
Get the largest polygon from a geometry.

Expand All @@ -27,7 +28,20 @@ def get_largest_polygon(geometry): # TODO: underscore
return geometry


def _load_segmentation_parameters(base_path):
def _write_default_seg_json(parameters_file, technology, dataset_name):
segmentation_parameters = {
"technology": technology,
"segmentation_approach": "default",
"dataset_name": dataset_name,
}

with parameters_file.open("w") as f:
json.dump(segmentation_parameters, f, indent=4)

return segmentation_parameters


def _load_segmentation_parameters(base_path, technology, dataset_name):
"""
Load segmentation parameters from JSON file.

Expand All @@ -40,8 +54,9 @@ def _load_segmentation_parameters(base_path):
parameters_file = Path(base_path) / "segmentation_parameters.json"

if not parameters_file.exists():
print("segmentation_parameters.json does not exist")
return None
print("segmentation_parameters.json does not exist, creating one...")

return _write_default_seg_json(parameters_file, technology, dataset_name)

try:
with parameters_file.open() as parameter_file:
Expand All @@ -68,7 +83,7 @@ def _process_custom_technology(base_path):
trx = pd.read_parquet(Path(base_path) / "transcripts.parquet")
trx_meta = trx[trx[cell_index] != -1][[transcript_index, cell_index, gene]]
cell_gdf = gpd.read_parquet(Path(base_path) / "cell_polygons.parquet")
cell_meta_gdf = gpd.read_parquet(Path(base_path) / "cell_metadata.parquet")
cell_meta_gdf = gpd.read_parquet(Path(base_path) / "cell_metadata_micron_space.parquet")

return cell_index, gene, transcript_index, trx_meta, cell_gdf, cell_meta_gdf

Expand All @@ -92,13 +107,17 @@ def _process_xenium_technology(base_path, segmentation_parameters):
trx = trx.rename(columns={"name": gene})
trx_meta = trx[trx[cell_index] != "UNASSIGNED"][[transcript_index, cell_index, gene]]

transformation_matrix = write_xenium_transform(base_path, base_path)

cell_gdf = get_cell_polygons(
technology=segmentation_parameters["technology"],
path_cell_boundaries=Path(base_path) / "cell_boundaries.parquet",
transformation_matrix=transformation_matrix,
)

cell_gdf = gpd.GeoDataFrame(geometry=cell_gdf["geometry"])
cell_gdf["geometry"] = cell_gdf["geometry"].apply(get_largest_polygon)
cell_gdf = gpd.GeoDataFrame(geometry=cell_gdf["geometry_micron"])

cell_gdf["geometry"] = cell_gdf["geometry"].apply(_get_largest_polygon)
cell_gdf.reset_index(inplace=True)
cell_gdf["area"] = cell_gdf["geometry"].area
cell_gdf["centroid"] = cell_gdf["geometry"].centroid
Expand All @@ -107,9 +126,7 @@ def _process_xenium_technology(base_path, segmentation_parameters):
return cell_index, gene, transcript_index, trx_meta, cell_gdf, cell_meta_gdf


def _process_merscope_technology(
base_path, segmentation_parameters, path_output, path_meta_cell_micron
):
def _process_merscope_technology(base_path, segmentation_parameters):
"""
Process data for MERSCOPE technology.

Expand All @@ -122,29 +139,35 @@ def _process_merscope_technology(
Returns:
- Tuple of (cell_index, gene, transcript_index, trx_meta, cell_gdf, cell_meta_gdf)
"""
cell_index = "EntityID"
cell_index = "cell_id"
gene = "gene"
transcript_index = "transcript_id"

# Define base paths
csv_path = Path(base_path) / "detected_transcripts.csv"
parquet_path = csv_path.with_suffix(".parquet")

path_meta_cell_micron = Path(base_path) / "cell_metadata.csv"

# Prefer Parquet if it exists
trx = pd.read_parquet(parquet_path) if parquet_path.exists() else pd.read_csv(csv_path)
trx = trx.rename(columns={"name": gene})
trx_meta = trx[trx[cell_index] != -1][[transcript_index, cell_index, gene]]

transformation_matrix = pd.read_csv(
Path(base_path) / "images/micron_to_mosaic_pixel_transform.csv", header=None, sep=" "
).values

cell_gdf = get_cell_polygons(
technology=segmentation_parameters["technology"],
path_cell_boundaries=Path(base_path) / "cell_boundaries.parquet",
path_output=path_output,
transformation_matrix=transformation_matrix,
path_meta_cell_micron=path_meta_cell_micron,
)

cell_gdf["geometry"] = cell_gdf["Geometry"].apply(get_largest_polygon)
cell_gdf.drop(["Geometry"], axis=1, inplace=True)
cell_gdf = gpd.GeoDataFrame(geometry=cell_gdf["Geometry"])
cell_gdf = gpd.GeoDataFrame(geometry=cell_gdf["geometry_micron"])

cell_gdf["geometry"] = cell_gdf["geometry"].apply(_get_largest_polygon)

cell_gdf.reset_index(inplace=True)
cell_gdf["area"] = cell_gdf["geometry"].area
Expand Down Expand Up @@ -223,7 +246,7 @@ def _save_qc_results(base_path, metrics, gene_specific_metrics_df, segmentation_
gene_specific_metrics_df.to_csv(Path(base_path) / "gene_specific_qc.csv")


def qc_segmentation(base_path, path_output=None, path_meta_cell_micron=None):
def qc_segmentation(base_path, technology=None, dataset_name=None):
"""
Calculate segmentation quality control (QC) metrics for imaging spatial transcriptomics data.

Expand All @@ -249,7 +272,7 @@ def qc_segmentation(base_path, path_output=None, path_meta_cell_micron=None):
-------
qc_segmentation(base_path="path/to/data")
"""
segmentation_parameters = _load_segmentation_parameters(base_path)
segmentation_parameters = _load_segmentation_parameters(base_path, technology, dataset_name)
if segmentation_parameters is None:
return

Expand All @@ -266,9 +289,7 @@ def qc_segmentation(base_path, path_output=None, path_meta_cell_micron=None):
trx = trx.rename(columns={"name": gene})
elif segmentation_parameters["technology"] == "MERSCOPE":
cell_index, gene, transcript_index, trx_meta, cell_gdf, cell_meta_gdf = (
_process_merscope_technology(
base_path, segmentation_parameters, path_output, path_meta_cell_micron
)
_process_merscope_technology(base_path, segmentation_parameters)
)

# Define base paths
Expand Down Expand Up @@ -396,7 +417,7 @@ def _load_cbg_data(base_paths):
)
elif segmentation_parameters["technology"] == "MERSCOPE":
cbg_dict[segmentation_parameters["segmentation_approach"]] = pd.read_csv(
Path(base_path) / "cell_by_gene_matrix.csv"
Path(base_path) / "cell_by_gene.csv"
)

return cbg_dict
Expand Down Expand Up @@ -472,25 +493,38 @@ def _create_barplot_visualization(
- cell_a_name: Name of cell type A
- cell_b_name: Name of cell type B
"""
orthogonal_data = pd.DataFrame(
{
"Technology": [i for i in cell_a_with_b_cell_specific_genes for _ in range(2)],
"Category": [
f"{cell_a_name} with {cell_b_name} genes",
f"{cell_b_name} with {cell_a_name} genes",
]
* 4,
"Count": [
gene
for pair in zip(
cell_a_with_b_cell_specific_genes.values(),
cell_b_with_a_cell_specific_genes.values(),
strict=False,
)
for gene in pair
],
}
)

# Step 1: Repeat each technology name twice (for each gene comparison)
technologies = []
for tech in cell_a_with_b_cell_specific_genes:
technologies.extend([tech, tech])

technology_series = pd.Series(technologies, name="Technology")

# Step 2: Create corresponding category labels (A with B genes, B with A genes)
category_labels = []
for _ in cell_a_with_b_cell_specific_genes:
category_labels.extend(
[f"{cell_a_name} with {cell_b_name} genes", f"{cell_b_name} with {cell_a_name} genes"]
)

category_series = pd.Series(category_labels, name="Category")

# Step 3: Extract counts from both dictionaries and interleave them
a_counts = cell_a_with_b_cell_specific_genes.values()
b_counts = cell_b_with_a_cell_specific_genes.values()

count_values = []

for a_count, b_count in zip(a_counts, b_counts, strict=False):
# Step 3: Add both counts to the list
count_values.append(a_count)
count_values.append(b_count)

count_series = pd.Series(count_values, name="Count")

# Step 4: Combine into a single DataFrame
orthogonal_data = pd.concat([technology_series, category_series, count_series], axis=1)

fig, ax = plt.subplots(figsize=(10, 6))

Expand Down
Loading