Skip to content
Merged
252 changes: 100 additions & 152 deletions notebooks/Segmentation_QC.ipynb

Large diffs are not rendered by default.

28 changes: 20 additions & 8 deletions src/celldega/pre/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -884,19 +884,18 @@ def add_custom_segmentation(

cbg_custom = pd.read_parquet(Path(path_segmentation_files) / "cell_by_gene_matrix.parquet")

# make sure all genes are present in cbg_custom
meta_gene = pd.read_parquet(Path(path_landscape_files) / f"meta_gene.parquet")
missing_cols = meta_gene.index.difference(cbg_custom.columns)
for col in missing_cols:
cbg_custom[col] = 0

make_meta_gene(
cbg=cbg_custom,
path_output=Path(path_landscape_files)
/ f"meta_gene_{segmentation_parameters['segmentation_approach']}.parquet",
)

save_cbg_gene_parquets(
base_path=path_landscape_files,
cbg=cbg_custom,
verbose=True,
segmentation_approach=segmentation_parameters["segmentation_approach"],
)

make_meta_cell_image_coord(
technology=segmentation_parameters["technology"],
path_transformation_matrix=str(
Expand All @@ -912,13 +911,26 @@ def add_custom_segmentation(
image_scale=image_scale,
)

save_cbg_gene_parquets(
base_path=path_landscape_files,
cbg=cbg_custom,
verbose=True,
segmentation_approach=segmentation_parameters["segmentation_approach"],
)

create_cluster_and_meta_cluster(
technology=segmentation_parameters["technology"],
path_landscape_files=path_landscape_files,
segmentation_approach=segmentation_parameters["segmentation_approach"],
)

tree = ET.parse(Path(path_landscape_files) / "pyramid_images" / "bound.dzi")
# Get the first .dzi file in sorted order
dzi_files = sorted((Path(path_landscape_files) / "pyramid_images").glob("*.dzi"))
if not dzi_files:
raise FileNotFoundError("No .dzi files found in pyramid_images.")

# Use the first .dzi file
tree = ET.parse(dzi_files[0])
root = tree.getroot()
width = int(root[0].attrib["Width"])
height = int(root[0].attrib["Height"])
Expand Down
16 changes: 11 additions & 5 deletions src/celldega/pre/boundary_tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,8 @@ def get_cell_polygons(

cells_orig.set_index("cell_id", inplace=True)

cells_orig.rename(columns={"geometry": "geometry_micron"}, inplace=True)

elif technology == "Xenium":
xenium_cells = pd.read_parquet(path_cell_boundaries)
grouped = xenium_cells.groupby("cell_id")[["vertex_x", "vertex_y"]].agg(
Expand All @@ -261,18 +263,22 @@ def get_cell_polygons(
)
cells_orig = gpd.GeoDataFrame(grouped, geometry="geometry")[["geometry"]]

cells_orig.rename(columns={"geometry": "geometry_micron"}, inplace=True)

# Transform geometries
cells_orig["GEOMETRY"] = batch_transform_geometries(
cells_orig["geometry"], transformation_matrix, image_scale
cells_orig["geometry_micron"], transformation_matrix, 1
)

# Convert transformed geometries to polygons and calculate centroids
cells_orig["polygon"] = cells_orig["GEOMETRY"].apply(lambda x: Polygon(x[0]))
gdf_cells = gpd.GeoDataFrame(geometry=cells_orig["polygon"])
gdf_cells["GEOMETRY"] = cells_orig["GEOMETRY"]

return gdf_cells
# Specify the columns to include
columns_to_include = ["geometry_micron", "GEOMETRY"]

gdf_cells = gpd.GeoDataFrame(cells_orig[columns_to_include], geometry=cells_orig["polygon"])

return gdf_cells

def make_cell_boundary_tiles(
technology,
Expand Down Expand Up @@ -330,7 +336,7 @@ def make_cell_boundary_tiles(

# Convert string index to integer index
cell_str_to_int_mapping = _get_name_mapping(
path_output.replace("/cell_segmentation", ""), # get the path of landscape files
path_output.rsplit("/", 1)[0], # get the path of landscape files
layer="boundary",
segmentation=path_output.split("cell_segmentation_")[
1
Expand Down
10 changes: 10 additions & 0 deletions src/celldega/pre/run_pre_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from pathlib import Path

import pandas as pd
import shutil

import celldega as dega

Expand Down Expand Up @@ -144,12 +145,21 @@ def main(
# Setup file paths
paths = _setup_preprocessing_paths(technology, path_landscape_files, data_dir)

# Save transformation matrices

if technology == "Xenium":
# Unzip compressed files in Xenium data folder
dega.pre._xenium_unzipper(str(data_dir))
# Write transform file
dega.pre.write_xenium_transform(str(data_dir), path_landscape_files)

elif technology == "MERSCOPE":

source_path = Path(paths["transformation_matrix"])

# Copy the file to the destination directory, keeping the same filename
shutil.copy(source_path, Path(path_landscape_files) / "micron_to_image_transform.csv")

# Check required files for preprocessing
dega.pre._check_required_files(technology, str(data_dir))

Expand Down
108 changes: 80 additions & 28 deletions src/celldega/qc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def _process_custom_technology(base_path):
trx = pd.read_parquet(Path(base_path) / "transcripts.parquet")
trx_meta = trx[trx[cell_index] != -1][[transcript_index, cell_index, gene]]
cell_gdf = gpd.read_parquet(Path(base_path) / "cell_polygons.parquet")
cell_meta_gdf = gpd.read_parquet(Path(base_path) / "cell_metadata.parquet")
cell_meta_gdf = gpd.read_parquet(Path(base_path) / "cell_metadata_micron_space.parquet")

return cell_index, gene, transcript_index, trx_meta, cell_gdf, cell_meta_gdf

Expand All @@ -92,12 +92,15 @@ def _process_xenium_technology(base_path, segmentation_parameters):
trx = trx.rename(columns={"name": gene})
trx_meta = trx[trx[cell_index] != "UNASSIGNED"][[transcript_index, cell_index, gene]]

transformation_matrix = pd.read_csv(Path(base_path) / "micron_to_image_transform.csv", header=None, sep=" ").values
cell_gdf = get_cell_polygons(
technology=segmentation_parameters["technology"],
path_cell_boundaries=Path(base_path) / "cell_boundaries.parquet",
transformation_matrix=transformation_matrix
)

cell_gdf = gpd.GeoDataFrame(geometry=cell_gdf["geometry"])
cell_gdf = gpd.GeoDataFrame(geometry=cell_gdf["geometry_micron"])

cell_gdf["geometry"] = cell_gdf["geometry"].apply(get_largest_polygon)
cell_gdf.reset_index(inplace=True)
cell_gdf["area"] = cell_gdf["geometry"].area
Expand All @@ -108,7 +111,7 @@ def _process_xenium_technology(base_path, segmentation_parameters):


def _process_merscope_technology(
base_path, segmentation_parameters, path_output, path_meta_cell_micron
base_path, segmentation_parameters
):
"""
Process data for MERSCOPE technology.
Expand All @@ -122,29 +125,33 @@ def _process_merscope_technology(
Returns:
- Tuple of (cell_index, gene, transcript_index, trx_meta, cell_gdf, cell_meta_gdf)
"""
cell_index = "EntityID"
cell_index = "cell_id"
gene = "gene"
transcript_index = "transcript_id"

# Define base paths
csv_path = Path(base_path) / "detected_transcripts.csv"
parquet_path = csv_path.with_suffix(".parquet")

path_meta_cell_micron = Path(base_path) / "cell_metadata.csv"

# Prefer Parquet if it exists
trx = pd.read_parquet(parquet_path) if parquet_path.exists() else pd.read_csv(csv_path)
trx = trx.rename(columns={"name": gene})
trx_meta = trx[trx[cell_index] != -1][[transcript_index, cell_index, gene]]

transformation_matrix = pd.read_csv(Path(base_path) / "images/micron_to_mosaic_pixel_transform.csv", header=None, sep=" ").values

cell_gdf = get_cell_polygons(
technology=segmentation_parameters["technology"],
path_cell_boundaries=Path(base_path) / "cell_boundaries.parquet",
path_output=path_output,
transformation_matrix=transformation_matrix,
path_meta_cell_micron=path_meta_cell_micron,
)

cell_gdf["geometry"] = cell_gdf["Geometry"].apply(get_largest_polygon)
cell_gdf.drop(["Geometry"], axis=1, inplace=True)
cell_gdf = gpd.GeoDataFrame(geometry=cell_gdf["Geometry"])
cell_gdf = gpd.GeoDataFrame(geometry=cell_gdf["geometry_micron"])

cell_gdf["geometry"] = cell_gdf["geometry"].apply(get_largest_polygon)

cell_gdf.reset_index(inplace=True)
cell_gdf["area"] = cell_gdf["geometry"].area
Expand Down Expand Up @@ -267,7 +274,7 @@ def qc_segmentation(base_path, path_output=None, path_meta_cell_micron=None):
elif segmentation_parameters["technology"] == "MERSCOPE":
cell_index, gene, transcript_index, trx_meta, cell_gdf, cell_meta_gdf = (
_process_merscope_technology(
base_path, segmentation_parameters, path_output, path_meta_cell_micron
base_path, segmentation_parameters
)
)

Expand Down Expand Up @@ -396,7 +403,7 @@ def _load_cbg_data(base_paths):
)
elif segmentation_parameters["technology"] == "MERSCOPE":
cbg_dict[segmentation_parameters["segmentation_approach"]] = pd.read_csv(
Path(base_path) / "cell_by_gene_matrix.csv"
Path(base_path) / "cell_by_gene.csv"
)

return cbg_dict
Expand Down Expand Up @@ -472,24 +479,69 @@ def _create_barplot_visualization(
- cell_a_name: Name of cell type A
- cell_b_name: Name of cell type B
"""
orthogonal_data = pd.DataFrame(
{
"Technology": [i for i in cell_a_with_b_cell_specific_genes for _ in range(2)],
"Category": [
f"{cell_a_name} with {cell_b_name} genes",
f"{cell_b_name} with {cell_a_name} genes",
]
* 4,
"Count": [
gene
for pair in zip(
cell_a_with_b_cell_specific_genes.values(),
cell_b_with_a_cell_specific_genes.values(),
strict=False,
)
for gene in pair
],
}

# technology_series = pd.Series(
# [tech for tech in cell_a_with_b_cell_specific_genes for _ in range(2)],
# name="Technology"
# )

# category_series = pd.Series(
# [
# f"{cell_a_name} with {cell_b_name} genes",
# f"{cell_b_name} with {cell_a_name} genes"
# ] * len(cell_a_with_b_cell_specific_genes),
# name="Category"
# )

# count_values = []
# for a_count, b_count in zip(
# cell_a_with_b_cell_specific_genes.values(),
# cell_b_with_a_cell_specific_genes.values(),
# strict=False
# ):
# count_values.extend([a_count, b_count])

# count_series = pd.Series(count_values, name="Count")

# orthogonal_data = pd.concat(
# [technology_series, category_series, count_series],
# axis=1
# )

# Step 1: Repeat each technology name twice (for each gene comparison)
technologies = []
for tech in cell_a_with_b_cell_specific_genes:
technologies.extend([tech, tech])

technology_series = pd.Series(technologies, name="Technology")

# Step 2: Create corresponding category labels (A with B genes, B with A genes)
category_labels = []
for _ in cell_a_with_b_cell_specific_genes:
category_labels.extend([
f"{cell_a_name} with {cell_b_name} genes",
f"{cell_b_name} with {cell_a_name} genes"
])

category_series = pd.Series(category_labels, name="Category")

# Step 3: Extract counts from both dictionaries and interleave them
a_counts = cell_a_with_b_cell_specific_genes.values()
b_counts = cell_b_with_a_cell_specific_genes.values()

count_values = []

for a_count, b_count in zip(a_counts, b_counts):
# Step 3: Add both counts to the list
count_values.append(a_count)
count_values.append(b_count)

count_series = pd.Series(count_values, name="Count")

# Step 4: Combine into a single DataFrame
orthogonal_data = pd.concat(
[technology_series, category_series, count_series],
axis=1
)

fig, ax = plt.subplots(figsize=(10, 6))
Expand Down
Loading