broadinstitute · jaspreetishar · Jul 17, 2025 · Jul 8, 2025 · Jul 8, 2025 · Jul 10, 2025
diff --git a/notebooks/Segmentation_QC.ipynb b/notebooks/Segmentation_QC.ipynb
diff --git a/src/celldega/pre/__init__.py b/src/celldega/pre/__init__.py
@@ -884,19 +884,18 @@ def add_custom_segmentation(
 
     cbg_custom = pd.read_parquet(Path(path_segmentation_files) / "cell_by_gene_matrix.parquet")
 
+    # make sure all genes are present in cbg_custom
+    meta_gene = pd.read_parquet(Path(path_landscape_files) / "meta_gene.parquet")
+    missing_cols = meta_gene.index.difference(cbg_custom.columns)
+    for col in missing_cols:
+        cbg_custom[col] = 0
+
     make_meta_gene(
         cbg=cbg_custom,
         path_output=Path(path_landscape_files)
         / f"meta_gene_{segmentation_parameters['segmentation_approach']}.parquet",
     )
 
-    save_cbg_gene_parquets(
-        base_path=path_landscape_files,
-        cbg=cbg_custom,
-        verbose=True,
-        segmentation_approach=segmentation_parameters["segmentation_approach"],
-    )
-
     make_meta_cell_image_coord(
         technology=segmentation_parameters["technology"],
         path_transformation_matrix=str(
@@ -912,13 +911,26 @@ def add_custom_segmentation(
         image_scale=image_scale,
     )
 
+    save_cbg_gene_parquets(
+        base_path=path_landscape_files,
+        cbg=cbg_custom,
+        verbose=True,
+        segmentation_approach=segmentation_parameters["segmentation_approach"],
+    )
+
     create_cluster_and_meta_cluster(
         technology=segmentation_parameters["technology"],
         path_landscape_files=path_landscape_files,
         segmentation_approach=segmentation_parameters["segmentation_approach"],
     )
 
-    tree = ET.parse(Path(path_landscape_files) / "pyramid_images" / "bound.dzi")
+    # Get the first .dzi file in sorted order
+    dzi_files = sorted((Path(path_landscape_files) / "pyramid_images").glob("*.dzi"))
+    if not dzi_files:
+        raise FileNotFoundError("No .dzi files found in pyramid_images.")
+
+    # Use the first .dzi file
+    tree = ET.parse(dzi_files[0])
     root = tree.getroot()
     width = int(root[0].attrib["Width"])
     height = int(root[0].attrib["Height"])

diff --git a/src/celldega/pre/boundary_tile.py b/src/celldega/pre/boundary_tile.py
@@ -251,6 +251,8 @@ def get_cell_polygons(
 
         cells_orig.set_index("cell_id", inplace=True)
 
+        cells_orig.rename(columns={"geometry": "geometry_micron"}, inplace=True)
+
     elif technology == "Xenium":
         xenium_cells = pd.read_parquet(path_cell_boundaries)
         grouped = xenium_cells.groupby("cell_id")[["vertex_x", "vertex_y"]].agg(
@@ -261,17 +263,20 @@ def get_cell_polygons(
         )
         cells_orig = gpd.GeoDataFrame(grouped, geometry="geometry")[["geometry"]]
 
+        cells_orig.rename(columns={"geometry": "geometry_micron"}, inplace=True)
+
     # Transform geometries
     cells_orig["GEOMETRY"] = batch_transform_geometries(
-        cells_orig["geometry"], transformation_matrix, image_scale
+        cells_orig["geometry_micron"], transformation_matrix, image_scale
     )
 
     # Convert transformed geometries to polygons and calculate centroids
     cells_orig["polygon"] = cells_orig["GEOMETRY"].apply(lambda x: Polygon(x[0]))
-    gdf_cells = gpd.GeoDataFrame(geometry=cells_orig["polygon"])
-    gdf_cells["GEOMETRY"] = cells_orig["GEOMETRY"]
 
-    return gdf_cells
+    # Specify the columns to include
+    columns_to_include = ["geometry_micron", "GEOMETRY"]
+
+    return gpd.GeoDataFrame(cells_orig[columns_to_include], geometry=cells_orig["polygon"])
 
 
 def make_cell_boundary_tiles(
@@ -330,7 +335,7 @@ def make_cell_boundary_tiles(
 
         # Convert string index to integer index
         cell_str_to_int_mapping = _get_name_mapping(
-            path_output.replace("/cell_segmentation", ""),  # get the path of landscape files
+            str(Path(path_output).parent),  # get the path of landscape files
             layer="boundary",
             segmentation=path_output.split("cell_segmentation_")[
                 1

diff --git a/src/celldega/pre/run_pre_processing.py b/src/celldega/pre/run_pre_processing.py
@@ -4,6 +4,7 @@
 
 import argparse
 from pathlib import Path
+import shutil
 
 import pandas as pd
 
@@ -26,16 +27,18 @@ def _create_directories(directories):
 
 def create_dummy_clusters(path_landscape_files, cbg):
     _create_directories([f"{path_landscape_files}/cell_clusters"])
-    meta_cluster = pd.DataFrame(index=["0"], columns=["color", "count"])
-    meta_cluster.loc["0", "color"] = "#1f77b4"
-    meta_cluster.loc["0", "count"] = 1000
-    meta_cluster.to_parquet(f"{path_landscape_files}/cell_clusters/meta_cluster.parquet")
+
     inst_index = [str(x) for x in cbg.index.tolist()]
     meta_cell = pd.DataFrame(index=inst_index)
     meta_cell["cluster"] = "0"
     meta_cell.index = meta_cell.index.astype(str)
     meta_cell.to_parquet(f"{path_landscape_files}/cell_clusters/cluster.parquet")
 
+    meta_cluster = pd.DataFrame(index=["0"], columns=["color", "count"])
+    meta_cluster.loc["0", "color"] = "#1f77b4"
+    meta_cluster.loc["0", "count"] = len(meta_cell)
+    meta_cluster.to_parquet(f"{path_landscape_files}/cell_clusters/meta_cluster.parquet")
+
 
 def _determine_technology(data_dir):
     """
@@ -144,12 +147,20 @@ def main(
     # Setup file paths
     paths = _setup_preprocessing_paths(technology, path_landscape_files, data_dir)
 
+    # Save transformation matrices
+
     if technology == "Xenium":
         # Unzip compressed files in Xenium data folder
         dega.pre._xenium_unzipper(str(data_dir))
         # Write transform file
         dega.pre.write_xenium_transform(str(data_dir), path_landscape_files)
 
+    elif technology == "MERSCOPE":
+        source_path = Path(paths["transformation_matrix"])
+
+        # Copy the file to the destination directory, keeping the same filename
+        shutil.copy(source_path, Path(path_landscape_files) / "micron_to_image_transform.csv")
+
     # Check required files for preprocessing
     dega.pre._check_required_files(technology, str(data_dir))
 

diff --git a/src/celldega/qc/__init__.py b/src/celldega/qc/__init__.py
@@ -8,11 +8,12 @@
 import seaborn as sns
 from shapely.geometry import MultiPolygon
 
+from ..pre import write_xenium_transform
 from ..pre.boundary_tile import get_cell_polygons
 from ..pre.landscape import read_cbg_mtx
 
 
-def get_largest_polygon(geometry):  # TODO: underscore
+def _get_largest_polygon(geometry):
     """
     Get the largest polygon from a geometry.
 
@@ -27,7 +28,20 @@ def get_largest_polygon(geometry):  # TODO: underscore
     return geometry
 
 
-def _load_segmentation_parameters(base_path):
+def _write_default_seg_json(parameters_file, technology, dataset_name):
+    segmentation_parameters = {
+        "technology": technology,
+        "segmentation_approach": "default",
+        "dataset_name": dataset_name,
+    }
+
+    with parameters_file.open("w") as f:
+        json.dump(segmentation_parameters, f, indent=4)
+
+    return segmentation_parameters
+
+
+def _load_segmentation_parameters(base_path, technology, dataset_name):
     """
     Load segmentation parameters from JSON file.
 
@@ -40,8 +54,9 @@ def _load_segmentation_parameters(base_path):
     parameters_file = Path(base_path) / "segmentation_parameters.json"
 
     if not parameters_file.exists():
-        print("segmentation_parameters.json does not exist")
-        return None
+        print("segmentation_parameters.json does not exist, creating one...")
+
+        return _write_default_seg_json(parameters_file, technology, dataset_name)
 
     try:
         with parameters_file.open() as parameter_file:
@@ -68,7 +83,7 @@ def _process_custom_technology(base_path):
     trx = pd.read_parquet(Path(base_path) / "transcripts.parquet")
     trx_meta = trx[trx[cell_index] != -1][[transcript_index, cell_index, gene]]
     cell_gdf = gpd.read_parquet(Path(base_path) / "cell_polygons.parquet")
-    cell_meta_gdf = gpd.read_parquet(Path(base_path) / "cell_metadata.parquet")
+    cell_meta_gdf = gpd.read_parquet(Path(base_path) / "cell_metadata_micron_space.parquet")
 
     return cell_index, gene, transcript_index, trx_meta, cell_gdf, cell_meta_gdf
 
@@ -92,13 +107,17 @@ def _process_xenium_technology(base_path, segmentation_parameters):
     trx = trx.rename(columns={"name": gene})
     trx_meta = trx[trx[cell_index] != "UNASSIGNED"][[transcript_index, cell_index, gene]]
 
+    transformation_matrix = write_xenium_transform(base_path, base_path)
+
     cell_gdf = get_cell_polygons(
         technology=segmentation_parameters["technology"],
         path_cell_boundaries=Path(base_path) / "cell_boundaries.parquet",
+        transformation_matrix=transformation_matrix,
     )
 
-    cell_gdf = gpd.GeoDataFrame(geometry=cell_gdf["geometry"])
-    cell_gdf["geometry"] = cell_gdf["geometry"].apply(get_largest_polygon)
+    cell_gdf = gpd.GeoDataFrame(geometry=cell_gdf["geometry_micron"])
+
+    cell_gdf["geometry"] = cell_gdf["geometry"].apply(_get_largest_polygon)
     cell_gdf.reset_index(inplace=True)
     cell_gdf["area"] = cell_gdf["geometry"].area
     cell_gdf["centroid"] = cell_gdf["geometry"].centroid
@@ -107,9 +126,7 @@ def _process_xenium_technology(base_path, segmentation_parameters):
     return cell_index, gene, transcript_index, trx_meta, cell_gdf, cell_meta_gdf
 
 
-def _process_merscope_technology(
-    base_path, segmentation_parameters, path_output, path_meta_cell_micron
-):
+def _process_merscope_technology(base_path, segmentation_parameters):
     """
     Process data for MERSCOPE technology.
 
@@ -122,29 +139,35 @@ def _process_merscope_technology(
     Returns:
     - Tuple of (cell_index, gene, transcript_index, trx_meta, cell_gdf, cell_meta_gdf)
     """
-    cell_index = "EntityID"
+    cell_index = "cell_id"
     gene = "gene"
     transcript_index = "transcript_id"
 
     # Define base paths
     csv_path = Path(base_path) / "detected_transcripts.csv"
     parquet_path = csv_path.with_suffix(".parquet")
 
+    path_meta_cell_micron = Path(base_path) / "cell_metadata.csv"
+
     # Prefer Parquet if it exists
     trx = pd.read_parquet(parquet_path) if parquet_path.exists() else pd.read_csv(csv_path)
     trx = trx.rename(columns={"name": gene})
     trx_meta = trx[trx[cell_index] != -1][[transcript_index, cell_index, gene]]
 
+    transformation_matrix = pd.read_csv(
+        Path(base_path) / "images/micron_to_mosaic_pixel_transform.csv", header=None, sep=" "
+    ).values
+
     cell_gdf = get_cell_polygons(
         technology=segmentation_parameters["technology"],
         path_cell_boundaries=Path(base_path) / "cell_boundaries.parquet",
-        path_output=path_output,
+        transformation_matrix=transformation_matrix,
         path_meta_cell_micron=path_meta_cell_micron,
     )
 
-    cell_gdf["geometry"] = cell_gdf["Geometry"].apply(get_largest_polygon)
-    cell_gdf.drop(["Geometry"], axis=1, inplace=True)
-    cell_gdf = gpd.GeoDataFrame(geometry=cell_gdf["Geometry"])
+    cell_gdf = gpd.GeoDataFrame(geometry=cell_gdf["geometry_micron"])
+
+    cell_gdf["geometry"] = cell_gdf["geometry"].apply(_get_largest_polygon)
 
     cell_gdf.reset_index(inplace=True)
     cell_gdf["area"] = cell_gdf["geometry"].area
@@ -223,7 +246,7 @@ def _save_qc_results(base_path, metrics, gene_specific_metrics_df, segmentation_
     gene_specific_metrics_df.to_csv(Path(base_path) / "gene_specific_qc.csv")
 
 
-def qc_segmentation(base_path, path_output=None, path_meta_cell_micron=None):
+def qc_segmentation(base_path, technology=None, dataset_name=None):
     """
     Calculate segmentation quality control (QC) metrics for imaging spatial transcriptomics data.
 
@@ -249,7 +272,7 @@ def qc_segmentation(base_path, path_output=None, path_meta_cell_micron=None):
     -------
     qc_segmentation(base_path="path/to/data")
     """
-    segmentation_parameters = _load_segmentation_parameters(base_path)
+    segmentation_parameters = _load_segmentation_parameters(base_path, technology, dataset_name)
     if segmentation_parameters is None:
         return
 
@@ -266,9 +289,7 @@ def qc_segmentation(base_path, path_output=None, path_meta_cell_micron=None):
         trx = trx.rename(columns={"name": gene})
     elif segmentation_parameters["technology"] == "MERSCOPE":
         cell_index, gene, transcript_index, trx_meta, cell_gdf, cell_meta_gdf = (
-            _process_merscope_technology(
-                base_path, segmentation_parameters, path_output, path_meta_cell_micron
-            )
+            _process_merscope_technology(base_path, segmentation_parameters)
         )
 
         # Define base paths
@@ -396,7 +417,7 @@ def _load_cbg_data(base_paths):
                 )
             elif segmentation_parameters["technology"] == "MERSCOPE":
                 cbg_dict[segmentation_parameters["segmentation_approach"]] = pd.read_csv(
-                    Path(base_path) / "cell_by_gene_matrix.csv"
+                    Path(base_path) / "cell_by_gene.csv"
                 )
 
     return cbg_dict
@@ -472,25 +493,38 @@ def _create_barplot_visualization(
     - cell_a_name: Name of cell type A
     - cell_b_name: Name of cell type B
     """
-    orthogonal_data = pd.DataFrame(
-        {
-            "Technology": [i for i in cell_a_with_b_cell_specific_genes for _ in range(2)],
-            "Category": [
-                f"{cell_a_name} with {cell_b_name} genes",
-                f"{cell_b_name} with {cell_a_name} genes",
-            ]
-            * 4,
-            "Count": [
-                gene
-                for pair in zip(
-                    cell_a_with_b_cell_specific_genes.values(),
-                    cell_b_with_a_cell_specific_genes.values(),
-                    strict=False,
-                )
-                for gene in pair
-            ],
-        }
-    )
+
+    # Step 1: Repeat each technology name twice (for each gene comparison)
+    technologies = []
+    for tech in cell_a_with_b_cell_specific_genes:
+        technologies.extend([tech, tech])
+
+    technology_series = pd.Series(technologies, name="Technology")
+
+    # Step 2: Create corresponding category labels (A with B genes, B with A genes)
+    category_labels = []
+    for _ in cell_a_with_b_cell_specific_genes:
+        category_labels.extend(
+            [f"{cell_a_name} with {cell_b_name} genes", f"{cell_b_name} with {cell_a_name} genes"]
+        )
+
+    category_series = pd.Series(category_labels, name="Category")
+
+    # Step 3: Extract counts from both dictionaries and interleave them
+    a_counts = cell_a_with_b_cell_specific_genes.values()
+    b_counts = cell_b_with_a_cell_specific_genes.values()
+
+    count_values = []
+
+    for a_count, b_count in zip(a_counts, b_counts, strict=False):
+        # Step 3: Add both counts to the list
+        count_values.append(a_count)
+        count_values.append(b_count)
+
+    count_series = pd.Series(count_values, name="Count")
+
+    # Step 4: Combine into a single DataFrame
+    orthogonal_data = pd.concat([technology_series, category_series, count_series], axis=1)
 
     fig, ax = plt.subplots(figsize=(10, 6))