broadinstitute · jaspreetishar · Jul 17, 2025 · Jul 8, 2025 · Jul 8, 2025 · Jul 10, 2025
diff --git a/notebooks/Segmentation_QC.ipynb b/notebooks/Segmentation_QC.ipynb
diff --git a/src/celldega/pre/__init__.py b/src/celldega/pre/__init__.py
@@ -884,19 +884,18 @@ def add_custom_segmentation(
 
     cbg_custom = pd.read_parquet(Path(path_segmentation_files) / "cell_by_gene_matrix.parquet")
 
+    # make sure all genes are present in cbg_custom
+    meta_gene = pd.read_parquet(Path(path_landscape_files) / f"meta_gene.parquet")
+    missing_cols = meta_gene.index.difference(cbg_custom.columns)
+    for col in missing_cols:
+        cbg_custom[col] = 0
+
     make_meta_gene(
         cbg=cbg_custom,
         path_output=Path(path_landscape_files)
         / f"meta_gene_{segmentation_parameters['segmentation_approach']}.parquet",
     )
 
-    save_cbg_gene_parquets(
-        base_path=path_landscape_files,
-        cbg=cbg_custom,
-        verbose=True,
-        segmentation_approach=segmentation_parameters["segmentation_approach"],
-    )
-
     make_meta_cell_image_coord(
         technology=segmentation_parameters["technology"],
         path_transformation_matrix=str(
@@ -912,13 +911,26 @@ def add_custom_segmentation(
         image_scale=image_scale,
     )
 
+    save_cbg_gene_parquets(
+        base_path=path_landscape_files,
+        cbg=cbg_custom,
+        verbose=True,
+        segmentation_approach=segmentation_parameters["segmentation_approach"],
+    )
+
     create_cluster_and_meta_cluster(
         technology=segmentation_parameters["technology"],
         path_landscape_files=path_landscape_files,
         segmentation_approach=segmentation_parameters["segmentation_approach"],
     )
 
-    tree = ET.parse(Path(path_landscape_files) / "pyramid_images" / "bound.dzi")
+    # Get the first .dzi file in sorted order
+    dzi_files = sorted((Path(path_landscape_files) / "pyramid_images").glob("*.dzi"))
+    if not dzi_files:
+        raise FileNotFoundError("No .dzi files found in pyramid_images.")
+
+    # Use the first .dzi file
+    tree = ET.parse(dzi_files[0])
     root = tree.getroot()
     width = int(root[0].attrib["Width"])
     height = int(root[0].attrib["Height"])

diff --git a/src/celldega/pre/boundary_tile.py b/src/celldega/pre/boundary_tile.py
@@ -251,6 +251,8 @@ def get_cell_polygons(
 
         cells_orig.set_index("cell_id", inplace=True)
 
+        cells_orig.rename(columns={"geometry": "geometry_micron"}, inplace=True)
+
     elif technology == "Xenium":
         xenium_cells = pd.read_parquet(path_cell_boundaries)
         grouped = xenium_cells.groupby("cell_id")[["vertex_x", "vertex_y"]].agg(
@@ -261,18 +263,22 @@ def get_cell_polygons(
         )
         cells_orig = gpd.GeoDataFrame(grouped, geometry="geometry")[["geometry"]]
 
+        cells_orig.rename(columns={"geometry": "geometry_micron"}, inplace=True)
+
     # Transform geometries
     cells_orig["GEOMETRY"] = batch_transform_geometries(
-        cells_orig["geometry"], transformation_matrix, image_scale
+        cells_orig["geometry_micron"], transformation_matrix, 1
     )
 
     # Convert transformed geometries to polygons and calculate centroids
     cells_orig["polygon"] = cells_orig["GEOMETRY"].apply(lambda x: Polygon(x[0]))
-    gdf_cells = gpd.GeoDataFrame(geometry=cells_orig["polygon"])
-    gdf_cells["GEOMETRY"] = cells_orig["GEOMETRY"]
 
-    return gdf_cells
+    # Specify the columns to include
+    columns_to_include = ["geometry_micron", "GEOMETRY"]
 
+    gdf_cells = gpd.GeoDataFrame(cells_orig[columns_to_include], geometry=cells_orig["polygon"])
+
+    return gdf_cells
 
 def make_cell_boundary_tiles(
     technology,
@@ -330,7 +336,7 @@ def make_cell_boundary_tiles(
 
         # Convert string index to integer index
         cell_str_to_int_mapping = _get_name_mapping(
-            path_output.replace("/cell_segmentation", ""),  # get the path of landscape files
+            path_output.rsplit("/", 1)[0],  # get the path of landscape files
             layer="boundary",
             segmentation=path_output.split("cell_segmentation_")[
                 1

diff --git a/src/celldega/pre/run_pre_processing.py b/src/celldega/pre/run_pre_processing.py
@@ -6,6 +6,7 @@
 from pathlib import Path
 
 import pandas as pd
+import shutil
 
 import celldega as dega
 
@@ -144,12 +145,21 @@ def main(
     # Setup file paths
     paths = _setup_preprocessing_paths(technology, path_landscape_files, data_dir)
 
+    # Save transformation matrices
+
     if technology == "Xenium":
         # Unzip compressed files in Xenium data folder
         dega.pre._xenium_unzipper(str(data_dir))
         # Write transform file
         dega.pre.write_xenium_transform(str(data_dir), path_landscape_files)
 
+    elif technology == "MERSCOPE":
+
+        source_path = Path(paths["transformation_matrix"])
+
+        # Copy the file to the destination directory, keeping the same filename
+        shutil.copy(source_path, Path(path_landscape_files) / "micron_to_image_transform.csv")
+
     # Check required files for preprocessing
     dega.pre._check_required_files(technology, str(data_dir))
 

diff --git a/src/celldega/qc/__init__.py b/src/celldega/qc/__init__.py
@@ -68,7 +68,7 @@ def _process_custom_technology(base_path):
     trx = pd.read_parquet(Path(base_path) / "transcripts.parquet")
     trx_meta = trx[trx[cell_index] != -1][[transcript_index, cell_index, gene]]
     cell_gdf = gpd.read_parquet(Path(base_path) / "cell_polygons.parquet")
-    cell_meta_gdf = gpd.read_parquet(Path(base_path) / "cell_metadata.parquet")
+    cell_meta_gdf = gpd.read_parquet(Path(base_path) / "cell_metadata_micron_space.parquet")
 
     return cell_index, gene, transcript_index, trx_meta, cell_gdf, cell_meta_gdf
 
@@ -92,12 +92,15 @@ def _process_xenium_technology(base_path, segmentation_parameters):
     trx = trx.rename(columns={"name": gene})
     trx_meta = trx[trx[cell_index] != "UNASSIGNED"][[transcript_index, cell_index, gene]]
 
+    transformation_matrix = pd.read_csv(Path(base_path) / "micron_to_image_transform.csv", header=None, sep=" ").values
     cell_gdf = get_cell_polygons(
         technology=segmentation_parameters["technology"],
         path_cell_boundaries=Path(base_path) / "cell_boundaries.parquet",
+        transformation_matrix=transformation_matrix
     )
 
-    cell_gdf = gpd.GeoDataFrame(geometry=cell_gdf["geometry"])
+    cell_gdf = gpd.GeoDataFrame(geometry=cell_gdf["geometry_micron"])
+
     cell_gdf["geometry"] = cell_gdf["geometry"].apply(get_largest_polygon)
     cell_gdf.reset_index(inplace=True)
     cell_gdf["area"] = cell_gdf["geometry"].area
@@ -108,7 +111,7 @@ def _process_xenium_technology(base_path, segmentation_parameters):
 
 
 def _process_merscope_technology(
-    base_path, segmentation_parameters, path_output, path_meta_cell_micron
+    base_path, segmentation_parameters
 ):
     """
     Process data for MERSCOPE technology.
@@ -122,29 +125,33 @@ def _process_merscope_technology(
     Returns:
     - Tuple of (cell_index, gene, transcript_index, trx_meta, cell_gdf, cell_meta_gdf)
     """
-    cell_index = "EntityID"
+    cell_index = "cell_id"
     gene = "gene"
     transcript_index = "transcript_id"
 
     # Define base paths
     csv_path = Path(base_path) / "detected_transcripts.csv"
     parquet_path = csv_path.with_suffix(".parquet")
 
+    path_meta_cell_micron = Path(base_path) / "cell_metadata.csv"
+
     # Prefer Parquet if it exists
     trx = pd.read_parquet(parquet_path) if parquet_path.exists() else pd.read_csv(csv_path)
     trx = trx.rename(columns={"name": gene})
     trx_meta = trx[trx[cell_index] != -1][[transcript_index, cell_index, gene]]
 
+    transformation_matrix = pd.read_csv(Path(base_path) / "images/micron_to_mosaic_pixel_transform.csv", header=None, sep=" ").values
+
     cell_gdf = get_cell_polygons(
         technology=segmentation_parameters["technology"],
         path_cell_boundaries=Path(base_path) / "cell_boundaries.parquet",
-        path_output=path_output,
+        transformation_matrix=transformation_matrix,
         path_meta_cell_micron=path_meta_cell_micron,
     )
 
-    cell_gdf["geometry"] = cell_gdf["Geometry"].apply(get_largest_polygon)
-    cell_gdf.drop(["Geometry"], axis=1, inplace=True)
-    cell_gdf = gpd.GeoDataFrame(geometry=cell_gdf["Geometry"])
+    cell_gdf = gpd.GeoDataFrame(geometry=cell_gdf["geometry_micron"])
+
+    cell_gdf["geometry"] = cell_gdf["geometry"].apply(get_largest_polygon)
 
     cell_gdf.reset_index(inplace=True)
     cell_gdf["area"] = cell_gdf["geometry"].area
@@ -267,7 +274,7 @@ def qc_segmentation(base_path, path_output=None, path_meta_cell_micron=None):
     elif segmentation_parameters["technology"] == "MERSCOPE":
         cell_index, gene, transcript_index, trx_meta, cell_gdf, cell_meta_gdf = (
             _process_merscope_technology(
-                base_path, segmentation_parameters, path_output, path_meta_cell_micron
+                base_path, segmentation_parameters
             )
         )
 
@@ -396,7 +403,7 @@ def _load_cbg_data(base_paths):
                 )
             elif segmentation_parameters["technology"] == "MERSCOPE":
                 cbg_dict[segmentation_parameters["segmentation_approach"]] = pd.read_csv(
-                    Path(base_path) / "cell_by_gene_matrix.csv"
+                    Path(base_path) / "cell_by_gene.csv"
                 )
 
     return cbg_dict
@@ -472,24 +479,69 @@ def _create_barplot_visualization(
     - cell_a_name: Name of cell type A
     - cell_b_name: Name of cell type B
     """
-    orthogonal_data = pd.DataFrame(
-        {
-            "Technology": [i for i in cell_a_with_b_cell_specific_genes for _ in range(2)],
-            "Category": [
-                f"{cell_a_name} with {cell_b_name} genes",
-                f"{cell_b_name} with {cell_a_name} genes",
-            ]
-            * 4,
-            "Count": [
-                gene
-                for pair in zip(
-                    cell_a_with_b_cell_specific_genes.values(),
-                    cell_b_with_a_cell_specific_genes.values(),
-                    strict=False,
-                )
-                for gene in pair
-            ],
-        }
+
+    # technology_series = pd.Series(
+    #     [tech for tech in cell_a_with_b_cell_specific_genes for _ in range(2)],
+    #     name="Technology"
+    # )
+
+    # category_series = pd.Series(
+    #     [
+    #         f"{cell_a_name} with {cell_b_name} genes",
+    #         f"{cell_b_name} with {cell_a_name} genes"
+    #     ] * len(cell_a_with_b_cell_specific_genes),
+    #     name="Category"
+    # )
+
+    # count_values = []
+    # for a_count, b_count in zip(
+    #     cell_a_with_b_cell_specific_genes.values(),
+    #     cell_b_with_a_cell_specific_genes.values(),
+    #     strict=False
+    # ):
+    #     count_values.extend([a_count, b_count])
+
+    # count_series = pd.Series(count_values, name="Count")
+
+    # orthogonal_data = pd.concat(
+    #     [technology_series, category_series, count_series],
+    #     axis=1
+    # )
+
+    # Step 1: Repeat each technology name twice (for each gene comparison)
+    technologies = []
+    for tech in cell_a_with_b_cell_specific_genes:
+        technologies.extend([tech, tech])
+
+    technology_series = pd.Series(technologies, name="Technology")
+
+    # Step 2: Create corresponding category labels (A with B genes, B with A genes)
+    category_labels = []
+    for _ in cell_a_with_b_cell_specific_genes:
+        category_labels.extend([
+            f"{cell_a_name} with {cell_b_name} genes",
+            f"{cell_b_name} with {cell_a_name} genes"
+        ])
+
+    category_series = pd.Series(category_labels, name="Category")
+
+    # Step 3: Extract counts from both dictionaries and interleave them
+    a_counts = cell_a_with_b_cell_specific_genes.values()
+    b_counts = cell_b_with_a_cell_specific_genes.values()
+
+    count_values = []
+
+    for a_count, b_count in zip(a_counts, b_counts):
+        # Step 3: Add both counts to the list
+        count_values.append(a_count)
+        count_values.append(b_count)
+
+    count_series = pd.Series(count_values, name="Count")
+
+    # Step 4: Combine into a single DataFrame
+    orthogonal_data = pd.concat(
+        [technology_series, category_series, count_series],
+        axis=1
     )
 
     fig, ax = plt.subplots(figsize=(10, 6))