Skip to content

Commit

Permalink
Fix dataset simulator (#431)
Browse files Browse the repository at this point in the history
* explicitly convert non-string objects in .obs slots to categoricals

* Add info about fix in comments
  • Loading branch information
sainirmayi authored Apr 17, 2024
1 parent 949427c commit 624c442
Showing 1 changed file with 14 additions and 2 deletions.
16 changes: 14 additions & 2 deletions src/tasks/spatial_decomposition/dataset_simulator/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,11 @@
## VIASH END

CELLTYPE_MIN_CELLS = 25

# Reading input dataset
adata = ad.read_h5ad(par['input'])


def generate_synthetic_dataset(
adata: ad.AnnData,
alpha: Union[float, Sequence] = 1.0,
Expand Down Expand Up @@ -162,12 +164,14 @@ def generate_synthetic_dataset(
adata_merged.uns["cell_type_names"] = uni_labs
return adata_merged


def filter_celltypes(adata, min_cells=CELLTYPE_MIN_CELLS):
    """Return a copy of ``adata`` without the cells of rare cell types.

    A cell type is kept when at least ``min_cells`` observations carry its
    label in ``adata.obs["cell_type"]``; observations with any other label
    are dropped.
    """
    labels = adata.obs["cell_type"]
    # Boolean Series indexed by cell type: True where the type is frequent enough.
    is_frequent = labels.value_counts() >= min_cells
    frequent_types = is_frequent.index[is_frequent]
    # Per-observation mask, then select by observation name as before.
    cell_mask = np.isin(labels, frequent_types)
    kept_obs_names = adata.obs.index[cell_mask]
    return adata[kept_obs_names].copy()


def filter_genes_cells(adata):
"""Remove empty cells and genes."""
if "var_names_all" not in adata.uns:
Expand All @@ -176,6 +180,7 @@ def filter_genes_cells(adata):
sc.pp.filter_genes(adata, min_cells=1)
sc.pp.filter_cells(adata, min_counts=2)


adata.X = adata.layers["counts"]
sc.pp.filter_genes(adata, min_counts=10)
adata_merged = generate_synthetic_dataset(adata,
Expand All @@ -189,8 +194,15 @@ def filter_genes_cells(adata):
adata_merged.uns["spatial_data_summary"] = f"Dirichlet alpha={par['alpha']}"
filter_genes_cells(adata_merged)
adata_merged.X = None
if "is_primary_data" in adata_merged.obs:
adata_merged.obs['is_primary_data'] = adata_merged.obs['is_primary_data'].fillna(False)

# Cast every object-dtype .obs column to a categorical before writing.
# Without this, writing the AnnData object to a file fails with
#   TypeError: Can't implicitly convert non-string objects to strings
# which is raised here when an object-dtype .obs column (e.g. booleans)
# contains NA values. Per the original note, later versions of anndata
# (0.10) handle this conversion themselves.
for column_name in adata_merged.obs.columns:
    if adata_merged.obs[column_name].dtype != 'object':
        continue
    adata_merged.obs[column_name] = adata_merged.obs[column_name].astype('category')

print("Writing output to file")
adata_merged.write_h5ad(par["simulated_data"])

0 comments on commit 624c442

Please sign in to comment.