Implement support for adding custom metadata columns and including certain genes to HVGs #50

Merged · 5 commits · Feb 26, 2024
20 changes: 17 additions & 3 deletions bin/merge_datasets.py
@@ -24,9 +24,14 @@
 parser.add_argument("--output_transfer", help="Output file containing all cells which require transfer learning", type=str)
 parser.add_argument("--output_counts", help="Output file, outer join of cells and genes", type=str)
 parser.add_argument("--min_cells", help='Minimum number of cells to keep a gene', type=int, required=False, default=50)
+parser.add_argument("--custom_metadata", help="Additional metadata columns to include", type=str, nargs="*")
+parser.add_argument("--custom_genes", help="Additional genes to include", type=str, nargs="*")

 args = parser.parse_args()

+columns_additional = {column: False for column in args.custom_metadata}
+columns_considered = {**columns_required, **columns_additional}
+
 datasets = [ad.read_h5ad(f) for f in args.input]

 for file_name, dataset in zip(args.input, datasets):
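The dict merge just above is the whole opt-in mechanism: every column passed via --custom_metadata enters columns_considered with a False (optional) flag, so a dataset missing it later falls back to "Unknown" instead of raising. A minimal sketch of the merge, with hypothetical columns_required contents:

    # Hypothetical contents; the real columns_required is defined elsewhere in the script.
    columns_required = {"batch": True, "cell_type": False}
    custom_metadata = ["smoking_status"]  # stand-in for args.custom_metadata

    columns_additional = {column: False for column in custom_metadata}
    columns_considered = {**columns_required, **columns_additional}
    print(columns_considered)
    # {'batch': True, 'cell_type': False, 'smoking_status': False}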
@@ -36,7 +41,7 @@
             f'Dataset is empty: {file_name}'
         )
     # Make sure all required columns are present
-    for column, required in columns_required.items():
+    for column, required in columns_considered.items():
         if column not in dataset.obs.columns:
             if required:
                 raise ValueError(
@@ -46,11 +51,20 @@
             dataset.obs[column] = "Unknown"

     # Subset columns
-    dataset.obs = dataset.obs[columns_required.keys()]
+    dataset.obs = dataset.obs[columns_considered.keys()]

 adata = ad.concat(datasets)
 adata_outer = ad.concat(datasets, join='outer')

+additional_genes = [gene for gene in args.custom_genes if gene not in adata.var_names]
+
+# Add custom genes from outer join to the intersection
+if additional_genes:
+    adata_additional = adata_outer[adata.obs_names, additional_genes]
+    adata_concatenated = ad.concat([adata, adata_additional], join="outer", axis=1)
+    adata_concatenated.obs, adata_concatenated.obsm = adata.obs, adata.obsm
+    adata = adata_concatenated
+
 # Convert to CSR matrix
 adata.X = csr_matrix(adata.X)
 adata_outer.X = csr_matrix(adata_outer.X)
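To see what the re-inclusion block does, here is a self-contained toy example: the default inner concat drops any gene absent from one dataset, and the code above grafts requested genes back on from the zero-filled outer join, then restores obs/obsm because an axis=1 concat does not carry them over. All names and shapes below are illustrative, not from the pipeline.

    import anndata as ad
    import numpy as np
    import pandas as pd

    d1 = ad.AnnData(np.ones((2, 3)),
                    obs=pd.DataFrame(index=["c1", "c2"]),
                    var=pd.DataFrame(index=["GENE-A", "GENE-B", "GENE-C"]))
    d2 = ad.AnnData(np.ones((2, 2)),
                    obs=pd.DataFrame(index=["c3", "c4"]),
                    var=pd.DataFrame(index=["GENE-A", "GENE-B"]))

    adata = ad.concat([d1, d2])                      # inner join: GENE-C is dropped
    adata_outer = ad.concat([d1, d2], join="outer")  # GENE-C kept, zero-filled for d2

    custom_genes = ["GENE-C"]
    additional_genes = [g for g in custom_genes if g not in adata.var_names]
    if additional_genes:
        adata_additional = adata_outer[adata.obs_names, additional_genes]
        merged = ad.concat([adata, adata_additional], join="outer", axis=1)
        merged.obs, merged.obsm = adata.obs, adata.obsm  # axis=1 concat drops these
        adata = merged

    print(list(adata.var_names))  # ['GENE-A', 'GENE-B', 'GENE-C']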
@@ -100,7 +114,7 @@ def to_Florent_case(s: str):

     return corrected[0].upper() + corrected[1:]

-for column in columns_required.keys():
+for column in columns_considered.keys():
     if column == "transfer":
         continue
     # Convert first to string and then to category
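The tail of this script (only partially visible in the hunk above) then runs every considered column through to_Florent_case and stores it as a pandas categorical, per the "Convert first to string and then to category" comment. A rough sketch of that pattern, using a simplified stand-in since the full casing function is not shown:

    import pandas as pd

    def florent_case_stub(s: str) -> str:
        # Stand-in for to_Florent_case; only its final line is visible above.
        return s[:1].upper() + s[1:].lower()

    obs = pd.DataFrame({"condition": ["healthy CONTROL", "disease"]})
    obs["condition"] = obs["condition"].astype(str).map(florent_case_stub).astype("category")
    print(list(obs["condition"].cat.categories))  # ['Disease', 'Healthy control']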
9 changes: 9 additions & 0 deletions modules/identify_hvgs.nf
@@ -8,6 +8,7 @@ process IDENTIFY_HVGS {
     input:
     tuple val(meta), path(adata)
     val(n_hvgs)
+    val(custom_hvgs)

     output:
     tuple val(meta), path("${meta.id}.hvgs.pkl")
@@ -34,6 +35,14 @@ process IDENTIFY_HVGS {
     except:
         span += 0.1
         print(f"Increased span to {span}")
+
+custom_hvgs = "${custom_hvgs.join(" ")}".split(" ")
+custom_hvgs = [x.upper().replace("_", "-").replace(".", "-") for x in custom_hvgs]
+custom_hvgs = [x for x in custom_hvgs if x in adata.var_names]
+
+if len(custom_hvgs) > 0:
+    # Set "highly_variable" to True for custom HVGs
+    adata.var.loc[custom_hvgs, "highly_variable"] = True

 adata.var[["highly_variable"]].to_pickle("${meta.id}.hvgs.pkl")
 """
4 changes: 3 additions & 1 deletion modules/merge_datasets.nf
@@ -6,6 +6,8 @@ process MERGE_DATASETS {
     input:
     path(adatas)
     val(min_cells)
+    val(custom_metadata)
+    val(custom_genes)

     output:
     path("datasets.integration.h5ad"), emit: integration
@@ -18,6 +20,6 @@ process MERGE_DATASETS {

     script:
     """
-    merge_datasets.py --input ${adatas} --min_cells ${min_cells} --output_transfer datasets.transfer.h5ad --output_intersection datasets.intersection.h5ad --output_integration datasets.integration.h5ad --output_counts datasets.counts.h5ad
+    merge_datasets.py --input ${adatas} --custom_genes ${custom_genes.join(" ")} --custom_metadata ${custom_metadata.join(" ")} --min_cells ${min_cells} --output_transfer datasets.transfer.h5ad --output_intersection datasets.intersection.h5ad --output_integration datasets.integration.h5ad --output_counts datasets.counts.h5ad
     """
 }
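An edge case this command relies on: with the default empty lists, ${custom_genes.join(" ")} and ${custom_metadata.join(" ")} render as nothing, so each flag is passed with no values. Because both arguments use nargs="*", argparse then yields empty lists rather than None, which is what the dict comprehension in merge_datasets.py expects; omitting a flag entirely would instead produce None, since no default is set. A small check:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--custom_metadata", type=str, nargs="*")
    parser.add_argument("--custom_genes", type=str, nargs="*")

    # Flags present but valueless, as rendered from empty Nextflow lists:
    args = parser.parse_args(["--custom_genes", "--custom_metadata"])
    print(args.custom_genes, args.custom_metadata)  # [] []

    # A fully omitted flag falls back to None (no default is set):
    print(parser.parse_args([]).custom_metadata)  # None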
2 changes: 2 additions & 0 deletions nextflow.config
@@ -21,6 +21,8 @@ params {

     has_extended = false
     has_celltypes = true
+    custom_metadata = []
+    custom_hvgs = []

     scanvi_labels = null
     scanvi_integrated = null
10 changes: 8 additions & 2 deletions workflows/preprocessing.nf
@@ -23,7 +23,12 @@ workflow PREPROCESSING {

     FILTER(ch_samples.h5ad.map{ meta, adata, format -> [meta, adata]}.mix(RDS_TO_H5AD.out))
     GENES_UPSET(FILTER.out.map{ meta, adata -> adata }.collect())
-    MERGE_DATASETS(FILTER.out.flatMap{ meta, adata -> adata }.collect(), params.min_cells)
+    MERGE_DATASETS(
+        FILTER.out.flatMap{ meta, adata -> adata }.collect(),
+        params.min_cells,
+        params.custom_metadata,
+        params.custom_hvgs
+    )

     ch_adata_integration = MERGE_DATASETS.out.integration
         .map{ adata -> [[id: "integration"], adata] }
@@ -42,7 +47,8 @@ workflow PREPROCESSING {

     IDENTIFY_HVGS(
         ch_adata_integration,
-        params.integration_hvgs
+        params.integration_hvgs,
+        params.custom_hvgs
     )

     emit: