Bugfix/aws staging (#9)
* bugfix: mzML stats on DIA data
* propagated file subsetting to empirical library generation
* fixes DIA-NN conversion when no ambiguous protein groups are present
jspaezp authored Feb 23, 2024
1 parent fd20331 commit 9f4cd5b
Showing 4 changed files with 84 additions and 28 deletions.
50 changes: 31 additions & 19 deletions bin/diann_convert.py
@@ -9,7 +9,7 @@
import logging
import os
import re
from dataclasses import dataclass
import warnings
from pathlib import Path
from typing import Any, List, Tuple, Dict, Set, Union

@@ -442,9 +442,9 @@ def mztab_MTD(index_ref, dia_params, fasta, charge, missed_cleavages):
out_mztab_MTD.loc[1, "title"] = "ConsensusMap export from OpenMS"
out_mztab_MTD.loc[1, "description"] = "OpenMS export from consensusXML"
out_mztab_MTD.loc[1, "protein_search_engine_score[1]"] = "[, , DIA-NN Global.PG.Q.Value, ]"
out_mztab_MTD.loc[
1, "peptide_search_engine_score[1]"
] = "[, , DIA-NN Q.Value (minimum of the respective precursor q-values), ]"
out_mztab_MTD.loc[1, "peptide_search_engine_score[1]"] = (
"[, , DIA-NN Q.Value (minimum of the respective precursor q-values), ]"
)
out_mztab_MTD.loc[1, "psm_search_engine_score[1]"] = "[MS, MS:MS:1001869, protein-level q-value, ]"
out_mztab_MTD.loc[1, "software[1]"] = "[MS, MS:1003253, DIA-NN, Release (v1.8.1)]"
out_mztab_MTD.loc[1, "software[1]-setting[1]"] = fasta
@@ -486,19 +486,25 @@ def mztab_MTD(index_ref, dia_params, fasta, charge, missed_cleavages):
out_mztab_MTD.loc[1, "ms_run[" + str(i) + "]-location"] = (
"file://" + index_ref[index_ref["ms_run"] == i]["Spectra_Filepath"].values[0]
)
out_mztab_MTD.loc[
1, "ms_run[" + str(i) + "]-id_format"
] = "[MS, MS:1000777, spectrum identifier nativeID format, ]"
out_mztab_MTD.loc[1, "ms_run[" + str(i) + "]-id_format"] = (
"[MS, MS:1000777, spectrum identifier nativeID format, ]"
)
out_mztab_MTD.loc[1, "assay[" + str(i) + "]-quantification_reagent"] = "[MS, MS:1002038, unlabeled sample, ]"
out_mztab_MTD.loc[1, "assay[" + str(i) + "]-ms_run_ref"] = "ms_run[" + str(i) + "]"

for i in range(1, max(index_ref["study_variable"]) + 1):
study_variable = []
for j in list(index_ref[index_ref["study_variable"] == i]["ms_run"].values):
study_variable.append("assay[" + str(j) + "]")
out_mztab_MTD.loc[1, "study_variable[" + str(i) + "]-assay_refs"] = ",".join(study_variable)
out_mztab_MTD.loc[1, "study_variable[" + str(i) + "]-description"] = "no description given"

with warnings.catch_warnings():
warnings.simplefilter("ignore")
# Ignore pandas performance warnings triggered by the repeated .loc insertions below.
for i in range(1, max(index_ref["study_variable"]) + 1):
study_variable = []
for j in list(index_ref[index_ref["study_variable"] == i]["ms_run"].values):
study_variable.append("assay[" + str(j) + "]")
out_mztab_MTD.loc[1, "study_variable[" + str(i) + "]-assay_refs"] = ",".join(study_variable)
out_mztab_MTD.loc[1, "study_variable[" + str(i) + "]-description"] = "no description given"

# The loop above produces a highly fragmented frame; .copy() consolidates
# it into a more compact layout in memory.
out_mztab_MTD = out_mztab_MTD.copy()
out_mztab_MTD.loc[2, :] = "MTD"

# Transpose out_mztab_MTD
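
The change above combines two pandas idioms: silencing the PerformanceWarning only around the column-by-column .loc insertions, and defragmenting the resulting frame with .copy(). A minimal standalone sketch (the frame and column names here are illustrative, not taken from the pipeline):

import warnings
import pandas as pd

mtd = pd.DataFrame()
with warnings.catch_warnings():
    # Suppress the PerformanceWarning pandas emits when many columns are
    # inserted one at a time via .loc, which fragments the internal blocks.
    warnings.simplefilter("ignore")
    for i in range(1, 4):
        mtd.loc[1, f"assay[{i}]-ms_run_ref"] = f"ms_run[{i}]"

# .copy() consolidates the fragmented blocks into a compact frame before
# further rows (e.g. the "MTD" prefix row) are added.
mtd = mtd.copy()
mtd.loc[2, :] = "MTD"
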
@@ -552,8 +558,9 @@ def mztab_PRH(report, pg, index_ref, database, fasta_df):
pg["opt_global_result_type"] = "single_protein"
pg.loc[pg["Protein.Ids"].str.contains(";"), "opt_global_result_type"] = "indistinguishable_protein_group"

out_mztab_PRH = pd.DataFrame()
out_mztab_PRH = pg.drop(["Protein.Names"], axis=1)
out_mztab_PRH = pg
del pg
out_mztab_PRH = out_mztab_PRH.drop(["Protein.Names"], axis=1)
out_mztab_PRH.rename(
columns={"Protein.Group": "accession", "First.Protein.Description": "description"}, inplace=True
)
@@ -580,9 +587,14 @@ def mztab_PRH(report, pg, index_ref, database, fasta_df):
protein_details_df = (
protein_details_df.drop("accession", axis=1).join(prh_series).reset_index().drop(columns="index")
)
protein_details_df.loc[:, "col"] = "protein_details"
# protein_details_df = protein_details_df[-protein_details_df["accession"].str.contains("-")]
out_mztab_PRH = pd.concat([out_mztab_PRH, protein_details_df]).reset_index(drop=True)
if len(protein_details_df) > 0:
logger.info(f"Found {len(protein_details_df)} indistinguishable protein groups")
# The following assignment fails if there are no indistinguishable protein groups
protein_details_df.loc[:, "col"] = "protein_details"
# protein_details_df = protein_details_df[-protein_details_df["accession"].str.contains("-")]
out_mztab_PRH = pd.concat([out_mztab_PRH, protein_details_df]).reset_index(drop=True)
else:
logger.info("No indistinguishable protein groups found")

logger.debug("Calculating protein coverage (bottleneck)...")
# This is a bottleneck
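
The new guard in mztab_PRH can be read as the following sketch, with a hypothetical helper name (append_protein_details does not exist in the pipeline); it only makes the empty-frame failure mode explicit:

import logging
import pandas as pd

logger = logging.getLogger(__name__)

def append_protein_details(out_prh: pd.DataFrame, details: pd.DataFrame) -> pd.DataFrame:
    # Only label and concatenate the indistinguishable-group rows when any
    # exist; per the comment in the diff, the .loc column assignment is what
    # fails on an empty frame.
    if len(details) > 0:
        logger.info(f"Found {len(details)} indistinguishable protein groups")
        details.loc[:, "col"] = "protein_details"
        return pd.concat([out_prh, details]).reset_index(drop=True)
    logger.info("No indistinguishable protein groups found")
    return out_prh
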
37 changes: 34 additions & 3 deletions bin/mzml_statistics.py
@@ -63,6 +63,9 @@ def parse_mzml(file_name: str, file_columns: list):

def parse_bruker_d(file_name: str, file_columns: list):
sql_filepath = f"{file_name}/analysis.tdf"
if not Path(sql_filepath).exists():
msg = f"File '{sql_filepath}' not found"
raise FileNotFoundError(msg)
conn = sqlite3.connect(sql_filepath)
c = conn.cursor()

@@ -77,13 +80,21 @@ def parse_bruker_d(file_name: str, file_columns: list):
mslevel_map = {0: 1, 8: 2}
elif 9 in df["MsMsType"].values:
mslevel_map = {0: 1, 9: 2}
else:
msg = f"Unrecognized ms type '{df['MsMsType'].values}'"
raise ValueError(msg)
df["MsMsType"] = df["MsMsType"].map(mslevel_map)

try:
# This line raises an sqlite error if the table does not exist
_ = conn.execute("SELECT * from Precursors LIMIT 1").fetchall()
precursor_df = pd.read_sql_query("SELECT * from Precursors", conn)
except Exception as e:
print(f"No precursers recorded in {file_name}")
precursor_df = pd.DataFrame()
except sqlite3.OperationalError as e:
if "no such table: Precursors" in str(e):
print(f"No precursers recorded in {file_name}, This is normal for DIA data.")
precursor_df = pd.DataFrame()
else:
raise

if len(df) == len(precursor_df):
df = pd.concat([df, precursor_df[["Charge", "MonoisotopicMz"]]], axis=1)
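
The Precursors handling can be isolated into a generic sketch; read_optional_table is a hypothetical helper, and the probe query mirrors the approach of letting sqlite3 itself raise OperationalError when a table is absent (as it is for DIA acquisitions):

import sqlite3
import pandas as pd

def read_optional_table(conn: sqlite3.Connection, table: str) -> pd.DataFrame:
    try:
        # The direct execute raises sqlite3.OperationalError if the table is
        # missing, before pandas gets a chance to wrap the error.
        conn.execute(f"SELECT * FROM {table} LIMIT 1").fetchall()
        return pd.read_sql_query(f"SELECT * FROM {table}", conn)
    except sqlite3.OperationalError as e:
        if f"no such table: {table}" in str(e):
            # An absent table is expected for some acquisition modes; fall
            # back to an empty frame instead of aborting.
            return pd.DataFrame()
        raise
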
@@ -108,10 +119,30 @@

return df

if not Path(ms_path).exists():
print(f"File '{ms_path}' not found, trying to find an alias")
ms_path_path = Path(ms_path)
path_stem = str(ms_path_path.stem)
candidates = (
list(ms_path_path.parent.glob("*.d"))
+ list(ms_path_path.parent.glob("*.mzml"))
+ list(ms_path_path.parent.glob("*.mzML"))
)

candidates = [c for c in candidates if path_stem in str(c)]

if len(candidates) == 1:
ms_path = str(candidates[0].resolve())
else:
raise FileNotFoundError(f"Could not unambiguously resolve '{ms_path}'; candidates: {candidates}")

if Path(ms_path).suffix == ".d" and Path(ms_path).is_dir():
ms_df = parse_bruker_d(ms_path, file_columns)
elif Path(ms_path).suffix in [".mzML", ".mzml"]:
ms_df = parse_mzml(ms_path, file_columns)
else:
msg = f"Unrecognized or inexistent mass spec file '{ms_path}'"
raise RuntimeError(msg)

ms_df.to_csv(
f"{Path(ms_path).stem}_ms_info.tsv",
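
The alias fallback added above amounts to resolving a sibling file that shares the requested stem; resolve_ms_path below is a hypothetical, self-contained version of that logic that only accepts an unambiguous .d or .mzML match:

from pathlib import Path

def resolve_ms_path(ms_path: str) -> str:
    p = Path(ms_path)
    if p.exists():
        return ms_path
    # Look for .d directories or .mzML files in the same folder that share
    # the requested stem, and only accept an unambiguous match.
    candidates = [
        c
        for c in p.parent.iterdir()
        if c.suffix.lower() in {".d", ".mzml"} and p.stem in c.name
    ]
    if len(candidates) == 1:
        return str(candidates[0].resolve())
    raise FileNotFoundError(f"Could not resolve '{ms_path}' (candidates: {candidates})")
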
17 changes: 13 additions & 4 deletions workflows/dia.nf
@@ -75,9 +75,18 @@ workflow DIA {
//
// MODULE: DIANN_PRELIMINARY_ANALYSIS
//
if (params.random_preanalysis){
DIANN_PRELIMINARY_ANALYSIS(ch_file_preparation_results.randomSample(params.empirical_assembly_ms_n, 2024).combine(speclib))
} else{
if (params.random_preanalysis) {
preanalysis_seed = 2024
preanalysis_subset = ch_file_preparation_results
.randomSample(params.empirical_assembly_ms_n, preanalysis_seed)
empirical_lib_files = preanalysis_subset
.map { result -> result[1] }
.collect()
DIANN_PRELIMINARY_ANALYSIS(preanalysis_subset.combine(speclib))
} else {
empirical_lib_files = ch_file_preparation_results
.map { result -> result[1] }
.collect()
DIANN_PRELIMINARY_ANALYSIS(ch_file_preparation_results.combine(speclib))
}
ch_software_versions = ch_software_versions
@@ -88,7 +97,7 @@
//
// Order matters in DIA-NN; this should be sorted for reproducible results.
ASSEMBLE_EMPIRICAL_LIBRARY(
ch_result.ms_file.collect(),
empirical_lib_files,
meta,
DIANN_PRELIMINARY_ANALYSIS.out.diann_quant.collect(),
speclib
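
Outside Nextflow, the seeded subsetting plus file collection behaves roughly like the following Python sketch (subsample_for_library is an illustrative name): a fixed seed keeps the subset reproducible between runs, and only the file element of each (meta, file) pair is forwarded to empirical library assembly.

import random

def subsample_for_library(results, n, seed=2024):
    # results is a list of (meta, ms_file) pairs, mirroring the tuples that
    # flow through ch_file_preparation_results; a fixed seed makes the
    # subset reproducible, like randomSample(n, 2024) in the workflow.
    rng = random.Random(seed)
    subset = rng.sample(results, min(n, len(results)))
    empirical_lib_files = [ms_file for _meta, ms_file in subset]
    return subset, empirical_lib_files
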
8 changes: 6 additions & 2 deletions workflows/quantms.nf
@@ -213,8 +213,12 @@ workflow.onComplete {
if (params.email || params.email_on_fail) {
NfcoreTemplate.email(workflow, params, summary_params, projectDir, log, multiqc_report)
}
NfcoreTemplate.dump_parameters(workflow, params)
NfcoreTemplate.summary(workflow, params, log)
try {
NfcoreTemplate.dump_parameters(workflow, params)
NfcoreTemplate.summary(workflow, params, log)
} catch (Exception e) {
log.error "Error generating summary: ${e}"
}
if (params.hook_url) {
NfcoreTemplate.IM_notification(workflow, params, summary_params, projectDir, log)
}
