Bugfix/aws staging (#9)
* bugfix: mzML stats on DIA data
* propagated file subsetting to empirical library generation
* fixes DIA-NN conversion when no ambiguous protein groups are present
jspaezp authored Feb 23, 2024
1 parent fd20331 commit 9f4cd5b
Showing 4 changed files with 84 additions and 28 deletions.
50 changes: 31 additions & 19 deletions bin/diann_convert.py
@@ -9,7 +9,7 @@
import logging
import os
import re
from dataclasses import dataclass
import warnings
from pathlib import Path
from typing import Any, List, Tuple, Dict, Set, Union

@@ -442,9 +442,9 @@ def mztab_MTD(index_ref, dia_params, fasta, charge, missed_cleavages):
out_mztab_MTD.loc[1, "title"] = "ConsensusMap export from OpenMS"
out_mztab_MTD.loc[1, "description"] = "OpenMS export from consensusXML"
out_mztab_MTD.loc[1, "protein_search_engine_score[1]"] = "[, , DIA-NN Global.PG.Q.Value, ]"
out_mztab_MTD.loc[
1, "peptide_search_engine_score[1]"
] = "[, , DIA-NN Q.Value (minimum of the respective precursor q-values), ]"
out_mztab_MTD.loc[1, "peptide_search_engine_score[1]"] = (
"[, , DIA-NN Q.Value (minimum of the respective precursor q-values), ]"
)
out_mztab_MTD.loc[1, "psm_search_engine_score[1]"] = "[MS, MS:MS:1001869, protein-level q-value, ]"
out_mztab_MTD.loc[1, "software[1]"] = "[MS, MS:1003253, DIA-NN, Release (v1.8.1)]"
out_mztab_MTD.loc[1, "software[1]-setting[1]"] = fasta
@@ -486,19 +486,25 @@ def mztab_MTD(index_ref, dia_params, fasta, charge, missed_cleavages):
out_mztab_MTD.loc[1, "ms_run[" + str(i) + "]-location"] = (
"file://" + index_ref[index_ref["ms_run"] == i]["Spectra_Filepath"].values[0]
)
out_mztab_MTD.loc[
1, "ms_run[" + str(i) + "]-id_format"
] = "[MS, MS:1000777, spectrum identifier nativeID format, ]"
out_mztab_MTD.loc[1, "ms_run[" + str(i) + "]-id_format"] = (
"[MS, MS:1000777, spectrum identifier nativeID format, ]"
)
out_mztab_MTD.loc[1, "assay[" + str(i) + "]-quantification_reagent"] = "[MS, MS:1002038, unlabeled sample, ]"
out_mztab_MTD.loc[1, "assay[" + str(i) + "]-ms_run_ref"] = "ms_run[" + str(i) + "]"

for i in range(1, max(index_ref["study_variable"]) + 1):
study_variable = []
for j in list(index_ref[index_ref["study_variable"] == i]["ms_run"].values):
study_variable.append("assay[" + str(j) + "]")
out_mztab_MTD.loc[1, "study_variable[" + str(i) + "]-assay_refs"] = ",".join(study_variable)
out_mztab_MTD.loc[1, "study_variable[" + str(i) + "]-description"] = "no description given"

with warnings.catch_warnings():
warnings.simplefilter("ignore")
# Ignore pandas performance warnings triggered by the repeated .loc insertions below.
for i in range(1, max(index_ref["study_variable"]) + 1):
study_variable = []
for j in list(index_ref[index_ref["study_variable"] == i]["ms_run"].values):
study_variable.append("assay[" + str(j) + "]")
out_mztab_MTD.loc[1, "study_variable[" + str(i) + "]-assay_refs"] = ",".join(study_variable)
out_mztab_MTD.loc[1, "study_variable[" + str(i) + "]-description"] = "no description given"

# The loop above produces a highly fragmented frame; .copy() consolidates
# it into a more compact layout in memory.
out_mztab_MTD = out_mztab_MTD.copy()
out_mztab_MTD.loc[2, :] = "MTD"

# Transpose out_mztab_MTD
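
The change above combines two pandas idioms: silencing the PerformanceWarning only around the column-by-column .loc insertions, and defragmenting the resulting frame with .copy(). A minimal standalone sketch (the frame and column names here are illustrative, not taken from the pipeline):

import warnings
import pandas as pd

mtd = pd.DataFrame()
with warnings.catch_warnings():
    # Suppress the PerformanceWarning pandas emits when many columns are
    # inserted one at a time via .loc, which fragments the internal blocks.
    warnings.simplefilter("ignore")
    for i in range(1, 4):
        mtd.loc[1, f"assay[{i}]-ms_run_ref"] = f"ms_run[{i}]"

# .copy() consolidates the fragmented blocks into a compact frame before
# further rows (e.g. the "MTD" prefix row) are added.
mtd = mtd.copy()
mtd.loc[2, :] = "MTD"
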
@@ -552,8 +558,9 @@ def mztab_PRH(report, pg, index_ref, database, fasta_df):
pg["opt_global_result_type"] = "single_protein"
pg.loc[pg["Protein.Ids"].str.contains(";"), "opt_global_result_type"] = "indistinguishable_protein_group"

out_mztab_PRH = pd.DataFrame()
out_mztab_PRH = pg.drop(["Protein.Names"], axis=1)
out_mztab_PRH = pg
del pg
out_mztab_PRH = out_mztab_PRH.drop(["Protein.Names"], axis=1)
out_mztab_PRH.rename(
columns={"Protein.Group": "accession", "First.Protein.Description": "description"}, inplace=True
)
@@ -580,9 +587,14 @@ def mztab_PRH(report, pg, index_ref, database, fasta_df):
protein_details_df = (
protein_details_df.drop("accession", axis=1).join(prh_series).reset_index().drop(columns="index")
)
protein_details_df.loc[:, "col"] = "protein_details"
# protein_details_df = protein_details_df[-protein_details_df["accession"].str.contains("-")]
out_mztab_PRH = pd.concat([out_mztab_PRH, protein_details_df]).reset_index(drop=True)
if len(protein_details_df) > 0:
logger.info(f"Found {len(protein_details_df)} indistinguishable protein groups")
# The following assignment fails if there are no indistinguishable protein groups
protein_details_df.loc[:, "col"] = "protein_details"
# protein_details_df = protein_details_df[-protein_details_df["accession"].str.contains("-")]
out_mztab_PRH = pd.concat([out_mztab_PRH, protein_details_df]).reset_index(drop=True)
else:
logger.info("No indistinguishable protein groups found")

logger.debug("Calculating protein coverage (bottleneck)...")
# This is a bottleneck
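
The new guard in mztab_PRH can be read as the following sketch, with a hypothetical helper name (append_protein_details does not exist in the pipeline); it only makes the empty-frame failure mode explicit:

import logging
import pandas as pd

logger = logging.getLogger(__name__)

def append_protein_details(out_prh: pd.DataFrame, details: pd.DataFrame) -> pd.DataFrame:
    # Only label and concatenate the indistinguishable-group rows when any
    # exist; per the comment in the diff, the .loc column assignment is what
    # fails on an empty frame.
    if len(details) > 0:
        logger.info(f"Found {len(details)} indistinguishable protein groups")
        details.loc[:, "col"] = "protein_details"
        return pd.concat([out_prh, details]).reset_index(drop=True)
    logger.info("No indistinguishable protein groups found")
    return out_prh
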
37 changes: 34 additions & 3 deletions bin/mzml_statistics.py
@@ -63,6 +63,9 @@ def parse_mzml(file_name: str, file_columns: list):

def parse_bruker_d(file_name: str, file_columns: list):
sql_filepath = f"{file_name}/analysis.tdf"
if not Path(sql_filepath).exists():
msg = f"File '{sql_filepath}' not found"
raise FileNotFoundError(msg)
conn = sqlite3.connect(sql_filepath)
c = conn.cursor()

@@ -77,13 +80,21 @@ def parse_bruker_d(file_name: str, file_columns: list):
mslevel_map = {0: 1, 8: 2}
elif 9 in df["MsMsType"].values:
mslevel_map = {0: 1, 9: 2}
else:
msg = f"Unrecognized ms type '{df['MsMsType'].values}'"
raise ValueError(msg)
df["MsMsType"] = df["MsMsType"].map(mslevel_map)

try:
# This line raises an sqlite error if the table does not exist
_ = conn.execute("SELECT * from Precursors LIMIT 1").fetchall()
precursor_df = pd.read_sql_query("SELECT * from Precursors", conn)
except Exception as e:
print(f"No precursers recorded in {file_name}")
precursor_df = pd.DataFrame()
except sqlite3.OperationalError as e:
if "no such table: Precursors" in str(e):
print(f"No precursers recorded in {file_name}, This is normal for DIA data.")
precursor_df = pd.DataFrame()
else:
raise

if len(df) == len(precursor_df):
df = pd.concat([df, precursor_df[["Charge", "MonoisotopicMz"]]], axis=1)
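
The Precursors handling can be isolated into a generic sketch; read_optional_table is a hypothetical helper, and the probe query mirrors the approach of letting sqlite3 itself raise OperationalError when a table is absent (as it is for DIA acquisitions):

import sqlite3
import pandas as pd

def read_optional_table(conn: sqlite3.Connection, table: str) -> pd.DataFrame:
    try:
        # The direct execute raises sqlite3.OperationalError if the table is
        # missing, before pandas gets a chance to wrap the error.
        conn.execute(f"SELECT * FROM {table} LIMIT 1").fetchall()
        return pd.read_sql_query(f"SELECT * FROM {table}", conn)
    except sqlite3.OperationalError as e:
        if f"no such table: {table}" in str(e):
            # An absent table is expected for some acquisition modes; fall
            # back to an empty frame instead of aborting.
            return pd.DataFrame()
        raise
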
@@ -108,10 +119,30 @@

return df

if not Path(ms_path).exists():
print(f"File '{ms_path}' not found, trying to find an alias")
ms_path_path = Path(ms_path)
path_stem = str(ms_path_path.stem)
candidates = (
list(ms_path_path.parent.glob("*.d"))
+ list(ms_path_path.parent.glob("*.mzml"))
+ list(ms_path_path.parent.glob("*.mzML"))
)

candidates = [c for c in candidates if path_stem in str(c)]

if len(candidates) == 1:
ms_path = str(candidates[0].resolve())
else:
raise FileNotFoundError(f"Could not unambiguously resolve '{ms_path}'; candidates: {candidates}")

if Path(ms_path).suffix == ".d" and Path(ms_path).is_dir():
ms_df = parse_bruker_d(ms_path, file_columns)
elif Path(ms_path).suffix in [".mzML", ".mzml"]:
ms_df = parse_mzml(ms_path, file_columns)
else:
msg = f"Unrecognized or inexistent mass spec file '{ms_path}'"
raise RuntimeError(msg)

ms_df.to_csv(
f"{Path(ms_path).stem}_ms_info.tsv",
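
The alias fallback added above amounts to resolving a sibling file that shares the requested stem; resolve_ms_path below is a hypothetical, self-contained version of that logic that only accepts an unambiguous .d or .mzML match:

from pathlib import Path

def resolve_ms_path(ms_path: str) -> str:
    p = Path(ms_path)
    if p.exists():
        return ms_path
    # Look for .d directories or .mzML files in the same folder that share
    # the requested stem, and only accept an unambiguous match.
    candidates = [
        c
        for c in p.parent.iterdir()
        if c.suffix.lower() in {".d", ".mzml"} and p.stem in c.name
    ]
    if len(candidates) == 1:
        return str(candidates[0].resolve())
    raise FileNotFoundError(f"Could not resolve '{ms_path}' (candidates: {candidates})")
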
17 changes: 13 additions & 4 deletions workflows/dia.nf
@@ -75,9 +75,18 @@ workflow DIA {
//
// MODULE: DIANN_PRELIMINARY_ANALYSIS
//
if (params.random_preanalysis){
DIANN_PRELIMINARY_ANALYSIS(ch_file_preparation_results.randomSample(params.empirical_assembly_ms_n, 2024).combine(speclib))
} else{
if (params.random_preanalysis) {
preanalysis_seed = 2024
preanalysis_subset = ch_file_preparation_results
.randomSample(params.empirical_assembly_ms_n, preanalysis_seed)
empirical_lib_files = preanalysis_subset
.map { result -> result[1] }
.collect()
DIANN_PRELIMINARY_ANALYSIS(preanalysis_subset.combine(speclib))
} else {
empirical_lib_files = ch_file_preparation_results
.map { result -> result[1] }
.collect()
DIANN_PRELIMINARY_ANALYSIS(ch_file_preparation_results.combine(speclib))
}
ch_software_versions = ch_software_versions
@@ -88,7 +97,7 @@
//
// Order matters in DIA-NN; this should be sorted for reproducible results.
ASSEMBLE_EMPIRICAL_LIBRARY(
ch_result.ms_file.collect(),
empirical_lib_files,
meta,
DIANN_PRELIMINARY_ANALYSIS.out.diann_quant.collect(),
speclib
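
Outside Nextflow, the seeded subsetting plus file collection behaves roughly like the following Python sketch (subsample_for_library is an illustrative name): a fixed seed keeps the subset reproducible between runs, and only the file element of each (meta, file) pair is forwarded to empirical library assembly.

import random

def subsample_for_library(results, n, seed=2024):
    # results is a list of (meta, ms_file) pairs, mirroring the tuples that
    # flow through ch_file_preparation_results; a fixed seed makes the
    # subset reproducible, like randomSample(n, 2024) in the workflow.
    rng = random.Random(seed)
    subset = rng.sample(results, min(n, len(results)))
    empirical_lib_files = [ms_file for _meta, ms_file in subset]
    return subset, empirical_lib_files
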
8 changes: 6 additions & 2 deletions workflows/quantms.nf
@@ -213,8 +213,12 @@ workflow.onComplete {
if (params.email || params.email_on_fail) {
NfcoreTemplate.email(workflow, params, summary_params, projectDir, log, multiqc_report)
}
NfcoreTemplate.dump_parameters(workflow, params)
NfcoreTemplate.summary(workflow, params, log)
try {
NfcoreTemplate.dump_parameters(workflow, params)
NfcoreTemplate.summary(workflow, params, log)
} catch (Exception e) {
log.error "Error generating summary: ${e}"
}
if (params.hook_url) {
NfcoreTemplate.IM_notification(workflow, params, summary_params, projectDir, log)
}
