Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MsPathFinderT result reading #775

Merged
merged 16 commits into from
Jul 19, 2024
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion mzLib/Chemistry/ChemicalFormula.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,11 @@
using MzLibUtil;
using System;
using System.Collections.Generic;
using System.ComponentModel.DataAnnotations.Schema;
using System.Globalization;
using System.Linq;
using System.Text;
using System.Text.Json.Serialization;
using System.Text.RegularExpressions;

namespace Chemistry
Expand Down Expand Up @@ -65,7 +67,7 @@ public ChemicalFormula(IHasChemicalFormula capFormula)
Elements = new Dictionary<Element, int>(capFormula.ThisChemicalFormula.Elements);
}

public ChemicalFormula ThisChemicalFormula => this;
[JsonIgnore] public ChemicalFormula ThisChemicalFormula => this;

/// <summary>
/// Gets the average mass of this chemical formula
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
using CsvHelper.Configuration.Attributes;
using CsvHelper.Configuration;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Chemistry;

namespace Readers
{
/// <summary>
/// One row of an MsPathFinderT tab-separated result file.
/// Each mapped property is bound by CsvHelper to the header column named in its <c>[Name]</c> attribute;
/// the declaration order below is also the column order used when the record is written back out.
/// </summary>
public class MsPathFinderTResult
{
    /// <summary>
    /// CsvHelper configuration shared by readers and writers of this record type:
    /// tab delimited, header row expected, blank lines skipped, bad-data callbacks disabled.
    /// </summary>
    public static CsvConfiguration CsvConfiguration { get; } = new CsvConfiguration(System.Globalization.CultureInfo.InvariantCulture)
    {
        Delimiter = "\t",
        HasHeaderRecord = true,
        IgnoreBlankLines = true,
        TrimOptions = CsvHelper.Configuration.TrimOptions.InsideQuotes,
        BadDataFound = null,
    };

    /// <summary> Value of the "Scan" column. </summary>
    [Name("Scan")]
    public int OneBasedScanNumber { get; set; }

    /// <summary> Value of the "Pre" column (a single character, '-' in some rows of the sample files). </summary>
    [Name("Pre")]
    public char PreviousResidue { get; set; }

    /// <summary> Value of the "Sequence" column. </summary>
    [Name("Sequence")]
    public string BaseSequence { get; set; }

    /// <summary> Value of the "Post" column (a single character, '-' in some rows of the sample files). </summary>
    [Name("Post")]
    public char NextResidue { get; set; }

    /// <summary> Raw text of the "Modifications" column. </summary>
    [Name("Modifications")]
    public string Modifications { get; set; }

    /// <summary>
    /// Value of the "Composition" column, converted to and from a <see cref="Chemistry.ChemicalFormula"/>
    /// by <c>MsPathFinderTCompositionToChemicalFormulaConverter</c>.
    /// </summary>
    [Name("Composition")]
    [TypeConverter(typeof(MsPathFinderTCompositionToChemicalFormulaConverter))]
    public ChemicalFormula ChemicalFormula { get; set; }

    /// <summary> Value of the "ProteinName" column; <see cref="Accession"/> and <see cref="IsDecoy"/> are derived from it. </summary>
    [Name("ProteinName")]
    public string ProteinName { get; set; }

    /// <summary> Value of the "ProteinDesc" column. </summary>
    [Name("ProteinDesc")]
    public string ProteinDescription { get; set; }

    /// <summary> Value of the "ProteinLength" column. </summary>
    [Name("ProteinLength")]
    public int Length { get; set; }

    /// <summary> Value of the "Start" column. </summary>
    [Name("Start")]
    public int OneBasedStartResidue { get; set; }

    /// <summary> Value of the "End" column. </summary>
    [Name("End")]
    public int OneBasedEndResidue { get; set; }

    /// <summary> Value of the "Charge" column. </summary>
    [Name("Charge")]
    public int Charge { get; set; }

    /// <summary> Value of the "MostAbundantIsotopeMz" column. </summary>
    [Name("MostAbundantIsotopeMz")]
    public double MostAbundantIsotopeMz { get; set; }

    /// <summary> Value of the "Mass" column. </summary>
    [Name("Mass")]
    public double MonoisotopicMass { get; set; }

    /// <summary> Value of the "Ms1Features" column. </summary>
    [Name("Ms1Features")]
    public int Ms1Features { get; set; }

    /// <summary> Value of the "#MatchedFragments" column. </summary>
    [Name("#MatchedFragments")]
    public int NumberOfMatchedFragments { get; set; }

    /// <summary> Value of the "Probability" column. </summary>
    [Name("Probability")]
    public double Probability { get; set; }

    /// <summary> Value of the "SpecEValue" column. </summary>
    [Name("SpecEValue")]
    public double SpecEValue { get; set; }

    /// <summary> Value of the "EValue" column. </summary>
    [Name("EValue")]
    public double EValue { get; set; }

    /// <summary> Value of the "QValue" column; optional because not every result file carries it. </summary>
    [Name("QValue")]
    [Optional]
    public double QValue { get; set; }

    /// <summary> Value of the "PepQValue" column; optional because not every result file carries it. </summary>
    [Name("PepQValue")]
    [Optional]
    public double PepQValue { get; set; }

    #region InterpretedFields

    [Ignore] private string _accession = null;

    /// <summary>
    /// Accession parsed from <see cref="ProteinName"/>: the text between the first and second '|',
    /// e.g. "P05114" for "sp|P05114|HMGN1_HUMAN". When the name contains no '|' the whole trimmed
    /// name is used instead of throwing. The value is computed once and cached; while
    /// <see cref="ProteinName"/> is still null or empty an empty string is returned and nothing is cached.
    /// </summary>
    [Ignore]
    public string Accession
    {
        get
        {
            if (_accession != null)
                return _accession;

            // Do not cache a placeholder: ProteinName may simply not have been assigned yet.
            if (string.IsNullOrEmpty(ProteinName))
                return string.Empty;

            var parts = ProteinName.Split('|');
            _accession = (parts.Length > 1 ? parts[1] : ProteinName).Trim();
            return _accession;
        }
    }

    [Ignore] private bool? _isDecoy = null;

    /// <summary>
    /// True when <see cref="ProteinName"/> starts with the "XXX" prefix (ordinal comparison).
    /// The value is computed once and cached; while <see cref="ProteinName"/> is still null
    /// false is returned and nothing is cached.
    /// </summary>
    [Ignore]
    public bool IsDecoy
    {
        get
        {
            if (_isDecoy.HasValue)
                return _isDecoy.Value;

            if (ProteinName == null)
                return false;

            // Ordinal: the prefix is a machine-written marker, not linguistic text.
            _isDecoy = ProteinName.StartsWith("XXX", StringComparison.Ordinal);
            return _isDecoy.Value;
        }
    }

    /// <summary>
    /// Optional column; when it is absent from the file the property is left unset by the reader.
    /// </summary>
    [Optional] public string FileNameWithoutExtension { get; set; }

    #endregion
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
using CsvHelper;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Easy.Common.Extensions;

namespace Readers
{
/// <summary>
/// Reads and writes MsPathFinderT result files as lists of <see cref="MsPathFinderTResult"/>.
/// </summary>
public class MsPathFinderTResultFile : ResultFile<MsPathFinderTResult>, IResultFile
{
    public override SupportedFileType FileType { get; }
    public override Software Software { get; set; }

    /// <summary>
    /// Creates a result file bound to <paramref name="filePath"/>; the file type is parsed from the path.
    /// </summary>
    public MsPathFinderTResultFile(string filePath) : base(filePath, Software.MsPathFinderT)
    {
        FileType = filePath.ParseFileType();
    }

    /// <summary>
    /// Parameterless constructor. Falls back to <see cref="SupportedFileType.MsPathFinderTAllResults"/>
    /// when no file path is available to parse.
    /// </summary>
    public MsPathFinderTResultFile() : base()
    {
        FileType = FilePath.IsNullOrEmpty() ? SupportedFileType.MsPathFinderTAllResults : FilePath.ParseFileType();
    }

    /// <summary>
    /// Loads every record from <c>FilePath</c> into <c>Results</c>. If the first record carries no
    /// file name, every record is stamped with the file's name minus its extension and minus its
    /// last '_'-separated segment (e.g. the "_IcTarget" suffix).
    /// </summary>
    public override void LoadResults()
    {
        // The reader gets its own using so it is released even if the CsvReader constructor throws.
        using var reader = new StreamReader(FilePath);
        using var csv = new CsvReader(reader, MsPathFinderTResult.CsvConfiguration);
        Results = csv.GetRecords<MsPathFinderTResult>().ToList();

        if (Results.Any() && Results.First().FileNameWithoutExtension.IsNullOrEmpty())
        {
            // Identical for every record, so compute it once rather than once per result.
            string fileName = string.Join("_", Path.GetFileNameWithoutExtension(FilePath).Split('_')[..^1]);
            Results.ForEach(p => p.FileNameWithoutExtension = fileName);
        }
    }

    /// <summary>
    /// Writes the header and all <c>Results</c> to <paramref name="outputPath"/>, appending this file
    /// type's extension when <c>CanRead</c> rejects the path as given.
    /// </summary>
    public override void WriteResults(string outputPath)
    {
        if (!CanRead(outputPath))
        {
            outputPath += FileType.GetFileExtension();
        }

        // The writer gets its own using so the file handle is released even if the CsvWriter constructor throws.
        using var writer = new StreamWriter(File.Create(outputPath));
        using var csv = new CsvWriter(writer, MsPathFinderTResult.CsvConfiguration);

        csv.WriteHeader<MsPathFinderTResult>();
        foreach (var result in Results)
        {
            csv.NextRecord();
            csv.WriteRecord(result);
        }
    }
}
}
70 changes: 66 additions & 4 deletions mzLib/Readers/Util/Converters.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,15 @@
using CsvHelper.Configuration;
using CsvHelper.TypeConversion;
using MzLibUtil;
using System.Text;

namespace Readers
{
/// <summary>
/// Converts a semicolon-delimited string of doubles to a list of doubles
/// To be used with CsvHelper
/// </summary>
public class SemicolonDelimitedToDoubleListConverter : DefaultTypeConverter
internal class SemicolonDelimitedToDoubleListConverter : DefaultTypeConverter
{
public override object ConvertFromString(string text, IReaderRow row, MemberMapData memberMapData)
{
Expand All @@ -24,7 +25,7 @@ public override string ConvertToString(object value, IWriterRow row, MemberMapDa
}
}

public class DashToNullOrDoubleConverter : DefaultTypeConverter
internal class DashToNullOrDoubleConverter : DefaultTypeConverter
{
public override object ConvertFromString(string text, IReaderRow row, MemberMapData memberMapData)
{
Expand All @@ -37,7 +38,7 @@ public override string ConvertToString(object value, IWriterRow row, MemberMapDa
}
}

public class CommaDelimitedToIntegerArrayTypeConverter : DefaultTypeConverter
internal class CommaDelimitedToIntegerArrayTypeConverter : DefaultTypeConverter
{
public override object ConvertFromString(string text, IReaderRow row, MemberMapData memberMapData)
{
Expand All @@ -53,7 +54,7 @@ public override string ConvertToString(object value, IWriterRow row, MemberMapDa
}
}

public class CommaDelimitedToStringArrayTypeConverter : DefaultTypeConverter
internal class CommaDelimitedToStringArrayTypeConverter : DefaultTypeConverter
{
public override object ConvertFromString(string text, IReaderRow row, MemberMapData memberMapData)
{
Expand All @@ -69,4 +70,65 @@ public override string ConvertToString(object value, IWriterRow row, MemberMapDa
return string.Join(',', list);
}
}

/// <summary>
/// Converts the chemical formula from MsPathFinderT to MetaMorpheus and back.
/// MsPathFinderT: "C(460) H(740) N(136) O(146) S(1)"
/// MetaMorpheus: "C460H740N136O146S"
/// To be used with CsvHelper
/// </summary>
internal class MsPathFinderTCompositionToChemicalFormulaConverter : DefaultTypeConverter
{
    /// <summary>
    /// Parses space-separated "Symbol(count)" tokens into a <see cref="Chemistry.ChemicalFormula"/>.
    /// Null, empty or whitespace-only text yields an empty formula.
    /// </summary>
    public override object ConvertFromString(string text, IReaderRow row, MemberMapData memberMapData)
    {
        var chemicalFormula = new Chemistry.ChemicalFormula();
        if (string.IsNullOrWhiteSpace(text))
            return chemicalFormula;

        foreach (var element in text.Split(' ', StringSplitOptions.RemoveEmptyEntries))
        {
            var elementSplit = element.Split('(');
            var elementName = elementSplit[0];
            // Counts are machine-written integers, so parse them independently of the current culture.
            var elementCount = int.Parse(elementSplit[1].Replace(")", ""), System.Globalization.CultureInfo.InvariantCulture);
            chemicalFormula.Add(elementName, elementCount);
        }
        return chemicalFormula;
    }

    /// <summary>
    /// Rewrites the formula string (e.g. "C2HN") in MsPathFinderT form ("C(2) H(1) N(1)").
    /// An element symbol is taken to start at each upper-case letter; a symbol that is not
    /// followed by digits is written with an explicit count of 1. An empty formula yields an empty string.
    /// </summary>
    /// <exception cref="Exception">Thrown when <paramref name="value"/> is not a ChemicalFormula.</exception>
    public override string ConvertToString(object value, IWriterRow row, MemberMapData memberMapData)
    {
        var chemicalFormula = value as Chemistry.ChemicalFormula ?? throw new Exception("Cannot convert input to ChemicalFormula");
        var sb = new StringBuilder();

        bool onNumber = false;
        foreach (var character in chemicalFormula.Formula)
        {
            if (char.IsDigit(character))
            {
                if (!onNumber)
                {
                    sb.Append('(');
                    onNumber = true;
                }
                sb.Append(character);
            }
            else
            {
                if (onNumber)
                {
                    // a count just ended: close it and separate from the next symbol
                    sb.Append(") ");
                    onNumber = false;
                }
                else if (char.IsUpper(character) && sb.Length > 0)
                {
                    // a new symbol starts right after a symbol that had no digits: implied count of 1
                    sb.Append("(1) ");
                }
                sb.Append(character);
            }
        }

        if (sb.Length == 0)
            return string.Empty;

        // close the final count, or supply the implied 1 for a trailing bare symbol
        sb.Append(onNumber ? ")" : "(1)");
        return sb.ToString();
    }
}
}
4 changes: 2 additions & 2 deletions mzLib/Readers/Util/Software.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@

namespace Readers
namespace Readers
{
public enum Software
{
Expand All @@ -11,5 +10,6 @@ public enum Software
MaxQuant,
Toppic,
MsFragger, // files tested were from fragpipe v21.1
MsPathFinderT,
}
}
13 changes: 12 additions & 1 deletion mzLib/Readers/Util/SupportedFileTypes.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ public enum SupportedFileType
MsFraggerPsm,
MsFraggerPeptide,
MsFraggerProtein,
MsPathFinderTTargets,
MsPathFinderTDecoys,
MsPathFinderTAllResults,
}

public static class SupportedFileTypeExtensions
Expand Down Expand Up @@ -54,10 +57,12 @@ public static string GetFileExtension(this SupportedFileType type)
SupportedFileType.MsFraggerPsm => "psm.tsv",
SupportedFileType.MsFraggerPeptide => "peptide.tsv",
SupportedFileType.MsFraggerProtein => "protein.tsv",
SupportedFileType.MsPathFinderTTargets => "_IcTarget.tsv",
SupportedFileType.MsPathFinderTDecoys => "_IcDecoy.tsv",
SupportedFileType.MsPathFinderTAllResults => "_IcTDA.tsv",
_ => throw new MzLibException("File type not supported")
};
}

public static SupportedFileType ParseFileType(this string filePath)
{
switch (Path.GetExtension(filePath).ToLower())
Expand Down Expand Up @@ -99,6 +104,12 @@ public static SupportedFileType ParseFileType(this string filePath)
return SupportedFileType.MsFraggerPeptide;
if (filePath.EndsWith(SupportedFileType.MsFraggerProtein.GetFileExtension(), StringComparison.InvariantCultureIgnoreCase))
return SupportedFileType.MsFraggerProtein;
if (filePath.EndsWith(SupportedFileType.MsPathFinderTTargets.GetFileExtension(), StringComparison.InvariantCultureIgnoreCase))
return SupportedFileType.MsPathFinderTTargets;
if (filePath.EndsWith(SupportedFileType.MsPathFinderTDecoys.GetFileExtension(), StringComparison.InvariantCultureIgnoreCase))
return SupportedFileType.MsPathFinderTDecoys;
if (filePath.EndsWith(SupportedFileType.MsPathFinderTAllResults.GetFileExtension(), StringComparison.InvariantCultureIgnoreCase))
return SupportedFileType.MsPathFinderTAllResults;

// these tsv cases are just .tsv and need an extra step to determine the type
// currently need to distinguish between FlashDeconvTsv and MsFraggerPsm
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Scan Pre Sequence Post Modifications Composition ProteinName ProteinDesc ProteinLength Start End Charge MostAbundantIsotopeMz Mass Ms1Features #MatchedFragments Probability SpecEValue EValue QValue PepQValue
1180 M PKRKVSSAEGAAKEEPKRRSARLSAKPPAKVEAKPKKAAAKDKSSDKKVQTKGKRGAKGKQAEVANQETKEDLPAENGETKTEESPASDEAGEKEAKSD - C(442) H(756) N(140) O(156) S(0) sp|P05114|HMGN1_HUMAN Non-histone chromosomal protein HMG-14 OS=Homo sapiens OX=9606 GN=HMGN1 PE=1 SV=3 101 2 100 15 702.8454697 10521.55277 0 61 1.0 9.99E-308 9.99E-308 0 0
1180 - EKTEDKAQDTVAITQIFINGQYGEDAHKDYERGSDRIKKYKVRSGKKLEGTMDEYNIISAKKNVHKATKNRQKYNKINELIGQSNLYSRGF T C(460) H(740) N(136) O(146) S(1) XXX_sp|Q2G1S6|SSL5_STAA8 "Staphylococcal superantigen-like 5 OS=Staphylococcus aureus (strain NCTC 8325 " 235 1 91 15 703.9044982 10537.4382 670 5 0.0017 2.399925E-07 0.009145 10 10
1180 T DKGQATLRLANIALFVKVHFMSPIKWSATFFMALLYGAEGSIHKLLGNLIPKVRAVIIKKDIVGSSVRLN - C(358) H(585) N(95) O(89) S(2) XXX_sp|Q2GGT3|SYR_EHRCR "Arginine--tRNA ligase OS=Ehrlichia chaffeensis (strain ATCC CRL-10679 " 577 507 576 11 701.6776982 7703.361219 427 12 7.2369E-05 2.143132E-05 0.816662 10 10
1180 - MSQDTEVDMKDVELNELEPEKQPMNAADGAAAGEKNGLVKIKVAEDETEAGVKFTGLSKEELLKVAGSPGWVRTRWALLLLFWLGWLGMLAGAVVII V C(473) H(758) N(120) O(142) S(4) sp|P10852|4F2_MOUSE Amino acid transporter heavy chain SLC3A2 OS=Mus musculus OX=10090 GN=Slc3a2 PE=1 SV=1 527 1 97 15 702.7063795 10519.46642 660 9 3.1386E-04 1.311625E-06 0.049981 10 10
1180 - MAPGSVTSDISPSSTSTAGSSRSPESEKPGPSHGGVPPGGPSHSSLPVGRRHPPVLRMVLEALQAGEQRRGTSVAAIKLYILHKYPTVDVLRFKYLLKQA L C(463) H(753) N(137) O(139) S(2) sp|Q8IZA3|H18_HUMAN Histone H1.8 OS=Homo sapiens OX=9606 GN=H1-8 PE=2 SV=1 347 1 100 15 702.7786627 10520.55066 661 8 8.4705E-05 1.802522E-06 0.068687 10 10
1181 M PKRKVSSAEGAAKEEPKRRSARLSAKPPAKVEAKPKKAAAKDKSSDKKVQTKGKRGAKGKQAEVANQETKEDLPAENGETKTEESPASDEAGEKEAKSD - C(442) H(756) N(140) O(156) S(0) sp|P05114|HMGN1_HUMAN Non-histone chromosomal protein HMG-14 OS=Homo sapiens OX=9606 GN=HMGN1 PE=1 SV=3 101 2 100 14 752.9767692 10521.55277 0 60 1.0 9.99E-308 9.99E-308 0 0
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Scan Pre Sequence Post Modifications Composition ProteinName ProteinDesc ProteinLength Start End Charge MostAbundantIsotopeMz Mass Ms1Features #MatchedFragments Probability SpecEValue EValue
1180 - EKTEDKAQDTVAITQIFINGQYGEDAHKDYERGSDRIKKYKVRSGKKLEGTMDEYNIISAKKNVHKATKNRQKYNKINELIGQSNLYSRGF T C(460) H(740) N(136) O(146) S(1) XXX_sp|Q2G1S6|SSL5_STAA8 "Staphylococcal superantigen-like 5 OS=Staphylococcus aureus (strain NCTC 8325 " 235 1 91 15 703.9044982 10537.4382 670 5 0.0017 2.399925E-07 0.009145
1180 T DKGQATLRLANIALFVKVHFMSPIKWSATFFMALLYGAEGSIHKLLGNLIPKVRAVIIKKDIVGSSVRLN - C(358) H(585) N(95) O(89) S(2) XXX_sp|Q2GGT3|SYR_EHRCR "Arginine--tRNA ligase OS=Ehrlichia chaffeensis (strain ATCC CRL-10679 " 577 507 576 11 701.6776982 7703.361219 427 12 7.2369E-05 2.143132E-05 0.816662
1181 R RGRLFKAHFTMLLSVLHKAGAALKPRPAGSVQLYCHLLRPPRPATPAAEPFGHPEGADCPFSTFSLETRGIGPYLSLAYIFLNKDGRSRLRTYGCA P C(477) H(745) N(137) O(124) S(4) XXX_sp|Q6UXT8|ALKL1_HUMAN ALK and LTK ligand 1 OS=Homo sapiens OX=9606 GN=ALKAL1 PE=1 SV=1 130 2 97 14 751.7593204 10504.50849 655 9 0.0152 5.839E-08 0.002225
1181 - EKTEDKAQDTVAITQIFINGQYGEDAHKDYERGSDRIKKYKVRSGKKLEGTMDEYNIISAKKNVHKATKNRQKYNKINELIGQSNLYSRGF T C(460) H(740) N(136) O(146) S(1) XXX_sp|Q2G1S6|SSL5_STAA8 "Staphylococcal superantigen-like 5 OS=Staphylococcus aureus (strain NCTC 8325 " 235 1 91 14 754.1114426 10537.4382 670 4 1.6191E-04 9.002121E-07 0.034303
1181 - SALTILDEVVDLVAKTIEVIKLTSYDANGKQFNQEEFITNIPSVLSGYVTLADDANAFIDAPSDAKSRFGTGEFKVSKEIGLRVLHNRITNGGIQVQ K C(469) H(752) N(124) O(150) S(0) XXX_sp|P10592|HSP72_YEAST "Heat shock protein SSA2 OS=Saccharomyces cerevisiae (strain ATCC 204508 " 640 1 97 14 752.9731998 10521.5028 662 8 1.1724E-04 1.072104E-06 0.040854
1181 N PAELTKRVQLDMVCRIKEPLRSVKPLPSLAAPSVSSPQLASKSAATKVSMSFSAYMLKMISPVNRAQRCSPQSYRTHVTSPMKRSAVVSEYLSHDP T C(459) H(763) N(133) O(136) S(7) XXX_sp|Q9PTQ7|DMRT1_CHICK Doublesex- and mab-3-related transcription factor 1 OS=Gallus gallus OX=9031 GN=DMRT1 PE=2 SV=2 366 2 97 14 754.1867306 10538.49223 671 8 1.8379E-04 2.215391E-06 0.08442
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Scan Pre Sequence Post Modifications Composition ProteinName ProteinDesc ProteinLength Start End Charge MostAbundantIsotopeMz Mass Ms1Features #MatchedFragments Probability SpecEValue EValue
592 M PKRKAEGDAKGDKAKVKDEPQRRSARLSAKPAPPKPEPKPKKAPAKKGEKVPKGKKGKADAGKEGNNPAENGDAKTDQAQKAEGAGDAK - C(395) H(673) N(129) O(127) S(0) sp|P05204|HMGN2_HUMAN Non-histone chromosomal protein HMG-17 OS=Homo sapiens OX=9606 GN=HMGN2 PE=1 SV=3 91 2 90 14 662.5096855 9256.016953 531 17 0.9999 1.642469E-36 6.258791E-32
595 M PKRKAEGDAKGDKAKVKDEPQRRSARLSAKPAPPKPEPKPKKAPAKKGEKVPKGKKGKADAGKEGNNPAENGDAKTDQAQKAEGAGDAK - C(395) H(673) N(129) O(127) S(0) sp|P05204|HMGN2_HUMAN Non-histone chromosomal protein HMG-17 OS=Homo sapiens OX=9606 GN=HMGN2 PE=1 SV=3 91 2 90 15 618.4095249 9256.016953 531 12 0.2209 2.528896E-17 9.636612E-13
1089 M PKRKAEGDAKGDKAKVKDEPQRRSARLSAKPAPPKPEPKPKKAPAKKGEKVPKGKKGKADAGKEGNNPAENGDAKTDQAQKAEGAGDAK - C(395) H(673) N(129) O(127) S(0) sp|P05204|HMGN2_HUMAN Non-histone chromosomal protein HMG-17 OS=Homo sapiens OX=9606 GN=HMGN2 PE=1 SV=3 91 2 90 15 618.4095249 9256.016953 530 6 1.1566E-04 1.241835E-14 4.732136E-10
1090 M PKRKAEGDAKGDKAKVKDEPQRRSARLSAKPAPPKPEPKPKKAPAKKGEKVPKGKKGKADAGKEGNNPAENGDAKTDQAQKAEGAGDAK - C(395) H(673) N(129) O(127) S(0) sp|P05204|HMGN2_HUMAN Non-histone chromosomal protein HMG-17 OS=Homo sapiens OX=9606 GN=HMGN2 PE=1 SV=3 91 2 90 14 662.5096855 9256.016953 530 7 0.0039 1.27173E-17 4.846056E-13
1169 M PKRKVSSAEGAAKEEPKRRSARLSAKPPAKVEAKPKKAAAKDKSSDKKVQTKGKRGAKGKQAEVANQETKEDLPAENGETKTEESPASDEAGEKEAKSD - C(442) H(756) N(140) O(156) S(0) sp|P05114|HMGN1_HUMAN Non-histone chromosomal protein HMG-14 OS=Homo sapiens OX=9606 GN=HMGN1 PE=1 SV=3 101 2 100 15 702.8454697 10521.55277 662 13 0.9938 6.040125E-20 2.30165E-15
1169 M PKRKSPENTEGKDGSKVTKQEPTRRSARLSAKPAPPKPEPKPRKTSAKKEPGAKISRGAKGKKEEKQEAGKEGTAPSENGETKAEEAQKTESVDNEGE - C(442) H(749) N(141) O(156) S(0) sp|Q15651|HMGN3_HUMAN High mobility group nucleosome-binding domain-containing protein 3 OS=Homo sapiens OX=9606 GN=HMGN3 PE=1 SV=2 100 2 99 15 703.3086896 10528.50107 666 7 0.161 3.706717E-14 1.412482E-09
Loading
Loading