Skip to content

Commit 6acb727

Browse files
zdanaceauzdanaceau
and
zdanaceau
authored
Added a function to generate decoys from scrambled targets (#641)
* Added a function to generate decoys from scrambled targets. * Added function to generate decoys by scrambling targets * Fixed GetPercentIdentity to account for modifications and count cleavage motifs * Fixed comments and cleaned up code * Added more complex test cases * Added insulin protein as a more complex example to scramble. * Added target/decoy pairing to GetScrambledDecoyFromTarget * Added a test to ensure that peptides are mirrored once the maximum number of scramble attempts is reached. Co-authored-by: zdanaceau <[email protected]>
1 parent 3c77adf commit 6acb727

File tree

2 files changed

+279
-4
lines changed

2 files changed

+279
-4
lines changed

mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs

+219-3
Original file line numberDiff line numberDiff line change
@@ -1174,7 +1174,7 @@ public PeptideWithSetModifications GetReverseDecoyFromTarget(int[] revisedAminoA
11741174
foreach (int location in cleavageMotifLocations)
11751175
{
11761176
char[] motifArray = BaseSequence.Substring(location, cleavingMotif.Length).ToCharArray();
1177-
1177+
11781178
for (int i = 0; i < cleavingMotif.Length; i++)
11791179
{
11801180
newBase[location + i] = motifArray[i];
@@ -1191,8 +1191,9 @@ public PeptideWithSetModifications GetReverseDecoyFromTarget(int[] revisedAminoA
11911191
}
11921192
}
11931193

1194-
//We've kept amino acids in the digestion motif in the same position in the decoy peptide.
1195-
//Now we will fill the remaining open positions in the decoy with the reverse of amino acids from the target.
1194+
// We've kept amino acids in the digestion motif in the same position in the decoy peptide.
1195+
// Now we will fill the remaining open positions in the decoy with the reverse of amino acids from the target.
1196+
// Part to change to scramble
11961197
int fillPosition = 0;
11971198
int extractPosition = this.BaseSequence.Length - 1;
11981199
while (fillPosition < this.BaseSequence.Length && extractPosition >= 0)
@@ -1250,7 +1251,222 @@ public PeptideWithSetModifications GetReverseDecoyFromTarget(int[] revisedAminoA
12501251
}
12511252

12521253
}
1254+
/// <summary>
/// Generates a decoy peptide from this target peptide by scrambling the target's amino acid sequence.
/// Residues that belong to a digestion motif keep their positions, and modifications travel with their residues.
/// A scrambled sequence is accepted only if its identity to the target is at or below <paramref name="maximumHomology"/>.
/// If no acceptable sequence is produced within 10 attempts, the mirror sequence from GetPeptideMirror is returned instead.
/// The random number generator uses a fixed seed, so the same target always yields the same decoy.
/// </summary>
/// <param name="revisedAminoAcidOrder">Array that receives, for each decoy position, the zero-based index of the target residue placed there. It is indexed up to the length of the base sequence.</param>
/// <param name="maximumHomology">Highest fraction of identical positions (0 to 1) at which a scrambled sequence is still accepted</param>
/// <returns>The decoy peptide. The target and the decoy reference each other through PairedTargetDecoyHash.</returns>
public PeptideWithSetModifications GetScrambledDecoyFromTarget(int[] revisedAminoAcidOrder, double maximumHomology = 0.3)
{
    // '0' does not appear in a peptide string, so it marks an unfilled slot in newBase
    // and an already-placed (motif) residue in evaporatingBase. A char can't be null.
    const char placeholder = '0';
    const int maxScrambles = 10;

    Dictionary<int, Modification> newModificationsDictionary = new Dictionary<int, Modification>();
    // Copy the N-terminal modification (key 1) from the target dictionary to the decoy dictionary.
    if (this.AllModsOneIsNterminus.TryGetValue(1, out Modification nTerminalMod))
    {
        newModificationsDictionary[1] = nTerminalMod;
    }

    int seqLength = this.BaseSequence.Length;
    char[] newBase = new char[seqLength];
    Array.Fill(newBase, placeholder);
    char[] evaporatingBase = this.BaseSequence.ToCharArray();
    List<DigestionMotif> motifs = this.DigestionParams.Protease.DigestionMotifs;
    if (motifs != null && motifs.Count > 0)
    {
        // Motifs with an empty inducing-cleavage string (top-down) have nothing to pin in place.
        foreach (var motif in motifs.Where(m => m.InducingCleavage != ""))
        {
            string cleavingMotif = motif.InducingCleavage;
            List<int> cleavageMotifLocations = new List<int>();

            for (int i = 0; i < seqLength; i++)
            {
                (bool fits, bool prevents) = motif.Fits(BaseSequence, i);
                if (fits && !prevents)
                {
                    cleavageMotifLocations.Add(i);
                }
            }

            foreach (int location in cleavageMotifLocations)
            {
                char[] motifArray = BaseSequence.Substring(location, cleavingMotif.Length).ToCharArray();

                for (int i = 0; i < cleavingMotif.Length; i++)
                {
                    int position = location + i;
                    newBase[position] = motifArray[i];
                    revisedAminoAcidOrder[position] = position;
                    // Directly copy mods that were on amino acids in the motif; those amino acids don't change position.
                    // The indexer is used instead of Add so that a residue covered by more than one motif does not throw.
                    if (this.AllModsOneIsNterminus.TryGetValue(position + 2, out Modification motifMod))
                    {
                        newModificationsDictionary[position + 2] = motifMod;
                    }

                    evaporatingBase[position] = placeholder;
                }
            }
        }
    }

    // We've kept amino acids in the digestion motif in the same position in the decoy peptide.
    // Now we will fill the remaining open positions in the decoy with the scrambled amino acids from the target.
    // A fixed seed ensures that the same decoy sequence is always generated from the same target.
    Random rand = new(56);
    bool accepted = false;

    for (int scrambleAttempt = 0; scrambleAttempt < maxScrambles && !accepted; scrambleAttempt++)
    {
        // Each attempt works on copies so that a rejected attempt leaves the motif-only state untouched.
        Dictionary<int, Modification> tempModificationsDictionary = new(newModificationsDictionary);
        char[] tempNewBase = new char[newBase.Length];
        Array.Copy(newBase, tempNewBase, newBase.Length);

        // residueNums holds the index of every element of evaporatingBase.
        // Once an index is drawn it is removed, which prevents the same amino acid from being added twice.
        List<int> residueNums = Enumerable.Range(0, evaporatingBase.Length).ToList();
        int fillPosition = 0;

        // Two stop conditions are needed:
        //  - fillPosition reaches the end once the last open slot has been filled;
        //  - residueNums runs empty when trailing positions are motif residues, in which case
        //    fillPosition never reaches the end and drawing again would index an empty list.
        while (fillPosition < seqLength && residueNums.Count > 0)
        {
            int residueNumsIndex = rand.Next(residueNums.Count);
            int extractPosition = residueNums[residueNumsIndex];
            char targetAA = evaporatingBase[extractPosition];
            residueNums.RemoveAt(residueNumsIndex);
            if (targetAA == placeholder)
            {
                // Motif residue: already placed, nothing to do for this draw.
                continue;
            }

            // Skip over slots that are already occupied by motif residues.
            while (tempNewBase[fillPosition] != placeholder)
            {
                fillPosition++;
            }
            tempNewBase[fillPosition] = targetAA;
            revisedAminoAcidOrder[fillPosition] = extractPosition;
            // Move the residue's modification (if any) along with the residue.
            if (this.AllModsOneIsNterminus.TryGetValue(extractPosition + 2, out Modification movedMod))
            {
                tempModificationsDictionary[fillPosition + 2] = movedMod;
            }
            fillPosition++;
        }

        /*
         * Any homology scoring mechanism can go here, percent identity is probably not the best.
         * In terms of generating a decoy sequence that will have a different mass spectrum than
         * the original, it is far more important to vary the amino acids on the edges than
         * those in the middle. Changes on the edges will offset the entire b and y sequences
         * leading to an effective decoy spectrum even if there is high identity in the middle of
         * the sequence. Additionally, for peptides with a large amount of a certain amino acid,
         * it will be very difficult to generate a low homology sequence.
         */
        double percentIdentity = GetPercentIdentity(tempNewBase, evaporatingBase, tempModificationsDictionary, this.AllModsOneIsNterminus);

        // Accept at or below the cutoff. Accepting only strictly below it would leave a sequence whose
        // identity equals the cutoff neither accepted nor retried, and the placeholders would leak into the decoy.
        if (percentIdentity <= maximumHomology)
        {
            newBase = tempNewBase;
            newModificationsDictionary = tempModificationsDictionary;
            accepted = true;
            // Code checking similarity between theoretical spectra could go here
        }
    }

    // If no attempt was accepted, make the new sequence identical to the original to trigger mirroring below.
    if (!accepted)
    {
        for (int j = 0; j < newBase.Length; j++)
        {
            if (newBase[j] == placeholder)
            {
                newBase[j] = evaporatingBase[j];
            }
        }
    }

    string newBaseString = new string(newBase);

    // Build the decoy protein by swapping this peptide's span in the parent protein for the new sequence.
    var aStringBuilder = new StringBuilder(this.Protein.BaseSequence);
    aStringBuilder.Remove(this.OneBasedStartResidueInProtein - 1, this.BaseSequence.Length);
    aStringBuilder.Insert(this.OneBasedStartResidueInProtein - 1, newBaseString);
    string proteinSequence = aStringBuilder.ToString();

    Protein decoyProtein = new Protein(proteinSequence, "DECOY_" + this.Protein.Accession, null, new List<Tuple<string, string>>(), new Dictionary<int, List<Modification>>(), null, null, null, true);
    // Hash code of the target, taken before the target's PairedTargetDecoyHash is updated.
    int targetHash = GetHashCode();

    PeptideWithSetModifications decoyPeptide;
    if (newBaseString != this.BaseSequence)
    {
        // The "peptideDescription" argument stores the corresponding target's full sequence.
        decoyPeptide = new PeptideWithSetModifications(decoyProtein, this.DigestionParams, this.OneBasedStartResidueInProtein, this.OneBasedEndResidueInProtein, this.CleavageSpecificityForFdrCategory, this.FullSequence, this.MissedCleavages, newModificationsDictionary, this.NumFixedMods, newBaseString);
    }
    else
    {
        // The scrambling procedure failed to create a PeptideWithSetModifications with a different sequence.
        // Therefore, we return the mirror image peptide.
        // NOTE(review): revisedAminoAcidOrder still holds the last rejected attempt here; presumably
        // GetPeptideMirror overwrites it - confirm.
        decoyPeptide = this.GetPeptideMirror(revisedAminoAcidOrder);
    }

    // Pair the two peptides: the target stores the decoy's hash code and the decoy stores the target's.
    PairedTargetDecoyHash = decoyPeptide.GetHashCode();
    decoyPeptide.PairedTargetDecoyHash = targetHash;
    return decoyPeptide;
}
1431+
1432+
/// <summary>
/// Computes the fraction of positions at which a scrambled sequence is considered identical to the unscrambled one.
/// A position counts as identical when the residues match (or the unscrambled residue is the '0' placeholder)
/// and the modification found on the scrambled residue, if any, is also present at the same key on the unscrambled side.
/// </summary>
/// <param name="scrambledSequence">Character array of the scrambled sequence</param>
/// <param name="unscrambledSequence">Character array of the unscrambled sequence; '0' entries always count as identical</param>
/// <param name="scrambledMods">Modifications of the scrambled sequence, looked up at key (zero-based position + 2)</param>
/// <param name="unscrambledMods">Modifications of the unscrambled sequence, looked up at key (zero-based position + 2)</param>
/// <returns>Number of identical positions divided by the length of the scrambled sequence</returns>
private static double GetPercentIdentity(char[] scrambledSequence, char[] unscrambledSequence, Dictionary<int, Modification> scrambledMods, Dictionary<int, Modification> unscrambledMods)
{
    int residueCount = scrambledSequence.Length;
    double identicalPositions = 0;

    for (int index = 0; index < residueCount; index++)
    {
        char scrambledResidue = scrambledSequence[index];
        char originalResidue = unscrambledSequence[index];
        bool isPlaceholder = originalResidue == '0';

        // Different residues (and not a placeholder) can never be identical.
        if (scrambledResidue != originalResidue && !isPlaceholder)
        {
            continue;
        }

        int modKey = index + 2;
        bool scrambledHasMod = scrambledMods.TryGetValue(modKey, out Modification modOnScrambled);

        // Placeholder positions, and matching residues without a modification on the scrambled side, count directly.
        // NOTE(review): a modification present only on the unscrambled side is not checked here - confirm this is intended.
        if (isPlaceholder || !scrambledHasMod)
        {
            identicalPositions += 1;
            continue;
        }

        // The scrambled residue is modified: it only counts if the unscrambled residue carries the same modification.
        if (unscrambledMods.TryGetValue(modKey, out Modification modOnOriginal) && modOnScrambled == modOnOriginal)
        {
            identicalPositions += 1;
        }
    }

    return identicalPositions / residueCount;
}
1469+
12541470
//Returns a PeptideWithSetModifications mirror image. Used when reverse decoy sequence is same as target sequence
12551471
public PeptideWithSetModifications GetPeptideMirror(int[] revisedOrderNisOne)
12561472
{

0 commit comments

Comments
 (0)