-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathProteome.hpp
171 lines (144 loc) · 5.8 KB
/
Proteome.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
#ifndef _PROTEOME_HPP
#define _PROTEOME_HPP
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <map>
#include <set>
#include <string>
#include <algorithm>
#include <tr1/unordered_set>
#include <tr1/unordered_map>
#include "DigestOptions.hpp"
#include "Protein.hpp"
#include "Peptide.hpp"
#include <boost/regex.hpp>
#include <boost/thread/thread.hpp>
#include <boost/thread/detail/thread.hpp>
#include <boost/thread/mutex.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/bind.hpp>
using namespace std;
class Proteome {
public:
tr1::unordered_map<string,int> proteinSet; // keep track of unique protein ids seen
set<string> xProteinSet; // FWD proteins to be x-linked
tr1::unordered_map<Peptide,int> peptideSet; // save matched Peptide records
/* this is a temporary multi-threading hack. Keeps peptide in limbo while all mods
* are being evaluated. Prevents from being deleted by concurrent threads
*/
tr1::unordered_set<Peptide> threadSafeLimbo;
tr1::unordered_set<string> peptideSequenceSet; // keep track of unique sequences already seen
multimap<double,ModIterator> modMap; // index all mod masses
void indexPeptides();
void indexPeptidesMT(int cur_thread_num = 0);
void reIndexPeptides();
void resetLiveIndex();
void keepOnlyMatched();
void storeProteinSet (string ref, int cur_cnt);
void storePeptideSequenceSet (string seq);
pair< tr1::unordered_map<Peptide,int>::iterator, bool > storePeptide(const Peptide& p);
void incrementPeptideHit (tr1::unordered_map<Peptide,int>::iterator pIt);
void incrementPeptideHit (Peptide p);
void decrementPeptideHit (tr1::unordered_map<Peptide,int>::iterator);
void decrementPeptideHit (Peptide p);
void checkRemovePeptide (tr1::unordered_map<Peptide,int>::iterator pIt);
void outputDigest();
/* Fasta getFastaFromId (string id);
Fasta getFastaFromfullHeader (string header);
string getSequenceFromId (string id);
string getSequenceFromfullHeader (string header);*/
Proteome() : _write_mutex(new boost::mutex()) {}
private:
boost::shared_ptr<boost::mutex> _write_mutex;
};
class FastaStreamIterator : public std::iterator<std::input_iterator_tag, Fasta> {
private:
Proteome * _proteome;
ifstream _infile;
string _fullHeader;
string _id;
string _sequence;
int _cur_cnt;
void readNextFasta() {
string line;
boost::regex re(DigestOptions::fastaHeaderRegEx);
boost::match_results<string::const_iterator> regex_result;
while (getline(_infile, line, '\n')) {
if (line[0] == '>') {
if (boost::regex_match(line, regex_result, re)) {
_fullHeader = regex_result[0];
_id = regex_result[1];
_sequence = "";
_cur_cnt ++;
_proteome->storeProteinSet(_id, _cur_cnt);
Peptide::storeRefVec(_id, _cur_cnt);
} else {
cerr << "Error parsing " << DigestOptions::fastaFile << ", entry: " << endl;
cerr << line << endl;
cerr << "|" << regex_result[0] << "|" << DigestOptions::fastaHeaderRegEx << "|" << endl; ////
exit(1);
}
} else if (isalnum(line[0])) {
_sequence += line;
} else {
cerr << "WARNING: Fasta appears to contain empty lines or is otherwise improperly formatted" << endl;
cerr << " Attempting to read through ..." << endl;
}
if (_infile.peek() == EOF || _infile.peek() == '>') {
transform(_sequence.begin(), _sequence.end(), _sequence.begin(), ::toupper);
return;
}
}
return;
}
public:
FastaStreamIterator(const FastaStreamIterator &b); //not defined, so no copy
FastaStreamIterator & operator=(const FastaStreamIterator &b); //not defined, so no copy
// don't allow copies, because streams aren't copiable. make sure to always pass by reference
// unfortunately, this means no stl algorithms either
FastaStreamIterator(Proteome &p) : _proteome(&p) {
_infile.open(DigestOptions::fastaFile.c_str());
if (!_infile.is_open()) {
cout << "Error opening fasta file " << DigestOptions::fastaFile << endl;
exit(0);
}
_cur_cnt = 0;
// initial iterator is empty, so need to increment before doing anything. Needed for elegant processing of last fasta entry
}
FastaStreamIterator() {
_infile.setstate(ios::eofbit);
}
FastaStreamIterator & operator++() {
//cout << "FastaStreamIterator &operator++()" << endl; ////
readNextFasta();
return (*this);
}
// normally you'd want operator++(int) as well, but that requires making a copy
// and std::ifstream isn't copiable.
bool operator==(const FastaStreamIterator &b) const {
if (_infile.bad() == b._infile.bad() &&
_infile.eof() == b._infile.eof() &&
_infile.good() == b._infile.good() &&
_infile.fail() == b._infile.fail())
return true; //all end() and uninitialized iterators equal
// so we can use FastaStreamIterator() as end()
return false; //since they all are seperate strings, never equal
}
bool operator!=(const FastaStreamIterator &b) const {
return !operator==(b);
}
Fasta operator*() const {
Fasta f(_id, _fullHeader, _sequence, _cur_cnt);
if (DigestOptions::swapReverseInitMet && f.isReverse()) {
f.swapRevInitMet();
}
if (DigestOptions::swapReverseCutSites && f.isReverse()) {
f.swapCutSites();
}
return f;
}
};
#endif /* _PROTEOME_HPP */