-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathCoverageGaps.groovy
245 lines (196 loc) · 7.05 KB
/
CoverageGaps.groovy
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
/*
* Groovy NGS Utils - Some simple utilites for processing Next Generation Sequencing data.
*
* Copyright (C) 2018 Simon Sadedin, ssadedin<at>gmail.com
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
package gngs
import groovy.lang.IntRange
import groovy.transform.CompileStatic
import groovy.util.logging.Log
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics
import org.apache.commons.math3.stat.descriptive.SummaryStatistics
/**
* Discovers blocks of low coverage in a file output in BEDTools Coverage format,
* or alternatively {@link MultiCov} output.
* <p>
* Each low coverage block is output as a {@link CoverageBlock} object which
* has statistics about the coverage within the block. As {@link CoverageBlock} extends
* {@link Region}, you can easily turn this into a set of Regions to perform more
* calculations or statistics.
* <p>
* To use this class, create the object first by passing a file to compute coverage for,
* then call the {@link #calculate} method to actually calculate the coverage blocks:
* <pre>
* CoverageGaps gaps = new CoverageGaps("test.cov.gz")
* gaps.calculate()
* println("There are " + gaps.blocks.size() + " low coverage regions")
* </pre>
* @author Simon Sadedin
*/
@Log
class CoverageGaps {
/**
* Use a large buffer for very efficient reading
*/
int BUFFER_SIZE_BYTES = 1024*1024*20
/**
* Only include gaps that are at least this size
*/
int minRegionWidth = 0
String coverageFilePath
int threshold = 20
int lineCount = 0
int blockCount = 0
int totalBP = 0
SummaryStatistics coverageStats = new SummaryStatistics()
CoverageStats coveragePercentiles = new CoverageStats(1000)
/**
* Discovered gaps (the results) are stored here for access after calling {@link #calculate}
*/
List<CoverageBlock> blocks = []
CoverageBlock block = null
Regions regions = null
int offset = 0
RegulatingActor<CoverageBlock> gapProcessor = null
CoverageGaps() {
}
CoverageGaps(String coverageFilePath) {
this.coverageFilePath = coverageFilePath
}
void calculate() {
Utils.reader(coverageFilePath) {
calculateFromBEDTools(it)
}
}
void calculateMultiCov() {
Utils.reader(coverageFilePath) {
calculateFromMultiCov(it)
}
}
@CompileStatic
void calculateFromMultiCov(Reader reader) {
String chr = null
int start = -1
int end = -1
String line
int lastPos = -1
int covField = -1
boolean first = true
ProgressCounter progress = createProgress()
while((line = reader.readLine()) != null) {
progress.count()
List<String> fields = line.tokenize('\t')
chr = fields[0]
// Use the first line to check if the coverage is in the 3rd field or the 4th
if(first) {
first = false
covField = fields.size() == 3 ? 2 : 3
}
int pos = Integer.parseInt(fields[1])
int cov = Integer.parseInt(fields[covField])
if(pos != lastPos + 1) {
start = pos
}
processLine(chr, start, pos, cov, null)
lastPos = pos
}
progress.end()
}
@CompileStatic
void calculateFromBEDTools(Reader reader) {
ProgressCounter progress = createProgress()
int pos = 0
reader.eachLine { String line ->
progress.count()
List<String> fields = line.tokenize('\t')
int cov = fields[-1].toInteger()
String chr = fields[0]
int start = fields[1].toInteger()
int end = fields[2].toInteger()
int offset = fields[-2].toInteger()
String id = null
if(fields.size()>5)
id = fields[-3]
processLine(chr, start, start+offset, cov, id)
}
// we might still have a block
if(block && (block.end - block.start + 1 >= minRegionWidth)) {
log.info("Final block processed")
outputBlock(pos)
block = null;
}
progress.end()
}
private ProgressCounter createProgress() {
ProgressCounter progress =
new ProgressCounter(withTime: true, withRate:true, timeInterval:10000, log:log, extra: {
"Region $region, ${blocks.size()} gaps after scanning ${totalBP} bp"
})
return progress
}
String region
@CompileStatic
void processLine(String chr, int start, int pos, int cov, String id) {
coverageStats.addValue(cov)
coveragePercentiles.addValue(cov)
region = "$chr:$start"
++totalBP
if(block && block.region != region) { // end of region
if (block.end - block.start + 1 >= minRegionWidth) { // only write if long enough
outputBlock()
}
else {
block = null; // forget this (too short) block
}
}
if(cov < threshold) {
if(!block) {
block = new CoverageBlock(chr:chr, region:region, start:pos)
if(id != null)
block.id = id
}
block.stats.addValue(cov)
block.end = pos // note that end is the last position that is a gap
}
else { // coverage is ok
if(block && (block.end - block.start + 1 >= minRegionWidth)) {
outputBlock()
}
else { // forget any block as it's too short
block = null;
}
}
}
@CompileStatic
void outputBlock(int endPos=-1) {
blockCount++
if(endPos>=0)
block.end = endPos
if(block.end < block.start)
assert false
blocks.add(block)
if(this.gapProcessor != null)
this.gapProcessor.sendTo(block)
block = null
}
@CompileStatic
Object asType(Class clazz) {
if(clazz == Regions) {
new Regions((Iterable<IRegion>)blocks.collect { it as Region })
}
}
}