Skip to content

Commit 83c2d3b

Browse files
Merge pull request #129 from ncsa/develop
Develop
2 parents 5f9f85a + 9cc0f80 commit 83c2d3b

File tree

2 files changed

+41
-14
lines changed

2 files changed

+41
-14
lines changed

neat/model_sequencing_error/utils.py

+40-12
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ def expand_counts(count_array: list, scores: list):
4646
:return np.ndarray: a one-dimensional array reflecting the expanded count
4747
"""
4848
if len(count_array) != len(scores):
49-
_LOG.error("Count array and scores have different lengths.")
49+
_LOG.critical("Count array and scores have different lengths.")
5050
sys.exit(1)
5151

5252
ret_list = []
@@ -56,6 +56,18 @@ def expand_counts(count_array: list, scores: list):
5656
return np.array(ret_list)
5757

5858

59+
def _make_gen(reader):
60+
"""
61+
solution from stack overflow to quickly count lines in a file.
62+
https://stackoverflow.com/questions/19001402/how-to-count-the-total-number-of-lines-in-a-text-file-using-python
63+
64+
"""
65+
b = reader(1024 * 1024)
66+
while b:
67+
yield b
68+
b = reader(1024 * 1024)
69+
70+
5971
def parse_file(input_file: str, quality_scores: list, max_reads: int, qual_offset: int, readlen: int):
6072
"""
6173
Parses an individual file for statistics
@@ -84,6 +96,13 @@ def parse_file(input_file: str, quality_scores: list, max_reads: int, qual_offse
8496
line = fq_in.readline().strip()
8597
readlens.append(len(line))
8698

99+
# solution from stack overflow to quickly count lines in a file.
100+
# https://stackoverflow.com/questions/19001402/how-to-count-the-total-number-of-lines-in-a-text-file-using-python
101+
if max_reads == np.inf:
102+
f = open(input_file, 'rb')
103+
f_gen = _make_gen(f.raw.read)
104+
max_reads = sum(buf.count(b'\n') for buf in f_gen)
105+
87106
readlens = np.array(readlens)
88107

89108
# Using the statistical mode seems like the right approach here. We expect the readlens to be roughly the same.
@@ -100,18 +119,22 @@ def parse_file(input_file: str, quality_scores: list, max_reads: int, qual_offse
100119

101120
_LOG.debug(f'Read len of {read_length}, over {1000} samples')
102121

103-
_LOG.info(f"Reading {max_reads} records...")
104122
temp_q_count = np.zeros((read_length, len(quality_scores)), dtype=int)
105123
qual_score_counter = {x: 0 for x in quality_scores}
106-
# shape_curves = []
107-
quarters = max_reads//4
124+
if max_reads == np.inf:
125+
_LOG.info("Reading all records...")
126+
quarters = 10000
127+
else:
128+
_LOG.info(f"Reading {max_reads} records")
129+
quarters = max_reads//4
108130

109131
records_read = 0
110132
wrong_len = 0
111133
end_of_file = False
112134
# SeqIO eats up way too much memory for larger fastqs, so we're trying to read the file in line by line here
135+
_LOG.info(f'Reading data...')
113136
with open_input(input_file) as fq_in:
114-
while records_read < max_reads:
137+
while records_read <= max_reads:
115138

116139
# We throw away 3 lines and read the 4th, because that's fastq format
117140
for _ in (0, 1, 2, 3):
@@ -134,27 +157,32 @@ def parse_file(input_file: str, quality_scores: list, max_reads: int, qual_offse
134157
# TODO Adding this section to account for quality score "shape" in a fastq
135158
# shape_curves.append(qualities_to_check)
136159

137-
records_read += 1
138-
139160
for j in range(read_length):
140161
# The qualities of each read_position_scores
162+
temp_q_count[j][qualities_to_check[j]] += 1
141163
qual_score_counter[qualities_to_check[j]] += 1
142164

165+
records_read += 1
166+
143167
if records_read % quarters == 0:
144168
_LOG.info(f'reading data: {(records_read / max_reads) * 100:.0f}%')
145169

146-
_LOG.info(f'reading data: 100%')
170+
_LOG.info(f'Reading data: complete')
147171
if end_of_file:
148172
_LOG.info(f'{records_read} records read before end of file.')
149-
_LOG.debug(f'{wrong_len} total reads had a length other than {read_length} ({wrong_len/max_reads:.0f}%)')
173+
_LOG.debug(f'{wrong_len} total reads had a length other than {read_length} ({wrong_len/records_read:.0f}%)')
150174

151175
avg_std_by_pos = []
152176
q_count_by_pos = np.asarray(temp_q_count)
153177
for i in range(read_length):
154178
this_counts = q_count_by_pos[i]
155179
expanded_counts = expand_counts(this_counts, quality_scores)
156-
average_q = np.average(expanded_counts)
157-
st_d_q = np.std(expanded_counts)
180+
if len(expanded_counts) == 0:
181+
_LOG.error(f"Position had no quality data: {i}")
182+
sys.exit(1)
183+
else:
184+
average_q = np.average(expanded_counts)
185+
st_d_q = np.std(expanded_counts)
158186
avg_std_by_pos.append((average_q, st_d_q))
159187

160188
# TODO In progress, working on ensuring the error model produces the right shape
@@ -191,7 +219,7 @@ def plot_stuff(init_q, real_q, q_range, prob_q, actual_readlen, plot_path):
191219
plt.figure(1)
192220
z = np.array(init_q).T
193221
x, y = np.meshgrid(range(0, len(z[0]) + 1), range(0, len(z) + 1))
194-
plt.pcolormesh(x, Y, z, vmin=0., vmax=0.25)
222+
plt.pcolormesh(x, y, z, vmin=0., vmax=0.25)
195223
plt.axis([0, len(z[0]), 0, len(z)])
196224
plt.yticks(range(0, len(z), 10), range(0, len(z), 10))
197225
plt.xticks(range(0, len(z[0]), 10), range(0, len(z[0]), 10))

neat/read_simulator/runner.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,7 @@ def initialize_all_models(options: Options):
5454
homozygous_freq=default_cancer_homozygous_freq,
5555
variant_probs=default_cancer_variant_probs,
5656
insert_len_model=default_cancer_insert_len_model,
57-
is_cancer=True,
58-
rng=options.rng
57+
is_cancer=True
5958
)
6059

6160
_LOG.debug("Mutation models loaded")

0 commit comments

Comments
 (0)