Skip to content

Commit 9cc0f80

Browse files
committed
Restored a required line that was mistakenly removed in a previous update; its removal is what broke the sequencing error modeling task
1 parent fe0a37d commit 9cc0f80

File tree

1 file changed

+17
-12
lines changed

1 file changed

+17
-12
lines changed

neat/model_sequencing_error/utils.py

+17-12
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ def expand_counts(count_array: list, scores: list):
4646
:return np.ndarray: a one-dimensional array reflecting the expanded count
4747
"""
4848
if len(count_array) != len(scores):
49-
_LOG.error("Count array and scores have different lengths.")
49+
_LOG.critical("Count array and scores have different lengths.")
5050
sys.exit(1)
5151

5252
ret_list = []
@@ -119,18 +119,22 @@ def parse_file(input_file: str, quality_scores: list, max_reads: int, qual_offse
119119

120120
_LOG.debug(f'Read len of {read_length}, over {1000} samples')
121121

122-
_LOG.info(f"Reading {max_reads} records...")
123122
temp_q_count = np.zeros((read_length, len(quality_scores)), dtype=int)
124123
qual_score_counter = {x: 0 for x in quality_scores}
125-
# shape_curves = []
126-
quarters = max_reads//4
124+
if max_reads == np.inf:
125+
_LOG.info("Reading all records...")
126+
quarters = 10000
127+
else:
128+
_LOG.info(f"Reading {max_reads} records")
129+
quarters = max_reads//4
127130

128131
records_read = 0
129132
wrong_len = 0
130133
end_of_file = False
131134
# SeqIO eats up way too much memory for larger fastqs, so we're trying to read the file in line by line here
135+
_LOG.info(f'Reading data...')
132136
with open_input(input_file) as fq_in:
133-
while records_read < max_reads:
137+
while records_read <= max_reads:
134138

135139
# We throw away 3 lines and read the 4th, because that's fastq format
136140
for _ in (0, 1, 2, 3):
@@ -153,28 +157,29 @@ def parse_file(input_file: str, quality_scores: list, max_reads: int, qual_offse
153157
# TODO Adding this section to account for quality score "shape" in a fastq
154158
# shape_curves.append(qualities_to_check)
155159

156-
records_read += 1
157-
158160
for j in range(read_length):
159161
# The qualities of each read_position_scores
162+
temp_q_count[j][qualities_to_check[j]] += 1
160163
qual_score_counter[qualities_to_check[j]] += 1
161164

165+
records_read += 1
166+
162167
if records_read % quarters == 0:
163168
_LOG.info(f'reading data: {(records_read / max_reads) * 100:.0f}%')
164169

165-
_LOG.info(f'reading data: 100%')
170+
_LOG.info(f'Reading data: complete')
166171
if end_of_file:
167172
_LOG.info(f'{records_read} records read before end of file.')
168-
_LOG.debug(f'{wrong_len} total reads had a length other than {read_length} ({wrong_len/max_reads:.0f}%)')
173+
_LOG.debug(f'{wrong_len} total reads had a length other than {read_length} ({wrong_len/records_read:.0f}%)')
169174

170175
avg_std_by_pos = []
171176
q_count_by_pos = np.asarray(temp_q_count)
172177
for i in range(read_length):
173178
this_counts = q_count_by_pos[i]
174179
expanded_counts = expand_counts(this_counts, quality_scores)
175-
if not expanded_counts:
176-
average_q = 0
177-
st_d_q = 0
180+
if len(expanded_counts) == 0:
181+
_LOG.error(f"Position had no quality data: {i}")
182+
sys.exit(1)
178183
else:
179184
average_q = np.average(expanded_counts)
180185
st_d_q = np.std(expanded_counts)

0 commit comments

Comments (0)