@@ -46,7 +46,7 @@ def expand_counts(count_array: list, scores: list):
     :return np.ndarray: a one-dimensional array reflecting the expanded count
     """
     if len(count_array) != len(scores):
-        _LOG.error("Count array and scores have different lengths.")
+        _LOG.critical("Count array and scores have different lengths.")
         sys.exit(1)
 
     ret_list = []
@@ -119,18 +119,22 @@ def parse_file(input_file: str, quality_scores: list, max_reads: int, qual_offse
 
     _LOG.debug(f'Read len of {read_length}, over {1000} samples')
 
-    _LOG.info(f"Reading {max_reads} records...")
     temp_q_count = np.zeros((read_length, len(quality_scores)), dtype=int)
     qual_score_counter = {x: 0 for x in quality_scores}
-    # shape_curves = []
-    quarters = max_reads // 4
+    if max_reads == np.inf:
+        _LOG.info("Reading all records...")
+        quarters = 10000
+    else:
+        _LOG.info(f"Reading {max_reads} records")
+        quarters = max_reads // 4
 
     records_read = 0
     wrong_len = 0
     end_of_file = False
     # SeqIO eats up way too much memory for larger fastqs, so we're trying to read the file in line by line here
+    _LOG.info(f'Reading data...')
     with open_input(input_file) as fq_in:
-        while records_read < max_reads:
+        while records_read <= max_reads:
 
             # We throw away 3 lines and read the 4th, because that's fastq format
             for _ in (0, 1, 2, 3):
@@ -153,28 +157,29 @@ def parse_file(input_file: str, quality_scores: list, max_reads: int, qual_offse
             # TODO Adding this section to account for quality score "shape" in a fastq
             # shape_curves.append(qualities_to_check)
 
-            records_read += 1
-
             for j in range(read_length):
                 # The qualities of each read_position_scores
+                temp_q_count[j][qualities_to_check[j]] += 1
                 qual_score_counter[qualities_to_check[j]] += 1
 
+            records_read += 1
+
             if records_read % quarters == 0:
                 _LOG.info(f'reading data: {(records_read / max_reads) * 100:.0f}%')
 
-    _LOG.info(f'reading data: 100%')
+    _LOG.info(f'Reading data: complete')
     if end_of_file:
         _LOG.info(f'{records_read} records read before end of file.')
-        _LOG.debug(f'{wrong_len} total reads had a length other than {read_length} ({wrong_len / max_reads:.0f}%)')
+        _LOG.debug(f'{wrong_len} total reads had a length other than {read_length} ({wrong_len / records_read:.0f}%)')
 
     avg_std_by_pos = []
     q_count_by_pos = np.asarray(temp_q_count)
     for i in range(read_length):
         this_counts = q_count_by_pos[i]
         expanded_counts = expand_counts(this_counts, quality_scores)
-        if not expanded_counts:
-            average_q = 0
-            st_d_q = 0
+        if len(expanded_counts) == 0:
+            _LOG.error(f"Position had no quality data: {i}")
+            sys.exit(1)
         else:
            average_q = np.average(expanded_counts)
            st_d_q = np.std(expanded_counts)
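A note on the per-position statistics in the last hunk: each row of temp_q_count is a histogram of quality scores seen at one read position, and expand_counts turns that histogram back into individual observations before np.average and np.std are applied. A minimal sketch of that idea, assuming expand_counts simply repeats each score by its observed count (its actual implementation is outside these hunks, and the score range below is only illustrative):

import numpy as np

quality_scores = list(range(42))  # hypothetical Phred score range, for illustration only

def expand_counts_sketch(count_array, scores):
    # Repeat each score by the number of times it was observed at this position.
    return np.repeat(scores, count_array)

this_counts = np.zeros(len(quality_scores), dtype=int)
this_counts[30] = 2   # two reads had Q30 at this position
this_counts[40] = 1   # one read had Q40 at this position

expanded = expand_counts_sketch(this_counts, quality_scores)  # -> [30, 30, 40]
average_q = np.average(expanded)  # ~33.3
st_d_q = np.std(expanded)         # ~4.7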