@@ -46,7 +46,7 @@ def expand_counts(count_array: list, scores: list):
46
46
:return np.ndarray: a one-dimensional array reflecting the expanded count
47
47
"""
48
48
if len (count_array ) != len (scores ):
49
- _LOG .error ("Count array and scores have different lengths." )
49
+ _LOG .critical ("Count array and scores have different lengths." )
50
50
sys .exit (1 )
51
51
52
52
ret_list = []
@@ -56,6 +56,18 @@ def expand_counts(count_array: list, scores: list):
56
56
return np .array (ret_list )
57
57
58
58
59
+ def _make_gen (reader ):
60
+ """
61
+ solution from stack overflow to quickly count lines in a file.
62
+ https://stackoverflow.com/questions/19001402/how-to-count-the-total-number-of-lines-in-a-text-file-using-python
63
+
64
+ """
65
+ b = reader (1024 * 1024 )
66
+ while b :
67
+ yield b
68
+ b = reader (1024 * 1024 )
69
+
70
+
59
71
def parse_file (input_file : str , quality_scores : list , max_reads : int , qual_offset : int , readlen : int ):
60
72
"""
61
73
Parses an individual file for statistics
@@ -84,6 +96,13 @@ def parse_file(input_file: str, quality_scores: list, max_reads: int, qual_offse
84
96
line = fq_in .readline ().strip ()
85
97
readlens .append (len (line ))
86
98
99
+ # solution from stack overflow to quickly count lines in a file.
100
+ # https://stackoverflow.com/questions/19001402/how-to-count-the-total-number-of-lines-in-a-text-file-using-python
101
+ if max_reads == np .inf :
102
+ f = open (input_file , 'rb' )
103
+ f_gen = _make_gen (f .raw .read )
104
+ max_reads = sum (buf .count (b'\n ' ) for buf in f_gen )
105
+
87
106
readlens = np .array (readlens )
88
107
89
108
# Using the statistical mode seems like the right approach here. We expect the readlens to be roughly the same.
@@ -100,18 +119,22 @@ def parse_file(input_file: str, quality_scores: list, max_reads: int, qual_offse
100
119
101
120
_LOG .debug (f'Read len of { read_length } , over { 1000 } samples' )
102
121
103
- _LOG .info (f"Reading { max_reads } records..." )
104
122
temp_q_count = np .zeros ((read_length , len (quality_scores )), dtype = int )
105
123
qual_score_counter = {x : 0 for x in quality_scores }
106
- # shape_curves = []
107
- quarters = max_reads // 4
124
+ if max_reads == np .inf :
125
+ _LOG .info ("Reading all records..." )
126
+ quarters = 10000
127
+ else :
128
+ _LOG .info (f"Reading { max_reads } records" )
129
+ quarters = max_reads // 4
108
130
109
131
records_read = 0
110
132
wrong_len = 0
111
133
end_of_file = False
112
134
# SeqIO eats up way too much memory for larger fastqs, so we're trying to read the file in line by line here
135
+ _LOG .info (f'Reading data...' )
113
136
with open_input (input_file ) as fq_in :
114
- while records_read < max_reads :
137
+ while records_read <= max_reads :
115
138
116
139
# We throw away 3 lines and read the 4th, because that's fastq format
117
140
for _ in (0 , 1 , 2 , 3 ):
@@ -134,27 +157,32 @@ def parse_file(input_file: str, quality_scores: list, max_reads: int, qual_offse
134
157
# TODO Adding this section to account for quality score "shape" in a fastq
135
158
# shape_curves.append(qualities_to_check)
136
159
137
- records_read += 1
138
-
139
160
for j in range (read_length ):
140
161
# The qualities of each read_position_scores
162
+ temp_q_count [j ][qualities_to_check [j ]] += 1
141
163
qual_score_counter [qualities_to_check [j ]] += 1
142
164
165
+ records_read += 1
166
+
143
167
if records_read % quarters == 0 :
144
168
_LOG .info (f'reading data: { (records_read / max_reads ) * 100 :.0f} %' )
145
169
146
- _LOG .info (f'reading data: 100% ' )
170
+ _LOG .info (f'Reading data: complete ' )
147
171
if end_of_file :
148
172
_LOG .info (f'{ records_read } records read before end of file.' )
149
- _LOG .debug (f'{ wrong_len } total reads had a length other than { read_length } ({ wrong_len / max_reads :.0f} %)' )
173
+ _LOG .debug (f'{ wrong_len } total reads had a length other than { read_length } ({ wrong_len / records_read :.0f} %)' )
150
174
151
175
avg_std_by_pos = []
152
176
q_count_by_pos = np .asarray (temp_q_count )
153
177
for i in range (read_length ):
154
178
this_counts = q_count_by_pos [i ]
155
179
expanded_counts = expand_counts (this_counts , quality_scores )
156
- average_q = np .average (expanded_counts )
157
- st_d_q = np .std (expanded_counts )
180
+ if len (expanded_counts ) == 0 :
181
+ _LOG .error (f"Position had no quality data: { i } " )
182
+ sys .exit (1 )
183
+ else :
184
+ average_q = np .average (expanded_counts )
185
+ st_d_q = np .std (expanded_counts )
158
186
avg_std_by_pos .append ((average_q , st_d_q ))
159
187
160
188
# TODO In progress, working on ensuring the error model produces the right shape
@@ -191,7 +219,7 @@ def plot_stuff(init_q, real_q, q_range, prob_q, actual_readlen, plot_path):
191
219
plt .figure (1 )
192
220
z = np .array (init_q ).T
193
221
x , y = np .meshgrid (range (0 , len (z [0 ]) + 1 ), range (0 , len (z ) + 1 ))
194
- plt .pcolormesh (x , Y , z , vmin = 0. , vmax = 0.25 )
222
+ plt .pcolormesh (x , y , z , vmin = 0. , vmax = 0.25 )
195
223
plt .axis ([0 , len (z [0 ]), 0 , len (z )])
196
224
plt .yticks (range (0 , len (z ), 10 ), range (0 , len (z ), 10 ))
197
225
plt .xticks (range (0 , len (z [0 ]), 10 ), range (0 , len (z [0 ]), 10 ))
0 commit comments