Skip to content

Commit b44e858

Browse files
committed
Merged commit of all releases to this date:
### First release for megSAP - added support for straglr 1.5.0 - changed output folder structure to match ExpansionHunter/REViewer output ### Plot modification - changed the output format of plots - modified size of plots - added pathogenic threshold in plots ### Bugfix: Deterministic Plots - set a constant hashsalt (seed) to make SVG plot (node names) deterministic ### Bugfix: Fixed length calculation of wt/pathogenic threshold - The length of pathogenic range and wildtype is now calculated based on the reference motif since the repeat unit length may change in straglr 1.5.1.
1 parent 791f99b commit b44e858

10 files changed

+228
-233
lines changed

.gitignore

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
.vscode
2+
.idea
3+
src/utils/__pycache__
4+
src/__pycache__
5+
src/utils/.vscode

src/straglron.py

+22-35
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import utils.extract_repeats as extract_repeats
99
import shutil
1010

11+
1112
def is_valid_file(parser, arg, type):
1213
"""
1314
Check if the provided file exists and has a .bed or .tsv extension.
@@ -29,28 +30,28 @@ def is_valid_file(parser, arg, type):
2930
return arg
3031

3132

32-
#Argument parser for command line interface including necessary file and options
33+
# Argument parser for command line interface including necessary file and options
3334

3435
parser = argparse.ArgumentParser()
3536

36-
#These arguments are always required
37+
# These arguments are always required
3738

3839
parser.add_argument("path_input_bed", type=lambda x: is_valid_file(parser, x, "bed"), help="Path to input bed file")
3940
parser.add_argument("path_input_tsv", type=lambda x: is_valid_file(parser, x, "tsv"), help="Path to input tsv file")
4041
parser.add_argument("loci_file", type=lambda x: is_valid_file(parser, x, "bed"), help="Path to Loci file used for straglr analysis")
4142
parser.add_argument("-o", "--output", type=str, required=True, help="Path to output folder")
4243

43-
#These Arguments are optional and produce histograms for each locus and sort the output .txt file based on a normalized increase in repeats respectively
44+
# These Arguments are optional and produce histograms for each locus and sort the output .txt file based on a normalized increase in repeats respectively
4445

4546
parser.add_argument("--hist", action="store_true", help="Plots histograms of pathogenic expansions")
4647
parser.add_argument("--score", action="store_true", help="Expansion in output file is sorted by normalized size difference score")
4748

48-
#Activates new clustering method
49+
# Activates new clustering method
4950

5051
parser.add_argument("--altclust", action="store_true", help="Uses Thomas Clustering")
5152
parser.add_argument("-c", "--cutoff", type=int, default=2, help="Sets number of reads cutoff when clustering unimodal or bimodal allele read frequencies")
5253

53-
#These arguments are required for producing the allele length visualization
54+
# These arguments are required for producing the allele length visualization
5455

5556
parser.add_argument("--alleles", action="store_true", help="Turns on the allele visualization")
5657
parser.add_argument("--bam", type=str, help="Location of Bam file of interest")
@@ -59,76 +60,63 @@ def is_valid_file(parser, arg, type):
5960

6061
args = parser.parse_args()
6162

63+
6264
def outputWriter(output_expansion: "list[Expansion]", sample_id, args):
6365

64-
#Base output here
65-
sample_folder_path = args.output + "/" + sample_id
66-
result_file_path = sample_folder_path + "/" + sample_id + ".txt"
67-
if not (os.path.exists(sample_folder_path)):
68-
os.mkdir(sample_folder_path)
66+
# Base output here
67+
result_file_path = args.output + "/" + sample_id + ".tsv"
68+
if not (os.path.exists(args.output)):
69+
os.mkdir(args.output)
6970

7071
with open(result_file_path, 'w') as f:
7172
print("Writing result file ...")
7273
f.write("#chr\tstart\tend\trepeat_id\trepeat_unit\tcopy_number\tsize\twt_size\tin_pathogenic_range\tsize_difference\tallele1_support\tallele2_support\tscore\n")
7374

7475
if args.altclust:
75-
7676
for x in output_expansion:
7777
f.write("{}\t{}\t{}\t{}\t{:.1f}/{:.1f}\t{}/{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(x.chr, x.start, x.end, x.repeat_id, (x.new_allele1/len(x.repeat_unit)), (x.new_allele2/len(x.repeat_unit)), x.new_allele1, x.new_allele2, x.wt_size, x.new_in_pathogenic_range, x.new_size_difference, x.new_allele1_support, x.new_allele2_support,x.new_norm_score))
78-
7978
else:
80-
8179
for x in output_expansion:
8280
f.write("{}\t{}\t{}\t{}\t{:.1f}/{:.1f}\t{}/{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(x.chr, x.start, x.end, x.repeat_id, (x.allele1_size/len(x.repeat_unit)), (x.allele2_size/len(x.repeat_unit)), x.allele1_size, x.allele2_size, x.wt_size, x.in_pathogenic_range, x.size_difference, x.allele1_support, x.allele2_support,x.norm_score))
8381

8482
print("Result file completed.")
8583

86-
#Base output plus Histograms
84+
# Base output plus Histograms
8785
if args.hist:
88-
plot_folder_path = sample_folder_path + "/plots"
89-
90-
if(os.path.exists(plot_folder_path)):
91-
shutil.rmtree(plot_folder_path)
92-
os.mkdir(plot_folder_path)
93-
9486
print("Generating read distribution histograms ...")
9587

9688
if args.altclust:
9789
for x in output_expansion:
98-
vis.plotHistogram(x, plot_folder_path, args.altclust)
99-
90+
vis.plotHistogram(x, args.output, args.altclust)
10091
else:
10192
for x in output_expansion:
102-
vis.plotHistogram(x, plot_folder_path, args.altclust)
93+
vis.plotHistogram(x, args.output, args.altclust)
10394

10495
print("Histograms completed.")
10596

10697

107-
#Base with Allele Visualization
98+
# Base with Allele Visualization
10899
if args.alleles:
109-
repeats_fasta_folder = sample_folder_path + "/" + "repeats"
110-
111-
if(os.path.exists(repeats_fasta_folder)):
112-
shutil.rmtree(repeats_fasta_folder)
113-
os.mkdir(repeats_fasta_folder)
114-
115100
print("Generating allele composition graphs ...")
116101

117102
for x in output_expansion:
118-
extract_repeats.fastaMaker(args.path_input_tsv, x.chr + ":"+ x.start + "-" + x.end, args.bam, args.flank, repeats_fasta_folder + "/" + x._title + ".fa")
119-
vis.alleleVisualiser(repeats_fasta_folder + "/" + x._title + ".fa", x.repeat_unit, args.flank, x._title, repeats_fasta_folder, x.chr, x.start, x.end, args.genome)
103+
extract_repeats.fastaMaker(args.path_input_tsv, x.chr + ":"+ x.start + "-" + x.end, args.bam, args.flank, args.output + "/" + x._title + ".fa")
104+
vis.alleleVisualiser(args.output + "/" + x._title + ".fa", x.repeat_unit, args.flank, x._title, args.output, x.chr, x.start, x.end, args.genome)
120105

121106
print("Allele composition graphs completed.")
122-
107+
108+
123109
def main():
124110

125111
loci_dict = ra.lociBedReader(args.loci_file)
126112
expansions = ra.resultBedReader(args.path_input_bed, loci_dict)
127113
sample_id = Path(args.path_input_bed).stem
128-
114+
129115
vis.getHistData(args.path_input_tsv, expansions)
130116

131117
for expansion_object in expansions:
118+
if __name__ == '__main__':
119+
expansion_object
132120
if args.altclust:
133121
ra.newGenotyping(expansion_object, args.cutoff, False)
134122

@@ -142,7 +130,6 @@ def main():
142130
#Sorts expansion by chromosome
143131
expansions.sort(key=lambda x: x.chr)
144132

145-
146133
outputWriter(expansions, sample_id, args)
147134

148135
if __name__ == "__main__":

0 commit comments

Comments
 (0)