From 21d0578f04cc48e963c599cc361b14e37b48427d Mon Sep 17 00:00:00 2001 From: Chien-Chi Lo Date: Wed, 23 Dec 2020 08:49:58 -0700 Subject: [PATCH] avoid out of range array slice --- amplicov/amplicov | 2 +- amplicov/amplicov_pdf | 268 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 269 insertions(+), 1 deletion(-) create mode 100755 amplicov/amplicov_pdf diff --git a/amplicov/amplicov b/amplicov/amplicov index 8085431..a21eb93 100755 --- a/amplicov/amplicov +++ b/amplicov/amplicov @@ -157,7 +157,7 @@ def calculate_mean_cov_per_amplicon(cov_np_array,amplicon_d): for key in amplicon_d: start = amplicon_d[key][0] end = amplicon_d[key][-1] - mean_dict[key] = cov_np_array[start:end].mean() + mean_dict[key] = 0 if start > cov_np_array.size else cov_np_array[start:end].mean() return mean_dict def write_dict_to_file(mean_d,uniq_mean_d,outdir,prefix): diff --git a/amplicov/amplicov_pdf b/amplicov/amplicov_pdf new file mode 100755 index 0000000..2d3595c --- /dev/null +++ b/amplicov/amplicov_pdf @@ -0,0 +1,268 @@ +#! /usr/bin/env python3 +import os, errno +import argparse as ap +import numpy as np +import csv +import subprocess +import plotly.graph_objects as go +from pathlib import Path +import pysam +from collections import defaultdict +import re + +class SmartFormatter(ap.HelpFormatter): + def _split_lines(self, text, width): + if text.startswith('R|'): + return text[2:].splitlines() + # this is the RawTextHelpFormatter._split_lines + return ap.HelpFormatter._split_lines(self, text, width) + +def setup_argparse(): + parser = ap.ArgumentParser(prog='amplicov', + description = '''Script to parse amplicon region coverage and generate barplot in html''', + formatter_class = SmartFormatter) + + inGrp = parser.add_argument_group('Amplicon Input (required, mutually exclusive)') + inGrp_me = inGrp.add_mutually_exclusive_group(required=True) + inGrp_me.add_argument('--bed', metavar='[FILE]', type=str,help='amplicon bed file') + inGrp_me.add_argument('--bedpe', metavar='[FILE]', type=str,help='amplicon bedpe file') + + covGrp = parser.add_argument_group('Coverage Input (required, mutually exclusive)') + covGrp_me = covGrp.add_mutually_exclusive_group(required=True) + covGrp_me.add_argument('--bam', metavar='[FILE]', type=str,help='bam file') + covGrp_me.add_argument('--cov', metavar='[FILE]', type=str,help='coverage file [position\tcoverage]') + + outGrp = parser.add_argument_group('Output') + outGrp.add_argument('-o', '--outdir', metavar='[PATH]',type=str, default='.', help='output directory') + outGrp.add_argument('-p', '--prefix', metavar='[STR]',type=str , help='output prefix') + + parser.add_argument('--version', action='version', version='%(prog)s 0.1.1') + args_parsed = parser.parse_args() + if not args_parsed.outdir: + args_parsed.outdir = os.getcwd() + + return args_parsed + +def mkdir_p(directory_name): + try: + os.makedirs(directory_name) + except OSError as exc: + if exc.errno == errno.EEXIST and os.path.isdir(directory_name): + pass + +def covert_bed_to_amplicon_dict(input,unique=False): + ## convert bed file to amplicon region dictionary + input_bed = input + amplicon=defaultdict(dict) + if unique: + cmd = 'grep -v alt %s | paste - - | awk \'{print $4"\t"$2"\t"$9}\' | sed -e "s/_LEFT//g" -e "s/_RIGHT//g" ' % (input_bed) + else: + cmd = 'grep -v alt %s | paste - - | awk \'{print $4"\t"$3"\t"$8}\' | sed -e "s/_LEFT//g" -e "s/_RIGHT//g" ' % (input_bed) + + proc = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE) + previous_id='' + for line in proc.stdout: + id, start, end = line.decode().rstrip().split("\t") + amplicon[id]=range(int(start),int(end)+1) + if (previous_id and unique): + set_new=set(amplicon[id]) + set_previous=set(amplicon[previous_id]) + set_new_update=sorted(set_new - set_previous) + set_previous_update=sorted(set_previous - set_new) + amplicon[id] = range(set_new_update[0],set_new_update[-1]+1) + amplicon[previous_id] = range(set_previous_update[0],set_previous_update[-1]+1) + previous_id = id + + outs, errs = proc.communicate() + if proc.returncode != 0: + print("Failed %d %s %s" % (proc.returncode, outs, errs)) + + return amplicon + +def covert_bedpe_to_amplicon_dict(input,unique=False): + ## convert bed file to amplicon region dictionary + input_bedpe = input + amplicon=defaultdict(dict) + if unique: + cmd = 'awk \'{print $7"\t"$2"\t"$6}\' %s ' % (input_bedpe) + else: + cmd = 'awk \'{print $7"\t"$3"\t"$5}\' %s ' % (input_bedpe) + proc = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE) + previous_id='' + for line in proc.stdout: + id, start, end = line.decode().rstrip().split("\t") + amplicon[id]=range(int(start),int(end)+1) + if (previous_id and unique): + set_new=set(amplicon[id]) + set_previous=set(amplicon[previous_id]) + set_new_update=sorted(set_new - set_previous) + set_previous_update=sorted(set_previous - set_new) + amplicon[id] = range(set_new_update[0],set_new_update[-1]+1) + amplicon[previous_id] = range(set_previous_update[0],set_previous_update[-1]+1) + previous_id = id + + outs, errs = proc.communicate() + if proc.returncode != 0: + print("Failed %d %s %s" % (proc.returncode, outs, errs)) + + return amplicon + +def parse_cov_file(input): + ## read the coverage and save in a list + cov_file = input + cov_list = [] + with open(cov_file,'r') as cov_file: + for line in cov_file: + pos, cov = line.rstrip().split("\t") + cov_list.append(int(cov)) + + cov_array=np.array(cov_list) + return cov_array + +def parse_bam_file(bam): + cov_list = [] + for line in pysam.samtools.depth("-aa","-d0", "NC_045512.2.sort_sorted.bam",split_lines=True): + id, pos, cov = line.rstrip().split("\t") + cov_list.append(int(cov)) + cov_array=np.array(cov_list) + return cov_array + +def calculate_mean_cov_per_amplicon(cov_np_array,amplicon_d): + mean_dict=dict() + for key in amplicon_d: + start = amplicon_d[key][0] + end = amplicon_d[key][-1] + mean_dict[key] = cov_np_array[start:end].mean() + return mean_dict + +def write_dict_to_file(mean_d,uniq_mean_d,outdir,prefix): + output_amplicon_cov_txt = outdir + os.sep + prefix + '_amplicon_coverage.txt' + try: + os.remove(output_amplicon_cov_txt) + except OSError: + pass + with open(output_amplicon_cov_txt,"w") as f: + f.write("ID\tWhole_Amplicon\tUnique\n") + for key in mean_d: + f.write("%s\t%.2f\t%.2f\n" % (key,mean_d[key],uniq_mean_d[key])) + + +def barplot(mean_dict,uniq_mean_d,input_bed,overall_mean,outdir,prefix): + #plot bar chart + x_name=Path(input_bed).stem + x=list(mean_dict.keys()) + y=list(mean_dict.values()) + uniq_x=list(uniq_mean_d.keys()) + uniq_y=list(uniq_mean_d.values()) + x_tick_names = [re.sub(r'nCoV-2019_', '', x_tick) for x_tick in uniq_x] + barcolor = ['lightsalmon' if y >= 20 else 'black' for y in uniq_y] + fig = go.Figure(data=[go.Bar(x=x_tick_names, y=uniq_y,marker_color=barcolor)]) + #fig.add_trace(go.Bar(x=uniq_x,y=uniq_y,marker_color='lightsalmon',visible=False)) + fig.update_layout(yaxis_type="log") + fig.update_xaxes( + tickfont=dict(family='Courier New, monospace', size=8), + ticks="outside", + tickwidth=0.5, + ticklen=3 + ) + fig.update_yaxes( + tickfont=dict(family='Courier New, monospace', size=8), + ticks="outside", + tickwidth=0.5, + ticklen=3 + ) + depthMean = [dict(type='line', + xref='paper',x0=0,x1=1, + yref='y',y0=overall_mean,y1=overall_mean, + line=dict( + color="red", + width=1, + dash='dot', + ) + )] + + depth5x = [dict(type='line', + xref='paper',x0=0,x1=1, + yref='y',y0=5,y1=5, + line=dict( + color="black", + width=1, + dash='dot', + ) + )] + depth10x = [dict(type='line', + xref='paper',x0=0,x1=1, + yref='10',y0=10,y1=10, + line=dict( + color="black", + width=1, + dash='dot', + ) + )] + depth20x = [dict(type='line', + xref='paper',x0=0,x1=1, + yref='y',y0=20,y1=20, + line=dict( + color="black", + width=1, + dash='dot', + ) + )] + + + + fig.update_layout( + yaxis_title="Mean Coverage(X)", + xaxis=dict( + title='Artic V1', + tickmode='linear'), + font=dict( + family="Courier New, monospace", + size=8, + ), + shapes=[ + dict(type='line', + xref='paper',x0=0,x1=1, + yref='y',y0=20,y1=20, + line=dict( + color="red", + width=1, + dash='dot', + ), + ), + ] + ) + + output_html = outdir + os.sep + prefix + '_amplicon_coverage.html' + fig.write_html(output_html) + fig.write_image(outdir + os.sep + prefix+".svg") + fig.write_image(outdir + os.sep + prefix+".pdf",width=1280) + fig.write_image(outdir + os.sep + prefix+".eps") + +def run(argvs): + if (argvs.bed): + amplicon_dict = covert_bed_to_amplicon_dict(argvs.bed) + uniq_amplicon_dict = covert_bed_to_amplicon_dict(argvs.bed,True) + bedfile = argvs.bed + if (argvs.bedpe): + amplicon_dict = covert_bedpe_to_amplicon_dict(argvs.bedpe) + uniq_amplicon_dict = covert_bedpe_to_amplicon_dict(argvs.bedpe) + bedfile = argvs.bedpe + if (argvs.cov): + cov_array = parse_cov_file(argvs.cov) + prefix = Path(argvs.cov).stem + if (argvs.bam): + cov_array = parse_bam_file(argvs.bam) + prefix = Path(argvs.bam).stem + if not argvs.prefix: + argvs.prefix = prefix + + amplicon_mean_d = calculate_mean_cov_per_amplicon(cov_array,amplicon_dict) + uniq_amplicon_mean_d = calculate_mean_cov_per_amplicon(cov_array,uniq_amplicon_dict) + write_dict_to_file(amplicon_mean_d,uniq_amplicon_mean_d,argvs.outdir,argvs.prefix) + barplot(amplicon_mean_d,uniq_amplicon_mean_d,bedfile,cov_array.mean(),argvs.outdir,argvs.prefix) + +if __name__ == '__main__': + argvs = setup_argparse() + mkdir_p(argvs.outdir) + run(argvs)