Skip to content

Commit

Permalink
avoid out of range array slice
Browse files Browse the repository at this point in the history
  • Loading branch information
Chien-Chi Lo committed Dec 23, 2020
1 parent 409edd6 commit 21d0578
Show file tree
Hide file tree
Showing 2 changed files with 269 additions and 1 deletion.
2 changes: 1 addition & 1 deletion amplicov/amplicov
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ def calculate_mean_cov_per_amplicon(cov_np_array,amplicon_d):
for key in amplicon_d:
start = amplicon_d[key][0]
end = amplicon_d[key][-1]
mean_dict[key] = cov_np_array[start:end].mean()
mean_dict[key] = 0 if start > cov_np_array.size else cov_np_array[start:end].mean()
return mean_dict

def write_dict_to_file(mean_d,uniq_mean_d,outdir,prefix):
Expand Down
268 changes: 268 additions & 0 deletions amplicov/amplicov_pdf
Original file line number Diff line number Diff line change
@@ -0,0 +1,268 @@
#! /usr/bin/env python3
import os, errno
import argparse as ap
import numpy as np
import csv
import subprocess
import plotly.graph_objects as go
from pathlib import Path
import pysam
from collections import defaultdict
import re

class SmartFormatter(ap.HelpFormatter):
def _split_lines(self, text, width):
if text.startswith('R|'):
return text[2:].splitlines()
# this is the RawTextHelpFormatter._split_lines
return ap.HelpFormatter._split_lines(self, text, width)

def setup_argparse():
parser = ap.ArgumentParser(prog='amplicov',
description = '''Script to parse amplicon region coverage and generate barplot in html''',
formatter_class = SmartFormatter)

inGrp = parser.add_argument_group('Amplicon Input (required, mutually exclusive)')
inGrp_me = inGrp.add_mutually_exclusive_group(required=True)
inGrp_me.add_argument('--bed', metavar='[FILE]', type=str,help='amplicon bed file')
inGrp_me.add_argument('--bedpe', metavar='[FILE]', type=str,help='amplicon bedpe file')

covGrp = parser.add_argument_group('Coverage Input (required, mutually exclusive)')
covGrp_me = covGrp.add_mutually_exclusive_group(required=True)
covGrp_me.add_argument('--bam', metavar='[FILE]', type=str,help='bam file')
covGrp_me.add_argument('--cov', metavar='[FILE]', type=str,help='coverage file [position\tcoverage]')

outGrp = parser.add_argument_group('Output')
outGrp.add_argument('-o', '--outdir', metavar='[PATH]',type=str, default='.', help='output directory')
outGrp.add_argument('-p', '--prefix', metavar='[STR]',type=str , help='output prefix')

parser.add_argument('--version', action='version', version='%(prog)s 0.1.1')
args_parsed = parser.parse_args()
if not args_parsed.outdir:
args_parsed.outdir = os.getcwd()

return args_parsed

def mkdir_p(directory_name):
try:
os.makedirs(directory_name)
except OSError as exc:
if exc.errno == errno.EEXIST and os.path.isdir(directory_name):
pass

def covert_bed_to_amplicon_dict(input,unique=False):
## convert bed file to amplicon region dictionary
input_bed = input
amplicon=defaultdict(dict)
if unique:
cmd = 'grep -v alt %s | paste - - | awk \'{print $4"\t"$2"\t"$9}\' | sed -e "s/_LEFT//g" -e "s/_RIGHT//g" ' % (input_bed)
else:
cmd = 'grep -v alt %s | paste - - | awk \'{print $4"\t"$3"\t"$8}\' | sed -e "s/_LEFT//g" -e "s/_RIGHT//g" ' % (input_bed)

proc = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE)
previous_id=''
for line in proc.stdout:
id, start, end = line.decode().rstrip().split("\t")
amplicon[id]=range(int(start),int(end)+1)
if (previous_id and unique):
set_new=set(amplicon[id])
set_previous=set(amplicon[previous_id])
set_new_update=sorted(set_new - set_previous)
set_previous_update=sorted(set_previous - set_new)
amplicon[id] = range(set_new_update[0],set_new_update[-1]+1)
amplicon[previous_id] = range(set_previous_update[0],set_previous_update[-1]+1)
previous_id = id

outs, errs = proc.communicate()
if proc.returncode != 0:
print("Failed %d %s %s" % (proc.returncode, outs, errs))

return amplicon

def covert_bedpe_to_amplicon_dict(input,unique=False):
## convert bed file to amplicon region dictionary
input_bedpe = input
amplicon=defaultdict(dict)
if unique:
cmd = 'awk \'{print $7"\t"$2"\t"$6}\' %s ' % (input_bedpe)
else:
cmd = 'awk \'{print $7"\t"$3"\t"$5}\' %s ' % (input_bedpe)
proc = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE)
previous_id=''
for line in proc.stdout:
id, start, end = line.decode().rstrip().split("\t")
amplicon[id]=range(int(start),int(end)+1)
if (previous_id and unique):
set_new=set(amplicon[id])
set_previous=set(amplicon[previous_id])
set_new_update=sorted(set_new - set_previous)
set_previous_update=sorted(set_previous - set_new)
amplicon[id] = range(set_new_update[0],set_new_update[-1]+1)
amplicon[previous_id] = range(set_previous_update[0],set_previous_update[-1]+1)
previous_id = id

outs, errs = proc.communicate()
if proc.returncode != 0:
print("Failed %d %s %s" % (proc.returncode, outs, errs))

return amplicon

def parse_cov_file(input):
## read the coverage and save in a list
cov_file = input
cov_list = []
with open(cov_file,'r') as cov_file:
for line in cov_file:
pos, cov = line.rstrip().split("\t")
cov_list.append(int(cov))

cov_array=np.array(cov_list)
return cov_array

def parse_bam_file(bam):
cov_list = []
for line in pysam.samtools.depth("-aa","-d0", "NC_045512.2.sort_sorted.bam",split_lines=True):
id, pos, cov = line.rstrip().split("\t")
cov_list.append(int(cov))
cov_array=np.array(cov_list)
return cov_array

def calculate_mean_cov_per_amplicon(cov_np_array,amplicon_d):
mean_dict=dict()
for key in amplicon_d:
start = amplicon_d[key][0]
end = amplicon_d[key][-1]
mean_dict[key] = cov_np_array[start:end].mean()
return mean_dict

def write_dict_to_file(mean_d,uniq_mean_d,outdir,prefix):
output_amplicon_cov_txt = outdir + os.sep + prefix + '_amplicon_coverage.txt'
try:
os.remove(output_amplicon_cov_txt)
except OSError:
pass
with open(output_amplicon_cov_txt,"w") as f:
f.write("ID\tWhole_Amplicon\tUnique\n")
for key in mean_d:
f.write("%s\t%.2f\t%.2f\n" % (key,mean_d[key],uniq_mean_d[key]))


def barplot(mean_dict,uniq_mean_d,input_bed,overall_mean,outdir,prefix):
#plot bar chart
x_name=Path(input_bed).stem
x=list(mean_dict.keys())
y=list(mean_dict.values())
uniq_x=list(uniq_mean_d.keys())
uniq_y=list(uniq_mean_d.values())
x_tick_names = [re.sub(r'nCoV-2019_', '', x_tick) for x_tick in uniq_x]
barcolor = ['lightsalmon' if y >= 20 else 'black' for y in uniq_y]
fig = go.Figure(data=[go.Bar(x=x_tick_names, y=uniq_y,marker_color=barcolor)])
#fig.add_trace(go.Bar(x=uniq_x,y=uniq_y,marker_color='lightsalmon',visible=False))
fig.update_layout(yaxis_type="log")
fig.update_xaxes(
tickfont=dict(family='Courier New, monospace', size=8),
ticks="outside",
tickwidth=0.5,
ticklen=3
)
fig.update_yaxes(
tickfont=dict(family='Courier New, monospace', size=8),
ticks="outside",
tickwidth=0.5,
ticklen=3
)
depthMean = [dict(type='line',
xref='paper',x0=0,x1=1,
yref='y',y0=overall_mean,y1=overall_mean,
line=dict(
color="red",
width=1,
dash='dot',
)
)]

depth5x = [dict(type='line',
xref='paper',x0=0,x1=1,
yref='y',y0=5,y1=5,
line=dict(
color="black",
width=1,
dash='dot',
)
)]
depth10x = [dict(type='line',
xref='paper',x0=0,x1=1,
yref='10',y0=10,y1=10,
line=dict(
color="black",
width=1,
dash='dot',
)
)]
depth20x = [dict(type='line',
xref='paper',x0=0,x1=1,
yref='y',y0=20,y1=20,
line=dict(
color="black",
width=1,
dash='dot',
)
)]



fig.update_layout(
yaxis_title="Mean Coverage(X)",
xaxis=dict(
title='Artic V1',
tickmode='linear'),
font=dict(
family="Courier New, monospace",
size=8,
),
shapes=[
dict(type='line',
xref='paper',x0=0,x1=1,
yref='y',y0=20,y1=20,
line=dict(
color="red",
width=1,
dash='dot',
),
),
]
)

output_html = outdir + os.sep + prefix + '_amplicon_coverage.html'
fig.write_html(output_html)
fig.write_image(outdir + os.sep + prefix+".svg")
fig.write_image(outdir + os.sep + prefix+".pdf",width=1280)
fig.write_image(outdir + os.sep + prefix+".eps")

def run(argvs):
if (argvs.bed):
amplicon_dict = covert_bed_to_amplicon_dict(argvs.bed)
uniq_amplicon_dict = covert_bed_to_amplicon_dict(argvs.bed,True)
bedfile = argvs.bed
if (argvs.bedpe):
amplicon_dict = covert_bedpe_to_amplicon_dict(argvs.bedpe)
uniq_amplicon_dict = covert_bedpe_to_amplicon_dict(argvs.bedpe)
bedfile = argvs.bedpe
if (argvs.cov):
cov_array = parse_cov_file(argvs.cov)
prefix = Path(argvs.cov).stem
if (argvs.bam):
cov_array = parse_bam_file(argvs.bam)
prefix = Path(argvs.bam).stem
if not argvs.prefix:
argvs.prefix = prefix

amplicon_mean_d = calculate_mean_cov_per_amplicon(cov_array,amplicon_dict)
uniq_amplicon_mean_d = calculate_mean_cov_per_amplicon(cov_array,uniq_amplicon_dict)
write_dict_to_file(amplicon_mean_d,uniq_amplicon_mean_d,argvs.outdir,argvs.prefix)
barplot(amplicon_mean_d,uniq_amplicon_mean_d,bedfile,cov_array.mean(),argvs.outdir,argvs.prefix)

if __name__ == '__main__':
argvs = setup_argparse()
mkdir_p(argvs.outdir)
run(argvs)

0 comments on commit 21d0578

Please sign in to comment.