Skip to content

Commit

Permalink
Merge branch 'master' into oriel
Browse files Browse the repository at this point in the history
  • Loading branch information
KamilSJaron authored Sep 24, 2024
2 parents 146d1ca + fcaf74d commit 33b45db
Show file tree
Hide file tree
Showing 2 changed files with 129 additions and 1 deletion.
2 changes: 1 addition & 1 deletion R/generate_summary.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ generate_summary <- function(.args, .smudge_summary){
minimal_number_of_heterozygous_loci <- ceiling(sum(.smudge_summary$peak_sizes$abs_size[for_sure_heterozygous]) / .args$kmer_size)

smudge_summary(.args$output, "* Minimal number of heterozygous loci:", minimal_number_of_heterozygous_loci)
smudge_summary(.args$output, "Note: This number is NOT an estimate of the total number heterozygous loci, it's merly setting the lower boundary if the inference of heterozygosity peaks is correct.")
smudge_summary(.args$output, "Note: This number is NOT an estimate of the total number of heterozygous loci, it's merely setting the lower boundary if the inference of heterozygosity peaks is correct.")

smudge_summary(.args$output, "* Proportion of heterozygosity carried by pairs in different genome copies (table)")
tab_to_print <- data.frame( genome_copies = 2:max(.smudge_summary$peak_sizes$ploidy),
Expand Down
128 changes: 128 additions & 0 deletions exec/smudgeplot
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
#!/usr/bin/env python3

import argparse
import sys
import os
from math import log
from math import ceil
import numpy as np
from scipy.signal import argrelextrema

version = '0.2.0'

class parser():
def __init__(self):
argparser = argparse.ArgumentParser(
# description='Inference of ploidy and heterozygosity structure using whole genome sequencing data',
usage='''smudgeplot <task> [options] \n
tasks: cutoff Calculate meaningful values for lower/upper kmer histogram cutoff.
hetkmers Calculate unique kmer pairs from a Jellyfish or KMC dump file.
plot Generate 2d histogram; infere ploidy and plot a smudgeplot.\n\n''')
argparser.add_argument('task', help='Task to execute; for task specific options execute smudgeplot <task> -h')
argparser.add_argument('-v', '--version', action="store_true", default = False, help="print the version and exit")
# print version is a special case
if len(sys.argv) > 1:
if sys.argv[1] in ['-v', '--version']:
self.task = "version"
return
# the following line either prints help and die; or assign the name of task to variable task
self.task = argparser.parse_args([sys.argv[1]]).task
else:
self.task = ""
# if the task is known (i.e. defined in this file);
if hasattr(self, self.task):
# load arguments of that task
getattr(self, self.task)()
else:
argparser.print_usage()
print('"' + self.task + '" is not a valid task name')
exit(1)

def hetkmers(self):
'''
Calculate unique kmer pairs from a Jellyfish or KMC dump file.
'''
argparser = argparse.ArgumentParser(prog = 'smudgeplot hetkmers',
description='Calculate unique kmer pairs from a Jellyfish or KMC dump file.')
argparser.add_argument('infile', nargs='?', type=argparse.FileType('r'), default=sys.stdin, help='Alphabetically sorted Jellyfish or KMC dump file (stdin).')
argparser.add_argument('-o', help='The pattern used to name the output (kmerpairs).', default='kmerpairs')
argparser.add_argument('-k', help='The length of the kmer.', default=21)
argparser.add_argument('-t', help='Number of processes to use.', default = 4)
argparser.add_argument('--middle', dest='middle', action='store_const', const = True, default = False,
help='Get all kmer pairs one SNP away from each other (default: just the middle one).')
self.arguments = argparser.parse_args(sys.argv[2:])

def plot(self):
'''
Generate 2d histogram; infer ploidy and plot a smudgeplot.
'''
argparser = argparse.ArgumentParser(prog = 'smudgeplot plot', description='Generate 2d histogram for smudgeplot')
argparser.add_argument('infile', nargs='?', type=argparse.FileType('r'), default=sys.stdin, help='name of the input tsv file with covarages (default \"coverages_2.tsv\")."')
argparser.add_argument('-o', help='The pattern used to name the output (smudgeplot).', default='smudgeplot')
argparser.add_argument('-q', help='Remove kmer pairs with coverage over the specified quantile; (default none).', type=float, default=1)
argparser.add_argument('-L', help='The lower boundary used when dumping kmers (default min(total_pair_cov) / 2).', type=int, default=0)
argparser.add_argument('-n', help='The expected haploid coverage (default estimated from data).', type=int, default=0)
argparser.add_argument('-t', '--title', help='name printed at the top of the smudgeplot (default none).', default='')
argparser.add_argument('-m', '-method', help='The algorithm for annotation of smudges (default \'local_aggregation\')', default='local_aggregation')
argparser.add_argument('-nbins', help='The number of nbins used for smudgeplot matrix (nbins x nbins) (default autodetection).', type=int, default=0)
# argparser.add_argument('-k', help='The length of the kmer.', default=21)
argparser.add_argument('-kmer_file', help='Name of the input files containing kmer seuqences (assuming the same order as in the coverage file)', default = "")
argparser.add_argument('--homozygous', action="store_true", default = False, help="Assume no heterozygosity in the genome - plotting a paralog structure; (default False).")
self.arguments = argparser.parse_args(sys.argv[2:])

def cutoff(self):
'''
Calculate meaningful values for lower/upper kmer histogram cutoff.
'''
argparser = argparse.ArgumentParser(prog = 'smudgeplot cutoff', description='Calculate meaningful values for lower/upper kmer histogram cutoff.')
argparser.add_argument('infile', type=argparse.FileType('r'), help='Name of the input kmer histogram file (default \"kmer.hist\")."')
argparser.add_argument('boundary', help='Which bounary to compute L (lower, default) or U (upper)', default = 'L')
self.arguments = argparser.parse_args(sys.argv[2:])


def round_up_nice(x):
digits = ceil(log(x, 10))
if digits <= 1:
multiplier = 10 ** (digits - 1)
else:
multiplier = 10 ** (digits - 2)
return(ceil(x / multiplier) * multiplier)

def cutoff(args):
# kmer_hist = open("data/Mflo2/kmer.hist","r")
kmer_hist = args.infile
hist = np.array([int(line.split()[1]) for line in kmer_hist])
if args.boundary == "L":
local_minima = argrelextrema(hist, np.less)[0][0]
L = max(10, int(round(local_minima * 1.25)))
print(L, end = '')
else:
# take 99.8 quantile of kmers that are more than one in the read set
hist_rel_cumsum = np.cumsum(hist[1:]) / np.sum(hist[1:])
U = round_up_nice(np.argmax(hist_rel_cumsum > 0.998))
print(U, end = '')

def main():
_parser = parser()

print('Running smudgeplot v' + version)
if _parser.task == "version":
exit(0)

print('Task: ' + _parser.task)

if _parser.task == "cutoff":
cutoff(_parser.arguments)

# if _parser.task == "hetkmers":
# hetkmers(_parser.arguments)
#
# if _parser.task == "plot":
# call .R script
# plot(_parser.arguments)

print('Done!')
exit(0)

if __name__=='__main__':
main()

0 comments on commit 33b45db

Please sign in to comment.