-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
10 changed files
with
329 additions
and
126 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
#!/bin/bash | ||
|
||
script_abs=$(readlink -f "$0") | ||
script_dir=$(dirname $script_abs) | ||
export PATH=$PATH:$script_dir/script | ||
export LC_ALL=C | ||
|
||
usageHelp="\n#################### Sorting contigs or hifi reads based on kmer file #############################\n\nUsage: ./${0##*/} (parameters with * must be supplied!)\n\n-i <string>* Specific-kmer file (output/SRY_kmer.txt generated by SRY)\n-l <string>* Long-read files of targeted genomes with comma separated (support for both fa and fq)\n-g <number>* Approximate genome size of targeted Y/W chromosome. such as 52 for human Y(The unit is Megabase)\n-k <int> K-mer length (default: 21)\n-p <int> CPU number (default: 1)\n-h Help and exit." | ||
printHelpAndExit() { | ||
echo -e "$usageHelp" | ||
exit; | ||
} | ||
|
||
while getopts "i:l:g:k:p:h" opt; do | ||
case $opt in | ||
i) KMER=$OPTARG ;; | ||
l) LREAD=$OPTARG ;; | ||
g) GSIZE=$OPTARG ;; | ||
k) KLEN=$OPTARG ;; | ||
p) CPU=$OPTARG ;; | ||
h) printHelpAndExit 0;; | ||
esac | ||
done | ||
|
||
if [ -z "$KMER" ] || [ -z "$LREAD" ] || [ -z "$GSIZE" ];then | ||
printHelpAndExit | ||
exit | ||
fi | ||
|
||
if [ -z "$KLEN" ];then | ||
KLEN=21 | ||
fi | ||
|
||
if [ -z "$CPU" ];then | ||
CPU=1 | ||
fi | ||
|
||
if [ ! -d "output" ];then | ||
mkdir output | ||
fi | ||
|
||
#judge if the softwares needed are installed. | ||
if ! type kmc >/dev/null 2>&1; then | ||
echo 'kmc is not installed!' | ||
exit | ||
fi | ||
if ! type samtools >/dev/null 2>&1; then | ||
echo 'samtools is not installed!' | ||
exit | ||
fi | ||
|
||
if ! type seqtk >/dev/null 2>&1; then | ||
echo 'seqtk is not installed!' | ||
exit | ||
fi | ||
|
||
if ! type parallel >/dev/null 2>&1; then | ||
echo 'parallel is not installed!' | ||
exit | ||
fi | ||
|
||
#obtain specific-Y kmers | ||
|
||
echo "***********************************Preparation*************************************" | ||
#calculate specific k-mer density | ||
NUM=`wc -l $KMER |awk 'BEGIN{ORS=""}{print $1}'` | ||
echo "The number of specific k-mers is $NUM." | ||
DENSITY=`perl -e '$c=sprintf ("%.1f",('$NUM'*1000/4)/('$GSIZE'*1000000)); print $c;'` | ||
echo "The kmer length is $KLEN;Average specific k-mer density is $DENSITY per kb of contig or hifi reads." | ||
|
||
echo -e "\n***********************************Process*****************************************" | ||
|
||
#pre-process the input file | ||
create_reverse_complement.pl $KMER > output/kmer_rc.txt | ||
LREAD=${LREAD//,/ } | ||
if [ ! -s output/changeid.fa.fai ];then | ||
for f in $( ls $LREAD ); do seqtk seq -CA $f; done | tee >(grep '>' - |awk '{gsub(/^>/,"");print $0"\t"NR}'> output/origin2newid.txt) | seqtk rename - > output/changeid.fa | ||
samtools faidx output/changeid.fa | ||
fi | ||
|
||
#split kmer | ||
PART=`perl -e '$p=(int('$NUM'/100000)+1);print $p;'` | ||
split -l 200000 output/kmer_rc.txt output/x | ||
echo "Dividing the kmer file into $PART parts to decrease the memory. xaa,xab,xac..." | ||
|
||
#sort kmer | ||
for k in `ls output/x* | grep -v '\.'`;do echo "sort $k >$k.st";done > temp.sh | ||
parallel -j $CPU <temp.sh | ||
echo "Sort kmer has been done -> output/x*.st" | ||
|
||
#search long reads contained specific kmers | ||
for s in `ls output/x* | grep -v '\.'`;do echo "grep -i -F -b -n -o -f $s output/changeid.fa > $s.kmercount";done >temp.sh | ||
parallel -j $CPU <temp.sh | ||
echo "Kmer searching on long reads has been done -> output/x*.kmercount" | ||
|
||
#obtain the position of specific kmers on each read | ||
for c in `ls output/x* | grep -v '\.'`;do echo "get_loci.pl output/changeid.fa.fai $c.kmercount $KLEN > $c.loci";done >temp.sh | ||
parallel -j $CPU <temp.sh | ||
echo "Obtaining kmer position on long reads has been done -> output/x*.loci" | ||
|
||
#get the extended sequences including specific kmers | ||
for l in `ls output/x* | grep -v '\.'`;do echo "seqtk subseq output/changeid.fa $l.loci >$l.loci.fa";done >temp.sh | ||
parallel -j $CPU <temp.sh | ||
echo "Obtaining extended sequences of kmer on long reads has been done -> output/x*.loci.fa" | ||
|
||
#acquire the kmers in sequences | ||
for f in `ls output/x* | grep -v '\.' | xargs -i basename {}`;do echo "mkdir tmp_$f && kmc -t1 -m3 -hp -k$KLEN -fa output/$f.loci.fa tmp_$f tmp_$f && kmc_dump tmp_$f tmp_${f}.dump && sort -k1,1 tmp_${f}.dump >output/${f}.loci.kmer";done >temp.sh | ||
parallel -j $CPU <temp.sh | ||
echo "Obtaining kmers of extended sequences has been done -> output/x*.loci.kmer" | ||
|
||
#filter kmers of last step | ||
for lk in `ls output/x* | grep -v '\.'`;do echo "filterx -k s $lk.loci.kmer $lk.st |cut -f 1 > $lk.need.kmer";done >temp.sh | ||
parallel -j $CPU <temp.sh | ||
echo "Searching specific kmers from last step has been done -> output/x*.need.kmer" | ||
|
||
#the relationship between kmer and read id | ||
for lf in `ls output/x* | grep -v '\.'`;do echo "kmer2id.pl $lf.loci.fa $KLEN > $lf.loci.kmer2id";done >temp.sh | ||
parallel -j $CPU <temp.sh | ||
echo "Obtaining the relationship bewteen kmer and read has been done -> output/x*.loci.kmer2id" | ||
|
||
#get the id | ||
for li in `ls output/x* | grep -v '\.'`;do echo "find_candidate_id.pl $li.need.kmer $li.loci.kmer2id > $li.seqid";done >temp.sh | ||
parallel -j $CPU <temp.sh | ||
echo "Obtaining the IDs of reads including specific kmer -> output/x*.seqid" | ||
rm temp.sh | ||
|
||
#get reads | ||
echo -e "\n***************************************Final****************************************" | ||
cat output/x*.seqid >output/ALL.seqid | ||
count_density.pl output/ALL.seqid output/changeid.fa.fai > output/ALL.density | ||
select_reads.pl output/ALL.density $DENSITY output/origin2newid.txt output/changeid.fa | ||
echo "DONE!" | ||
echo "The final filtered contigs or hifi reads file of Y/W chromosome -> output/candidate_target.fa" | ||
#echo "The final filtered contigs or hifi reads file of autosome and X/Z chromosome -> output/candidate_other.fa" | ||
|
||
rm -r tmp_* output/x* output/kmer_rc.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.