Skip to content

Commit

Permalink
Adding HiFi sorting function
Browse files Browse the repository at this point in the history
  • Loading branch information
caaswxb committed Jul 28, 2023
1 parent 1b88759 commit 8b97103
Show file tree
Hide file tree
Showing 10 changed files with 329 additions and 126 deletions.
2 changes: 0 additions & 2 deletions .gitattributes

This file was deleted.

8 changes: 3 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,20 @@ covers more genomic regions of Y chromosome.
SRY identified male specific k-mers (MSK) based on population data. To decrease the impact of population structure and sequencing errors, SRY caculated MSK density of each long read and excludes those with lower marker density.

# Installation
We need samtools, seqtk and parallel in your environment, you can use conda to install them and execute "export PATH=$PATH:/env/conda/bin/".
We need kmc, samtools, seqtk and parallel in your environment, you can use conda to install them and execute "export PATH=$PATH:/env/conda/bin/".

# Usage
Usage: ./SRY (parameters with * must be supplied!)
Usage: ./SRY_k (parameters with * must be supplied!)
-m <string>* Male (or female for W) short-read files with comma separated in sample and plus separated between samples
-f <string>* Female (or male for W) short-read files with comma separated in sample and plus separated between samples
-l <string>* Long-read files of targeted genomes with comma separated, (support for both fa and fq)
-g <number>* Approximate genome size of targeted Y chromosome. (The unit is Mb,do NOT add the suffix "M")
-i <int> Minimum coverage of k-mers (default: 2)
-a <int> Maximum coverage of k-mers (default: unlimit)
-k <int> K-mer length (default: 21)
-p <int> CPU number (default: 1)
-h Help and exit.

SRY_k is used for sorting long reads based on kmer file, and SRY_contig is used for sorting contigs based on kmer file.
SRY_k is used for identifying specific k-mers. SRY_pb2ont is used for sorting PacBio CLR or Nanopore long reads based on kmer file, and SRY_hifi is used for sorting hifi (PacBio CCS) reads or contigs based on kmer file. combine_hifi_reads.pl is used to combine the sorted hifi reads when two or more k-mer sets (such as k=21 and k=51) are utilized.

# Paper

Expand Down
136 changes: 136 additions & 0 deletions SRY_hifi
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
#!/bin/bash

script_abs=$(readlink -f "$0")
script_dir=$(dirname $script_abs)
export PATH=$PATH:$script_dir/script
export LC_ALL=C

usageHelp="\n#################### Sorting contigs or hifi reads based on kmer file #############################\n\nUsage: ./${0##*/} (parameters with * must be supplied!)\n\n-i <string>* Specific-kmer file (output/SRY_kmer.txt generated by SRY)\n-l <string>* Long-read files of targeted genomes with comma separated (support for both fa and fq)\n-g <number>* Approximate genome size of targeted Y/W chromosome. such as 52 for human Y(The unit is Megabase)\n-k <int> K-mer length (default: 21)\n-p <int> CPU number (default: 1)\n-h Help and exit."
printHelpAndExit() {
echo -e "$usageHelp"
exit;
}

while getopts "i:l:g:k:p:h" opt; do
case $opt in
i) KMER=$OPTARG ;;
l) LREAD=$OPTARG ;;
g) GSIZE=$OPTARG ;;
k) KLEN=$OPTARG ;;
p) CPU=$OPTARG ;;
h) printHelpAndExit 0;;
esac
done

if [ -z "$KMER" ] || [ -z "$LREAD" ] || [ -z "$GSIZE" ];then
printHelpAndExit
exit
fi

if [ -z "$KLEN" ];then
KLEN=21
fi

if [ -z "$CPU" ];then
CPU=1
fi

if [ ! -d "output" ];then
mkdir output
fi

#judge if the softwares needed are installed.
if ! type kmc >/dev/null 2>&1; then
echo 'kmc is not installed!'
exit
fi
if ! type samtools >/dev/null 2>&1; then
echo 'samtools is not installed!'
exit
fi

if ! type seqtk >/dev/null 2>&1; then
echo 'seqtk is not installed!'
exit
fi

if ! type parallel >/dev/null 2>&1; then
echo 'parallel is not installed!'
exit
fi

#obtain specific-Y kmers

echo "***********************************Preparation*************************************"
#calculate specific k-mer density
NUM=`wc -l $KMER |awk 'BEGIN{ORS=""}{print $1}'`
echo "The number of specific k-mers is $NUM."
DENSITY=`perl -e '$c=sprintf ("%.1f",('$NUM'*1000/4)/('$GSIZE'*1000000)); print $c;'`
echo "The kmer length is $KLEN;Average specific k-mer density is $DENSITY per kb of contig or hifi reads."

echo -e "\n***********************************Process*****************************************"

#pre-process the input file
create_reverse_complement.pl $KMER > output/kmer_rc.txt
LREAD=${LREAD//,/ }
if [ ! -s output/changeid.fa.fai ];then
for f in $( ls $LREAD ); do seqtk seq -CA $f; done | tee >(grep '>' - |awk '{gsub(/^>/,"");print $0"\t"NR}'> output/origin2newid.txt) | seqtk rename - > output/changeid.fa
samtools faidx output/changeid.fa
fi

#split kmer
PART=`perl -e '$p=(int('$NUM'/100000)+1);print $p;'`
split -l 200000 output/kmer_rc.txt output/x
echo "Dividing the kmer file into $PART parts to decrease the memory. xaa,xab,xac..."

#sort kmer
for k in `ls output/x* | grep -v '\.'`;do echo "sort $k >$k.st";done > temp.sh
parallel -j $CPU <temp.sh
echo "Sort kmer has been done -> output/x*.st"

#search long reads contained specific kmers
for s in `ls output/x* | grep -v '\.'`;do echo "grep -i -F -b -n -o -f $s output/changeid.fa > $s.kmercount";done >temp.sh
parallel -j $CPU <temp.sh
echo "Kmer searching on long reads has been done -> output/x*.kmercount"

#obtain the position of specific kmers on each read
for c in `ls output/x* | grep -v '\.'`;do echo "get_loci.pl output/changeid.fa.fai $c.kmercount $KLEN > $c.loci";done >temp.sh
parallel -j $CPU <temp.sh
echo "Obtaining kmer position on long reads has been done -> output/x*.loci"

#get the extended sequences including specific kmers
for l in `ls output/x* | grep -v '\.'`;do echo "seqtk subseq output/changeid.fa $l.loci >$l.loci.fa";done >temp.sh
parallel -j $CPU <temp.sh
echo "Obtaining extended sequences of kmer on long reads has been done -> output/x*.loci.fa"

#acquire the kmers in sequences
for f in `ls output/x* | grep -v '\.' | xargs -i basename {}`;do echo "mkdir tmp_$f && kmc -t1 -m3 -hp -k$KLEN -fa output/$f.loci.fa tmp_$f tmp_$f && kmc_dump tmp_$f tmp_${f}.dump && sort -k1,1 tmp_${f}.dump >output/${f}.loci.kmer";done >temp.sh
parallel -j $CPU <temp.sh
echo "Obtaining kmers of extended sequences has been done -> output/x*.loci.kmer"

#filter kmers of last step
for lk in `ls output/x* | grep -v '\.'`;do echo "filterx -k s $lk.loci.kmer $lk.st |cut -f 1 > $lk.need.kmer";done >temp.sh
parallel -j $CPU <temp.sh
echo "Searching specific kmers from last step has been done -> output/x*.need.kmer"

#the relationship between kmer and read id
for lf in `ls output/x* | grep -v '\.'`;do echo "kmer2id.pl $lf.loci.fa $KLEN > $lf.loci.kmer2id";done >temp.sh
parallel -j $CPU <temp.sh
echo "Obtaining the relationship bewteen kmer and read has been done -> output/x*.loci.kmer2id"

#get the id
for li in `ls output/x* | grep -v '\.'`;do echo "find_candidate_id.pl $li.need.kmer $li.loci.kmer2id > $li.seqid";done >temp.sh
parallel -j $CPU <temp.sh
echo "Obtaining the IDs of reads including specific kmer -> output/x*.seqid"
rm temp.sh

#get reads
echo -e "\n***************************************Final****************************************"
cat output/x*.seqid >output/ALL.seqid
count_density.pl output/ALL.seqid output/changeid.fa.fai > output/ALL.density
select_reads.pl output/ALL.density $DENSITY output/origin2newid.txt output/changeid.fa
echo "DONE!"
echo "The final filtered contigs or hifi reads file of Y/W chromosome -> output/candidate_target.fa"
#echo "The final filtered contigs or hifi reads file of autosome and X/Z chromosome -> output/candidate_other.fa"

rm -r tmp_* output/x* output/kmer_rc.txt
117 changes: 38 additions & 79 deletions SRY_k
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,25 @@ script_dir=$(dirname $script_abs)
export PATH=$PATH:$script_dir/script
export LC_ALL=C

usageHelp="\n#################### Sorting pacbio or nanopore reads based on kmer file #############################\n\nUsage: ./${0##*/} (parameters with * must be supplied!)\n\n-i <string>* Specific-kmer file (output/SRY_kmer.txt generated by SRY)\n-l <string>* Long-read files of targeted genomes with comma separated (support for both fa and fq)\n-g <number>* Approximate genome size of targeted Y/W chromosome. such as 52 for human Y(The unit is Megabase)\n-k <int> K-mer length (default: 21)\n-p <int> CPU number (default: 1)\n-h Help and exit."
usageHelp="\nUsage: ./${0##*/} (parameters with * must be supplied!)\n\n-m <string>* Male (or female for W) short-read files with comma separated in sample and plus separated between samples\n-f <string>* Female (or male for W) short-read files with comma separated in sample and plus separated between samples\n-i <int> Minimum coverage of k-mers in each sample (default: 2)\n-a <int> Maximum coverage of k-mers in each sample (default: unlimit)\n-k <int> K-mer length (default: 21)\n-p <int> CPU number (default: 1)\n-h Help and exit."
printHelpAndExit() {
echo -e "$usageHelp"
exit;
}

while getopts "i:l:g:k:p:h" opt; do
while getopts "m:f:l:g:i:a:k:p:h" opt; do
case $opt in
i) KMER=$OPTARG ;;
l) LREAD=$OPTARG ;;
g) GSIZE=$OPTARG ;;
m) MSREAD=$OPTARG ;;
f) FSREAD=$OPTARG ;;
i) COV=$OPTARG ;;
a) MCOV=$OPTARG ;;
k) KLEN=$OPTARG ;;
p) CPU=$OPTARG ;;
h) printHelpAndExit 0;;
esac
done

if [ -z "$KMER" ] || [ -z "$LREAD" ] || [ -z "$GSIZE" ];then
if [ -z "$MSREAD" ] || [ -z "$FSREAD" ];then
printHelpAndExit
exit
fi
Expand All @@ -31,6 +32,14 @@ if [ -z "$KLEN" ];then
KLEN=21
fi

if [ -z "$COV" ];then
COV=2
fi

if [ -z "$MCOV" ];then
MCOV=0
fi

if [ -z "$CPU" ];then
CPU=1
fi
Expand All @@ -40,14 +49,20 @@ if [ ! -d "output" ];then
fi

#judge if the softwares needed are installed.

if ! type kmc >/dev/null 2>&1; then
echo 'kmc is not installed!'
exit
fi

if ! type samtools >/dev/null 2>&1; then
echo 'samtools is not installed!'
exit
echo 'samtools is not installed!'
exit
fi

if ! type seqtk >/dev/null 2>&1; then
echo 'seqtk is not installed!'
exit
echo 'seqtk is not installed!'
exit
fi

if ! type parallel >/dev/null 2>&1; then
Expand All @@ -58,72 +73,16 @@ fi
#obtain specific-Y kmers

echo "***********************************Preparation*************************************"
#calculate specific k-mer density
NUM=`wc -l $KMER |awk 'BEGIN{ORS=""}{print $1}'`
echo "The number of specific k-mers is $NUM."
CORRACT_RATE=0.85 #the correct rate of long reads from third sequencing
DENSITY=`perl -e '$c=sprintf ("%.1f",'${CORRACT_RATE}'**'$KLEN'*'$NUM'*1000/('$GSIZE'*1000000)); print $c;'`
echo "The kmer length is $KLEN;Average specific k-mer density is $DENSITY per kb of long reads."

echo -e "\n***********************************Process*****************************************"

#pre-process the input file
create_reverse_complement.pl $KMER > output/kmer_rc.txt
LREAD=${LREAD//,/ }
for f in $( ls $LREAD ); do seqtk seq -CA $f; done | tee >(grep '>' - |awk '{gsub(/^>/,"");print $0"\t"NR}'> output/origin2newid.txt) | seqtk rename - > output/changeid.fa
samtools faidx output/changeid.fa

#split kmer
PART=`perl -e '$p=(int('$NUM'/100000)+1);print $p;'`
split -l 200000 output/kmer_rc.txt output/x
echo "Dividing the kmer file into $PART parts to decrease the memory. xaa,xab,xac..."

#sort kmer
for k in `ls output/x* | grep -v '\.'`;do echo "sort $k >$k.st";done > temp.sh
parallel -j $CPU <temp.sh
echo "Sort kmer has been done -> output/x*.st"

#search long reads contained specific kmers
for s in `ls output/x* | grep -v '\.'`;do echo "grep -F -b -n -o -f $s output/changeid.fa > $s.kmercount";done >temp.sh
parallel -j $CPU <temp.sh
echo "Kmer searching on long reads has been done -> output/x*.kmercount"

#obtain the position of specific kmers on each read
for c in `ls output/x* | grep -v '\.'`;do echo "get_loci.pl output/changeid.fa.fai $c.kmercount $KLEN > $c.loci";done >temp.sh
parallel -j $CPU <temp.sh
echo "Obtaining kmer position on long reads has been done -> output/x*.loci"

#get the extended sequences including specific kmers
for l in `ls output/x* | grep -v '\.'`;do echo "seqtk subseq output/changeid.fa $l.loci >$l.loci.fa";done >temp.sh
parallel -j $CPU <temp.sh
echo "Obtaining extended sequences of kmer on long reads has been done -> output/x*.loci.fa"

#acquire the kmers in sequences
for f in `ls output/x* | grep -v '\.'`;do echo "kmer_count -l $KLEN -i $f.loci.fa -o $f.loci.kmer";done >temp.sh
parallel -j $CPU <temp.sh
echo "Obtaining kmers of extended sequences has been done -> output/x*.loci.kmer"

#filter kmers of last step
for lk in `ls output/x* | grep -v '\.'`;do echo "filterx -k s $lk.loci.kmer $lk.st |cut -f 1 > $lk.need.kmer";done >temp.sh
parallel -j $CPU <temp.sh
echo "Searching specific kmers from last step has been done -> output/x*.need.kmer"

#the relationship between kmer and read id
for lf in `ls output/x* | grep -v '\.'`;do echo "kmer2id.pl $lf.loci.fa $KLEN | sort -k1,1 -k2,2 -u > $lf.loci.kmer2id";done >temp.sh
parallel -j $CPU <temp.sh
echo "Obtaining the relationship bewteen kmer and read has been done -> output/x*.loci.kmer2id"

#get the id
for li in `ls output/x* | grep -v '\.'`;do echo "filterx -k s $li.loci.kmer2id:dup=Y $li.need.kmer | cut -f 2 > $li.seqid";done >temp.sh
parallel -j $CPU <temp.sh
echo "Obtaining the IDs of reads including specific kmer -> output/x*.seqid"
rm temp.sh

#get reads
echo -e "\n***************************************Final****************************************"
cat output/x*.seqid >output/ALL.seqid
count_density.pl output/ALL.seqid output/changeid.fa.fai > output/ALL.density
select_reads.pl output/ALL.density $DENSITY output/origin2newid.txt output/changeid.fa
echo "DONE!"
echo "The final filtered reads file of Y/W chromosome -> output/candidate_target.fa"
echo "The final filtered reads file of autosome and X/Z chromosome -> output/candidate_other.fa"
if [[ $MCOV == 0 ]]
then
echo "Min coverage:$COV;max coverage:unlimit"
else
echo "Min coverage:$COV;max coverage:$MCOV"
fi

call_kmc.pl $MSREAD $FSREAD $KLEN $COV $MCOV
parallel -j $CPU <output/kmer.sh
parallel -j 2 <output/ft.sh
KMER="output/SRY_kmer.txt"
filterx -k s -1 'cnt=1' output/Male_kmer.dat.gz:req=Y output/Female_kmer.dat.gz |cut -f 1 > $KMER
rm -r tmp_* output/M.kmer output/F.kmer
Loading

0 comments on commit 8b97103

Please sign in to comment.