Adding HiFi sorting function

caaswxb · Jul 28, 2023 · 8b97103 · 8b97103
1 parent 1b88759
commit 8b97103
Show file tree

Hide file tree

Showing 10 changed files with 329 additions and 126 deletions.
diff --git a/.gitattributes b/.gitattributes
diff --git a/README.md b/README.md
@@ -6,22 +6,20 @@ covers more genomic regions  of Y chromosome.
 SRY identified male specific k-mers (MSK) based on population data. To decrease the impact of population structure and sequencing errors, SRY caculated MSK density of each long read and excludes those with lower marker density.
 
 # Installation
-We need samtools, seqtk and parallel in your environment, you can use conda to install them and execute "export PATH=$PATH:/env/conda/bin/".
+We need kmc, samtools, seqtk and parallel in your environment, you can use conda to install them and execute "export PATH=$PATH:/env/conda/bin/".
 
 # Usage
-     Usage: ./SRY (parameters with * must be supplied!)
+     Usage: ./SRY_k (parameters with * must be supplied!)
      
         -m <string>*     Male (or female for W) short-read files with comma separated in sample and plus separated between samples
         -f <string>*     Female (or male for W) short-read files with comma separated in sample and plus separated between samples
-        -l <string>*     Long-read files of targeted genomes with comma separated, (support for both fa and fq)
-        -g <number>*     Approximate genome size of targeted Y chromosome. (The unit is Mb,do NOT add the suffix "M")
         -i <int>         Minimum coverage of k-mers (default: 2)
         -a <int>         Maximum coverage of k-mers (default: unlimit)
         -k <int>         K-mer length (default: 21)
         -p <int>         CPU number (default: 1)
         -h               Help and exit.
 
-SRY_k is used for sorting long reads based on kmer file, and SRY_contig is used for sorting contigs based on kmer file.
+SRY_k is used for identifying specific k-mers. SRY_pb2ont is used for sorting PacBio CLR or Nanopore long reads based on kmer file, and SRY_hifi is used for sorting hifi (PacBio CCS) reads or contigs based on kmer file. combine_hifi_reads.pl is used to combine the sorted hifi reads when two or more k-mer sets (such as k=21 and k=51) are utilized.
 
 # Paper
 

diff --git a/SRY_hifi b/SRY_hifi
@@ -0,0 +1,136 @@
+#!/bin/bash
+
+script_abs=$(readlink -f "$0")
+script_dir=$(dirname $script_abs)
+export PATH=$PATH:$script_dir/script
+export LC_ALL=C
+
+usageHelp="\n#################### Sorting contigs or hifi reads based on kmer file #############################\n\nUsage: ./${0##*/} (parameters with * must be supplied!)\n\n-i <string>*       Specific-kmer file (output/SRY_kmer.txt generated by SRY)\n-l <string>*       Long-read files of targeted genomes with comma separated (support for both fa and fq)\n-g <number>*       Approximate genome size of targeted Y/W chromosome. such as 52 for human Y(The unit is Megabase)\n-k <int>           K-mer length (default: 21)\n-p <int>           CPU number (default: 1)\n-h                 Help and exit."
+printHelpAndExit() {
+	echo -e "$usageHelp"
+	exit;
+}
+
+while getopts "i:l:g:k:p:h" opt; do
+	case $opt in
+		i) KMER=$OPTARG ;;
+		l) LREAD=$OPTARG ;;
+		g) GSIZE=$OPTARG ;;
+		k) KLEN=$OPTARG ;;
+		p) CPU=$OPTARG ;;
+		h) printHelpAndExit 0;;
+	esac
+done
+
+if [ -z "$KMER" ] || [ -z "$LREAD" ] || [ -z "$GSIZE" ];then
+	printHelpAndExit
+	exit
+fi
+
+if [ -z "$KLEN" ];then
+	KLEN=21
+fi
+
+if [ -z "$CPU" ];then
+        CPU=1
+fi
+
+if [ ! -d "output" ];then
+	mkdir output
+fi
+
+#judge if the softwares needed are installed.
+if ! type kmc >/dev/null 2>&1; then
+        echo 'kmc is not installed!'
+        exit
+fi
+if ! type samtools >/dev/null 2>&1; then
+        echo 'samtools is not installed!'
+        exit
+fi
+
+if ! type seqtk >/dev/null 2>&1; then
+        echo 'seqtk is not installed!'
+        exit
+fi
+
+if ! type parallel >/dev/null 2>&1; then
+        echo 'parallel is not installed!'
+        exit
+fi
+
+#obtain specific-Y kmers
+
+echo "***********************************Preparation*************************************"
+#calculate specific k-mer density
+NUM=`wc -l $KMER |awk 'BEGIN{ORS=""}{print $1}'`
+echo "The number of specific k-mers is $NUM."
+DENSITY=`perl -e '$c=sprintf ("%.1f",('$NUM'*1000/4)/('$GSIZE'*1000000)); print $c;'`
+echo "The kmer length is $KLEN;Average specific k-mer density is $DENSITY per kb of contig or hifi reads."
+
+echo -e "\n***********************************Process*****************************************"
+
+#pre-process the input file
+create_reverse_complement.pl $KMER > output/kmer_rc.txt
+LREAD=${LREAD//,/ }
+if [ ! -s output/changeid.fa.fai ];then
+	for f in $( ls $LREAD ); do seqtk seq -CA $f; done | tee >(grep '>' - |awk '{gsub(/^>/,"");print $0"\t"NR}'> output/origin2newid.txt) | seqtk rename - > output/changeid.fa
+	samtools faidx output/changeid.fa
+fi
+
+#split kmer
+PART=`perl -e '$p=(int('$NUM'/100000)+1);print $p;'`
+split -l 200000 output/kmer_rc.txt output/x
+echo "Dividing the kmer file into $PART parts to decrease the memory. xaa,xab,xac..."
+
+#sort kmer
+for k in `ls output/x* | grep -v '\.'`;do echo "sort $k >$k.st";done > temp.sh
+parallel -j $CPU <temp.sh
+echo "Sort kmer has been done -> output/x*.st"
+
+#search long reads contained specific kmers
+for s in `ls output/x* | grep -v '\.'`;do echo "grep -i -F -b -n -o -f $s output/changeid.fa > $s.kmercount";done >temp.sh
+parallel -j $CPU <temp.sh
+echo "Kmer searching on long reads has been done -> output/x*.kmercount"
+
+#obtain the position of specific kmers on each read
+for c in `ls output/x* | grep -v '\.'`;do echo "get_loci.pl output/changeid.fa.fai $c.kmercount $KLEN > $c.loci";done >temp.sh
+parallel -j $CPU <temp.sh
+echo "Obtaining kmer position on long reads has been done -> output/x*.loci"
+
+#get the extended sequences including specific kmers
+for l in `ls output/x* | grep -v '\.'`;do echo "seqtk subseq output/changeid.fa $l.loci >$l.loci.fa";done >temp.sh
+parallel -j $CPU <temp.sh
+echo "Obtaining extended sequences of kmer on long reads has been done -> output/x*.loci.fa"
+
+#acquire the kmers in sequences
+for f in `ls output/x* | grep -v '\.' | xargs -i basename {}`;do echo "mkdir tmp_$f && kmc -t1 -m3 -hp -k$KLEN -fa output/$f.loci.fa tmp_$f tmp_$f && kmc_dump tmp_$f tmp_${f}.dump && sort -k1,1 tmp_${f}.dump >output/${f}.loci.kmer";done >temp.sh
+parallel -j $CPU <temp.sh
+echo "Obtaining kmers of extended sequences has been done -> output/x*.loci.kmer"
+
+#filter kmers of last step
+for lk in `ls output/x* | grep -v '\.'`;do echo "filterx -k s $lk.loci.kmer $lk.st |cut -f 1 > $lk.need.kmer";done >temp.sh
+parallel -j $CPU <temp.sh
+echo "Searching specific kmers from last step has been done -> output/x*.need.kmer"
+
+#the relationship between kmer and read id
+for lf in `ls output/x* | grep -v '\.'`;do echo "kmer2id.pl $lf.loci.fa $KLEN > $lf.loci.kmer2id";done >temp.sh
+parallel -j $CPU <temp.sh
+echo "Obtaining the relationship bewteen kmer and read has been done -> output/x*.loci.kmer2id"
+
+#get the id
+for li in `ls output/x* | grep -v '\.'`;do echo "find_candidate_id.pl $li.need.kmer $li.loci.kmer2id > $li.seqid";done >temp.sh
+parallel -j $CPU <temp.sh
+echo "Obtaining the IDs of reads including specific kmer -> output/x*.seqid"
+rm temp.sh
+
+#get reads
+echo -e "\n***************************************Final****************************************"
+cat output/x*.seqid >output/ALL.seqid
+count_density.pl output/ALL.seqid output/changeid.fa.fai > output/ALL.density
+select_reads.pl output/ALL.density $DENSITY output/origin2newid.txt output/changeid.fa
+echo "DONE!"
+echo "The final filtered contigs or hifi reads file of Y/W chromosome -> output/candidate_target.fa"
+#echo "The final filtered contigs or hifi reads file of autosome and X/Z chromosome -> output/candidate_other.fa"
+
+rm -r tmp_* output/x* output/kmer_rc.txt
diff --git a/SRY_k b/SRY_k
@@ -5,24 +5,25 @@ script_dir=$(dirname $script_abs)
 export PATH=$PATH:$script_dir/script
 export LC_ALL=C
 
-usageHelp="\n#################### Sorting pacbio or nanopore reads based on kmer file #############################\n\nUsage: ./${0##*/} (parameters with * must be supplied!)\n\n-i <string>*       Specific-kmer file (output/SRY_kmer.txt generated by SRY)\n-l <string>*       Long-read files of targeted genomes with comma separated (support for both fa and fq)\n-g <number>*       Approximate genome size of targeted Y/W chromosome. such as 52 for human Y(The unit is Megabase)\n-k <int>           K-mer length (default: 21)\n-p <int>           CPU number (default: 1)\n-h                 Help and exit."
+usageHelp="\nUsage: ./${0##*/} (parameters with * must be supplied!)\n\n-m <string>*       Male (or female for W) short-read files with comma separated in sample and plus separated between samples\n-f <string>*       Female (or male for W) short-read files with comma separated in sample and plus separated between samples\n-i <int>           Minimum coverage of k-mers in each sample (default: 2)\n-a <int>           Maximum coverage of k-mers in each sample (default: unlimit)\n-k <int>           K-mer length (default: 21)\n-p <int>           CPU number (default: 1)\n-h                 Help and exit."
 printHelpAndExit() {
 	echo -e "$usageHelp"
 	exit;
 }
 
-while getopts "i:l:g:k:p:h" opt; do
+while getopts "m:f:l:g:i:a:k:p:h" opt; do
 	case $opt in
-		i) KMER=$OPTARG ;;
-		l) LREAD=$OPTARG ;;
-		g) GSIZE=$OPTARG ;;
+		m) MSREAD=$OPTARG ;;
+		f) FSREAD=$OPTARG ;;
+		i) COV=$OPTARG ;;
+		a) MCOV=$OPTARG ;;
 		k) KLEN=$OPTARG ;;
 		p) CPU=$OPTARG ;;
 		h) printHelpAndExit 0;;
 	esac
 done
 
-if [ -z "$KMER" ] || [ -z "$LREAD" ] || [ -z "$GSIZE" ];then
+if [ -z "$MSREAD" ] || [ -z "$FSREAD" ];then
 	printHelpAndExit
 	exit
 fi
@@ -31,6 +32,14 @@ if [ -z "$KLEN" ];then
 	KLEN=21
 fi
 
+if [ -z "$COV" ];then
+        COV=2
+fi
+
+if [ -z "$MCOV" ];then
+        MCOV=0
+fi
+
 if [ -z "$CPU" ];then
         CPU=1
 fi
@@ -40,14 +49,20 @@ if [ ! -d "output" ];then
 fi
 
 #judge if the softwares needed are installed.
+
+if ! type kmc >/dev/null 2>&1; then
+	echo 'kmc is not installed!'
+	exit
+fi
+
 if ! type samtools >/dev/null 2>&1; then
-        echo 'samtools is not installed!'
-        exit
+	echo 'samtools is not installed!'
+	exit
 fi
 
 if ! type seqtk >/dev/null 2>&1; then
-        echo 'seqtk is not installed!'
-        exit
+	echo 'seqtk is not installed!'
+	exit
 fi
 
 if ! type parallel >/dev/null 2>&1; then
@@ -58,72 +73,16 @@ fi
 #obtain specific-Y kmers
 
 echo "***********************************Preparation*************************************"
-#calculate specific k-mer density
-NUM=`wc -l $KMER |awk 'BEGIN{ORS=""}{print $1}'`
-echo "The number of specific k-mers is $NUM."
-CORRACT_RATE=0.85 #the correct rate of long reads from third sequencing
-DENSITY=`perl -e '$c=sprintf ("%.1f",'${CORRACT_RATE}'**'$KLEN'*'$NUM'*1000/('$GSIZE'*1000000)); print $c;'`
-echo "The kmer length is $KLEN;Average specific k-mer density is $DENSITY per kb of long reads."
-
-echo -e "\n***********************************Process*****************************************"
-
-#pre-process the input file
-create_reverse_complement.pl $KMER > output/kmer_rc.txt
-LREAD=${LREAD//,/ }
-for f in $( ls $LREAD ); do seqtk seq -CA $f; done | tee >(grep '>' - |awk '{gsub(/^>/,"");print $0"\t"NR}'> output/origin2newid.txt) | seqtk rename - > output/changeid.fa
-samtools faidx output/changeid.fa
-
-#split kmer
-PART=`perl -e '$p=(int('$NUM'/100000)+1);print $p;'`
-split -l 200000 output/kmer_rc.txt output/x
-echo "Dividing the kmer file into $PART parts to decrease the memory. xaa,xab,xac..."
-
-#sort kmer
-for k in `ls output/x* | grep -v '\.'`;do echo "sort $k >$k.st";done > temp.sh
-parallel -j $CPU <temp.sh
-echo "Sort kmer has been done -> output/x*.st"
-
-#search long reads contained specific kmers
-for s in `ls output/x* | grep -v '\.'`;do echo "grep -F -b -n -o -f $s output/changeid.fa > $s.kmercount";done >temp.sh
-parallel -j $CPU <temp.sh
-echo "Kmer searching on long reads has been done -> output/x*.kmercount"
-
-#obtain the position of specific kmers on each read
-for c in `ls output/x* | grep -v '\.'`;do echo "get_loci.pl output/changeid.fa.fai $c.kmercount $KLEN > $c.loci";done >temp.sh
-parallel -j $CPU <temp.sh
-echo "Obtaining kmer position on long reads has been done -> output/x*.loci"
-
-#get the extended sequences including specific kmers
-for l in `ls output/x* | grep -v '\.'`;do echo "seqtk subseq output/changeid.fa $l.loci >$l.loci.fa";done >temp.sh
-parallel -j $CPU <temp.sh
-echo "Obtaining extended sequences of kmer on long reads has been done -> output/x*.loci.fa"
-
-#acquire the kmers in sequences
-for f in `ls output/x* | grep -v '\.'`;do echo "kmer_count -l $KLEN -i $f.loci.fa -o $f.loci.kmer";done >temp.sh
-parallel -j $CPU <temp.sh
-echo "Obtaining kmers of extended sequences has been done -> output/x*.loci.kmer"
-
-#filter kmers of last step
-for lk in `ls output/x* | grep -v '\.'`;do echo "filterx -k s $lk.loci.kmer $lk.st |cut -f 1 > $lk.need.kmer";done >temp.sh
-parallel -j $CPU <temp.sh
-echo "Searching specific kmers from last step has been done -> output/x*.need.kmer"
-
-#the relationship between kmer and read id
-for lf in `ls output/x* | grep -v '\.'`;do echo "kmer2id.pl $lf.loci.fa $KLEN | sort -k1,1 -k2,2 -u > $lf.loci.kmer2id";done >temp.sh
-parallel -j $CPU <temp.sh
-echo "Obtaining the relationship bewteen kmer and read has been done -> output/x*.loci.kmer2id"
-
-#get the id
-for li in `ls output/x* | grep -v '\.'`;do echo "filterx -k s $li.loci.kmer2id:dup=Y $li.need.kmer | cut -f 2 > $li.seqid";done >temp.sh
-parallel -j $CPU <temp.sh
-echo "Obtaining the IDs of reads including specific kmer -> output/x*.seqid"
-rm temp.sh
-
-#get reads
-echo -e "\n***************************************Final****************************************"
-cat output/x*.seqid >output/ALL.seqid
-count_density.pl output/ALL.seqid output/changeid.fa.fai > output/ALL.density
-select_reads.pl output/ALL.density $DENSITY output/origin2newid.txt output/changeid.fa
-echo "DONE!"
-echo "The final filtered reads file of Y/W chromosome -> output/candidate_target.fa"
-echo "The final filtered reads file of autosome and X/Z chromosome -> output/candidate_other.fa"
+if [[ $MCOV == 0 ]]
+then
+	echo "Min coverage:$COV;max coverage:unlimit"
+else
+	echo "Min coverage:$COV;max coverage:$MCOV"
+fi
+
+call_kmc.pl $MSREAD $FSREAD $KLEN $COV $MCOV
+parallel -j $CPU <output/kmer.sh
+parallel -j 2 <output/ft.sh
+KMER="output/SRY_kmer.txt"
+filterx -k s -1 'cnt=1' output/Male_kmer.dat.gz:req=Y output/Female_kmer.dat.gz |cut -f 1 > $KMER
+rm -r tmp_* output/M.kmer output/F.kmer