From 8b971038bfe0234d0b3eb1273398030b82b88046 Mon Sep 17 00:00:00 2001
From: unknown <biowangxb@163.com>
Date: Fri, 28 Jul 2023 10:45:25 +0800
Subject: [PATCH] Adding HiFi sorting function

---
 .gitattributes               |   2 -
 README.md                    |   8 +--
 SRY_hifi                     | 136 +++++++++++++++++++++++++++++++++++
 SRY_k                        | 117 ++++++++++--------------------
 SRY => SRY_pb2ont            |  58 ++++++---------
 example_run.sh               |   3 +-
 script/call_kmc.pl           |  72 +++++++++++++++++++
 script/combine_hifi_reads.pl |  29 ++++++++
 script/find_candidate_id.pl  |  26 +++++++
 script/select_reads.pl       |   4 +-
 10 files changed, 329 insertions(+), 126 deletions(-)
 delete mode 100644 .gitattributes
 create mode 100644 SRY_hifi
 rename SRY => SRY_pb2ont (66%)
 create mode 100644 script/call_kmc.pl
 create mode 100644 script/combine_hifi_reads.pl
 create mode 100644 script/find_candidate_id.pl
diff --git a/.gitattributes b/.gitattributes
deleted file mode 100644
index dfe0770..0000000
--- a/.gitattributes
+++ /dev/null
@@ -1,2 +0,0 @@
-# Auto detect text files and perform LF normalization
-* text=auto
diff --git a/README.md b/README.md
index 99266ff..7dd3d75 100644
--- a/README.md
+++ b/README.md
@@ -6,22 +6,20 @@ covers more genomic regions  of Y chromosome.
 SRY identified male specific k-mers (MSK) based on population data. To decrease the impact of population structure and sequencing errors, SRY caculated MSK density of each long read and excludes those with lower marker density.
 
 # Installation
-We need samtools, seqtk and parallel in your environment, you can use conda to install them and execute "export PATH=$PATH:/env/conda/bin/".
+We need kmc, samtools, seqtk and parallel in your environment, you can use conda to install them and execute "export PATH=$PATH:/env/conda/bin/".
 
 # Usage
-     Usage: ./SRY (parameters with * must be supplied!)
+     Usage: ./SRY_k (parameters with * must be supplied!)
      
         -m <string>*     Male (or female for W) short-read files with comma separated in sample and plus separated between samples
         -f <string>*     Female (or male for W) short-read files with comma separated in sample and plus separated between samples
-        -l <string>*     Long-read files of targeted genomes with comma separated, (support for both fa and fq)
-        -g <number>*     Approximate genome size of targeted Y chromosome. (The unit is Mb,do NOT add the suffix "M")
         -i <int>         Minimum coverage of k-mers (default: 2)
         -a <int>         Maximum coverage of k-mers (default: unlimit)
         -k <int>         K-mer length (default: 21)
         -p <int>         CPU number (default: 1)
         -h               Help and exit.
 
-SRY_k is used for sorting long reads based on kmer file, and SRY_contig is used for sorting contigs based on kmer file.
+SRY_k is used for identifying specific k-mers. SRY_pb2ont is used for sorting PacBio CLR or Nanopore long reads based on kmer file, and SRY_hifi is used for sorting hifi (PacBio CCS) reads or contigs based on kmer file. combine_hifi_reads.pl is used to combine the sorted hifi reads when two or more k-mer sets (such as k=21 and k=51) are utilized.
 
 # Paper
 
diff --git a/SRY_hifi b/SRY_hifi
new file mode 100644
index 0000000..3cd0467
--- /dev/null
+++ b/SRY_hifi
@@ -0,0 +1,136 @@
+#!/bin/bash
+
+script_abs=$(readlink -f "$0")
+script_dir=$(dirname $script_abs)
+export PATH=$PATH:$script_dir/script
+export LC_ALL=C
+
+usageHelp="\n#################### Sorting contigs or hifi reads based on kmer file #############################\n\nUsage: ./${0##*/} (parameters with * must be supplied!)\n\n-i <string>*       Specific-kmer file (output/SRY_kmer.txt generated by SRY)\n-l <string>*       Long-read files of targeted genomes with comma separated (support for both fa and fq)\n-g <number>*       Approximate genome size of targeted Y/W chromosome. such as 52 for human Y(The unit is Megabase)\n-k <int>           K-mer length (default: 21)\n-p <int>           CPU number (default: 1)\n-h                 Help and exit."
+printHelpAndExit() {
+	echo -e "$usageHelp"
+	exit;
+}
+
+while getopts "i:l:g:k:p:h" opt; do
+	case $opt in
+		i) KMER=$OPTARG ;;
+		l) LREAD=$OPTARG ;;
+		g) GSIZE=$OPTARG ;;
+		k) KLEN=$OPTARG ;;
+		p) CPU=$OPTARG ;;
+		h) printHelpAndExit 0;;
+	esac
+done
+
+if [ -z "$KMER" ] || [ -z "$LREAD" ] || [ -z "$GSIZE" ];then
+	printHelpAndExit
+	exit
+fi
+
+if [ -z "$KLEN" ];then
+	KLEN=21
+fi
+
+if [ -z "$CPU" ];then
+        CPU=1
+fi
+
+if [ ! -d "output" ];then
+	mkdir output
+fi
+
+#judge if the softwares needed are installed.
+if ! type kmc >/dev/null 2>&1; then
+        echo 'kmc is not installed!'
+        exit
+fi
+if ! type samtools >/dev/null 2>&1; then
+        echo 'samtools is not installed!'
+        exit
+fi
+
+if ! type seqtk >/dev/null 2>&1; then
+        echo 'seqtk is not installed!'
+        exit
+fi
+
+if ! type parallel >/dev/null 2>&1; then
+        echo 'parallel is not installed!'
+        exit
+fi
+
+#obtain specific-Y kmers
+
+echo "***********************************Preparation*************************************"
+#calculate specific k-mer density
+NUM=`wc -l $KMER |awk 'BEGIN{ORS=""}{print $1}'`
+echo "The number of specific k-mers is $NUM."
+DENSITY=`perl -e '$c=sprintf ("%.1f",('$NUM'*1000/4)/('$GSIZE'*1000000)); print $c;'`
+echo "The kmer length is $KLEN;Average specific k-mer density is $DENSITY per kb of contig or hifi reads."
+
+echo -e "\n***********************************Process*****************************************"
+
+#pre-process the input file
+create_reverse_complement.pl $KMER > output/kmer_rc.txt
+LREAD=${LREAD//,/ }
+if [ ! -s output/changeid.fa.fai ];then
+	for f in $( ls $LREAD ); do seqtk seq -CA $f; done | tee >(grep '>' - |awk '{gsub(/^>/,"");print $0"\t"NR}'> output/origin2newid.txt) | seqtk rename - > output/changeid.fa
+	samtools faidx output/changeid.fa
+fi
+
+#split kmer
+PART=`perl -e '$p=(int('$NUM'/100000)+1);print $p;'`
+split -l 200000 output/kmer_rc.txt output/x
+echo "Dividing the kmer file into $PART parts to decrease the memory. xaa,xab,xac..."
+
+#sort kmer
+for k in `ls output/x* | grep -v '\.'`;do echo "sort $k >$k.st";done > temp.sh
+parallel -j $CPU <temp.sh
+echo "Sort kmer has been done -> output/x*.st"
+
+#search long reads contained specific kmers
+for s in `ls output/x* | grep -v '\.'`;do echo "grep -i -F -b -n -o -f $s output/changeid.fa > $s.kmercount";done >temp.sh
+parallel -j $CPU <temp.sh
+echo "Kmer searching on long reads has been done -> output/x*.kmercount"
+
+#obtain the position of specific kmers on each read
+for c in `ls output/x* | grep -v '\.'`;do echo "get_loci.pl output/changeid.fa.fai $c.kmercount $KLEN > $c.loci";done >temp.sh
+parallel -j $CPU <temp.sh
+echo "Obtaining kmer position on long reads has been done -> output/x*.loci"
+
+#get the extended sequences including specific kmers
+for l in `ls output/x* | grep -v '\.'`;do echo "seqtk subseq output/changeid.fa $l.loci >$l.loci.fa";done >temp.sh
+parallel -j $CPU <temp.sh
+echo "Obtaining extended sequences of kmer on long reads has been done -> output/x*.loci.fa"
+
+#acquire the kmers in sequences
+for f in `ls output/x* | grep -v '\.' | xargs -i basename {}`;do echo "mkdir tmp_$f && kmc -t1 -m3 -hp -k$KLEN -fa output/$f.loci.fa tmp_$f tmp_$f && kmc_dump tmp_$f tmp_${f}.dump && sort -k1,1 tmp_${f}.dump >output/${f}.loci.kmer";done >temp.sh
+parallel -j $CPU <temp.sh
+echo "Obtaining kmers of extended sequences has been done -> output/x*.loci.kmer"
+
+#filter kmers of last step
+for lk in `ls output/x* | grep -v '\.'`;do echo "filterx -k s $lk.loci.kmer $lk.st |cut -f 1 > $lk.need.kmer";done >temp.sh
+parallel -j $CPU <temp.sh
+echo "Searching specific kmers from last step has been done -> output/x*.need.kmer"
+
+#the relationship between kmer and read id
+for lf in `ls output/x* | grep -v '\.'`;do echo "kmer2id.pl $lf.loci.fa $KLEN > $lf.loci.kmer2id";done >temp.sh
+parallel -j $CPU <temp.sh
+echo "Obtaining the relationship bewteen kmer and read has been done -> output/x*.loci.kmer2id"
+
+#get the id
+for li in `ls output/x* | grep -v '\.'`;do echo "find_candidate_id.pl $li.need.kmer $li.loci.kmer2id > $li.seqid";done >temp.sh
+parallel -j $CPU <temp.sh
+echo "Obtaining the IDs of reads including specific kmer -> output/x*.seqid"
+rm temp.sh
+
+#get reads
+echo -e "\n***************************************Final****************************************"
+cat output/x*.seqid >output/ALL.seqid
+count_density.pl output/ALL.seqid output/changeid.fa.fai > output/ALL.density
+select_reads.pl output/ALL.density $DENSITY output/origin2newid.txt output/changeid.fa
+echo "DONE!"
+echo "The final filtered contigs or hifi reads file of Y/W chromosome -> output/candidate_target.fa"
+#echo "The final filtered contigs or hifi reads file of autosome and X/Z chromosome -> output/candidate_other.fa"
+
+rm -r tmp_* output/x* output/kmer_rc.txt
diff --git a/SRY_k b/SRY_k
index 6a2f5f9..889f407 100644
--- a/SRY_k
+++ b/SRY_k
@@ -5,24 +5,25 @@ script_dir=$(dirname $script_abs)
 export PATH=$PATH:$script_dir/script
 export LC_ALL=C
 
-usageHelp="\n#################### Sorting pacbio or nanopore reads based on kmer file #############################\n\nUsage: ./${0##*/} (parameters with * must be supplied!)\n\n-i <string>*       Specific-kmer file (output/SRY_kmer.txt generated by SRY)\n-l <string>*       Long-read files of targeted genomes with comma separated (support for both fa and fq)\n-g <number>*       Approximate genome size of targeted Y/W chromosome. such as 52 for human Y(The unit is Megabase)\n-k <int>           K-mer length (default: 21)\n-p <int>           CPU number (default: 1)\n-h                 Help and exit."
+usageHelp="\nUsage: ./${0##*/} (parameters with * must be supplied!)\n\n-m <string>*       Male (or female for W) short-read files with comma separated in sample and plus separated between samples\n-f <string>*       Female (or male for W) short-read files with comma separated in sample and plus separated between samples\n-i <int>           Minimum coverage of k-mers in each sample (default: 2)\n-a <int>           Maximum coverage of k-mers in each sample (default: unlimit)\n-k <int>           K-mer length (default: 21)\n-p <int>           CPU number (default: 1)\n-h                 Help and exit."
 printHelpAndExit() {
 	echo -e "$usageHelp"
 	exit;
 }
 
-while getopts "i:l:g:k:p:h" opt; do
+while getopts "m:f:l:g:i:a:k:p:h" opt; do
 	case $opt in
-		i) KMER=$OPTARG ;;
-		l) LREAD=$OPTARG ;;
-		g) GSIZE=$OPTARG ;;
+		m) MSREAD=$OPTARG ;;
+		f) FSREAD=$OPTARG ;;
+		i) COV=$OPTARG ;;
+		a) MCOV=$OPTARG ;;
 		k) KLEN=$OPTARG ;;
 		p) CPU=$OPTARG ;;
 		h) printHelpAndExit 0;;
 	esac
 done
 
-if [ -z "$KMER" ] || [ -z "$LREAD" ] || [ -z "$GSIZE" ];then
+if [ -z "$MSREAD" ] || [ -z "$FSREAD" ];then
 	printHelpAndExit
 	exit
 fi
@@ -31,6 +32,14 @@ if [ -z "$KLEN" ];then
 	KLEN=21
 fi
 
+if [ -z "$COV" ];then
+        COV=2
+fi
+
+if [ -z "$MCOV" ];then
+        MCOV=0
+fi
+
 if [ -z "$CPU" ];then
         CPU=1
 fi
@@ -40,14 +49,20 @@ if [ ! -d "output" ];then
 fi
 
 #judge if the softwares needed are installed.
+
+if ! type kmc >/dev/null 2>&1; then
+	echo 'kmc is not installed!'
+	exit
+fi
+
 if ! type samtools >/dev/null 2>&1; then
-        echo 'samtools is not installed!'
-        exit
+	echo 'samtools is not installed!'
+	exit
 fi
 
 if ! type seqtk >/dev/null 2>&1; then
-        echo 'seqtk is not installed!'
-        exit
+	echo 'seqtk is not installed!'
+	exit
 fi
 
 if ! type parallel >/dev/null 2>&1; then
@@ -58,72 +73,16 @@ fi
 #obtain specific-Y kmers
 
 echo "***********************************Preparation*************************************"
-#calculate specific k-mer density
-NUM=`wc -l $KMER |awk 'BEGIN{ORS=""}{print $1}'`
-echo "The number of specific k-mers is $NUM."
-CORRACT_RATE=0.85 #the correct rate of long reads from third sequencing
-DENSITY=`perl -e '$c=sprintf ("%.1f",'${CORRACT_RATE}'**'$KLEN'*'$NUM'*1000/('$GSIZE'*1000000)); print $c;'`
-echo "The kmer length is $KLEN;Average specific k-mer density is $DENSITY per kb of long reads."
-
-echo -e "\n***********************************Process*****************************************"
-
-#pre-process the input file
-create_reverse_complement.pl $KMER > output/kmer_rc.txt
-LREAD=${LREAD//,/ }
-for f in $( ls $LREAD ); do seqtk seq -CA $f; done | tee >(grep '>' - |awk '{gsub(/^>/,"");print $0"\t"NR}'> output/origin2newid.txt) | seqtk rename - > output/changeid.fa
-samtools faidx output/changeid.fa
-
-#split kmer
-PART=`perl -e '$p=(int('$NUM'/100000)+1);print $p;'`
-split -l 200000 output/kmer_rc.txt output/x
-echo "Dividing the kmer file into $PART parts to decrease the memory. xaa,xab,xac..."
-
-#sort kmer
-for k in `ls output/x* | grep -v '\.'`;do echo "sort $k >$k.st";done > temp.sh
-parallel -j $CPU <temp.sh
-echo "Sort kmer has been done -> output/x*.st"
-
-#search long reads contained specific kmers
-for s in `ls output/x* | grep -v '\.'`;do echo "grep -F -b -n -o -f $s output/changeid.fa > $s.kmercount";done >temp.sh
-parallel -j $CPU <temp.sh
-echo "Kmer searching on long reads has been done -> output/x*.kmercount"
-
-#obtain the position of specific kmers on each read
-for c in `ls output/x* | grep -v '\.'`;do echo "get_loci.pl output/changeid.fa.fai $c.kmercount $KLEN > $c.loci";done >temp.sh
-parallel -j $CPU <temp.sh
-echo "Obtaining kmer position on long reads has been done -> output/x*.loci"
-
-#get the extended sequences including specific kmers
-for l in `ls output/x* | grep -v '\.'`;do echo "seqtk subseq output/changeid.fa $l.loci >$l.loci.fa";done >temp.sh
-parallel -j $CPU <temp.sh
-echo "Obtaining extended sequences of kmer on long reads has been done -> output/x*.loci.fa"
-
-#acquire the kmers in sequences
-for f in `ls output/x* | grep -v '\.'`;do echo "kmer_count -l $KLEN -i $f.loci.fa -o $f.loci.kmer";done >temp.sh
-parallel -j $CPU <temp.sh
-echo "Obtaining kmers of extended sequences has been done -> output/x*.loci.kmer"
-
-#filter kmers of last step
-for lk in `ls output/x* | grep -v '\.'`;do echo "filterx -k s $lk.loci.kmer $lk.st |cut -f 1 > $lk.need.kmer";done >temp.sh
-parallel -j $CPU <temp.sh
-echo "Searching specific kmers from last step has been done -> output/x*.need.kmer"
-
-#the relationship between kmer and read id
-for lf in `ls output/x* | grep -v '\.'`;do echo "kmer2id.pl $lf.loci.fa $KLEN | sort -k1,1 -k2,2 -u > $lf.loci.kmer2id";done >temp.sh
-parallel -j $CPU <temp.sh
-echo "Obtaining the relationship bewteen kmer and read has been done -> output/x*.loci.kmer2id"
-
-#get the id
-for li in `ls output/x* | grep -v '\.'`;do echo "filterx -k s $li.loci.kmer2id:dup=Y $li.need.kmer | cut -f 2 > $li.seqid";done >temp.sh
-parallel -j $CPU <temp.sh
-echo "Obtaining the IDs of reads including specific kmer -> output/x*.seqid"
-rm temp.sh
-
-#get reads
-echo -e "\n***************************************Final****************************************"
-cat output/x*.seqid >output/ALL.seqid
-count_density.pl output/ALL.seqid output/changeid.fa.fai > output/ALL.density
-select_reads.pl output/ALL.density $DENSITY output/origin2newid.txt output/changeid.fa
-echo "DONE!"
-echo "The final filtered reads file of Y/W chromosome -> output/candidate_target.fa"
-echo "The final filtered reads file of autosome and X/Z chromosome -> output/candidate_other.fa"
+if [[ $MCOV == 0 ]]
+then
+	echo "Min coverage:$COV;max coverage:unlimit"
+else
+	echo "Min coverage:$COV;max coverage:$MCOV"
+fi
+
+call_kmc.pl $MSREAD $FSREAD $KLEN $COV $MCOV
+parallel -j $CPU <output/kmer.sh
+parallel -j 2 <output/ft.sh
+KMER="output/SRY_kmer.txt"
+filterx -k s -1 'cnt=1' output/Male_kmer.dat.gz:req=Y output/Female_kmer.dat.gz |cut -f 1 > $KMER
+rm -r tmp_* output/M.kmer output/F.kmer
diff --git a/SRY b/SRY_pb2ont
similarity index 66%
rename from SRY
rename to SRY_pb2ont
index 8845e61..0ac41a6 100644
--- a/SRY
+++ b/SRY_pb2ont
@@ -5,27 +5,24 @@ script_dir=$(dirname $script_abs)
 export PATH=$PATH:$script_dir/script
 export LC_ALL=C
 
-usageHelp="\nUsage: ./${0##*/} (parameters with * must be supplied!)\n\n-m <string>*       Male (or female for W) short-read files with comma separated in sample and plus separated between samples\n-f <string>*       Female (or male for W) short-read files with comma separated in sample and plus separated between samples\n-l <string>*       Long-read files of targeted genomes with comma separated (support for both fa and fq)\n-g <number>*       Approximate genome size of targeted Y/W chromosome. such as 52 for human Y(The unit is Megabase)\n-i <int>           Minimum coverage of k-mers in each sample (default: 2)\n-a <int>           Maximum coverage of k-mers in each sample (default: unlimit)\n-k <int>           K-mer length (default: 21)\n-p <int>           CPU number (default: 1)\n-h                 Help and exit."
+usageHelp="\n#################### Sorting pacbio or nanopore reads based on kmer file #############################\n\nUsage: ./${0##*/} (parameters with * must be supplied!)\n\n-i <string>*       Specific-kmer file (output/SRY_kmer.txt generated by SRY)\n-l <string>*       Long-read files of targeted genomes with comma separated (support for both fa and fq)\n-g <number>*       Approximate genome size of targeted Y/W chromosome. such as 52 for human Y(The unit is Megabase)\n-k <int>           K-mer length (default: 21)\n-p <int>           CPU number (default: 1)\n-h                 Help and exit."
 printHelpAndExit() {
 	echo -e "$usageHelp"
 	exit;
 }
 
-while getopts "m:f:l:g:i:a:k:p:h" opt; do
+while getopts "i:l:g:k:p:h" opt; do
 	case $opt in
-		m) MSREAD=$OPTARG ;;
-		f) FSREAD=$OPTARG ;;
+		i) KMER=$OPTARG ;;
 		l) LREAD=$OPTARG ;;
 		g) GSIZE=$OPTARG ;;
-		i) COV=$OPTARG ;;
-		a) MCOV=$OPTARG ;;
 		k) KLEN=$OPTARG ;;
 		p) CPU=$OPTARG ;;
 		h) printHelpAndExit 0;;
 	esac
 done
 
-if [ -z "$MSREAD" ] || [ -z "$FSREAD" ] || [ -z "$LREAD" ] || [ -z "$GSIZE" ];then
+if [ -z "$KMER" ] || [ -z "$LREAD" ] || [ -z "$GSIZE" ];then
 	printHelpAndExit
 	exit
 fi
@@ -34,33 +31,28 @@ if [ -z "$KLEN" ];then
 	KLEN=21
 fi
 
-if [ -z "$COV" ];then
-        COV=2
-fi
-
-if [ -z "$MCOV" ];then
-        MCOV=0
-fi
-
 if [ -z "$CPU" ];then
         CPU=1
 fi
 
-
 if [ ! -d "output" ];then
 	mkdir output
 fi
 
 #judge if the softwares needed are installed.
+if ! type kmc >/dev/null 2>&1; then
+        echo 'kmc is not installed!'
+        exit
+fi
 
 if ! type samtools >/dev/null 2>&1; then
-	echo 'samtools is not installed!'
-	exit
+        echo 'samtools is not installed!'
+        exit
 fi
 
 if ! type seqtk >/dev/null 2>&1; then
-	echo 'seqtk is not installed!'
-	exit
+        echo 'seqtk is not installed!'
+        exit
 fi
 
 if ! type parallel >/dev/null 2>&1; then
@@ -71,19 +63,6 @@ fi
 #obtain specific-Y kmers
 
 echo "***********************************Preparation*************************************"
-if [[ $MCOV == 0 ]]
-then
-	echo "Min coverage:$COV;max coverage:unlimit"
-else
-	echo "Min coverage:$COV;max coverage:$MCOV"
-fi
-
-call_kmer_count.pl $MSREAD $FSREAD $KLEN $COV $MCOV
-parallel -j $CPU <output/kmer.sh
-parallel -j 2 <output/ft.sh
-KMER="output/SRY_kmer.txt"
-filterx -k s -1 'cnt=1' output/Male_kmer.dat.gz:req=Y output/Female_kmer.dat.gz |cut -f 1 > $KMER
-
 #calculate specific k-mer density
 NUM=`wc -l $KMER |awk 'BEGIN{ORS=""}{print $1}'`
 echo "The number of specific k-mers is $NUM."
@@ -96,8 +75,11 @@ echo -e "\n***********************************Process***************************
 #pre-process the input file
 create_reverse_complement.pl $KMER > output/kmer_rc.txt
 LREAD=${LREAD//,/ }
-for f in $( ls $LREAD ); do seqtk seq -CA $f; done | tee >(grep '>' - |awk '{gsub(/^>/,"");print $0"\t"NR}'> output/origin2newid.txt) | seqtk rename - > output/changeid.fa
-samtools faidx output/changeid.fa
+
+if [ ! -s output/changeid.fa.fai ];then
+	for f in $( ls $LREAD ); do seqtk seq -CA $f; done | tee >(grep '>' - |awk '{gsub(/^>/,"");print $0"\t"NR}'> output/origin2newid.txt) | seqtk rename - > output/changeid.fa
+	samtools faidx output/changeid.fa
+fi
 
 #split kmer
 PART=`perl -e '$p=(int('$NUM'/100000)+1);print $p;'`
@@ -110,7 +92,7 @@ parallel -j $CPU <temp.sh
 echo "Sort kmer has been done -> output/x*.st"
 
 #search long reads contained specific kmers
-for s in `ls output/x* | grep -v '\.'`;do echo "grep -F -b -n -o -f $s output/changeid.fa > $s.kmercount";done >temp.sh
+for s in `ls output/x* | grep -v '\.'`;do echo "grep -i -F -b -n -o -f $s output/changeid.fa > $s.kmercount";done >temp.sh
 parallel -j $CPU <temp.sh
 echo "Kmer searching on long reads has been done -> output/x*.kmercount"
 
@@ -125,7 +107,7 @@ parallel -j $CPU <temp.sh
 echo "Obtaining extended sequences of kmer on long reads has been done -> output/x*.loci.fa"
 
 #acquire the kmers in sequences
-for f in `ls output/x* | grep -v '\.'`;do echo "kmer_count -l $KLEN -i $f.loci.fa -o $f.loci.kmer";done >temp.sh
+for f in `ls output/x* | grep -v '\.' | xargs -i basename {}`;do echo "mkdir tmp_$f && kmc -t1 -m3 -hp -k$KLEN -fa output/$f.loci.fa tmp_$f tmp_$f && kmc_dump tmp_$f tmp_${f}.dump && sort -k1,1 tmp_${f}.dump >output/${f}.loci.kmer";done >temp.sh
 parallel -j $CPU <temp.sh
 echo "Obtaining kmers of extended sequences has been done -> output/x*.loci.kmer"
 
@@ -153,3 +135,5 @@ select_reads.pl output/ALL.density $DENSITY output/origin2newid.txt output/chang
 echo "DONE!"
 echo "The final filtered reads file of Y/W chromosome -> output/candidate_target.fa"
 echo "The final filtered reads file of autosome and X/Z chromosome -> output/candidate_other.fa"
+
+rm -r tmp_* output/x* output/kmer_rc.txt
diff --git a/example_run.sh b/example_run.sh
index 9332db2..4f8586c 100644
--- a/example_run.sh
+++ b/example_run.sh
@@ -1 +1,2 @@
-./SRY -m data/HG01109_R1.fq.gz,data/HG01109_R2.fq.gz+data/HG01107_R1.fq.gz,data/HG01107_R2.fq.gz -f data/HG01108_R1.fq.gz,data/HG01108_R2.fq.gz -l data/HG01109_longread_1.fasta.gz,data/HG01109_longread_2.fasta.gz -g 1 -p 5
+./SRY_k -m data/HG01109_R1.fq.gz,data/HG01109_R2.fq.gz+data/HG01107_R1.fq.gz,data/HG01107_R2.fq.gz -f data/HG01108_R1.fq.gz,data/HG01108_R2.fq.gz 
+./SRY_pb2ont -i output/SRY_kmer.txt -l data/HG01109_longread_1.fasta.gz,data/HG01109_longread_2.fasta.gz -g 1 -p 5
diff --git a/script/call_kmc.pl b/script/call_kmc.pl
new file mode 100644
index 0000000..9e240ce
--- /dev/null
+++ b/script/call_kmc.pl
@@ -0,0 +1,72 @@
+#!/usr/bin/perl -w
+use strict;
+
+die "perl $0 male_fq_files female_fq_files kmer_length min_depth max_depth\n" if @ARGV!=5;
+
+my $kmer_len = $ARGV[2];
+my $min_depth = $ARGV[3];
+my $max_depth = $ARGV[4];
+
+my $cmd="output/kmer.sh";
+open OUT,">$cmd";
+
+my ($filterx_cmdm,$filterx_cmdf);
+
+#cmd of male population
+my @msamples = split(/\+/,$ARGV[0]);
+my $mnum = $#msamples+1;
+my $mlimit = int($mnum*2/3+0.5);
+
+my $seq = '';		
+for (my $j=0;$j<@msamples;$j++){
+	my @mfiles = split(/,/,$msamples[$j]);
+	my $m_tmp = "output/SRY_tmp_M$j.file";
+	open MOUT,">$m_tmp";
+	for (my $i=0;$i<@mfiles;$i++){
+		print MOUT "$mfiles[$i]\n";
+	}
+
+
+	if ($max_depth == 0){
+		$seq .= " output/M$j.kmer";
+	}else{
+		$seq .= " <(awk \'\$2<=$max_depth\' output/M$j.kmer)";
+	}
+
+	print OUT "mkdir tmp_M$j && kmc -t1 -m3 -hp -k$kmer_len -fq \@$m_tmp tmp_M$j tmp_M$j && kmc_tools transform tmp_M$j dump -s output/M$j.kmer\n";
+	close MOUT;
+}
+
+$filterx_cmdm="filterx -1 \'cnt>=$mlimit\' -k s $seq |awk \'{for (i=1;i<=NF;i+=2){if (\$i~/[A-Z]/){print \$i;break}}}\' | gzip > output/Male_kmer.dat.gz\n";
+
+#cmd of female population
+my @fsamples = split(/\+/,$ARGV[1]);
+
+$seq = '';
+for (my $j=0;$j<@fsamples;$j++){
+	my @ffiles = split(/,/,$fsamples[$j]);
+	my $f_tmp = "output/SRY_tmp_F$j.file";
+	open FOUT,">$f_tmp";
+	for (my $i=0;$i<@ffiles;$i++){
+		print FOUT "$ffiles[$i]\n";
+	}
+
+	if ($max_depth == 0){
+		$seq .= " output/F$j.kmer";
+	}else{
+		$seq .= " <(awk \'\$2<=$max_depth\' output/F$j.kmer)";
+	}
+	print OUT "mkdir tmp_F$j && kmc -t1 -m3 -hp -k$kmer_len -fq \@$f_tmp tmp_F$j tmp_F$j && kmc_tools transform tmp_F$j dump -s output/F$j.kmer\n";
+	close FOUT;
+}
+close OUT;
+
+$filterx_cmdf="filterx -k s -1 \'cnt>=1\' $seq |awk \'{for (i=1;i<=NF;i+=2){if (\$i~/[A-Z]/){print \$i;break}}}\' |gzip > output/Female_kmer.dat.gz\n";
+
+my $ftcmd="output/ft.sh";
+open OUT1,">$ftcmd";
+if ($filterx_cmdm || $filterx_cmdf){
+	print OUT1 "$filterx_cmdm" if ($filterx_cmdm);
+	print OUT1 "$filterx_cmdf" if ($filterx_cmdf);
+}
+close OUT1;
diff --git a/script/combine_hifi_reads.pl b/script/combine_hifi_reads.pl
new file mode 100644
index 0000000..3e1567c
--- /dev/null
+++ b/script/combine_hifi_reads.pl
@@ -0,0 +1,29 @@
+#!/usr/bin/perl -w
+use strict;
+
+die "perl $0 file1 ...\n" if @ARGV==0;
+
+my %hash;
+my $flag = 0;
+for (my $i=0;$i<@ARGV;$i++){
+	my $in = $ARGV[$i];
+	open IN,$in;
+	while (<IN>){
+		chomp;
+		if (/>(\S+)/){
+			my $id = $1;
+			if (!defined $hash{$id}){
+				$hash{$id} = "";
+				print "$_\n";
+				$flag = 1;
+			}else{
+				$flag = 0;
+			}
+		}else{
+			if ($flag == 1){
+				print "$_\n";
+			}
+		}	
+	}
+	close IN;
+}
diff --git a/script/find_candidate_id.pl b/script/find_candidate_id.pl
new file mode 100644
index 0000000..76af9e0
--- /dev/null
+++ b/script/find_candidate_id.pl
@@ -0,0 +1,26 @@
+#!/usr/bin/perl -w
+use strict;
+
+die "perl $0 need.kmer kmer2id\n" if @ARGV!=2;
+
+my $in = $ARGV[0];
+open IN,$in;
+
+my %hash;
+while (<IN>){
+	chomp;
+	$hash{$_}='';
+}
+close IN;
+
+my $in1 = $ARGV[1];
+open IN1,$in1;
+
+my @info;
+while (<IN1>){
+	chomp;
+	@info = split;
+	if (defined $hash{$info[0]}){
+		print "$info[1]\n";
+	}
+}
diff --git a/script/select_reads.pl b/script/select_reads.pl
index a7a35b3..b5f8f72 100644
--- a/script/select_reads.pl
+++ b/script/select_reads.pl
@@ -54,13 +54,13 @@
 			print OUT ">$old2new{$id}\n";
 		}else{
 			$flag = 0;
-			print OUT1 ">$old2new{$id}\n";
+			#print OUT1 ">$old2new{$id}\n";
 		}
 	}else{	
 		if ($flag == 1){
 			print OUT "$_\n";
 		}else{
-			print OUT1 "$_\n";
+			#print OUT1 "$_\n";
 		}
 	}
 }