From 8b971038bfe0234d0b3eb1273398030b82b88046 Mon Sep 17 00:00:00 2001 From: unknown Date: Fri, 28 Jul 2023 10:45:25 +0800 Subject: [PATCH] Adding HiFi sorting function --- .gitattributes | 2 - README.md | 8 +-- SRY_hifi | 136 +++++++++++++++++++++++++++++++++++ SRY_k | 117 ++++++++++-------------------- SRY => SRY_pb2ont | 58 ++++++--------- example_run.sh | 3 +- script/call_kmc.pl | 72 +++++++++++++++++++ script/combine_hifi_reads.pl | 29 ++++++++ script/find_candidate_id.pl | 26 +++++++ script/select_reads.pl | 4 +- 10 files changed, 329 insertions(+), 126 deletions(-) delete mode 100644 .gitattributes create mode 100644 SRY_hifi rename SRY => SRY_pb2ont (66%) create mode 100644 script/call_kmc.pl create mode 100644 script/combine_hifi_reads.pl create mode 100644 script/find_candidate_id.pl diff --git a/.gitattributes b/.gitattributes deleted file mode 100644 index dfe0770..0000000 --- a/.gitattributes +++ /dev/null @@ -1,2 +0,0 @@ -# Auto detect text files and perform LF normalization -* text=auto diff --git a/README.md b/README.md index 99266ff..7dd3d75 100644 --- a/README.md +++ b/README.md @@ -6,22 +6,20 @@ covers more genomic regions of Y chromosome. SRY identified male specific k-mers (MSK) based on population data. To decrease the impact of population structure and sequencing errors, SRY caculated MSK density of each long read and excludes those with lower marker density. # Installation -We need samtools, seqtk and parallel in your environment, you can use conda to install them and execute "export PATH=$PATH:/env/conda/bin/". +We need kmc, samtools, seqtk and parallel in your environment, you can use conda to install them and execute "export PATH=$PATH:/env/conda/bin/". # Usage - Usage: ./SRY (parameters with * must be supplied!) + Usage: ./SRY_k (parameters with * must be supplied!) -m * Male (or female for W) short-read files with comma separated in sample and plus separated between samples -f * Female (or male for W) short-read files with comma separated in sample and plus separated between samples - -l * Long-read files of targeted genomes with comma separated, (support for both fa and fq) - -g * Approximate genome size of targeted Y chromosome. (The unit is Mb,do NOT add the suffix "M") -i Minimum coverage of k-mers (default: 2) -a Maximum coverage of k-mers (default: unlimit) -k K-mer length (default: 21) -p CPU number (default: 1) -h Help and exit. -SRY_k is used for sorting long reads based on kmer file, and SRY_contig is used for sorting contigs based on kmer file. +SRY_k is used for identifying specific k-mers. SRY_pb2ont is used for sorting PacBio CLR or Nanopore long reads based on kmer file, and SRY_hifi is used for sorting hifi (PacBio CCS) reads or contigs based on kmer file. combine_hifi_reads.pl is used to combine the sorted hifi reads when two or more k-mer sets (such as k=21 and k=51) are utilized. # Paper diff --git a/SRY_hifi b/SRY_hifi new file mode 100644 index 0000000..3cd0467 --- /dev/null +++ b/SRY_hifi @@ -0,0 +1,136 @@ +#!/bin/bash + +script_abs=$(readlink -f "$0") +script_dir=$(dirname $script_abs) +export PATH=$PATH:$script_dir/script +export LC_ALL=C + +usageHelp="\n#################### Sorting contigs or hifi reads based on kmer file #############################\n\nUsage: ./${0##*/} (parameters with * must be supplied!)\n\n-i * Specific-kmer file (output/SRY_kmer.txt generated by SRY)\n-l * Long-read files of targeted genomes with comma separated (support for both fa and fq)\n-g * Approximate genome size of targeted Y/W chromosome. such as 52 for human Y(The unit is Megabase)\n-k K-mer length (default: 21)\n-p CPU number (default: 1)\n-h Help and exit." +printHelpAndExit() { + echo -e "$usageHelp" + exit; +} + +while getopts "i:l:g:k:p:h" opt; do + case $opt in + i) KMER=$OPTARG ;; + l) LREAD=$OPTARG ;; + g) GSIZE=$OPTARG ;; + k) KLEN=$OPTARG ;; + p) CPU=$OPTARG ;; + h) printHelpAndExit 0;; + esac +done + +if [ -z "$KMER" ] || [ -z "$LREAD" ] || [ -z "$GSIZE" ];then + printHelpAndExit + exit +fi + +if [ -z "$KLEN" ];then + KLEN=21 +fi + +if [ -z "$CPU" ];then + CPU=1 +fi + +if [ ! -d "output" ];then + mkdir output +fi + +#judge if the softwares needed are installed. +if ! type kmc >/dev/null 2>&1; then + echo 'kmc is not installed!' + exit +fi +if ! type samtools >/dev/null 2>&1; then + echo 'samtools is not installed!' + exit +fi + +if ! type seqtk >/dev/null 2>&1; then + echo 'seqtk is not installed!' + exit +fi + +if ! type parallel >/dev/null 2>&1; then + echo 'parallel is not installed!' + exit +fi + +#obtain specific-Y kmers + +echo "***********************************Preparation*************************************" +#calculate specific k-mer density +NUM=`wc -l $KMER |awk 'BEGIN{ORS=""}{print $1}'` +echo "The number of specific k-mers is $NUM." +DENSITY=`perl -e '$c=sprintf ("%.1f",('$NUM'*1000/4)/('$GSIZE'*1000000)); print $c;'` +echo "The kmer length is $KLEN;Average specific k-mer density is $DENSITY per kb of contig or hifi reads." + +echo -e "\n***********************************Process*****************************************" + +#pre-process the input file +create_reverse_complement.pl $KMER > output/kmer_rc.txt +LREAD=${LREAD//,/ } +if [ ! -s output/changeid.fa.fai ];then + for f in $( ls $LREAD ); do seqtk seq -CA $f; done | tee >(grep '>' - |awk '{gsub(/^>/,"");print $0"\t"NR}'> output/origin2newid.txt) | seqtk rename - > output/changeid.fa + samtools faidx output/changeid.fa +fi + +#split kmer +PART=`perl -e '$p=(int('$NUM'/100000)+1);print $p;'` +split -l 200000 output/kmer_rc.txt output/x +echo "Dividing the kmer file into $PART parts to decrease the memory. xaa,xab,xac..." + +#sort kmer +for k in `ls output/x* | grep -v '\.'`;do echo "sort $k >$k.st";done > temp.sh +parallel -j $CPU output/x*.st" + +#search long reads contained specific kmers +for s in `ls output/x* | grep -v '\.'`;do echo "grep -i -F -b -n -o -f $s output/changeid.fa > $s.kmercount";done >temp.sh +parallel -j $CPU output/x*.kmercount" + +#obtain the position of specific kmers on each read +for c in `ls output/x* | grep -v '\.'`;do echo "get_loci.pl output/changeid.fa.fai $c.kmercount $KLEN > $c.loci";done >temp.sh +parallel -j $CPU output/x*.loci" + +#get the extended sequences including specific kmers +for l in `ls output/x* | grep -v '\.'`;do echo "seqtk subseq output/changeid.fa $l.loci >$l.loci.fa";done >temp.sh +parallel -j $CPU output/x*.loci.fa" + +#acquire the kmers in sequences +for f in `ls output/x* | grep -v '\.' | xargs -i basename {}`;do echo "mkdir tmp_$f && kmc -t1 -m3 -hp -k$KLEN -fa output/$f.loci.fa tmp_$f tmp_$f && kmc_dump tmp_$f tmp_${f}.dump && sort -k1,1 tmp_${f}.dump >output/${f}.loci.kmer";done >temp.sh +parallel -j $CPU output/x*.loci.kmer" + +#filter kmers of last step +for lk in `ls output/x* | grep -v '\.'`;do echo "filterx -k s $lk.loci.kmer $lk.st |cut -f 1 > $lk.need.kmer";done >temp.sh +parallel -j $CPU output/x*.need.kmer" + +#the relationship between kmer and read id +for lf in `ls output/x* | grep -v '\.'`;do echo "kmer2id.pl $lf.loci.fa $KLEN > $lf.loci.kmer2id";done >temp.sh +parallel -j $CPU output/x*.loci.kmer2id" + +#get the id +for li in `ls output/x* | grep -v '\.'`;do echo "find_candidate_id.pl $li.need.kmer $li.loci.kmer2id > $li.seqid";done >temp.sh +parallel -j $CPU output/x*.seqid" +rm temp.sh + +#get reads +echo -e "\n***************************************Final****************************************" +cat output/x*.seqid >output/ALL.seqid +count_density.pl output/ALL.seqid output/changeid.fa.fai > output/ALL.density +select_reads.pl output/ALL.density $DENSITY output/origin2newid.txt output/changeid.fa +echo "DONE!" +echo "The final filtered contigs or hifi reads file of Y/W chromosome -> output/candidate_target.fa" +#echo "The final filtered contigs or hifi reads file of autosome and X/Z chromosome -> output/candidate_other.fa" + +rm -r tmp_* output/x* output/kmer_rc.txt diff --git a/SRY_k b/SRY_k index 6a2f5f9..889f407 100644 --- a/SRY_k +++ b/SRY_k @@ -5,24 +5,25 @@ script_dir=$(dirname $script_abs) export PATH=$PATH:$script_dir/script export LC_ALL=C -usageHelp="\n#################### Sorting pacbio or nanopore reads based on kmer file #############################\n\nUsage: ./${0##*/} (parameters with * must be supplied!)\n\n-i * Specific-kmer file (output/SRY_kmer.txt generated by SRY)\n-l * Long-read files of targeted genomes with comma separated (support for both fa and fq)\n-g * Approximate genome size of targeted Y/W chromosome. such as 52 for human Y(The unit is Megabase)\n-k K-mer length (default: 21)\n-p CPU number (default: 1)\n-h Help and exit." +usageHelp="\nUsage: ./${0##*/} (parameters with * must be supplied!)\n\n-m * Male (or female for W) short-read files with comma separated in sample and plus separated between samples\n-f * Female (or male for W) short-read files with comma separated in sample and plus separated between samples\n-i Minimum coverage of k-mers in each sample (default: 2)\n-a Maximum coverage of k-mers in each sample (default: unlimit)\n-k K-mer length (default: 21)\n-p CPU number (default: 1)\n-h Help and exit." printHelpAndExit() { echo -e "$usageHelp" exit; } -while getopts "i:l:g:k:p:h" opt; do +while getopts "m:f:l:g:i:a:k:p:h" opt; do case $opt in - i) KMER=$OPTARG ;; - l) LREAD=$OPTARG ;; - g) GSIZE=$OPTARG ;; + m) MSREAD=$OPTARG ;; + f) FSREAD=$OPTARG ;; + i) COV=$OPTARG ;; + a) MCOV=$OPTARG ;; k) KLEN=$OPTARG ;; p) CPU=$OPTARG ;; h) printHelpAndExit 0;; esac done -if [ -z "$KMER" ] || [ -z "$LREAD" ] || [ -z "$GSIZE" ];then +if [ -z "$MSREAD" ] || [ -z "$FSREAD" ];then printHelpAndExit exit fi @@ -31,6 +32,14 @@ if [ -z "$KLEN" ];then KLEN=21 fi +if [ -z "$COV" ];then + COV=2 +fi + +if [ -z "$MCOV" ];then + MCOV=0 +fi + if [ -z "$CPU" ];then CPU=1 fi @@ -40,14 +49,20 @@ if [ ! -d "output" ];then fi #judge if the softwares needed are installed. + +if ! type kmc >/dev/null 2>&1; then + echo 'kmc is not installed!' + exit +fi + if ! type samtools >/dev/null 2>&1; then - echo 'samtools is not installed!' - exit + echo 'samtools is not installed!' + exit fi if ! type seqtk >/dev/null 2>&1; then - echo 'seqtk is not installed!' - exit + echo 'seqtk is not installed!' + exit fi if ! type parallel >/dev/null 2>&1; then @@ -58,72 +73,16 @@ fi #obtain specific-Y kmers echo "***********************************Preparation*************************************" -#calculate specific k-mer density -NUM=`wc -l $KMER |awk 'BEGIN{ORS=""}{print $1}'` -echo "The number of specific k-mers is $NUM." -CORRACT_RATE=0.85 #the correct rate of long reads from third sequencing -DENSITY=`perl -e '$c=sprintf ("%.1f",'${CORRACT_RATE}'**'$KLEN'*'$NUM'*1000/('$GSIZE'*1000000)); print $c;'` -echo "The kmer length is $KLEN;Average specific k-mer density is $DENSITY per kb of long reads." - -echo -e "\n***********************************Process*****************************************" - -#pre-process the input file -create_reverse_complement.pl $KMER > output/kmer_rc.txt -LREAD=${LREAD//,/ } -for f in $( ls $LREAD ); do seqtk seq -CA $f; done | tee >(grep '>' - |awk '{gsub(/^>/,"");print $0"\t"NR}'> output/origin2newid.txt) | seqtk rename - > output/changeid.fa -samtools faidx output/changeid.fa - -#split kmer -PART=`perl -e '$p=(int('$NUM'/100000)+1);print $p;'` -split -l 200000 output/kmer_rc.txt output/x -echo "Dividing the kmer file into $PART parts to decrease the memory. xaa,xab,xac..." - -#sort kmer -for k in `ls output/x* | grep -v '\.'`;do echo "sort $k >$k.st";done > temp.sh -parallel -j $CPU output/x*.st" - -#search long reads contained specific kmers -for s in `ls output/x* | grep -v '\.'`;do echo "grep -F -b -n -o -f $s output/changeid.fa > $s.kmercount";done >temp.sh -parallel -j $CPU output/x*.kmercount" - -#obtain the position of specific kmers on each read -for c in `ls output/x* | grep -v '\.'`;do echo "get_loci.pl output/changeid.fa.fai $c.kmercount $KLEN > $c.loci";done >temp.sh -parallel -j $CPU output/x*.loci" - -#get the extended sequences including specific kmers -for l in `ls output/x* | grep -v '\.'`;do echo "seqtk subseq output/changeid.fa $l.loci >$l.loci.fa";done >temp.sh -parallel -j $CPU output/x*.loci.fa" - -#acquire the kmers in sequences -for f in `ls output/x* | grep -v '\.'`;do echo "kmer_count -l $KLEN -i $f.loci.fa -o $f.loci.kmer";done >temp.sh -parallel -j $CPU output/x*.loci.kmer" - -#filter kmers of last step -for lk in `ls output/x* | grep -v '\.'`;do echo "filterx -k s $lk.loci.kmer $lk.st |cut -f 1 > $lk.need.kmer";done >temp.sh -parallel -j $CPU output/x*.need.kmer" - -#the relationship between kmer and read id -for lf in `ls output/x* | grep -v '\.'`;do echo "kmer2id.pl $lf.loci.fa $KLEN | sort -k1,1 -k2,2 -u > $lf.loci.kmer2id";done >temp.sh -parallel -j $CPU output/x*.loci.kmer2id" - -#get the id -for li in `ls output/x* | grep -v '\.'`;do echo "filterx -k s $li.loci.kmer2id:dup=Y $li.need.kmer | cut -f 2 > $li.seqid";done >temp.sh -parallel -j $CPU output/x*.seqid" -rm temp.sh - -#get reads -echo -e "\n***************************************Final****************************************" -cat output/x*.seqid >output/ALL.seqid -count_density.pl output/ALL.seqid output/changeid.fa.fai > output/ALL.density -select_reads.pl output/ALL.density $DENSITY output/origin2newid.txt output/changeid.fa -echo "DONE!" -echo "The final filtered reads file of Y/W chromosome -> output/candidate_target.fa" -echo "The final filtered reads file of autosome and X/Z chromosome -> output/candidate_other.fa" +if [[ $MCOV == 0 ]] +then + echo "Min coverage:$COV;max coverage:unlimit" +else + echo "Min coverage:$COV;max coverage:$MCOV" +fi + +call_kmc.pl $MSREAD $FSREAD $KLEN $COV $MCOV +parallel -j $CPU $KMER +rm -r tmp_* output/M.kmer output/F.kmer diff --git a/SRY b/SRY_pb2ont similarity index 66% rename from SRY rename to SRY_pb2ont index 8845e61..0ac41a6 100644 --- a/SRY +++ b/SRY_pb2ont @@ -5,27 +5,24 @@ script_dir=$(dirname $script_abs) export PATH=$PATH:$script_dir/script export LC_ALL=C -usageHelp="\nUsage: ./${0##*/} (parameters with * must be supplied!)\n\n-m * Male (or female for W) short-read files with comma separated in sample and plus separated between samples\n-f * Female (or male for W) short-read files with comma separated in sample and plus separated between samples\n-l * Long-read files of targeted genomes with comma separated (support for both fa and fq)\n-g * Approximate genome size of targeted Y/W chromosome. such as 52 for human Y(The unit is Megabase)\n-i Minimum coverage of k-mers in each sample (default: 2)\n-a Maximum coverage of k-mers in each sample (default: unlimit)\n-k K-mer length (default: 21)\n-p CPU number (default: 1)\n-h Help and exit." +usageHelp="\n#################### Sorting pacbio or nanopore reads based on kmer file #############################\n\nUsage: ./${0##*/} (parameters with * must be supplied!)\n\n-i * Specific-kmer file (output/SRY_kmer.txt generated by SRY)\n-l * Long-read files of targeted genomes with comma separated (support for both fa and fq)\n-g * Approximate genome size of targeted Y/W chromosome. such as 52 for human Y(The unit is Megabase)\n-k K-mer length (default: 21)\n-p CPU number (default: 1)\n-h Help and exit." printHelpAndExit() { echo -e "$usageHelp" exit; } -while getopts "m:f:l:g:i:a:k:p:h" opt; do +while getopts "i:l:g:k:p:h" opt; do case $opt in - m) MSREAD=$OPTARG ;; - f) FSREAD=$OPTARG ;; + i) KMER=$OPTARG ;; l) LREAD=$OPTARG ;; g) GSIZE=$OPTARG ;; - i) COV=$OPTARG ;; - a) MCOV=$OPTARG ;; k) KLEN=$OPTARG ;; p) CPU=$OPTARG ;; h) printHelpAndExit 0;; esac done -if [ -z "$MSREAD" ] || [ -z "$FSREAD" ] || [ -z "$LREAD" ] || [ -z "$GSIZE" ];then +if [ -z "$KMER" ] || [ -z "$LREAD" ] || [ -z "$GSIZE" ];then printHelpAndExit exit fi @@ -34,33 +31,28 @@ if [ -z "$KLEN" ];then KLEN=21 fi -if [ -z "$COV" ];then - COV=2 -fi - -if [ -z "$MCOV" ];then - MCOV=0 -fi - if [ -z "$CPU" ];then CPU=1 fi - if [ ! -d "output" ];then mkdir output fi #judge if the softwares needed are installed. +if ! type kmc >/dev/null 2>&1; then + echo 'kmc is not installed!' + exit +fi if ! type samtools >/dev/null 2>&1; then - echo 'samtools is not installed!' - exit + echo 'samtools is not installed!' + exit fi if ! type seqtk >/dev/null 2>&1; then - echo 'seqtk is not installed!' - exit + echo 'seqtk is not installed!' + exit fi if ! type parallel >/dev/null 2>&1; then @@ -71,19 +63,6 @@ fi #obtain specific-Y kmers echo "***********************************Preparation*************************************" -if [[ $MCOV == 0 ]] -then - echo "Min coverage:$COV;max coverage:unlimit" -else - echo "Min coverage:$COV;max coverage:$MCOV" -fi - -call_kmer_count.pl $MSREAD $FSREAD $KLEN $COV $MCOV -parallel -j $CPU $KMER - #calculate specific k-mer density NUM=`wc -l $KMER |awk 'BEGIN{ORS=""}{print $1}'` echo "The number of specific k-mers is $NUM." @@ -96,8 +75,11 @@ echo -e "\n***********************************Process*************************** #pre-process the input file create_reverse_complement.pl $KMER > output/kmer_rc.txt LREAD=${LREAD//,/ } -for f in $( ls $LREAD ); do seqtk seq -CA $f; done | tee >(grep '>' - |awk '{gsub(/^>/,"");print $0"\t"NR}'> output/origin2newid.txt) | seqtk rename - > output/changeid.fa -samtools faidx output/changeid.fa + +if [ ! -s output/changeid.fa.fai ];then + for f in $( ls $LREAD ); do seqtk seq -CA $f; done | tee >(grep '>' - |awk '{gsub(/^>/,"");print $0"\t"NR}'> output/origin2newid.txt) | seqtk rename - > output/changeid.fa + samtools faidx output/changeid.fa +fi #split kmer PART=`perl -e '$p=(int('$NUM'/100000)+1);print $p;'` @@ -110,7 +92,7 @@ parallel -j $CPU output/x*.st" #search long reads contained specific kmers -for s in `ls output/x* | grep -v '\.'`;do echo "grep -F -b -n -o -f $s output/changeid.fa > $s.kmercount";done >temp.sh +for s in `ls output/x* | grep -v '\.'`;do echo "grep -i -F -b -n -o -f $s output/changeid.fa > $s.kmercount";done >temp.sh parallel -j $CPU output/x*.kmercount" @@ -125,7 +107,7 @@ parallel -j $CPU output/x*.loci.fa" #acquire the kmers in sequences -for f in `ls output/x* | grep -v '\.'`;do echo "kmer_count -l $KLEN -i $f.loci.fa -o $f.loci.kmer";done >temp.sh +for f in `ls output/x* | grep -v '\.' | xargs -i basename {}`;do echo "mkdir tmp_$f && kmc -t1 -m3 -hp -k$KLEN -fa output/$f.loci.fa tmp_$f tmp_$f && kmc_dump tmp_$f tmp_${f}.dump && sort -k1,1 tmp_${f}.dump >output/${f}.loci.kmer";done >temp.sh parallel -j $CPU output/x*.loci.kmer" @@ -153,3 +135,5 @@ select_reads.pl output/ALL.density $DENSITY output/origin2newid.txt output/chang echo "DONE!" echo "The final filtered reads file of Y/W chromosome -> output/candidate_target.fa" echo "The final filtered reads file of autosome and X/Z chromosome -> output/candidate_other.fa" + +rm -r tmp_* output/x* output/kmer_rc.txt diff --git a/example_run.sh b/example_run.sh index 9332db2..4f8586c 100644 --- a/example_run.sh +++ b/example_run.sh @@ -1 +1,2 @@ -./SRY -m data/HG01109_R1.fq.gz,data/HG01109_R2.fq.gz+data/HG01107_R1.fq.gz,data/HG01107_R2.fq.gz -f data/HG01108_R1.fq.gz,data/HG01108_R2.fq.gz -l data/HG01109_longread_1.fasta.gz,data/HG01109_longread_2.fasta.gz -g 1 -p 5 +./SRY_k -m data/HG01109_R1.fq.gz,data/HG01109_R2.fq.gz+data/HG01107_R1.fq.gz,data/HG01107_R2.fq.gz -f data/HG01108_R1.fq.gz,data/HG01108_R2.fq.gz +./SRY_pb2ont -i output/SRY_kmer.txt -l data/HG01109_longread_1.fasta.gz,data/HG01109_longread_2.fasta.gz -g 1 -p 5 diff --git a/script/call_kmc.pl b/script/call_kmc.pl new file mode 100644 index 0000000..9e240ce --- /dev/null +++ b/script/call_kmc.pl @@ -0,0 +1,72 @@ +#!/usr/bin/perl -w +use strict; + +die "perl $0 male_fq_files female_fq_files kmer_length min_depth max_depth\n" if @ARGV!=5; + +my $kmer_len = $ARGV[2]; +my $min_depth = $ARGV[3]; +my $max_depth = $ARGV[4]; + +my $cmd="output/kmer.sh"; +open OUT,">$cmd"; + +my ($filterx_cmdm,$filterx_cmdf); + +#cmd of male population +my @msamples = split(/\+/,$ARGV[0]); +my $mnum = $#msamples+1; +my $mlimit = int($mnum*2/3+0.5); + +my $seq = ''; +for (my $j=0;$j<@msamples;$j++){ + my @mfiles = split(/,/,$msamples[$j]); + my $m_tmp = "output/SRY_tmp_M$j.file"; + open MOUT,">$m_tmp"; + for (my $i=0;$i<@mfiles;$i++){ + print MOUT "$mfiles[$i]\n"; + } + + + if ($max_depth == 0){ + $seq .= " output/M$j.kmer"; + }else{ + $seq .= " <(awk \'\$2<=$max_depth\' output/M$j.kmer)"; + } + + print OUT "mkdir tmp_M$j && kmc -t1 -m3 -hp -k$kmer_len -fq \@$m_tmp tmp_M$j tmp_M$j && kmc_tools transform tmp_M$j dump -s output/M$j.kmer\n"; + close MOUT; +} + +$filterx_cmdm="filterx -1 \'cnt>=$mlimit\' -k s $seq |awk \'{for (i=1;i<=NF;i+=2){if (\$i~/[A-Z]/){print \$i;break}}}\' | gzip > output/Male_kmer.dat.gz\n"; + +#cmd of female population +my @fsamples = split(/\+/,$ARGV[1]); + +$seq = ''; +for (my $j=0;$j<@fsamples;$j++){ + my @ffiles = split(/,/,$fsamples[$j]); + my $f_tmp = "output/SRY_tmp_F$j.file"; + open FOUT,">$f_tmp"; + for (my $i=0;$i<@ffiles;$i++){ + print FOUT "$ffiles[$i]\n"; + } + + if ($max_depth == 0){ + $seq .= " output/F$j.kmer"; + }else{ + $seq .= " <(awk \'\$2<=$max_depth\' output/F$j.kmer)"; + } + print OUT "mkdir tmp_F$j && kmc -t1 -m3 -hp -k$kmer_len -fq \@$f_tmp tmp_F$j tmp_F$j && kmc_tools transform tmp_F$j dump -s output/F$j.kmer\n"; + close FOUT; +} +close OUT; + +$filterx_cmdf="filterx -k s -1 \'cnt>=1\' $seq |awk \'{for (i=1;i<=NF;i+=2){if (\$i~/[A-Z]/){print \$i;break}}}\' |gzip > output/Female_kmer.dat.gz\n"; + +my $ftcmd="output/ft.sh"; +open OUT1,">$ftcmd"; +if ($filterx_cmdm || $filterx_cmdf){ + print OUT1 "$filterx_cmdm" if ($filterx_cmdm); + print OUT1 "$filterx_cmdf" if ($filterx_cmdf); +} +close OUT1; diff --git a/script/combine_hifi_reads.pl b/script/combine_hifi_reads.pl new file mode 100644 index 0000000..3e1567c --- /dev/null +++ b/script/combine_hifi_reads.pl @@ -0,0 +1,29 @@ +#!/usr/bin/perl -w +use strict; + +die "perl $0 file1 ...\n" if @ARGV==0; + +my %hash; +my $flag = 0; +for (my $i=0;$i<@ARGV;$i++){ + my $in = $ARGV[$i]; + open IN,$in; + while (){ + chomp; + if (/>(\S+)/){ + my $id = $1; + if (!defined $hash{$id}){ + $hash{$id} = ""; + print "$_\n"; + $flag = 1; + }else{ + $flag = 0; + } + }else{ + if ($flag == 1){ + print "$_\n"; + } + } + } + close IN; +} diff --git a/script/find_candidate_id.pl b/script/find_candidate_id.pl new file mode 100644 index 0000000..76af9e0 --- /dev/null +++ b/script/find_candidate_id.pl @@ -0,0 +1,26 @@ +#!/usr/bin/perl -w +use strict; + +die "perl $0 need.kmer kmer2id\n" if @ARGV!=2; + +my $in = $ARGV[0]; +open IN,$in; + +my %hash; +while (){ + chomp; + $hash{$_}=''; +} +close IN; + +my $in1 = $ARGV[1]; +open IN1,$in1; + +my @info; +while (){ + chomp; + @info = split; + if (defined $hash{$info[0]}){ + print "$info[1]\n"; + } +} diff --git a/script/select_reads.pl b/script/select_reads.pl index a7a35b3..b5f8f72 100644 --- a/script/select_reads.pl +++ b/script/select_reads.pl @@ -54,13 +54,13 @@ print OUT ">$old2new{$id}\n"; }else{ $flag = 0; - print OUT1 ">$old2new{$id}\n"; + #print OUT1 ">$old2new{$id}\n"; } }else{ if ($flag == 1){ print OUT "$_\n"; }else{ - print OUT1 "$_\n"; + #print OUT1 "$_\n"; } } }