From 2731619628cb2a4faef10e52bb679cdd8badd75b Mon Sep 17 00:00:00 2001 From: Stephan Ripke Date: Thu, 23 Jun 2016 13:42:50 +0200 Subject: [PATCH] included_indels_and_multipos_ino_chepos_chefli --- rp_bin/checkflip4 | 10 ++- rp_bin/checkpos6 | 145 +++++++++++++++++++++++++++++++++++++--- rp_bin/impute_dirsub_57 | 6 +- 3 files changed, 145 insertions(+), 16 deletions(-) diff --git a/rp_bin/checkflip4 b/rp_bin/checkflip4 index 1fd0c47..679eeb7 100755 --- a/rp_bin/checkflip4 +++ b/rp_bin/checkflip4 @@ -585,6 +585,11 @@ while (my $line = ) { my $fr_ref = $cells[6]; +# if ($snp eq "chr5_172342029_I") { +# print "@cells\n"; + # } + + # if ($a1_bim ~= "I" || $a2_bim ~= "I") { # if ($a1_ref ~= "I" || $a2_ref ~= "I") { @@ -766,8 +771,9 @@ push @log_lines, "not found: \t".@snp_nan." \t-> $bim_nal\n"; push @log_lines, "ambiguous, insecure freq: \t".@snp_uif."\t-> $bim_uif\n"; push @log_lines, "freq-diff > $frq_th : \t".$nsnp_bf."\t-> $bim_bf\n"; -push @log_lines, "\nexclude:\n"; -push @log_lines, "translated indels: \t".@snp_inde."\t-> $bim_inde\n"; + + +push @log_lines, "\ntranslated indels: \t".@snp_inde."\t-> $bim_inde\n"; push @log_lines, "\n\n\nI would exclude unmatched, unfound, different and insecure unflippable SNPs:\n"; push @log_lines, "\nthese are roughly ".@snp_nan + @snp_xal + @snp_uif + $nsnp_bf." SNPs (multiple entries possible):\n"; diff --git a/rp_bin/checkpos6 b/rp_bin/checkpos6 index b56886b..d82d04f 100755 --- a/rp_bin/checkpos6 +++ b/rp_bin/checkpos6 @@ -48,10 +48,14 @@ my $p2loc = &trans("p2loc"); my $scol = 2; ## snp-col in bim-file my $ccol = 1; ## chr-col in bim-file my $kcol = 4; ## kb-col in bim-file +my $a1col = 5; ## kb-col in bim-file +my $a2col = 6; ## kb-col in bim-file my $dscol = 1; ## snp-col in reference my $dccol = 2; ## chr-col in reference my $dkcol = 3; ## kb-col in reference +my $da1col = 4; ## kb-col in reference +my $da2col = 5; ## kb-col in reference my $version = "1.0.0"; my $progname = $0; @@ -81,7 +85,7 @@ version: $version --ploc STRING location of plink-binary (default is found at Broad) default: $p2loc --col INT,INT,INT snp-col,chr-col,kb-col in bim-file: default: $scol,$ccol,$kcol - --dbcol INT,INT,INT snp-col,chr-col,kb-col in dbsnp-file: default: $dscol,$dccol,$dkcol + --dbcol INT,INT,INT snp-col,chr-col,kb-col in dbsnp-file: default: $dscol,$dccol,$dkcol,$da1col,$da2col --nocreate do not create clean datasets, only analyse --exmulti exclude all multiple annotated @@ -111,12 +115,12 @@ GetOptions( die "$usage\n" if (@ARGV != 1 || $help); if ($colstr) { - ($scol,$ccol,$kcol) = split(/,/, $colstr); - die "*****\nwrong col-usage\n****\n$usage\n" if ($kcol eq ""); + ($scol,$ccol,$kcol,$a1col,$a2col) = split(/,/, $colstr); + die "*****\nwrong col-usage\n****\n$usage\n" if ($a2col eq ""); } if ($dcolstr) { - ($dscol,$dccol,$dkcol) = split(/,/, $dcolstr); - die "*****\nwrong col-usage\n****\n$usage\n" if ($dkcol eq ""); + ($dscol,$dccol,$dkcol,$da1col,$da2col) = split(/,/, $dcolstr); + die "*****\nwrong col-usage\n****\n$usage\n" if ($da2col eq ""); } unless (-e $dbsnp_file) { @@ -155,6 +159,10 @@ my $bim_npos=$bim_file.".npos"; my $bim_rpos=$bim_file.".rpos"; my $bim_uchr=$bim_file.".uchr"; my $bim_ukb=$bim_file.".ukb"; +my $bim_trans_indel=$bim_file.".tr.indel"; +my $bim_trans_snp=$bim_file.".tr.snp"; +my $bim_notrans_variant=$bim_file.".notr.variant"; +my $bim_trans_multi=$bim_file.".tr.multi"; my $bim_extr=$bim_file.".extract"; @@ -299,6 +307,8 @@ while (my $line = ){ my $snp = $cells[$scol-1]; my $chr = $cells[$ccol-1] ; my $pos = $cells[$kcol-1] ; + my $a1 = $cells[$a1col-1] ; + my $a2 = $cells[$a2col-1] ; @@ -404,6 +414,7 @@ print "number of warning extractions: $name_war\n"; #my @bim_lines; my %bim_info; # position for snp_name my %loc_info; # also read all positions, safes SNP name for position in bim-file +my %gt_hash; die "$bfile.bim.ow".$! unless open FILE , "< $bfile.bim.ow"; while (my $line = ){ @@ -419,11 +430,13 @@ while (my $line = ){ if ($cells[$ccol-1] != 0 || $cells[$kcol-1] != 0) { my $posloc = " $cells[$ccol-1] $cells[$kcol-1]"; + my $gtloc = $cells[$a1col-1].$cells[$a2col-1]; # if (exists $loc_info{$posloc}){ # print "double entry on position: $posloc\n"; # exit; # } $loc_info{$posloc} = $cells[$scol-1]; + $gt_hash{$posloc} = $gtloc; } else { ## check the presumably right positions @@ -453,7 +466,15 @@ print "read reference-file: $dbsnp_file\n"; my %bim_in_ref; ## safe snp names that are found in reference die "$dbsnp_file".$! unless open FILE , "< $dbsnp_file"; die "$bim_joined".$! unless open OF , "> $bim_joined"; -die "$bim_addpos".$! unless open AP , "> $bim_addpos"; + +my %loc_hash; +my %addpos_hash; +my @trans_indel; +my @trans_snp; +my @trans_multi; +my @notrans_variant; + + while (my $line = ){ my @cells = @{&split_line_ref(\$line)}; my $info_loc = " $cells[$dccol-1] $cells[$dkcol-1]"; @@ -466,7 +487,7 @@ while (my $line = ){ ########################################### ### this is crucual: - ### only if the SNP is not present on the bim file, there is a position lookup + ### only if the SNP is not present on the bim file, there is a position lookup, this is hidden in the elsif ############################################## if (exists $bim_info{$cells[$dscol-1]}) { @@ -500,7 +521,83 @@ while (my $line = ){ # $bim_in_ref{$cells[$dscol-1]} = 1; - print AP $cells[$dscol-1]." ".$loc_info{$info_loc}." ".$info_loc."\n"; + + # $addpos_hash{$info_loc}= $cells[$dscol-1]." ".$loc_info{$info_loc}." ".$info_loc; + + my $length_bim_gt = length($gt_hash{$info_loc}); + my $ref1_gt = $cells[$da1col-1].$cells[$da2col-1]; + my $length_ref1 = length ($ref1_gt); + + my $bim_indel = 0; + $bim_indel = 1 if ($length_bim_gt > 2); + $bim_indel = 1 if ($gt_hash{$info_loc} =~ /-/); + + my $ref_indel = 0; + $ref_indel = 1 if ($length_ref1 > 2); + $ref_indel = 1 if ($ref1_gt =~ /-/); + + my $replace = 0; + if ($bim_indel && $ref_indel) { + $replace = 1; + # print "replacing snpname with position for an indel: $info_loc\n"; + push @trans_indel, "replacing indel with reference based on position: $loc_info{$info_loc} $cells[$dscol-1] $info_loc\n"; + } + elsif ($bim_indel == 0 && $ref_indel == 0) { + $replace = 1; + # print "replacing snpname after position for an non-indel: $info_loc\n"; + push @trans_snp, "replacing snp with reference based on position: $loc_info{$info_loc} $cells[$dscol-1] $info_loc\n"; + } + else { +# print "not replacing snpname with same position for an indel / non-indel: $info_loc\n"; + push @notrans_variant, "not_replacing variant with reference based on position: $loc_info{$info_loc} $cells[$dscol-1] $info_loc\n"; + } + + + ## maybe this whole bracket is not necessary +# if (exists $addpos_hash{$info_loc}) { +# print "double position at $info_loc $addpos_hash{$info_loc} $cells[$da1col-1] $cells[$da2col-1] $gt_hash{$info_loc}\n"; +# my $length_bim_gt = length($gt_hash{$info_loc}); +# my $ref1_gt = $cells[$da1col-1].$cells[$da2col-1]; +# my $length_ref1 = length ($ref1_gt); + +# my @ref2_arr = split(" ", $addpos_hash{$info_loc}); +# my $length_ref2= length ($ref2_arr[4].$ref2_arr[5]); + +# print "$length_bim_gt\n"; +# print "$length_ref1\n"; +# print "$length_ref2\n"; + +# my $bim_indel = 0; +# $bim_indel = 1 if ($length_bim_gt > 2); +# $bim_indel = 1 if ($gt_hash{$info_loc} =~ /-/); + +# my $ref_indel = 0; +# $ref_indel = 1 if ($length_ref_gt > 2); +# $ref_indel = 1 if ($ref1_gt =~ /-/); + +# if ($bim_indel) { +# print "$info_loc seems to be an indel: $gt_hash{$info_loc}\n"; +# } +# else { +# print "$info_loc seems not to be an indel: $gt_hash{$info_loc}\n"; +# } +# exit; + +# } +# $loc_hash{$info_loc} = "$cells[$da1col-1] $cells[$da2col-1]"; + + + + + if ($replace) { + + if (exists $addpos_hash{$info_loc}) { + push @trans_multi, "Waruning: double position at $info_loc $addpos_hash{$info_loc} $cells[$da1col-1] $cells[$da2col-1] $gt_hash{$info_loc}\n"; + } + + $addpos_hash{$info_loc}= $cells[$dscol-1]." ".$loc_info{$info_loc}." ".$info_loc." $cells[$da1col-1] $cells[$da2col-1]"; + } + # delete ($bim_info{$loc_info{$info_loc}}); } } @@ -511,13 +608,24 @@ foreach my $snp_loc (keys %bim_info) { print OF $snp_loc.$bim_info{$snp_loc}."\n"; } close OF; + + + +die "$bim_addpos".$! unless open AP , "> $bim_addpos"; + +foreach (keys %addpos_hash) { + print AP $addpos_hash{$_}."\n"; +} + close AP; -#exit; +#exit; + + ### read bim-file @@ -617,6 +725,7 @@ foreach (keys %pos_count) { } +## here it also makes sure that SNPs that are found in the database dont get translated print "read addpos file for translating SNP names\n"; die $!." <$bim_addpos>" unless open AP, "< $bim_addpos"; @@ -631,13 +740,17 @@ while (my $line = ) { $usnp++; } else { - print AD "$cells[1] $cells[0] $cells[2] $cells[3] non_replace\n"; + print AD "$cells[1] $cells[0] $cells[2] $cells[3] non_replace (because variant name exists at different location\n"; $unsnp++; } } close AP; close AD; +print "$bim_addpos\n"; +print "$bim_addpos.det\n"; +#exit; + ########################################## # exclude multiple SNPs from update lists. ########################################## @@ -662,6 +775,10 @@ print "write files...\n"; &a2file($bim_xpos,@snp_xpos); &a2file($bim_xchr,@snp_xchr); &a2file($bim_xkb,@snp_xkb); +&a2file($bim_trans_indel,@trans_indel); +&a2file($bim_trans_snp,@trans_snp); +&a2file($bim_trans_multi,@trans_multi); +&a2file($bim_notrans_variant,@notrans_variant); &a2file($bim_rpos,@snp_rpos); @@ -676,9 +793,15 @@ push @log_lines, "rs_name extraction :\t".$name_rs."\t-> $bfile.bim.o push @log_lines, "rs_name extraction (not done):\t".$name_nrs."\t-> $bfile.bim.ow.det\n"; push @log_lines, "position extraction :\t".$name_pos."\t-> $bfile.bim.ow.det\n"; push @log_lines, "warning extraction :\t".$name_war."\t-> $bfile.bim.ow.det\n"; + push @log_lines, "\nsnp-names taken from reference based on position:\n"; push @log_lines, "as noted above :\t".$usnp."\t-> $bim_addpos.det\n"; push @log_lines, "not done since snpname exists:\t".$unsnp."\t-> $bim_addpos.det\n"; +push @log_lines, "----a bit more detail--------:\n"; +push @log_lines, "translated indels :\t".@trans_indel."\t-> $bim_trans_indel\n"; +push @log_lines, "translated snps :\t".@trans_snp."\t-> $bim_trans_snp\n"; +push @log_lines, "not translated indel/snp :\t".@notrans_variant."\t-> $bim_notrans_variant\n"; +push @log_lines, "warning multi positions :\t".@trans_multi."\t-> $bim_trans_multi\n"; push @log_lines, "\npositions of rs-name in db-snp:\n"; push @log_lines, "wrong (chr and kb) :\t".@snp_xpos."\t-> $bim_xpos\n"; push @log_lines, "wrong (chr), update possible :\t".@snp_xchr."\t-> $bim_xchr\n"; @@ -744,7 +867,7 @@ push @log_lines, $cmd3; &mysystem ("mv $bfile_dbsnp.fam $rootdir/"); -&mysystem ("tar -cvzf $bfile_dbsnp.tar.gz $bim_xpos $bim_xchr $bim_xkb $bim_rpos $bim_npos $bim_xdup $bim_ukb $bim_uchr $bim_addpos.det $bfile.bim.ow.det"); +&mysystem ("tar -cvzf $bfile_dbsnp.tar.gz $bim_xpos $bim_xchr $bim_xkb $bim_rpos $bim_npos $bim_xdup $bim_ukb $bim_uchr $bim_addpos.det $bfile.bim.ow.det $bim_trans_indel $bim_trans_snp $bim_trans_multi $bim_notrans_variant"); &mysystem ("mv $bfile_dbsnp.tar.gz $rootdir/"); &mysystem ("mv $bim_log $rootdir/"); diff --git a/rp_bin/impute_dirsub_57 b/rp_bin/impute_dirsub_57 index db5b02f..efab6e1 100755 --- a/rp_bin/impute_dirsub_57 +++ b/rp_bin/impute_dirsub_57 @@ -1680,7 +1680,7 @@ unless (-e "$rootdir/posing_done") { if (-e $locref) { print "locref $locref is existing! safes some time\n"; unless (exists $bimpos_array{$accfli}) { - push @chepos_arr, "$checkpos_script --dbcol 1,2,3 --dbsnp $rootdir/$impute_dir/$locref $bfile.hg19.bim" ;# + push @chepos_arr, "$checkpos_script --dbcol 1,2,3,4,5 --dbsnp $rootdir/$impute_dir/$locref $bfile.hg19.bim" ;# # print "$checkpos_script --dbcol 1,2,3 --dbsnp $rootdir/$impute_dir/$locref $bfile.hg19.bim\n" ;# } @@ -1691,7 +1691,7 @@ unless (-e "$rootdir/posing_done") { else { print "locref $locref is not existing! would be better if it did\n"; unless (exists $bimpos_array{$accfli}) { - push @chepos_arr, "$checkpos_script --dbcol 1,8,9 --dbsnp $refdir/$suminfo_s $bfile.hg19.bim" ;# + push @chepos_arr, "$checkpos_script --dbcol 1,8,9,3,5 --dbsnp $refdir/$suminfo_s $bfile.hg19.bim" ;# } else { $chepos_fini++; @@ -1794,7 +1794,7 @@ my $chefli_fini = 0; -exit; +#exit;