Skip to content

Commit

Permalink
included_indels_and_multipos_ino_chepos_chefli
Browse files Browse the repository at this point in the history
  • Loading branch information
stephanripke committed Jun 23, 2016
1 parent 6e7d430 commit 2731619
Show file tree
Hide file tree
Showing 3 changed files with 145 additions and 16 deletions.
10 changes: 8 additions & 2 deletions rp_bin/checkflip4
Original file line number Diff line number Diff line change
Expand Up @@ -585,6 +585,11 @@ while (my $line = <FJ>) {
my $fr_ref = $cells[6];


# if ($snp eq "chr5_172342029_I") {
# print "@cells\n";
# }



# if ($a1_bim ~= "I" || $a2_bim ~= "I") {
# if ($a1_ref ~= "I" || $a2_ref ~= "I") {
Expand Down Expand Up @@ -766,8 +771,9 @@ push @log_lines, "not found: \t".@snp_nan." \t-> $bim_nal\n";
push @log_lines, "ambiguous, insecure freq: \t".@snp_uif."\t-> $bim_uif\n";
push @log_lines, "freq-diff > $frq_th : \t".$nsnp_bf."\t-> $bim_bf\n";

push @log_lines, "\nexclude:\n";
push @log_lines, "translated indels: \t".@snp_inde."\t-> $bim_inde\n";


push @log_lines, "\ntranslated indels: \t".@snp_inde."\t-> $bim_inde\n";

push @log_lines, "\n\n\nI would exclude unmatched, unfound, different and insecure unflippable SNPs:\n";
push @log_lines, "\nthese are roughly ".@snp_nan + @snp_xal + @snp_uif + $nsnp_bf." SNPs (multiple entries possible):\n";
Expand Down
145 changes: 134 additions & 11 deletions rp_bin/checkpos6
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,14 @@ my $p2loc = &trans("p2loc");
my $scol = 2; ## snp-col in bim-file
my $ccol = 1; ## chr-col in bim-file
my $kcol = 4; ## kb-col in bim-file
my $a1col = 5; ## kb-col in bim-file
my $a2col = 6; ## kb-col in bim-file

my $dscol = 1; ## snp-col in reference
my $dccol = 2; ## chr-col in reference
my $dkcol = 3; ## kb-col in reference
my $da1col = 4; ## kb-col in reference
my $da2col = 5; ## kb-col in reference

my $version = "1.0.0";
my $progname = $0;
Expand Down Expand Up @@ -81,7 +85,7 @@ version: $version
--ploc STRING location of plink-binary (default is found at Broad)
default: $p2loc
--col INT,INT,INT snp-col,chr-col,kb-col in bim-file: default: $scol,$ccol,$kcol
--dbcol INT,INT,INT snp-col,chr-col,kb-col in dbsnp-file: default: $dscol,$dccol,$dkcol
--dbcol INT,INT,INT snp-col,chr-col,kb-col in dbsnp-file: default: $dscol,$dccol,$dkcol,$da1col,$da2col
--nocreate do not create clean datasets, only analyse
--exmulti exclude all multiple annotated
Expand Down Expand Up @@ -111,12 +115,12 @@ GetOptions(
die "$usage\n" if (@ARGV != 1 || $help);

if ($colstr) {
($scol,$ccol,$kcol) = split(/,/, $colstr);
die "*****\nwrong col-usage\n****\n$usage\n" if ($kcol eq "");
($scol,$ccol,$kcol,$a1col,$a2col) = split(/,/, $colstr);
die "*****\nwrong col-usage\n****\n$usage\n" if ($a2col eq "");
}
if ($dcolstr) {
($dscol,$dccol,$dkcol) = split(/,/, $dcolstr);
die "*****\nwrong col-usage\n****\n$usage\n" if ($dkcol eq "");
($dscol,$dccol,$dkcol,$da1col,$da2col) = split(/,/, $dcolstr);
die "*****\nwrong col-usage\n****\n$usage\n" if ($da2col eq "");
}

unless (-e $dbsnp_file) {
Expand Down Expand Up @@ -155,6 +159,10 @@ my $bim_npos=$bim_file.".npos";
my $bim_rpos=$bim_file.".rpos";
my $bim_uchr=$bim_file.".uchr";
my $bim_ukb=$bim_file.".ukb";
my $bim_trans_indel=$bim_file.".tr.indel";
my $bim_trans_snp=$bim_file.".tr.snp";
my $bim_notrans_variant=$bim_file.".notr.variant";
my $bim_trans_multi=$bim_file.".tr.multi";
my $bim_extr=$bim_file.".extract";


Expand Down Expand Up @@ -299,6 +307,8 @@ while (my $line = <FI>){
my $snp = $cells[$scol-1];
my $chr = $cells[$ccol-1] ;
my $pos = $cells[$kcol-1] ;
my $a1 = $cells[$a1col-1] ;
my $a2 = $cells[$a2col-1] ;



Expand Down Expand Up @@ -404,6 +414,7 @@ print "number of warning extractions: $name_war\n";
#my @bim_lines;
my %bim_info; # position for snp_name
my %loc_info; # also read all positions, safes SNP name for position in bim-file
my %gt_hash;

die "$bfile.bim.ow".$! unless open FILE , "< $bfile.bim.ow";
while (my $line = <FILE>){
Expand All @@ -419,11 +430,13 @@ while (my $line = <FILE>){

if ($cells[$ccol-1] != 0 || $cells[$kcol-1] != 0) {
my $posloc = " $cells[$ccol-1] $cells[$kcol-1]";
my $gtloc = $cells[$a1col-1].$cells[$a2col-1];
# if (exists $loc_info{$posloc}){
# print "double entry on position: $posloc\n";
# exit;
# }
$loc_info{$posloc} = $cells[$scol-1];
$gt_hash{$posloc} = $gtloc;
}
else {
## check the presumably right positions
Expand Down Expand Up @@ -453,7 +466,15 @@ print "read reference-file: $dbsnp_file\n";
my %bim_in_ref; ## safe snp names that are found in reference
die "$dbsnp_file".$! unless open FILE , "< $dbsnp_file";
die "$bim_joined".$! unless open OF , "> $bim_joined";
die "$bim_addpos".$! unless open AP , "> $bim_addpos";

my %loc_hash;
my %addpos_hash;
my @trans_indel;
my @trans_snp;
my @trans_multi;
my @notrans_variant;


while (my $line = <FILE>){
my @cells = @{&split_line_ref(\$line)};
my $info_loc = " $cells[$dccol-1] $cells[$dkcol-1]";
Expand All @@ -466,7 +487,7 @@ while (my $line = <FILE>){

###########################################
### this is crucual:
### only if the SNP is not present on the bim file, there is a position lookup
### only if the SNP is not present on the bim file, there is a position lookup, this is hidden in the elsif
##############################################

if (exists $bim_info{$cells[$dscol-1]}) {
Expand Down Expand Up @@ -500,7 +521,83 @@ while (my $line = <FILE>){

# $bim_in_ref{$cells[$dscol-1]} = 1;

print AP $cells[$dscol-1]." ".$loc_info{$info_loc}." ".$info_loc."\n";

# $addpos_hash{$info_loc}= $cells[$dscol-1]." ".$loc_info{$info_loc}." ".$info_loc;

my $length_bim_gt = length($gt_hash{$info_loc});
my $ref1_gt = $cells[$da1col-1].$cells[$da2col-1];
my $length_ref1 = length ($ref1_gt);

my $bim_indel = 0;
$bim_indel = 1 if ($length_bim_gt > 2);
$bim_indel = 1 if ($gt_hash{$info_loc} =~ /-/);

my $ref_indel = 0;
$ref_indel = 1 if ($length_ref1 > 2);
$ref_indel = 1 if ($ref1_gt =~ /-/);

my $replace = 0;
if ($bim_indel && $ref_indel) {
$replace = 1;
# print "replacing snpname with position for an indel: $info_loc\n";
push @trans_indel, "replacing indel with reference based on position: $loc_info{$info_loc} $cells[$dscol-1] $info_loc\n";
}
elsif ($bim_indel == 0 && $ref_indel == 0) {
$replace = 1;
# print "replacing snpname after position for an non-indel: $info_loc\n";
push @trans_snp, "replacing snp with reference based on position: $loc_info{$info_loc} $cells[$dscol-1] $info_loc\n";
}
else {
# print "not replacing snpname with same position for an indel / non-indel: $info_loc\n";
push @notrans_variant, "not_replacing variant with reference based on position: $loc_info{$info_loc} $cells[$dscol-1] $info_loc\n";
}


## maybe this whole bracket is not necessary
# if (exists $addpos_hash{$info_loc}) {
# print "double position at $info_loc $addpos_hash{$info_loc} $cells[$da1col-1] $cells[$da2col-1] $gt_hash{$info_loc}\n";
# my $length_bim_gt = length($gt_hash{$info_loc});
# my $ref1_gt = $cells[$da1col-1].$cells[$da2col-1];
# my $length_ref1 = length ($ref1_gt);

# my @ref2_arr = split(" ", $addpos_hash{$info_loc});
# my $length_ref2= length ($ref2_arr[4].$ref2_arr[5]);

# print "$length_bim_gt\n";
# print "$length_ref1\n";
# print "$length_ref2\n";

# my $bim_indel = 0;
# $bim_indel = 1 if ($length_bim_gt > 2);
# $bim_indel = 1 if ($gt_hash{$info_loc} =~ /-/);

# my $ref_indel = 0;
# $ref_indel = 1 if ($length_ref_gt > 2);
# $ref_indel = 1 if ($ref1_gt =~ /-/);

# if ($bim_indel) {
# print "$info_loc seems to be an indel: $gt_hash{$info_loc}\n";
# }
# else {
# print "$info_loc seems not to be an indel: $gt_hash{$info_loc}\n";
# }
# exit;

# }
# $loc_hash{$info_loc} = "$cells[$da1col-1] $cells[$da2col-1]";




if ($replace) {

if (exists $addpos_hash{$info_loc}) {
push @trans_multi, "Waruning: double position at $info_loc $addpos_hash{$info_loc} $cells[$da1col-1] $cells[$da2col-1] $gt_hash{$info_loc}\n";
}

$addpos_hash{$info_loc}= $cells[$dscol-1]." ".$loc_info{$info_loc}." ".$info_loc." $cells[$da1col-1] $cells[$da2col-1]";
}

# delete ($bim_info{$loc_info{$info_loc}});
}
}
Expand All @@ -511,13 +608,24 @@ foreach my $snp_loc (keys %bim_info) {
print OF $snp_loc.$bim_info{$snp_loc}."\n";
}
close OF;



die "$bim_addpos".$! unless open AP , "> $bim_addpos";

foreach (keys %addpos_hash) {
print AP $addpos_hash{$_}."\n";
}

close AP;

#exit;


#exit;





### read bim-file

Expand Down Expand Up @@ -617,6 +725,7 @@ foreach (keys %pos_count) {
}


## here it also makes sure that SNPs that are found in the database dont get translated

print "read addpos file for translating SNP names\n";
die $!." <$bim_addpos>" unless open AP, "< $bim_addpos";
Expand All @@ -631,13 +740,17 @@ while (my $line = <AP>) {
$usnp++;
}
else {
print AD "$cells[1] $cells[0] $cells[2] $cells[3] non_replace\n";
print AD "$cells[1] $cells[0] $cells[2] $cells[3] non_replace (because variant name exists at different location\n";
$unsnp++;
}
}
close AP;
close AD;

print "$bim_addpos\n";
print "$bim_addpos.det\n";
#exit;

##########################################
# exclude multiple SNPs from update lists.
##########################################
Expand All @@ -662,6 +775,10 @@ print "write files...\n";
&a2file($bim_xpos,@snp_xpos);
&a2file($bim_xchr,@snp_xchr);
&a2file($bim_xkb,@snp_xkb);
&a2file($bim_trans_indel,@trans_indel);
&a2file($bim_trans_snp,@trans_snp);
&a2file($bim_trans_multi,@trans_multi);
&a2file($bim_notrans_variant,@notrans_variant);


&a2file($bim_rpos,@snp_rpos);
Expand All @@ -676,9 +793,15 @@ push @log_lines, "rs_name extraction :\t".$name_rs."\t-> $bfile.bim.o
push @log_lines, "rs_name extraction (not done):\t".$name_nrs."\t-> $bfile.bim.ow.det\n";
push @log_lines, "position extraction :\t".$name_pos."\t-> $bfile.bim.ow.det\n";
push @log_lines, "warning extraction :\t".$name_war."\t-> $bfile.bim.ow.det\n";

push @log_lines, "\nsnp-names taken from reference based on position:\n";
push @log_lines, "as noted above :\t".$usnp."\t-> $bim_addpos.det\n";
push @log_lines, "not done since snpname exists:\t".$unsnp."\t-> $bim_addpos.det\n";
push @log_lines, "----a bit more detail--------:\n";
push @log_lines, "translated indels :\t".@trans_indel."\t-> $bim_trans_indel\n";
push @log_lines, "translated snps :\t".@trans_snp."\t-> $bim_trans_snp\n";
push @log_lines, "not translated indel/snp :\t".@notrans_variant."\t-> $bim_notrans_variant\n";
push @log_lines, "warning multi positions :\t".@trans_multi."\t-> $bim_trans_multi\n";
push @log_lines, "\npositions of rs-name in db-snp:\n";
push @log_lines, "wrong (chr and kb) :\t".@snp_xpos."\t-> $bim_xpos\n";
push @log_lines, "wrong (chr), update possible :\t".@snp_xchr."\t-> $bim_xchr\n";
Expand Down Expand Up @@ -744,7 +867,7 @@ push @log_lines, $cmd3;
&mysystem ("mv $bfile_dbsnp.fam $rootdir/");


&mysystem ("tar -cvzf $bfile_dbsnp.tar.gz $bim_xpos $bim_xchr $bim_xkb $bim_rpos $bim_npos $bim_xdup $bim_ukb $bim_uchr $bim_addpos.det $bfile.bim.ow.det");
&mysystem ("tar -cvzf $bfile_dbsnp.tar.gz $bim_xpos $bim_xchr $bim_xkb $bim_rpos $bim_npos $bim_xdup $bim_ukb $bim_uchr $bim_addpos.det $bfile.bim.ow.det $bim_trans_indel $bim_trans_snp $bim_trans_multi $bim_notrans_variant");

&mysystem ("mv $bfile_dbsnp.tar.gz $rootdir/");
&mysystem ("mv $bim_log $rootdir/");
Expand Down
6 changes: 3 additions & 3 deletions rp_bin/impute_dirsub_57
Original file line number Diff line number Diff line change
Expand Up @@ -1680,7 +1680,7 @@ unless (-e "$rootdir/posing_done") {
if (-e $locref) {
print "locref $locref is existing! safes some time\n";
unless (exists $bimpos_array{$accfli}) {
push @chepos_arr, "$checkpos_script --dbcol 1,2,3 --dbsnp $rootdir/$impute_dir/$locref $bfile.hg19.bim" ;#
push @chepos_arr, "$checkpos_script --dbcol 1,2,3,4,5 --dbsnp $rootdir/$impute_dir/$locref $bfile.hg19.bim" ;#
# print "$checkpos_script --dbcol 1,2,3 --dbsnp $rootdir/$impute_dir/$locref $bfile.hg19.bim\n" ;#

}
Expand All @@ -1691,7 +1691,7 @@ unless (-e "$rootdir/posing_done") {
else {
print "locref $locref is not existing! would be better if it did\n";
unless (exists $bimpos_array{$accfli}) {
push @chepos_arr, "$checkpos_script --dbcol 1,8,9 --dbsnp $refdir/$suminfo_s $bfile.hg19.bim" ;#
push @chepos_arr, "$checkpos_script --dbcol 1,8,9,3,5 --dbsnp $refdir/$suminfo_s $bfile.hg19.bim" ;#
}
else {
$chepos_fini++;
Expand Down Expand Up @@ -1794,7 +1794,7 @@ my $chefli_fini = 0;



exit;
#exit;



Expand Down

0 comments on commit 2731619

Please sign in to comment.