Skip to content

Commit ece209a

Browse files
committed
Add scripts to format bed files for MIRACUM-Pipe
1 parent e5f6e96 commit ece209a

File tree

3 files changed

+345
-0
lines changed

3 files changed

+345
-0
lines changed

bedformatting/bed6_2_bed12.pl

+299
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,299 @@
1+
#!/usr/bin/perl -w
2+
3+
# <short description>
4+
#
5+
# bed6_2_bed12.pl copyright by Ruediger Lehmann (2020-07)
6+
#
7+
#
8+
#-------------------------------------------------------------------------------
9+
# Bug-Report
10+
#
11+
# <list of known bugs>
12+
#
13+
#-------------------------------------------------------------------------------
14+
15+
################################################################################
16+
# GLOBAL VARIABLES, MODULES and LIBRARIES
17+
################################################################################
18+
use strict;
19+
20+
use IO::File;
21+
use File::Basename;
22+
use Getopt::Std;
23+
use Cwd;
24+
25+
#use lib "<path of used library>";
26+
#use <name of module>;
27+
28+
################################################################################
29+
my( $file_bed6, $file_bed12, $newdir, $hash_size_bed6, $hash_size_bed12 );
30+
my( %bed6, %bed12 );
31+
#my( @<arrays> );
32+
my( $ProgName, $DirName, $VERSION );
33+
#-------------------------------------------------------------------------------
34+
# last edition <year>-<month>-<day>
35+
36+
if( defined(readlink(__FILE__)) ){
37+
$ProgName = basename(readlink(__FILE__));
38+
$DirName = dirname(readlink(__FILE__));
39+
} else {
40+
$ProgName = basename(__FILE__);
41+
$DirName = dirname(__FILE__);
42+
}
43+
44+
$VERSION = 0.1;
45+
46+
=pod
47+
48+
=head1 NAME
49+
50+
bed6_2_bed12.pl v0.1
51+
52+
This program changes a bed-File from bed6-format to bed12-format
53+
54+
=cut
55+
56+
################################################################################
57+
#-------------------------------- Options --------------------------------------
58+
#-------------------------------------------------------------------------------
59+
60+
use vars qw( $opt_i $opt_o );
61+
62+
getopts("i:o:");
63+
64+
65+
66+
$newdir = cwd();
67+
68+
69+
#-------------------------------------------------------------------------------
70+
71+
if( $opt_i ){
72+
$file_bed6 = $opt_i;
73+
} else {
74+
print "\n\t\tSorry, you have forgotten the name of the bed6-File \n\n";
75+
system "perldoc $0";
76+
exit 0;
77+
}
78+
79+
#-------------------------------------------------------------------------------
80+
81+
if( $opt_o ){
82+
$file_bed12 = $opt_o;
83+
} else {
84+
print "\n\t\tSorry, you have forgotten the name of the bed12-File \n\n";
85+
system "perldoc $0";
86+
exit 0;
87+
}
88+
89+
#-------------------------------------------------------------------------------
90+
91+
92+
################################################################################
93+
#
94+
# M A I N
95+
#
96+
################################################################################
97+
98+
$hash_size_bed6 = 0;
99+
100+
101+
#-------------------------------------------------------------------------------
102+
103+
print "\n\n\t\t\t***************** READING BED6 FILE ***************** \n\n\n";
104+
read_bed6_file( \%bed6, $file_bed6 );
105+
106+
$hash_size_bed6 = keys %bed6;
107+
#printf "TEST2\t%d\t%d\n", $hash_size_bed6, $bed6{$hash_size_bed6-1}{start};
108+
109+
#-------------------------------------------------------------------------------
110+
111+
print "\n\n\t\t\t***************** CONDENSING TABLE ***************** \n\n\n";
112+
condense_overlap( \%bed6, \%bed12, $hash_size_bed6 );
113+
114+
115+
116+
#-------------------------------------------------------------------------------
117+
118+
119+
print "\n\n\t\t\t***************** WRITTING BED12 FILE ***************** \n\n\n";
120+
$hash_size_bed12 = keys %bed12;
121+
write_bed12_file( \%bed12, $file_bed12, $hash_size_bed12 );
122+
123+
124+
#-------------------------------------------------------------------------------
125+
126+
127+
################################################################################
128+
################################################################################
129+
# function
130+
131+
sub read_bed6_file{
132+
my( $rBed6, $filename ) = @_;
133+
my( $i );
134+
135+
open( IN, "< $filename" ) || die "Cannot open $filename file!\n";
136+
137+
print "\t----------- reading bed6 file -----------\n\n\n";
138+
139+
$i = 0;
140+
141+
while( <IN> ){
142+
chomp;
143+
144+
#print "----------- TEST 1 -------------\n";
145+
146+
# Aufspalten der Zeile an den gesetzten Tabs in "Chromosom - Position1 - Position2 - Annotation"
147+
if( /^(\w+)\t(\d+)\t(\d+)\t(.+)$/ ){
148+
149+
# print "$1\t$2\t$3\t$4\n";
150+
$$rBed6{$i}{chr} = $1;
151+
$$rBed6{$i}{start} = $2;
152+
$$rBed6{$i}{stop} = $3;
153+
$$rBed6{$i}{anno} = $4;
154+
}
155+
# Manche Zeilen enthalten keine 4. Spalte, d.h. keine Annotation
156+
elsif ( /^(\w+)\t(\d+)\t(\d+)$/ ){
157+
158+
$$rBed6{$i}{chr} = $1;
159+
$$rBed6{$i}{start} = $2;
160+
$$rBed6{$i}{stop} = $3;
161+
$$rBed6{$i}{anno} = "";
162+
}
163+
else {
164+
}
165+
166+
# Inkrementieren des Zählers für den Hash
167+
$i++;
168+
169+
}
170+
#printf "TEST1\t%d\n", $$rBed6{200000}{start};
171+
172+
print "$i Eintraege in alter Tabelle.\n";
173+
close IN ;
174+
175+
}
176+
177+
#-------------------------------------------------------------------------------
178+
179+
sub condense_overlap{
180+
my( $rBed6, $rBed12, $hash_size ) = @_;
181+
my( $i, $j, $file_log, $comment );
182+
183+
184+
print "\t----------- condensing table -----------\n\n\n";
185+
186+
$j = 0;
187+
188+
$file_log = "bed6_2_bed12.log";
189+
190+
$comment = "ANNOTATION";
191+
192+
open( LOG, "> $file_log") || die "Cannot open $file_log file!\n";
193+
194+
$$rBed12{$j}{chr} = $$rBed6{0}{chr};
195+
$$rBed12{$j}{start} = $$rBed6{0}{start};
196+
$$rBed12{$j}{stop} = $$rBed6{0}{stop};
197+
$$rBed12{$j}{anno} = $$rBed6{0}{anno};
198+
199+
for ($i = 1; $i < $hash_size; $i++){
200+
201+
if ($$rBed6{$i}{chr} eq $$rBed12{$j}{chr}){ # Wir vergleichen die Positionen innerhalb eines Chromosoms
202+
203+
if ($$rBed6{$i}{start} >= $$rBed12{$j}{start} && $$rBed6{$i}{stop} <= $$rBed12{$j}{stop}){ # der neue Bereich wird vom alten umschlossen
204+
205+
if ($$rBed12{$j}{anno} ne $$rBed6{$i}{anno}){ # Es gibt Unterschiede in der Annotation bei überlappenden Bereichen
206+
207+
printf LOG "%6d\t%s\t%d\t%d\t%s\n", $i, $$rBed12{$j}{chr}, $$rBed12{$j}{start}, $$rBed12{$j}{stop}, $$rBed12{$j}{anno};
208+
printf LOG "-------------- %s\n", $comment;
209+
printf LOG "%6d\t%s\t%d\t%d\t%s\n\n\n", $i+1, $$rBed6{$i}{chr}, $$rBed6{$i}{start}, $$rBed6{$i}{stop}, $$rBed6{$i}{anno};
210+
211+
if ( length($$rBed6{$i}{anno}) > length($$rBed12{$j}{anno}) ){ # Die neue Annotation ist umfangreicher
212+
$$rBed12{$j}{anno} = $$rBed6{$i}{anno};
213+
}
214+
} else {
215+
printf LOG "%6d\t%s\t%d\t%d\t%s\n", $i, $$rBed12{$j}{chr}, $$rBed12{$j}{start}, $$rBed12{$j}{stop}, $$rBed12{$j}{anno};
216+
printf LOG "--------------\n";
217+
printf LOG "%6d\t%s\t%d\t%d\t%s\n\n\n", $i+1, $$rBed6{$i}{chr}, $$rBed6{$i}{start}, $$rBed6{$i}{stop}, $$rBed6{$i}{anno};
218+
}
219+
} elsif ($$rBed6{$i}{start} < $$rBed12{$j}{stop} && $$rBed6{$i}{stop} > $$rBed12{$j}{stop}){ # der neue Bereich überlappt sich mit dem alten
220+
221+
if ($$rBed12{$j}{anno} ne $$rBed6{$i}{anno}){ # Es gibt Unterschiede in der Annotation bei überlappenden Bereichen
222+
223+
printf LOG "%6d\t%s\t%d\t%d\t%s\n", $i, $$rBed12{$j}{chr}, $$rBed12{$j}{start}, $$rBed12{$j}{stop}, $$rBed12{$j}{anno};
224+
printf LOG "-------------- %s\n", $comment;
225+
printf LOG "%6d\t%s\t%d\t%d\t%s\n\n\n", $i+1, $$rBed6{$i}{chr}, $$rBed6{$i}{start}, $$rBed6{$i}{stop}, $$rBed6{$i}{anno};
226+
227+
if ( length($$rBed6{$i}{anno}) > length($$rBed12{$j}{anno}) ){ # Die neue Annotation ist umfangreicher
228+
$$rBed12{$j}{anno} = $$rBed6{$i}{anno};
229+
}
230+
} else {
231+
printf LOG "%6d\t%s\t%d\t%d\t%s\n", $i, $$rBed12{$j}{chr}, $$rBed12{$j}{start}, $$rBed12{$j}{stop}, $$rBed12{$j}{anno};
232+
printf LOG "--------------\n";
233+
printf LOG "%6d\t%s\t%d\t%d\t%s\n\n\n", $i+1, $$rBed6{$i}{chr}, $$rBed6{$i}{start}, $$rBed6{$i}{stop}, $$rBed6{$i}{anno};
234+
}
235+
236+
$$rBed12{$j}{stop} = $$rBed6{$i}{stop}; # Der Bereich wird erweitert
237+
238+
} elsif ($$rBed6{$i}{start} > $$rBed12{$j}{stop}) { # Es gibt keine Überlappung
239+
240+
$j++;
241+
$$rBed12{$j}{chr} = $$rBed6{$i}{chr};
242+
$$rBed12{$j}{start} = $$rBed6{$i}{start};
243+
$$rBed12{$j}{stop} = $$rBed6{$i}{stop};
244+
$$rBed12{$j}{anno} = $$rBed6{$i}{anno};
245+
246+
}
247+
248+
} else { # Es beginnen die Eintraege eines neuen Chromosoms
249+
250+
$j++;
251+
$$rBed12{$j}{chr} = $$rBed6{$i}{chr};
252+
$$rBed12{$j}{start} = $$rBed6{$i}{start};
253+
$$rBed12{$j}{stop} = $$rBed6{$i}{stop};
254+
$$rBed12{$j}{anno} = $$rBed6{$i}{anno};
255+
}
256+
257+
}
258+
259+
printf "Neue Tabelle:\t$j Eintraege\n";
260+
261+
close LOG;
262+
263+
}
264+
265+
#-------------------------------------------------------------------------------
266+
267+
sub write_bed12_file{
268+
my( $rBed12, $filename, $hash_size ) = @_;
269+
my( $i );
270+
271+
open( OUT, "> $filename" ) || die "Cannot open $filename file!\n";
272+
273+
for ($i = 0; $i < $hash_size; $i++){
274+
275+
if ($$rBed12{$i}{anno} ne ""){
276+
printf OUT "%s\t%d\t%d\t%s\n", $$rBed12{$i}{chr}, $$rBed12{$i}{start}, $$rBed12{$i}{stop}, $$rBed12{$i}{anno};
277+
} else {
278+
printf OUT "%s\t%d\t%d\t-\n", $$rBed12{$i}{chr}, $$rBed12{$i}{start}, $$rBed12{$i}{stop};
279+
}
280+
}
281+
282+
close OUT;
283+
}
284+
285+
################################################################################
286+
################################################################################
287+
################################################################################
288+
################################################################################
289+
290+
291+
=head1 SYNOPSIS
292+
293+
description of Synopsis
294+
295+
=head1 OUTPUT
296+
297+
output description
298+
299+
=cut

bedformatting/convert.sh

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
#!/bin/bash
2+
3+
IFS='.' read -ra BED <<< "$1"
4+
5+
sort -V -k1,1 -k2,2n -k3,3n -u $1 | sed 's/\s*$//' | sed 's/\s/,/g' | sed 's/,/\t/' | sed 's/,/\t/' | sed 's/,/\t/' > ${BED[0]}-sorted.bed
6+
7+
perl bed6_2_bed12.pl -i ${BED[0]}-sorted.bed -o ${BED[0]}-bed12.bed
8+
9+
Rscript --vanilla targets.R ${BED[0]}

bedformatting/targetExtractor.R

+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
#!/usr/bin/env Rscript
2+
3+
## ---------------------------
4+
##
5+
## Script name: targetExtractor.R
6+
##
7+
## Purpose of script: Extract targets from bed file
8+
##
9+
## Author: Niklas Reimer, M.Sc.
10+
##
11+
## Date Created: 2020-11-04
12+
13+
##
14+
## ---------------------------
15+
16+
library(tidyr)
17+
library(biomaRt)
18+
19+
args = commandArgs(trailingOnly=TRUE)
20+
21+
# read bed file
22+
bed <- read.table(paste(args[1], ".bed", sep = ""))
23+
24+
# remove chr prefix if present
25+
regions <- paste(gsub("chr", "", bed$V1), paste(bed$V2, bed$V3, sep="-"), sep=":")
26+
27+
# setup ensembl
28+
m <- useMart('ensembl', dataset='hsapiens_gene_ensembl') # create a mart object
29+
df <- getBM(mart=m, attributes=c('hgnc_symbol'), filters=c('chromosomal_region'), values=list(c(regions)))
30+
31+
# remove duplicate symbols
32+
hgnc <- df$hgnc_symbol[!duplicated(df$hgnc_symbol)]
33+
34+
# write to file
35+
sink(paste(args[1], "_Targets.txt", sep = ""))
36+
cat(writeLines(hgnc, sep = "\n"))
37+
sink()

0 commit comments

Comments
 (0)