|
| 1 | +#!/usr/bin/perl -w |
| 2 | + |
| 3 | +# <short description> |
| 4 | +# |
| 5 | +# bed6_2_bed12.pl copyright by Ruediger Lehmann (2020-07) |
| 6 | +# |
| 7 | +# |
| 8 | +#------------------------------------------------------------------------------- |
| 9 | +# Bug-Report |
| 10 | +# |
| 11 | +# <list of known bugs> |
| 12 | +# |
| 13 | +#------------------------------------------------------------------------------- |
| 14 | + |
| 15 | +################################################################################ |
| 16 | +# GLOBAL VARIABLES, MODULES and LIBRARIES |
| 17 | +################################################################################ |
| 18 | +use strict; |
| 19 | + |
| 20 | +use IO::File; |
| 21 | +use File::Basename; |
| 22 | +use Getopt::Std; |
| 23 | +use Cwd; |
| 24 | + |
| 25 | +#use lib "<path of used library>"; |
| 26 | +#use <name of module>; |
| 27 | + |
| 28 | +################################################################################ |
| 29 | +my( $file_bed6, $file_bed12, $newdir, $hash_size_bed6, $hash_size_bed12 ); |
| 30 | +my( %bed6, %bed12 ); |
| 31 | +#my( @<arrays> ); |
| 32 | +my( $ProgName, $DirName, $VERSION ); |
| 33 | +#------------------------------------------------------------------------------- |
| 34 | +# last edition <year>-<month>-<day> |
| 35 | + |
| 36 | +if( defined(readlink(__FILE__)) ){ |
| 37 | + $ProgName = basename(readlink(__FILE__)); |
| 38 | + $DirName = dirname(readlink(__FILE__)); |
| 39 | +} else { |
| 40 | + $ProgName = basename(__FILE__); |
| 41 | + $DirName = dirname(__FILE__); |
| 42 | +} |
| 43 | + |
| 44 | +$VERSION = 0.1; |
| 45 | + |
| 46 | +=pod |
| 47 | +
|
| 48 | +=head1 NAME |
| 49 | +
|
| 50 | +bed6_2_bed12.pl v0.1 |
| 51 | +
|
| 52 | +This program changes a bed-File from bed6-format to bed12-format |
| 53 | +
|
| 54 | +=cut |
| 55 | + |
| 56 | +################################################################################ |
| 57 | +#-------------------------------- Options -------------------------------------- |
| 58 | +#------------------------------------------------------------------------------- |
| 59 | + |
| 60 | +use vars qw( $opt_i $opt_o ); |
| 61 | + |
| 62 | +getopts("i:o:"); |
| 63 | + |
| 64 | + |
| 65 | + |
| 66 | +$newdir = cwd(); |
| 67 | + |
| 68 | + |
| 69 | +#------------------------------------------------------------------------------- |
| 70 | + |
| 71 | +if( $opt_i ){ |
| 72 | + $file_bed6 = $opt_i; |
| 73 | +} else { |
| 74 | + print "\n\t\tSorry, you have forgotten the name of the bed6-File \n\n"; |
| 75 | + system "perldoc $0"; |
| 76 | + exit 0; |
| 77 | +} |
| 78 | + |
| 79 | +#------------------------------------------------------------------------------- |
| 80 | + |
| 81 | +if( $opt_o ){ |
| 82 | + $file_bed12 = $opt_o; |
| 83 | +} else { |
| 84 | + print "\n\t\tSorry, you have forgotten the name of the bed12-File \n\n"; |
| 85 | + system "perldoc $0"; |
| 86 | + exit 0; |
| 87 | +} |
| 88 | + |
| 89 | +#------------------------------------------------------------------------------- |
| 90 | + |
| 91 | + |
| 92 | +################################################################################ |
| 93 | +# |
| 94 | +# M A I N |
| 95 | +# |
| 96 | +################################################################################ |
| 97 | + |
| 98 | +$hash_size_bed6 = 0; |
| 99 | + |
| 100 | + |
| 101 | +#------------------------------------------------------------------------------- |
| 102 | + |
| 103 | +print "\n\n\t\t\t***************** READING BED6 FILE ***************** \n\n\n"; |
| 104 | +read_bed6_file( \%bed6, $file_bed6 ); |
| 105 | + |
| 106 | +$hash_size_bed6 = keys %bed6; |
| 107 | +#printf "TEST2\t%d\t%d\n", $hash_size_bed6, $bed6{$hash_size_bed6-1}{start}; |
| 108 | + |
| 109 | +#------------------------------------------------------------------------------- |
| 110 | + |
| 111 | +print "\n\n\t\t\t***************** CONDENSING TABLE ***************** \n\n\n"; |
| 112 | +condense_overlap( \%bed6, \%bed12, $hash_size_bed6 ); |
| 113 | + |
| 114 | + |
| 115 | + |
| 116 | +#------------------------------------------------------------------------------- |
| 117 | + |
| 118 | + |
| 119 | +print "\n\n\t\t\t***************** WRITTING BED12 FILE ***************** \n\n\n"; |
| 120 | +$hash_size_bed12 = keys %bed12; |
| 121 | +write_bed12_file( \%bed12, $file_bed12, $hash_size_bed12 ); |
| 122 | + |
| 123 | + |
| 124 | +#------------------------------------------------------------------------------- |
| 125 | + |
| 126 | + |
| 127 | +################################################################################ |
| 128 | +################################################################################ |
| 129 | +# function |
| 130 | + |
| 131 | +sub read_bed6_file{ |
| 132 | + my( $rBed6, $filename ) = @_; |
| 133 | + my( $i ); |
| 134 | + |
| 135 | + open( IN, "< $filename" ) || die "Cannot open $filename file!\n"; |
| 136 | + |
| 137 | + print "\t----------- reading bed6 file -----------\n\n\n"; |
| 138 | + |
| 139 | + $i = 0; |
| 140 | + |
| 141 | + while( <IN> ){ |
| 142 | + chomp; |
| 143 | + |
| 144 | +#print "----------- TEST 1 -------------\n"; |
| 145 | + |
| 146 | + # Aufspalten der Zeile an den gesetzten Tabs in "Chromosom - Position1 - Position2 - Annotation" |
| 147 | + if( /^(\w+)\t(\d+)\t(\d+)\t(.+)$/ ){ |
| 148 | + |
| 149 | +# print "$1\t$2\t$3\t$4\n"; |
| 150 | + $$rBed6{$i}{chr} = $1; |
| 151 | + $$rBed6{$i}{start} = $2; |
| 152 | + $$rBed6{$i}{stop} = $3; |
| 153 | + $$rBed6{$i}{anno} = $4; |
| 154 | + } |
| 155 | +# Manche Zeilen enthalten keine 4. Spalte, d.h. keine Annotation |
| 156 | + elsif ( /^(\w+)\t(\d+)\t(\d+)$/ ){ |
| 157 | + |
| 158 | + $$rBed6{$i}{chr} = $1; |
| 159 | + $$rBed6{$i}{start} = $2; |
| 160 | + $$rBed6{$i}{stop} = $3; |
| 161 | + $$rBed6{$i}{anno} = ""; |
| 162 | + } |
| 163 | + else { |
| 164 | + } |
| 165 | + |
| 166 | + # Inkrementieren des Zählers für den Hash |
| 167 | + $i++; |
| 168 | + |
| 169 | + } |
| 170 | +#printf "TEST1\t%d\n", $$rBed6{200000}{start}; |
| 171 | + |
| 172 | +print "$i Eintraege in alter Tabelle.\n"; |
| 173 | + close IN ; |
| 174 | + |
| 175 | +} |
| 176 | + |
| 177 | +#------------------------------------------------------------------------------- |
| 178 | + |
| 179 | +sub condense_overlap{ |
| 180 | + my( $rBed6, $rBed12, $hash_size ) = @_; |
| 181 | + my( $i, $j, $file_log, $comment ); |
| 182 | + |
| 183 | + |
| 184 | + print "\t----------- condensing table -----------\n\n\n"; |
| 185 | + |
| 186 | + $j = 0; |
| 187 | + |
| 188 | + $file_log = "bed6_2_bed12.log"; |
| 189 | + |
| 190 | + $comment = "ANNOTATION"; |
| 191 | + |
| 192 | + open( LOG, "> $file_log") || die "Cannot open $file_log file!\n"; |
| 193 | + |
| 194 | + $$rBed12{$j}{chr} = $$rBed6{0}{chr}; |
| 195 | + $$rBed12{$j}{start} = $$rBed6{0}{start}; |
| 196 | + $$rBed12{$j}{stop} = $$rBed6{0}{stop}; |
| 197 | + $$rBed12{$j}{anno} = $$rBed6{0}{anno}; |
| 198 | + |
| 199 | + for ($i = 1; $i < $hash_size; $i++){ |
| 200 | + |
| 201 | + if ($$rBed6{$i}{chr} eq $$rBed12{$j}{chr}){ # Wir vergleichen die Positionen innerhalb eines Chromosoms |
| 202 | + |
| 203 | + if ($$rBed6{$i}{start} >= $$rBed12{$j}{start} && $$rBed6{$i}{stop} <= $$rBed12{$j}{stop}){ # der neue Bereich wird vom alten umschlossen |
| 204 | + |
| 205 | + if ($$rBed12{$j}{anno} ne $$rBed6{$i}{anno}){ # Es gibt Unterschiede in der Annotation bei überlappenden Bereichen |
| 206 | + |
| 207 | + printf LOG "%6d\t%s\t%d\t%d\t%s\n", $i, $$rBed12{$j}{chr}, $$rBed12{$j}{start}, $$rBed12{$j}{stop}, $$rBed12{$j}{anno}; |
| 208 | + printf LOG "-------------- %s\n", $comment; |
| 209 | + printf LOG "%6d\t%s\t%d\t%d\t%s\n\n\n", $i+1, $$rBed6{$i}{chr}, $$rBed6{$i}{start}, $$rBed6{$i}{stop}, $$rBed6{$i}{anno}; |
| 210 | + |
| 211 | + if ( length($$rBed6{$i}{anno}) > length($$rBed12{$j}{anno}) ){ # Die neue Annotation ist umfangreicher |
| 212 | + $$rBed12{$j}{anno} = $$rBed6{$i}{anno}; |
| 213 | + } |
| 214 | + } else { |
| 215 | + printf LOG "%6d\t%s\t%d\t%d\t%s\n", $i, $$rBed12{$j}{chr}, $$rBed12{$j}{start}, $$rBed12{$j}{stop}, $$rBed12{$j}{anno}; |
| 216 | + printf LOG "--------------\n"; |
| 217 | + printf LOG "%6d\t%s\t%d\t%d\t%s\n\n\n", $i+1, $$rBed6{$i}{chr}, $$rBed6{$i}{start}, $$rBed6{$i}{stop}, $$rBed6{$i}{anno}; |
| 218 | + } |
| 219 | + } elsif ($$rBed6{$i}{start} < $$rBed12{$j}{stop} && $$rBed6{$i}{stop} > $$rBed12{$j}{stop}){ # der neue Bereich überlappt sich mit dem alten |
| 220 | + |
| 221 | + if ($$rBed12{$j}{anno} ne $$rBed6{$i}{anno}){ # Es gibt Unterschiede in der Annotation bei überlappenden Bereichen |
| 222 | + |
| 223 | + printf LOG "%6d\t%s\t%d\t%d\t%s\n", $i, $$rBed12{$j}{chr}, $$rBed12{$j}{start}, $$rBed12{$j}{stop}, $$rBed12{$j}{anno}; |
| 224 | + printf LOG "-------------- %s\n", $comment; |
| 225 | + printf LOG "%6d\t%s\t%d\t%d\t%s\n\n\n", $i+1, $$rBed6{$i}{chr}, $$rBed6{$i}{start}, $$rBed6{$i}{stop}, $$rBed6{$i}{anno}; |
| 226 | + |
| 227 | + if ( length($$rBed6{$i}{anno}) > length($$rBed12{$j}{anno}) ){ # Die neue Annotation ist umfangreicher |
| 228 | + $$rBed12{$j}{anno} = $$rBed6{$i}{anno}; |
| 229 | + } |
| 230 | + } else { |
| 231 | + printf LOG "%6d\t%s\t%d\t%d\t%s\n", $i, $$rBed12{$j}{chr}, $$rBed12{$j}{start}, $$rBed12{$j}{stop}, $$rBed12{$j}{anno}; |
| 232 | + printf LOG "--------------\n"; |
| 233 | + printf LOG "%6d\t%s\t%d\t%d\t%s\n\n\n", $i+1, $$rBed6{$i}{chr}, $$rBed6{$i}{start}, $$rBed6{$i}{stop}, $$rBed6{$i}{anno}; |
| 234 | + } |
| 235 | + |
| 236 | + $$rBed12{$j}{stop} = $$rBed6{$i}{stop}; # Der Bereich wird erweitert |
| 237 | + |
| 238 | + } elsif ($$rBed6{$i}{start} > $$rBed12{$j}{stop}) { # Es gibt keine Überlappung |
| 239 | + |
| 240 | + $j++; |
| 241 | + $$rBed12{$j}{chr} = $$rBed6{$i}{chr}; |
| 242 | + $$rBed12{$j}{start} = $$rBed6{$i}{start}; |
| 243 | + $$rBed12{$j}{stop} = $$rBed6{$i}{stop}; |
| 244 | + $$rBed12{$j}{anno} = $$rBed6{$i}{anno}; |
| 245 | + |
| 246 | + } |
| 247 | + |
| 248 | + } else { # Es beginnen die Eintraege eines neuen Chromosoms |
| 249 | + |
| 250 | + $j++; |
| 251 | + $$rBed12{$j}{chr} = $$rBed6{$i}{chr}; |
| 252 | + $$rBed12{$j}{start} = $$rBed6{$i}{start}; |
| 253 | + $$rBed12{$j}{stop} = $$rBed6{$i}{stop}; |
| 254 | + $$rBed12{$j}{anno} = $$rBed6{$i}{anno}; |
| 255 | + } |
| 256 | + |
| 257 | + } |
| 258 | + |
| 259 | +printf "Neue Tabelle:\t$j Eintraege\n"; |
| 260 | + |
| 261 | + close LOG; |
| 262 | + |
| 263 | +} |
| 264 | + |
| 265 | +#------------------------------------------------------------------------------- |
| 266 | + |
| 267 | +sub write_bed12_file{ |
| 268 | + my( $rBed12, $filename, $hash_size ) = @_; |
| 269 | + my( $i ); |
| 270 | + |
| 271 | + open( OUT, "> $filename" ) || die "Cannot open $filename file!\n"; |
| 272 | + |
| 273 | + for ($i = 0; $i < $hash_size; $i++){ |
| 274 | + |
| 275 | + if ($$rBed12{$i}{anno} ne ""){ |
| 276 | + printf OUT "%s\t%d\t%d\t%s\n", $$rBed12{$i}{chr}, $$rBed12{$i}{start}, $$rBed12{$i}{stop}, $$rBed12{$i}{anno}; |
| 277 | + } else { |
| 278 | + printf OUT "%s\t%d\t%d\t-\n", $$rBed12{$i}{chr}, $$rBed12{$i}{start}, $$rBed12{$i}{stop}; |
| 279 | + } |
| 280 | + } |
| 281 | + |
| 282 | + close OUT; |
| 283 | +} |
| 284 | + |
| 285 | +################################################################################ |
| 286 | +################################################################################ |
| 287 | +################################################################################ |
| 288 | +################################################################################ |
| 289 | + |
| 290 | + |
| 291 | +=head1 SYNOPSIS |
| 292 | +
|
| 293 | +description of Synopsis |
| 294 | +
|
| 295 | +=head1 OUTPUT |
| 296 | + |
| 297 | +output description |
| 298 | +
|
| 299 | +=cut |
0 commit comments