bismark2bedGraph

#!/usr/bin/env perl
use warnings;
use strict;
$|++;
use Getopt::Long;
use Cwd;
use Carp;

## This program is Copyright (C) 2010-23, Felix Krueger (fkrueger@altoslabs.com)

## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.

## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.

## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.

my $bismark2bedGraph_version = 'v0.24.2';

my @bedfiles;
my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total
my @sorting_files;

my ($bedGraph_output,$parent_dir,$output_dir,$remove,$CX_context,$no_header,$sort_size,$coverage_threshold,$counts,$gazillion,$ample_mem,$zero,$input_dir,$ucsc) = process_commandline();

warn "Using these input files: @sorting_files\n";
warn "\nSummary of parameters for bismark2bedGraph conversion:\n";
warn '='x54,"\n";
warn "bedGraph output:\t\t$bedGraph_output\n";
warn "output directory:\t\t>$output_dir<\n";
if ($remove){
    warn "remove whitespaces:\t\tyes\n";
}
else{
    warn "remove whitespaces:\t\tno\n";
}
if ($CX_context){
    warn "CX context:\t\t\tyes\n";
}
else{
    warn "CX context:\t\t\tno (CpG context only, default)\n";
}
if ($no_header){
    warn "No-header selected:\t\tyes\n";
}
else{
    warn "No-header selected:\t\tno\n";
}

if ($ample_mem){
    warn "Sorting method:\t\t\tArray-based (faster, but larger memory footprint)\n";
}
else{
    warn "Sorting method:\t\t\tUnix sort-based (smaller memory footprint, but slower)\n";
}
unless($ample_mem){
    warn "Sort buffer size:\t\t$sort_size\n";
}
warn "Coverage threshold:\t\t$coverage_threshold\n";


warn  "="x77,"\n";
warn "Methylation information will now be written into a bedGraph and coverage file\n";
warn  "="x77,"\n\n";
sleep (2);

### deciding which files to use for bedGraph conversion
foreach my $filename (@sorting_files){
    
  ### DETERMINING THE FULL PATH OF INPUT FILES
  if ($filename =~ /^(.*\/)(.*)$/){ # if files are in a different output folder we extract the filename again
    # warn "folder name: $1\nfilename: $2\n\n";
    chdir $1 or die "Failed to change directory to $1\n"; # $1 might be a relative path 
    $input_dir = getcwd();                                # this will always be the full path
    $filename = $2;
    # warn "Full Input folder: $input_dir\nFilename: $filename\n\n"; sleep (1);
    chdir $parent_dir or die "Failed to move back to the parent directory\n\n"; # moving back
  }
  else{
    $input_dir = $parent_dir;
  }   
  
  $input_dir .= '/';
      
  if ($CX_context){
    # push @bedfiles,$output_dir.$filename;
    push @bedfiles,$input_dir.$filename;
  }
  else{ ## CpG context only (default)
    if ($filename =~ /^CpG/){ # only testing the actual filename without the path information
        push @bedfiles,$input_dir.$filename; # we are adding the full path to the filename
    }
    else{
        # skipping CHH or CHG files
    }
  }
}

if (@bedfiles){
    warn "Using the following files as Input:\n";
    print join ("\t",@bedfiles),"\n\n";
    sleep (2);
}
else{
    die "It seems that you are trying to generate bedGraph files for files not starting with CpG.... Please specify the option '--CX' and try again\n\n";
}

open (OUT,"| gzip -c - > ${output_dir}${bedGraph_output}") or die "Problems with the bedGraph output filename detected: file path: '$output_dir'\tfile name: '$bedGraph_output' $!\n";
warn "Writing bedGraph to file: $bedGraph_output\n";
print OUT "track type=bedGraph\n";

my $coverage_output = $bedGraph_output;
unless ($coverage_output =~ s/bedGraph\.gz$/bismark.cov.gz/){
  $coverage_output =~ s/$/.bismark.cov.gz/;
}

open (COVERAGE,"| gzip -c - > ${output_dir}${coverage_output}") or die "Problems writing to the coverage output detected. File path: '$output_dir'\tfile name: '$coverage_output' $!\n\n";
warn "Also writing out a coverage file including counts methylated and unmethylated residues to file: $coverage_output\n";

if ($zero){
  my $zero_coverage_output = $bedGraph_output;
  unless ($zero_coverage_output =~ s/bedGraph$/bismark.zero.cov/){
    $zero_coverage_output =~ s/$/.bismark.zero.cov/;
  }

  open (ZEROCOVERAGE,'>',$output_dir.$zero_coverage_output) or die "Problems writing to the zero-based coverage output detected. File path: '$output_dir'\tfile name: '$\zero_coverage_output' $!\n\n";
  warn "Also writing out a 0-based, half-open coverage file including counts methylated and unmethylated residues to file: $zero_coverage_output\n";
}
warn "\n";

my %temp_fhs;
my @temp_files;  # writing all context files (default: CpG only) to these files prior to sorting
my %chr_lengths; # storing chromosome lengths in '--ample_memory' mode

### changing to the output directory
unless ($output_dir eq ''){ # default
  chdir $output_dir or die "Failed to change directory to $output_dir\n";
  warn "Changed directory to $output_dir\n";
}

if ($gazillion){
  if (scalar @bedfiles == 1){
	  warn "The genome of interest was specified to contain gazillions of chromosomes or scaffolds. Sorting everything in memory instead of writing out individual chromosome files ...\n";
  }
  else{
	  warn "The genome of interest was specified to contain gazillions of chromosomes or scaffolds. Merging all input files and sorting everything in memory instead of writing out individual chromosome files...\n";
    my $merge = "$bedGraph_output.methylation_calls.merged";
    open (MERGE,'>',$merge) or die "Failed to write to temporary merged file $merge: $!\n";
    warn "Writing all merged methylation calls to temp file $merge\n\n"; sleep(2);
    push @temp_files, $merge;
  }
}

foreach my $infile (@bedfiles) {
    
    # warn "Processing $infile\n";
    
    if ($remove) {
	warn "Now replacing whitespaces in the sequence ID field of the Bismark methylation extractor output $infile prior to bedGraph conversion\n\n";
	
	if ($infile =~ /gz$/){
	    open (READ,"gunzip -c $infile |") or die $!;
	}
	else{
	    open (READ,$infile) or die $!;
	}
	
	my $removed_spaces_outfile = $infile;
	$removed_spaces_outfile =~ s/$/.spaces_removed.txt/;
	$removed_spaces_outfile =~ s/.*\///;# replacing everything up to the last slash in the filename
    
	warn "Attempting to write to file ${output_dir}${removed_spaces_outfile}\n\n";
	open (REM,'>',$output_dir.$removed_spaces_outfile) or die "Couldn't write to file $removed_spaces_outfile: $!\n";
	
	unless ($no_header){
	    $_ = <READ>;		### Bismark version header
	    print REM $_;		### Bismark version header
	}
	
	while (<READ>) {
	    chomp;
	    my ($id,$strand,$chr,$pos,$context) = (split (/\t/));
	    $id =~ s/\s+/_/g;
	    print REM join ("\t",$id,$strand,$chr,$pos,$context),"\n";
	}
	
	close READ or die $!;
	close REM or die $!;
	
	### changing the infile name to the new file without spaces
	$infile = $removed_spaces_outfile; # at this stage we are already in the output directory so it should pick it up correctly
    }
    
    # opening infile
    if ($infile =~ /gz$/){
	    open (IN,"gunzip -c $infile |") or die "Couldn't find file '$infile': $!\n";
    }
    else{
	    open (IN,$infile) or die "Couldn't find file '$infile': $!\n";
    }
    
    if ($infile =~ /\//){ # if input files are in a different folders we extract the filename again
    $infile =~ s/.*\///;# replacing everything up to the last slash in the filename
    # warn "Renamed Infile: $infile\n";
    }
    
    ### People these days seem to be aligning their data to newly assembled genomes more and more, which sometimes consist of up to half a million scaffolds instead of ~23 chromosomes. This
    ### does normally clash with the operating system's limit of files that can be open for writing at the same time, and it is difficult and probably not advisable to increase this
    ### limit (some even say there is a reason for the OS doing so...).
    ### To still allow then generation of bedGraph files we will in these cases sort everything using the Linux sort command instead, which will sort by chromosome and position (the
    ### chromosome sorting is not carried out for chromosome sorted files which makes the sort MUCH faster).
    
    if ($gazillion){
	# using all infiles instead of sorting
	if (scalar @bedfiles == 1){
	    push @temp_files, $infile;
	}
	else{
	    ## always ignoring the version header
	    unless ($no_header){
		$_ = <IN>;		### Bismark version header
	    }

	    while (<IN>) {
		if ($_ =~ /^Bismark /){
		    warn "Found Bismark version information. Skipping this line (should still work fine) but consider using '--no_header' next time...\n";
		    next;
		}
		print MERGE;
	    }
	    warn "Finished writing methylation calls from $infile to merged temp file\n";
	}
    }
    else{
	warn "Now writing methylation information for file >>$infile<< to individual files for each chromosome\n";
	
	## always ignoring the version header
	unless ($no_header){
	    $_ = <IN>;		### Bismark version header
	}
	
	while (<IN>) {
	    if ($_ =~ /^Bismark /){
		warn "Found Bismark version information. Skipping this line (should still work fine) but consider losing '--no_header' next time...\n";
		next;
	    }

	    chomp;
	    
	    my ($chr,$pos) = (split (/\t/))[2,3];
	    
	    ### If --ample_mem was specified we are keeping track of the highest position for each chromosome as this will determine the size of the array we need to create in the next step
	    if ($ample_mem){
		### setting the first position for this chromosome
		unless (defined $chr_lengths{$chr} ){
		    $chr_lengths{$chr} = $pos;
		}
		# for all subsequent postions for this chromosome
		if ($pos > $chr_lengths{$chr} ){
		  $chr_lengths{$chr} = $pos; # set the current position as the new highest position
		}
	    }
	    
	    # warn "This is the chromosome name before replacing '|' characters:\t$chr\n\n";
	    $chr =~ s/\|/_/g; # replacing pipe ('|') characters in the chromosome names so that they don't interfere with the temp file creation
	    $chr =~ s/\//_/g; # replacing forward slash ('/') characters in the chromosome names	
	    # warn "This is the chromosome name AFTER replacing '|' characters:\t$chr\n\n";
	    unless (exists $temp_fhs{$chr}) { # Including the infile name to the temporary chromosome files to enable parallel processing of multiple files at the same time
		
		my $temp_file_name = $infile.'.chr'.$chr.'.methXtractor.temp';
		# warn "Using temp file name: $temp_file_name\n"; sleep(1);
		
		open ($temp_fhs{$chr},'>',$infile.'.chr'.$chr.'.methXtractor.temp') or die "Failed to open filehandle: $!";
		push @temp_files, $temp_file_name; # storing temp files as we open them instead
	    }
	    
	    print {$temp_fhs{$chr}} "$_\n";
	}
	
	warn "Finished writing out individual chromosome files for $infile\n";
    }
}

# closing temporary filehandles to force writing out buffered content
foreach my $temp_fh(keys %temp_fhs){
    close $temp_fhs{$temp_fh} or warn "Failed to close temporary filehandle $temp_fhs{$temp_fh}: $!\n";
}

### printing out the determined maximum position for each chromosome
if ($ample_mem){
    foreach my $chr (sort keys %chr_lengths){
	warn "Highest determined position for chromosome $chr:\t\t$chr_lengths{$chr} bp\n";
    }
    warn "\n";
}

unless ($gazillion){
    warn "\n";
    warn "Collecting temporary chromosome file information... Processing the following input file(s):\n";
    warn join ("\n",@temp_files),"\n\n";
    sleep (1);
}

if ($gazillion){
    if (scalar @bedfiles > 1){
	    close (MERGE) or die "Failed to close filehandle MERGE: $!\n";
    }
}

foreach my $in (sort @temp_files) {

  if ($sort_size){
	  warn "Sorting input file $in by positions (using -S of $sort_size)\n" unless ($ample_mem);
  }
    
  my $ifh;

  my $name;
  my $meth_state;
  my $chr = "";
  my $pos = 0;
  my $meth_state2;

  my $last_pos;
  my $last_chr;

  ### If the user specified to have a lot of RAM available (probably in the range of > 16GB for 2 arrays of human genome Chromosome 1) we will sort the methylation calls in two big arrays instead of using the Unix sort command
  if ($ample_mem){
    # warn "Generating enormous array instead of sorting the file. This may temporily use quite a bit of memory (RAM)!\n\n";

    my @meth_count;
    my @unmeth_count;

    open ($ifh,$in) or die "Couldn't read from temporary file '$in': $!\n";

    while (my $line = <$ifh>){
      next if ($line =~ /^Bismark/);
      chomp $line;

      ($name, $meth_state, $chr, $pos, $meth_state2) = split "\t", $line;

      unless ($last_pos and $last_chr){
	$last_chr = $chr;
	$last_pos = $pos;
      }
      unless (@meth_count and @unmeth_count){
	warn  "Setting maximum position of arrays \@meth_count and \@unmeth_count for chromosome $chr to $chr_lengths{$chr}\n";
	@meth_count   = (0) x  $chr_lengths{$chr};
	@unmeth_count = (0) x  $chr_lengths{$chr};
	# warn "length of array meth count: ",scalar @meth_count,"\n";
	warn "Finished generating arrays\n";
	# sleep(1);	
      }
      # warn "Chromosome\tStart Position\tEnd Position\tMethylation Percentage\n"; sleep(1);
      # print join ("\t",$name, $meth_state, $chr, $pos, $meth_state2),"\n"; 
      # sleep(1);

      # if ($last_chr ne $chr) {
      #    die "Reached new chromosome '$chr' which mustn't happen from pre-sorted files (previous chromosome was: '$last_chr')\n";
      # }

      my $validated = validate_methylation_call($meth_state, $meth_state2); # as a comment, methylation calls in Unknown context (U, u) would fail this check, but they should be ignored by the methylation extractor anyway
      unless($validated){
	warn "Methylation state of sequence ($name) in file ($in) on line $. is inconsistent (meth_state is $meth_state, meth_state2 = $meth_state2)\n";
	next;
      }

      if ($meth_state eq '+'){
	# warn "increasing meth $pos by 1\n"; sleep(1);
	$meth_count[$pos-1]++;
      }
      else{
	$unmeth_count[$pos-1]++;
	# warn "increasing unmeth $pos by 1\n"; sleep(1);
      }
    }

    close $ifh or die $!;

    warn "Now printing methylation information for this chromosome\n";
    # warn "length of array meth count: ",scalar @meth_count,"\n";
    # warn "chr\tposition\tcount methylated\tcount unmethylated\tcount total\n";
    foreach my $index (0..$#meth_count){
      my $totalcount = $meth_count[$index] + $unmeth_count[$index];
      if ($totalcount > 0){
	#	warn "$index\t$meth_count[$index]\t$unmeth_count[$index]\t$totalcount\n";
	# sleep(1);
	
	my $bed_pos = $index; ### bedGraph coordinates are 0 based
	my $one_based_pos = $bed_pos + 1;

	my $meth_percentage;
	($totalcount >= $coverage_threshold) ? ($meth_percentage = ($meth_count[$index]/$totalcount) * 100) : ($meth_percentage = undef);

	if (defined $meth_percentage){
	
	  # as of version 0.9.1 we will by default write out both a bedGraph and a more detailed coverage file
	
	  # this is the bedGraph file, the starting position is 0-based, the end position is 1-based! (half-open. Clever, huh?)
	  print OUT "$last_chr\t$bed_pos\t$one_based_pos\t$meth_percentage\n";
	
	  # this is the coverage file. Coordinates are 1-based
	  print COVERAGE "$last_chr\t$one_based_pos\t$one_based_pos\t$meth_percentage\t$meth_count[$index]\t$unmeth_count[$index]\n";

	  # this is an optional 0-based, half-open coverage file. Coordinates are 0-based start and 1-based end
	  if ($zero){
	    print ZEROCOVERAGE "$last_chr\t$bed_pos\t$one_based_pos\t$meth_percentage\t$meth_count[$index]\t$unmeth_count[$index]\n";
	  }

	}
      }
    }

    @meth_count = ();
    @unmeth_count = ();

  }
  ### default: we assume that the user wants to use the Linux Sort command. This is quite a bit slower, but features a much smaller memory footprint
  else{
    my $sort_dir = './'; # there has been a cd into the output_directory already
    # my $sort_dir = $output_dir;
    # if ($sort_dir eq ''){
    #   $sort_dir = './';
    # }
    
    if ($gazillion){
	if ($in =~ /gz$/){
	    open $ifh, "gunzip -c $in | sort -S $sort_size -T $sort_dir -k3,3V -k4,4n |" or die "Input file could not be sorted. $!\n";
	}
	else{ 
	    open $ifh, "sort -S $sort_size -T $sort_dir -k3,3V -k4,4n $in |" or die "Input file could not be sorted. $!\n";
	}
	### Comment by Volker Brendel, Indiana University
	### "The -k3,3V sort option is critical when the sequence names are numbered scaffolds (without left-buffering of zeros).  Omit the V, and things go very wrong in the tallying of reads."
    }
    else{
	### this sort command was used previously and sorts according to chromosome in addition to position. Since the files are being sorted according to chromosomes anyway,
	### we may drop the -k3,3V option. It has been reported that this will result in a dramatic speed increase
	if ($in =~ /gz$/){
	    open $ifh, "gunzip -c $in | sort -S $sort_size -T $sort_dir -k4,4n |" or die "Input file could not be sorted. $!\n";
	}
	else{
	    open $ifh, "sort -S $sort_size -T $sort_dir -k4,4n $in |" or die "Input file could not be sorted. $!\n";
	}
    }

    while (my $line = <$ifh>) {
	next if ($line =~ /^Bismark/);
	chomp $line;
	#warn "full line:\n$line\n";
	$last_chr = $chr;
	$last_pos = $pos;
	($name, $meth_state, $chr, $pos, $meth_state2) = split "\t", $line;
	#warn "$name, $meth_state, $chr, $pos, $meth_state2\n"; sleep(1);
	if (($last_pos ne $pos) || ($last_chr ne $chr)) {
	    generate_output($last_chr,$last_pos) if $methylcalls[2] > 0;
	    @methylcalls = qw (0 0 0);
	}

      my $validated = validate_methylation_call($meth_state, $meth_state2);
      unless($validated){
	warn "Methylation state of sequence ($name) in file ($in) on line $. is inconsistent (meth_state is $meth_state, meth_state2 = $meth_state2)\n";
	next;
      }
      if ($meth_state eq "+") {
	$methylcalls[0]++;
	$methylcalls[2]++;
      } else {
	  $methylcalls[1]++;
	$methylcalls[2]++;
      }
    }


    $last_chr = $chr;
    $last_pos = $pos;
    if ($methylcalls[2] > 0) {
      generate_output($last_chr,$last_pos) if $methylcalls[2] > 0;
    }
	
    close $ifh or die $!;

    @methylcalls = qw (0 0 0); # resetting @methylcalls

  }

  ### deleting temporary files (only needed if --gazillion hasn't been specified
  if ($gazillion and scalar @bedfiles == 1){
    # if there was only 1 file to sort this will be the input file, which obviously shouldn't be removed
  }
  else{
    my $delete = unlink $in;
    if ($delete) {
      warn "Successfully deleted the temporary input file $in\n\n";
    }
    else {
      warn "The temporary inputfile $in could not be deleted $!\n\n";
    }
  }
}

close OUT or die $!;
close COVERAGE or die $!;
if ($zero){
  close ZEROCOVERAGE or die $!;
}
### Optional conversion of the bedGraph file to be compatible with UCSC
if ($ucsc){
  warn "Finally, creating a UCSC compatible version of the bedGraph file. This shouldn't take long...\n     ==============     ==============     ==============     ==============     ==============\n";

  if ($bedGraph_output =~ /\.gz$/){
     open (UCSCTEMP,"gunzip -c $bedGraph_output | ") or die "Failed to read bedGraph file (.gz) >$bedGraph_output<: $!";
  }
  else{
    open (UCSCTEMP,$bedGraph_output) or die "Failed to read bedGraph file >$bedGraph_output<: $!";
  }

  my $ucsc_outfile = $bedGraph_output;
  $ucsc_outfile =~ s/\.gz$//;
  $ucsc_outfile .= '_UCSC.bedGraph.gz';
  
  warn "Writing a new version of the bedGraph file compatible to UCSC  genomes to file >$ucsc_outfile<\n";
  warn "Chromosomes names will start with 'chr'\n";
  warn "Chromosome MT (Ensembl mitochondrial DNA http://www.ensembl.org/Homo_sapiens/Location/Chromosome?chr=MT;r=MT:1-16569) will be renamed to 'chrM'\n";

  open (UCSCOUT, "| gzip -c - > ${output_dir}$ucsc_outfile") or die "Failed to write to >$ucsc_outfile<: $!";
  my $firstline = <UCSCTEMP>;
  print UCSCOUT $firstline;
  while (<UCSCTEMP>){
    my ($chr,$start,$end,$percentage) = (split /\t/);
    if ($chr eq "MT"){
      # warn "renaming >MT< to >chrM<\n";
      $chr = "chrM";
    }

    unless ($chr =~ /^chr/){
      # warn "renaming >$chr< to >chr$chr<\n";
      $chr = "chr$chr";
    }
    print UCSCOUT join ("\t",$chr,$start,$end,$percentage);

  }
  close UCSCTEMP;
  close UCSCOUT or warn "Had trouble closing filehandle UCSCOUT: $!\n";
  warn "\nAll done. Finished UCSC conversion.\n\n";
}

exit 0;


sub validate_methylation_call{
  my $meth_state = shift;
  croak "Missing (+/-) methylation call" unless defined $meth_state;
  my $meth_state2 = shift;
  croak "Missing alphabetical methylation call" unless defined $meth_state2;
  my $is_consistent;
  ($meth_state2 =~ /^z/i) ? ($is_consistent = check_CpG_methylation_call($meth_state, $meth_state2)) 
                          : ($is_consistent = check_nonCpG_methylation_call($meth_state,$meth_state2));
  return 1 if $is_consistent;
  return 0;
}

sub check_CpG_methylation_call{
  my $meth1 = shift;
  my $meth2 = shift;
  return 1 if($meth1 eq "+" && $meth2 eq "Z");
  return 1 if($meth1 eq "-" && $meth2 eq "z");
  return 0;
}

sub check_nonCpG_methylation_call{
  my $meth1 = shift;
  my $meth2 = shift;
  return 1 if($meth1 eq "+" && $meth2 eq "C");
  return 1 if($meth1 eq "+" && $meth2 eq "X");
  return 1 if($meth1 eq "+" && $meth2 eq "H");
  return 1 if($meth1 eq "-" && $meth2 eq "c");
  return 1 if($meth1 eq "-" && $meth2 eq "x");
  return 1 if($meth1 eq "-" && $meth2 eq "h");
  return 0;
}

sub generate_output{
  my $methcount = $methylcalls[0];
  my $nonmethcount = $methylcalls[1];
  my $totalcount = $methylcalls[2];
  my $last_chr = shift;
  my $last_pos = shift;
  croak "Should not be generating output if there's no reads to this region" unless ($totalcount > 0);
  croak "Total counts ($totalcount) is not the sum of the methylated ($methcount) and unmethylated ($nonmethcount) counts" if ($totalcount != ($methcount + $nonmethcount) );

  my $bed_pos = $last_pos - 1; ### Bismark coordinates are 1 based whereas bedGraph coordinates are 0 based.
  my $meth_percentage;
  ($totalcount >= $coverage_threshold) ? ($meth_percentage = ($methcount/$totalcount) * 100) : ($meth_percentage = undef);
  # $meth_percentage =~ s/(\.\d\d).+$/$1/ unless $meth_percentage =~ /^Below/;
  if (defined $meth_percentage){

    # this is the bedGraph file, the starting position is 0-based, the end position is 1-based! (clever, huh?)
    my $one_based_pos = $bed_pos + 1;
    print OUT "$last_chr\t$bed_pos\t$one_based_pos\t$meth_percentage\n";

    # this is the coverage file. Coordinates are 1-based
    print COVERAGE "$last_chr\t$one_based_pos\t$one_based_pos\t$meth_percentage\t$methcount\t$nonmethcount\n";

    # this is an optional 0-based, half-open coverage file. Coordinates are 0-based start and 1-based end
    if ($zero){
      print ZEROCOVERAGE "$last_chr\t$bed_pos\t$one_based_pos\t$meth_percentage\t$methcount\t$nonmethcount\n";
    }
  }

}

sub process_commandline{
  my $help;
  my $output_dir;
  my $bedGraph_output;
  my $no_header;
  my $coverage_threshold; # Minimum number of reads covering before calling methylation status
  my $remove;
  my $counts;
  my $CX_context;
  my $sort_size;
  my $version;
  my $gazillion;
  my $ample_mem;
  my $zero;
  my $input_dir;
  my $ucsc;

  my $command_line = GetOptions ('help|man'            => \$help,
				 'dir=s'               => \$output_dir,
				 'o|output=s'          => \$bedGraph_output,
				 'no_header'           => \$no_header,
				 "cutoff=i"            => \$coverage_threshold,
				 "remove_spaces"       => \$remove,
				 "counts"              => \$counts,
				 "CX|CX_context"       => \$CX_context,
				 "buffer_size=s"       => \$sort_size,
				 'version'             => \$version,
				 'gazillion|scaffolds' => \$gazillion,
				 'ample_memory'        => \$ample_mem,
				 "zero_based"          => \$zero,
          'ucsc'               => \$ucsc,         
			);

  ### EXIT ON ERROR if there were errors with any of the supplied options
  unless ($command_line){
    die "Please respecify command line options\n";
  }

  ### HELPFILE
  if ($help){
    print_helpfile();
    exit;
  }

  if ($version){
    print << "VERSION";


                      Bismark Methylation Extractor Module -
                                bismark2bedGraph

                      Bismark Extractor Version: $bismark2bedGraph_version
              Copyright 2010-22 Felix Krueger, Altos Bioinformatics
                     https://github.com/FelixKrueger/Bismark


VERSION
    exit;
  }

  @sorting_files = @ARGV;

  ### no files provided
  unless (@sorting_files){
    warn "You need to provide one or more Bismark methylation caller files to create an individual C methylation bedGraph output. Please respecify!\n\n";
    sleep(2);

    print_helpfile();
    exit;
  }
 
  ### PARENT DIRECTORY
  my $parent_dir = getcwd();
  # warn "parent directory is: $parent_dir\n";
  
  ### OUTPUT DIR PATH
  if (defined $output_dir){
 
    unless ($output_dir eq ''){ # if the output dir has been passed on by the methylation extractor and is an empty string we don't want to change it
	unless ($output_dir =~ /\/$/){
	    $output_dir =~ s/$/\//;
	}
	unless (-d $output_dir){
	    mkdir $output_dir or die "Failed to create output directory $output_dir: $!\n\n";
	    warn "Created output directory $output_dir\n";
	}

      ### want to get an absolute path for the output directory instead of a relative one
      chdir $output_dir or die "Failed to move into output directory '$output_dir': $!\n\n";
      $output_dir = getcwd();
      unless ($output_dir =~ /\/$/){
	$output_dir =~ s/$/\//;
      }
      # warn "output directory is: $output_dir\n";

      # changing back to the parent directory
      chdir $parent_dir or die "Failed to move back into parent directory '$parent_dir': $!\n\n";

    }

  }
  else{
    $output_dir = '';
  }

  unless (defined $bedGraph_output){
      die "Please provide the name of the output file using the option -o/--output filename\n";
  }
  
  if ($bedGraph_output =~ /\//){ # this is supposed a filename and not a path name
      die "Please specify a file name without any path information (or use --dir if necessary)\n\n";
  }
  
  unless ($bedGraph_output =~ /\.gz$/){
      $bedGraph_output = "${bedGraph_output}.gz";  ### 22 07 2015: Output will be gzip compressed
  }

  ### NO HEADER
  unless ($no_header){
    $no_header = 0;
  }

  ### remove white spaces in read ID (needed for sorting using the sort command
  unless ($remove){
    $remove = 0;
  }

  ### COVERAGE THRESHOLD FOR bedGraph OUTPUT
  if (defined $coverage_threshold){
    unless ($coverage_threshold > 0){
      die "Please select a coverage greater than 0 (positive integers only)\n";
    }
  }
  else{
    $coverage_threshold = 1;
  }

 
  ### SORT buffer size
  if (defined $sort_size){

      ### Using ample memory and the UNIX SORT are mutually exclusive
      if ($ample_mem){
	  die "The options '--ample_mem' and using the UNIX sort function are mutually exclusive. Please make your pick!\n\n"; 
      }
      
      unless ($sort_size =~ /^\d+\%$/ or $sort_size =~ /^\d+(K|M|G|T)$/){
	  die "Please select a buffer size as percentage (e.g. --buffer_size 20%) or a number to be multiplied with K, M, G, T etc. (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line\n";
      }
  }
  else{
    $sort_size = '2G';
  }

  unless ($CX_context){
    $CX_context = 0;
  }

  unless ($counts){
    $counts = 1;
  }

  if ($gazillion){
    if ($ample_mem){
      die "You can't currently select '--ample_mem' together with '--gazillion'. Make your pick!\n\n";
    }
  }

  if ($ucsc){
    warn "Creating additional bedGraph file that has known Ensembl chromosome names replaced to work with the UCSC genome browser\n";
    warn "This option:\n- prefixes chromosome names with 'chr'\n";
    warn "- changes 'MT' to 'chrM'\n\n";
  }

  return ($bedGraph_output,$parent_dir,$output_dir,$remove,$CX_context,$no_header,$sort_size,$coverage_threshold,$counts,$gazillion,$ample_mem,$zero,$input_dir,$ucsc);
}


sub print_helpfile{
  print <<EOF

  SYNOPSIS:

  This script uses positional methylation data generated by the Bismark methylation extractor to generate
  a bedGraph file as well as a coverage file which are both sorted by chromosomal position. The bedGraph
  file uses 0-based genomic start and 1-based genomic end coordinates and should be UCSC compatible (if
  UCSC genomes were used for the alignment step). In addition this module will write out a coverage file
  which is similar to the bedGraph file, but uses 1-based genomic coordinates and also reports the count
  of methylated and unmethylated cytosines for any covered position; this coverage file is required if you
  wish to generate a genome-wide cytosine report with the module coverage2cytosine.

  USAGE: bismark2bedGraph [options] -o <output> [methylation extractor input files]

Methylation extractor input files: These files are required to start with CpG... in order for the
script to correctly work out the sequence context when using CpG context only (default). If all cytosine
contexts are selected ('--CX_context'), all input files will be used regardless of their file file name(s).


-o/--output <filename>     Name of the output file, mandatory.

--dir                      Output directory. Output is written to the current directory if not specified explicitly.

--cutoff [threshold]       The minimum number of times a methylation state has to be seen for that nucleotide
                           before its methylation percentage is reported. Default: 1.

--remove_spaces            Replaces whitespaces in the sequence ID field with underscores to allow sorting.

--CX/--CX_context          The sorted bedGraph output file contains information on every single cytosine that was covered
                           in the experiment irrespective of its sequence context. This applies to both forward and
                           reverse strands. Please be aware that this option may generate large temporary and output files
                           and may take a long time to sort (up to many hours). Default: OFF.
                           (i.e. Default = CpG context only).

--buffer_size <string>     This allows you to specify the main memory sort buffer when sorting the methylation information.
                           Either specify a percentage of physical memory by appending % (e.g. --buffer_size 50%) or
			   a multiple of 1024 bytes, e.g. 'K' multiplies by 1024, 'M' by 1048576 and so on for 'T' etc.
                           (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line.
                           Defaults to 2G.

--scaffolds/--gazillion    Users working with unfinished genomes sporting tens or even hundreds of thousands of
                           scaffolds/contigs/chromosomes frequently encountered errors with pre-sorting reads to 
                           individual chromosome files. These errors were caused by the operating system's limit
                           of the number of filehandle that can be written to at any one time (typically 1024; to
                           find out this limit on Linux, type: ulimit -a).
                           To bypass the limitation of open filehandles, the option --scaffolds does not pre-sort
                           methylation calls into individual chromosome files. Instead, all input files are
                           temporarily merged into a single file (unless there is only a single file), and this
                           file will then be sorted by both chromosome AND position using the Unix sort command.
                           Please be aware that this option might take a looooong time to complete, depending on 
                           the size of the input files, and the memory you allocate to this process (see --buffer_size).
                           Nevertheless, it seems to be working.

--ample_memory             Using this option will not sort chromosomal positions using the UNIX 'sort' command, but will
                           instead use two arrays to sort methylated and unmethylated calls, respectively. This may result
                           in a faster sorting process for very large files, but this comes at the cost of a larger memory
                           footprint (as an estimate, two arrays of the length of (the largest) human chromosome 1 (nearly
                           250 million bp) temporarily consume around 16GB of RAM). Note however that due to the overheads
                           of creating and looping through arrays this option might in fact be *slower* for small-ish
                           files (up to a few million alignments). Note also that this option is not currently compatible
                           with options '--scaffolds/--gazillion'.

--zero_based               Write out an additional coverage file (ending in .zero.cov) that uses 0-based genomic start
                           and 1-based genomic end coordinates (zero-based, half-open), like used in the bedGraph file,
                           instead of using 1-based coordinates throughout. Default: OFF.

--ucsc                     Writes out an additional bedGraph file (ending in '_UCSC.bedGraph.gz') that is compatible with the
                           UCSC genome browser. If alignments were carried out against an Ensembl version of the genome, this
                           step will prefix chromosome names with 'chr', and rename the mitochondrial chromosome from 'MT' to
                           'chrM'.


The bedGraph output looks like this (tab-delimited; 0-based start coords, 1-based end coords):
==============================================================================================

track type=bedGraph (header line)

<chromosome>  <start position>  <end position>  <methylation percentage>


The coverage output looks like this (tab-delimited, 1-based genomic coords; optional zero-based, half-open coords with '--zero_based'):
=======================================================================================================================================

<chromosome>  <start position>  <end position>  <methylation percentage>  <count methylated>  <count non-methylated>


                          Script last modified: 22 Sept 2020

EOF
    ;
  exit 1;
}