Skip to content

Commit

Permalink
Yf add interval list codec (#1327)
Browse files Browse the repository at this point in the history
* Implemented a tribble (co)dec for IntervalList.
* moved the business part of the dec from IntervalList to the Tribble dec.
* testing the Tribble directly.
  • Loading branch information
Yossi Farjoun authored Apr 3, 2019
1 parent aa89809 commit 1ece54c
Show file tree
Hide file tree
Showing 11 changed files with 379 additions and 117 deletions.
2 changes: 1 addition & 1 deletion src/main/java/htsjdk/samtools/util/BufferedLineReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ public String readLine() {
/**
* Non-destructive one-character look-ahead.
*
* @return If not eof, the next character that would be read. If eof, -1.
* @return If not eof, the next character that would be read. If eof, {@value EOF_VALUE}.
*/
@Override
public int peek() {
Expand Down
19 changes: 18 additions & 1 deletion src/main/java/htsjdk/samtools/util/Interval.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
package htsjdk.samtools.util;

import htsjdk.samtools.SAMException;
import htsjdk.tribble.Feature;
import htsjdk.tribble.annotation.Strand;

import java.util.Collection;
Expand All @@ -33,7 +34,7 @@
*
* @author Tim Fennell
*/
public class Interval implements Comparable<Interval>, Cloneable, Locatable {
public class Interval implements Comparable<Interval>, Cloneable, Feature {
private final boolean negativeStrand;
private final String name;
private final String contig;
Expand Down Expand Up @@ -209,6 +210,22 @@ public boolean equals(final Object other) {
}
}

/**
* Equals method that also checks strand and name
*/
public boolean equalsWithStrandAndName(final Object other) {
if (!this.equals(other)) {
return false;
}

final Interval that = (Interval) other;
if (this.negativeStrand != that.negativeStrand) {
return false;
}

return this.name.equals(that.name);
}

@Override
public int hashCode() {
int result = getContig().hashCode();
Expand Down
81 changes: 7 additions & 74 deletions src/main/java/htsjdk/samtools/util/IntervalList.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,11 @@
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.SAMTextHeaderCodec;
import htsjdk.tribble.IntervalList.IntervalListCodec;

import java.io.*;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.*;

/**
* Represents a list of intervals against a reference sequence that can be written to
Expand Down Expand Up @@ -512,11 +505,6 @@ public static IntervalList fromFiles(final Collection<File> intervalListFiles) {
*/
public static IntervalList fromReader(final BufferedReader in) {

final int SEQUENCE_POS=0;
final int START_POS=1;
final int END_POS=2;
final int STRAND_POS=3;
final int NAME_POS=4;
try {
// Setup a reader and parse the header
final StringBuilder builder = new StringBuilder(4096);
Expand All @@ -539,72 +527,17 @@ public static IntervalList fromReader(final BufferedReader in) {
final IntervalList list = new IntervalList(codec.decode(headerReader, "BufferedReader"));
final SAMSequenceDictionary dict = list.getHeader().getSequenceDictionary();

//there might not be any lines after the header, in which case we should return an empty list
// There might not be any lines after the header, in which case we should return an empty list
if (line == null) {
return list;
}

final IntervalListCodec intervalListCodec = new IntervalListCodec(dict);

// Then read in the intervals
final FormatUtil format = new FormatUtil();
String lastSeq = null;
do {
if (line.trim().isEmpty()) {
continue; // skip over blank lines
}

// Make sure we have the right number of fields
final String[] fields = line.split("\t");
if (fields.length != 5) {
throw new SAMException("Invalid interval record contains " +
fields.length + " fields: " + line);
}

// Then parse them out
String seq = fields[SEQUENCE_POS];
if (seq.equals(lastSeq)) {
seq = lastSeq;
}
lastSeq = seq;

final int start = format.parseInt(fields[START_POS]);
final int end = format.parseInt(fields[END_POS]);
if (start < 1) {
throw new IllegalArgumentException("Coordinate less than 1: start value of " + start +
" is less than 1 and thus illegal");
}

if (start > end + 1) {
throw new IllegalArgumentException("Start value of " + start +
" is greater than end + 1 for end of value: " + end +
". I'm afraid I cannot let you do that.");
}

final boolean negative;
switch (fields[STRAND_POS]) {
case "-":
negative = true;
break;
case "+":
negative = false;
break;
default:
throw new IllegalArgumentException("Invalid strand field: " + fields[STRAND_POS]);
}

final String name = fields[NAME_POS];

final Interval interval = new Interval(seq, start, end, negative, name);
final SAMSequenceRecord sequence = dict.getSequence(seq);
if (sequence == null) {
log.warn("Ignoring interval for unknown reference: " + interval);
} else {
final int sequenceLength = sequence.getSequenceLength();
if (sequenceLength > 0 && sequenceLength < end) {
throw new IllegalArgumentException("interval with end: " + end + " extends beyond end of sequence with length: " + sequenceLength);
}

list.intervals.add(interval);
}
final Optional<Interval> maybeInterval = Optional.ofNullable(intervalListCodec.decode(line));
maybeInterval.ifPresent(list.intervals::add);
}
while ((line = in.readLine()) != null);

Expand Down
7 changes: 5 additions & 2 deletions src/main/java/htsjdk/samtools/util/LineReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,10 @@
/**
* Interface allows for implementations that read lines from a String, an ASCII file, or somewhere else.
*/
public interface LineReader extends Closeable{
public interface LineReader extends Closeable {

// value to return in call to peek, if eof has been reached.
int EOF_VALUE = -1;

/**
* Read a line and remove the line terminator
Expand All @@ -43,7 +46,7 @@ public interface LineReader extends Closeable{

/**
* Non-destructive one-character look-ahead.
* @return If not eof, the next character that would be read. If eof, -1.
* @return If not eof, the next character that would be read. If eof, {@value EOF_VALUE}.
*/
int peek();

Expand Down
164 changes: 164 additions & 0 deletions src/main/java/htsjdk/tribble/IntervalList/IntervalListCodec.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
/*
* Copyright (c) 2019, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/

package htsjdk.tribble.IntervalList;

import htsjdk.samtools.*;
import htsjdk.samtools.util.FormatUtil;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.LineReader;
import htsjdk.samtools.util.Log;
import htsjdk.tribble.AsciiFeatureCodec;
import htsjdk.tribble.TribbleException;
import htsjdk.tribble.annotation.Strand;
import htsjdk.tribble.readers.LineIterator;

/**
* A tribble codec for IntervalLists.
*
* Also contains the parsing code for the non-tribble parsing of IntervalLists
*/

public class IntervalListCodec extends AsciiFeatureCodec<Interval> {

private final Log log = Log.getInstance(IntervalListCodec.class);

private SAMSequenceDictionary dictionary;

public IntervalListCodec() {
this(null);
}

public IntervalListCodec(final SAMSequenceDictionary dict) {
super(Interval.class);
dictionary = dict;
}

String lastSeq = null;

private Interval parseIntervalString(final String line, final SAMSequenceDictionary dict) {
final int SEQUENCE_POS = 0;
final int START_POS = 1;
final int END_POS = 2;
final int STRAND_POS = 3;
final int NAME_POS = 4;

final FormatUtil format = new FormatUtil();

// Make sure we have the right number of fields
final String[] fields = line.split("\t");
if (fields.length != 5) {
throw new TribbleException("Invalid interval record contains " +
fields.length + " fields: " + line);
}

// Then parse them out
String seq = fields[SEQUENCE_POS];
if (seq.equals(lastSeq)) {
seq = lastSeq;
}
lastSeq = seq;

final int start = format.parseInt(fields[START_POS]);
final int end = format.parseInt(fields[END_POS]);
if (start < 1) {
throw new IllegalArgumentException("Coordinate less than 1: start value of " + start +
" is less than 1 and thus illegal");
}

if (start > end + 1) {
throw new IllegalArgumentException("Start value of " + start +
" is greater than end + 1 for end of value: " + end +
". I'm afraid I cannot let you do that.");
}

Strand strand = Strand.decode(fields[STRAND_POS]);
if (strand==Strand.NONE) throw new IllegalArgumentException("Invalid strand field: " + fields[STRAND_POS]);

final String name = fields[NAME_POS];

final Interval interval = new Interval(seq, start, end, strand==Strand.NEGATIVE, name);
final SAMSequenceRecord sequence = dict.getSequence(seq);
if (sequence == null) {
log.warn("Ignoring interval for unknown reference: " + interval);
return null;
} else {
final int sequenceLength = sequence.getSequenceLength();
if (sequenceLength > 0 && sequenceLength < end) {
throw new IllegalArgumentException("interval with end: " + end + " extends beyond end of sequence with length: " + sequenceLength);
}
return interval;
}
}

@Override
public Interval decode(final String line) {
if (line.startsWith("@")) {
return null;
}

if (line.trim().isEmpty()) {
return null;
}
// our header cannot be null, we need the dictionary from the header
if (dictionary == null) {
throw new TribbleException("IntervalList dictionary cannot be null when decoding a record");
}

return parseIntervalString(line, dictionary);
}


@Override
public Object readActualHeader(LineIterator lineIterator) {
final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec();
final SAMFileHeader header = headerCodec.decode(new LineReader() {
int lineNo = 0;
@Override
public String readLine() {
lineNo++;
return lineIterator.next();
}
@Override
public int getLineNumber() {
return lineNo;
}
@Override
public int peek() {
return lineIterator.hasNext() ?
lineIterator.peek().charAt(0) :
LineReader.EOF_VALUE;
}
@Override
public void close() { }
}, "IntervalListCodec");
dictionary = header.getSequenceDictionary();
return header;
}

@Override
public boolean canDecode(String s) {
return s.endsWith(".interval_list");
}
}
Loading

0 comments on commit 1ece54c

Please sign in to comment.