Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Immutable CRAIEntry #1256

Merged
merged 7 commits into from
Jan 10, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 57 additions & 49 deletions src/main/java/htsjdk/samtools/cram/CRAIEntry.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,27 +7,36 @@

import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.*;
import java.util.stream.Collectors;

/**
* A class representing CRAI index entry: file and alignment offsets for each slice.
* Created by vadim on 10/08/2015.
*/
public class CRAIEntry implements Comparable<CRAIEntry>, Cloneable {
public int sequenceId;
public int alignmentStart;
public int alignmentSpan;
public long containerStartOffset;
public int sliceOffset;
jmthibault79 marked this conversation as resolved.
Show resolved Hide resolved
public int sliceSize;
public int sliceIndex;
jmthibault79 marked this conversation as resolved.
Show resolved Hide resolved

private static int CRAI_INDEX_COLUMNS = 6;
private static String entryFormat = "%d\t%d\t%d\t%d\t%d\t%d";

public CRAIEntry() {
public class CRAIEntry implements Comparable<CRAIEntry> {
private final int sequenceId;
private final int alignmentStart;
private final int alignmentSpan;
jmthibault79 marked this conversation as resolved.
Show resolved Hide resolved
private final long containerStartOffset;
private final int sliceByteOffset;
private final int sliceByteSize;

private static final int CRAI_INDEX_COLUMNS = 6;
private static final String ENTRY_FORMAT = "%d\t%d\t%d\t%d\t%d\t%d";

public CRAIEntry(final int sequenceId,
final int alignmentStart,
final int alignmentSpan,
final long containerStartOffset,
final int sliceByteOffset,
final int sliceByteSize) {
this.sequenceId = sequenceId;
this.alignmentStart = alignmentStart;
this.alignmentSpan = alignmentSpan;
this.containerStartOffset = containerStartOffset;
this.sliceByteOffset = sliceByteOffset;
this.sliceByteSize = sliceByteSize;
}

/**
Expand All @@ -36,7 +45,7 @@ public CRAIEntry() {
* @param line string formatted as a CRAI index entry
* @throws CRAIIndex.CRAIIndexException
*/
public CRAIEntry(final String line) throws CRAIIndex.CRAIIndexException {
public CRAIEntry(final String line) throws CRAIIndex.CRAIIndexException {
final String[] chunks = line.split("\t");
if (chunks.length != CRAI_INDEX_COLUMNS) {
throw new CRAIIndex.CRAIIndexException(
Expand All @@ -48,8 +57,8 @@ public CRAIEntry(final String line) throws CRAIIndex.CRAIIndexException {
alignmentStart = Integer.parseInt(chunks[1]);
alignmentSpan = Integer.parseInt(chunks[2]);
containerStartOffset = Long.parseLong(chunks[3]);
sliceOffset = Integer.parseInt(chunks[4]);
sliceSize = Integer.parseInt(chunks[5]);
sliceByteOffset = Integer.parseInt(chunks[4]);
sliceByteSize = Integer.parseInt(chunks[5]);
} catch (final NumberFormatException e) {
throw new CRAIIndex.CRAIIndexException(e);
}
Expand All @@ -73,30 +82,18 @@ public void writeToStream(OutputStream os) {
* Format the entry as a string suitable for serialization in the CRAI index
*/
private String serializeToString() {
return String.format(entryFormat,
return String.format(ENTRY_FORMAT,
sequenceId, alignmentStart, alignmentSpan,
containerStartOffset, sliceOffset, sliceSize);
containerStartOffset, sliceByteOffset, sliceByteSize);
}

@Override
public String toString() { return serializeToString(); }

public static List<CRAIEntry> fromContainer(final Container container) {
final List<CRAIEntry> entries = new ArrayList<>(container.slices.length);
for (int i = 0; i < container.slices.length; i++) {
final Slice s = container.slices[i];
final CRAIEntry e = new CRAIEntry();
e.sequenceId = s.sequenceId;
e.alignmentStart = s.alignmentStart;
e.alignmentSpan = s.alignmentSpan;
e.containerStartOffset = s.containerOffset;
e.sliceOffset = container.landmarks[i];
e.sliceSize = s.size;

e.sliceIndex = i;
entries.add(e);
}
return entries;
return Arrays.stream(container.slices)
.map(slice -> slice.getCRAIEntry(slice.containerOffset))
Copy link
Contributor Author

@jmthibault79 jmthibault79 Jan 9, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can't necessarily derive the container offset from the Slice because sometimes it conflicts with the Container's own offset - will need to look into why.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't have high confidence that these are managed correctly by the existing code, but in this case it's using the value from the slice anyway. Can we add a method to slice that uses it's offset directly, and if there is some other context for which that fails, maybe also add a static with this signature for that one case until we figure out whats going on, since thats the broken one ?

BTW, is slice.containerOffset supposed to be the offset of the containing container, or is it the offset of the slice relative to the beginning of the containing container ? Might be good to document those.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, any reason not to move this method to container getCRAIEntries just as it is, and have it return the list of entries. Eventually it should end up there I think, and would be similar to the new one you added to Slice.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, these suggestions make sense.

Slice.containerOffset is the byte offset of the Container from the beginning of the stream.

Slice.offset is the byte offset of the Slice from the beginning of its Container.

I'll add comments making this more clear to Slice

.collect(Collectors.toList());
}

@Override
Expand All @@ -114,19 +111,6 @@ public int compareTo(final CRAIEntry o) {
return (int) (containerStartOffset - o.containerStartOffset);
}

@Override
public CRAIEntry clone() throws CloneNotSupportedException {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

only tests were using this

super.clone();
final CRAIEntry entry = new CRAIEntry();
entry.sequenceId = sequenceId;
entry.alignmentStart = alignmentStart;
entry.alignmentSpan = alignmentSpan;
entry.containerStartOffset = containerStartOffset;
entry.sliceOffset = sliceOffset;
entry.sliceSize = sliceSize;
return entry;
}

public static Comparator<CRAIEntry> byEnd = new Comparator<CRAIEntry>() {

@Override
Expand Down Expand Up @@ -192,4 +176,28 @@ public static boolean intersect(final CRAIEntry e0, final CRAIEntry e1) {
return Math.abs(a0 + b0 - a1 - b1) < (e0.alignmentSpan + e1.alignmentSpan);

}

public int getSequenceId() {
return sequenceId;
}

public int getAlignmentStart() {
return alignmentStart;
}

public int getAlignmentSpan() {
return alignmentSpan;
}

public long getContainerStartOffset() {
return containerStartOffset;
}

public int getSliceByteOffset() {
return sliceByteOffset;
}

public int getSliceByteSize() {
return sliceByteSize;
}
}
121 changes: 39 additions & 82 deletions src/main/java/htsjdk/samtools/cram/CRAIIndex.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,24 +4,15 @@
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.CRAMBAIIndexer;
import htsjdk.samtools.CRAMCRAIIndexer;
import htsjdk.samtools.cram.encoding.reader.MultiRefSliceAlignmentSpanReader;
import htsjdk.samtools.cram.structure.*;
import htsjdk.samtools.seekablestream.SeekableMemoryStream;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.ValidationStringency;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Map;
import java.io.*;
import java.util.*;
import java.util.stream.Collectors;

import java.util.List;

/**
* CRAI index used for CRAM files.
Expand Down Expand Up @@ -55,63 +46,30 @@ public void writeIndex(final OutputStream os) {

/**
* Create index entries for a single container.
* @param c the container to index
* @param container the container to index
*/
public void processContainer(final Container c) {
public void processContainer(final Container container) {
// TODO: this should be refactored and delegate to container/slice
if (!c.isEOF()) {
for (int i = 0; i < c.slices.length; i++) {
Slice s = c.slices[i];
if (!container.isEOF()) {
for (final Slice s: container.slices) {
if (s.sequenceId == Slice.MULTI_REFERENCE) {
this.entries.addAll(getCRAIEntriesForMultiRefSlice(s, c.header, c.offset, c.landmarks));
}
else {
CRAIEntry e = new CRAIEntry();

e.sequenceId = c.sequenceId;
e.alignmentStart = s.alignmentStart;
e.alignmentSpan = s.alignmentSpan;
e.containerStartOffset = c.offset;
e.sliceOffset = c.landmarks[i];
e.sliceSize = s.size;
e.sliceIndex = i;

entries.add(e);
final Map<Integer, AlignmentSpan> spans = s.getMultiRefAlignmentSpans(container.header, ValidationStringency.DEFAULT_STRINGENCY);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

equivalent to what getCRAIEntriesForMultiRefSlice() did


this.entries.addAll(spans.entrySet().stream()
.map(e -> new CRAIEntry(e.getKey(),
e.getValue().getStart(),
e.getValue().getSpan(),
container.offset,
container.landmarks[s.index],
s.size))
.collect(Collectors.toList()));
} else {
entries.add(s.getCRAIEntry(container.offset));
jmthibault79 marked this conversation as resolved.
Show resolved Hide resolved
}
}
}
}

/**
* Return a list of CRAI Entries; one for each reference in the multireference slice.
* TODO: this should be refactored and delegate to container/slice
*/
private static Collection<CRAIEntry> getCRAIEntriesForMultiRefSlice(
final Slice slice,
final CompressionHeader header,
final long containerOffset,
final int[] landmarks)
{
final Map<Integer, AlignmentSpan> spans = slice.getMultiRefAlignmentSpans(header, ValidationStringency.DEFAULT_STRINGENCY);

List<CRAIEntry> entries = new ArrayList<>(spans.size());
for (int seqId : spans.keySet()) {
CRAIEntry e = new CRAIEntry();
e.sequenceId = seqId;
AlignmentSpan span = spans.get(seqId);
e.alignmentStart = span.getStart();
e.alignmentSpan = span.getSpan();
e.sliceSize = slice.size;
e.sliceIndex = slice.index;
e.containerStartOffset = containerOffset;
e.sliceOffset = landmarks[slice.index];

entries.add(e);
}

return entries;
}

public static SeekableStream openCraiFileAsBaiStream(final File cramIndexFile, final SAMSequenceDictionary dictionary) throws IOException {
return openCraiFileAsBaiStream(new FileInputStream(cramIndexFile), dictionary);
}
Expand All @@ -128,16 +86,16 @@ public static SeekableStream openCraiFileAsBaiStream(final InputStream indexStre

for (final CRAIEntry entry : full) {
final Slice slice = new Slice();
slice.containerOffset = entry.containerStartOffset;
slice.alignmentStart = entry.alignmentStart;
slice.alignmentSpan = entry.alignmentSpan;
slice.sequenceId = entry.sequenceId;
// https://github.com/samtools/htsjdk/issues/531
// entry.sliceSize is the slice size in bytes, not the number of
// records; this results in the BAMIndex metadata being wrong
slice.nofRecords = entry.sliceSize;
slice.index = entry.sliceIndex;
slice.offset = entry.sliceOffset;
slice.containerOffset = entry.getContainerStartOffset();
slice.alignmentStart = entry.getAlignmentStart();
slice.alignmentSpan = entry.getAlignmentSpan();
slice.sequenceId = entry.getSequenceId();
// NOTE: the recordCount and sliceIndex fields can't be derived from the CRAM index
// so we can only set them to zero
// see https://github.com/samtools/htsjdk/issues/531
slice.nofRecords = 0;
slice.index = 0;
slice.offset = entry.getSliceByteOffset();

indexer.processSingleReferenceSlice(slice);
jmthibault79 marked this conversation as resolved.
Show resolved Hide resolved
}
Expand All @@ -148,17 +106,16 @@ public static SeekableStream openCraiFileAsBaiStream(final InputStream indexStre

public static List<CRAIEntry> find(final List<CRAIEntry> list, final int seqId, final int start, final int span) {
final boolean whole = start < 1 || span < 1;
final CRAIEntry query = new CRAIEntry();
query.sequenceId = seqId;
query.alignmentStart = start < 1 ? 1 : start;
query.alignmentSpan = span < 1 ? Integer.MAX_VALUE : span;
query.containerStartOffset = Long.MAX_VALUE;
query.sliceOffset = Integer.MAX_VALUE;
query.sliceSize = Integer.MAX_VALUE;
final CRAIEntry query = new CRAIEntry(seqId,
start < 1 ? 1 : start,
span < 1 ? Integer.MAX_VALUE : span,
Long.MAX_VALUE,
Integer.MAX_VALUE,
Integer.MAX_VALUE);

final List<CRAIEntry> l = new ArrayList<>();
for (final CRAIEntry e : list) {
if (e.sequenceId != seqId) {
if (e.getSequenceId() != seqId) {
continue;
}
if (whole || CRAIEntry.intersect(e, query)) {
Expand All @@ -176,7 +133,7 @@ public static CRAIEntry getLeftmost(final List<CRAIEntry> list) {
CRAIEntry left = list.get(0);

for (final CRAIEntry e : list) {
if (e.alignmentStart < left.alignmentStart) {
if (e.getAlignmentStart() < left.getAlignmentStart()) {
left = e;
}
}
Expand All @@ -202,7 +159,7 @@ public static int findLastAlignedEntry(final List<CRAIEntry> list) {
final int mid = (low + high) >>> 1;
final CRAIEntry midVal = list.get(mid);

if (midVal.sequenceId >= 0) {
if (midVal.getSequenceId() >= 0) {
low = mid + 1;
} else {
high = mid - 1;
Expand All @@ -211,7 +168,7 @@ public static int findLastAlignedEntry(final List<CRAIEntry> list) {
if (low >= list.size()) {
return list.size() - 1;
}
for (; low >= 0 && list.get(low).sequenceId == -1; low--) {
for (; low >= 0 && list.get(low).getSequenceId() == -1; low--) {
}
return low;
}
Expand Down
12 changes: 12 additions & 0 deletions src/main/java/htsjdk/samtools/cram/structure/Slice.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
package htsjdk.samtools.cram.structure;

import htsjdk.samtools.*;
import htsjdk.samtools.cram.CRAIEntry;
import htsjdk.samtools.cram.encoding.reader.CramRecordReader;
import htsjdk.samtools.cram.encoding.reader.MultiRefSliceAlignmentSpanReader;
import htsjdk.samtools.cram.io.BitInputStream;
Expand Down Expand Up @@ -291,4 +292,15 @@ public Map<Integer, AlignmentSpan> getMultiRefAlignmentSpans(final CompressionHe
return reader.getReferenceSpans();
}

/**
* Generate a CRAI Index entry from this Slice and the container offset.
*
* TODO: investigate why we can't simply use the Slice's own containerOffset here
*
* @param containerStartOffset the byte offset of this Slice's Container
* @return a new CRAI Index Entry
*/
public CRAIEntry getCRAIEntry(final long containerStartOffset) {
jmthibault79 marked this conversation as resolved.
Show resolved Hide resolved
return new CRAIEntry(sequenceId, alignmentStart, alignmentSpan, containerStartOffset, offset, size);
}
}
Loading