Skip to content

Commit

Permalink
Removed everything from this branch except for indel error handling
Browse files Browse the repository at this point in the history
  • Loading branch information
michaelgatzen committed Aug 7, 2019
1 parent f0c9116 commit cdf839b
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 192 deletions.
105 changes: 48 additions & 57 deletions src/main/java/picard/sam/SamErrorMetric/CollectSamErrorMetrics.java
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,6 @@ public enum BaseOperation {
Deletion,
}

private static final int MAX_DIRECTIVES = ReadBaseStratification.Stratifier.values().length + 1;
private static final Log log = Log.getInstance(CollectSamErrorMetrics.class);

// =====================================================================

@Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "Input SAM or BAM file.")
Expand Down Expand Up @@ -143,8 +140,7 @@ public enum BaseOperation {
"OVERLAPPING_ERROR:READ_ORDINALITY:CYCLE",
"OVERLAPPING_ERROR:READ_ORDINALITY:HOMOPOLYMER",
"OVERLAPPING_ERROR:READ_ORDINALITY:GC_CONTENT",
"INDEL_ERROR",
"INDEL_ERROR:INDEL_LENGTH"
"INDEL_ERROR"
);

@Argument(doc = "A fake argument used to show the options of ERROR (in ERROR_METRICS).", optional = true)
Expand Down Expand Up @@ -179,15 +175,6 @@ public enum BaseOperation {
@Argument(shortName = "P", doc = "The probability of selecting a locus for analysis (for downsampling).", optional = true)
public double PROBABILITY = 1;

@Argument(
fullName = "progressStepInterval",
doc = "The interval between which progress will be displayed.",
optional = true
)
public int progressStepInterval = 100000;

// =====================================================================

@Override
protected boolean requiresReference() {
return true;
Expand Down Expand Up @@ -239,12 +226,15 @@ protected String[] customCommandLineValidation() {
}
}

private static final int MAX_DIRECTIVES = ReadBaseStratification.Stratifier.values().length + 1;
private static final Log log = Log.getInstance(CollectSamErrorMetrics.class);

@Override
protected int doWork() {

final Random random = new Random(42);

final ProgressLogger progressLogger = new ProgressLogger(log, progressStepInterval);
final ProgressLogger progressLogger = new ProgressLogger(log, 100000);
long nTotalLoci = 0;
long nSkippedLoci = 0;
long nProcessedLoci = 0;
Expand All @@ -255,19 +245,12 @@ protected int doWork() {
final Collection<BaseErrorAggregation> aggregatorList = getAggregatorList();
// Open up the input resources:
try (
final SamReader sam = SamReaderFactory.makeDefault()
.referenceSequence(REFERENCE_SEQUENCE)
.setOption(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, true)
.open(INPUT);
final SamReader sam = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);
final ReferenceSequenceFileWalker referenceSequenceFileWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
final VCFFileReader vcfFileReader = new VCFFileReader(VCF, true)
final PeekableIterator<VariantContext> vcfIterator = new PeekableIterator<>(
VCF == null ? Collections.emptyIterator() : new VCFFileReader(VCF, true).iterator())
) {

// Make sure we can query our file:
if (!vcfFileReader.isQueryable()) {
throw new PicardException("Cannot query VCF File! VCF Files must be queryable!");
}

final SAMSequenceDictionary sequenceDictionary = referenceSequenceFileWalker.getSequenceDictionary();
if (sam.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
throw new PicardException("Input BAM must be sorted by coordinate");
Expand Down Expand Up @@ -311,8 +294,8 @@ protected int doWork() {
nTotalLoci++;

// while there is a next (non-filtered) variant and it is before the locus, advance the pointer.
if (checkLocus(vcfFileReader, info.getLocus())) {
log.debug("Locus does not overlap any variants: " + info.getLocus(), sequenceDictionary);
if (advanceIteratorAndCheckLocus(vcfIterator, info.getLocus(), sequenceDictionary)) {
log.debug(String.format("Skipping locus from VCF: %s", vcfIterator.peek().toStringWithoutGenotypes()));
nSkippedLoci++;
continue;
}
Expand Down Expand Up @@ -360,41 +343,21 @@ private void writeMetricsFileForAggregator(final BaseErrorAggregation locusAggre
}

/**
* If there's a variant at the locus return true, otherwise false.
* Advance the iterator until at or ahead of locus. if there's a variant at the locus return true, otherwise false.
* <p>
* HAS SIDE EFFECTS!!! Queries the vcfFileReader
* HAS SIDE EFFECTS!!! draws from vcfIterator!!!
*
* @param vcfFileReader a {@link VCFFileReader} to query for the given locus
* @param locusInfo a {@link SamLocusIterator.LocusInfo} at which to examine the variants
* @param vcfIterator a {@link VariantContext} iterator to advance (assumed sorted)
* @param locus a {@link Locus} at which to examine the variants
@param sequenceDictionary a {@link SAMSequenceDictionary} used to order the variant's contig relative to the locus
* @return true if there's a variant over the locus, false otherwise.
*/
private static boolean checkLocus(final VCFFileReader vcfFileReader, final SamLocusIterator.LocusInfo locusInfo) {

boolean overlaps = false;

if (locusInfo != null) {

try (final CloseableIterator<VariantContext> vcfIterator = vcfFileReader.query(locusInfo)) {

overlaps = true;

boolean allFiltered = true;

while (vcfIterator.hasNext()) {
final VariantContext vcf = vcfIterator.next();
if (vcf.isFiltered()) {
continue;
}
allFiltered = false;
}

if (allFiltered) {
overlaps = false;
}
}
/**
 * Advances the iterator past all filtered variants and all variants strictly before the locus,
 * then reports whether an unfiltered variant overlaps the locus.
 * <p>
 * HAS SIDE EFFECTS!!! draws from vcfIterator!!!
 *
 * @param vcfIterator        a {@link PeekableIterator} over a coordinate-sorted stream of {@link VariantContext}s
 * @param locus              the {@link Locus} to test for overlap
 * @param sequenceDictionary the {@link SAMSequenceDictionary} used to order contigs
 * @return true if an unfiltered variant overlaps the locus, false otherwise.
 */
private static boolean advanceIteratorAndCheckLocus(final PeekableIterator<VariantContext> vcfIterator, final Locus locus, final SAMSequenceDictionary sequenceDictionary) {
    // Skip variants that are filtered, or that lie strictly before the locus.
    while (vcfIterator.hasNext() && (vcfIterator.peek().isFiltered() ||
            CompareVariantContextToLocus(sequenceDictionary, vcfIterator.peek(), locus) < 0)) {
        vcfIterator.next();
    }
    // Loop exit guarantees the peeked variant (if any) is unfiltered and at-or-after the locus;
    // a comparison of exactly 0 means it overlaps the locus.
    return vcfIterator.hasNext() && CompareVariantContextToLocus(sequenceDictionary, vcfIterator.peek(), locus) == 0;
}

/**
Expand Down Expand Up @@ -531,6 +494,34 @@ private IntervalList getIntervals(final SAMSequenceDictionary sequenceDictionary
return regionOfInterest;
}

/**
 * Compares a VariantContext to a Locus, indicating overlap or relative genomic ordering.
 *
 * @param dictionary     the {@link SAMSequenceDictionary} that defines the contig ordering
 * @param variantContext the {@link VariantContext} to compare
 * @param locus          the {@link Locus} to compare
 * @return negative if variantContext comes before locus (with no overlap)
 * zero if variantContext and locus overlap
 * positive if variantContext comes after locus (with no overlap)
 * <p/>
 * if variantContext and locus are in the same contig the return value will be the number of bases apart they are,
 * otherwise it will be MIN_INT/MAX_INT
 */
public static int CompareVariantContextToLocus(final SAMSequenceDictionary dictionary, final VariantContext variantContext, final Locus locus) {
    // Different contigs: only the sign matters, so saturate to MIN_VALUE / MAX_VALUE.
    final int contigIndexDelta = dictionary.getSequenceIndex(variantContext.getContig()) - locus.getSequenceIndex();
    if (contigIndexDelta != 0) {
        return contigIndexDelta < 0 ? Integer.MIN_VALUE : Integer.MAX_VALUE;
    }

    // Same contig: compare the locus position against the variant's [start, end] span.
    final int locusPosition = locus.getPosition();
    if (locusPosition < variantContext.getStart()) {
        // Variant starts after the locus: positive distance.
        return variantContext.getStart() - locusPosition;
    }
    if (locusPosition > variantContext.getEnd()) {
        // Variant ends before the locus: negative distance.
        return variantContext.getEnd() - locusPosition;
    }
    // Locus falls within the variant's span.
    return 0;
}

/**
* Parses a "Directive" of the form "ERROR,STRATIFIER,STRATIFIER...etc." into a {@link BaseErrorAggregation} consisting of the appropriate
* {@link BaseCalculator} and the {@link ReadBaseStratification.CollectionStratifier CollectionStratifier} constructed from the various
Expand Down
115 changes: 1 addition & 114 deletions src/main/java/picard/sam/SamErrorMetric/ReadBaseStratification.java
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@

import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.function.BiFunction;
import java.util.function.Function;
Expand Down Expand Up @@ -484,74 +483,6 @@ public String getSuffix() {
}
}

/**
 * Stratifies according to the total length of the CIGAR elements matching a given operator
 * in the read's CIGAR string.
 * (NOTE: this sums element lengths, not the count of matching operators.)
 */
public static class CigarOperatorsInReadStratifier extends RecordStratifier<Integer> {

    // The CIGAR operator whose element lengths are summed; fixed at construction.
    private final CigarOperator operator;

    public CigarOperatorsInReadStratifier(final CigarOperator op) {
        operator = op;
    }

    /**
     * @param samRecord the record whose CIGAR is examined
     * @return the summed length of all CIGAR elements with the configured operator,
     * or null if the record's CIGAR cannot be examined (deliberate best-effort).
     */
    @Override
    public Integer stratify(final SAMRecord samRecord) {
        try {
            return samRecord.getCigar().getCigarElements().stream()
                    .filter(ce -> ce.getOperator().equals(operator))
                    .mapToInt(CigarElement::getLength)
                    .sum();
        } catch (final Exception ex) {
            // Best-effort: an unparseable or missing CIGAR stratifies as null rather than failing.
            return null;
        }
    }

    @Override
    public String getSuffix() {
        return "cigar_elements_" + operator.name() + "_in_read";
    }
}

/**
 * Stratifies according to the number of indel bases (from CIGAR string) that the read has.
 */
public static class IndelsInReadStratifier extends RecordStratifier<Integer> {

    /**
     * @param samRecord the record whose CIGAR is examined
     * @return the total number of inserted plus deleted bases in the read's CIGAR,
     * or null if the CIGAR cannot be examined.
     */
    @Override
    public Integer stratify(final SAMRecord samRecord) {
        try {
            int indelBases = 0;
            for (final CigarElement element : samRecord.getCigar().getCigarElements()) {
                final CigarOperator op = element.getOperator();
                if (op.equals(CigarOperator.I) || op.equals(CigarOperator.D)) {
                    indelBases += element.getLength();
                }
            }
            return indelBases;
        } catch (final Exception ex) {
            // Best-effort: any failure reading the CIGAR stratifies as null.
            return null;
        }
    }

    @Override
    public String getSuffix() {
        return "indels_in_read";
    }
}

/**
 * Stratifies according to the length of an insertion or deletion.
 * Delegates the actual computation to {@code stratifyIndelLength}.
 */
public static class IndelLengthStratifier implements RecordAndOffsetStratifier {

    /**
     * @return the length of the indel at this position (0 for non-indel bases;
     * null when no CIGAR element covers the position).
     */
    @Override
    public Integer stratify(final RecordAndOffset recordAndOffset, final SAMLocusAndReference locusInfo, final CollectSamErrorMetrics.BaseOperation operation) {
        return stratifyIndelLength(recordAndOffset, locusInfo, operation);
    }

    @Override
    public String getSuffix() {
        return "indel_length";
    }
}


/* ************ Instances of stratifiers so that they do not need to be allocated every time ****************/

Expand Down Expand Up @@ -716,26 +647,6 @@ public String getSuffix() {
*/
public static final NsInReadStratifier nsInReadStratifier = new NsInReadStratifier();

/**
* Stratify by Insertions in the read cigars.
*/
public static final CigarOperatorsInReadStratifier insertionsInReadStratifier = new CigarOperatorsInReadStratifier(CigarOperator.I);

/**
* Stratify by Deletions in the read cigars.
*/
public static final CigarOperatorsInReadStratifier deletionsInReadStratifier = new CigarOperatorsInReadStratifier(CigarOperator.D);

/**
* Stratify by Indels in the read cigars.
*/
public static final IndelsInReadStratifier indelsInReadStratifier = new IndelsInReadStratifier();

/**
* Stratifies into the number of bases in an insertion
*/
public static final IndelLengthStratifier indelLengthStratifier = new IndelLengthStratifier();

/* *************** enums **************/

/**
Expand Down Expand Up @@ -774,11 +685,7 @@ enum Stratifier implements CommandLineParser.ClpEnum {
ONE_BASE_PADDED_CONTEXT(() -> oneBasePaddedContextStratifier, "The current reference base and a one base padded region from the read resulting in a 3-base context."),
TWO_BASE_PADDED_CONTEXT(() -> twoBasePaddedContextStratifier, "The current reference base and a two base padded region from the read resulting in a 5-base context."),
CONSENSUS(() -> consensusStratifier, "Whether or not duplicate reads were used to form a consensus read. This stratifier makes use of the aD, bD, and cD tags for duplex consensus reads. If the reads are single index consensus, only the cD tags are used."),
NS_IN_READ(() -> nsInReadStratifier, "The number of Ns in the read."),
INSERTIONS_IN_READ(() -> insertionsInReadStratifier, "The number of Insertions in the read cigar."),
DELETIONS_IN_READ(() -> deletionsInReadStratifier, "The number of Deletions in the read cigar."),
INDELS_IN_READ(() -> indelsInReadStratifier, "The number of INDELs in the read cigar."),
INDEL_LENGTH(() -> indelLengthStratifier, "The number of bases in an indel");
NS_IN_READ(() -> nsInReadStratifier, "The number of Ns in the read.");

private final String docString;

Expand Down Expand Up @@ -1116,26 +1023,6 @@ private static String stratifyReadGroup(final SAMRecord sam) {
return sam.getReadGroup().getReadGroupId();
}

/**
 * Computes the stratification value for the indel length at the given position.
 * Non-indel bases stratify as 0; a position with no covering CIGAR element stratifies as null.
 *
 * @throws IllegalStateException if the CIGAR element found does not match the requested operation
 */
private static Integer stratifyIndelLength(final RecordAndOffset recordAndOffset, final SAMLocusAndReference locusInfo, final CollectSamErrorMetrics.BaseOperation operation) {
    final boolean isInsertion = operation == CollectSamErrorMetrics.BaseOperation.Insertion;
    final boolean isDeletion = operation == CollectSamErrorMetrics.BaseOperation.Deletion;

    // Bases that are neither inserted nor deleted stratify as a length of 0.
    if (!isInsertion && !isDeletion) {
        return 0;
    }

    final CigarElement indelElement = getIndelElement(recordAndOffset, operation);
    if (indelElement == null) {
        // No CIGAR operation covers the given position.
        return null;
    }

    // Sanity check: the element found must agree with the requested operation.
    final CigarOperator expectedOperator = isInsertion ? CigarOperator.I : CigarOperator.D;
    if (indelElement.getOperator() != expectedOperator) {
        throw new IllegalStateException("Wrong CIGAR operator for the given position.");
    }
    return indelElement.getLength();
}

public static CigarElement getIndelElement(final RecordAndOffset recordAndOffset, final CollectSamErrorMetrics.BaseOperation operation) {
final SAMRecord record = recordAndOffset.getRecord();
final int offset = recordAndOffset.getOffset();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -599,26 +599,26 @@ public Object[][] provideForTestIndelErrors() {
{ new String[] { "20M2D2I" }, ".indel_error_by_all", new IndelErrorMetric("all", 22, 1, 2, 1,2) },
{ new String[] { "20M2I2D" }, ".indel_error_by_all", new IndelErrorMetric("all", 22, 1, 2, 1,2) },

// indel_length:
// insertions
{ new String[] { "50M1I50M" }, ".indel_error_by_indel_length", new IndelErrorMetric("1", 1, 1, 1, 0, 0) },
{ new String[] { "50M2I50M" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 2, 1, 2, 0, 0) },
{ new String[] { "20M2I20M2I20M" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 4, 2, 4, 0, 0) },
{ new String[] { "1I10M" }, ".indel_error_by_indel_length", new IndelErrorMetric("1", 1, 1, 1, 0, 0) },
{ new String[] { "10M1I" }, ".indel_error_by_indel_length", new IndelErrorMetric("1", 1, 1, 1, 0, 0) },
// deletions
{ new String[] { "50M1D50M" }, ".indel_error_by_indel_length", new IndelErrorMetric("1", 0, 0, 0, 1, 1) },
{ new String[] { "50M2D50M" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 0, 0, 0, 1, 2) },
{ new String[] { "20M2D20M2D20M" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 0, 0, 0, 2, 4) },
{ new String[] { "1D10M" }, ".indel_error_by_indel_length", new IndelErrorMetric("1", 0, 0, 0, 1, 1) },
{ new String[] { "10M1D" }, ".indel_error_by_indel_length", new IndelErrorMetric("1", 0, 0, 0, 1, 1) },
// insertions & deletions
{ new String[] { "20M2I20M3D20M" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 2, 1, 2, 0, 0) },
{ new String[] { "20M2I20M3D20M" }, ".indel_error_by_indel_length", new IndelErrorMetric("3", 0, 0, 0, 1, 3) },
{ new String[] { "2I2D20M" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 2, 1, 2,1, 2) },
{ new String[] { "2D2I20M" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 2, 1, 2, 1, 2) },
{ new String[] { "2M2D2I" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 2, 1, 2, 1, 2) },
{ new String[] { "20M2I2D" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 2, 1, 2, 1, 2) },
// // indel_length:
// // insertions
// { new String[] { "50M1I50M" }, ".indel_error_by_indel_length", new IndelErrorMetric("1", 1, 1, 1, 0, 0) },
// { new String[] { "50M2I50M" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 2, 1, 2, 0, 0) },
// { new String[] { "20M2I20M2I20M" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 4, 2, 4, 0, 0) },
// { new String[] { "1I10M" }, ".indel_error_by_indel_length", new IndelErrorMetric("1", 1, 1, 1, 0, 0) },
// { new String[] { "10M1I" }, ".indel_error_by_indel_length", new IndelErrorMetric("1", 1, 1, 1, 0, 0) },
// // deletions
// { new String[] { "50M1D50M" }, ".indel_error_by_indel_length", new IndelErrorMetric("1", 0, 0, 0, 1, 1) },
// { new String[] { "50M2D50M" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 0, 0, 0, 1, 2) },
// { new String[] { "20M2D20M2D20M" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 0, 0, 0, 2, 4) },
// { new String[] { "1D10M" }, ".indel_error_by_indel_length", new IndelErrorMetric("1", 0, 0, 0, 1, 1) },
// { new String[] { "10M1D" }, ".indel_error_by_indel_length", new IndelErrorMetric("1", 0, 0, 0, 1, 1) },
// // insertions & deletions
// { new String[] { "20M2I20M3D20M" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 2, 1, 2, 0, 0) },
// { new String[] { "20M2I20M3D20M" }, ".indel_error_by_indel_length", new IndelErrorMetric("3", 0, 0, 0, 1, 3) },
// { new String[] { "2I2D20M" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 2, 1, 2,1, 2) },
// { new String[] { "2D2I20M" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 2, 1, 2, 1, 2) },
// { new String[] { "2M2D2I" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 2, 1, 2, 1, 2) },
// { new String[] { "20M2I2D" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 2, 1, 2, 1, 2) },
};
}

Expand Down Expand Up @@ -664,7 +664,7 @@ public void testIndelErrors(final String[] readCigars, final String errorSubscri
"OUTPUT=" + outputBaseFileName,
"REFERENCE_SEQUENCE=" + referenceFile.getAbsolutePath(),
"ERROR_METRICS=" + "INDEL_ERROR",
"ERROR_METRICS=" + "INDEL_ERROR:INDEL_LENGTH",
// "ERROR_METRICS=" + "INDEL_ERROR:INDEL_LENGTH",
"VCF=" + vcf.getAbsolutePath()
};

Expand Down

0 comments on commit cdf839b

Please sign in to comment.