diff --git a/src/main/java/picard/sam/SamErrorMetric/CollectSamErrorMetrics.java b/src/main/java/picard/sam/SamErrorMetric/CollectSamErrorMetrics.java index 7a16f70ee3..c412e6ee07 100644 --- a/src/main/java/picard/sam/SamErrorMetric/CollectSamErrorMetrics.java +++ b/src/main/java/picard/sam/SamErrorMetric/CollectSamErrorMetrics.java @@ -99,9 +99,6 @@ public enum BaseOperation { Deletion, } - private static final int MAX_DIRECTIVES = ReadBaseStratification.Stratifier.values().length + 1; - private static final Log log = Log.getInstance(CollectSamErrorMetrics.class); - // ===================================================================== @Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "Input SAM or BAM file.") @@ -143,8 +140,7 @@ public enum BaseOperation { "OVERLAPPING_ERROR:READ_ORDINALITY:CYCLE", "OVERLAPPING_ERROR:READ_ORDINALITY:HOMOPOLYMER", "OVERLAPPING_ERROR:READ_ORDINALITY:GC_CONTENT", - "INDEL_ERROR", - "INDEL_ERROR:INDEL_LENGTH" + "INDEL_ERROR" ); @Argument(doc = "A fake argument used to show the options of ERROR (in ERROR_METRICS).", optional = true) @@ -179,15 +175,6 @@ public enum BaseOperation { @Argument(shortName = "P", doc = "The probability of selecting a locus for analysis (for downsampling).", optional = true) public double PROBABILITY = 1; - @Argument( - fullName = "progressStepInterval", - doc = "The interval between which progress will be displayed.", - optional = true - ) - public int progressStepInterval = 100000; - - // ===================================================================== - @Override protected boolean requiresReference() { return true; @@ -239,12 +226,15 @@ protected String[] customCommandLineValidation() { } } + private static final int MAX_DIRECTIVES = ReadBaseStratification.Stratifier.values().length + 1; + private static final Log log = Log.getInstance(CollectSamErrorMetrics.class); + @Override protected int doWork() { final Random random = new Random(42); - final ProgressLogger progressLogger = new ProgressLogger(log, progressStepInterval); + final ProgressLogger progressLogger = new ProgressLogger(log, 100000); long nTotalLoci = 0; long nSkippedLoci = 0; long nProcessedLoci = 0; @@ -255,19 +245,12 @@ protected int doWork() { final Collection aggregatorList = getAggregatorList(); // Open up the input resources: try ( - final SamReader sam = SamReaderFactory.makeDefault() - .referenceSequence(REFERENCE_SEQUENCE) - .setOption(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, true) - .open(INPUT); + final SamReader sam = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT); final ReferenceSequenceFileWalker referenceSequenceFileWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE); - final VCFFileReader vcfFileReader = new VCFFileReader(VCF, true) + final PeekableIterator vcfIterator = new PeekableIterator<>( + VCF == null ? Collections.emptyIterator() : new VCFFileReader(VCF, true).iterator()) ) { - // Make sure we can query our file: - if (!vcfFileReader.isQueryable()) { - throw new PicardException("Cannot query VCF File! VCF Files must be queryable!"); - } - final SAMSequenceDictionary sequenceDictionary = referenceSequenceFileWalker.getSequenceDictionary(); if (sam.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) { throw new PicardException("Input BAM must be sorted by coordinate"); @@ -311,8 +294,8 @@ protected int doWork() { nTotalLoci++; // while there is a next (non-filtered) variant and it is before the locus, advance the pointer. - if (checkLocus(vcfFileReader, info.getLocus())) { - log.debug("Locus does not overlap any variants: " + info.getLocus(), sequenceDictionary); + if (advanceIteratorAndCheckLocus(vcfIterator, info.getLocus(), sequenceDictionary)) { + log.debug(String.format("Skipping locus from VCF: %s", vcfIterator.peek().toStringWithoutGenotypes())); nSkippedLoci++; continue; } @@ -360,41 +343,21 @@ private void writeMetricsFileForAggregator(final BaseErrorAggregation locusAggre } /** - * If there's a variant at the locus return true, otherwise false. + * Advance the iterator until at or ahead of locus. if there's a variant at the locus return true, otherwise false. *

- * HAS SIDE EFFECTS!!! Queries the vcfFileReader + * HAS SIDE EFFECTS!!! draws from vcfIterator!!! * - * @param vcfFileReader a {@link VCFFileReader} to query for the given locus - * @param locusInfo a {@link SamLocusIterator.LocusInfo} at which to examine the variants + * @param vcfIterator a {@link VariantContext} iterator to advance (assumed sorted) + * @param locus a {@link Locus} at which to examine the variants + * @param sequenceDictionary a dictionary with which to compare the Locatable to the Locus... * @return true if there's a variant over the locus, false otherwise. */ - private static boolean checkLocus(final VCFFileReader vcfFileReader, final SamLocusIterator.LocusInfo locusInfo) { - - boolean overlaps = false; - - if (locusInfo != null) { - - try (final CloseableIterator vcfIterator = vcfFileReader.query(locusInfo)) { - - overlaps = true; - - boolean allFiltered = true; - - while (vcfIterator.hasNext()) { - final VariantContext vcf = vcfIterator.next(); - if (vcf.isFiltered()) { - continue; - } - allFiltered = false; - } - - if (allFiltered) { - overlaps = false; - } - } + private static boolean advanceIteratorAndCheckLocus(final PeekableIterator vcfIterator, final Locus locus, final SAMSequenceDictionary sequenceDictionary) { + while (vcfIterator.hasNext() && (vcfIterator.peek().isFiltered() || + CompareVariantContextToLocus(sequenceDictionary, vcfIterator.peek(), locus) < 0)) { + vcfIterator.next(); } - - return overlaps; + return vcfIterator.hasNext() && CompareVariantContextToLocus(sequenceDictionary, vcfIterator.peek(), locus) == 0; } /** @@ -531,6 +494,34 @@ private IntervalList getIntervals(final SAMSequenceDictionary sequenceDictionary return regionOfInterest; } + /** + * Compares a VariantContext to a Locus providing information regarding possible overlap, or relative location + * + * @param dictionary The {@link SAMSequenceDictionary} to use for ordering the sequences + * @param variantContext the {@link VariantContext} to compare + * @param locus the {@link Locus} to compare + * @return negative if variantContext comes before locus (with no overlap) + * zero if variantContext and locus overlap + * positive if variantContext comes after locus (with no overlap) + *

+ * if variantContext and locus are in the same contig the return value will be the number of bases apart they are, + * otherwise it will be MIN_INT/MAX_INT + */ + public static int CompareVariantContextToLocus(final SAMSequenceDictionary dictionary, final VariantContext variantContext, final Locus locus) { + + final int indexDiff = dictionary.getSequenceIndex(variantContext.getContig()) - locus.getSequenceIndex(); + if (indexDiff != 0) { + return indexDiff < 0 ? Integer.MIN_VALUE : Integer.MAX_VALUE; + } + + //same SequenceIndex, can compare by genomic position + if (locus.getPosition() < variantContext.getStart()) + return variantContext.getStart() - locus.getPosition(); + if (locus.getPosition() > variantContext.getEnd()) + return variantContext.getEnd() - locus.getPosition(); + return 0; + } + /** * Parses a "Directive" of the form "ERROR,STRATIFIER,STRATIFIER...etc." into a {@link BaseErrorAggregation} consisting of the appropriate * {@link BaseCalculator} and the {@link ReadBaseStratification.CollectionStratifier CollectionStratifier} constructed from the various diff --git a/src/main/java/picard/sam/SamErrorMetric/ReadBaseStratification.java b/src/main/java/picard/sam/SamErrorMetric/ReadBaseStratification.java index a0e5590ee7..cacf362931 100644 --- a/src/main/java/picard/sam/SamErrorMetric/ReadBaseStratification.java +++ b/src/main/java/picard/sam/SamErrorMetric/ReadBaseStratification.java @@ -39,7 +39,6 @@ import java.util.Collection; import java.util.LinkedList; -import java.util.List; import java.util.concurrent.ExecutionException; import java.util.function.BiFunction; import java.util.function.Function; @@ -484,74 +483,6 @@ public String getSuffix() { } } - /** - * Stratifies according to the number of matching cigar operators (from CIGAR string) that the read has. - */ - public static class CigarOperatorsInReadStratifier extends RecordStratifier { - - private CigarOperator operator; - - public CigarOperatorsInReadStratifier(final CigarOperator op) { - operator = op; - } - - @Override - public Integer stratify(final SAMRecord samRecord) { - try { - return samRecord.getCigar().getCigarElements().stream() - .filter(ce -> ce.getOperator().equals(operator)) - .mapToInt(CigarElement::getLength) - .sum(); - } catch (final Exception ex) { - return null; - } - } - - @Override - public String getSuffix() { - return "cigar_elements_" + operator.name() + "_in_read"; - } - } - - /** - * Stratifies according to the number of indel bases (from CIGAR string) that the read has. - */ - public static class IndelsInReadStratifier extends RecordStratifier { - - @Override - public Integer stratify(final SAMRecord samRecord) { - try { - return samRecord.getCigar().getCigarElements().stream() - .filter(ce -> (ce.getOperator().equals(CigarOperator.I) || ce.getOperator().equals(CigarOperator.D))) - .mapToInt(CigarElement::getLength) - .sum(); - } catch (final Exception ex) { - return null; - } - } - - @Override - public String getSuffix() { - return "indels_in_read"; - } - } - - /** - * Stratifies according to the length of an insertion or deletion. - */ - public static class IndelLengthStratifier implements RecordAndOffsetStratifier { - - @Override - public Integer stratify(final RecordAndOffset recordAndOffset, final SAMLocusAndReference locusInfo, final CollectSamErrorMetrics.BaseOperation operation) { - return stratifyIndelLength(recordAndOffset, locusInfo, operation); - } - - @Override - public String getSuffix() { - return "indel_length"; - } - } - /* ************ Instances of stratifiers so that they do not need to be allocated every time ****************/ @@ -716,26 +647,6 @@ public String getSuffix() { */ public static final NsInReadStratifier nsInReadStratifier = new NsInReadStratifier(); - /** - * Stratify by Insertions in the read cigars. - */ - public static final CigarOperatorsInReadStratifier insertionsInReadStratifier = new CigarOperatorsInReadStratifier(CigarOperator.I); - - /** - * Stratify by Deletions in the read cigars. - */ - public static final CigarOperatorsInReadStratifier deletionsInReadStratifier = new CigarOperatorsInReadStratifier(CigarOperator.D); - - /** - * Stratify by Indels in the read cigars. - */ - public static final IndelsInReadStratifier indelsInReadStratifier = new IndelsInReadStratifier(); - - /** - * Stratifies into the number of bases in an insertion - */ - public static final IndelLengthStratifier indelLengthStratifier = new IndelLengthStratifier(); - /* *************** enums **************/ /** @@ -774,11 +685,7 @@ enum Stratifier implements CommandLineParser.ClpEnum { ONE_BASE_PADDED_CONTEXT(() -> oneBasePaddedContextStratifier, "The current reference base and a one base padded region from the read resulting in a 3-base context."), TWO_BASE_PADDED_CONTEXT(() -> twoBasePaddedContextStratifier, "The current reference base and a two base padded region from the read resulting in a 5-base context."), CONSENSUS(() -> consensusStratifier, "Whether or not duplicate reads were used to form a consensus read. This stratifier makes use of the aD, bD, and cD tags for duplex consensus reads. If the reads are single index consensus, only the cD tags are used."), - NS_IN_READ(() -> nsInReadStratifier, "The number of Ns in the read."), - INSERTIONS_IN_READ(() -> insertionsInReadStratifier, "The number of Insertions in the read cigar."), - DELETIONS_IN_READ(() -> deletionsInReadStratifier, "The number of Deletions in the read cigar."), - INDELS_IN_READ(() -> indelsInReadStratifier, "The number of INDELs in the read cigar."), - INDEL_LENGTH(() -> indelLengthStratifier, "The number of bases in an indel"); + NS_IN_READ(() -> nsInReadStratifier, "The number of Ns in the read."); private final String docString; @@ -1116,26 +1023,6 @@ private static String stratifyReadGroup(final SAMRecord sam) { return sam.getReadGroup().getReadGroupId(); } - private static Integer stratifyIndelLength(final RecordAndOffset recordAndOffset, final SAMLocusAndReference locusInfo, final CollectSamErrorMetrics.BaseOperation operation) { - // If the base is not an indel, stratify it as a length of 0 - if (operation != CollectSamErrorMetrics.BaseOperation.Insertion && operation != CollectSamErrorMetrics.BaseOperation.Deletion) { - return 0; - } - - final CigarElement cigarElement = getIndelElement(recordAndOffset, operation); - if (cigarElement == null) { - // No CIGAR operation for given position. - return null; - } - if (operation == CollectSamErrorMetrics.BaseOperation.Insertion && cigarElement.getOperator() != CigarOperator.I) { - throw new IllegalStateException("Wrong CIGAR operator for the given position."); - } - if (operation == CollectSamErrorMetrics.BaseOperation.Deletion && cigarElement.getOperator() != CigarOperator.D) { - throw new IllegalStateException("Wrong CIGAR operator for the given position."); - } - return cigarElement.getLength(); - } - public static CigarElement getIndelElement(final RecordAndOffset recordAndOffset, final CollectSamErrorMetrics.BaseOperation operation) { final SAMRecord record = recordAndOffset.getRecord(); final int offset = recordAndOffset.getOffset(); diff --git a/src/test/java/picard/sam/SamErrorMetric/CollectSamErrorMetricsTest.java b/src/test/java/picard/sam/SamErrorMetric/CollectSamErrorMetricsTest.java index 26b5173b1c..98ab31a507 100644 --- a/src/test/java/picard/sam/SamErrorMetric/CollectSamErrorMetricsTest.java +++ b/src/test/java/picard/sam/SamErrorMetric/CollectSamErrorMetricsTest.java @@ -599,26 +599,26 @@ public Object[][] provideForTestIndelErrors() { { new String[] { "20M2D2I" }, ".indel_error_by_all", new IndelErrorMetric("all", 22, 1, 2, 1,2) }, { new String[] { "20M2I2D" }, ".indel_error_by_all", new IndelErrorMetric("all", 22, 1, 2, 1,2) }, - // indel_length: - // insertions - { new String[] { "50M1I50M" }, ".indel_error_by_indel_length", new IndelErrorMetric("1", 1, 1, 1, 0, 0) }, - { new String[] { "50M2I50M" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 2, 1, 2, 0, 0) }, - { new String[] { "20M2I20M2I20M" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 4, 2, 4, 0, 0) }, - { new String[] { "1I10M" }, ".indel_error_by_indel_length", new IndelErrorMetric("1", 1, 1, 1, 0, 0) }, - { new String[] { "10M1I" }, ".indel_error_by_indel_length", new IndelErrorMetric("1", 1, 1, 1, 0, 0) }, - // deletions - { new String[] { "50M1D50M" }, ".indel_error_by_indel_length", new IndelErrorMetric("1", 0, 0, 0, 1, 1) }, - { new String[] { "50M2D50M" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 0, 0, 0, 1, 2) }, - { new String[] { "20M2D20M2D20M" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 0, 0, 0, 2, 4) }, - { new String[] { "1D10M" }, ".indel_error_by_indel_length", new IndelErrorMetric("1", 0, 0, 0, 1, 1) }, - { new String[] { "10M1D" }, ".indel_error_by_indel_length", new IndelErrorMetric("1", 0, 0, 0, 1, 1) }, - // insertions & deletions - { new String[] { "20M2I20M3D20M" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 2, 1, 2, 0, 0) }, - { new String[] { "20M2I20M3D20M" }, ".indel_error_by_indel_length", new IndelErrorMetric("3", 0, 0, 0, 1, 3) }, - { new String[] { "2I2D20M" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 2, 1, 2,1, 2) }, - { new String[] { "2D2I20M" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 2, 1, 2, 1, 2) }, - { new String[] { "2M2D2I" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 2, 1, 2, 1, 2) }, - { new String[] { "20M2I2D" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 2, 1, 2, 1, 2) }, +// // indel_length: +// // insertions +// { new String[] { "50M1I50M" }, ".indel_error_by_indel_length", new IndelErrorMetric("1", 1, 1, 1, 0, 0) }, +// { new String[] { "50M2I50M" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 2, 1, 2, 0, 0) }, +// { new String[] { "20M2I20M2I20M" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 4, 2, 4, 0, 0) }, +// { new String[] { "1I10M" }, ".indel_error_by_indel_length", new IndelErrorMetric("1", 1, 1, 1, 0, 0) }, +// { new String[] { "10M1I" }, ".indel_error_by_indel_length", new IndelErrorMetric("1", 1, 1, 1, 0, 0) }, +// // deletions +// { new String[] { "50M1D50M" }, ".indel_error_by_indel_length", new IndelErrorMetric("1", 0, 0, 0, 1, 1) }, +// { new String[] { "50M2D50M" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 0, 0, 0, 1, 2) }, +// { new String[] { "20M2D20M2D20M" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 0, 0, 0, 2, 4) }, +// { new String[] { "1D10M" }, ".indel_error_by_indel_length", new IndelErrorMetric("1", 0, 0, 0, 1, 1) }, +// { new String[] { "10M1D" }, ".indel_error_by_indel_length", new IndelErrorMetric("1", 0, 0, 0, 1, 1) }, +// // insertions & deletions +// { new String[] { "20M2I20M3D20M" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 2, 1, 2, 0, 0) }, +// { new String[] { "20M2I20M3D20M" }, ".indel_error_by_indel_length", new IndelErrorMetric("3", 0, 0, 0, 1, 3) }, +// { new String[] { "2I2D20M" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 2, 1, 2,1, 2) }, +// { new String[] { "2D2I20M" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 2, 1, 2, 1, 2) }, +// { new String[] { "2M2D2I" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 2, 1, 2, 1, 2) }, +// { new String[] { "20M2I2D" }, ".indel_error_by_indel_length", new IndelErrorMetric("2", 2, 1, 2, 1, 2) }, }; } @@ -664,7 +664,7 @@ public void testIndelErrors(final String[] readCigars, final String errorSubscri "OUTPUT=" + outputBaseFileName, "REFERENCE_SEQUENCE=" + referenceFile.getAbsolutePath(), "ERROR_METRICS=" + "INDEL_ERROR", - "ERROR_METRICS=" + "INDEL_ERROR:INDEL_LENGTH", +// "ERROR_METRICS=" + "INDEL_ERROR:INDEL_LENGTH", "VCF=" + vcf.getAbsolutePath() };