diff --git a/build.gradle b/build.gradle index 7562a0aff5..eaed5d5be0 100644 --- a/build.gradle +++ b/build.gradle @@ -31,6 +31,7 @@ jacocoTestReport { } dependencies { + compile "org.apache.commons:commons-lang3:3.9" compile "org.apache.commons:commons-jexl:2.1.1" compile "commons-logging:commons-logging:1.1.1" compile "org.xerial.snappy:snappy-java:1.1.4" diff --git a/src/main/java/htsjdk/samtools/util/SequenceUtil.java b/src/main/java/htsjdk/samtools/util/SequenceUtil.java index 7de90cc938..b2e754de86 100644 --- a/src/main/java/htsjdk/samtools/util/SequenceUtil.java +++ b/src/main/java/htsjdk/samtools/util/SequenceUtil.java @@ -26,6 +26,7 @@ import htsjdk.samtools.*; import htsjdk.samtools.fastq.FastqConstants; import htsjdk.utils.ValidationUtils; +import org.apache.commons.lang3.ArrayUtils; import java.io.File; import java.math.BigInteger; @@ -34,6 +35,7 @@ import java.util.Arrays; import java.util.LinkedList; import java.util.List; +import java.util.Objects; import java.util.Random; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -130,6 +132,113 @@ public static boolean basesEqual(final byte lhs, final byte rhs) { return (bases[lhs] == bases[rhs]); } + /** + * Compares two bases. + *
It returns 0 if both bases represent the same nucleotide or combination of if ambiguous.
+ *Otherwise + * it returns -1 or 1 depending on what bases each represents. More concretely if 'A', 'C', 'G' and 'T' are equivalent to 1, 2, 4, 8 then + * the sortable unsigned value associated to a valid IUPAC code is the one corresponding to the sum of each basic + * nucleotide value above across al the nucleotides the code represents. + *
+ *+ * codes with a lower value precede values with larger value. + *
+ * @param lhs the "left" base to compare. + * @param rhs the "right" base to compare. + * @return + */ + public static int compareBases(final byte lhs, final byte rhs) { + if (lhs < 0 || lhs >= BASES_ARRAY_LENGTH) { + throw new UnsupportedOperationException("bad base code: " + rhs); + } else if (rhs < 0 || rhs >= BASES_ARRAY_LENGTH) { + throw new UnsupportedOperationException("bad base code: " + rhs); + } else { + return Byte.compare(bases[lhs], bases[rhs]); + } + } + + /** + * Compares two base sequences. + * Case are ignored so that "AaTtcCGg" == "AAttCcgG". + *+ * presence of non-base values would result returning false, even if the valid bases are + * the same. + *
+ * + * @param lhs first base sequence to compare. + * @param rhs second base sequence to compare. + * @return + */ + public static boolean equals(final byte[] lhs, final byte[] rhs) { + if (lhs.length != rhs.length) { + return false; + } else { + for (int i = 0; i < lhs.length; i++) { + final byte l = lhs[i]; + if (l < 0 || l >= BASES_ARRAY_LENGTH) { + return false; + } + final byte r = rhs[i]; + if (r < 0 || r >= BASES_ARRAY_LENGTH) { + return false; + } else if (bases[r] != bases[l]) { + return false; + } + } + return true; + } + } + + /** + * Calculates a hash-code making sure that it would return the same value for sequences + * that only differ in case. Also differences in non-valid IUPAC codes are also ignored. + *+ * The result of this method is consistent with {@link #equals(byte[], byte[])} so that: + *
+ * equals(X, Y) --> hashCode(X) == hashCode(Y)
+ *
+ * The result of this method is consistent with {@link #equals(byte[], byte[])} so that: + *
+ * equals(X, Y) --> hashCode(X) == hashCode(Y)
+ *
+ * The input list is not modified. + *
+ * @param list the list to copy and sort. + * @param comparator the comparator to use to sort elements. + * @param- * Types of alleles: - *
- *- Ref: a t C g a // C is the reference base - : a t G g a // C base is a G in some individuals - : a t - g a // C base is deleted w.r.t. the reference - : a t CAg a // A base is inserted w.r.t. the reference sequence -- *
In these cases, where are the alleles?
- *- * Suppose I see a the following in the population: - *
- *- Ref: a t C g a // C is the reference base - : a t G g a // C base is a G in some individuals - : a t - g a // C base is deleted w.r.t. the reference -+ *
+ * Any valid sequence of bases is considered to encode a inline allele. + * The valid bases are all 4 standard nucleotide bases {@code "A", "C", "G"} + * and {@code "T"} plus the ambiguous {@code "N"}. Any other ambiguous code + * IUPAC (e.g. {@code "Y", "W", "M", "X"} ...) as well as special characters + * used in nucleotide alignments such as {@code '-' '.' '*'} are not allowed + * in inline alleles; these are in fact found in the encoding or + * symbolic and special alleles. + *
+ *+ * You can use lower or upper case character for the base sequences. The + * case is ignore in all operations and it might not be preserved if the + * allele is re-encoded. Therefore when comparing base sequences + * {@code "aCgT"} would be considered equivalent to {@code "AcGt", "ACGT"} + * or {@code "acgt"}. + *
+ *+ * Whether an allele represent an SNP or a (short) insertion or deletion + * will depend on how it compares against the reference allele, in case + * it is an alternative allele, or how it compares to every alternative + * allele if it is reference. + *
+ *+ * Examples:
+ * #CHROM POS ID REF ALT ... + * 1 4567321 SNP_1 T A + * 1 4735812 INS_2 G GACT + * 2 878931 DEL_3 TAC T + * 2 256002131 CPX_4 ATA C,A,CTA,ATATA,CTATA + *+ * + *
+ * First entry above would represent a SNP or mutation from {@code "T" --> "A"}, + * the second one is a insertion of three bases {@code "ACT"}, the third is a + * deletion of two bases {@code "CA"} and the last one represents a complex variant, + * a combination of a SNP {@code "A" --> "C"} together with several alternative + * number of repetitions o a two bases unit {@code "TA"}. + *
+ *Symbolic alleles are further divided in simple (or plain) + * symbolic alleles, breakends, assembly contig insertions and + * the special unspecified alternative allele.
+ *+ *
+ * These are encoded with their ID within angled brackets ({@code <>}):
+ * {@code "", "", "
+ * Examples: + *
+ * #CHROM POS ID REF ALT ... + * 10 10000101 BND_1 T T. + * 22 20001111 BND_2 A .A + * 23 99999 BND_3 A C. + * 23 199999 BND_4 GCG G,GCG. + *+ * + *
+ * In this case {@code BND_1} stands for an allele that in the + * result of branching off from the reference right after the current + * reference position. The adjacent DNA would branch off + * the current position in the reference. We represent this type of + * breakend ends programmatically as {@link BreakendType#SINGLE_FORK}. + *
+ *+ * In contrast {@code BND_2} indicates that the adjacent DNA joints + * the reference right before the current position. The breakend type in + * this cale is {@link BreakendType#SINGLE_JOIN}. + *
+ *The other two examples, {@code "C."} in addition to branching + * off breakend there is also SNP at that position from {@code "T" to "G"} ({@code BND_3}) + * and a close by deletion o f two bases ({@code BND_4}). + *
+ *- * How do I represent this? There are three segregating alleles: + * Examples:
+ * #CHROM POS ID REF ALT ... + * 10 10000101 BND_1 T T[5:1000[ + * 12 30012 BND_1_1 A A[* - *:5123[ + * 22 20001111 BND_2 A [7:2000000[A + * 23 99999 BND_3 A A]3:20133] + * 23 199999 BND_4 G ]5:1020121]G + *
- * { C , G , - } - *- *
and these are represented as:
- *- * { tC, tG, t } - *- *
- * Now suppose I have this more complex example: -
-- Ref: a t C g a // C is the reference base - : a t - g a - : a t - - a - : a t CAg a -*
- * There are actually four segregating alleles: + * The location of the distant adjacent sequence is indicates between + * the square brackets using the usual compact genomic location format {@code "ctg-id:pos"}. + * If the contig name is surrounded with angle brackets ({@code <>}) (e.g. {@code BND_1_1}), instead of the + * reference the distant sequence is located in a sequence in the adjoined assembly files. *
- *- * { Cg , -g, --, and CAg } over bases 2-4 - *- *
represented as:
- *- * { tCg, tg, t, tCAg } - *- *
- * Critically, it should be possible to apply an allele to a reference sequence to create the - * correct haplotype sequence:
- *- * Allele + reference => haplotype - *- *
- * For convenience, we are going to create Alleles where the GenomeLoc of the allele is stored outside of the - * Allele object itself. So there's an idea of an A/C polymorphism independent of it's surrounding context. + *
+ * When the reference aligned base starts + * the encoding ({@code BND_1} and {@code BND_3}) it indicates that + * the haplotype starts up-stream in the reference (always on the forward strand) and the is followed by + * the distant adjacent sequence.
+ *In contrasts, when the bases are at the end of the encoding the haplotype + * would start on the distant DNA and join at that position of the reference and continue downstream on + * the forward strand.
+ *The direction on the distant sequence is determined by the orientation of the + * brackets: {@code [...[} right from and {@code ]...]} left from the mate breakpoint.
+ *However the strand is a bit more tricky; when the bracket point away from the base then the + * haplotype goes along the forward strand on the distant DNA sequence ({@code BND_1, BND_1_1} and {@code BND_4}), whereas + * when they point towards that base, it means that the haplotype would run along the reverse (complement) strand.
+ *For further details see {@link BreakendType} documentation
* - * Given list of alleles it's possible to determine the "type" of the variation - -- A / C @ loc => SNP - - / A => INDEL -+ *
- * If you know where allele is the reference, you can determine whether the variant is an insertion or deletion. + * Examples:
+ * #CHROM POS ID REF ALT ... + * 10 10000101 BND_1 T T* *+ *
- * Alelle also supports is concept of a NO_CALL allele. This Allele represents a haplotype that couldn't be - * determined. This is usually represented by a '.' allele. + * The assembly contig id appears between angle brackets ({@code <>}) + * after the base aligned against the reference. + *
+ *Used in genotypes to indicate the lack of a call for a concrete allele.
+ *It is encoded with a single period {@code "."} are is not allowed in the {@code REF} or + * {@code ALT} column of the .vcf; a lonely {@code "."} on those column means that that column + * for that variant record is empty and has to been given a value of any kind
+ *This special allele encoded as a asterisk without any brackets or {@code "*"} indicates that for some + * samples that variant may have a lower ploidy due to a spanning larger deletion.
+ *This special type represents an unknown or unobserved alternative allele + * and it is often used to describe the uncertainty or confidence on the lack + * of variation at that site. *
*
- * Note that Alleles store all bases as bytes, in **UPPER CASE**. So 'atc' == 'ATC' from the perspective of an
- * Allele.
+ * We currently support two version of this allele, {@link #UNSPECIFIED_ALT} encoded as {@code "<*>"},
+ * and {@link #NON_REF} encoded as {@code "
+ * The resulting exception won't be a reference, thus it would be an alternative if it applies. + *
+ * @param base the byte encoding the allele. + * @return never {@code null}. + * @throws AlleleEncodingException if the byte provide is not a valid allele encoding. + */ + static Allele create(final byte base) { + return AlleleUtils.decode(base, false); + } /** - * Create a new Allele that includes bases and if tagged as the reference allele if isRef == true. If bases - * == '-', a Null allele is created. If bases == '.', a no call Allele is created. If bases == '*', a spanning deletions Allele is created. + * Creates a non-Ref allele. @see Allele(byte[], boolean) for full information * - * @param bases the DNA sequence of this variation, '-', '.', or '*' - * @param isRef should we make this a reference allele? - * @throws IllegalArgumentException if bases contains illegal characters or is otherwise malformated - */ - public static Allele create(final byte[] bases, final boolean isRef) { - if ( bases == null ) - throw new IllegalArgumentException("create: the Allele base string cannot be null; use new Allele() or new Allele(\"\") to create a Null allele"); - - if ( bases.length == 1 ) { - // optimization to return a static constant Allele for each single base object - switch (bases[0]) { - case '.': - if ( isRef ) throw new IllegalArgumentException("Cannot tag a NoCall allele as the reference allele"); - return NO_CALL; - case '*': - if ( isRef ) throw new IllegalArgumentException("Cannot tag a spanning deletions allele as the reference allele"); - return SPAN_DEL; - case 'A': case 'a' : return isRef ? REF_A : ALT_A; - case 'C': case 'c' : return isRef ? REF_C : ALT_C; - case 'G': case 'g' : return isRef ? REF_G : ALT_G; - case 'T': case 't' : return isRef ? REF_T : ALT_T; - case 'N': case 'n' : return isRef ? REF_N : ALT_N; - default: throw new IllegalArgumentException("Illegal base [" + (char)bases[0] + "] seen in the allele"); - } - } else { - return new Allele(bases, isRef); - } + * @param bases bases representing an allele + */ + static Allele create(final CharSequence bases) { + return AlleleUtils.decode(bases, false); } - public static Allele create(final byte base, final boolean isRef) { - return create( new byte[]{ base }, isRef); + /** + * Creates a non-Ref allele. @see Allele(byte[], boolean) for full information + * + * @param bases bases representing an allele + */ + static Allele create(final byte[] bases) { + return AlleleUtils.decode(bases, false, false); } - public static Allele create(final byte base) { - return create( base, false ); + static Allele create(final byte[] bases, final boolean isReference) { + return AlleleUtils.decode(bases, isReference, false); } - public static Allele extend(final Allele left, final byte[] right) { - if (left.isSymbolic()) - throw new IllegalArgumentException("Cannot extend a symbolic allele"); - byte[] bases = new byte[left.length() + right.length]; - System.arraycopy(left.getBases(), 0, bases, 0, left.length()); - System.arraycopy(right, 0, bases, left.length(), right.length); - - return create(bases, left.isReference()); + /** + * @deprecated use {@link #extend(byte[])} on the instance to extend directly. + * @see #extend(byte[]). + */ + @Deprecated + static Allele extend(final Allele toExtend, final byte[] tail) { + return toExtend.extend(tail); } /** - * @param bases bases representing an allele + * @param bases bases representing an allele * @return true if the bases represent the null allele + * @deprecated no clear substitute. */ - public static boolean wouldBeNullAllele(final byte[] bases) { + @Deprecated + static boolean wouldBeNullAllele(final byte[] bases) { return (bases.length == 1 && bases[0] == htsjdk.variant.vcf.VCFConstants.NULL_ALLELE) || bases.length == 0; } /** * @param bases bases representing an allele * @return true if the bases represent the SPAN_DEL allele + * @deprecated no clear substitute */ - public static boolean wouldBeStarAllele(final byte[] bases) { + @Deprecated + static boolean wouldBeStarAllele(final byte[] bases) { return bases.length == 1 && bases[0] == htsjdk.variant.vcf.VCFConstants.SPANNING_DELETION_ALLELE; } /** - * @param bases bases representing an allele + * @param bases bases representing an allele * @return true if the bases represent the NO_CALL allele + * @deprecated no clear substitute */ - public static boolean wouldBeNoCallAllele(final byte[] bases) { + @Deprecated + static boolean wouldBeNoCallAllele(final byte[] bases) { return bases.length == 1 && bases[0] == htsjdk.variant.vcf.VCFConstants.NO_CALL_ALLELE; } /** - * @param bases bases representing an allele + * @param bases bases representing an allele * @return true if the bases represent a symbolic allele, including breakpoints and breakends + * @deprecated simply try to create the allele, catch exception and check type. */ - public static boolean wouldBeSymbolicAllele(final byte[] bases) { - if ( bases.length <= 1 ) + @Deprecated + static boolean wouldBeSymbolicAllele(final byte[] bases) { + if (bases.length <= 1) return false; else { - return bases[0] == SYMBOLIC_ALLELE_START || bases[bases.length - 1] == SYMBOLIC_ALLELE_END || - wouldBeBreakpoint(bases) || - wouldBeSingleBreakend(bases); + return bases[0] == '<' || bases[bases.length - 1] == '>' || + wouldBeBreakpoint(bases) || wouldBeSingleBreakend(bases); } } /** - * @param bases bases representing an allele + * @param bases bases representing an allele * @return true if the bases represent a symbolic allele in breakpoint notation, (ex: G]17:198982] or ]13:123456]T ) + * @deprecated use {@link Breakend#looksLikeBreakend} */ - public static boolean wouldBeBreakpoint(final byte[] bases) { - if (bases.length <= 1) { - return false; - } - for (int i = 0; i < bases.length; i++) { - final byte base = bases[i]; - if (base == BREAKEND_EXTENDING_LEFT || base == BREAKEND_EXTENDING_RIGHT) { - return true; - } - } - return false; + @Deprecated + static boolean wouldBeBreakpoint(final byte[] bases) { + return Breakend.looksLikeBreakend(bases); } + /** - * @param bases bases representing an allele - * @return true if the bases represent a symbolic allele in single breakend notation (ex: .A or A. ) + * @deprecated */ - public static boolean wouldBeSingleBreakend(final byte[] bases) { - if ( bases.length <= 1 ) - return false; - else { - return bases[0] == SINGLE_BREAKEND_INDICATOR || bases[bases.length - 1] == SINGLE_BREAKEND_INDICATOR; - } + @Deprecated + static boolean wouldBeSingleBreakend(final byte[] bases) { + return Breakend.looksLikeBreakend(bases) && bases[0] == '.' || bases[bases.length - 1] == '.'; } /** - * @param bases bases representing a reference allele + * @param bases bases representing a reference allele * @return true if the bases represent the well formatted allele + * @deprecated consider just create the Allele and catching exceptions if you need to. */ - public static boolean acceptableAlleleBases(final String bases) { + @Deprecated + static boolean acceptableAlleleBases(final String bases) { return acceptableAlleleBases(bases.getBytes(), true); } /** - * @param bases bases representing an allele + * @param bases bases representing an allele * @param isReferenceAllele is a reference allele * @return true if the bases represent the well formatted allele + * @deprecated */ - public static boolean acceptableAlleleBases(final String bases, boolean isReferenceAllele) { + @Deprecated + static boolean acceptableAlleleBases(final String bases, boolean isReferenceAllele) { return acceptableAlleleBases(bases.getBytes(), isReferenceAllele); } /** - * @param bases bases representing a reference allele + * @param bases bases representing a reference allele * @return true if the bases represent the well formatted allele + * @deprecated consider alternatives. */ - public static boolean acceptableAlleleBases(final byte[] bases) { + @Deprecated + static boolean acceptableAlleleBases(final byte[] bases) { return acceptableAlleleBases(bases, true); } /** - * - * @param bases bases representing an allele + * @param bases bases representing an allele * @param isReferenceAllele true if a reference allele * @return true if the bases represent the well formatted allele + * @deprecated consider alternatives. */ - public static boolean acceptableAlleleBases(final byte[] bases, final boolean isReferenceAllele) { + @Deprecated + static boolean acceptableAlleleBases(final byte[] bases, final boolean isReferenceAllele) { if ( wouldBeNullAllele(bases) ) return false; @@ -382,7 +476,7 @@ public static boolean acceptableAlleleBases(final byte[] bases, final boolean is for (byte base : bases ) { switch (base) { - case 'A': case 'C': case 'G': case 'T': case 'a': case 'c': case 'g': case 't': case 'N' : case 'n' : + case 'A': case 'C': case 'G': case 'T': case 'a': case 'c': case 'g': case 't': case 'N': case 'n': break; default: return false; @@ -392,217 +486,541 @@ public static boolean acceptableAlleleBases(final byte[] bases, final boolean is return true; } + //////////////////////////////////////////////////////// + // Type enquiring methods: + /** - * @see #Allele(byte[], boolean) + * Checks whether this allele represents a called allele. + *+ * This method must return in all circumstances exactly the opposite to {@link #isCalled()}. + *
* - * @param bases bases representing an allele - * @param isRef is this the reference allele? + * is not the no-call allele. + *+ * This method must return exactly the opposite to {@link #isNoCall()}. + *
+ * @return true if this method is called. */ - public static Allele create(final String bases, final boolean isRef) { - return create(bases.getBytes(), isRef); - } + // Returns true if this is not the NO_CALL allele + boolean isCalled(); + /** + * Checks whether this allele is a breakend (e.g. {@code ".C", "T.", "A[13:400121[", "]+ * However, a {@code false} return does not exclude the possibility that in fact the containing variant-context + * is a structural variant. For example a regular inline base sequence allele are not considered structural by + * default but they may represent a relative large insertion/deletion that may be intrepetted as a structural variant. + *
+ *+ * If this method returns {@code true} then {@link #getStructuralVariantType()} must not return {@code null}. + * Likewise if this method returns {@code false} then {@link #getStructuralVariantType()} must return {@code null}. + *
+ * @return {@code true} for alleles that imply the presence of new adjacencies beyond insertion or deletion of + * a few bases. */ - public static Allele create(final byte[] bases) { - return create(bases, false); - } + boolean isStructural(); /** - * Creates a new allele based on the provided one. Ref state will be copied unless ignoreRefState is true - * (in which case the returned allele will be non-Ref). - * - * This method is efficient because it can skip the validation of the bases (since the original allele was already validated) - * - * @param allele the allele from which to copy the bases - * @param ignoreRefState should we ignore the reference state of the input allele and use the default ref state? + * Checks whether this allele represents a no-call. + *+ * This method must return exactly the opposite to {@link #isCalled()}. + *
+ * @return true iff this is (or is equal to) the {@link #NO_CALL} allele. */ - public static Allele create(final Allele allele, final boolean ignoreRefState) { - return new Allele(allele, ignoreRefState); + default boolean isNoCall() { + return this.equals(NO_CALL); } - // --------------------------------------------------------------------------------------------------------- - // - // accessor routines - // - // --------------------------------------------------------------------------------------------------------- + /** + * Checks whether this allele is a simple and self-contained sequence of bases. For example ("A", "AAAT", etc). + * Anything else including symbolic alleles, span_deletions, no-call, breakends would return {@code false}. + */ + boolean isInline(); - /** @return true if this is the NO_CALL allele */ - public boolean isNoCall() { return isNoCall; } - // Returns true if this is not the NO_CALL allele - public boolean isCalled() { return ! isNoCall(); } + /** + * @return true if this Allele is symbolic (i.e. no well-defined base sequence), this includes breakpoints and breakends + */ + boolean isSymbolic(); + + + /** + * @return true if this Allele is a single breakend (ex: .A or A.) + */ + boolean isSingleBreakend(); - /** @return true if this Allele is the reference allele */ - public boolean isReference() { return isRef; } + /** + * Checks whether this is a span-deletion marking allele. + * @return {@code true} iff it is an span-del. + */ + boolean isSpanDeletion(); - /** @return true if this Allele is not the reference allele */ - public boolean isNonReference() { return ! isReference(); } + /** + * Checks whether this allele is either the {@link #NON_REF} or the {@link #UNSPECIFIED_ALT}. + */ + boolean isUnspecifiedAlternative(); - /** @return true if this Allele is symbolic (i.e. no well-defined base sequence), this includes breakpoints and breakends */ - public boolean isSymbolic() { return isSymbolic; } + //////////////////////////////////////////////////////// + // Reference <--> Alternative status enquire and conversion methods. - /** @return true if this Allele is a breakpoint ( ex: G]17:198982] or ]13:123456]T ) */ - public boolean isBreakpoint() { return wouldBeBreakpoint(bases); } + /** + * Checks whether this allele is an alternative allele. + * @return never {@code true}. + */ + boolean isAlternative(); - /** @return true if this Allele is a single breakend (ex: .A or A.) */ - public boolean isSingleBreakend() { return wouldBeSingleBreakend(bases); } + /** + * Checks whether this allele is the reference allele. + *+ * This method must return exactly the opposite to {@link #isAlternative()}. + *
+ * @return true iff this Allele is the reference allele + */ + boolean isReference(); - // Returns a nice string representation of this object - public String toString() { - return ( isNoCall() ? NO_CALL_STRING : getDisplayString() ) + (isReference() ? "*" : ""); - } + /** + * Returns the "alternative" version of this allele. + *+ * Most type of alleles can (or must) be alternative alleles except for {@link #NO_CALL} that is not + * and can't become either reference nor alternative. Therefore such call on {@link #NO_CALL} will + * result in a {@link UnsupportedOperationException}. + *
+ * @return never {@code null}. + * @throws UnsupportedOperationException if this kind of allele cannot be an alternative allele. + */ + Allele asAlternative(); /** - * Return the DNA bases segregating in this allele. Note this isn't reference polarized, - * so the Null allele is represented by a vector of length 0 - * - * @return the segregating bases + * Returns a reference version of this allele. + *+ * In practice this only applies by inline alleles as the rest + * can't never be reference. Consequently conversion on these will fail. + *
+ * @return never {@code null}. + * @throws UnsupportedOperationException if this kind of allele cannot be a reference allele. */ - public byte[] getBases() { return isSymbolic ? EMPTY_ALLELE_BASES : bases; } + Allele asReference(); + + ////////////////////////////////////////////// + // Allele type specific information accessors: /** - * Return the DNA bases segregating in this allele in String format. - * This is useful, because toString() adds a '*' to reference alleles and getBases() returns garbage when you call toString() on it. + * Returns the SV type that best matches this allele if any. * - * @return the segregating bases + * @see #isStructural() + * @return {@code null} for those alleles that do not have a corresponding SV type. */ - public String getBaseString() { return isNoCall() ? NO_CALL_STRING : new String(getBases()); } + StructuralVariantType getStructuralVariantType(); /** - * Return the printed representation of this allele. - * Same as getBaseString(), except for symbolic alleles. - * For symbolic alleles, the base string is empty while the display string contains <TAG>. - * - * @return the allele string representation + * Returns the ID/name of a symbolic allele. It return {@code null if it does not apply.} + *
+ * Typically the symbolic ID is the string between the angled brackets, so for example for {@code } it is {@code "DEL"},
+ * for {@code
+ * For those symbolic alleles whose string encoding is not of the form {@code
+ * For other allele types it will {@code null}. + *
+ * @return may be {@code null} */ - public boolean equals(Object other) { - return ( ! (other instanceof Allele) ? false : equals((Allele)other, false) ); - } + String getContigID(); /** - * @return hash code + * Checks whether this allele indeed have a contig ID. + *
+ * The following condition must always be met:
+ * hasContigID() == getContigID() != null
+ *
+ * It is guaranteed that {@code Allele.create(A.encodeAsString(), A.isReference()).equals(A)}. + *
+ * @return never {@code null}. + */ + String encodeAsString(); + + /** + * Returns the encoding for the allele as a sequence of characters represented in bytes. + *+ * It is guaranteed that {@code Allele.create(A.encodeAsBytes(), A.isReference()).equals(A)}. + *
+ *+ * Change in the returned array won't change the state of this allele as is new every time this method in called + * so the invoking code is free to modify it or re-purpose it. + *
+ * @return never {@code null}. + */ + byte[] encodeAsBytes(); + + /** + * Returns an string containing only the bases in this allele. + *
+ * For those alleles that don't contain bases (e.g. plain symbolic alleles like {@code ""}) thi method
+ * will return an empty string.
+ *
+ * This operation only make sense and is supported in sequence inline alleles. + *
+ * @param tail the based to add at the end. + * @throws UnsupportedOperationException if this type of allele does not support extension. + * @throws NullPointerException if {@code tail} is {@code null}. + * @throws AlleleEncodingException if {@code tail} contain invalid bases. + * @return never {@code null}. */ - public boolean basesMatch(final byte[] test) { return !isSymbolic && (bases == test || Arrays.equals(bases, test)); } + Allele extend(final byte[] tail); + + ///////////////////////////// + // Deprecated methods: + + /** - * @param test bases to test against + * Creates a new allele based on the provided one. Ref state will be copied unless ignoreRefState is true + * (in which case the returned allele will be non-Ref). + *
+ * This method is efficient because it can skip the validation of the bases (since the original allele was already validated)
*
- * @return true if this Allele contains the same bases as test, regardless of its reference status; handles Null and NO_CALL alleles
+ * @param allele the allele from which to copy the bases
+ * @param ignoreRefState should we ignore the reference state of the input allele and use the default ref state?
+ * @deprecated use {@code #asAlternative} or {@code #asReference} to obtain the same allele with the
+ * other reference status.
*/
- public boolean basesMatch(final String test) { return basesMatch(test.toUpperCase().getBytes()); }
+ @Deprecated
+ static Allele create(final Allele allele, final boolean ignoreRefState) {
+ if (allele.isAlternative() || !ignoreRefState) {
+ return allele;
+ } else {
+ return allele.asAlternative();
+ }
+ }
/**
- * @param test allele to test against
- *
- * @return true if this Allele contains the same bases as test, regardless of its reference status; handles Null and NO_CALL alleles
+ * @return true if this Allele is a breakpoint ( ex: G]17:198982] or ]13:123456]T )
+ * @deprecated please use {@link #isBreakend()} instead.
*/
- public boolean basesMatch(final Allele test) { return basesMatch(test.getBases()); }
+ //todo we need to choose either breakend or breakpoint, not both.
+ @Deprecated
+ boolean isBreakpoint();
/**
- * @return the length of this allele. Null and NO_CALL alleles have 0 length.
+ * @return true if Allele is either {@link #NON_REF} or {@code #USPECIFIED_ALT}.
+ * @deprecated use {@link #isUnspecifiedAlternative()} instead.
*/
- public int length() {
- return isSymbolic ? 0 : bases.length;
+ @Deprecated
+ default boolean isNonRefAllele() {
+ return isUnspecifiedAlternative();
}
- // ---------------------------------------------------------------------------------------------------------
- //
- // useful static functions
- //
- // ---------------------------------------------------------------------------------------------------------
+ /**
+ * @return true if this Allele is not the reference allele
+ * @deprecated use {@link #isAlternative()} instead.
+ */
+ @Deprecated
+ boolean isNonReference();
- public static Allele getMatchingAllele(final Collection
+ * The input {@code base} can be any of the 5 valid codes {@code A C G T N} (lower case are also allowed) and '.' that in this case
+ * indicates that the insertion is before the first base on another contig.
+ *
+ * The input {@code base} can be any of the 5 valid codes {@code A C G T N} (lower case are also allowed).
+ *
+ * Alternatively {@code bases} may be an empty array or have exactly one entry equal to '.' indicating
+ * that this is in insertion before the first base of a reference contig. In either case the resulting allele would
+ * have zero bases.
+ *
+ * Notice that the special '.' base cannot be followed bay another sequence of bases as that is considered invalid.
+ *
+ * Examples: {@code "A", "ATT", "N", "CGAGT", "T", ...}
+ *
+ * Examples: {@code "", "
+ * Examples: {@code "A[chr21:700123", ".G", "G.", "[chr1:6001235[T", ...}
+ *
+ * This type is shared only with its alternative version {@link Allele#NON_REF}.
+ *
+ * This type should be returned by those {@link Allele} implementation that do not conform to
+ * any the types above.
+ *
+ * Note that '*' and '=' are still not allowed in the first position of the ID.
+ *
+ * These include exclusively: a, c, g, t, n, A, C, G, T and N.
+ *
+ * Whether these symbolic are structural and their structural variant type would be
+ * determine by their ID. See {@link StructuralVariantType#fromSymbolicID(String)} for
+ * details.
+ *
+ * It provides a common interface to access individual bases and sections of the base sequence.
+ * allAlleles.stream().filter(a -> a.equalBases(alleleBases)).findFirst().orElse(...)
+ */
+ @Deprecated
+ static Allele getMatchingAllele(final Collection
+ * Nonetheless mutable sequence may implement this interface thus the underlying sequence may change between + * (or during!) method invocations. + *
+ */ +public interface BaseSequence { + + /** + * Returns the number of bases in the sequence. + * @return 0 or greater. + */ + int numberOfBases(); + + /** + * Returns the base at a particular position of the sequence. + * @param index the requested position 0-based. + * @return a valid base as determined by {@link SequenceUtil#isValidBase}.. + * @throws IndexOutOfBoundsException if the index provided is out of bounds. + */ + byte baseAt(final int index); + + /** + * Returns a copy of the base sequence as a naked byte array. + *+ * Changes in the returned array won't affect this sequence and vice-versa. + *
+ * @return never {@code null} but a zero-length array if {@code numberOfBases() == 0}. + */ + default byte[] copyBases() { + final int numberOfBases = numberOfBases(); + if (numberOfBases == 0) { + return ArrayUtils.EMPTY_BYTE_ARRAY; + } else { + final byte[] result = new byte[numberOfBases]; + copyBases(0, result, 0, numberOfBases); + return result; + } + } + + /** + * Copies a section of the sequence into a new byte array. + *+ * Changes in the returned array won't affect this sequence and vice-versa. + *
+ * @param offset the first base position to copy. + * @param length the number of base to copy. + * @return never {@code null}. + * @throws IndexOutOfBoundsException if {@code length} > 0 and in combination with {@code offset} it points + * outside the boundaries of this sequence. + */ + default byte[] copyBases(final int offset, final int length) { + if (length == 0) { + return ArrayUtils.EMPTY_BYTE_ARRAY; + } else { + final byte[] result = new byte[length]; + copyBases(offset, result, 0, length); + return result; + } + } + + /** + * Copies a range of the base sequence in a new byte array. + *+ * Changes in the returned array won't affect this sequence and vice-versa. + *
+ * @param from the first position to copy. + * @param to the position after the last one to copy. + * @return never {@code null}. + * @throws IndexOutOfBoundsException if the range is not empty (i.e. {@code to > from}) and {@code from} or {@code to} + * point outside the bounds of the sequence. + */ + default byte[] copyBasesRange(final int from, final int to) { + final int length = to - from; + if (length == 0) { + return ArrayUtils.EMPTY_BYTE_ARRAY; + } else { + final byte[] result = new byte[to - from]; + copyBases(from, result, 0, length); + return result; + } + } + + /** + * Copies the bases in the sequence onto an existing byte array. + * @param offset position of the first base to copy. + * @param dest where to copy the base to. + * @param destOffset where to start copying the bases in the destination array. + * @param length the number of consecutive bases to copy. + * @throws NullPointerException if {@code dest} is {@code null}. + * @throws IndexOutOfBoundsException if the indexes and length provided result in stepping outside the boundaries + * of this sequence or the destination array. + */ + default void copyBases(final int offset, final byte[] dest, final int destOffset, final int length) { + if (length == 0 && dest == null) { // fail with an NPE on a null destination even if length is 0. + throw new NullPointerException(); + } + final int to = offset + length; + for (int i = offset, j = destOffset; i < to; i++, j++) { + dest[j] = baseAt(i); + } + } + + default void copyBasesRange(final int from, final int to, final byte[] dest, final int destOffset) { + final int length = to - from; + copyBases(from, dest, destOffset, length); + } + + default void copyBasesRange(final int from, final int to, final byte[] dest) { + final int length = to - from; + copyBases(from, dest, 0, length); + } + + default void copyBases(final byte[] dest) { + copyBases(0, dest, 0, numberOfBases()); + } + + default void copyBases(final byte[] dest, final int destOffset) { + copyBases(0, dest, destOffset, numberOfBases()); + } + + /** + * Compares this sequence + * Implementations are not allow to alter then content of the input base array. + * + * Returns 0 if both sequences are equal ignoring base case. + * If this base-sequence is smaller lexicographically the value returned is strictly negative equal to {@code -i -1} where {@code i} + * is the first position to differ. + * If this base-sequence is larger lexicographcially the value returned is strictly positive equal to {@code -i -1} where {@code i} + * is the first position to differ. + * It will return a negative value if this sequence is smaller lexicographically where the absolute value indicates + * the first position that differs (i) as {@code -i -1}. + */ + default int compareBases(final int offset, final byte[] other, final int otherOffset, final int length) { + for (int i = offset, j = otherOffset, k = 0; k < length; k++) { + final byte a = baseAt(i++); + final byte b = other[j++]; + final int comp = SequenceUtil.compareBases(a, b); + if (comp != 0) { + return comp < 0 ? -k -1 : k + 1; + } + } + return 0; + } + + default int compareBases(final int offset, final CharSequence other, final int otherOffset, final int length) { + for (int i = offset, j = otherOffset, k = 0; k < length; k++) { + final byte a = baseAt(i++); + final byte b = (byte) other.charAt(j++); + final int comp = SequenceUtil.compareBases(a, b); + if (comp != 0) { + return comp < 0 ? -k -1 : k + 1; + } + } + return 0; + } + + default int compareBases(final int offset, final BaseSequence other, final int otherOffset, final int length) { + if (other == this && offset == otherOffset) { // short cut in case of a trivial comparison with itself. + return 0; + } + for (int i = offset, j = otherOffset, k = 0; k < length; k++) { + final byte a = baseAt(i++); + final byte b = other.baseAt(j++); + final int comp = SequenceUtil.compareBases(a, b); + if (comp != 0) { + return comp < 0 ? -k -1 : k + 1; + } + } + return 0; + } + + default boolean equalBases(final int offset, final byte[] other, final int otherOffset, final int length) { + return compareBases(offset, other, otherOffset, length) == 0; + } + + default boolean equalBases(final int offset, final CharSequence other, final int otherOffset, final int length) { + return compareBases(offset, other, otherOffset, length) == 0; + } + + default boolean equalBases(final CharSequence other) { + final int numberOfBases = numberOfBases(); + if (other.length() != numberOfBases) { + return false; + } else { + return compareBases(0, other, 0, numberOfBases()) == 0; + } + } + + default boolean equalBases(final int offset, final BaseSequence other, final int otherOffset, final int length) { + return compareBases(offset, other, otherOffset, length) == 0; + } + + default boolean equalBases(final byte[] other) { + if (other.length != numberOfBases()) { + return false; + } else { + return equalBases(0, other, 0, other.length); + } + } + + default boolean equalBases(final BaseSequence other) { + if (other.numberOfBases() != numberOfBases()) { + return false; + } else { + return equalBases(0, other, 0, numberOfBases()); + } + } +} diff --git a/src/main/java/htsjdk/variant/variantcontext/Breakend.java b/src/main/java/htsjdk/variant/variantcontext/Breakend.java new file mode 100644 index 0000000000..9ded4c55e2 --- /dev/null +++ b/src/main/java/htsjdk/variant/variantcontext/Breakend.java @@ -0,0 +1,787 @@ +package htsjdk.variant.variantcontext; + +import htsjdk.samtools.util.*; +import org.apache.commons.lang3.ArrayUtils; + +import java.io.Serializable; +import java.util.Arrays; + +/** + * Represents the information about a breakend representable in an VCF allele spec. + */ +public abstract class Breakend implements Serializable, BaseSequence { + + protected final BreakendType type; + + private Breakend(final BreakendType type) { + this.type = type; + } + + /** + * Checks whether an allele spec byte sequence is likely to be a break-end spec. + *+ * In order to keep the code efficient, this does not make a full check but + * if it return true most likely a call to {@link Breakend#decode} won't fail on the same array, if we assume + * that such spec came from a well-formed VCF. + *
+ * @param spec the allele representation bases as a byte array. + * @return {@code true} iff the input {@code spec} looks like a valid breakend. + * @throws NullPointerException if {@code spec} is {@code null}. + */ + static boolean looksLikeBreakend(final byte[] spec) { + final int length = spec.length; + if (length < 2) { + return false; + } + final byte first = spec[0]; + final byte last = spec[length - 1]; + if (first == '.' && last != '.') { + return true; + } else if (last == '.' && first != '.') { + return true; + } else if ((first == '[' || first == ']') && last != '[' && last != ']') { + return true; + } else { + return first != '[' && first != ']' && (last == '[' || last == ']'); + } + } + + /** + * Constructs a single breakend. + * @param type the single type breakend. Only single types are allowed. + * @param base the reference aligned base character. + * @return never {@code null}. + * + * @throws NullPointerException if {@code type} is {@code null}. + * @throws IllegalArgumentException if {@code type} is not a single type. + * @throws AlleleEncodingException if the {@code base} provided is not a valid + * base character. + */ + public static Breakend single(final BreakendType type, final byte base) { + if (type == null || !type.isSingle()) { + throw new IllegalArgumentException("bad type"); + } + + if (AlleleUtils.isValidBase(base)) { + throw AlleleEncodingException.invalidBases(new byte[] { base }); + } + return new SingleBaseSingleBreakend(type, base); + } + + public static Breakend single(final BreakendType type, final byte[] bases) { + switch (bases.length) { + case 0: throw new AlleleEncodingException("single breakend must have at least one base"); + case 1: return single(type, bases[0]); + default : + if (!AlleleUtils.areValidBases(bases)) { + throw AlleleEncodingException.invalidBases(bases); + } else { + return new MultiBaseSingleBreakend(type, bases); + } + } + } + + /** + * Creates a paired breakend given its type properties. + * + *+ * Notice that only valid allele bases are allowed for {@code base} ('a', 'c', 't', 'g', 'n') and + * so it is no possible to instanciate a before-contig-start right-forward breakend + * whose encoding starts with {@code '.'}. + *
+ *+ * To create one of these you need to call {@link #beforeContigStart} instead. + *
+ * @param type the paired breakend type. Cannot be a single one nor {@code null}. + * @param base the single reference aligned base. + * @param mateContig the location contig for the mate breakend. + * @param matePosition the location positio for the mate breakend. + * @param mateContigIsInAssembly whether the mate's contig is in assembly ({@code true}) or reference ({@code false}). + * @return never {@code null}. + * @throws NullPointerException if any of {@code type}, {@code bases} or {@code mateContig} is {@code null}. + * @throws IllegalArgumentException if {@code type} is not paired or {@code matePosition} less or equal to 0. + * @throws AlleleEncodingException if {@code bases} contain non-valid bases codes + */ + public static Breakend paired(final BreakendType type, final byte base, final String mateContig, final int matePosition, final boolean mateContigIsInAssembly) { + if (!AlleleUtils.isValidContigID(mateContig)) { + throw AlleleEncodingException.invalidContigID(mateContig); + } else if (type.isSingle()) { + throw new IllegalArgumentException("bad type cannot be single: " + type); + } else if (matePosition <= 0) { + throw new IllegalArgumentException("mate position cannot be negative or 0"); + } else if (!AlleleUtils.isValidBase(base)) { + if (base == '.') { + throw new IllegalArgumentException("cannot use base '.' here, please call beforeContigStart(...) instead"); + } else { + throw AlleleEncodingException.invalidBases(new byte[] { base }); + } + } else { + return new SingleBasePairedBreakend(type, base, mateContig, matePosition, mateContigIsInAssembly); + } + } + + public static Breakend paired(final BreakendType type, final byte[] bases, final String mateContig, final int matePosition, final boolean mateContigIsInAssembly) { + switch (bases.length) { + case 0: + if (type == BreakendType.RIGHT_FORWARD) { + return beforeContigStart(mateContig, matePosition, mateContigIsInAssembly); + } else { + throw new AlleleEncodingException("bad breakend-type '%s'; no bases requires '%s'", type, BreakendType.RIGHT_FORWARD); + } + case 1: + return paired(type, bases[0], mateContig, matePosition, mateContigIsInAssembly); + default: + if (!AlleleUtils.isValidContigID(mateContig)) { + throw AlleleEncodingException.invalidContigID(mateContig); + } else if (matePosition <= 0) { + throw new AlleleEncodingException("the mate-position must be greater than 0: " + matePosition); + } else if (!AlleleUtils.areValidBases(bases)) { + throw AlleleEncodingException.invalidBases(bases); + } else { + return new MultiBasePairedBreakend(type, bases, mateContig, matePosition, mateContigIsInAssembly); + } + } + } + + /** + * Creates a brekaned that represent the insertion of sequence before the begining of a + * reference contig. + * @param mateContig the mate's breakend contig ID. + * @param matePosition the mate's breakend position. + * @param mateContigIsInAssembly whether {@code mateContig} refers to a reference or assemblu + * + * @return never {@code null}. + * + * @throws NullPointerException if {@code mateContig} is {@code null}. + * @throws IllegalArgumentException if {@code matePosition} is 0 or negative. + * @throws AlleleEncodingException if {@code mateContig} is not a valid contig-id. + */ + public static Breakend beforeContigStart(final String mateContig, final int matePosition, final boolean mateContigIsInAssembly) { + if (!AlleleUtils.isValidContigID(mateContig)) { + throw AlleleEncodingException.invalidContigID(mateContig); + } else if (matePosition <= 0) { + throw new IllegalArgumentException("mate position cannot be negative or 0"); + } else { + return new BeforeContigInsertBreakend(mateContig, matePosition, mateContigIsInAssembly); + } + } + + /** + * Returns the allele representation of a breakend. + * @return never {@code null}. + */ + public Allele asAllele() { + return new BreakendAllele(this); + } + + /** + * Decodes/parses a breakend from its character/string representation. + * @param chars the source char sequence. + * @return never {@code null}. + * @throws NullPointerException if {@code chars} is {@code null}. + * @throws AlleleEncodingException if the encoding provided in {@code chars} is not + * a valid encoding for a breakend. + */ + public static Breakend decode(final CharSequence chars) { + final int length = chars.length(); + final byte[] encoding = new byte[length]; + for (int i = 0; i < length; i++) { + encoding[i] = (byte) chars.charAt(i); + } + return decode(encoding); + } + + /** + * Decodes/parses a breakend from its byte array representation. + * @param encoding the source byte array. + * @return never {@code null}. + * @throws NullPointerException if {@code encoding} is {@code null}. + * @throws AlleleEncodingException if the encoding provided in {@code encoding} is not + * a valid encoding for a breakend. + */ + public static Breakend decode(final byte[] encoding) { + + final int length = encoding.length; + if (length < 2) { + throw new AlleleEncodingException("not a breakend encoding; too short: '%s'", new String(encoding)); + } else if (length == 2) { + return decodeSingle(encoding); + } else { + for (final byte b : encoding) { + if (b == '[' || b == ']') { + return decodePaired(encoding); + } + } + return decodeSingle(encoding); + } + } + + /** + * Proceeds decoding assuming that this is in fact a single typed breakend. + *+ * It assumes that the source encoding is of length 2 at least. + *
+ * @param encoding + * @return never {@code null}. + * @throws AlleleEncodingException if combination of bytes provided is not a + * valid encoding for a single typed breakend. + */ + private static Breakend decodeSingle(final byte[] encoding) { + final BreakendType type; + final int length = encoding.length; + final int first = encoding[0]; + final int last = encoding[length - 1]; + final int basesFrom; + final int basesTo; + if (first == '.') { + type = BreakendType.SINGLE_JOIN; + basesFrom = 1; + basesTo = length; + } else if (last == '.') { + type = BreakendType.SINGLE_FORK; + basesFrom = 0; + basesTo = length -1; + } else { + throw AlleleEncodingException.invalidEncoding(encoding); + } + if (encoding.length == 2) { + final byte base = encoding[basesFrom]; + if (!AlleleUtils.isValidBase(base)) { + throw AlleleEncodingException.invalidEncoding(encoding); + } else { + return new SingleBaseSingleBreakend(type, base); + } + } else { + if (!AlleleUtils.areValidBases(encoding, basesFrom, basesTo)) { + throw AlleleEncodingException.invalidEncoding(encoding); + } else { + return new MultiBaseSingleBreakend(type, Arrays.copyOfRange(encoding, basesFrom, basesTo)); + } + } + } + + /** + * Proceeds assuming the spec is a mated (non-single) break-end. + * It is provided the correct location for the first braket and its value. + * @param encoding the full String spec for the breakend. + * @return never {@code null}. + */ + private static Breakend decodePaired(final byte[] encoding) { + final int length = encoding.length; + final byte first = encoding[0]; + final byte last = encoding[length - 1]; + final byte bracket; + final int left; + final int right; + if (first == '[' || first == ']') { + bracket = first; + left = 0; + if ((right = ArrayUtils.lastIndexOf(encoding, bracket)) <= left) { + throw new AlleleEncodingException("bad paired break-end encoding missing right bracket (%s): '%s'", bracket, new String(encoding)); + } + } else if (last == '[' || last == ']') { + bracket = last; + right = length - 1; + left = ArrayUtils.indexOf(encoding, bracket); + if ((left <= 0 || left == right)) { + throw new AlleleEncodingException("bad paired break-end encoding missing left bracket (%s): '%s'", bracket, new String(encoding)); + } + } else { + throw new AlleleEncodingException("bad paired break-end encoding; first or last byte must be a bracket ('[' or ']'): '%s'", new String(encoding)); + } + int colon = ArrayUtils.lastIndexOf(encoding, (byte) ':', right - 1); + if (colon < 0) { + throw new AlleleEncodingException("missing colon in mate location: '%s'", new String(encoding)); + } else if (colon <= left) { + throw new AlleleEncodingException("bad paired break-end encoding; found colon (:) before left bracket: '%s'", new String(encoding)); + } + final boolean mateContigIsOnAssembly = colon - left >= 2 && encoding[left + 1] == '<' && encoding[colon - 1] == '>'; + final String contig = mateContigIsOnAssembly ? new String(encoding, left + 2, colon - left - 3) + : new String(encoding, left + 1, colon - left - 1); + if (!AlleleUtils.isValidContigID(contig)) { + throw new AlleleEncodingException("bad mate contig name (%s): '%s'", contig, new String(encoding)); + } + final int position = parseUnsigedPosition(encoding, colon + 1, right); + final boolean isLeftBreakend = bracket == ']'; + final boolean isForwardBreakend = (bracket == '[' && left > 0) || (bracket == ']' && left == 0); + final BreakendType type = BreakendType.paired(isLeftBreakend, isForwardBreakend); + final int numberOfBases = length - (right - left + 1); + switch (numberOfBases) { + case 0: throw new AlleleEncodingException("no bases in encoding: '%s'", new String(encoding)); + case 1: + final byte base = type.startsWithBase() ? first : last; + if (type == BreakendType.RIGHT_FORWARD && base == '.') { + return new BeforeContigInsertBreakend(contig, position, mateContigIsOnAssembly); + } else if (!AlleleUtils.isValidBase(base)) { + throw AlleleEncodingException.invalidEncoding(encoding); + } else { + return new SingleBasePairedBreakend(type, base, contig, position, mateContigIsOnAssembly); + } + default: + final byte[] bases = type.startsWithBase() + ? Arrays.copyOfRange(encoding, 0, left) + : Arrays.copyOfRange(encoding, right + 1, length); + if (!AlleleUtils.areValidBases(bases)) { + throw AlleleEncodingException.invalidBases(bases); + } else { + return new MultiBasePairedBreakend(type, bases, contig, position, mateContigIsOnAssembly); + } + } + } + + private static int parseUnsigedPosition(final byte[] spec, final int from, final int to) { + if (from >= to) { + throw new AlleleEncodingException("bad paired-breakend encode; mate contig position has length 0: '%s'", new String(spec)); + } else { + int result = 0; + for (int i = from; i < to; i++) { + byte b = spec[i]; + if (b < '0' || b > '9') { + throw new AlleleEncodingException("bad paired-breakend encode; mate contig position contain non-digit characters (%s): '%s'", (char) b, new String(spec)); + } else { + result = result * 10 + (b - '0'); + } + } + return result; + } + } + + /** + * Access this breakend type. + * @return never {@code null}. + */ + public BreakendType getType() { + return type; + } + + + /** + * Checks whether this breakend is a single type breakend. + * @return {@code true} iff this is a single type breakend. + */ + public abstract boolean isSingle(); + + /** + * Checks whether this breakend is a paired type breakend. + * @return {@code true} iff this is a paired type breakend + */ + public abstract boolean isPaired(); + + /** + * Returns the contig for the mate break-end if known. + *Otherwise it return {@code null}, for example if this is a + * single typed breakend. + *
+ * + * @return might be {@code null} + */ + public abstract String getMateContig(); + + /** + * Encodes the breakend back into a string. + * @return never {@code null}. + */ + public abstract String encodeAsString(); + + /** + * Checks whether the mate-contig it is an assembly contig/sequence. + *+ * As per the VCF spec, assembly contigs are specified by enclosing their names in + * angled brackets. + *
+ *+ * For single breakends that do not have a mate, this method will return {@code false} + *
+ *+ * For example: + *
+ *
+ * Breakend.of("A[:124912[").mateIsOnAssemblyContig() == true
+ * Breakend.of("A[13:312451[").mateIsOnAssemblyContig() == false
+ * Breakend.of("A.").mateIsOnAssemblyContig() == false
+ *
+ *
+ * @return {@code true} iff the breakend is a paired one and the mate contig belongs
+ * to an assembly file.
+ */
+ public abstract boolean mateIsOnAssemblyContig();
+
+ /**
+ * Position of the mate break-end using 1-based indexing.
+ * + * When there is no mate this will return -1. + *
+ * @return -1 or 1 or greater. + */ + public abstract int getMatePosition(); + + /** + * Returns a 1-bp sized locatable indicating the contig and position of the mate-break end. + * @return never {@code null}. + */ + public Locatable getMateLocation() { + if (isPaired()) { + return new Locatable() { + + @Override + public String getContig() { + return getMateContig(); + } + + @Override + public int getStart() { + return getMatePosition(); + } + + @Override + public int getEnd() { + return getMatePosition(); + } + }; + } else { + return null; + } + } + + @Override + public String toString() { + return encodeAsString(); + } + + private static abstract class AbstractPairedBreakend extends Breakend { + + private static final long serialVersionUID = 1; + + final String mateContig; + final int matePosition; + final boolean mateContigIsInAssembly; + + private AbstractPairedBreakend(final BreakendType type, final String mateContig, + final int matePosition, final boolean mateContigIsInAssembly) { + super(type); + this.mateContig = mateContig; + this.matePosition = matePosition; + this.mateContigIsInAssembly = mateContigIsInAssembly; + } + + abstract StringBuilder appendBases(final StringBuilder builder); + + @Override + public String getMateContig() { + return mateContig; + } + + @Override + public boolean mateIsOnAssemblyContig() { + return mateContigIsInAssembly; + } + + @Override + public int getMatePosition() { + return matePosition; + } + + @Override + public String encodeAsString() { + // 14 = [ + ] + .? + : + <>? + max_digits_int (10) + final StringBuilder builder = new StringBuilder( + mateContig.length() + 17); + final char bracket = type.isRightSide() ? '[' : ']'; + final boolean startWithBase = type.startsWithBase(); + if (startWithBase) { + appendBases(builder); + } + builder.append(bracket); + if (mateContigIsInAssembly) { + builder.append('<').append(mateContig).append('>'); + } else { + builder.append(mateContig); + } + builder.append(':').append(matePosition).append(bracket); + if (!startWithBase) { + appendBases(builder); + } + return builder.toString(); + } + + @Override + public boolean isSingle() { + return false; + } + + @Override + public boolean isPaired() { + return true; + } + } + + private static final class MultiBasePairedBreakend extends AbstractPairedBreakend { + + private static final long serialVersionUID = 1; + + private final byte[] bases; + + private MultiBasePairedBreakend(final BreakendType type, final byte[] bases, final String mateContig, final int matePosition, final boolean mateContigIsInAssembly) { + super(type, mateContig, matePosition, mateContigIsInAssembly); + this.bases = bases; + } + + @Override + StringBuilder appendBases(final StringBuilder builder) { + for (int i = 0; i < bases.length; i++) { + builder.append((char) bases[i]); + } + return builder; + } + @Override + public int numberOfBases() { + return bases.length; + } + + @Override + public byte baseAt(final int index) { + return bases[index]; + } + + @Override + public boolean equals(final Object other) { + return this == other || (other instanceof MultiBasePairedBreakend && equals((MultiBasePairedBreakend) other)); + } + + @Override + public int hashCode() { + return (((((SequenceUtil.hashCode(bases) * 31) + type.hashCode()) * 31) + + mateContig.hashCode()) * 31 + matePosition * 31); + } + + private boolean equals(final MultiBasePairedBreakend other) { + return other == this || (other.type == type && SequenceUtil.equals(other.bases, bases) && mateContigIsInAssembly == other.mateContigIsInAssembly && mateContig.equals(other.mateContig) && matePosition == other.matePosition); + } + } + + private static final class BeforeContigInsertBreakend extends AbstractPairedBreakend { + + private BeforeContigInsertBreakend(String mateContig, int matePosition, boolean mateContigIsInAssembly) { + super(BreakendType.RIGHT_FORWARD, mateContig, matePosition, mateContigIsInAssembly); + } + + @Override + StringBuilder appendBases(final StringBuilder builder) { + return builder.append('.'); + } + + @Override + public int numberOfBases() { + return 0; + } + + @Override + public byte baseAt(int index) { + throw new IndexOutOfBoundsException(); + } + + @Override + public boolean equals(final Object other) { + return other == null || (other instanceof BeforeContigInsertBreakend && equals((BeforeContigInsertBreakend) other)); + } + + @Override + public int hashCode() { + return (( (mateContig.hashCode() * 31) + matePosition ) * 31 + Boolean.hashCode(mateContigIsInAssembly)) * 31; + } + + private boolean equals(final BeforeContigInsertBreakend other) { + return other.type == type && + other.mateContig.equals(this.mateContig) && + other.matePosition == this.matePosition && + other.mateContigIsInAssembly == this.mateContigIsInAssembly; + } + } + + private static final class SingleBasePairedBreakend extends AbstractPairedBreakend { + + private static final long serialVersionUID = 1L; + private final byte base; + + private SingleBasePairedBreakend(final BreakendType type, final byte base, final String mateContig, + final int matePosition, final boolean mateContigIsInAssembly) { + super(type, mateContig, matePosition, mateContigIsInAssembly); + this.base = base; + } + + @Override + StringBuilder appendBases(final StringBuilder builder) { + return builder.append((char) base); + } + + @Override + public int hashCode() { + return (((((SequenceUtil.hashCode(base) * 31) + type.hashCode()) * 31) + + mateContig.hashCode()) * 31 + matePosition * 31); + } + + @Override + public boolean equals(final Object other) { + return other instanceof SingleBasePairedBreakend && equals((SingleBasePairedBreakend) other); + } + + private boolean equals(final SingleBasePairedBreakend other) { + return SequenceUtil.basesEqual(base, other.base) && type == other.type + && mateContigIsInAssembly == other.mateContigIsInAssembly + && mateContig.equals(other.mateContig) + && matePosition == other.matePosition; + } + + @Override + public int numberOfBases() { + return 1; + } + + @Override + public byte baseAt(final int index) { + if (index != 0) { + throw new IndexOutOfBoundsException(); + } + return base; + } + } + + private abstract static class AbstractSingleBreakend extends Breakend { + + private static final long serialVersionUID = 1; + + private AbstractSingleBreakend(final BreakendType type) { + super(type); + } + + @Override + public String getMateContig() { + return null; + } + + @Override + public boolean mateIsOnAssemblyContig() { + return false; + } + + @Override + public int getMatePosition() { + return -1; + } + + @Override + public boolean isSingle() { + return true; + } + + @Override + public boolean isPaired() { + return false; + } + + + } + + private final static class SingleBaseSingleBreakend extends AbstractSingleBreakend { + + private static final long serialVersionUID = 1; + private final byte base; + + private SingleBaseSingleBreakend(final BreakendType type, final byte base) { + super(type); + this.base = base; + } + + @Override + public int numberOfBases() { + return 1; + } + + @Override + public byte baseAt(final int index) { + if (index != 0) { + throw new IndexOutOfBoundsException(); + } + return base; + } + + @Override + public int hashCode() { + return (SequenceUtil.hashCode(base) * 31 + type.hashCode()) * 31; + } + + @Override + public boolean equals(final Object other) { + return other == this || other instanceof SingleBaseSingleBreakend && equals((SingleBaseSingleBreakend) other); + } + + private boolean equals(final SingleBaseSingleBreakend other) { + return other == this || (other.type == type && SequenceUtil.basesEqual(other.base, base)); + } + + @Override + public String encodeAsString() { + final char[] chars; + if (type.startsWithBase()) { + chars = new char[] { (char) base, '.' }; + } else { + chars = new char[] { '.', (char) base }; + } + return new String(chars); + } + + } + + private final static class MultiBaseSingleBreakend extends AbstractSingleBreakend { + + private static final long serialVersionUID = 1; + private final byte[] bases; + + private MultiBaseSingleBreakend(final BreakendType type, final byte[] bases) { + super(type); + this.bases = bases; + } + + @Override + public int numberOfBases() { + return bases.length; + } + + @Override + public byte baseAt(final int index) { + return bases[index]; + } + + @Override + public int hashCode() { + return (SequenceUtil.hashCode(bases) * 31 + type.hashCode()) * 31; + } + + @Override + public boolean equals(final Object other) { + return other == this || other instanceof MultiBaseSingleBreakend && equals((MultiBaseSingleBreakend) other); + } + + private boolean equals(final MultiBaseSingleBreakend other) { + return other == this || (other.type == type && SequenceUtil.equals(other.bases, bases)); + } + + @Override + public String encodeAsString() { + final char[] chars = new char[bases.length + 1]; + if (type.startsWithBase()) { + for (int i = 0; i < bases.length; i++) { + chars[i] = (char) bases[i]; + } + chars[chars.length - 1] = '.'; + } else { + for (int i = chars.length - 1; i > 0;) { + chars[i] = (char) bases[--i]; + } + chars[0] = '.'; + } + return new String(chars); + } + } + +} + diff --git a/src/main/java/htsjdk/variant/variantcontext/BreakendAllele.java b/src/main/java/htsjdk/variant/variantcontext/BreakendAllele.java new file mode 100644 index 0000000000..71cc047db3 --- /dev/null +++ b/src/main/java/htsjdk/variant/variantcontext/BreakendAllele.java @@ -0,0 +1,136 @@ +package htsjdk.variant.variantcontext; + +/** + * Subclass of Allele spezialized in representing breakend alleles. + *+ * It does not offer any new operation, nor it is requirement for all breakend encoding alleles to be represeted by this class. + * It simply provides more efficient handling Breakend related methods declared in {@link Allele} when we + * can assue that the allele is indeed a break-end allele. + *
+ */ +final class BreakendAllele extends AbstractAllele { + + private final Breakend breakend; + + BreakendAllele(final Breakend breakend) { + this.breakend = breakend; + } + + @Override + public boolean isBreakend() { + return true; + } + + @Override + public boolean isPairedBreakend() { + return !breakend.isSingle(); + } + + @Override + public boolean isNoCall() { + return false; + } + + @Override + public boolean isSymbolic() { + return true; + } + + @Override + public boolean isCalled() { return true; } + + @Override + public boolean isAlternative() { return true; } + + @Override + public StructuralVariantType getStructuralVariantType() { + return StructuralVariantType.BND; + } + + @Override + public boolean isStructural() { return true; } + + @Override + public boolean isBreakpoint() { + return true; + } + + @Override + public boolean isSingleBreakend() { + return breakend.isSingle(); + } + + @Override + public Breakend asBreakend() { + return breakend; + } + + @Override + public Allele asAlternative() { + return this; + } + + @Override + public String encodeAsString() { + return breakend.encodeAsString(); + } + + @Override + public boolean equals(final Object other) { + return this == other || (other instanceof BreakendAllele && ((BreakendAllele) other).breakend.equals(breakend)); + } + + @Override + public boolean equals(final Allele other, final boolean ignoreRefState) { + return other == this || (other instanceof BreakendAllele && equals((BreakendAllele) other)); + } + + @Override + public int hashCode() { + return breakend.hashCode(); + } + + private boolean equals(final BreakendAllele other) { + return other != null && other.breakend.equals(breakend); + } + + @Override + public String getContigID() { + return breakend.getMateContig(); + } + + @Override + public boolean hasContigID() { + return breakend.getMateContig() != null; + } + + @Override + public int numberOfBases() { + return breakend.numberOfBases(); + } + + @Override + public byte baseAt(final int index) { + return breakend.baseAt(index); + } + + @Override + public void copyBases(final int offset, final byte[] dest, final int destOffset, final int length) { + breakend.copyBases(offset, dest, destOffset, length); + } + + @Override + public int compareBases(final int offset, final byte[] other, final int otherOffset, final int length) { + return breakend.compareBases(offset, other, otherOffset, length); + } + + @Override + public int compareBases(final int offset, final BaseSequence other, final int otherOffset, final int length) { + return breakend.compareBases(offset, other, otherOffset, length); + } + + @Override + public String getBaseString() { + return new String(breakend.copyBases()); + } +} diff --git a/src/main/java/htsjdk/variant/variantcontext/BreakendType.java b/src/main/java/htsjdk/variant/variantcontext/BreakendType.java new file mode 100644 index 0000000000..d6d1930869 --- /dev/null +++ b/src/main/java/htsjdk/variant/variantcontext/BreakendType.java @@ -0,0 +1,236 @@ +package htsjdk.variant.variantcontext; + +/** + * Possible breakend types. + *+ * There are two single break types and four paired break end types. + *
+ *Examples:
+ * #CHROM POS ID REF ALT ... + * 1 100 BND_1 T T. ... + * 3 400 BND_2 G .G ... + *+ *
+ * There are two single break end types: {@link #SINGLE_FORK} and {@link #SINGLE_JOIN}. Both types + * refers how the new adjacency reads left to right atop the reference forward strand. + *
+ *So {@link #SINGLE_FORK} is simply a fork out or branch off the reference sequence after + * the current position in the reference so the adjacent DNA would "dangle" off the right of the up-stream sequence + * leading to this point in the reference. (E.g. see {@code BND_1} above)
+ *In contrast, {@link #SINGLE_JOIN} would represent the opposite (yet still from the forward strand perspective, + * where the adjacent sequence would "dangle" left from the current position and where it joins the reference and + * continues downstream on the reference from that point. (E.g. see {@code BND_2} above)
+ *There four paired types are proof to be more challenging to name and interpret. This enumeration + * uses how the terms "left", "right" and "reverse (complement)" are used in their definition in the VCF + * to name them.
+ *The following table show the correspondence + * between type constants, encoding format and their description in the spec.
+ * + *Type constant | + *Encoding format | + *Example | + *VCF 4.3 description | + *
---|---|---|---|
{@link #RIGHT_FORWARD} | + *t[p[ | + *T[13:212121[ | + *"piece extending to the right of p is joined after t" | + *
{@link #LEFT_REVERSE} | + *t]p] | + *T]13:212121] | + *"reverse comp piece extending left of p is joined after t" | + *
{@link #LEFT_FORWARD} | + *]p]t | + *]13:212121]T | + *"piece extending to the left of p is joined before t" | + *
{@link #RIGHT_REVERSE} | + *[p[t | + *[13:212121[T | + *"reverse comp piece extending right of p is joined before t" | + *
Notice that the enum constant name, as is the case with the VCF description of each type, makes reference as the location of the rest of the adjacent sequence with respect to + * the mate breakend location.
+ */ +public enum BreakendType { + /** + * Single left break-end, where the adjacency extends to the right of the enclosing location. + */ + SINGLE_FORK(true) { // t. + public BreakendType mateType() { + return null; + } + }, + SINGLE_JOIN(false) { // .t + public BreakendType mateType() { + return null; + } + }, + RIGHT_FORWARD(false, true) { // t[p[ piece extending to the right of p is joined after t + public BreakendType mateType() { + return LEFT_FORWARD; + } + }, + LEFT_REVERSE(true, false) { // t]p] reverse comp piece extending left of p is joined after t + public BreakendType mateType() { + return this; + } + }, + LEFT_FORWARD(true, true) { // ]p]t piece extending to the left of p is joined before t + public BreakendType mateType() { + return RIGHT_FORWARD; + } + }, + RIGHT_REVERSE(false, false) { // [p[t reverse comp piece extending right of p is joined before t + public BreakendType mateType() { + return this; + } + }; + + private final boolean isBasePrefix; + private final boolean isSingle; + private final boolean isLeftEnd; + private final boolean isForward; + private final boolean isRightEnd; + private final boolean isReverse; + + // Constructor for single types: + BreakendType(final boolean basePrefix) { + isSingle = true; + isLeftEnd = isRightEnd = isForward = isReverse = false; + isBasePrefix = basePrefix; + } + + // Constructor for paired types: + BreakendType(final boolean left, final boolean forward) { + isRightEnd = !(isLeftEnd = left); + isReverse = !(isForward = forward); + isBasePrefix = left != forward; + isSingle = false; + } + + + /** + * Checks whether the encoding start with the reference base. + * @return {@code true} iff the first character in the encoding is the reference (or snp) base. + * Otherwise such character is placed at the end. + */ + boolean startsWithBase() { + return isBasePrefix; + } + + /** + * For paired breakend type, checks whether the adjacent DNA sequence comes from the + * left (upstream) of the mate's position. + *+ * For single type it returns false as it is not applicable. + *
+ * @return {@code true} iff this is a paired type the tested condition is true. + */ + public boolean isLeftSide() { + return isLeftEnd; + } + + /** + * For paired breakend type, checks whether the adjacent DNA sequence comes from the + * forward strand around the mate position. + *+ * For single type it returns false as it is not applicable. + *
+ * @return {@code true} iff this is a paired type the tested condition is true. + */ + public boolean isForward() { + return isForward; + } + + /** + * For paired breakend type, checks whether the adjacent DNA sequence comes from the + * right (downstream) of the mate's position. + *+ * For single type it returns false as it is not applicable. + *
+ * @return {@code true} iff this is a paired type the tested condition is true. + */ + public boolean isRightSide() { + return isRightEnd; + } + + /** + * For paired breakend type, checks whether the adjacent DNA sequence is the reverse complement from the + * reverse strand around the mate position. + *+ * For single type it returns false as it is not applicable. + *
+ * @return {@code true} iff this is a paired type the tested condition is true. + */ + public boolean isReverse() { + return isReverse; + } + + /** + * Checks whether this type is a single breakend type. + * @return {@code true} iff this is indeed a single breakend type. + */ + public boolean isSingle() { + return isSingle; + } + + /** + * Checks whether this type is a paired breakend type. + * @return {@code true} iff this is indeed a paired breakend type. + */ + public boolean isPaired() { + return !isSingle; + } + + /** + * Returns a paired type based on requirements on its left-right, forward-reverse status. + * @param left whether the type must be left-sided ({@code true}) or right-sided ({@code false}) + * @param forward whether the type must be forward ({@code true}) or reverse ({@code false}) + * @return never {@code null}. + */ + public static BreakendType paired(final boolean left, final boolean forward) { + if (left) { + return forward ? LEFT_FORWARD : LEFT_REVERSE; + } else { + return forward ? RIGHT_FORWARD : RIGHT_REVERSE; + } + } + + /** + * Returns the type for the mate-breakend. + *+ * When this cannot be determined (i.e. this is a single type) it returns {@code null}. + *
+ * @return may return {@code null}. It does so with single types. + */ + public BreakendType mateType() { + switch (this) { + case LEFT_FORWARD: + return RIGHT_FORWARD; + case RIGHT_FORWARD: + return LEFT_FORWARD; + case LEFT_REVERSE: + return LEFT_REVERSE; + case RIGHT_REVERSE: + return RIGHT_REVERSE; + default: + return null; + } + } +} diff --git a/src/main/java/htsjdk/variant/variantcontext/ContigInsertAllele.java b/src/main/java/htsjdk/variant/variantcontext/ContigInsertAllele.java new file mode 100644 index 0000000000..e51ca2e881 --- /dev/null +++ b/src/main/java/htsjdk/variant/variantcontext/ContigInsertAllele.java @@ -0,0 +1,96 @@ +package htsjdk.variant.variantcontext; + +import htsjdk.samtools.util.SequenceUtil; + +import java.util.Objects; + +final class ContigInsertAllele extends AbstractAllele { + + private static long serialVersionUID = 1L; + + private final byte[] bases; + private final String assemblyContig; + private transient String encodingAsString; + + ContigInsertAllele(final byte[] bases, final String assemblyContig) { + this.bases = Objects.requireNonNull(bases); + this.assemblyContig = Objects.requireNonNull(assemblyContig); + } + + public String getContigID() { + return assemblyContig; + } + + @Override + public boolean hasContigID() { + return true; + } + + @Override + public boolean equals(final Allele other, boolean ignoreRefState) { + return other instanceof ContigInsertAllele && equals((ContigInsertAllele) other); + } + + @Override + public Allele asAlternative() { + return this; + } + + @Override + public String encodeAsString() { + if (encodingAsString == null) { + final StringBuilder builder = new StringBuilder(bases.length + assemblyContig.length() + 2); + for (final byte b : bases) { + builder.append((char) b); + } + builder.append('<'); + builder.append(assemblyContig); + builder.append('>'); + encodingAsString = builder.toString(); + } + return encodingAsString; + } + + @Override + public int numberOfBases() { + return bases.length; + } + + @Override + public byte baseAt(final int index) { + return bases[index]; + } + + @Override + public boolean isCalled() { return true; } + + @Override + public boolean isAlternative() { + return true; + } + + @Override + public boolean isSymbolic() { + return true; + } + + @Override + public boolean isStructural() { return true; } + + @Override + public StructuralVariantType getStructuralVariantType() { return StructuralVariantType.INS; } + + @Override + public boolean equals(final Object other) { + return other instanceof ContigInsertAllele && equals((ContigInsertAllele) other); + } + + @Override + public int hashCode() { + return SequenceUtil.hashCode(bases) * 31 + assemblyContig.hashCode(); + } + + private boolean equals(final ContigInsertAllele other) { + return SequenceUtil.equals(bases, other.bases) && other.assemblyContig.equals(assemblyContig); + } +} diff --git a/src/main/java/htsjdk/variant/variantcontext/Genotype.java b/src/main/java/htsjdk/variant/variantcontext/Genotype.java index 0f782bff95..3a374782bc 100644 --- a/src/main/java/htsjdk/variant/variantcontext/Genotype.java +++ b/src/main/java/htsjdk/variant/variantcontext/Genotype.java @@ -355,7 +355,7 @@ public String getGenotypeString() { * * @return a string representing the genotypes, or null if the type is unavailable. */ - public String getGenotypeString(boolean ignoreRefState) { + public String getGenotypeString(final boolean ignoreRefState) { if ( getPloidy() == 0 ) return "NA"; @@ -367,7 +367,7 @@ public String getGenotypeString(boolean ignoreRefState) { return ParsingUtils.join(separator, getAlleleStrings()); } // 3. So that everything is deterministic with regards to integration tests, we sort Alleles (when the genotype isn't phased, of course) - List+ * This class accept any number of bases (0 to Integer.MAX_VALUE) however when + * the number of bases is exactly one you should consider to use {@link SingleBaseInLineAllele} + * instead. + *
+ */ +final class MultiBaseInLineAllele extends AbstractAllele { + + private final byte[] bases; + private final boolean isReference; + private transient int hashCode; + private transient String encodingAsString; + + /** + * No checks are performed here, the calling code must make sure that: + *
+ * As per the VCF 4.3 spec we take on the suggestion that the SV type of a
+ * symbolic would be the one indicated by the "top level" (i.e. the first) part of such symbolic id where
+ * parts are separated with colon characters (':'). So for example {@link #DUP} would be
+ * the SV type for {@code "
+ * Here we ignore case so we consider "
+ * In the case of {@link #BND} it returns {@code null}, as breakend alleles contain information + * specific to the breakend (mate position, prefix-suffix bases etc). + *
+ *
+ * For those SV types that may have "subtype" alleles, e.g. DUP has {@code