Skip to content

Commit

Permalink
Ported GATK's FastaReferenceWriter (#1172)
Browse files Browse the repository at this point in the history
* - Ported GATK's FastaReferenceWriter
- Implemented some basic validation functionality rather than stripping it out of the implementation of FastaReferenceWriter)
- Implemented a simple version of a RandomDNA creator (and modified SAMRecordSetBuilder to use it)
- Ported the tests (for FastaReferenceWriter) from GATK.
  • Loading branch information
Yossi Farjoun authored Feb 1, 2019
1 parent e86af96 commit 942e3d6
Show file tree
Hide file tree
Showing 8 changed files with 1,812 additions and 68 deletions.
9 changes: 1 addition & 8 deletions src/main/java/htsjdk/samtools/SAMRecordSetBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -359,14 +359,7 @@ private int getReadLengthFromCigar(final SAMRecord rec) {
* If there's a cigar with read-length >0, will use that length for reads. Otherwise will use length = 36
*/
private void fillInBases(final SAMRecord rec) {
final int length = getReadLengthFromCigar(rec);
final byte[] bases = new byte[length];

for (int i = 0; i < length; ++i) {
bases[i] = BASES[this.random.nextInt(BASES.length)];
}

rec.setReadBases(bases);
rec.setReadBases(SequenceUtil.getRandomBases(random, getReadLengthFromCigar(rec)));
}

/**
Expand Down
680 changes: 680 additions & 0 deletions src/main/java/htsjdk/samtools/reference/FastaReferenceWriter.java

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
/*
* The MIT License
*
* Copyright (c) 2018 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/

package htsjdk.samtools.reference;

import htsjdk.utils.ValidationUtils;

import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;

/**
* Buider for a {@link htsjdk.samtools.reference.FastaReferenceWriter}
* <p>
* You can set each of the three outputs (fasta, dictionary and index) to a file or a stream.
* by default if you provide a file to the fasta an accompanying index and dictionary will be created.
* This behaviour can be controlled by {@link #setMakeDictOutput(boolean)} and {@link #setMakeFaiOutput(boolean)}
* <p>
* The default bases-per-line is {@value FastaReferenceWriter#DEFAULT_BASES_PER_LINE}.
* <p>
* Setting a file or an output stream for any of the three outputs (fasta, index or dict) will invalidate the other
* output type (i.e. setting a file output will invalidate a previous stream and vice-versa).
* </p>
*/
public class FastaReferenceWriterBuilder {
private Path fastaFile;
private boolean makeFaiOutput = true;
private boolean makeDictOutput = true;
private int basesPerLine = FastaReferenceWriter.DEFAULT_BASES_PER_LINE;
private Path indexFile;
private Path dictFile;
private OutputStream fastaOutput;
private OutputStream indexOutput;
private OutputStream dictOutput;

private static Path defaultFaiFile(final boolean makeFaiFile, final Path fastaFile) {
return makeFaiFile ? ReferenceSequenceFileFactory.getFastaIndexFileName(fastaFile) : null;
}

private static Path defaultDictFile(final boolean makeDictFile, final Path fastaFile) {
return makeDictFile ? ReferenceSequenceFileFactory.getDefaultDictionaryForReferenceSequence(fastaFile) : null;
}

protected static int checkBasesPerLine(final int value) {
ValidationUtils.validateArg(value > 0, "bases per line must be 1 or greater");
return value;
}

/**
* Set the output fasta file to write to. Doesn't (currently) support compressed filenames.
* If the index file and output stream are both null and makeFaiOutput is true (default), a default index file will be created as well.
* If the dictionary file and output stream are both null and makeDictOutput is true (default), a default dictionary file will be created as well.
*
* You can only provide a compressed stream to the fastaOutput, and only in the case that an index isn't written.
*
* @param fastaFile a {@link Path} to the output fasta file.
* @return this builder
*/
public FastaReferenceWriterBuilder setFastaFile(final Path fastaFile) {
this.fastaFile = fastaFile;
this.fastaOutput = null;
return this;
}

/**
* Sets whether to automatically generate an index file from the name of the fasta-file (assuming it is given
* as a file). This can only happen if both the index file and output stream are null.
*
* @param makeFaiOutput a boolean flag
* @return this builder
*/
public FastaReferenceWriterBuilder setMakeFaiOutput(final boolean makeFaiOutput) {
this.makeFaiOutput = makeFaiOutput;
return this;
}

/**
* Sets whether to automatically generate an dictionary file from the name of the fasta-file (assuming it is given
* as a file). This can only happen if both the dictionary file and output stream are null.
*
* @param makeDictOutput a boolean flag
* @return this builder
*/

public FastaReferenceWriterBuilder setMakeDictOutput(final boolean makeDictOutput) {
this.makeDictOutput = makeDictOutput;
return this;
}

/**
* Sets the number of bases each line of the fasta file will have.
* the default is {@value FastaReferenceWriter#DEFAULT_BASES_PER_LINE}
*
* @param basesPerLine integer (must be positive, validated on {@link #build()}) indicating the number of bases per line in
* the output
* @return this builder
*/
public FastaReferenceWriterBuilder setBasesPerLine(final int basesPerLine) {
this.basesPerLine = basesPerLine;
return this;
}

/**
* Set the output index file to write to. Doesn't (currently) support compressed filenames.
*/
public FastaReferenceWriterBuilder setIndexFile(final Path indexFile) {
this.indexFile = indexFile;
this.indexOutput = null;
return this;
}

/**
* Set the output dictionary file to write to.
*/
public FastaReferenceWriterBuilder setDictFile(final Path dictFile) {
this.dictFile = dictFile;
this.dictOutput = null;
return this;
}

/**
* Set the output stream for writing the reference. Doesn't support compressed streams.
*
* @param fastaOutput a {@link OutputStream} for the output fasta file.
* @return this builder
*/

public FastaReferenceWriterBuilder setFastaOutput(final OutputStream fastaOutput) {
this.fastaOutput = fastaOutput;
this.fastaFile = null;
return this;
}

/**
* Set the output stream for writing the index. Doesn't support compressed streams.
*
* @param indexOutput a {@link OutputStream} for the output index.
* @return this builder
*/
public FastaReferenceWriterBuilder setIndexOutput(final OutputStream indexOutput) {
this.indexOutput = indexOutput;
this.indexFile = null;
return this;
}

/**
* Set the output stream for writing the dictionary. Doesn't support compressed streams.
*
* @param dictOutput a {@link OutputStream} for the output dictionary.
* @return this builder
*/
public FastaReferenceWriterBuilder setDictOutput(final OutputStream dictOutput) {
this.dictOutput = dictOutput;
this.dictFile = null;
return this;
}

/**
* Create the {@link FastaReferenceWriter}. This is were all the validations happen:
* <ld>
* <li>
* -One of fastaFile and fastaOutput must be non-null.
* </li>
* <li>
* -the number of bases-per-line must be positive
* </li>
* </ld>
*
* @return a {@link FastaReferenceWriter}
* @throws IOException if trouble opening files
*/
public FastaReferenceWriter build() throws IOException {
if (fastaFile == null && fastaOutput == null) {
throw new IllegalArgumentException("Both fastaFile and fastaOutput were null. Please set one of them to be non-null.");
}
if(fastaFile != null) {
if (indexFile == null && indexOutput == null) {
indexFile = defaultFaiFile(makeFaiOutput, fastaFile);
} else if (indexFile != null && indexOutput != null) {
throw new IllegalArgumentException("Both indexFile and indexOutput were non-null. Please set one of them to be null.");
}
if (dictFile == null && dictOutput == null) {
dictFile = defaultDictFile(makeDictOutput, fastaFile);
} else if (dictFile != null && dictOutput != null) {
throw new IllegalArgumentException("Both dictFile and dictOutput were non-null. Please set one of them to be null.");
}
}
// checkout bases-perline first, so that files are not created if failure;
checkBasesPerLine(basesPerLine);

if (fastaFile != null) {
fastaOutput = Files.newOutputStream(fastaFile);
}
if (indexFile != null) {
indexOutput = Files.newOutputStream(indexFile);
}
if (dictFile != null) {
dictOutput = Files.newOutputStream(dictFile);
}

return new FastaReferenceWriter(basesPerLine, fastaOutput, indexOutput, dictOutput);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ public class ReferenceSequenceFileFactory {
add(".txt.gz");
}};

public static final String FASTA_INDEX_EXTENSION = ".fai";

/**
* Attempts to determine the type of the reference file and return an instance
* of ReferenceSequenceFile that is appropriate to read it. Sequence names
Expand Down Expand Up @@ -255,7 +257,6 @@ public static String getFastaExtension(final Path path) {
* @param fastaFile the reference sequence file path.
*/
public static Path getFastaIndexFileName(Path fastaFile) {
return fastaFile.resolveSibling(fastaFile.getFileName() + ".fai");
return fastaFile.resolveSibling(fastaFile.getFileName() + FASTA_INDEX_EXTENSION);
}

}
63 changes: 47 additions & 16 deletions src/main/java/htsjdk/samtools/util/SequenceUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,9 @@
*/
package htsjdk.samtools.util;

import htsjdk.samtools.AlignmentBlock;
import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMException;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.SAMTag;
import htsjdk.samtools.*;
import htsjdk.samtools.fastq.FastqConstants;
import htsjdk.utils.ValidationUtils;

import java.io.File;
import java.math.BigInteger;
Expand All @@ -41,6 +34,7 @@
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

Expand All @@ -58,15 +52,17 @@ public class SequenceUtil {
*/
private static final byte[] BAM_READ_BASE_SET = "=ABCDGHKMNRSTVWY".getBytes();

private static final int BASES_ARRAY_LENGTH = 127;
private static final int SHIFT_TO_LOWER_CASE = a - A;
/**
* A lookup table to find a corresponding BAM read base.
*/
private static final byte[] bamReadBaseLookup = new byte[127];
private static final byte[] bamReadBaseLookup = new byte[BASES_ARRAY_LENGTH];
static {
Arrays.fill(bamReadBaseLookup, N);
for (final byte base: BAM_READ_BASE_SET) {
bamReadBaseLookup[base] = base;
bamReadBaseLookup[base + 32] = base;
bamReadBaseLookup[base + SHIFT_TO_LOWER_CASE] = base;
}
}

Expand All @@ -75,7 +71,7 @@ public class SequenceUtil {
private static final byte G_MASK = 4;
private static final byte T_MASK = 8;

private static final byte[] bases = new byte[127];
private static final byte[] bases = new byte[BASES_ARRAY_LENGTH];
private static final byte NON_IUPAC_CODE = 0;
/*
* Definition of IUPAC codes:
Expand All @@ -100,7 +96,7 @@ public class SequenceUtil {
bases['N'] = A_MASK | C_MASK | G_MASK | T_MASK;
// Also store the bases in lower case
for (int i = 'A'; i <= 'Z'; i++) {
bases[(byte) i + 32] = bases[(byte) i];
bases[(byte) i + SHIFT_TO_LOWER_CASE] = bases[(byte) i];
}
bases['.'] = A_MASK | C_MASK | G_MASK | T_MASK;
};
Expand All @@ -125,6 +121,12 @@ public static String reverseComplement(final String sequenceData) {
* without considering the set relationships between ambiguous codes.
*/
public static boolean basesEqual(final byte lhs, final byte rhs) {
if (lhs < 0 || lhs >= BASES_ARRAY_LENGTH) {
return false;
}
if (rhs < 0 || rhs >= BASES_ARRAY_LENGTH) {
return false;
}
return (bases[lhs] == bases[rhs]);
}

Expand All @@ -139,6 +141,13 @@ public static boolean basesEqual(final byte lhs, final byte rhs) {
* Since the comparison is directional, make sure to pass read / ref codes in correct order.
*/
public static boolean readBaseMatchesRefBaseWithAmbiguity(final byte readBase, final byte refBase) {
if (readBase < 0 || readBase >= BASES_ARRAY_LENGTH) {
return false;
}
if (refBase < 0 || refBase >= BASES_ARRAY_LENGTH) {
return false;
}

return (bases[readBase] & bases[refBase]) == bases[readBase];
}

Expand Down Expand Up @@ -175,15 +184,19 @@ public static String getIUPACCodesString() {

/** Checks if the given base is a IUPAC code */
public static boolean isIUPAC(final byte base) {
if (base >= BASES_ARRAY_LENGTH || base < 0) {
return false;
}
return bases[base] != NON_IUPAC_CODE;
}

/** Calculates the fraction of bases that are G/C in the sequence */
public static double calculateGc(final byte[] bases) {
int gcs = 0;
for (int i = 0; i < bases.length; ++i) {
final byte b = bases[i];
if (b == 'C' || b == 'G' || b == 'c' || b == 'g') ++gcs;
for (final byte b : bases) {
if (b == 'C' || b == 'G' || b == 'c' || b == 'g') {
++gcs;
}
}

return gcs / (double) bases.length;
Expand Down Expand Up @@ -1062,4 +1075,22 @@ public static String getSamReadNameFromFastqHeader(final String fastqHeader) {

return readName;
}

/**
* Returns an array of bytes containing random DNA bases
*
* @param random A {@link Random} object to use for drawing randomw bases
* @param length How many bases to return.
* @return an array of random DNA bases of the requested length.
*/
static public byte[] getRandomBases(Random random, final int length) {
ValidationUtils.validateArg(length>=0, "length must be positive");
final byte[] bases = new byte[length];

for (int i = 0; i < length; ++i) {
bases[i] = VALID_BASES_UPPER[random.nextInt(VALID_BASES_UPPER.length)];
}

return bases;
}
}
Loading

0 comments on commit 942e3d6

Please sign in to comment.