Skip to content

Commit

Permalink
Adding getPathToDataFile default method to FeatureCodec (#1223)
Browse files Browse the repository at this point in the history
* adding a new method FeatureCodec.getPathToDataFile()
  - This method allows a special class of codec where the files that they
      accept are a configuration file and the actual data is stored elsewhere.
  - The default implementation means that existing codecs do not need to be modified in any way
* modifying AbstractFeatureReader and its subclasses to use getPathToDataFile
* downstream tools that implement their own FeatureReader subclasses may
  want to update them to be aware of getPathToDataFile if they want to
  understand these codecs
  • Loading branch information
lbergelson authored Nov 14, 2018
1 parent ee0ad68 commit bc4b3ae
Show file tree
Hide file tree
Showing 10 changed files with 105 additions and 5 deletions.
15 changes: 11 additions & 4 deletions src/main/java/htsjdk/tribble/AbstractFeatureReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,12 @@ public abstract class AbstractFeatureReader<T extends Feature, SOURCE> implement
// the logging destination for this source
//private final static Logger log = Logger.getLogger("BasicFeatureSource");

// the path to underlying data source
/**
* The path to the underlying data file. This must be the input path converted with {@link FeatureCodec#getPathToDataFile(String)}.
*/
String path;


// a wrapper to apply to the raw stream of the Feature file to allow features like prefetching and caching to be injected
final Function<SeekableByteChannel, SeekableByteChannel> wrapper;
// a wrapper to apply to the raw stream of the index file
Expand Down Expand Up @@ -102,8 +105,12 @@ public static <FEATURE extends Feature, SOURCE> AbstractFeatureReader<FEATURE, S
*/
public static <FEATURE extends Feature, SOURCE> AbstractFeatureReader<FEATURE, SOURCE> getFeatureReader(final String featureResource, String indexResource, final FeatureCodec<FEATURE, SOURCE> codec, final boolean requireIndex, Function<SeekableByteChannel, SeekableByteChannel> wrapper, Function<SeekableByteChannel, SeekableByteChannel> indexWrapper) throws TribbleException {
try {
// Test for tabix index
if (methods.isTabix(featureResource, indexResource)) {

// Test for tabix index.
// Note that we use pathToDataFile here when determining the file type, but featureResource when constructing the readers.
// This is because the reader's constructor will convert the path and it needs to be converted exactly once.
final String pathToDataFile = codec.getPathToDataFile(featureResource);
if (methods.isTabix(pathToDataFile, indexResource)) {
if ( ! (codec instanceof AsciiFeatureCodec) )
throw new TribbleException("Tabix indexed files only work with ASCII codecs, but received non-Ascii codec " + codec.getClass().getSimpleName());
return new TabixFeatureReader<>(featureResource, indexResource, (AsciiFeatureCodec) codec, wrapper, indexWrapper);
Expand Down Expand Up @@ -145,7 +152,7 @@ protected AbstractFeatureReader(final String path, final FeatureCodec<T, SOURCE>
protected AbstractFeatureReader(final String path, final FeatureCodec<T, SOURCE> codec,
final Function<SeekableByteChannel, SeekableByteChannel> wrapper,
final Function<SeekableByteChannel, SeekableByteChannel> indexWrapper) {
this.path = path;
this.path = codec.getPathToDataFile(path);
this.codec = codec;
this.wrapper = wrapper;
this.indexWrapper = indexWrapper;
Expand Down
17 changes: 17 additions & 0 deletions src/main/java/htsjdk/tribble/FeatureCodec.java
Original file line number Diff line number Diff line change
Expand Up @@ -156,4 +156,21 @@ public interface FeatureCodec<FEATURE_TYPE extends Feature, SOURCE> {
default public TabixFormat getTabixFormat() {
    // Default behaviour: always fail, naming the concrete codec class in the message.
    // The leading space in the literal keeps the class name from running straight into the
    // rest of the message (previously rendered as e.g. "FooCodecdoes not have ...").
    throw new TribbleException(this.getClass().getSimpleName() + " does not have defined tabix format");
}

/**
 * Resolves the location of the data that this codec actually parses.
 *
 * The default implementation returns {@code path} unchanged, which is the right answer for any codec that
 * reads its Features from the same file it recognizes with {@link #canDecode(String)}.
 *
 * Codecs may override this method when the file they recognize is only a configuration file that defines how
 * to locate and handle the real data file.
 *
 * @param path the path to a file that this codec {@link #canDecode}
 * @return the path to the data file that should be parsed by this codec to produce Features
 * @throws TribbleException overriding codecs may throw if they cannot decode the path
 */
default String getPathToDataFile(final String path) {
    return path;
}
}
2 changes: 1 addition & 1 deletion src/main/java/htsjdk/tribble/TabixFeatureReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ public TabixFeatureReader(final String featureFile, final String indexFile, fina
final Function<SeekableByteChannel, SeekableByteChannel> wrapper,
final Function<SeekableByteChannel, SeekableByteChannel> indexWrapper) throws IOException {
super(featureFile, codec, wrapper, indexWrapper);
tabixReader = new TabixReader(featureFile, indexFile, wrapper, indexWrapper);
tabixReader = new TabixReader(this.path, indexFile, wrapper, indexWrapper);
sequenceNames = new ArrayList<>(tabixReader.getChromosomes());
readHeader();
}
Expand Down
50 changes: 50 additions & 0 deletions src/test/java/htsjdk/tribble/AbstractFeatureReaderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import htsjdk.samtools.FileTruncatedException;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.IOUtilTest;
import htsjdk.samtools.util.RuntimeIOException;
import htsjdk.samtools.util.TestUtil;
import htsjdk.tribble.bed.BEDCodec;
import htsjdk.tribble.bed.BEDFeature;
Expand Down Expand Up @@ -51,6 +52,7 @@ public class AbstractFeatureReaderTest extends HtsjdkTest {

//wrapper which skips the first byte of a file and leaves the rest unchanged
private static final Function<SeekableByteChannel, SeekableByteChannel> WRAPPER = SkippingByteChannel::new;
public static final String REDIRECTING_CODEC_TEST_FILES = "src/test/resources/htsjdk/tribble/AbstractFeatureReaderTest/redirectingCodecTest/";

/**
* Asserts readability and correctness of VCF over HTTP. The VCF is indexed and requires an index.
Expand Down Expand Up @@ -228,4 +230,52 @@ public SeekableByteChannel truncate(long size) throws IOException {
}
}

@DataProvider
public Object[][] getVcfRedirects() {
    // One single-column row per redirect file under REDIRECTING_CODEC_TEST_FILES.
    final Object[] vcfRedirectRow = {REDIRECTING_CODEC_TEST_FILES + "vcf.redirect"};
    final Object[] vcfGzRedirectRow = {REDIRECTING_CODEC_TEST_FILES + "vcf.gz.redirect"};
    return new Object[][]{vcfRedirectRow, vcfGzRedirectRow};
}

/**
 * Test a codec that uses {@link FeatureCodec#getPathToDataFile(String)} in order to specify a data file that's
 * different than the file it identifies with {@link FeatureCodec#canDecode}.
 *
 * A reader opened on the redirect file must report the same header, the same number of records and the same
 * number of query hits as a reader opened directly on {@code dataFiles/test.vcf}.
 *
 * @param vcfRedirect path to a redirect file, supplied by {@link #getVcfRedirects()}
 * @throws IOException declared because creating, reading from and closing the readers may throw it
 */
@Test(dataProvider = "getVcfRedirects")
public void testCodecWithGetPathToDataFile(String vcfRedirect) throws IOException {
    final VcfRedirectCodec vcfRedirectCodec = new VcfRedirectCodec();
    final String vcf = REDIRECTING_CODEC_TEST_FILES + "dataFiles/test.vcf";
    Assert.assertTrue(vcfRedirectCodec.canDecode(vcfRedirect), "should have been able to decode " + vcfRedirect);
    try (FeatureReader<VariantContext> redirectReader = AbstractFeatureReader.getFeatureReader(vcfRedirect, vcfRedirectCodec, false);
         FeatureReader<VariantContext> directReader = AbstractFeatureReader.getFeatureReader(vcf, new VCFCodec(), false)) {
        Assert.assertEquals(redirectReader.getHeader().toString(), directReader.getHeader().toString());

        // Full iteration: the redirected reader must see a non-empty file with as many records as the direct one.
        final int redirectVcfSize = redirectReader.iterator().toList().size();
        Assert.assertTrue(redirectVcfSize > 0, "iterator found " + redirectVcfSize + " records");
        Assert.assertEquals(redirectVcfSize, directReader.iterator().toList().size());

        // Interval query: same check, restricted to 20:1-20000.
        final int redirectQuerySize = redirectReader.query("20", 1, 20000).toList().size();
        // Report the query count here; the message previously printed the full-iteration count by mistake.
        Assert.assertTrue(redirectQuerySize > 0, "query found " + redirectQuerySize + " records");
        Assert.assertEquals(redirectQuerySize, directReader.query("20", 1, 20000).toList().size());
    }
}

/**
 * Test codec whose input is a redirect file: the first line of that file is the path of the VCF to read.
 */
private static class VcfRedirectCodec extends VCFCodec {

    /**
     * Reads the redirect file at {@code path} and returns its first line as the data file location.
     *
     * @throws RuntimeIOException wrapping any {@link IOException} raised while resolving or reading the file
     */
    @Override
    public String getPathToDataFile(String path) {
        final String dataFile;
        try {
            dataFile = Files.readAllLines(IOUtil.getPath(path)).get(0);
        } catch (IOException ioe) {
            throw new RuntimeIOException(ioe);
        }
        return dataFile;
    }

    /**
     * Follows the redirect first, then lets the parent codec decide whether the target can be decoded.
     */
    @Override
    public boolean canDecode(String potentialInput) {
        final String redirectTarget = this.getPathToDataFile(potentialInput);
        return super.canDecode(redirectTarget);
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
##fileformat=VCFv4.1
##fileDate=20090805
##source=myImputationProgramV3.1
##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta
##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="Homo sapiens",taxonomy=x>
##phasing=partial
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
##FILTER=<ID=q10,Description="Quality below 10">
##FILTER=<ID=s50,Description="Less than 50% of samples have data">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3
20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4
20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2
20 1234567 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
src/test/resources/htsjdk/tribble/AbstractFeatureReaderTest/redirectingCodecTest/dataFiles/test.vcf.gz
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
src/test/resources/htsjdk/tribble/AbstractFeatureReaderTest/redirectingCodecTest/dataFiles/test.vcf

0 comments on commit bc4b3ae

Please sign in to comment.