Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Connected the splice site window size to CLI parameters. #8463

Merged
merged 4 commits into from
Aug 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -123,4 +123,12 @@ public abstract class BaseFuncotatorArgumentCollection implements Serializable {
doc = "TSV File containing custom Variant Classification severity map of the form: VARIANT_CLASSIFICATION\tSEV. VARIANT_CLASSIFICATION must match one of the VariantClassification names (" + GencodeFuncotation.VariantClassification.ALL_VC_NAMES + "). SEV is an unsigned integer, where lower is sorted first. When using this option it is HIGHLY recommended you also use the `BEST_EFFECT` transcript selection mode."
)
public GATKPath customVariantClassificationOrderFile = null;

@Argument(
fullName = FuncotatorArgumentDefinitions.SPLICE_SITE_WINDOW_SIZE,
optional = true,
minValue = 0,
doc = "Number of bases on either side of a splice site for a variant to be classified as a SPLICE_SITE variant (default: " + FuncotatorUtils.DEFAULT_SPLICE_SITE_WINDOW_SIZE + ")."
jonn-smith marked this conversation as resolved.
Show resolved Hide resolved
)
public int spliceSiteWindow = FuncotatorUtils.DEFAULT_SPLICE_SITE_WINDOW_SIZE;
}
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,8 @@ public void onTraversalStart() {
funcotatorArgs.lookaheadFeatureCachingInBp,
new FlankSettings(0,0),
true,
funcotatorArgs.minNumBasesForValidSegment
funcotatorArgs.minNumBasesForValidSegment,
funcotatorArgs.spliceSiteWindow
).stream()
.filter(DataSourceFuncotationFactory::isSupportingSegmentFuncotation)
.collect(Collectors.toList());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -793,7 +793,8 @@ public void onTraversalStart() {
funcotatorArgs.lookaheadFeatureCachingInBp,
new FlankSettings(funcotatorArgs.fivePrimeFlankSize, funcotatorArgs.threePrimeFlankSize),
false,
funcotatorArgs.minNumBasesForValidSegment
funcotatorArgs.minNumBasesForValidSegment,
funcotatorArgs.spliceSiteWindow
);

logger.info("Initializing Funcotator Engine...");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ public class FuncotatorArgumentDefinitions {

public static final String CUSTOM_VARIANT_CLASS_ORDER_FILE = "custom-variant-classification-order";

public static final String SPLICE_SITE_WINDOW_SIZE = "splice-site-window-size";

// ------------------------------------------------------------
// Helper Types:

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,11 @@ private FuncotatorUtils() {}

public static final int DEFAULT_MIN_NUM_BASES_FOR_VALID_SEGMENT = 150;

/**
* The default window on either side of splice sites to mark variants as {@link org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation.VariantClassification#SPLICE_SITE}.
*/
public static final int DEFAULT_SPLICE_SITE_WINDOW_SIZE = 2;

private static final Map<String, AminoAcid> tableByCodon;
private static final Map<String, AminoAcid> tableByCode;
private static final Map<String, AminoAcid> tableByLetter;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,8 @@ private static boolean isValidDirectory(final Path p) {
* be annotated with a gencode/transcript datasource.
* Not all datasources support this flag and it is
* ignored for those that don't.
* @param minBasesForValidSegment The minimum number of bases for a segment to be considered valid.
* @param spliceSiteWindowSize The number of bases on either side of a splice site for a variant to be a {@link org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation.VariantClassification#SPLICE_SITE} variant.
* @return A {@link List} of {@link DataSourceFuncotationFactory} given the data source metadata, overrides, and transcript reporting priority information.
*/
public static List<DataSourceFuncotationFactory> createDataSourceFuncotationFactoriesForDataSources(final Map<Path, Properties> dataSourceMetaData,
Expand All @@ -314,7 +316,8 @@ public static List<DataSourceFuncotationFactory> createDataSourceFuncotationFact
final int lookaheadFeatureCachingInBp,
final FlankSettings flankSettings,
final boolean doAttemptSegmentFuncotationForTranscriptDatasources,
final int minBasesForValidSegment) {
final int minBasesForValidSegment,
final int spliceSiteWindowSize) {
Utils.nonNull(dataSourceMetaData);
Utils.nonNull(annotationOverridesMap);
Utils.nonNull(transcriptSelectionMode);
Expand Down Expand Up @@ -353,7 +356,7 @@ public static List<DataSourceFuncotationFactory> createDataSourceFuncotationFact
case GENCODE:
featureInput = createAndRegisterFeatureInputs(path, properties, gatkToolInstance, lookaheadFeatureCachingInBp, GencodeGtfFeature.class, false);
funcotationFactory = DataSourceUtils.createGencodeDataSource(path, properties, annotationOverridesMap, transcriptSelectionMode,
userTranscriptIdSet, featureInput, flankSettings, doAttemptSegmentFuncotationForTranscriptDatasources, minBasesForValidSegment);
userTranscriptIdSet, featureInput, flankSettings, doAttemptSegmentFuncotationForTranscriptDatasources, minBasesForValidSegment, spliceSiteWindowSize);
break;
case VCF:
featureInput = createAndRegisterFeatureInputs(path, properties, gatkToolInstance, lookaheadFeatureCachingInBp, VariantContext.class, false);
Expand Down Expand Up @@ -557,6 +560,8 @@ private static CosmicFuncotationFactory createCosmicDataSource(final Path dataSo
* @param isSegmentFuncotationEnabled Do we want to allow the output Gencode Funcotation Factory to do segment annotations? If false,
* segments will be funcotated with variant classifications of
* {@link org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation.VariantClassification#COULD_NOT_DETERMINE}
* @param minBasesForValidSegment The minimum number of bases for a segment to be considered valid.
* @param spliceSiteWindowSize The number of bases on either side of a splice site for a variant to be a {@link org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation.VariantClassification#SPLICE_SITE} variant.
* @return A new {@link GencodeFuncotationFactory} based on the given data source file information, field overrides map, and transcript information.
*/
private static GencodeFuncotationFactory createGencodeDataSource(final Path dataSourceFile,
Expand All @@ -567,7 +572,8 @@ private static GencodeFuncotationFactory createGencodeDataSource(final Path data
final FeatureInput<? extends Feature> featureInput,
final FlankSettings flankSettings,
final boolean isSegmentFuncotationEnabled,
final int minBasesForValidSegment) {
final int minBasesForValidSegment,
final int spliceSiteWindowSize) {
Utils.nonNull(dataSourceFile);
Utils.nonNull(dataSourceProperties);
Utils.nonNull(annotationOverridesMap);
Expand Down Expand Up @@ -596,7 +602,8 @@ private static GencodeFuncotationFactory createGencodeDataSource(final Path data
isB37,
ncbiBuildVersion,
isSegmentFuncotationEnabled,
minBasesForValidSegment
minBasesForValidSegment,
spliceSiteWindowSize
);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,6 @@ public class GencodeFuncotationFactory extends DataSourceFuncotationFactory {
private static final String LOCAL_GENCODE_TRANSCRIPT_TMP_DIR_PREFIX = "localGencodeTranscriptFastaFolder";
private static final String LOCAL_GENCODE_TRANSCRIPT_FILE_BASE_NAME = "gencodeTranscriptFastaFile";

/**
* The window around splice sites to mark variants as {@link org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation.VariantClassification#SPLICE_SITE}.
*/
private static final int spliceSiteVariantWindowBases = 2;

/**
* Number of bases to the left and right of a variant in which to calculate the GC content.
*/
Expand Down Expand Up @@ -222,6 +217,11 @@ public class GencodeFuncotationFactory extends DataSourceFuncotationFactory {
*/
private String ncbiBuildVersion = null;

/**
* The window on either side of splice sites to mark variants as {@link org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation.VariantClassification#SPLICE_SITE}.
*/
private int spliceSiteVariantWindowBases = FuncotatorUtils.DEFAULT_SPLICE_SITE_WINDOW_SIZE;

/**
* Comparator to be used when sorting {@link Funcotation}s created by this {@link GencodeFuncotationFactory}.
* Will be either {@link TranscriptSelectionMode.BestEffectGencodeFuncotationComparator} or {@link TranscriptSelectionMode.CanonicalGencodeFuncotationComparator}.
Expand Down Expand Up @@ -352,6 +352,41 @@ public GencodeFuncotationFactory(final Path gencodeTranscriptFastaFilePath,
final boolean isSegmentFuncotationEnabled,
final int minBasesForValidSegment) {

this(gencodeTranscriptFastaFilePath, version, name, transcriptSelectionMode, userRequestedTranscripts,
annotationOverrides, mainFeatureInput, flankSettings, isDataSourceB37, ncbiBuildVersion,
isSegmentFuncotationEnabled, minBasesForValidSegment, FuncotatorUtils.DEFAULT_SPLICE_SITE_WINDOW_SIZE);
}

/**
* Create a {@link GencodeFuncotationFactory}.
*
* @param gencodeTranscriptFastaFilePath {@link Path} to the FASTA file containing the sequences of all transcripts in the Gencode data source.
* @param version The version {@link String} of Gencode from which {@link Funcotation}s will be made.
* @param name A {@link String} containing the name of this {@link GencodeFuncotationFactory}.
* @param transcriptSelectionMode The {@link TranscriptSelectionMode} by which representative/verbose transcripts will be chosen for overlapping variants.
* @param userRequestedTranscripts A {@code Set<String>} containing Gencode TranscriptIDs that the user requests to be annotated with priority over all other transcripts for overlapping variants.
* @param annotationOverrides A {@code LinkedHashMap<String,String>} containing user-specified overrides for specific {@link Funcotation}s.
* @param mainFeatureInput The backing {@link FeatureInput} for this {@link GencodeFuncotationFactory}, from which all {@link Funcotation}s will be created.
* @param flankSettings Settings object containing our 5'/3' flank sizes
* @param isDataSourceB37 If {@code true}, indicates that the data source behind this {@link GencodeFuncotationFactory} contains B37 data.
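* @param ncbiBuildVersion The NCBI build version for this {@link GencodeFuncotationFactory} (can be found in the datasource config file)
* @param isSegmentFuncotationEnabled If {@code true}, this {@link GencodeFuncotationFactory} is allowed to create segment funcotations.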
* @param minBasesForValidSegment The minimum number of bases for a segment to be considered valid.
* @param spliceSiteWindowSize The number of bases on either side of a splice site for a variant to be a {@link org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation.VariantClassification#SPLICE_SITE} variant.
*/
public GencodeFuncotationFactory(final Path gencodeTranscriptFastaFilePath,
final String version,
final String name,
final TranscriptSelectionMode transcriptSelectionMode,
final Set<String> userRequestedTranscripts,
final LinkedHashMap<String, String> annotationOverrides,
final FeatureInput<? extends Feature> mainFeatureInput,
final FlankSettings flankSettings,
final boolean isDataSourceB37,
final String ncbiBuildVersion,
final boolean isSegmentFuncotationEnabled,
final int minBasesForValidSegment,
final int spliceSiteWindowSize) {

super(mainFeatureInput, minBasesForValidSegment);

// Set up our local transcript fasta file.
Expand All @@ -375,6 +410,8 @@ public GencodeFuncotationFactory(final Path gencodeTranscriptFastaFilePath,

this.isSegmentFuncotationEnabled = isSegmentFuncotationEnabled;

this.spliceSiteVariantWindowBases = spliceSiteWindowSize;

// Go through each requested transcript and remove the version numbers from them if they exist:
this.userRequestedTranscripts = new HashSet<>();
for ( final String transcript : userRequestedTranscripts ) {
Expand Down Expand Up @@ -1270,7 +1307,9 @@ private GencodeFuncotation createCodingRegionFuncotationForProteinCodingFeature(
.setProteinChange(proteinChange);

// Set the Variant Classification:
final GencodeFuncotation.VariantClassification varClass = createVariantClassification(variant, altAllele, variantType, exon, transcript.getExons().size(), sequenceComparison);
final GencodeFuncotation.VariantClassification varClass = createVariantClassification(
variant, altAllele, variantType, exon, transcript.getExons().size(), sequenceComparison, spliceSiteVariantWindowBases
);
final GencodeFuncotation.VariantClassification secondaryVarClass;
gencodeFuncotationBuilder.setVariantClassification(varClass);
if ( varClass == GencodeFuncotation.VariantClassification.SPLICE_SITE ) {
Expand Down Expand Up @@ -1354,6 +1393,7 @@ else if ( exon.getStopCodon() != null ) {
* @param exon The {@link GencodeGtfExonFeature} in which the given {@code variant} occurs.
* @param numberOfExonsInTranscript The number of exons in the transcript in which the given {@code variant} occurs. (Must be > 0).
* @param sequenceComparison The {@link org.broadinstitute.hellbender.tools.funcotator.SequenceComparison} for the given {@code variant}.
* @param spliceSiteWindowBases The window on either side of splice sites to mark variants as {@link org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation.VariantClassification#SPLICE_SITE}.
* @return A {@link org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation.VariantClassification} based on the given {@code allele}, {@code variant}, {@code exon}, and {@code sequenceComparison}.
*/
@VisibleForTesting
Expand All @@ -1362,7 +1402,8 @@ static GencodeFuncotation.VariantClassification createVariantClassification(fina
final GencodeFuncotation.VariantType variantType,
final GencodeGtfExonFeature exon,
final int numberOfExonsInTranscript,
final SequenceComparison sequenceComparison ){
final SequenceComparison sequenceComparison,
final int spliceSiteWindowBases){
Utils.nonNull(variant);
Utils.nonNull(altAllele);
Utils.nonNull(variantType);
Expand Down Expand Up @@ -1422,12 +1463,17 @@ static GencodeFuncotation.VariantClassification createVariantClassification(fina
final int adjustedExonStart = adjustLocusForInsertion(exon.getStart(), variant, altAllele, realVariationInterval);
final int adjustedExonEnd = adjustLocusForInsertion(exon.getEnd(), variant, altAllele, realVariationInterval);

// With a window size of 0, the variant must overlap the exon start itself rather than
// fall within a base of it.  In that case we must NOT subtract one from the interval end,
// because doing so would put the end before the start and create an invalid interval.
// The adjuster is therefore 1 only when the window size is greater than 0.
final int intervalEndCoordAdjuster = spliceSiteWindowBases > 0 ? 1 : 0;

if ( doLeftOverlapCheck ) {
final SimpleInterval leftSideInterval = new SimpleInterval(exon.getContig(), adjustedExonStart - spliceSiteVariantWindowBases, adjustedExonStart + (spliceSiteVariantWindowBases-1));
final SimpleInterval leftSideInterval = new SimpleInterval(exon.getContig(), adjustedExonStart - spliceSiteWindowBases, adjustedExonStart + (spliceSiteWindowBases-intervalEndCoordAdjuster));
overlapsLeft = leftSideInterval.overlaps(realVariationInterval);
}
if ( doRightOverlapCheck ) {
final SimpleInterval rightSideInterval = new SimpleInterval(exon.getContig(), adjustedExonEnd - spliceSiteVariantWindowBases + 1, adjustedExonEnd + (spliceSiteVariantWindowBases-1) + 1);
final SimpleInterval rightSideInterval = new SimpleInterval(exon.getContig(), adjustedExonEnd - spliceSiteWindowBases + 1, adjustedExonEnd + (spliceSiteWindowBases-intervalEndCoordAdjuster) + 1);
overlapsRight = rightSideInterval.overlaps(realVariationInterval);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ public void testGetFuncotationFactoriesAndCreateFuncotationMapForVariant(final F
FuncotatorArgumentDefinitions.LOOKAHEAD_CACHE_IN_BP_DEFAULT_VALUE,
new FlankSettings(0, 0),
false,
FuncotatorUtils.DEFAULT_MIN_NUM_BASES_FOR_VALID_SEGMENT)
FuncotatorUtils.DEFAULT_MIN_NUM_BASES_FOR_VALID_SEGMENT,
FuncotatorUtils.DEFAULT_SPLICE_SITE_WINDOW_SIZE)
);

for (int i = 0; i < entireVcf.getRight().size(); i++) {
Expand Down
Loading