From 0d734cdd9235dca128af6d53fbf043f0f17511ee Mon Sep 17 00:00:00 2001 From: Michael Lamkin Date: Fri, 19 May 2023 10:48:29 -0700 Subject: [PATCH] feat: Added ability to set vcftype for reading str files (#214) --- docs/api/data.rst | 1 - haptools/clump.py | 4 +++- haptools/data/genotypes.py | 14 ++++++++++++-- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/docs/api/data.rst b/docs/api/data.rst index b103bf9f..f4844d46 100644 --- a/docs/api/data.rst +++ b/docs/api/data.rst @@ -178,7 +178,6 @@ All of the other methods in the :class:`Genotypes` class are inherited, but the genotypes = data.GenotypesTR.load('tests/data/simple_tr.vcf') # make the first sample have 4 and 7 repeats for the alleles of the fourth variant genotypes.data[0, 3] = (4, 7) - genotypes.write() .. _api-data-genotypestr: diff --git a/haptools/clump.py b/haptools/clump.py index b78a946c..ebba23b3 100644 --- a/haptools/clump.py +++ b/haptools/clump.py @@ -7,7 +7,7 @@ import math import sys -from haptools.data.genotypes import Genotypes, GenotypesVCF, GenotypesTR +from haptools.data.genotypes import Genotypes, GenotypesPLINK, GenotypesVCF, GenotypesTR class Variant: @@ -587,6 +587,8 @@ def clumpstr( strgts.samples = tuple(np.array(strgts.samples)[str_samples]) # Merge STR and SNP GTs + # NOTE if Genotypes is not used and GenotypesVCF is instead it will error because + # GenotypesVCF requires alleles to be present and Genotypes does not gts = Genotypes.merge_variants((snpgts, strgts), fname=None) elif gts_snps: gts = snpgts diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index 0d7a754c..b0fc3c6a 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -829,8 +829,15 @@ class GenotypesTR(Genotypes): 3. POS log: Logger See documentation for :py:attr:`~.Genotypes.log` + vcftype: str + TR vcf type currently being read. + {'auto', 'gangstr', 'advntr', 'hipstr', 'eh', 'popstr'} """ + def __init__(self, fname: Path | str, log: Logger = None, vcftype: str = "auto"): + super().__init__(fname, log) + self.vcftype = vcftype + def _variant_arr(self, record: Variant): """ See documentation for :py:meth:`~.Genotypes._variant_arr` @@ -851,6 +858,7 @@ def load( region: str = None, samples: list[str] = None, variants: set[str] = None, + vcftype: str = "auto", ) -> Genotypes: """ Load STR genotypes from a VCF file @@ -873,7 +881,7 @@ def load( Genotypes A Genotypes object with the data loaded into its properties """ - genotypes = cls(fname) + genotypes = cls(fname, vcftype=vcftype) genotypes.read(region, samples, variants) genotypes.check_phase() return genotypes @@ -895,7 +903,9 @@ def _return_vcf_iter(self, vcf: cyvcf2.VCF, region: str): TRRecord objects yielded from TRRecordHarmonizer. """ vcfiter = vcf(region) - tr_records = trh.TRRecordHarmonizer(vcffile=vcf, vcfiter=vcfiter, region=region) + tr_records = trh.TRRecordHarmonizer( + vcffile=vcf, vcfiter=vcfiter, region=region, vcftype=self.vcftype + ) return tr_records def _return_data(self, variant: trh.TRRecord):