From 94e1f61f90f3751405b176e8725539e5e0b73ae8 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 18 Oct 2022 14:38:19 -0700 Subject: [PATCH 01/12] remove aaf field from Genotypes objects see https://github.com/CAST-genomics/haptools/issues/19#issuecomment-1126651795 --- docs/api/data.rst | 4 ++-- haptools/data/genotypes.py | 44 +++---------------------------------- haptools/data/haplotypes.py | 2 +- haptools/transform.py | 2 +- tests/test_data.py | 28 +++++------------------ 5 files changed, 12 insertions(+), 68 deletions(-) diff --git a/docs/api/data.rst b/docs/api/data.rst index de7faa5a..604fdeef 100644 --- a/docs/api/data.rst +++ b/docs/api/data.rst @@ -59,7 +59,7 @@ Properties ********** The ``data`` property of a :class:`Genotypes` object is a numpy array representing the genotype matrix. Rows of the array are samples and columns are variants. Each entry in the matrix is a tuple of values -- one for each chromosome. Each value is an integer denoting the index of the allele (0 for REF, 1 for the first ALT allele, 2 for the next ALT allele, etc). -There are two additional properties that contain variant and sample metadata. The ``variants`` property is a numpy structured array and the ``samples`` property is a simple tuple of sample IDs. The ``variants`` structured array has four named columns: "id", "chrom", "pos", and "aaf" (alternate allele frequency). +There are two additional properties that contain variant and sample metadata. The ``variants`` property is a numpy structured array and the ``samples`` property is a simple tuple of sample IDs. The ``variants`` structured array has three named columns: "id" (variant ID), "chrom" (chromosome name), and "pos" (chromosomal position). Reading a file ************** @@ -143,7 +143,7 @@ By default, the ``subset()`` method returns a new :class:`Genotypes` instance. T GenotypesRefAlt +++++++++++++++ -The :class:`Genotypes` class can be easily *extended* (sub-classed) to load extra fields into the ``variants`` structured array. The :class:`GenotypesRefAlt` class is an example of this where I extended the :class:`Genotypes` class to add REF and ALT fields from the VCF to the columns of the structured array. So the ``variants`` array will have named columns: "id", "chrom", "pos", "aaf", "ref", and "alt". +The :class:`Genotypes` class can be easily *extended* (sub-classed) to load extra fields into the ``variants`` structured array. The :class:`GenotypesRefAlt` class is an example of this where I extended the :class:`Genotypes` class to add REF and ALT fields from the VCF to the columns of the structured array. So the ``variants`` array will have named columns: "id", "chrom", "pos", "ref", and "alt". All of the other methods in the :class:`Genotypes` class are inherited, but the :class:`GenotypesRefAlt` class implements an additional method ``write()`` for dumping the contents of the class to the provided file. diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index fab66675..1c429853 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -32,7 +32,6 @@ class Genotypes(Data): 1. ID 2. CHROM 3. POS - 4. AAF: allele freq of alternate allele (or MAF if to_MAC() is called) log: Logger A logging instance for recording debug statements _prephased : bool @@ -61,7 +60,6 @@ def __init__(self, fname: Path | str, log: Logger = None): ("id", "U50"), ("chrom", "U10"), ("pos", np.uint32), - ("aaf", np.float64), ], ) self._prephased = False @@ -219,7 +217,7 @@ def _variant_arr(self, record: Variant): A row from the :py:attr:`~.Genotypes.variants` array """ return np.array( - (record.ID, record.CHROM, record.POS, record.aaf), + (record.ID, record.CHROM, record.POS), dtype=self.variants.dtype, ) @@ -504,38 +502,6 @@ def check_phase(self): # remove the last dimension that contains the phase info self.data = self.data[:, :, :2] - def to_MAC(self): - """ - Convert the ALT count GT matrix into a matrix of minor allele counts - - This function modifies :py:attr:`~.Genotypes.data` in-place - - It also changes the 'aaf' record in :py:attr:`~.Genotypes.variants` to 'maf' - - Raises - ------ - AssertionError - If the matrix has already been converted - """ - if self.variants.dtype.names[3] == "maf": - self.log.warning( - "The matrix already counts instances of the minor allele rather than " - "the ALT allele." - ) - return - need_conversion = self.variants["aaf"] > 0.5 - # flip the count on the variants that have an alternate allele frequency - # above 0.5 - self.data[:, need_conversion, :2] = ~self.data[:, need_conversion, :2] - # also encode an MAF instead of an AAF in self.variants - self.variants["aaf"][need_conversion] = ( - 1 - self.variants["aaf"][need_conversion] - ) - # replace 'aaf' with 'maf' in the matrix - self.variants.dtype.names = [ - (x, "maf")[x == "aaf"] for x in self.variants.dtype.names - ] - class GenotypesRefAlt(Genotypes): """ @@ -556,9 +522,8 @@ class GenotypesRefAlt(Genotypes): 1. ID 2. CHROM 3. POS - 4. AAF: allele freq of alternate allele (or MAF if to_MAC() is called) - 5. REF - 6. ALT + 4. REF + 5. ALT log: Logger See documentation for :py:attr:`~.Genotypes.log` """ @@ -571,7 +536,6 @@ def __init__(self, fname: Path | str, log: Logger = None): ("id", "U50"), ("chrom", "U10"), ("pos", np.uint32), - ("aaf", np.float64), ("ref", "U100"), ("alt", "U100"), ], @@ -586,7 +550,6 @@ def _variant_arr(self, record: Variant): record.ID, record.CHROM, record.POS, - record.aaf, record.REF, record.ALT[0], ), @@ -795,7 +758,6 @@ def _variant_arr( record[cid["ID"]], record[cid["CHROM"]], record[cid["POS"]], - 0.5, record[cid["REF"]], record[cid["ALT"]], ), diff --git a/haptools/data/haplotypes.py b/haptools/data/haplotypes.py index a5fb8c97..3f247a4c 100644 --- a/haptools/data/haplotypes.py +++ b/haptools/data/haplotypes.py @@ -1080,7 +1080,7 @@ def transform( hap_gts = GenotypesRefAlt(fname=None, log=self.log) hap_gts.samples = gts.samples hap_gts.variants = np.array( - [(hap.id, hap.chrom, hap.start, 0, "A", "T") for hap in self.data.values()], + [(hap.id, hap.chrom, hap.start, "A", "T") for hap in self.data.values()], dtype=hap_gts.variants.dtype, ) # build a fast data structure for querying the alleles in each haplotype: diff --git a/haptools/transform.py b/haptools/transform.py index a1b33cab..2f3ae2be 100644 --- a/haptools/transform.py +++ b/haptools/transform.py @@ -97,7 +97,7 @@ def transform( hap_gts = data.GenotypesRefAlt(fname=None, log=self.log) hap_gts.samples = gts.samples hap_gts.variants = np.array( - [(hap.id, hap.chrom, hap.start, 0, "A", "T") for hap in self.data.values()], + [(hap.id, hap.chrom, hap.start, "A", "T") for hap in self.data.values()], dtype=hap_gts.variants.dtype, ) # build a fast data structure for querying the alleles in each haplotype: diff --git a/tests/test_data.py b/tests/test_data.py index 518b9026..3f0273e1 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -39,17 +39,12 @@ def _get_fake_genotypes(self): gts.data = self._get_expected_genotypes() gts.variants = np.array( [ - ("1:10114:T:C", "1", 10114, 0), - ("1:10116:A:G", "1", 10116, 0.6), - ("1:10117:C:A", "1", 10117, 0), - ("1:10122:A:G", "1", 10122, 0), - ], - dtype=[ - ("id", "U50"), - ("chrom", "U10"), - ("pos", np.uint32), - ("aaf", np.float64), + ("1:10114:T:C", "1", 10114), + ("1:10116:A:G", "1", 10116), + ("1:10117:C:A", "1", 10117), + ("1:10122:A:G", "1", 10122), ], + dtype=gts.variants.dtype, ) gts.samples = ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101") gts.check_phase() @@ -117,18 +112,6 @@ def test_load_genotypes(self, caplog): gts.check_phase() assert len(caplog.records) > 0 and caplog.records[0].levelname == "WARNING" - # convert the matrix of alt allele counts to a matrix of minor allele counts - assert gts.variants["aaf"][1] == 0.6 - gts.to_MAC() - expected[:, 1, :] = ~expected[:, 1, :] - np.testing.assert_allclose(gts.data, expected) - assert gts.variants["maf"][1] == 0.4 - - # try to do the MAC conversion again - it should warn b/c we've already done it - caplog.clear() - gts.to_MAC() - assert len(caplog.records) > 0 and caplog.records[0].levelname == "WARNING" - def test_load_genotypes_example(self): samples = ( "HG00096", @@ -632,7 +615,6 @@ def test_read_subset(self): haps.read(region="21:26928472-26941960") assert expected == haps.data - def test_read_extras(self): # what do we expect to see from the simphenotype.hap file? expected = { From 156b2edbd24abad62679fb988c7c18bd1d43ab92 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 18 Oct 2022 16:29:01 -0700 Subject: [PATCH 02/12] add method to check af --- haptools/data/genotypes.py | 45 ++++++++++++++++++++++++++++++++++++++ tests/test_data.py | 22 +++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index 1c429853..4a36255e 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -502,6 +502,51 @@ def check_phase(self): # remove the last dimension that contains the phase info self.data = self.data[:, :, :2] + def check_af(self, threshold: float = None, discard_also: bool = False): + """ + Return the allele frequency of the REF allele + + If a threshold is specified, raise a ValueError if the frequency exceeds it + + Parameters + ---------- + threshold: float, optional + If any variant has an *alternate* allele frequency rarer than the + provided threshold, raise a ValueError + discard_also : bool, optional + If True, discard any variants that would otherwise cause a ValueError + + This parameter will be ignored if a threshold is not specified + + Raises + ------ + ValueError + If any variant exceeds the provided threshold allele frequency + """ + num_strands = 2 * self.data.shape[0] + ref_af = 1 - self.data[:, :, :2].astype(np.bool_).sum(axis=(0, 2))/num_strands + if threshold is None: + return ref_af + rare_variants = (ref_af > (1 - threshold)) | (ref_af < threshold) + if np.any(rare_variants): + var_idx = np.nonzero(rare_variants)[0] + if discard_also: + original_num_variants = len(self.variants) + self.data = np.delete(self.data, var_idx, axis=1) + self.variants = np.delete(self.variants, var_idx) + self.log.info( + "Ignoring missing genotypes from " + f"{original_num_variants - len(self.variants)} samples" + ) + self._var_idx = None + else: + raise ValueError( + "Variant with ID {} at POS {}:{} has REF frequency {}".format( + *tuple(self.variants[var_idx[0]])[:3], ref_af[var_idx[0]] + ) + ) + return ref_af + class GenotypesRefAlt(Genotypes): """ diff --git a/tests/test_data.py b/tests/test_data.py index 3f0273e1..0d96d239 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -232,6 +232,28 @@ def test_subset_genotypes(self): np.testing.assert_allclose(gts_sub.data, expected_data) assert np.array_equal(gts_sub.variants, expected_variants) + def test_check_af(self): + gts = self._get_fake_genotypes() + expected_af = np.array([1, 0.4, 1, 1]) + + af = gts.check_af() + np.testing.assert_allclose(af, expected_af) + + with pytest.raises(ValueError) as info: + gts.check_af(threshold=0.01) + assert ( + str(info.value) + == "Variant with ID 1:10114:T:C at POS 1:10114 has REF frequency 1.0" + ) + + gts.check_af(threshold=0, discard_also=True) + assert len(gts.variants) == 4 + assert gts.data.shape[1] == 4 + + gts.check_af(threshold=0.01, discard_also=True) + assert len(gts.variants) == 1 + assert gts.data.shape[1] == 1 + class TestGenotypesPLINK: def _get_fake_genotypes_plink(self): From c77bb1eee8c1b4c93abafd749908c378c01ecd64 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 19 Oct 2022 10:59:13 -0700 Subject: [PATCH 03/12] check MAF instead of non-REF AF --- haptools/data/genotypes.py | 39 +++++++++++++++++++++++--------------- tests/test_data.py | 20 +++++++++++-------- 2 files changed, 36 insertions(+), 23 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index 4a36255e..fd18b114 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -502,17 +502,23 @@ def check_phase(self): # remove the last dimension that contains the phase info self.data = self.data[:, :, :2] - def check_af(self, threshold: float = None, discard_also: bool = False): + def check_maf(self, threshold: float = None, discard_also: bool = False): """ - Return the allele frequency of the REF allele + Return the minor allele frequency of each variant - If a threshold is specified, raise a ValueError if the frequency exceeds it + Raise a ValueError if any variant's MAF doesn't satisfy the threshold, if + one is provided + + .. note:: + You should call :py:meth:`~.Genotypes.check_missing` and + :py:meth:`~.Genotypes.check_biallelic` before executing this method, for + best results. Otherwise, the frequencies may be computed incorrectly. Parameters ---------- threshold: float, optional - If any variant has an *alternate* allele frequency rarer than the - provided threshold, raise a ValueError + If a variant has a minor allele frequency (MAF) rarer than this threshold, + raise a ValueError discard_also : bool, optional If True, discard any variants that would otherwise cause a ValueError @@ -521,29 +527,32 @@ def check_af(self, threshold: float = None, discard_also: bool = False): Raises ------ ValueError - If any variant exceeds the provided threshold allele frequency + If any variant does not meet the provided threshold minor allele frequency """ num_strands = 2 * self.data.shape[0] - ref_af = 1 - self.data[:, :, :2].astype(np.bool_).sum(axis=(0, 2))/num_strands + # TODO: make this work for multi-allelic variants, too? + ref_af = self.data[:, :, :2].astype(np.bool_).sum(axis=(0, 2)) / num_strands + maf = np.array([ref_af, 1 - ref_af]).min(axis=0) if threshold is None: - return ref_af - rare_variants = (ref_af > (1 - threshold)) | (ref_af < threshold) + return maf + rare_variants = maf < threshold if np.any(rare_variants): - var_idx = np.nonzero(rare_variants)[0] + idx = np.nonzero(rare_variants)[0] if discard_also: original_num_variants = len(self.variants) - self.data = np.delete(self.data, var_idx, axis=1) - self.variants = np.delete(self.variants, var_idx) + self.data = np.delete(self.data, idx, axis=1) + self.variants = np.delete(self.variants, idx) self.log.info( "Ignoring missing genotypes from " f"{original_num_variants - len(self.variants)} samples" ) self._var_idx = None else: + vals = tuple(self.variants[idx[0]])[:3] + (maf[idx[0]], threshold) + # raise error if the minor allele frequency of a variant does not meet + # the threshold raise ValueError( - "Variant with ID {} at POS {}:{} has REF frequency {}".format( - *tuple(self.variants[var_idx[0]])[:3], ref_af[var_idx[0]] - ) + "Variant with ID {} at POS {}:{} has MAF {} < {}".format(*vals) ) return ref_af diff --git a/tests/test_data.py b/tests/test_data.py index 0d96d239..f9ce47da 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -232,28 +232,32 @@ def test_subset_genotypes(self): np.testing.assert_allclose(gts_sub.data, expected_data) assert np.array_equal(gts_sub.variants, expected_variants) - def test_check_af(self): + def test_check_maf(self): gts = self._get_fake_genotypes() - expected_af = np.array([1, 0.4, 1, 1]) + expected_maf = np.array([0, 0.4, 0, 0]) - af = gts.check_af() - np.testing.assert_allclose(af, expected_af) + maf = gts.check_maf() + np.testing.assert_allclose(maf, expected_maf) with pytest.raises(ValueError) as info: - gts.check_af(threshold=0.01) + gts.check_maf(threshold=0.01) assert ( str(info.value) - == "Variant with ID 1:10114:T:C at POS 1:10114 has REF frequency 1.0" + == "Variant with ID 1:10114:T:C at POS 1:10114 has MAF 0.0 < 0.01" ) - gts.check_af(threshold=0, discard_also=True) + gts.check_maf(threshold=0, discard_also=True) assert len(gts.variants) == 4 assert gts.data.shape[1] == 4 - gts.check_af(threshold=0.01, discard_also=True) + gts.check_maf(threshold=0.01, discard_also=True) assert len(gts.variants) == 1 assert gts.data.shape[1] == 1 + gts.check_maf(threshold=0.5, discard_also=True) + assert len(gts.variants) == 0 + assert gts.data.shape[1] == 0 + class TestGenotypesPLINK: def _get_fake_genotypes_plink(self): From 2ba000b9f7b1295bd2579d856d9c20aacd2f4212 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 19 Oct 2022 11:28:13 -0700 Subject: [PATCH 04/12] add flag to warn instead of raise ValueError --- haptools/data/genotypes.py | 30 ++++++++++++++++++++++-------- tests/test_data.py | 22 ++++++++++++++-------- 2 files changed, 36 insertions(+), 16 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index fd18b114..3dc94887 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -502,9 +502,14 @@ def check_phase(self): # remove the last dimension that contains the phase info self.data = self.data[:, :, :2] - def check_maf(self, threshold: float = None, discard_also: bool = False): + def check_maf( + self, + threshold: float = None, + discard_also: bool = False, + warn_only: bool = False, + ) -> npt.NDArray[np.float64]: """ - Return the minor allele frequency of each variant + Check the minor allele frequency of each variant Raise a ValueError if any variant's MAF doesn't satisfy the threshold, if one is provided @@ -523,11 +528,17 @@ def check_maf(self, threshold: float = None, discard_also: bool = False): If True, discard any variants that would otherwise cause a ValueError This parameter will be ignored if a threshold is not specified + warn_only: bool, optional + Just raise a warning instead of a ValueError Raises ------ ValueError If any variant does not meet the provided threshold minor allele frequency + + Returns + ------- + The minor allele frequency of each variant """ num_strands = 2 * self.data.shape[0] # TODO: make this work for multi-allelic variants, too? @@ -542,6 +553,7 @@ def check_maf(self, threshold: float = None, discard_also: bool = False): original_num_variants = len(self.variants) self.data = np.delete(self.data, idx, axis=1) self.variants = np.delete(self.variants, idx) + maf = np.delete(maf, idx) self.log.info( "Ignoring missing genotypes from " f"{original_num_variants - len(self.variants)} samples" @@ -549,12 +561,14 @@ def check_maf(self, threshold: float = None, discard_also: bool = False): self._var_idx = None else: vals = tuple(self.variants[idx[0]])[:3] + (maf[idx[0]], threshold) - # raise error if the minor allele frequency of a variant does not meet - # the threshold - raise ValueError( - "Variant with ID {} at POS {}:{} has MAF {} < {}".format(*vals) - ) - return ref_af + msg = "Variant with ID {} at POS {}:{} has MAF {} < {}".format(*vals) + if warn_only: + self.log.warning(msg) + else: + # raise error if the minor allele frequency of a variant does not + # meet the threshold + raise ValueError(msg) + return maf class GenotypesRefAlt(Genotypes): diff --git a/tests/test_data.py b/tests/test_data.py index f9ce47da..f88444fc 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -232,29 +232,35 @@ def test_subset_genotypes(self): np.testing.assert_allclose(gts_sub.data, expected_data) assert np.array_equal(gts_sub.variants, expected_variants) - def test_check_maf(self): + def test_check_maf(self, caplog): gts = self._get_fake_genotypes() expected_maf = np.array([0, 0.4, 0, 0]) maf = gts.check_maf() np.testing.assert_allclose(maf, expected_maf) + msg = "Variant with ID 1:10114:T:C at POS 1:10114 has MAF 0.0 < 0.01" with pytest.raises(ValueError) as info: gts.check_maf(threshold=0.01) - assert ( - str(info.value) - == "Variant with ID 1:10114:T:C at POS 1:10114 has MAF 0.0 < 0.01" - ) + assert str(info.value) == msg - gts.check_maf(threshold=0, discard_also=True) + # test just the warning system + caplog.clear() + maf = gts.check_maf(threshold=0.01, warn_only=True) + assert len(caplog.records) > 0 and caplog.records[0].levelname == "WARNING" + + maf = gts.check_maf(threshold=0, discard_also=True) + np.testing.assert_allclose(maf, expected_maf) assert len(gts.variants) == 4 assert gts.data.shape[1] == 4 - gts.check_maf(threshold=0.01, discard_also=True) + maf = gts.check_maf(threshold=0.01, discard_also=True) + np.testing.assert_allclose(maf, expected_maf[1]) assert len(gts.variants) == 1 assert gts.data.shape[1] == 1 - gts.check_maf(threshold=0.5, discard_also=True) + maf = gts.check_maf(threshold=0.5, discard_also=True) + np.testing.assert_allclose(maf, expected_maf[:0]) assert len(gts.variants) == 0 assert gts.data.shape[1] == 0 From 87df5c8311aa71a9279aa57c5b39af4cca46d91d Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 19 Oct 2022 11:28:57 -0700 Subject: [PATCH 05/12] fix auto-generated docs --- haptools/data/breakpoints.py | 2 +- haptools/data/genotypes.py | 6 +++--- haptools/index.py | 2 +- haptools/ld.py | 2 +- haptools/transform.py | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/haptools/data/breakpoints.py b/haptools/data/breakpoints.py index 4497ce99..35660d55 100644 --- a/haptools/data/breakpoints.py +++ b/haptools/data/breakpoints.py @@ -269,7 +269,7 @@ def population_array( A subset of samples to include in the output, ordered by their given order Returns - ------ + ------- npt.NDArray An array of shape: samples x variants x 2 diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index 3dc94887..a48fe002 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -297,15 +297,15 @@ def index(self, samples: bool = True, variants: bool = True): Call this function once to improve the amortized time-complexity of look-ups of samples and variants by their ID. This is useful if you intend to later subset by a set of samples or variant IDs. - The time complexity of this function should be roughly O(n+m) if both - parameters are True. Otherwise, it will be either O(n) or O(m). + The time complexity of this function should be roughly O(n+p) if both + parameters are True. Otherwise, it will be either O(n) or O(p). Parameters ---------- samples: bool, optional Whether to index the samples for fast loop-up. Adds complexity O(n). variants: bool, optional - Whether to index the variants for fast look-up. Adds complexity O(m). + Whether to index the variants for fast look-up. Adds complexity O(p). Raises ------ diff --git a/haptools/index.py b/haptools/index.py index 6e7f6a4e..5b2eaed7 100644 --- a/haptools/index.py +++ b/haptools/index.py @@ -33,7 +33,7 @@ def index_haps( haplotypes: Path, sort: bool = False, output: Path = None, - log: Logger = None, + log: logging.Logger = None, ): """ Takes in an unsorted .hap file and outputs it as a .gz and a .tbi file diff --git a/haptools/ld.py b/haptools/ld.py index de175205..27ae2474 100644 --- a/haptools/ld.py +++ b/haptools/ld.py @@ -57,7 +57,7 @@ def calc_ld( discard_missing: bool = False, from_gts: bool = False, output: Path = Path("/dev/stdout"), - log: Logger = None, + log: logging.Logger = None, ): """ Creates a VCF composed of haplotypes diff --git a/haptools/transform.py b/haptools/transform.py index 2f3ae2be..c2c926dd 100644 --- a/haptools/transform.py +++ b/haptools/transform.py @@ -510,7 +510,7 @@ def transform_haps( discard_missing: bool = False, ancestry: bool = False, output: Path = Path("-"), - log: Logger = None, + log: logging.Logger = None, ): """ Creates a VCF composed of haplotypes From 7f7867a4d5186176b99f36ec7ec26959af5ce95c Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 19 Oct 2022 11:32:51 -0700 Subject: [PATCH 06/12] refmt admix_storage with black --- haptools/admix_storage.py | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/haptools/admix_storage.py b/haptools/admix_storage.py index 3aa306e0..912ac95c 100644 --- a/haptools/admix_storage.py +++ b/haptools/admix_storage.py @@ -11,7 +11,7 @@ def __repr__(self): def __str__(self): return f"(Chrom {self.chrom}, {self.cm_map_pos} cM, {self.bp_map_pos} bp)" - + def get_chrom(self): return self.chrom @@ -30,12 +30,18 @@ class HaplotypeSegment: def __init__(self, pop_num, chrom, end_coord, end_pos): """ Note the beginning of the segment is inferred based on previous - segments stored throughout the simulation process. - Arguments - pop_num - population label - chrom - chromosome the haplotype segment lies on. - end_coord - Ending coordinate in bp of the haplotype segment - end_pos - Ending coordinate in centimorgans of the hap segment + segments stored throughout the simulation process. + + Parameters + ---------- + pop_num + population label + chrom + chromosome the haplotype segment lies on. + end_coord + Ending coordinate in bp of the haplotype segment + end_pos + Ending coordinate in centimorgans of the hap segment """ self.pop_num = pop_num self.chrom = chrom @@ -43,12 +49,16 @@ def __init__(self, pop_num, chrom, end_coord, end_pos): self.end_pos = end_pos def __repr__(self): - return f"(Population {self.pop_num}, chrom {self.chrom}, " + \ - f"end_coord {self.end_coord}, end_pos {self.end_pos})" + return ( + f"(Population {self.pop_num}, chrom {self.chrom}, " + + f"end_coord {self.end_coord}, end_pos {self.end_pos})" + ) def __str__(self): - return f"Population {self.pop_num}, chrom {self.chrom}, " + \ - f"end_coord {self.end_coord}, end_pos {self.end_pos}" + return ( + f"Population {self.pop_num}, chrom {self.chrom}, " + + f"end_coord {self.end_coord}, end_pos {self.end_pos}" + ) def get_end_pos(self): return self.end_pos From e88c3295728348e03989421f41c0147c3c0d7050 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 19 Oct 2022 12:14:27 -0700 Subject: [PATCH 07/12] make region in validate_params() an optional arg --- haptools/sim_genotype.py | 2 +- tests/test_outputvcf.py | 43 ++++++++++++++++++---------------------- 2 files changed, 20 insertions(+), 25 deletions(-) diff --git a/haptools/sim_genotype.py b/haptools/sim_genotype.py index f2c9a49b..31a1bf9c 100644 --- a/haptools/sim_genotype.py +++ b/haptools/sim_genotype.py @@ -776,7 +776,7 @@ def start_segment(start, chrom, segments): return len(segments) -def validate_params(model, mapdir, chroms, popsize, invcf, sample_info, region, only_bp=False): +def validate_params(model, mapdir, chroms, popsize, invcf, sample_info, region=None, only_bp=False): # validate model file mfile = open(model, 'r') num_samples, *pops = mfile.readline().strip().split() diff --git a/tests/test_outputvcf.py b/tests/test_outputvcf.py index b53c134a..8506a687 100644 --- a/tests/test_outputvcf.py +++ b/tests/test_outputvcf.py @@ -177,14 +177,14 @@ def test_model_files(): faulty_model = DATADIR.joinpath("dat_files/faulty_model_sample_number_to_int.dat") with pytest.raises(Exception) as e: validate_params( - faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file, None + faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file ) assert (str(e.value)) == "Can't convert samples number to an integer." faulty_model = DATADIR.joinpath("dat_files/faulty_model_num_pops.dat") with pytest.raises(Exception) as e: validate_params( - faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file, None + faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file ) assert ( str(e.value) @@ -193,7 +193,7 @@ def test_model_files(): faulty_model = DATADIR.joinpath("dat_files/faulty_model_less_than_1.dat") with pytest.raises(Exception) as e: validate_params( - faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file, None + faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file ) assert (str(e.value)) == "Number of samples is less than 1." @@ -201,21 +201,21 @@ def test_model_files(): faulty_model = DATADIR.joinpath("dat_files/faulty_model_pop_fracs.dat") with pytest.raises(Exception) as e: validate_params( - faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file, None + faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file ) assert (str(e.value)) == "Can't convert generation to integer." faulty_model = DATADIR.joinpath("dat_files/faulty_model_frac.dat") with pytest.raises(Exception) as e: validate_params( - faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file, None + faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file ) assert (str(e.value)) == "Can't convert population fractions to type float." faulty_model = DATADIR.joinpath("dat_files/faulty_model_pop_header.dat") with pytest.raises(Exception) as e: validate_params( - faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file, None + faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file ) assert ( (str(e.value)) @@ -226,7 +226,7 @@ def test_model_files(): faulty_model = DATADIR.joinpath("dat_files/faulty_model_cur_gen.dat") with pytest.raises(Exception) as e: validate_params( - faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file, None + faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file ) assert ( (str(e.value)) @@ -238,58 +238,53 @@ def test_model_files(): faulty_model = DATADIR.joinpath("dat_files/faulty_model_sum_frac.dat") with pytest.raises(Exception) as e: validate_params( - faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file, None + faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file ) - assert ( - str(e.value) - ) == "Population fractions for generation 1 do not sum to 1." + assert (str(e.value)) == "Population fractions for generation 1 do not sum to 1." # Validate mapdir exceptions model = DATADIR.joinpath("dat_files/correct_model.dat") faulty_mapdir = DATADIR.joinpath("maps") with pytest.raises(Exception) as e: validate_params( - model, faulty_mapdir, chroms, popsize, vcf_file, sampleinfo_file, None + model, faulty_mapdir, chroms, popsize, vcf_file, sampleinfo_file ) assert (str(e.value)) == "Map directory given is not a valid path." with pytest.raises(Exception) as e: - validate_params( - model, mapdir, chroms, popsize, vcf_file, sampleinfo_file, None - ) + validate_params(model, mapdir, chroms, popsize, vcf_file, sampleinfo_file) assert (str(e.value)) == f"Chromosome {chroms[0]} in the list given is not valid." chroms = ["1"] faulty_mapdir = DATADIR.joinpath("test_map") with pytest.raises(Exception) as e: validate_params( - model, faulty_mapdir, chroms, popsize, vcf_file, sampleinfo_file, None + model, faulty_mapdir, chroms, popsize, vcf_file, sampleinfo_file ) assert (str(e.value)) == "Could not parse map directory files." faulty_mapdir = DATADIR.joinpath("test_map_2") with pytest.raises(Exception) as e: validate_params( - model, faulty_mapdir, chroms, popsize, vcf_file, sampleinfo_file, None + model, faulty_mapdir, chroms, popsize, vcf_file, sampleinfo_file ) assert ( (str(e.value)) - == "No valid coordinate files found. Must contain chr{1-22,X} in the file" - " name." + == "No valid coordinate files found. Must contain chr{1-22,X} in the file name." ) # validate popsize exceptions faulty_popsize = "NA" with pytest.raises(Exception) as e: validate_params( - model, mapdir, chroms, faulty_popsize, vcf_file, sampleinfo_file, None + model, mapdir, chroms, faulty_popsize, vcf_file, sampleinfo_file ) assert (str(e.value)) == "Popsize is not an Integer." faulty_popsize = 0 with pytest.raises(Exception) as e: validate_params( - model, mapdir, chroms, faulty_popsize, vcf_file, sampleinfo_file, None + model, mapdir, chroms, faulty_popsize, vcf_file, sampleinfo_file ) assert (str(e.value)) == "Popsize must be greater than 0." @@ -297,7 +292,7 @@ def test_model_files(): faulty_vcf_file = DATADIR.joinpath("faulty_vcf.vcf") with pytest.raises(Exception) as e: validate_params( - model, mapdir, chroms, popsize, faulty_vcf_file, sampleinfo_file, None + model, mapdir, chroms, popsize, faulty_vcf_file, sampleinfo_file ) assert (str(e.value)) == "Unable to collect vcf samples." @@ -305,7 +300,7 @@ def test_model_files(): faulty_sampleinfo_file = DATADIR.joinpath("faulty_info.tab") with pytest.raises(Exception) as e: validate_params( - model, mapdir, chroms, popsize, vcf_file, faulty_sampleinfo_file, None + model, mapdir, chroms, popsize, vcf_file, faulty_sampleinfo_file ) assert (str(e.value)) == "Sample HG00022 in sampleinfo file is not present in the vcf file." @@ -315,7 +310,7 @@ def test_model_files(): with pytest.raises(Exception) as e: for model_pop in pops: validate_params( - faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file, None + faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file ) assert ( (str(e.value)) From ef3633484ebeb4113abfc6da2ac7ab7117571c5d Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 19 Oct 2022 12:15:31 -0700 Subject: [PATCH 08/12] refmt all test files and docs files with black --- docs/conf.py | 2 +- haptools/__main__.py | 7 +++---- tests/test_karyogram.py | 41 +++++++++++++++++++------------------ tests/test_outputvcf.py | 45 +++++++++++++++++++++++++++-------------- 4 files changed, 55 insertions(+), 40 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 986c7196..da840922 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -49,7 +49,7 @@ autosummary_generate = True numpydoc_show_class_members = False # allow for both rst and md syntax -source_suffix = ['.rst'] +source_suffix = [".rst"] # -- Options for HTML output ------------------------------------------------- diff --git a/haptools/__main__.py b/haptools/__main__.py index 58cb817e..5ec91bf1 100755 --- a/haptools/__main__.py +++ b/haptools/__main__.py @@ -764,7 +764,7 @@ def ld( "-o", "--output", type=click.Path(path_type=Path), - default= None, + default=None, show_default="input file", help="A .hap file containing sorted and indexed haplotypes and variants", ) @@ -778,11 +778,10 @@ def ld( ) def index( haplotypes: Path, - sort: bool= False, + sort: bool = False, output: Path = None, - verbosity: str = 'CRITICAL', + verbosity: str = "CRITICAL", ): - """ Takes in an unsorted .hap file and outputs it as a .gz and a .tbi file """ diff --git a/tests/test_karyogram.py b/tests/test_karyogram.py index 4e908933..07fae56b 100644 --- a/tests/test_karyogram.py +++ b/tests/test_karyogram.py @@ -5,26 +5,27 @@ DATADIR = Path(__file__).parent.joinpath("data") + def test_GetHaplotypeBlocks(): - test_file = DATADIR.joinpath("test.bp") - sample_blocks = GetHaplotypeBlocks(test_file, "Sample_1") - assert(sample_blocks[0][0]["pop"]=="YRI") - assert(sample_blocks[0][0]["chrom"]==1) - assert(sample_blocks[0][0]["start"]==0.0001) - assert(sample_blocks[0][0]["end"]==168.003442) + test_file = DATADIR.joinpath("test.bp") + sample_blocks = GetHaplotypeBlocks(test_file, "Sample_1") + assert sample_blocks[0][0]["pop"] == "YRI" + assert sample_blocks[0][0]["chrom"] == 1 + assert sample_blocks[0][0]["start"] == 0.0001 + assert sample_blocks[0][0]["end"] == 168.003442 + + assert sample_blocks[1][0]["pop"] == "YRI" + assert sample_blocks[1][0]["chrom"] == 1 + assert sample_blocks[1][0]["start"] == 0.0001 + assert sample_blocks[1][0]["end"] == 87.107755 - assert(sample_blocks[1][0]["pop"]=="YRI") - assert(sample_blocks[1][0]["chrom"]==1) - assert(sample_blocks[1][0]["start"]==0.0001) - assert(sample_blocks[1][0]["end"]==87.107755) - - sample_blocks = GetHaplotypeBlocks(test_file, "Sample_2") - assert(sample_blocks[0][-1]["pop"]=="YRI") - assert(sample_blocks[0][-1]["chrom"]==2) - assert(sample_blocks[0][-1]["start"]==180.837755+0.0001) - assert(sample_blocks[0][-1]["end"]==244.341689) + sample_blocks = GetHaplotypeBlocks(test_file, "Sample_2") + assert sample_blocks[0][-1]["pop"] == "YRI" + assert sample_blocks[0][-1]["chrom"] == 2 + assert sample_blocks[0][-1]["start"] == 180.837755 + 0.0001 + assert sample_blocks[0][-1]["end"] == 244.341689 - assert(sample_blocks[1][0]["pop"]=="YRI") - assert(sample_blocks[1][0]["chrom"]==1) - assert(sample_blocks[1][0]["start"]==0.0001) - assert(sample_blocks[1][0]["end"]==85.107755) \ No newline at end of file + assert sample_blocks[1][0]["pop"] == "YRI" + assert sample_blocks[1][0]["chrom"] == 1 + assert sample_blocks[1][0]["start"] == 0.0001 + assert sample_blocks[1][0]["end"] == 85.107755 diff --git a/tests/test_outputvcf.py b/tests/test_outputvcf.py index 8506a687..d0bf43ef 100644 --- a/tests/test_outputvcf.py +++ b/tests/test_outputvcf.py @@ -17,6 +17,7 @@ def _get_files(): out_prefix = DATADIR.joinpath("outvcf_out") return bkp_file, model_file, vcf_file, sampleinfo_file, out_prefix + def _get_breakpoints(bkp_file): # Collect breakpoints to proper format used in output_vcf function breakpoints = [] @@ -44,17 +45,20 @@ def _get_breakpoints(bkp_file): breakpoints = np.array(breakpoints, dtype=object) return breakpoints + def test_alt_chrom_name(): # Test when the ref VCF has chr{X|\d+} form # read in all files and breakpoints bkp_file, model_file, vcf_file, sampleinfo_file, out_prefix = _get_files() bkp_file = DATADIR.joinpath("outvcf_test_chr.bp") vcf_file = DATADIR.joinpath("outvcf_test_chr.vcf") - chroms = ['1', '2', 'X'] + chroms = ["1", "2", "X"] bkps = _get_breakpoints(bkp_file) # generate output vcf file - output_vcf(bkps, chroms, model_file, vcf_file, sampleinfo_file, None, str(out_prefix)) + output_vcf( + bkps, chroms, model_file, vcf_file, sampleinfo_file, None, str(out_prefix) + ) # read in vcf file vcf = VCF(str(out_prefix) + ".vcf") @@ -90,14 +94,17 @@ def test_alt_chrom_name(): os.remove(str(out_prefix) + ".vcf") return + def test_vcf_output(): # read in all files and breakpoints bkp_file, model_file, vcf_file, sampleinfo_file, out_prefix = _get_files() - chroms = ['1', '2'] + chroms = ["1", "2"] bkps = _get_breakpoints(bkp_file) # generate output vcf file - output_vcf(bkps, chroms, model_file, vcf_file, sampleinfo_file, None, str(out_prefix)) + output_vcf( + bkps, chroms, model_file, vcf_file, sampleinfo_file, None, str(out_prefix) + ) # Expected output for each variant (note these are phased so order matters) # CHROM POS FORMAT Sample1 Sample2 @@ -132,31 +139,37 @@ def test_vcf_output(): os.remove(str(out_prefix) + ".vcf") return + def test_region_bkp(): modelfile = DATADIR.joinpath("outvcf_gen.dat") popsize = 100000 - region = {'chr':'22','start':16000, 'end':18000} + region = {"chr": "22", "start": 16000, "end": 18000} coords_dir = DATADIR.joinpath("map") chroms = ["22"] seed = 100 - num_samples, all_samples = simulate_gt(modelfile, coords_dir, chroms, region, popsize, seed) - + num_samples, all_samples = simulate_gt( + modelfile, coords_dir, chroms, region, popsize, seed + ) + # Make sure lowest bkp listed is 16111 and greatest is 18674 for sample in all_samples: for coord in sample: assert 16111 <= coord.get_end_coord() <= 18674 return + def test_region_vcf(): - region = {'chr':'2', 'start':1, 'end':10122} + region = {"chr": "2", "start": 1, "end": 10122} bkp_file, model_file, vcf_file, sampleinfo_file, out_prefix = _get_files() bkps = _get_breakpoints(bkp_file) - chroms = ['2'] - output_vcf(bkps, chroms, model_file, vcf_file, sampleinfo_file, region, str(out_prefix)) + chroms = ["2"] + output_vcf( + bkps, chroms, model_file, vcf_file, sampleinfo_file, region, str(out_prefix) + ) vcf = VCF(str(out_prefix) + ".vcf") for var in vcf: - assert var.POS == 10122 and var.CHROM == '2' + assert var.POS == 10122 and var.CHROM == "2" assert var.genotypes[0] == [1, 0, True] assert var.format("POP")[0] == "YRI,CEU" assert var.genotypes[1] == [0, 1, True] @@ -165,6 +178,7 @@ def test_region_vcf(): os.remove(str(out_prefix) + ".vcf") return + # model_file exception validation def test_model_files(): mapdir = DATADIR.joinpath("map") @@ -302,7 +316,8 @@ def test_model_files(): validate_params( model, mapdir, chroms, popsize, vcf_file, faulty_sampleinfo_file ) - assert (str(e.value)) == "Sample HG00022 in sampleinfo file is not present in the vcf file." + msg = "Sample HG00022 in sampleinfo file is not present in the vcf file." + assert (str(e.value)) == msg faulty_model = DATADIR.joinpath("dat_files/faulty_model_sample_info.dat") mfile = open(faulty_model, "r") @@ -312,8 +327,8 @@ def test_model_files(): validate_params( faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file ) - assert ( - (str(e.value)) - == f"Population {model_pop} in model file is not present in the sample info" + msg = ( + f"Population {model_pop} in model file is not present in the sample info" " file." ) + assert str(e.value) == msg From 8ccb127216f0fec7564d31633f2024320c459528 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 19 Oct 2022 12:17:56 -0700 Subject: [PATCH 09/12] exclude unformatted files from black --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 80d7e468..62ac2073 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,6 +66,7 @@ haptools = 'haptools.__main__:main' [tool.black] line-length = 88 preview = true +extend-exclude = "haptools/(sim_genotype|karyogram).py" [tool.pytest.ini_options] log_cli_level = "DEBUG" From 54abd6b300a9c380e85d51d83cd278c1f37d6b94 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 19 Oct 2022 14:15:11 -0700 Subject: [PATCH 10/12] describe check_maf() in data API docs --- docs/api/data.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docs/api/data.rst b/docs/api/data.rst index 604fdeef..2075092f 100644 --- a/docs/api/data.rst +++ b/docs/api/data.rst @@ -129,6 +129,17 @@ There are several quality-control checks performed by default (in the ``load()`` 2. ``check_biallelic()`` - raises an error if any variants have more than one ALT allele 3. ``check_phase()`` - raises an error if any genotypes are unphased +Additionally, you can use the ``check_maf()`` method after checking for missing genotypes and confirming that all variants are biallelic. + +.. code-block:: python + + genotypes = data.Genotypes('tests/data/simple.vcf.gz') + genotypes.read() + genotypes.check_missing() + genotypes.check_biallelic() + genotypes.check_maf(threshold=0.05) + genotypes.check_phase() + Subsetting ********** You can index into a loaded :class:`Genotypes` instance using the ``subset()`` function. This works similiar to numpy indexing with the added benefit that you can specify a subset of variants and/or samples by their IDs instead of just their indices. From 111df55a952c5f19e7cc5f8e2372ebc02402b55b Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 24 Oct 2022 16:35:42 -0700 Subject: [PATCH 11/12] fix GenotypesPLINK typo in haptools data API example --- docs/api/examples.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api/examples.rst b/docs/api/examples.rst index 8592db4a..340b8a41 100644 --- a/docs/api/examples.rst +++ b/docs/api/examples.rst @@ -88,7 +88,7 @@ You can easily use the :ref:`data API ` and the :ref:`simphenotype API gt.read(variants=variants) # the advantage of using a PGEN file is that you can use read_variants() to load # the variants quickly w/o having to load the genotypes, too - gt = data.GenotypesPGEN("tests/data/apoe.pgen") + gt = data.GenotypesPLINK("tests/data/apoe.pgen") gt.read_variants(variants=variants) # initialize an empty haplotype file From 0b89c93a6a8d2ebfa2ea6bbc5432139d02750ee6 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 24 Oct 2022 16:36:17 -0700 Subject: [PATCH 12/12] fix another typo - yikes --- docs/api/examples.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api/examples.rst b/docs/api/examples.rst index 340b8a41..c6ae9dcf 100644 --- a/docs/api/examples.rst +++ b/docs/api/examples.rst @@ -25,7 +25,7 @@ As an example, let's say we would like to convert the following ``.blocks.det`` # load the genotypes file # you can use either a VCF or PGEN file gt = data.GenotypesRefAlt.load("input.vcf.gz") - gt = data.GenotypesPGEN.load("input.pgen") + gt = data.GenotypesPLINK.load("input.pgen") # load the haplotypes hp = data.Haplotypes("output.hap")