From 94e1f61f90f3751405b176e8725539e5e0b73ae8 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Tue, 18 Oct 2022 14:38:19 -0700
Subject: [PATCH 01/12] remove aaf field from Genotypes objects see
 https://github.com/CAST-genomics/haptools/issues/19#issuecomment-1126651795

---
 docs/api/data.rst           |  4 ++--
 haptools/data/genotypes.py  | 44 +++----------------------------------
 haptools/data/haplotypes.py |  2 +-
 haptools/transform.py       |  2 +-
 tests/test_data.py          | 28 +++++------------------
 5 files changed, 12 insertions(+), 68 deletions(-)

diff --git a/docs/api/data.rst b/docs/api/data.rst
index de7faa5a..604fdeef 100644
--- a/docs/api/data.rst
+++ b/docs/api/data.rst
@@ -59,7 +59,7 @@ Properties
 **********
 The ``data`` property of a :class:`Genotypes` object is a numpy array representing the genotype matrix. Rows of the array are samples and columns are variants. Each entry in the matrix is a tuple of values -- one for each chromosome. Each value is an integer denoting the index of the allele (0 for REF, 1 for the first ALT allele, 2 for the next ALT allele, etc).
 
-There are two additional properties that contain variant and sample metadata. The ``variants`` property is a numpy structured array and the ``samples`` property is a simple tuple of sample IDs. The ``variants`` structured array has four named columns: "id", "chrom", "pos", and "aaf" (alternate allele frequency).
+There are two additional properties that contain variant and sample metadata. The ``variants`` property is a numpy structured array and the ``samples`` property is a simple tuple of sample IDs. The ``variants`` structured array has three named columns: "id" (variant ID), "chrom" (chromosome name), and "pos" (chromosomal position).
 
 Reading a file
 **************
@@ -143,7 +143,7 @@ By default, the ``subset()`` method returns a new :class:`Genotypes` instance. T
 
 GenotypesRefAlt
 +++++++++++++++
-The :class:`Genotypes` class can be easily *extended* (sub-classed) to load extra fields into the ``variants`` structured array. The :class:`GenotypesRefAlt` class is an example of this where I extended the :class:`Genotypes` class to add REF and ALT fields from the VCF to the columns of the structured array. So the ``variants`` array will have named columns: "id", "chrom", "pos", "aaf", "ref", and "alt".
+The :class:`Genotypes` class can be easily *extended* (sub-classed) to load extra fields into the ``variants`` structured array. The :class:`GenotypesRefAlt` class is an example of this where I extended the :class:`Genotypes` class to add REF and ALT fields from the VCF to the columns of the structured array. So the ``variants`` array will have named columns: "id", "chrom", "pos", "ref", and "alt".
 
 All of the other methods in the :class:`Genotypes` class are inherited, but the :class:`GenotypesRefAlt` class implements an additional method ``write()`` for dumping the contents of the class to the provided file.
 
diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
index fab66675..1c429853 100644
--- a/haptools/data/genotypes.py
+++ b/haptools/data/genotypes.py
@@ -32,7 +32,6 @@ class Genotypes(Data):
             1. ID
             2. CHROM
             3. POS
-            4. AAF: allele freq of alternate allele (or MAF if to_MAC() is called)
     log: Logger
         A logging instance for recording debug statements
     _prephased : bool
@@ -61,7 +60,6 @@ def __init__(self, fname: Path | str, log: Logger = None):
                 ("id", "U50"),
                 ("chrom", "U10"),
                 ("pos", np.uint32),
-                ("aaf", np.float64),
             ],
         )
         self._prephased = False
@@ -219,7 +217,7 @@ def _variant_arr(self, record: Variant):
             A row from the :py:attr:`~.Genotypes.variants` array
         """
         return np.array(
-            (record.ID, record.CHROM, record.POS, record.aaf),
+            (record.ID, record.CHROM, record.POS),
             dtype=self.variants.dtype,
         )
 
@@ -504,38 +502,6 @@ def check_phase(self):
         # remove the last dimension that contains the phase info
         self.data = self.data[:, :, :2]
 
-    def to_MAC(self):
-        """
-        Convert the ALT count GT matrix into a matrix of minor allele counts
-
-        This function modifies :py:attr:`~.Genotypes.data` in-place
-
-        It also changes the 'aaf' record in :py:attr:`~.Genotypes.variants` to 'maf'
-
-        Raises
-        ------
-        AssertionError
-            If the matrix has already been converted
-        """
-        if self.variants.dtype.names[3] == "maf":
-            self.log.warning(
-                "The matrix already counts instances of the minor allele rather than "
-                "the ALT allele."
-            )
-            return
-        need_conversion = self.variants["aaf"] > 0.5
-        # flip the count on the variants that have an alternate allele frequency
-        # above 0.5
-        self.data[:, need_conversion, :2] = ~self.data[:, need_conversion, :2]
-        # also encode an MAF instead of an AAF in self.variants
-        self.variants["aaf"][need_conversion] = (
-            1 - self.variants["aaf"][need_conversion]
-        )
-        # replace 'aaf' with 'maf' in the matrix
-        self.variants.dtype.names = [
-            (x, "maf")[x == "aaf"] for x in self.variants.dtype.names
-        ]
-
 
 class GenotypesRefAlt(Genotypes):
     """
@@ -556,9 +522,8 @@ class GenotypesRefAlt(Genotypes):
             1. ID
             2. CHROM
             3. POS
-            4. AAF: allele freq of alternate allele (or MAF if to_MAC() is called)
-            5. REF
-            6. ALT
+            4. REF
+            5. ALT
     log: Logger
         See documentation for :py:attr:`~.Genotypes.log`
     """
@@ -571,7 +536,6 @@ def __init__(self, fname: Path | str, log: Logger = None):
                 ("id", "U50"),
                 ("chrom", "U10"),
                 ("pos", np.uint32),
-                ("aaf", np.float64),
                 ("ref", "U100"),
                 ("alt", "U100"),
             ],
@@ -586,7 +550,6 @@ def _variant_arr(self, record: Variant):
                 record.ID,
                 record.CHROM,
                 record.POS,
-                record.aaf,
                 record.REF,
                 record.ALT[0],
             ),
@@ -795,7 +758,6 @@ def _variant_arr(
                 record[cid["ID"]],
                 record[cid["CHROM"]],
                 record[cid["POS"]],
-                0.5,
                 record[cid["REF"]],
                 record[cid["ALT"]],
             ),
diff --git a/haptools/data/haplotypes.py b/haptools/data/haplotypes.py
index a5fb8c97..3f247a4c 100644
--- a/haptools/data/haplotypes.py
+++ b/haptools/data/haplotypes.py
@@ -1080,7 +1080,7 @@ def transform(
             hap_gts = GenotypesRefAlt(fname=None, log=self.log)
         hap_gts.samples = gts.samples
         hap_gts.variants = np.array(
-            [(hap.id, hap.chrom, hap.start, 0, "A", "T") for hap in self.data.values()],
+            [(hap.id, hap.chrom, hap.start, "A", "T") for hap in self.data.values()],
             dtype=hap_gts.variants.dtype,
         )
         # build a fast data structure for querying the alleles in each haplotype:
diff --git a/haptools/transform.py b/haptools/transform.py
index a1b33cab..2f3ae2be 100644
--- a/haptools/transform.py
+++ b/haptools/transform.py
@@ -97,7 +97,7 @@ def transform(
             hap_gts = data.GenotypesRefAlt(fname=None, log=self.log)
         hap_gts.samples = gts.samples
         hap_gts.variants = np.array(
-            [(hap.id, hap.chrom, hap.start, 0, "A", "T") for hap in self.data.values()],
+            [(hap.id, hap.chrom, hap.start, "A", "T") for hap in self.data.values()],
             dtype=hap_gts.variants.dtype,
         )
         # build a fast data structure for querying the alleles in each haplotype:
diff --git a/tests/test_data.py b/tests/test_data.py
index 518b9026..3f0273e1 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -39,17 +39,12 @@ def _get_fake_genotypes(self):
         gts.data = self._get_expected_genotypes()
         gts.variants = np.array(
             [
-                ("1:10114:T:C", "1", 10114, 0),
-                ("1:10116:A:G", "1", 10116, 0.6),
-                ("1:10117:C:A", "1", 10117, 0),
-                ("1:10122:A:G", "1", 10122, 0),
-            ],
-            dtype=[
-                ("id", "U50"),
-                ("chrom", "U10"),
-                ("pos", np.uint32),
-                ("aaf", np.float64),
+                ("1:10114:T:C", "1", 10114),
+                ("1:10116:A:G", "1", 10116),
+                ("1:10117:C:A", "1", 10117),
+                ("1:10122:A:G", "1", 10122),
             ],
+            dtype=gts.variants.dtype,
         )
         gts.samples = ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101")
         gts.check_phase()
@@ -117,18 +112,6 @@ def test_load_genotypes(self, caplog):
         gts.check_phase()
         assert len(caplog.records) > 0 and caplog.records[0].levelname == "WARNING"
 
-        # convert the matrix of alt allele counts to a matrix of minor allele counts
-        assert gts.variants["aaf"][1] == 0.6
-        gts.to_MAC()
-        expected[:, 1, :] = ~expected[:, 1, :]
-        np.testing.assert_allclose(gts.data, expected)
-        assert gts.variants["maf"][1] == 0.4
-
-        # try to do the MAC conversion again - it should warn b/c we've already done it
-        caplog.clear()
-        gts.to_MAC()
-        assert len(caplog.records) > 0 and caplog.records[0].levelname == "WARNING"
-
     def test_load_genotypes_example(self):
         samples = (
             "HG00096",
@@ -632,7 +615,6 @@ def test_read_subset(self):
         haps.read(region="21:26928472-26941960")
         assert expected == haps.data
 
-
     def test_read_extras(self):
         # what do we expect to see from the simphenotype.hap file?
         expected = {

From 156b2edbd24abad62679fb988c7c18bd1d43ab92 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Tue, 18 Oct 2022 16:29:01 -0700
Subject: [PATCH 02/12] add method to check af

---
 haptools/data/genotypes.py | 45 ++++++++++++++++++++++++++++++++++++++
 tests/test_data.py         | 22 +++++++++++++++++++
 2 files changed, 67 insertions(+)

diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
index 1c429853..4a36255e 100644
--- a/haptools/data/genotypes.py
+++ b/haptools/data/genotypes.py
@@ -502,6 +502,51 @@ def check_phase(self):
         # remove the last dimension that contains the phase info
         self.data = self.data[:, :, :2]
 
+    def check_af(self, threshold: float = None, discard_also: bool = False):
+        """
+        Return the allele frequency of the REF allele
+
+        If a threshold is specified, raise a ValueError if the frequency exceeds it
+
+        Parameters
+        ----------
+        threshold: float, optional
+            If any variant has an *alternate* allele frequency rarer than the
+            provided threshold, raise a ValueError
+        discard_also : bool, optional
+            If True, discard any variants that would otherwise cause a ValueError
+
+            This parameter will be ignored if a threshold is not specified
+
+        Raises
+        ------
+        ValueError
+            If any variant exceeds the provided threshold allele frequency
+        """
+        num_strands = 2 * self.data.shape[0]
+        ref_af = 1 - self.data[:, :, :2].astype(np.bool_).sum(axis=(0, 2))/num_strands
+        if threshold is None:
+            return ref_af
+        rare_variants = (ref_af > (1 - threshold)) | (ref_af < threshold)
+        if np.any(rare_variants):
+            var_idx = np.nonzero(rare_variants)[0]
+            if discard_also:
+                original_num_variants = len(self.variants)
+                self.data = np.delete(self.data, var_idx, axis=1)
+                self.variants = np.delete(self.variants, var_idx)
+                self.log.info(
+                    "Ignoring missing genotypes from "
+                    f"{original_num_variants - len(self.variants)} samples"
+                )
+                self._var_idx = None
+            else:
+                raise ValueError(
+                    "Variant with ID {} at POS {}:{} has REF frequency {}".format(
+                        *tuple(self.variants[var_idx[0]])[:3], ref_af[var_idx[0]]
+                    )
+                )
+        return ref_af
+
 
 class GenotypesRefAlt(Genotypes):
     """
diff --git a/tests/test_data.py b/tests/test_data.py
index 3f0273e1..0d96d239 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -232,6 +232,28 @@ def test_subset_genotypes(self):
         np.testing.assert_allclose(gts_sub.data, expected_data)
         assert np.array_equal(gts_sub.variants, expected_variants)
 
+    def test_check_af(self):
+        gts = self._get_fake_genotypes()
+        expected_af = np.array([1, 0.4, 1, 1])
+
+        af = gts.check_af()
+        np.testing.assert_allclose(af, expected_af)
+
+        with pytest.raises(ValueError) as info:
+            gts.check_af(threshold=0.01)
+        assert (
+            str(info.value)
+            == "Variant with ID 1:10114:T:C at POS 1:10114 has REF frequency 1.0"
+        )
+
+        gts.check_af(threshold=0, discard_also=True)
+        assert len(gts.variants) == 4
+        assert gts.data.shape[1] == 4
+
+        gts.check_af(threshold=0.01, discard_also=True)
+        assert len(gts.variants) == 1
+        assert gts.data.shape[1] == 1
+
 
 class TestGenotypesPLINK:
     def _get_fake_genotypes_plink(self):

From c77bb1eee8c1b4c93abafd749908c378c01ecd64 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Wed, 19 Oct 2022 10:59:13 -0700
Subject: [PATCH 03/12] check MAF instead of non-REF AF

---
 haptools/data/genotypes.py | 39 +++++++++++++++++++++++---------------
 tests/test_data.py         | 20 +++++++++++--------
 2 files changed, 36 insertions(+), 23 deletions(-)

diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
index 4a36255e..fd18b114 100644
--- a/haptools/data/genotypes.py
+++ b/haptools/data/genotypes.py
@@ -502,17 +502,23 @@ def check_phase(self):
         # remove the last dimension that contains the phase info
         self.data = self.data[:, :, :2]
 
-    def check_af(self, threshold: float = None, discard_also: bool = False):
+    def check_maf(self, threshold: float = None, discard_also: bool = False):
         """
-        Return the allele frequency of the REF allele
+        Return the minor allele frequency of each variant
 
-        If a threshold is specified, raise a ValueError if the frequency exceeds it
+        Raise a ValueError if any variant's MAF doesn't satisfy the threshold, if
+        one is provided
+
+        .. note::
+            You should call :py:meth:`~.Genotypes.check_missing` and
+            :py:meth:`~.Genotypes.check_biallelic` before executing this method, for
+            best results. Otherwise, the frequencies may be computed incorrectly.
 
         Parameters
         ----------
         threshold: float, optional
-            If any variant has an *alternate* allele frequency rarer than the
-            provided threshold, raise a ValueError
+            If a variant has a minor allele frequency (MAF) rarer than this threshold,
+            raise a ValueError
         discard_also : bool, optional
             If True, discard any variants that would otherwise cause a ValueError
 
@@ -521,29 +527,32 @@ def check_af(self, threshold: float = None, discard_also: bool = False):
         Raises
         ------
         ValueError
-            If any variant exceeds the provided threshold allele frequency
+            If any variant does not meet the provided threshold minor allele frequency
         """
         num_strands = 2 * self.data.shape[0]
-        ref_af = 1 - self.data[:, :, :2].astype(np.bool_).sum(axis=(0, 2))/num_strands
+        # TODO: make this work for multi-allelic variants, too?
+        ref_af = self.data[:, :, :2].astype(np.bool_).sum(axis=(0, 2)) / num_strands
+        maf = np.array([ref_af, 1 - ref_af]).min(axis=0)
         if threshold is None:
-            return ref_af
-        rare_variants = (ref_af > (1 - threshold)) | (ref_af < threshold)
+            return maf
+        rare_variants = maf < threshold
         if np.any(rare_variants):
-            var_idx = np.nonzero(rare_variants)[0]
+            idx = np.nonzero(rare_variants)[0]
             if discard_also:
                 original_num_variants = len(self.variants)
-                self.data = np.delete(self.data, var_idx, axis=1)
-                self.variants = np.delete(self.variants, var_idx)
+                self.data = np.delete(self.data, idx, axis=1)
+                self.variants = np.delete(self.variants, idx)
                 self.log.info(
                     "Ignoring missing genotypes from "
                     f"{original_num_variants - len(self.variants)} samples"
                 )
                 self._var_idx = None
             else:
+                vals = tuple(self.variants[idx[0]])[:3] + (maf[idx[0]], threshold)
+                # raise error if the minor allele frequency of a variant does not meet
+                # the threshold
                 raise ValueError(
-                    "Variant with ID {} at POS {}:{} has REF frequency {}".format(
-                        *tuple(self.variants[var_idx[0]])[:3], ref_af[var_idx[0]]
-                    )
+                    "Variant with ID {} at POS {}:{} has MAF {} < {}".format(*vals)
                 )
         return ref_af
 
diff --git a/tests/test_data.py b/tests/test_data.py
index 0d96d239..f9ce47da 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -232,28 +232,32 @@ def test_subset_genotypes(self):
         np.testing.assert_allclose(gts_sub.data, expected_data)
         assert np.array_equal(gts_sub.variants, expected_variants)
 
-    def test_check_af(self):
+    def test_check_maf(self):
         gts = self._get_fake_genotypes()
-        expected_af = np.array([1, 0.4, 1, 1])
+        expected_maf = np.array([0, 0.4, 0, 0])
 
-        af = gts.check_af()
-        np.testing.assert_allclose(af, expected_af)
+        maf = gts.check_maf()
+        np.testing.assert_allclose(maf, expected_maf)
 
         with pytest.raises(ValueError) as info:
-            gts.check_af(threshold=0.01)
+            gts.check_maf(threshold=0.01)
         assert (
             str(info.value)
-            == "Variant with ID 1:10114:T:C at POS 1:10114 has REF frequency 1.0"
+            == "Variant with ID 1:10114:T:C at POS 1:10114 has MAF 0.0 < 0.01"
         )
 
-        gts.check_af(threshold=0, discard_also=True)
+        gts.check_maf(threshold=0, discard_also=True)
         assert len(gts.variants) == 4
         assert gts.data.shape[1] == 4
 
-        gts.check_af(threshold=0.01, discard_also=True)
+        gts.check_maf(threshold=0.01, discard_also=True)
         assert len(gts.variants) == 1
         assert gts.data.shape[1] == 1
 
+        gts.check_maf(threshold=0.5, discard_also=True)
+        assert len(gts.variants) == 0
+        assert gts.data.shape[1] == 0
+
 
 class TestGenotypesPLINK:
     def _get_fake_genotypes_plink(self):

From 2ba000b9f7b1295bd2579d856d9c20aacd2f4212 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Wed, 19 Oct 2022 11:28:13 -0700
Subject: [PATCH 04/12] add flag to warn instead of raise ValueError

---
 haptools/data/genotypes.py | 30 ++++++++++++++++++++++--------
 tests/test_data.py         | 22 ++++++++++++++--------
 2 files changed, 36 insertions(+), 16 deletions(-)

diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
index fd18b114..3dc94887 100644
--- a/haptools/data/genotypes.py
+++ b/haptools/data/genotypes.py
@@ -502,9 +502,14 @@ def check_phase(self):
         # remove the last dimension that contains the phase info
         self.data = self.data[:, :, :2]
 
-    def check_maf(self, threshold: float = None, discard_also: bool = False):
+    def check_maf(
+        self,
+        threshold: float = None,
+        discard_also: bool = False,
+        warn_only: bool = False,
+    ) -> npt.NDArray[np.float64]:
         """
-        Return the minor allele frequency of each variant
+        Check the minor allele frequency of each variant
 
         Raise a ValueError if any variant's MAF doesn't satisfy the threshold, if
         one is provided
@@ -523,11 +528,17 @@ def check_maf(self, threshold: float = None, discard_also: bool = False):
             If True, discard any variants that would otherwise cause a ValueError
 
             This parameter will be ignored if a threshold is not specified
+        warn_only: bool, optional
+            Just raise a warning instead of a ValueError
 
         Raises
         ------
         ValueError
             If any variant does not meet the provided threshold minor allele frequency
+
+        Returns
+        -------
+            The minor allele frequency of each variant
         """
         num_strands = 2 * self.data.shape[0]
         # TODO: make this work for multi-allelic variants, too?
@@ -542,6 +553,7 @@ def check_maf(self, threshold: float = None, discard_also: bool = False):
                 original_num_variants = len(self.variants)
                 self.data = np.delete(self.data, idx, axis=1)
                 self.variants = np.delete(self.variants, idx)
+                maf = np.delete(maf, idx)
                 self.log.info(
                     "Ignoring missing genotypes from "
                     f"{original_num_variants - len(self.variants)} samples"
@@ -549,12 +561,14 @@ def check_maf(self, threshold: float = None, discard_also: bool = False):
                 self._var_idx = None
             else:
                 vals = tuple(self.variants[idx[0]])[:3] + (maf[idx[0]], threshold)
-                # raise error if the minor allele frequency of a variant does not meet
-                # the threshold
-                raise ValueError(
-                    "Variant with ID {} at POS {}:{} has MAF {} < {}".format(*vals)
-                )
-        return ref_af
+                msg = "Variant with ID {} at POS {}:{} has MAF {} < {}".format(*vals)
+                if warn_only:
+                    self.log.warning(msg)
+                else:
+                    # raise error if the minor allele frequency of a variant does not
+                    # meet the threshold
+                    raise ValueError(msg)
+        return maf
 
 
 class GenotypesRefAlt(Genotypes):
diff --git a/tests/test_data.py b/tests/test_data.py
index f9ce47da..f88444fc 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -232,29 +232,35 @@ def test_subset_genotypes(self):
         np.testing.assert_allclose(gts_sub.data, expected_data)
         assert np.array_equal(gts_sub.variants, expected_variants)
 
-    def test_check_maf(self):
+    def test_check_maf(self, caplog):
         gts = self._get_fake_genotypes()
         expected_maf = np.array([0, 0.4, 0, 0])
 
         maf = gts.check_maf()
         np.testing.assert_allclose(maf, expected_maf)
 
+        msg = "Variant with ID 1:10114:T:C at POS 1:10114 has MAF 0.0 < 0.01"
         with pytest.raises(ValueError) as info:
             gts.check_maf(threshold=0.01)
-        assert (
-            str(info.value)
-            == "Variant with ID 1:10114:T:C at POS 1:10114 has MAF 0.0 < 0.01"
-        )
+        assert str(info.value) == msg
 
-        gts.check_maf(threshold=0, discard_also=True)
+        # test just the warning system
+        caplog.clear()
+        maf = gts.check_maf(threshold=0.01, warn_only=True)
+        assert len(caplog.records) > 0 and caplog.records[0].levelname == "WARNING"
+
+        maf = gts.check_maf(threshold=0, discard_also=True)
+        np.testing.assert_allclose(maf, expected_maf)
         assert len(gts.variants) == 4
         assert gts.data.shape[1] == 4
 
-        gts.check_maf(threshold=0.01, discard_also=True)
+        maf = gts.check_maf(threshold=0.01, discard_also=True)
+        np.testing.assert_allclose(maf, expected_maf[1])
         assert len(gts.variants) == 1
         assert gts.data.shape[1] == 1
 
-        gts.check_maf(threshold=0.5, discard_also=True)
+        maf = gts.check_maf(threshold=0.5, discard_also=True)
+        np.testing.assert_allclose(maf, expected_maf[:0])
         assert len(gts.variants) == 0
         assert gts.data.shape[1] == 0
 

From 87df5c8311aa71a9279aa57c5b39af4cca46d91d Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Wed, 19 Oct 2022 11:28:57 -0700
Subject: [PATCH 05/12] fix auto-generated docs

---
 haptools/data/breakpoints.py | 2 +-
 haptools/data/genotypes.py   | 6 +++---
 haptools/index.py            | 2 +-
 haptools/ld.py               | 2 +-
 haptools/transform.py        | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/haptools/data/breakpoints.py b/haptools/data/breakpoints.py
index 4497ce99..35660d55 100644
--- a/haptools/data/breakpoints.py
+++ b/haptools/data/breakpoints.py
@@ -269,7 +269,7 @@ def population_array(
             A subset of samples to include in the output, ordered by their given order
 
         Returns
-        ------
+        -------
         npt.NDArray
             An array of shape: samples x variants x 2
 
diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
index 3dc94887..a48fe002 100644
--- a/haptools/data/genotypes.py
+++ b/haptools/data/genotypes.py
@@ -297,15 +297,15 @@ def index(self, samples: bool = True, variants: bool = True):
         Call this function once to improve the amortized time-complexity of look-ups of
         samples and variants by their ID. This is useful if you intend to later subset
         by a set of samples or variant IDs.
-        The time complexity of this function should be roughly O(n+m) if both
-        parameters are True. Otherwise, it will be either O(n) or O(m).
+        The time complexity of this function should be roughly O(n+p) if both
+        parameters are True. Otherwise, it will be either O(n) or O(p).
 
         Parameters
         ----------
         samples: bool, optional
             Whether to index the samples for fast loop-up. Adds complexity O(n).
         variants: bool, optional
-            Whether to index the variants for fast look-up. Adds complexity O(m).
+            Whether to index the variants for fast look-up. Adds complexity O(p).
 
         Raises
         ------
diff --git a/haptools/index.py b/haptools/index.py
index 6e7f6a4e..5b2eaed7 100644
--- a/haptools/index.py
+++ b/haptools/index.py
@@ -33,7 +33,7 @@ def index_haps(
     haplotypes: Path,
     sort: bool = False,
     output: Path = None,
-    log: Logger = None,
+    log: logging.Logger = None,
 ):
     """
     Takes in an unsorted .hap file and outputs it as a .gz and a .tbi file
diff --git a/haptools/ld.py b/haptools/ld.py
index de175205..27ae2474 100644
--- a/haptools/ld.py
+++ b/haptools/ld.py
@@ -57,7 +57,7 @@ def calc_ld(
     discard_missing: bool = False,
     from_gts: bool = False,
     output: Path = Path("/dev/stdout"),
-    log: Logger = None,
+    log: logging.Logger = None,
 ):
     """
     Creates a VCF composed of haplotypes
diff --git a/haptools/transform.py b/haptools/transform.py
index 2f3ae2be..c2c926dd 100644
--- a/haptools/transform.py
+++ b/haptools/transform.py
@@ -510,7 +510,7 @@ def transform_haps(
     discard_missing: bool = False,
     ancestry: bool = False,
     output: Path = Path("-"),
-    log: Logger = None,
+    log: logging.Logger = None,
 ):
     """
     Creates a VCF composed of haplotypes

From 7f7867a4d5186176b99f36ec7ec26959af5ce95c Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Wed, 19 Oct 2022 11:32:51 -0700
Subject: [PATCH 06/12] refmt admix_storage with black

---
 haptools/admix_storage.py | 32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/haptools/admix_storage.py b/haptools/admix_storage.py
index 3aa306e0..912ac95c 100644
--- a/haptools/admix_storage.py
+++ b/haptools/admix_storage.py
@@ -11,7 +11,7 @@ def __repr__(self):
 
     def __str__(self):
         return f"(Chrom {self.chrom}, {self.cm_map_pos} cM, {self.bp_map_pos} bp)"
-    
+
     def get_chrom(self):
         return self.chrom
 
@@ -30,12 +30,18 @@ class HaplotypeSegment:
     def __init__(self, pop_num, chrom, end_coord, end_pos):
         """
         Note the beginning of the segment is inferred based on previous
-        segments stored throughout the simulation process. 
-        Arguments
-            pop_num - population label
-            chrom - chromosome the haplotype segment lies on.
-            end_coord - Ending coordinate in bp of the haplotype segment
-            end_pos - Ending coordinate in centimorgans of the hap segment
+        segments stored throughout the simulation process.
+
+        Parameters
+        ----------
+            pop_num
+                population label
+            chrom
+                chromosome the haplotype segment lies on.
+            end_coord
+                Ending coordinate in bp of the haplotype segment
+            end_pos
+                Ending coordinate in centimorgans of the hap segment
         """
         self.pop_num = pop_num
         self.chrom = chrom
@@ -43,12 +49,16 @@ def __init__(self, pop_num, chrom, end_coord, end_pos):
         self.end_pos = end_pos
 
     def __repr__(self):
-        return f"(Population {self.pop_num}, chrom {self.chrom}, " + \
-               f"end_coord {self.end_coord}, end_pos {self.end_pos})"
+        return (
+            f"(Population {self.pop_num}, chrom {self.chrom}, "
+            + f"end_coord {self.end_coord}, end_pos {self.end_pos})"
+        )
 
     def __str__(self):
-        return f"Population {self.pop_num}, chrom {self.chrom}, " + \
-               f"end_coord {self.end_coord}, end_pos {self.end_pos}"
+        return (
+            f"Population {self.pop_num}, chrom {self.chrom}, "
+            + f"end_coord {self.end_coord}, end_pos {self.end_pos}"
+        )
 
     def get_end_pos(self):
         return self.end_pos

From e88c3295728348e03989421f41c0147c3c0d7050 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Wed, 19 Oct 2022 12:14:27 -0700
Subject: [PATCH 07/12] make region in validate_params() an optional arg

---
 haptools/sim_genotype.py |  2 +-
 tests/test_outputvcf.py  | 43 ++++++++++++++++++----------------------
 2 files changed, 20 insertions(+), 25 deletions(-)

diff --git a/haptools/sim_genotype.py b/haptools/sim_genotype.py
index f2c9a49b..31a1bf9c 100644
--- a/haptools/sim_genotype.py
+++ b/haptools/sim_genotype.py
@@ -776,7 +776,7 @@ def start_segment(start, chrom, segments):
 
     return len(segments)
 
-def validate_params(model, mapdir, chroms, popsize, invcf, sample_info, region, only_bp=False):
+def validate_params(model, mapdir, chroms, popsize, invcf, sample_info, region=None, only_bp=False):
     # validate model file
     mfile = open(model, 'r')
     num_samples, *pops = mfile.readline().strip().split()
diff --git a/tests/test_outputvcf.py b/tests/test_outputvcf.py
index b53c134a..8506a687 100644
--- a/tests/test_outputvcf.py
+++ b/tests/test_outputvcf.py
@@ -177,14 +177,14 @@ def test_model_files():
     faulty_model = DATADIR.joinpath("dat_files/faulty_model_sample_number_to_int.dat")
     with pytest.raises(Exception) as e:
         validate_params(
-            faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file, None
+            faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file
         )
     assert (str(e.value)) == "Can't convert samples number to an integer."
 
     faulty_model = DATADIR.joinpath("dat_files/faulty_model_num_pops.dat")
     with pytest.raises(Exception) as e:
         validate_params(
-            faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file, None
+            faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file
         )
     assert (
         str(e.value)
@@ -193,7 +193,7 @@ def test_model_files():
     faulty_model = DATADIR.joinpath("dat_files/faulty_model_less_than_1.dat")
     with pytest.raises(Exception) as e:
         validate_params(
-            faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file, None
+            faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file
         )
     assert (str(e.value)) == "Number of samples is less than 1."
 
@@ -201,21 +201,21 @@ def test_model_files():
     faulty_model = DATADIR.joinpath("dat_files/faulty_model_pop_fracs.dat")
     with pytest.raises(Exception) as e:
         validate_params(
-            faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file, None
+            faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file
         )
     assert (str(e.value)) == "Can't convert generation to integer."
 
     faulty_model = DATADIR.joinpath("dat_files/faulty_model_frac.dat")
     with pytest.raises(Exception) as e:
         validate_params(
-            faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file, None
+            faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file
         )
     assert (str(e.value)) == "Can't convert population fractions to type float."
 
     faulty_model = DATADIR.joinpath("dat_files/faulty_model_pop_header.dat")
     with pytest.raises(Exception) as e:
         validate_params(
-            faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file, None
+            faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file
         )
     assert (
         (str(e.value))
@@ -226,7 +226,7 @@ def test_model_files():
     faulty_model = DATADIR.joinpath("dat_files/faulty_model_cur_gen.dat")
     with pytest.raises(Exception) as e:
         validate_params(
-            faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file, None
+            faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file
         )
     assert (
         (str(e.value))
@@ -238,58 +238,53 @@ def test_model_files():
     faulty_model = DATADIR.joinpath("dat_files/faulty_model_sum_frac.dat")
     with pytest.raises(Exception) as e:
         validate_params(
-            faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file, None
+            faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file
         )
-    assert (
-        str(e.value)
-    ) == "Population fractions for generation 1 do not sum to 1."
+    assert (str(e.value)) == "Population fractions for generation 1 do not sum to 1."
 
     # Validate mapdir exceptions
     model = DATADIR.joinpath("dat_files/correct_model.dat")
     faulty_mapdir = DATADIR.joinpath("maps")
     with pytest.raises(Exception) as e:
         validate_params(
-            model, faulty_mapdir, chroms, popsize, vcf_file, sampleinfo_file, None
+            model, faulty_mapdir, chroms, popsize, vcf_file, sampleinfo_file
         )
     assert (str(e.value)) == "Map directory given is not a valid path."
 
     with pytest.raises(Exception) as e:
-        validate_params(
-            model, mapdir, chroms, popsize, vcf_file, sampleinfo_file, None
-        )
+        validate_params(model, mapdir, chroms, popsize, vcf_file, sampleinfo_file)
     assert (str(e.value)) == f"Chromosome {chroms[0]} in the list given is not valid."
 
     chroms = ["1"]
     faulty_mapdir = DATADIR.joinpath("test_map")
     with pytest.raises(Exception) as e:
         validate_params(
-            model, faulty_mapdir, chroms, popsize, vcf_file, sampleinfo_file, None
+            model, faulty_mapdir, chroms, popsize, vcf_file, sampleinfo_file
         )
     assert (str(e.value)) == "Could not parse map directory files."
 
     faulty_mapdir = DATADIR.joinpath("test_map_2")
     with pytest.raises(Exception) as e:
         validate_params(
-            model, faulty_mapdir, chroms, popsize, vcf_file, sampleinfo_file, None
+            model, faulty_mapdir, chroms, popsize, vcf_file, sampleinfo_file
         )
     assert (
         (str(e.value))
-        == "No valid coordinate files found. Must contain chr{1-22,X} in the file"
-        " name."
+        == "No valid coordinate files found. Must contain chr{1-22,X} in the file name."
     )
 
     # validate popsize exceptions
     faulty_popsize = "NA"
     with pytest.raises(Exception) as e:
         validate_params(
-            model, mapdir, chroms, faulty_popsize, vcf_file, sampleinfo_file, None
+            model, mapdir, chroms, faulty_popsize, vcf_file, sampleinfo_file
         )
     assert (str(e.value)) == "Popsize is not an Integer."
 
     faulty_popsize = 0
     with pytest.raises(Exception) as e:
         validate_params(
-            model, mapdir, chroms, faulty_popsize, vcf_file, sampleinfo_file, None
+            model, mapdir, chroms, faulty_popsize, vcf_file, sampleinfo_file
         )
     assert (str(e.value)) == "Popsize must be greater than 0."
 
@@ -297,7 +292,7 @@ def test_model_files():
     faulty_vcf_file = DATADIR.joinpath("faulty_vcf.vcf")
     with pytest.raises(Exception) as e:
         validate_params(
-            model, mapdir, chroms, popsize, faulty_vcf_file, sampleinfo_file, None
+            model, mapdir, chroms, popsize, faulty_vcf_file, sampleinfo_file
         )
     assert (str(e.value)) == "Unable to collect vcf samples."
 
@@ -305,7 +300,7 @@ def test_model_files():
     faulty_sampleinfo_file = DATADIR.joinpath("faulty_info.tab")
     with pytest.raises(Exception) as e:
         validate_params(
-            model, mapdir, chroms, popsize, vcf_file, faulty_sampleinfo_file, None
+            model, mapdir, chroms, popsize, vcf_file, faulty_sampleinfo_file
         )
     assert (str(e.value)) == "Sample HG00022 in sampleinfo file is not present in the vcf file."
 
@@ -315,7 +310,7 @@ def test_model_files():
     with pytest.raises(Exception) as e:
         for model_pop in pops:
             validate_params(
-                faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file, None
+                faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file
             )
         assert (
             (str(e.value))

From ef3633484ebeb4113abfc6da2ac7ab7117571c5d Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Wed, 19 Oct 2022 12:15:31 -0700
Subject: [PATCH 08/12] refmt all test files and docs files with black

---
 docs/conf.py            |  2 +-
 haptools/__main__.py    |  7 +++----
 tests/test_karyogram.py | 41 +++++++++++++++++++------------------
 tests/test_outputvcf.py | 45 +++++++++++++++++++++++++++--------------
 4 files changed, 55 insertions(+), 40 deletions(-)

diff --git a/docs/conf.py b/docs/conf.py
index 986c7196..da840922 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -49,7 +49,7 @@
 autosummary_generate = True
 numpydoc_show_class_members = False
 # allow for both rst and md syntax
-source_suffix = ['.rst']
+source_suffix = [".rst"]
 
 # -- Options for HTML output -------------------------------------------------
 
diff --git a/haptools/__main__.py b/haptools/__main__.py
index 58cb817e..5ec91bf1 100755
--- a/haptools/__main__.py
+++ b/haptools/__main__.py
@@ -764,7 +764,7 @@ def ld(
     "-o",
     "--output",
     type=click.Path(path_type=Path),
-    default= None,
+    default=None,
     show_default="input file",
     help="A .hap file containing sorted and indexed haplotypes and variants",
 )
@@ -778,11 +778,10 @@ def ld(
 )
 def index(
     haplotypes: Path,
-    sort: bool= False,
+    sort: bool = False,
     output: Path = None,
-    verbosity: str = 'CRITICAL',
+    verbosity: str = "CRITICAL",
 ):
-
     """
     Takes in an unsorted .hap file and outputs it as a .gz and a .tbi file
     """
diff --git a/tests/test_karyogram.py b/tests/test_karyogram.py
index 4e908933..07fae56b 100644
--- a/tests/test_karyogram.py
+++ b/tests/test_karyogram.py
@@ -5,26 +5,27 @@
 
 DATADIR = Path(__file__).parent.joinpath("data")
 
+
 def test_GetHaplotypeBlocks():
-	test_file = DATADIR.joinpath("test.bp")
-	sample_blocks = GetHaplotypeBlocks(test_file, "Sample_1")
-	assert(sample_blocks[0][0]["pop"]=="YRI")
-	assert(sample_blocks[0][0]["chrom"]==1)
-	assert(sample_blocks[0][0]["start"]==0.0001)
-	assert(sample_blocks[0][0]["end"]==168.003442)
+    test_file = DATADIR.joinpath("test.bp")
+    sample_blocks = GetHaplotypeBlocks(test_file, "Sample_1")
+    assert sample_blocks[0][0]["pop"] == "YRI"
+    assert sample_blocks[0][0]["chrom"] == 1
+    assert sample_blocks[0][0]["start"] == 0.0001
+    assert sample_blocks[0][0]["end"] == 168.003442
+
+    assert sample_blocks[1][0]["pop"] == "YRI"
+    assert sample_blocks[1][0]["chrom"] == 1
+    assert sample_blocks[1][0]["start"] == 0.0001
+    assert sample_blocks[1][0]["end"] == 87.107755
 
-	assert(sample_blocks[1][0]["pop"]=="YRI")
-	assert(sample_blocks[1][0]["chrom"]==1)
-	assert(sample_blocks[1][0]["start"]==0.0001)
-	assert(sample_blocks[1][0]["end"]==87.107755)
-	
-	sample_blocks = GetHaplotypeBlocks(test_file, "Sample_2")
-	assert(sample_blocks[0][-1]["pop"]=="YRI")
-	assert(sample_blocks[0][-1]["chrom"]==2)
-	assert(sample_blocks[0][-1]["start"]==180.837755+0.0001)
-	assert(sample_blocks[0][-1]["end"]==244.341689)
+    sample_blocks = GetHaplotypeBlocks(test_file, "Sample_2")
+    assert sample_blocks[0][-1]["pop"] == "YRI"
+    assert sample_blocks[0][-1]["chrom"] == 2
+    assert sample_blocks[0][-1]["start"] == 180.837755 + 0.0001
+    assert sample_blocks[0][-1]["end"] == 244.341689
 
-	assert(sample_blocks[1][0]["pop"]=="YRI")
-	assert(sample_blocks[1][0]["chrom"]==1)
-	assert(sample_blocks[1][0]["start"]==0.0001)
-	assert(sample_blocks[1][0]["end"]==85.107755)
\ No newline at end of file
+    assert sample_blocks[1][0]["pop"] == "YRI"
+    assert sample_blocks[1][0]["chrom"] == 1
+    assert sample_blocks[1][0]["start"] == 0.0001
+    assert sample_blocks[1][0]["end"] == 85.107755
diff --git a/tests/test_outputvcf.py b/tests/test_outputvcf.py
index 8506a687..d0bf43ef 100644
--- a/tests/test_outputvcf.py
+++ b/tests/test_outputvcf.py
@@ -17,6 +17,7 @@ def _get_files():
     out_prefix = DATADIR.joinpath("outvcf_out")
     return bkp_file, model_file, vcf_file, sampleinfo_file, out_prefix
 
+
 def _get_breakpoints(bkp_file):
     # Collect breakpoints to proper format used in output_vcf function
     breakpoints = []
@@ -44,17 +45,20 @@ def _get_breakpoints(bkp_file):
     breakpoints = np.array(breakpoints, dtype=object)
     return breakpoints
 
+
 def test_alt_chrom_name():
     # Test when the ref VCF has chr{X|\d+} form
     # read in all files and breakpoints
     bkp_file, model_file, vcf_file, sampleinfo_file, out_prefix = _get_files()
     bkp_file = DATADIR.joinpath("outvcf_test_chr.bp")
     vcf_file = DATADIR.joinpath("outvcf_test_chr.vcf")
-    chroms = ['1', '2', 'X']
+    chroms = ["1", "2", "X"]
     bkps = _get_breakpoints(bkp_file)
 
     # generate output vcf file
-    output_vcf(bkps, chroms, model_file, vcf_file, sampleinfo_file, None, str(out_prefix))
+    output_vcf(
+        bkps, chroms, model_file, vcf_file, sampleinfo_file, None, str(out_prefix)
+    )
 
     # read in vcf file
     vcf = VCF(str(out_prefix) + ".vcf")
@@ -90,14 +94,17 @@ def test_alt_chrom_name():
     os.remove(str(out_prefix) + ".vcf")
     return
 
+
 def test_vcf_output():
     # read in all files and breakpoints
     bkp_file, model_file, vcf_file, sampleinfo_file, out_prefix = _get_files()
-    chroms = ['1', '2']
+    chroms = ["1", "2"]
     bkps = _get_breakpoints(bkp_file)
 
     # generate output vcf file
-    output_vcf(bkps, chroms, model_file, vcf_file, sampleinfo_file, None, str(out_prefix))
+    output_vcf(
+        bkps, chroms, model_file, vcf_file, sampleinfo_file, None, str(out_prefix)
+    )
 
     # Expected output for each variant (note these are phased so order matters)
     # CHROM	POS  FORMAT	 Sample1      Sample2
@@ -132,31 +139,37 @@ def test_vcf_output():
     os.remove(str(out_prefix) + ".vcf")
     return
 
+
 def test_region_bkp():
     modelfile = DATADIR.joinpath("outvcf_gen.dat")
     popsize = 100000
-    region = {'chr':'22','start':16000, 'end':18000}
+    region = {"chr": "22", "start": 16000, "end": 18000}
     coords_dir = DATADIR.joinpath("map")
     chroms = ["22"]
     seed = 100
-    num_samples, all_samples = simulate_gt(modelfile, coords_dir, chroms, region, popsize, seed)
-    
+    num_samples, all_samples = simulate_gt(
+        modelfile, coords_dir, chroms, region, popsize, seed
+    )
+
     # Make sure lowest bkp listed is 16111 and greatest is 18674
     for sample in all_samples:
         for coord in sample:
             assert 16111 <= coord.get_end_coord() <= 18674
     return
 
+
 def test_region_vcf():
-    region = {'chr':'2', 'start':1, 'end':10122}
+    region = {"chr": "2", "start": 1, "end": 10122}
     bkp_file, model_file, vcf_file, sampleinfo_file, out_prefix = _get_files()
     bkps = _get_breakpoints(bkp_file)
-    chroms = ['2']
-    output_vcf(bkps, chroms, model_file, vcf_file, sampleinfo_file, region, str(out_prefix))
+    chroms = ["2"]
+    output_vcf(
+        bkps, chroms, model_file, vcf_file, sampleinfo_file, region, str(out_prefix)
+    )
 
     vcf = VCF(str(out_prefix) + ".vcf")
     for var in vcf:
-        assert var.POS == 10122 and var.CHROM == '2'
+        assert var.POS == 10122 and var.CHROM == "2"
         assert var.genotypes[0] == [1, 0, True]
         assert var.format("POP")[0] == "YRI,CEU"
         assert var.genotypes[1] == [0, 1, True]
@@ -165,6 +178,7 @@ def test_region_vcf():
     os.remove(str(out_prefix) + ".vcf")
     return
 
+
 # model_file exception validation
 def test_model_files():
     mapdir = DATADIR.joinpath("map")
@@ -302,7 +316,8 @@ def test_model_files():
         validate_params(
             model, mapdir, chroms, popsize, vcf_file, faulty_sampleinfo_file
         )
-    assert (str(e.value)) == "Sample HG00022 in sampleinfo file is not present in the vcf file."
+    msg = "Sample HG00022 in sampleinfo file is not present in the vcf file."
+    assert (str(e.value)) == msg
 
     faulty_model = DATADIR.joinpath("dat_files/faulty_model_sample_info.dat")
     mfile = open(faulty_model, "r")
@@ -312,8 +327,8 @@ def test_model_files():
             validate_params(
                 faulty_model, mapdir, chroms, popsize, vcf_file, sampleinfo_file
             )
-        assert (
-            (str(e.value))
-            == f"Population {model_pop} in model file is not present in the sample info"
+        msg = (
+            f"Population {model_pop} in model file is not present in the sample info"
             " file."
         )
+        assert str(e.value) == msg

From 8ccb127216f0fec7564d31633f2024320c459528 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Wed, 19 Oct 2022 12:17:56 -0700
Subject: [PATCH 09/12] exclude unformatted files from black

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 80d7e468..62ac2073 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -66,6 +66,7 @@ haptools = 'haptools.__main__:main'
 [tool.black]
 line-length = 88
 preview = true
+extend-exclude = "haptools/(sim_genotype|karyogram).py"
 
 [tool.pytest.ini_options]
 log_cli_level = "DEBUG"

From 54abd6b300a9c380e85d51d83cd278c1f37d6b94 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Wed, 19 Oct 2022 14:15:11 -0700
Subject: [PATCH 10/12] describe check_maf() in data API docs

---
 docs/api/data.rst | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/docs/api/data.rst b/docs/api/data.rst
index 604fdeef..2075092f 100644
--- a/docs/api/data.rst
+++ b/docs/api/data.rst
@@ -129,6 +129,17 @@ There are several quality-control checks performed by default (in the ``load()``
 2. ``check_biallelic()`` - raises an error if any variants have more than one ALT allele
 3. ``check_phase()`` - raises an error if any genotypes are unphased
 
+Additionally, you can use the ``check_maf()`` method after checking for missing genotypes and confirming that all variants are biallelic.
+
+.. code-block:: python
+
+	genotypes = data.Genotypes('tests/data/simple.vcf.gz')
+	genotypes.read()
+	genotypes.check_missing()
+	genotypes.check_biallelic()
+	genotypes.check_maf(threshold=0.05)
+	genotypes.check_phase()
+
 Subsetting
 **********
 You can index into a loaded :class:`Genotypes` instance using the ``subset()`` function. This works similiar to numpy indexing with the added benefit that you can specify a subset of variants and/or samples by their IDs instead of just their indices.

From 111df55a952c5f19e7cc5f8e2372ebc02402b55b Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Mon, 24 Oct 2022 16:35:42 -0700
Subject: [PATCH 11/12] fix GenotypesPLINK typo in haptools data API example

---
 docs/api/examples.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/api/examples.rst b/docs/api/examples.rst
index 8592db4a..340b8a41 100644
--- a/docs/api/examples.rst
+++ b/docs/api/examples.rst
@@ -88,7 +88,7 @@ You can easily use the :ref:`data API <api-data>` and the :ref:`simphenotype API
     gt.read(variants=variants)
     # the advantage of using a PGEN file is that you can use read_variants() to load
     # the variants quickly w/o having to load the genotypes, too
-    gt = data.GenotypesPGEN("tests/data/apoe.pgen")
+    gt = data.GenotypesPLINK("tests/data/apoe.pgen")
     gt.read_variants(variants=variants)
 
     # initialize an empty haplotype file

From 0b89c93a6a8d2ebfa2ea6bbc5432139d02750ee6 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Mon, 24 Oct 2022 16:36:17 -0700
Subject: [PATCH 12/12] fix another typo - yikes

---
 docs/api/examples.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/api/examples.rst b/docs/api/examples.rst
index 340b8a41..c6ae9dcf 100644
--- a/docs/api/examples.rst
+++ b/docs/api/examples.rst
@@ -25,7 +25,7 @@ As an example, let's say we would like to convert the following ``.blocks.det``
     # load the genotypes file
     # you can use either a VCF or PGEN file
     gt = data.GenotypesRefAlt.load("input.vcf.gz")
-    gt = data.GenotypesPGEN.load("input.pgen")
+    gt = data.GenotypesPLINK.load("input.pgen")
 
     # load the haplotypes
     hp = data.Haplotypes("output.hap")