Merge pull request #124 from CAST-genomics/ref/genotypes-af

feat: `Genotypes.check_maf()` method
CAST-genomics · Oct 25, 2022 · dd4fa9e · dd4fa9e
2 parents f7df71f + 0b89c93
commit dd4fa9e
Show file tree

Hide file tree

Showing 16 changed files with 221 additions and 145 deletions.
diff --git a/docs/api/data.rst b/docs/api/data.rst
@@ -59,7 +59,7 @@ Properties
 **********
 The ``data`` property of a :class:`Genotypes` object is a numpy array representing the genotype matrix. Rows of the array are samples and columns are variants. Each entry in the matrix is a tuple of values -- one for each chromosome. Each value is an integer denoting the index of the allele (0 for REF, 1 for the first ALT allele, 2 for the next ALT allele, etc).
 
-There are two additional properties that contain variant and sample metadata. The ``variants`` property is a numpy structured array and the ``samples`` property is a simple tuple of sample IDs. The ``variants`` structured array has four named columns: "id", "chrom", "pos", and "aaf" (alternate allele frequency).
+There are two additional properties that contain variant and sample metadata. The ``variants`` property is a numpy structured array and the ``samples`` property is a simple tuple of sample IDs. The ``variants`` structured array has three named columns: "id" (variant ID), "chrom" (chromosome name), and "pos" (chromosomal position).
 
 Reading a file
 **************
@@ -129,6 +129,17 @@ There are several quality-control checks performed by default (in the ``load()``
 2. ``check_biallelic()`` - raises an error if any variants have more than one ALT allele
 3. ``check_phase()`` - raises an error if any genotypes are unphased
 
+Additionally, you can use the ``check_maf()`` method after checking for missing genotypes and confirming that all variants are biallelic.
+
+.. code-block:: python
+
+	genotypes = data.Genotypes('tests/data/simple.vcf.gz')
+	genotypes.read()
+	genotypes.check_missing()
+	genotypes.check_biallelic()
+	genotypes.check_maf(threshold=0.05)
+	genotypes.check_phase()
+
 Subsetting
 **********
 You can index into a loaded :class:`Genotypes` instance using the ``subset()`` function. This works similiar to numpy indexing with the added benefit that you can specify a subset of variants and/or samples by their IDs instead of just their indices.
@@ -143,7 +154,7 @@ By default, the ``subset()`` method returns a new :class:`Genotypes` instance. T
 
 GenotypesRefAlt
 +++++++++++++++
-The :class:`Genotypes` class can be easily *extended* (sub-classed) to load extra fields into the ``variants`` structured array. The :class:`GenotypesRefAlt` class is an example of this where I extended the :class:`Genotypes` class to add REF and ALT fields from the VCF to the columns of the structured array. So the ``variants`` array will have named columns: "id", "chrom", "pos", "aaf", "ref", and "alt".
+The :class:`Genotypes` class can be easily *extended* (sub-classed) to load extra fields into the ``variants`` structured array. The :class:`GenotypesRefAlt` class is an example of this where I extended the :class:`Genotypes` class to add REF and ALT fields from the VCF to the columns of the structured array. So the ``variants`` array will have named columns: "id", "chrom", "pos", "ref", and "alt".
 
 All of the other methods in the :class:`Genotypes` class are inherited, but the :class:`GenotypesRefAlt` class implements an additional method ``write()`` for dumping the contents of the class to the provided file.
 

diff --git a/docs/api/examples.rst b/docs/api/examples.rst
@@ -25,7 +25,7 @@ As an example, let's say we would like to convert the following ``.blocks.det``
     # load the genotypes file
     # you can use either a VCF or PGEN file
     gt = data.GenotypesRefAlt.load("input.vcf.gz")
-    gt = data.GenotypesPGEN.load("input.pgen")
+    gt = data.GenotypesPLINK.load("input.pgen")
 
     # load the haplotypes
     hp = data.Haplotypes("output.hap")
@@ -88,7 +88,7 @@ You can easily use the :ref:`data API <api-data>` and the :ref:`simphenotype API
     gt.read(variants=variants)
     # the advantage of using a PGEN file is that you can use read_variants() to load
     # the variants quickly w/o having to load the genotypes, too
-    gt = data.GenotypesPGEN("tests/data/apoe.pgen")
+    gt = data.GenotypesPLINK("tests/data/apoe.pgen")
     gt.read_variants(variants=variants)
 
     # initialize an empty haplotype file

diff --git a/docs/conf.py b/docs/conf.py
@@ -49,7 +49,7 @@
 autosummary_generate = True
 numpydoc_show_class_members = False
 # allow for both rst and md syntax
-source_suffix = ['.rst']
+source_suffix = [".rst"]
 
 # -- Options for HTML output -------------------------------------------------
 

diff --git a/haptools/__main__.py b/haptools/__main__.py
@@ -764,7 +764,7 @@ def ld(
     "-o",
     "--output",
     type=click.Path(path_type=Path),
-    default= None,
+    default=None,
     show_default="input file",
     help="A .hap file containing sorted and indexed haplotypes and variants",
 )
@@ -778,11 +778,10 @@ def ld(
 )
 def index(
     haplotypes: Path,
-    sort: bool= False,
+    sort: bool = False,
     output: Path = None,
-    verbosity: str = 'CRITICAL',
+    verbosity: str = "CRITICAL",
 ):
-
     """
     Takes in an unsorted .hap file and outputs it as a .gz and a .tbi file
     """

diff --git a/haptools/admix_storage.py b/haptools/admix_storage.py
@@ -11,7 +11,7 @@ def __repr__(self):
 
     def __str__(self):
         return f"(Chrom {self.chrom}, {self.cm_map_pos} cM, {self.bp_map_pos} bp)"
-    
+
     def get_chrom(self):
         return self.chrom
 
@@ -30,25 +30,35 @@ class HaplotypeSegment:
     def __init__(self, pop_num, chrom, end_coord, end_pos):
         """
         Note the beginning of the segment is inferred based on previous
-        segments stored throughout the simulation process. 
-        Arguments
-            pop_num - population label
-            chrom - chromosome the haplotype segment lies on.
-            end_coord - Ending coordinate in bp of the haplotype segment
-            end_pos - Ending coordinate in centimorgans of the hap segment
+        segments stored throughout the simulation process.
+
+        Parameters
+        ----------
+            pop_num
+                population label
+            chrom
+                chromosome the haplotype segment lies on.
+            end_coord
+                Ending coordinate in bp of the haplotype segment
+            end_pos
+                Ending coordinate in centimorgans of the hap segment
         """
         self.pop_num = pop_num
         self.chrom = chrom
         self.end_coord = end_coord
         self.end_pos = end_pos
 
     def __repr__(self):
-        return f"(Population {self.pop_num}, chrom {self.chrom}, " + \
-               f"end_coord {self.end_coord}, end_pos {self.end_pos})"
+        return (
+            f"(Population {self.pop_num}, chrom {self.chrom}, "
+            + f"end_coord {self.end_coord}, end_pos {self.end_pos})"
+        )
 
     def __str__(self):
-        return f"Population {self.pop_num}, chrom {self.chrom}, " + \
-               f"end_coord {self.end_coord}, end_pos {self.end_pos}"
+        return (
+            f"Population {self.pop_num}, chrom {self.chrom}, "
+            + f"end_coord {self.end_coord}, end_pos {self.end_pos}"
+        )
 
     def get_end_pos(self):
         return self.end_pos

diff --git a/haptools/data/breakpoints.py b/haptools/data/breakpoints.py
@@ -269,7 +269,7 @@ def population_array(
             A subset of samples to include in the output, ordered by their given order
 
         Returns
-        ------
+        -------
         npt.NDArray
             An array of shape: samples x variants x 2
 

diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
@@ -32,7 +32,6 @@ class Genotypes(Data):
             1. ID
             2. CHROM
             3. POS
-            4. AAF: allele freq of alternate allele (or MAF if to_MAC() is called)
     log: Logger
         A logging instance for recording debug statements
     _prephased : bool
@@ -61,7 +60,6 @@ def __init__(self, fname: Path | str, log: Logger = None):
                 ("id", "U50"),
                 ("chrom", "U10"),
                 ("pos", np.uint32),
-                ("aaf", np.float64),
             ],
         )
         self._prephased = False
@@ -219,7 +217,7 @@ def _variant_arr(self, record: Variant):
             A row from the :py:attr:`~.Genotypes.variants` array
         """
         return np.array(
-            (record.ID, record.CHROM, record.POS, record.aaf),
+            (record.ID, record.CHROM, record.POS),
             dtype=self.variants.dtype,
         )
 
@@ -299,15 +297,15 @@ def index(self, samples: bool = True, variants: bool = True):
         Call this function once to improve the amortized time-complexity of look-ups of
         samples and variants by their ID. This is useful if you intend to later subset
         by a set of samples or variant IDs.
-        The time complexity of this function should be roughly O(n+m) if both
-        parameters are True. Otherwise, it will be either O(n) or O(m).
+        The time complexity of this function should be roughly O(n+p) if both
+        parameters are True. Otherwise, it will be either O(n) or O(p).
 
         Parameters
         ----------
         samples: bool, optional
             Whether to index the samples for fast loop-up. Adds complexity O(n).
         variants: bool, optional
-            Whether to index the variants for fast look-up. Adds complexity O(m).
+            Whether to index the variants for fast look-up. Adds complexity O(p).
 
         Raises
         ------
@@ -504,37 +502,73 @@ def check_phase(self):
         # remove the last dimension that contains the phase info
         self.data = self.data[:, :, :2]
 
-    def to_MAC(self):
+    def check_maf(
+        self,
+        threshold: float = None,
+        discard_also: bool = False,
+        warn_only: bool = False,
+    ) -> npt.NDArray[np.float64]:
         """
-        Convert the ALT count GT matrix into a matrix of minor allele counts
+        Check the minor allele frequency of each variant
 
-        This function modifies :py:attr:`~.Genotypes.data` in-place
+        Raise a ValueError if any variant's MAF doesn't satisfy the threshold, if
+        one is provided
+
+        .. note::
+            You should call :py:meth:`~.Genotypes.check_missing` and
+            :py:meth:`~.Genotypes.check_biallelic` before executing this method, for
+            best results. Otherwise, the frequencies may be computed incorrectly.
 
-        It also changes the 'aaf' record in :py:attr:`~.Genotypes.variants` to 'maf'
+        Parameters
+        ----------
+        threshold: float, optional
+            If a variant has a minor allele frequency (MAF) rarer than this threshold,
+            raise a ValueError
+        discard_also : bool, optional
+            If True, discard any variants that would otherwise cause a ValueError
+
+            This parameter will be ignored if a threshold is not specified
+        warn_only: bool, optional
+            Just raise a warning instead of a ValueError
 
         Raises
         ------
-        AssertionError
-            If the matrix has already been converted
+        ValueError
+            If any variant does not meet the provided threshold minor allele frequency
+
+        Returns
+        -------
+            The minor allele frequency of each variant
         """
-        if self.variants.dtype.names[3] == "maf":
-            self.log.warning(
-                "The matrix already counts instances of the minor allele rather than "
-                "the ALT allele."
-            )
-            return
-        need_conversion = self.variants["aaf"] > 0.5
-        # flip the count on the variants that have an alternate allele frequency
-        # above 0.5
-        self.data[:, need_conversion, :2] = ~self.data[:, need_conversion, :2]
-        # also encode an MAF instead of an AAF in self.variants
-        self.variants["aaf"][need_conversion] = (
-            1 - self.variants["aaf"][need_conversion]
-        )
-        # replace 'aaf' with 'maf' in the matrix
-        self.variants.dtype.names = [
-            (x, "maf")[x == "aaf"] for x in self.variants.dtype.names
-        ]
+        num_strands = 2 * self.data.shape[0]
+        # TODO: make this work for multi-allelic variants, too?
+        ref_af = self.data[:, :, :2].astype(np.bool_).sum(axis=(0, 2)) / num_strands
+        maf = np.array([ref_af, 1 - ref_af]).min(axis=0)
+        if threshold is None:
+            return maf
+        rare_variants = maf < threshold
+        if np.any(rare_variants):
+            idx = np.nonzero(rare_variants)[0]
+            if discard_also:
+                original_num_variants = len(self.variants)
+                self.data = np.delete(self.data, idx, axis=1)
+                self.variants = np.delete(self.variants, idx)
+                maf = np.delete(maf, idx)
+                self.log.info(
+                    "Ignoring missing genotypes from "
+                    f"{original_num_variants - len(self.variants)} samples"
+                )
+                self._var_idx = None
+            else:
+                vals = tuple(self.variants[idx[0]])[:3] + (maf[idx[0]], threshold)
+                msg = "Variant with ID {} at POS {}:{} has MAF {} < {}".format(*vals)
+                if warn_only:
+                    self.log.warning(msg)
+                else:
+                    # raise error if the minor allele frequency of a variant does not
+                    # meet the threshold
+                    raise ValueError(msg)
+        return maf
 
 
 class GenotypesRefAlt(Genotypes):
@@ -556,9 +590,8 @@ class GenotypesRefAlt(Genotypes):
             1. ID
             2. CHROM
             3. POS
-            4. AAF: allele freq of alternate allele (or MAF if to_MAC() is called)
-            5. REF
-            6. ALT
+            4. REF
+            5. ALT
     log: Logger
         See documentation for :py:attr:`~.Genotypes.log`
     """
@@ -571,7 +604,6 @@ def __init__(self, fname: Path | str, log: Logger = None):
                 ("id", "U50"),
                 ("chrom", "U10"),
                 ("pos", np.uint32),
-                ("aaf", np.float64),
                 ("ref", "U100"),
                 ("alt", "U100"),
             ],
@@ -586,7 +618,6 @@ def _variant_arr(self, record: Variant):
                 record.ID,
                 record.CHROM,
                 record.POS,
-                record.aaf,
                 record.REF,
                 record.ALT[0],
             ),
@@ -795,7 +826,6 @@ def _variant_arr(
                 record[cid["ID"]],
                 record[cid["CHROM"]],
                 record[cid["POS"]],
-                0.5,
                 record[cid["REF"]],
                 record[cid["ALT"]],
             ),

diff --git a/haptools/data/haplotypes.py b/haptools/data/haplotypes.py
@@ -1080,7 +1080,7 @@ def transform(
             hap_gts = GenotypesRefAlt(fname=None, log=self.log)
         hap_gts.samples = gts.samples
         hap_gts.variants = np.array(
-            [(hap.id, hap.chrom, hap.start, 0, "A", "T") for hap in self.data.values()],
+            [(hap.id, hap.chrom, hap.start, "A", "T") for hap in self.data.values()],
             dtype=hap_gts.variants.dtype,
         )
         # build a fast data structure for querying the alleles in each haplotype:

diff --git a/haptools/index.py b/haptools/index.py
@@ -33,7 +33,7 @@ def index_haps(
     haplotypes: Path,
     sort: bool = False,
     output: Path = None,
-    log: Logger = None,
+    log: logging.Logger = None,
 ):
     """
     Takes in an unsorted .hap file and outputs it as a .gz and a .tbi file

diff --git a/haptools/ld.py b/haptools/ld.py
@@ -57,7 +57,7 @@ def calc_ld(
     discard_missing: bool = False,
     from_gts: bool = False,
     output: Path = Path("/dev/stdout"),
-    log: Logger = None,
+    log: logging.Logger = None,
 ):
     """
     Creates a VCF composed of haplotypes

diff --git a/haptools/sim_genotype.py b/haptools/sim_genotype.py
@@ -776,7 +776,7 @@ def start_segment(start, chrom, segments):
 
     return len(segments)
 
-def validate_params(model, mapdir, chroms, popsize, invcf, sample_info, region, only_bp=False):
+def validate_params(model, mapdir, chroms, popsize, invcf, sample_info, region=None, only_bp=False):
     # validate model file
     mfile = open(model, 'r')
     num_samples, *pops = mfile.readline().strip().split()

diff --git a/haptools/transform.py b/haptools/transform.py
@@ -97,7 +97,7 @@ def transform(
             hap_gts = data.GenotypesRefAlt(fname=None, log=self.log)
         hap_gts.samples = gts.samples
         hap_gts.variants = np.array(
-            [(hap.id, hap.chrom, hap.start, 0, "A", "T") for hap in self.data.values()],
+            [(hap.id, hap.chrom, hap.start, "A", "T") for hap in self.data.values()],
             dtype=hap_gts.variants.dtype,
         )
         # build a fast data structure for querying the alleles in each haplotype:
@@ -510,7 +510,7 @@ def transform_haps(
     discard_missing: bool = False,
     ancestry: bool = False,
     output: Path = Path("-"),
-    log: Logger = None,
+    log: logging.Logger = None,
 ):
     """
     Creates a VCF composed of haplotypes

diff --git a/pyproject.toml b/pyproject.toml
@@ -66,6 +66,7 @@ haptools = 'haptools.__main__:main'
 [tool.black]
 line-length = 88
 preview = true
+extend-exclude = "haptools/(sim_genotype|karyogram).py"
 
 [tool.pytest.ini_options]
 log_cli_level = "DEBUG"
-Original file line number
+Diff line change
@@ Expand Up / @@ -269,7 +269,7 @@ def population_array( @@
                 A subset of samples to include in the output, ordered by their given order
             Returns
-            ------
+            -------
             npt.NDArray
                 An array of shape: samples x variants x 2
@@ Expand Down @@