Skip to content

Commit

Permalink
support reading PLINK2-style pheno and covar files
Browse files (browse the repository at this point in the history)
and make covariates class a subclass of phenotypes
see #19 for more details
  • Loading branch information
aryarm committed Jul 1, 2022
1 parent bffd27c commit afcdda5
Show file tree
Hide file tree
Showing 4 changed files with 204 additions and 249 deletions.
128 changes: 5 additions & 123 deletions haptools/data/covariates.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,17 @@
import numpy as np

from .data import Data
from .phenotypes import Phenotypes


class Covariates(Data):
class Covariates(Phenotypes):
"""
A class for processing covariates from a file
Attributes
----------
data : np.array
The covariates in an n (samples) x 1 (covariate value) array
The covariates in an n (samples) x m (covariates) array
fname : Path
The path to the read-only file containing the data
samples : tuple[str]
Expand All @@ -33,124 +34,5 @@ class Covariates(Data):
"""

# NOTE(review): pre-refactor Covariates.__init__ (removed by this commit);
# it only sets up empty containers — no file I/O happens until read()/load()
def __init__(self, fname: Path, log: Logger = None):
# let the Data base class store the filename and logger
super().__init__(fname, log)
# sample IDs, filled in by read()
self.samples = tuple()
# covariate column names, filled in by read()
self.names = tuple()

@classmethod
def load(cls: Covariates, fname: Path, samples: list[str] = None) -> Covariates:
    """
    Create a Covariates instance and immediately read its file contents

    This is a convenience wrapper around the constructor plus
    :py:meth:`~.Covariates.read`

    Parameters
    ----------
    fname
        See documentation for :py:attr:`~.Data.fname`
    samples : list[str], optional
        See documentation for :py:meth:`~.Data.Covariates.read`

    Returns
    -------
    covariates
        A Covariates object with the data loaded into its properties
    """
    instance = cls(fname)
    instance.read(samples)
    return instance

def read(self, samples: list[str] = None):
    """
    Read covariates from a TSV file into a numpy matrix stored in :py:attr:`~.Covariates.data`

    Parameters
    ----------
    samples : list[str], optional
        A subset of the samples from which to extract covariates

        Defaults to loading covariates from all samples

    Raises
    ------
    ValueError
        If the provided file doesn't follow the expected format
    """
    super().read()
    # load all info into memory
    # use hook_compressed to automatically handle gz files
    with hook_compressed(self.fname, mode="rt") as covars:
        covar_text = reader(covars, delimiter="\t")
        header = next(covar_text)
        # there should be at least two columns: sample IDs plus >= 1 covariate
        if len(header) < 2:
            raise ValueError("The covariates TSV should have at least two columns.")
        # the first column should be called "sample"
        if header[0] != "sample":
            self.log.warning(
                "The first column of the covariates TSV should contain sample IDs"
                " and should be named 'sample' in the header line"
            )
        # convert to list and subset samples if need be
        # NOTE: an empty samples list is falsy, so it loads ALL samples
        if samples:
            samples = set(samples)
            covar_text = [covar for covar in covar_text if covar[0] in samples]
        else:
            covar_text = list(covar_text)
    if not covar_text:
        # fail fast with a clear message instead of an IndexError below
        raise ValueError("The covariates TSV doesn't contain any samples to load.")
    # every non-sample column must be castable to a float; check the first row
    # and re-raise (instead of just logging) so the problem surfaces here
    # rather than as an opaque failure in np.array() below
    try:
        float(covar_text[0][1])
    except ValueError:
        self.log.error(
            "Every column in the covariates file (besides the sample column) must"
            " be numeric."
        )
        raise
    # fill out the samples and data properties
    data = list(zip(*covar_text))
    self.samples = data[0]
    self.names = tuple(header[1:])
    # coerce strings to floats; transpose to an n (samples) x m (covariates) array
    self.data = np.transpose(np.array(data[1:], dtype="float64"))

def __iter__(self, samples: list[str] = None) -> Iterator[namedtuple]:
"""
Read covariates from a TSV line by line without storing anything
Parameters
----------
samples : list[str], optional
A subset of the samples from which to extract covariates
Defaults to loading covariates from all samples
Yields
------
Iterator[namedtuple]
An iterator over each line in the file, where each line is encoded as a
namedtuple containing each of the class properties
"""
with hook_compressed(self.fname, mode="rt") as covars:
covar_text = reader(covars, delimiter="\t")
header = next(covar_text)
# there should at least two columns
if len(header) < 2:
raise ValueError("The covariates TSV should have at least two columns.")
# the first column should be called "sample"
if header[0] != "sample":
self.log.warning(
"The first column of the covariates TSV should contain sample IDs"
" and should be named 'sample' in the header line"
)
header = tuple(header[1:])
Record = namedtuple("Record", "data samples names")
for covar in covar_text:
if samples is None or covar[0] in samples:
try:
yield Record(
np.array(covar[1:], dtype="float64"), covar[0], header
)
except:
self.log.error(
"Every column in the covariates file (besides the sample"
" column) must be numeric."
)
# call Data.__init__ directly, deliberately skipping Phenotypes.__init__ in
# the MRO, so this subclass can set its own extension tag below
super(Phenotypes, self).__init__(fname, log)
# extension tag interpolated into log/error messages (e.g. ".covar file")
self._ext = 'covar'
2 changes: 1 addition & 1 deletion haptools/data/haplotypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def fmt_str(self) -> str:
"""
Convert an Extra into a fmt string
Retruns
Returns
-------
str
A python format string (ex: "{beta:.3f}")
Expand Down
112 changes: 75 additions & 37 deletions haptools/data/phenotypes.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from __future__ import annotations
from csv import reader
from pathlib import Path
from io import TextIOBase
from collections import namedtuple
from collections.abc import Iterable
from logging import getLogger, Logger
from fileinput import hook_compressed

Expand All @@ -17,11 +19,13 @@ class Phenotypes(Data):
Attributes
----------
data : np.array
The phenotypes in an n (samples) x 1 (phenotype value) array
The phenotypes in an n (samples) x m (phenotypes) array
fname : Path
The path to the read-only file containing the data
samples : tuple
The names of each of the n samples
names : tuple[str]
The names of the phenotypes
log: Logger
A logging instance for recording debug statements.
Expand All @@ -33,6 +37,8 @@ class Phenotypes(Data):
def __init__(self, fname: Path, log: Logger = None):
    """Set up an empty Phenotypes container; no file I/O happens here."""
    super().__init__(fname, log)
    # nothing is loaded yet — read()/__iter__ populate these
    self.samples = ()
    self.names = ()
    self._ext = 'pheno'

@classmethod
def load(cls: Phenotypes, fname: Path, samples: list[str] = None) -> Phenotypes:
Expand Down Expand Up @@ -75,36 +81,26 @@ def read(self, samples: list[str] = None):
def read(self, samples: list[str] = None):
    """
    Read phenotypes from a TSV file into a numpy matrix stored in :py:attr:`~.Phenotypes.data`

    Parameters
    ----------
    samples : list[str], optional
        A subset of the samples from which to extract phenotypes

        Defaults to loading phenotypes from all samples

    Raises
    ------
    ValueError
        If the provided file doesn't follow the expected format
    """
    super().read()
    # call self.__iter__() to obtain the data and samples
    records = list(self.__iter__(samples))
    if not records:
        # zip(*[]) below would raise an opaque "not enough values to unpack";
        # fail with a clearer message instead
        raise ValueError(f"No samples could be loaded from the .{self._ext} file.")
    data, self.samples = zip(*records)
    self.log.info(f"Loaded {len(self.samples)} samples from .{self._ext} file")
    # collect data in a np array
    self.data = np.array(data)

def __iter__(self, samples: list[str] = None) -> Iterator[namedtuple]:
def _iterate(self, phens: TextIOBase, phen_text: Iterable, samples: set[str] = None):
"""
Read phenotypes from a TSV line by line without storing anything
A generator over the lines of a TSV
This is a helper function for :py:meth:`~.Phenotypes.__iter__`
Parameters
----------
samples : list[str], optional
phens: TextIOBase
The file handler for the stream
phen_text: Iterable
The csv.reader object containing the lines of text from the file as lists
samples : set[str], optional
A subset of the samples from which to extract phenotypes
Defaults to loading phenotypes from all samples
Expand All @@ -115,22 +111,64 @@ def __iter__(self, samples: list[str] = None) -> Iterator[namedtuple]:
An iterator over each line in the file, where each line is encoded as a
namedtuple containing each of the class properties
"""
with hook_compressed(self.fname, mode="rt") as phens:
phen_text = reader(phens, delimiter="\t")
Record = namedtuple("Record", "data samples")
for phen in phen_text:
if samples is None or phen[0] in samples:
try:
yield Record(float(phen[1]), phen[0])
except:
self.log.error(
"The second column of the TSV file must numeric."
)
self.log.info(f"Loading {len(self.names)} columns from .{self._ext} file")
Record = namedtuple("Record", "data samples")
for phen in phen_text:
if samples is None or phen[0] in samples:
try:
yield Record(
np.array(phen[1:], dtype="float64"), phen[0]
)
except:
self.log.error(
f"Every column in the .{self._ext} file (besides the sample"
" column) must be numeric."
)
phens.close()

def __iter__(self, samples: list[str] = None) -> Iterable[namedtuple]:
"""
Read phenotypes from a TSV line by line without storing anything
Parameters
----------
samples : list[str], optional
A subset of the samples from which to extract phenotypes
Defaults to loading phenotypes from all samples
Returns
------
Iterable[namedtuple]
See documentation for :py:meth:`~.Phenotypes._iterate`
"""
phens = hook_compressed(self.fname, mode="rt")
phen_text = reader(phens, delimiter="\t")
# ignore all of the comment lines
while True:
header = next(phen_text)
if not header[0].startswith('#') or header[0].startswith('#IID'):
break

# there should be at least two columns
if len(header) < 2:
raise ValueError(f"The .{self._ext} file should have at least two columns.")
# the first column should be called "#IID"
if header[0] != "#IID":
self.log.warning(
f"The first column of the .{self._ext} file should contain sample IDs"
" and should be named '#IID' in the header line"
)
self.names = tuple(header[1:])
samples = set(samples) if samples else None
# call another function to force the lines above to be run immediately
# see https://stackoverflow.com/a/36726497
return self._iterate(phens, phen_text, samples)

def standardize(self):
"""
Standardize phenotypes so they have a mean of 0 and a stdev of 1
This function modifies :py:attr:`~.Genotypes.data` in-place
"""
self.data = (self.data - np.mean(self.data)) / np.std(self.data)
self.data = (self.data - np.mean(self.data, axis=0)) / np.std(self.data, axis=0)
Loading

0 comments on commit afcdda5

Please sign in to comment.