Skip to content

Commit

Permalink
support reading PLINK2-style pheno and covar files
Browse files (browse the repository at this point in the history)
and make covariates class a subclass of phenotypes
see #19 for more details
  • Loading branch information
aryarm committed Jul 1, 2022
1 parent bffd27c commit afcdda5
Show file tree
Hide file tree
Showing 4 changed files with 204 additions and 249 deletions.
128 changes: 5 additions & 123 deletions haptools/data/covariates.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,17 @@
import numpy as np

from .data import Data
from .phenotypes import Phenotypes


class Covariates(Data):
class Covariates(Phenotypes):
"""
A class for processing covariates from a file
Attributes
----------
data : np.array
The covariates in an n (samples) x 1 (covariate value) array
The covariates in an n (samples) x m (covariates) array
fname : Path
The path to the read-only file containing the data
samples : tuple[str]
Expand All @@ -33,124 +34,5 @@ class Covariates(Data):
"""

# NOTE(review): pre-refactor Covariates.__init__ (removed by this commit);
# it only sets up empty containers — no file I/O happens until read()/load()
def __init__(self, fname: Path, log: Logger = None):
# let the Data base class store the filename and logger
super().__init__(fname, log)
# sample IDs, filled in by read()
self.samples = tuple()
# covariate column names, filled in by read()
self.names = tuple()

@classmethod
def load(cls: Covariates, fname: Path, samples: list[str] = None) -> Covariates:
    """
    Create a Covariates instance and immediately read its file contents

    This is a convenience wrapper around the constructor plus
    :py:meth:`~.Covariates.read`

    Parameters
    ----------
    fname
        See documentation for :py:attr:`~.Data.fname`
    samples : list[str], optional
        See documentation for :py:meth:`~.Data.Covariates.read`

    Returns
    -------
    covariates
        A Covariates object with the data loaded into its properties
    """
    instance = cls(fname)
    instance.read(samples)
    return instance

def read(self, samples: list[str] = None):
    """
    Read covariates from a TSV file into a numpy matrix stored in :py:attr:`~.Covariates.data`

    Parameters
    ----------
    samples : list[str], optional
        A subset of the samples from which to extract covariates

        Defaults to loading covariates from all samples

    Raises
    ------
    ValueError
        If the provided file doesn't follow the expected format
    """
    super().read()
    # load all info into memory
    # use hook_compressed to automatically handle gz files
    with hook_compressed(self.fname, mode="rt") as covars:
        covar_text = reader(covars, delimiter="\t")
        header = next(covar_text)
        # there should be at least two columns: sample IDs plus >= 1 covariate
        if len(header) < 2:
            raise ValueError("The covariates TSV should have at least two columns.")
        # the first column should be called "sample"
        if header[0] != "sample":
            self.log.warning(
                "The first column of the covariates TSV should contain sample IDs"
                " and should be named 'sample' in the header line"
            )
        # convert to list and subset samples if need be
        # NOTE: an empty samples list is falsy, so it loads ALL samples
        if samples:
            samples = set(samples)
            covar_text = [covar for covar in covar_text if covar[0] in samples]
        else:
            covar_text = list(covar_text)
    if not covar_text:
        # fail fast with a clear message instead of an IndexError below
        raise ValueError("The covariates TSV doesn't contain any samples to load.")
    # every non-sample column must be castable to a float; check the first row
    # and re-raise (instead of just logging) so the problem surfaces here
    # rather than as an opaque failure in np.array() below
    try:
        float(covar_text[0][1])
    except ValueError:
        self.log.error(
            "Every column in the covariates file (besides the sample column) must"
            " be numeric."
        )
        raise
    # fill out the samples and data properties
    data = list(zip(*covar_text))
    self.samples = data[0]
    self.names = tuple(header[1:])
    # coerce strings to floats; transpose to an n (samples) x m (covariates) array
    self.data = np.transpose(np.array(data[1:], dtype="float64"))

def __iter__(self, samples: list[str] = None) -> Iterator[namedtuple]:
"""
Read covariates from a TSV line by line without storing anything
Parameters
----------
samples : list[str], optional
A subset of the samples from which to extract covariates
Defaults to loading covariates from all samples
Yields
------
Iterator[namedtuple]
An iterator over each line in the file, where each line is encoded as a
namedtuple containing each of the class properties
"""
with hook_compressed(self.fname, mode="rt") as covars:
covar_text = reader(covars, delimiter="\t")
header = next(covar_text)
# there should at least two columns
if len(header) < 2:
raise ValueError("The covariates TSV should have at least two columns.")
# the first column should be called "sample"
if header[0] != "sample":
self.log.warning(
"The first column of the covariates TSV should contain sample IDs"
" and should be named 'sample' in the header line"
)
header = tuple(header[1:])
Record = namedtuple("Record", "data samples names")
for covar in covar_text:
if samples is None or covar[0] in samples:
try:
yield Record(
np.array(covar[1:], dtype="float64"), covar[0], header
)
except:
self.log.error(
"Every column in the covariates file (besides the sample"
" column) must be numeric."
)
# call Data.__init__ directly, deliberately skipping Phenotypes.__init__ in
# the MRO, so this subclass can set its own extension tag below
super(Phenotypes, self).__init__(fname, log)
# extension tag interpolated into log/error messages (e.g. ".covar file")
self._ext = 'covar'
2 changes: 1 addition & 1 deletion haptools/data/haplotypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def fmt_str(self) -> str:
"""
Convert an Extra into a fmt string
Retruns
Returns
-------
str
A python format string (ex: "{beta:.3f}")
Expand Down
112 changes: 75 additions & 37 deletions haptools/data/phenotypes.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from __future__ import annotations
from csv import reader
from pathlib import Path
from io import TextIOBase
from collections import namedtuple
from collections.abc import Iterable
from logging import getLogger, Logger
from fileinput import hook_compressed

Expand All @@ -17,11 +19,13 @@ class Phenotypes(Data):
Attributes
----------
data : np.array
The phenotypes in an n (samples) x 1 (phenotype value) array
The phenotypes in an n (samples) x m (phenotypes) array
fname : Path
The path to the read-only file containing the data
samples : tuple
The names of each of the n samples
names : tuple[str]
The names of the phenotypes
log: Logger
A logging instance for recording debug statements.
Expand All @@ -33,6 +37,8 @@ class Phenotypes(Data):
def __init__(self, fname: Path, log: Logger = None):
    """Set up an empty Phenotypes container; no file I/O happens here."""
    super().__init__(fname, log)
    # nothing is loaded yet — read()/__iter__ populate these
    self.samples = ()
    self.names = ()
    self._ext = 'pheno'

@classmethod
def load(cls: Phenotypes, fname: Path, samples: list[str] = None) -> Phenotypes:
Expand Down Expand Up @@ -75,36 +81,26 @@ def read(self, samples: list[str] = None):
def read(self, samples: list[str] = None):
    """
    Read phenotypes from a TSV file into a numpy matrix stored in :py:attr:`~.Phenotypes.data`

    Parameters
    ----------
    samples : list[str], optional
        A subset of the samples from which to extract phenotypes

        Defaults to loading phenotypes from all samples

    Raises
    ------
    ValueError
        If the provided file doesn't follow the expected format
    """
    super().read()
    # call self.__iter__() to obtain the data and samples
    records = list(self.__iter__(samples))
    if not records:
        # zip(*[]) below would raise an opaque "not enough values to unpack";
        # fail with a clearer message instead
        raise ValueError(f"No samples could be loaded from the .{self._ext} file.")
    data, self.samples = zip(*records)
    self.log.info(f"Loaded {len(self.samples)} samples from .{self._ext} file")
    # collect data in a np array
    self.data = np.array(data)

def __iter__(self, samples: list[str] = None) -> Iterator[namedtuple]:
def _iterate(self, phens: TextIOBase, phen_text: Iterable, samples: set[str] = None):
"""
Read phenotypes from a TSV line by line without storing anything
A generator over the lines of a TSV
This is a helper function for :py:meth:`~.Phenotypes.__iter__`
Parameters
----------
samples : list[str], optional
phens: TextIOBase
The file handler for the stream
phen_text: Iterable
The csv.reader object containing the lines of text from the file as lists
samples : set[str], optional
A subset of the samples from which to extract phenotypes
Defaults to loading phenotypes from all samples
Expand All @@ -115,22 +111,64 @@ def __iter__(self, samples: list[str] = None) -> Iterator[namedtuple]:
An iterator over each line in the file, where each line is encoded as a
namedtuple containing each of the class properties
"""
with hook_compressed(self.fname, mode="rt") as phens:
phen_text = reader(phens, delimiter="\t")
Record = namedtuple("Record", "data samples")
for phen in phen_text:
if samples is None or phen[0] in samples:
try:
yield Record(float(phen[1]), phen[0])
except:
self.log.error(
"The second column of the TSV file must numeric."
)
self.log.info(f"Loading {len(self.names)} columns from .{self._ext} file")
Record = namedtuple("Record", "data samples")
for phen in phen_text:
if samples is None or phen[0] in samples:
try:
yield Record(
np.array(phen[1:], dtype="float64"), phen[0]
)
except:
self.log.error(
f"Every column in the .{self._ext} file (besides the sample"
" column) must be numeric."
)
phens.close()

def __iter__(self, samples: list[str] = None) -> Iterable[namedtuple]:
"""
Read phenotypes from a TSV line by line without storing anything
Parameters
----------
samples : list[str], optional
A subset of the samples from which to extract phenotypes
Defaults to loading phenotypes from all samples
Returns
------
Iterable[namedtuple]
See documentation for :py:meth:`~.Phenotypes._iterate`
"""
phens = hook_compressed(self.fname, mode="rt")
phen_text = reader(phens, delimiter="\t")
# ignore all of the comment lines
while True:
header = next(phen_text)
if not header[0].startswith('#') or header[0].startswith('#IID'):
break

# there should be at least two columns
if len(header) < 2:
raise ValueError(f"The .{self._ext} file should have at least two columns.")
# the first column should be called "#IID"
if header[0] != "#IID":
self.log.warning(
f"The first column of the .{self._ext} file should contain sample IDs"
" and should be named '#IID' in the header line"
)
self.names = tuple(header[1:])
samples = set(samples) if samples else None
# call another function to force the lines above to be run immediately
# see https://stackoverflow.com/a/36726497
return self._iterate(phens, phen_text, samples)

def standardize(self):
"""
Standardize phenotypes so they have a mean of 0 and a stdev of 1
This function modifies :py:attr:`~.Genotypes.data` in-place
"""
self.data = (self.data - np.mean(self.data)) / np.std(self.data)
self.data = (self.data - np.mean(self.data, axis=0)) / np.std(self.data, axis=0)
Loading

0 comments on commit afcdda5

Please sign in to comment.