Skip to content

Commit

Permalink
use logging instead of assertions in data module (see #19)
Browse files Browse the repository at this point in the history
  • Loading branch information
aryarm committed Apr 13, 2022
1 parent 7827e3d commit 18f5a97
Show file tree
Hide file tree
Showing 6 changed files with 53 additions and 33 deletions.
6 changes: 4 additions & 2 deletions haptools/data/covariates.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,16 @@ class Covariates(Data):
The path to the read-only file containing the data
samples : tuple[str]
The names of each of the n samples
log : Logger
A logging instance for recording debug statements.
Examples
--------
>>> covariates = Covariates.load('tests/data/covars.tsv')
"""

def __init__(self, fname: Path):
super().__init__(fname)
def __init__(self, fname: Path, log: Logger = None):
super().__init__(fname, log)
self.samples = tuple()
self.names = tuple()

Expand Down
8 changes: 6 additions & 2 deletions haptools/data/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from csv import reader
from pathlib import Path
from abc import ABC, abstractmethod
from logging import getLogger, Logger

import numpy as np

Expand All @@ -16,11 +17,14 @@ class Data(ABC):
The path to the read-only file containing the data
data : np.array
The contents of the data file, once loaded
log : Logger
A logging instance for recording debug statements.
"""

def __init__(self, fname: Path):
def __init__(self, fname: Path, log: Logger = None):
self.fname = fname
self.data = None
self.log = log or getLogger(self.__class__.__name__)
super().__init__()

def __repr__(self):
Expand All @@ -45,4 +49,4 @@ def read(self):
Read the raw file contents into the class properties
"""
if self.data is not None:
raise AssertionError("The data has already been loaded.")
self.log.warning("The data has already been loaded. Overriding.")
25 changes: 16 additions & 9 deletions haptools/data/genotypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,16 @@ class Genotypes(Data):
2. CHROM
3. POS
4. AAF: allele freq of alternate allele (or MAF if to_MAC() is called)
log : Logger
A logging instance for recording debug statements.
Examples
--------
>>> genotypes = Genotypes.load('tests/data/simple.vcf')
"""

def __init__(self, fname: Path):
super().__init__(fname)
def __init__(self, fname: Path, log: Logger = None):
super().__init__(fname, log)
self.samples = tuple()
self.variants = np.array([])

Expand Down Expand Up @@ -116,7 +118,7 @@ def read(self, region: str = None, samples: list[str] = None):
)
self.data = np.array(self.data, dtype=np.uint8)
if self.data.shape == (0, 0, 0):
raise ValueError(
self.log.warning(
"Failed to load genotypes. If you specified a region, check that the"
" contig name matches! For example, double-check the 'chr' prefix."
)
Expand All @@ -143,7 +145,8 @@ def check_biallelic(self, discard_also=False):
If True, discard any multiallelic variants without raising a ValueError
"""
if self.data.dtype == np.bool_:
raise AssertionError("All genotypes are already biallelic")
self.log.warning("All genotypes are already biallelic")
return
# check: are there any variants that have genotype values above 1?
# A genotype value above 1 would imply the variant has more than one ALT allele
multiallelic = np.any(self.data[:, :, :2] > 1, axis=2)
Expand Down Expand Up @@ -176,9 +179,10 @@ def check_phase(self):
If any heterozygous genotypes are unphased
"""
if self.data.shape[2] < 3:
raise AssertionError(
self.log.warning(
"Phase information has already been removed from the data"
)
return
# check: are there any variants that are heterozygous and unphased?
unphased = (self.data[:, :, 0] ^ self.data[:, :, 1]) & (~self.data[:, :, 2])
if np.any(unphased):
Expand All @@ -205,10 +209,11 @@ def to_MAC(self):
If the matrix has already been converted
"""
if self.variants.dtype.names[3] == "maf":
raise AssertionError(
"The matrix already counts instances of the minor allele rather than"
self.log.warning(
"The matrix already counts instances of the minor allele rather than "
"the ALT allele."
)
return
need_conversion = self.variants["aaf"] > 0.5
# flip the count on the variants that have an alternate allele frequency
# above 0.5
Expand Down Expand Up @@ -239,14 +244,16 @@ class GenotypesPLINK(Data):
2. CHROM
3. POS
4. AAF: allele freq of alternate allele (or MAF if to_MAC() is called)
log : Logger
A logging instance for recording debug statements.
Examples
--------
>>> genotypes = GenotypesPLINK.load('tests/data/simple.pgen')
"""

def __init__(self, fname: Path):
super().__init__(fname)
def __init__(self, fname: Path, log: Logger = None):
super().__init__(fname, log)
self.samples = tuple()
self.variants = np.array([])

Expand Down
6 changes: 4 additions & 2 deletions haptools/data/phenotypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,16 @@ class Phenotypes(Data):
The path to the read-only file containing the data
samples : tuple
The names of each of the n samples
log : Logger
A logging instance for recording debug statements.
Examples
--------
>>> phenotypes = Phenotypes.load('tests/data/simple.tsv')
"""

def __init__(self, fname: Path):
super().__init__(fname)
def __init__(self, fname: Path, log: Logger = None):
super().__init__(fname, log)
self.samples = tuple()

@classmethod
Expand Down
5 changes: 5 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,11 @@ haptools = 'haptools.__main__:main'
line-length = 88
preview = true

[tool.pytest.ini_options]
log_cli_level = "DEBUG"
log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)"
log_cli_date_format = "%Y-%m-%d %H:%M:%S"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
36 changes: 18 additions & 18 deletions tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def get_expected_genotypes():
return expected


def test_load_genotypes():
def test_load_genotypes(caplog):
expected = get_expected_genotypes()

# can we load the data from the VCF?
Expand All @@ -27,9 +27,9 @@ def test_load_genotypes():
np.testing.assert_allclose(gts.data, expected)
assert gts.samples == ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101")

# try loading the data again - it should fail b/c we've already done it
with pytest.raises(AssertionError):
gts.read()
# try loading the data again - it should warn b/c we've already done it
gts.read()
assert len(caplog.records) == 1 and caplog.records[0].levelname == "WARNING"

# force one of the SNPs to have more than one allele and check that we get an error
gts.data[1, 1, 1] = 2
Expand Down Expand Up @@ -62,9 +62,9 @@ def test_load_genotypes():
expected = expected[:, :, :2]
np.testing.assert_allclose(gts.data, expected)

# try to check phase again - it should fail b/c we've already done it before
with pytest.raises(AssertionError):
gts.check_phase()
# try to check phase again - it should warn b/c we've already done it before
gts.check_phase()
assert len(caplog.records) == 2 and caplog.records[1].levelname == "WARNING"

# convert the matrix of alt allele counts to a matrix of minor allele counts
assert gts.variants["aaf"][1] == 0.6
Expand All @@ -73,9 +73,9 @@ def test_load_genotypes():
np.testing.assert_allclose(gts.data, expected)
assert gts.variants["maf"][1] == 0.4

# try to do the MAC conversion again - it should fail b/c we've already done it
with pytest.raises(AssertionError):
gts.to_MAC()
# try to do the MAC conversion again - it should warn b/c we've already done it
gts.to_MAC()
assert len(caplog.records) == 3 and caplog.records[2].levelname == "WARNING"


def test_load_genotypes_discard_multiallelic():
Expand Down Expand Up @@ -122,7 +122,7 @@ def test_load_genotypes_subset():
assert gts.samples == tuple(samples)


def test_load_phenotypes():
def test_load_phenotypes(caplog):
# create a phenotype vector with shape: num_samples x 1
expected = np.array([1, 1, 2, 2, 0])

Expand All @@ -132,9 +132,9 @@ def test_load_phenotypes():
np.testing.assert_allclose(phens.data, expected)
assert phens.samples == ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101")

# try loading the data again - it should fail b/c we've already done it
with pytest.raises(AssertionError):
phens.read()
# try loading the data again - it should warn b/c we've already done it
phens.read()
assert len(caplog.records) == 1 and caplog.records[0].levelname == "WARNING"

expected = (expected - np.mean(expected)) / np.std(expected)
phens.standardize()
Expand All @@ -156,7 +156,7 @@ def test_load_phenotypes_subset():
assert phens.samples == tuple(samples)


def test_load_covariates():
def test_load_covariates(caplog):
# create a covariate vector with shape: num_samples x num_covars
expected = np.array([(0, 4), (1, 20), (1, 33), (0, 15), (0, 78)])

Expand All @@ -167,9 +167,9 @@ def test_load_covariates():
assert covars.samples == ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101")
assert covars.names == ("sex", "age")

# try loading the data again - it should fail b/c we've already done it
with pytest.raises(AssertionError):
covars.read()
# try loading the data again - it should warn b/c we've already done it
covars.read()
assert len(caplog.records) == 1 and caplog.records[0].levelname == "WARNING"


def test_load_covariates_subset():
Expand Down

0 comments on commit 18f5a97

Please sign in to comment.