From 18f5a970f73f89c9dfa1505a260eb2c746a36110 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 13 Apr 2022 08:16:33 -0700 Subject: [PATCH] use logging instead of assertions in data module (see #19) --- haptools/data/covariates.py | 6 ++++-- haptools/data/data.py | 8 ++++++-- haptools/data/genotypes.py | 25 ++++++++++++++++--------- haptools/data/phenotypes.py | 6 ++++-- pyproject.toml | 5 +++++ tests/test_data.py | 36 ++++++++++++++++++------------------ 6 files changed, 53 insertions(+), 33 deletions(-) diff --git a/haptools/data/covariates.py b/haptools/data/covariates.py index 74f78d14..88b1c38b 100644 --- a/haptools/data/covariates.py +++ b/haptools/data/covariates.py @@ -20,14 +20,16 @@ class Covariates(Data): The path to the read-only file containing the data samples : tuple[str] The names of each of the n samples + log: Logger + A logging instance for recording debug statements. Examples -------- >>> covariates = Covariates.load('tests/data/covars.tsv') """ - def __init__(self, fname: Path): - super().__init__(fname) + def __init__(self, fname: Path, log: Logger = None): + super().__init__(fname, log) self.samples = tuple() self.names = tuple() diff --git a/haptools/data/data.py b/haptools/data/data.py index 41ce17d0..8daba0b5 100644 --- a/haptools/data/data.py +++ b/haptools/data/data.py @@ -2,6 +2,7 @@ from csv import reader from pathlib import Path from abc import ABC, abstractmethod +from logging import getLogger, Logger import numpy as np @@ -16,11 +17,14 @@ class Data(ABC): The path to the read-only file containing the data data : np.array The contents of the data file, once loaded + log: Logger + A logging instance for recording debug statements. """ - def __init__(self, fname: Path): + def __init__(self, fname: Path, log: Logger = None): self.fname = fname self.data = None + self.log = log or getLogger(self.__class__.__name__) super().__init__() def __repr__(self): @@ -45,4 +49,4 @@ def read(self): Read the raw file contents into the class properties """ if self.data is not None: - raise AssertionError("The data has already been loaded.") + self.log.warning("The data has already been loaded. Overriding.") diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index 890f65da..44f0b9d3 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -25,14 +25,16 @@ class Genotypes(Data): 2. CHROM 3. POS 4. AAF: allele freq of alternate allele (or MAF if to_MAC() is called) + log: Logger + A logging instance for recording debug statements. Examples -------- >>> genotypes = Genotypes.load('tests/data/simple.vcf') """ - def __init__(self, fname: Path): - super().__init__(fname) + def __init__(self, fname: Path, log: Logger = None): + super().__init__(fname, log) self.samples = tuple() self.variants = np.array([]) @@ -116,7 +118,7 @@ def read(self, region: str = None, samples: list[str] = None): ) self.data = np.array(self.data, dtype=np.uint8) if self.data.shape == (0, 0, 0): - raise ValueError( + self.log.warning( "Failed to load genotypes. If you specified a region, check that the" " contig name matches! For example, double-check the 'chr' prefix." ) @@ -143,7 +145,8 @@ def check_biallelic(self, discard_also=False): If True, discard any multiallelic variants without raising a ValueError """ if self.data.dtype == np.bool_: - raise AssertionError("All genotypes are already biallelic") + self.log.warning("All genotypes are already biallelic") + return # check: are there any variants that have genotype values above 1? # A genotype value above 1 would imply the variant has more than one ALT allele multiallelic = np.any(self.data[:, :, :2] > 1, axis=2) @@ -176,9 +179,10 @@ def check_phase(self): If any heterozgyous genotpyes are unphased """ if self.data.shape[2] < 3: - raise AssertionError( + self.log.warning( "Phase information has already been removed from the data" ) + return # check: are there any variants that are heterozygous and unphased? unphased = (self.data[:, :, 0] ^ self.data[:, :, 1]) & (~self.data[:, :, 2]) if np.any(unphased): @@ -205,10 +209,11 @@ def to_MAC(self): If the matrix has already been converted """ if self.variants.dtype.names[3] == "maf": - raise AssertionError( - "The matrix already counts instances of the minor allele rather than" + self.log.warning( + "The matrix already counts instances of the minor allele rather than " "the ALT allele." ) + return need_conversion = self.variants["aaf"] > 0.5 # flip the count on the variants that have an alternate allele frequency # above 0.5 @@ -239,14 +244,16 @@ class GenotypesPLINK(Data): 2. CHROM 3. POS 4. AAF: allele freq of alternate allele (or MAF if to_MAC() is called) + log: Logger + A logging instance for recording debug statements. Examples -------- >>> genotypes = Genotypes.load('tests/data/simple.pgen') """ - def __init__(self, fname: Path): - super().__init__(fname) + def __init__(self, fname: Path, log: Logger = None): + super().__init__(fname, log) self.samples = tuple() self.variants = np.array([]) diff --git a/haptools/data/phenotypes.py b/haptools/data/phenotypes.py index 3c09d8a0..54779965 100644 --- a/haptools/data/phenotypes.py +++ b/haptools/data/phenotypes.py @@ -20,14 +20,16 @@ class Phenotypes(Data): The path to the read-only file containing the data samples : tuple The names of each of the n samples + log: Logger + A logging instance for recording debug statements. Examples -------- >>> phenotypes = Phenotypes.load('tests/data/simple.tsv') """ - def __init__(self, fname: Path): - super().__init__(fname) + def __init__(self, fname: Path, log: Logger = None): + super().__init__(fname, log) self.samples = tuple() @classmethod diff --git a/pyproject.toml b/pyproject.toml index 19ca0bf9..fdc9e088 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,6 +60,11 @@ haptools = 'haptools.__main__:main' line-length = 88 preview = true +[tool.pytest.ini_options] +log_cli_level = "DEBUG" +log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)" +log_cli_date_format = "%Y-%m-%d %H:%M:%S" + [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" diff --git a/tests/test_data.py b/tests/test_data.py index d9c806e3..ecf472e6 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -18,7 +18,7 @@ def get_expected_genotypes(): return expected -def test_load_genotypes(): +def test_load_genotypes(caplog): expected = get_expected_genotypes() # can we load the data from the VCF? @@ -27,9 +27,9 @@ def test_load_genotypes(): np.testing.assert_allclose(gts.data, expected) assert gts.samples == ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101") - # try loading the data again - it should fail b/c we've already done it - with pytest.raises(AssertionError): - gts.read() + # try loading the data again - it should warn b/c we've already done it + gts.read() + assert len(caplog.records) == 1 and caplog.records[0].levelname == "WARNING" # force one of the SNPs to have more than one allele and check that we get an error gts.data[1, 1, 1] = 2 @@ -62,9 +62,9 @@ def test_load_genotypes(): expected = expected[:, :, :2] np.testing.assert_allclose(gts.data, expected) - # try to check phase again - it should fail b/c we've already done it before - with pytest.raises(AssertionError): - gts.check_phase() + # try to check phase again - it should warn b/c we've already done it before + gts.check_phase() + assert len(caplog.records) == 2 and caplog.records[1].levelname == "WARNING" # convert the matrix of alt allele counts to a matrix of minor allele counts assert gts.variants["aaf"][1] == 0.6 @@ -73,9 +73,9 @@ def test_load_genotypes(): np.testing.assert_allclose(gts.data, expected) assert gts.variants["maf"][1] == 0.4 - # try to do the MAC conversion again - it should fail b/c we've already done it - with pytest.raises(AssertionError): - gts.to_MAC() + # try to do the MAC conversion again - it should warn b/c we've already done it + gts.to_MAC() + assert len(caplog.records) == 3 and caplog.records[2].levelname == "WARNING" def test_load_genotypes_discard_multiallelic(): @@ -122,7 +122,7 @@ def test_load_genotypes_subset(): assert gts.samples == tuple(samples) -def test_load_phenotypes(): +def test_load_phenotypes(caplog): # create a phenotype vector with shape: num_samples x 1 expected = np.array([1, 1, 2, 2, 0]) @@ -132,9 +132,9 @@ def test_load_phenotypes(): np.testing.assert_allclose(phens.data, expected) assert phens.samples == ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101") - # try loading the data again - it should fail b/c we've already done it - with pytest.raises(AssertionError): - phens.read() + # try loading the data again - it should warn b/c we've already done it + phens.read() + assert len(caplog.records) == 1 and caplog.records[0].levelname == "WARNING" expected = (expected - np.mean(expected)) / np.std(expected) phens.standardize() @@ -156,7 +156,7 @@ def test_load_phenotypes_subset(): assert phens.samples == tuple(samples) -def test_load_covariates(): +def test_load_covariates(caplog): # create a covariate vector with shape: num_samples x num_covars expected = np.array([(0, 4), (1, 20), (1, 33), (0, 15), (0, 78)]) @@ -167,9 +167,9 @@ def test_load_covariates(): assert covars.samples == ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101") assert covars.names == ("sex", "age") - # try loading the data again - it should fail b/c we've already done it - with pytest.raises(AssertionError): - covars.read() + # try loading the data again - it should warn b/c we've already done it + covars.read() + assert len(caplog.records) == 1 and caplog.records[0].levelname == "WARNING" def test_load_covariates_subset():