From 18f5a970f73f89c9dfa1505a260eb2c746a36110 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Wed, 13 Apr 2022 08:16:33 -0700
Subject: [PATCH] use logging instead of assertions in data module (see #19)

---
 haptools/data/covariates.py |  6 ++++--
 haptools/data/data.py       |  8 ++++++--
 haptools/data/genotypes.py  | 25 ++++++++++++++++---------
 haptools/data/phenotypes.py |  6 ++++--
 pyproject.toml              |  5 +++++
 tests/test_data.py          | 36 ++++++++++++++++++------------------
 6 files changed, 53 insertions(+), 33 deletions(-)

diff --git a/haptools/data/covariates.py b/haptools/data/covariates.py
index 74f78d14..88b1c38b 100644
--- a/haptools/data/covariates.py
+++ b/haptools/data/covariates.py
@@ -20,14 +20,16 @@ class Covariates(Data):
         The path to the read-only file containing the data
     samples : tuple[str]
         The names of each of the n samples
+    log : Logger
+        A logging instance for recording warnings and debug statements.
 
     Examples
     --------
     >>> covariates = Covariates.load('tests/data/covars.tsv')
     """
 
-    def __init__(self, fname: Path):
-        super().__init__(fname)
+    def __init__(self, fname: Path, log: Logger = None):
+        super().__init__(fname, log)
         self.samples = tuple()
         self.names = tuple()
 
diff --git a/haptools/data/data.py b/haptools/data/data.py
index 41ce17d0..8daba0b5 100644
--- a/haptools/data/data.py
+++ b/haptools/data/data.py
@@ -2,6 +2,7 @@
 from csv import reader
 from pathlib import Path
 from abc import ABC, abstractmethod
+from logging import getLogger, Logger
 
 import numpy as np
 
@@ -16,11 +17,14 @@ class Data(ABC):
         The path to the read-only file containing the data
     data : np.array
         The contents of the data file, once loaded
+    log : Logger
+        A logging instance for recording warnings and debug statements.
     """
 
-    def __init__(self, fname: Path):
+    def __init__(self, fname: Path, log: Logger = None):
         self.fname = fname
         self.data = None
+        self.log = log or getLogger(self.__class__.__name__)
         super().__init__()
 
     def __repr__(self):
@@ -45,4 +49,4 @@ def read(self):
         Read the raw file contents into the class properties
         """
         if self.data is not None:
-            raise AssertionError("The data has already been loaded.")
+            self.log.warning("The data has already been loaded. Overriding.")
diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
index 890f65da..44f0b9d3 100644
--- a/haptools/data/genotypes.py
+++ b/haptools/data/genotypes.py
@@ -25,14 +25,16 @@ class Genotypes(Data):
             2. CHROM
             3. POS
             4. AAF: allele freq of alternate allele (or MAF if to_MAC() is called)
+    log : Logger
+        A logging instance for recording warnings and debug statements.
 
     Examples
     --------
     >>> genotypes = Genotypes.load('tests/data/simple.vcf')
     """
 
-    def __init__(self, fname: Path):
-        super().__init__(fname)
+    def __init__(self, fname: Path, log: Logger = None):
+        super().__init__(fname, log)
         self.samples = tuple()
         self.variants = np.array([])
 
@@ -116,7 +118,7 @@ def read(self, region: str = None, samples: list[str] = None):
         )
         self.data = np.array(self.data, dtype=np.uint8)
         if self.data.shape == (0, 0, 0):
-            raise ValueError(
+            self.log.warning(
                 "Failed to load genotypes. If you specified a region, check that the"
                 " contig name matches! For example, double-check the 'chr' prefix."
             )
@@ -143,7 +145,8 @@ def check_biallelic(self, discard_also=False):
             If True, discard any multiallelic variants without raising a ValueError
         """
         if self.data.dtype == np.bool_:
-            raise AssertionError("All genotypes are already biallelic")
+            self.log.warning("All genotypes are already biallelic")
+            return
         # check: are there any variants that have genotype values above 1?
         # A genotype value above 1 would imply the variant has more than one ALT allele
         multiallelic = np.any(self.data[:, :, :2] > 1, axis=2)
@@ -176,9 +179,10 @@ def check_phase(self):
             If any heterozgyous genotpyes are unphased
         """
         if self.data.shape[2] < 3:
-            raise AssertionError(
+            self.log.warning(
                 "Phase information has already been removed from the data"
             )
+            return
         # check: are there any variants that are heterozygous and unphased?
         unphased = (self.data[:, :, 0] ^ self.data[:, :, 1]) & (~self.data[:, :, 2])
         if np.any(unphased):
@@ -205,10 +209,11 @@ def to_MAC(self):
             If the matrix has already been converted
         """
         if self.variants.dtype.names[3] == "maf":
-            raise AssertionError(
-                "The matrix already counts instances of the minor allele rather than"
+            self.log.warning(
+                "The matrix already counts instances of the minor allele rather than "
                 "the ALT allele."
             )
+            return
         need_conversion = self.variants["aaf"] > 0.5
         # flip the count on the variants that have an alternate allele frequency
         # above 0.5
@@ -239,14 +244,16 @@ class GenotypesPLINK(Data):
             2. CHROM
             3. POS
             4. AAF: allele freq of alternate allele (or MAF if to_MAC() is called)
+    log : Logger
+        A logging instance for recording warnings and debug statements.
 
     Examples
     --------
     >>> genotypes = Genotypes.load('tests/data/simple.pgen')
     """
 
-    def __init__(self, fname: Path):
-        super().__init__(fname)
+    def __init__(self, fname: Path, log: Logger = None):
+        super().__init__(fname, log)
         self.samples = tuple()
         self.variants = np.array([])
 
diff --git a/haptools/data/phenotypes.py b/haptools/data/phenotypes.py
index 3c09d8a0..54779965 100644
--- a/haptools/data/phenotypes.py
+++ b/haptools/data/phenotypes.py
@@ -20,14 +20,16 @@ class Phenotypes(Data):
         The path to the read-only file containing the data
     samples : tuple
         The names of each of the n samples
+    log : Logger
+        A logging instance for recording warnings and debug statements.
 
     Examples
     --------
     >>> phenotypes = Phenotypes.load('tests/data/simple.tsv')
     """
 
-    def __init__(self, fname: Path):
-        super().__init__(fname)
+    def __init__(self, fname: Path, log: Logger = None):
+        super().__init__(fname, log)
         self.samples = tuple()
 
     @classmethod
diff --git a/pyproject.toml b/pyproject.toml
index 19ca0bf9..fdc9e088 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -60,6 +60,11 @@ haptools = 'haptools.__main__:main'
 line-length = 88
 preview = true
 
+[tool.pytest.ini_options]
+log_cli_level = "DEBUG"
+log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)"
+log_cli_date_format = "%Y-%m-%d %H:%M:%S"
+
 [build-system]
 requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
diff --git a/tests/test_data.py b/tests/test_data.py
index d9c806e3..ecf472e6 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -18,7 +18,7 @@ def get_expected_genotypes():
     return expected
 
 
-def test_load_genotypes():
+def test_load_genotypes(caplog):
     expected = get_expected_genotypes()
 
     # can we load the data from the VCF?
@@ -27,9 +27,9 @@ def test_load_genotypes():
     np.testing.assert_allclose(gts.data, expected)
     assert gts.samples == ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101")
 
-    # try loading the data again - it should fail b/c we've already done it
-    with pytest.raises(AssertionError):
-        gts.read()
+    # try loading the data again - it should warn b/c we've already done it
+    gts.read()
+    assert len(caplog.records) == 1 and caplog.records[0].levelname == "WARNING"
 
     # force one of the SNPs to have more than one allele and check that we get an error
     gts.data[1, 1, 1] = 2
@@ -62,9 +62,9 @@ def test_load_genotypes():
     expected = expected[:, :, :2]
     np.testing.assert_allclose(gts.data, expected)
 
-    # try to check phase again - it should fail b/c we've already done it before
-    with pytest.raises(AssertionError):
-        gts.check_phase()
+    # try to check phase again - it should warn b/c we've already done it before
+    gts.check_phase()
+    assert len(caplog.records) == 2 and caplog.records[1].levelname == "WARNING"
 
     # convert the matrix of alt allele counts to a matrix of minor allele counts
     assert gts.variants["aaf"][1] == 0.6
@@ -73,9 +73,9 @@ def test_load_genotypes():
     np.testing.assert_allclose(gts.data, expected)
     assert gts.variants["maf"][1] == 0.4
 
-    # try to do the MAC conversion again - it should fail b/c we've already done it
-    with pytest.raises(AssertionError):
-        gts.to_MAC()
+    # try to do the MAC conversion again - it should warn b/c we've already done it
+    gts.to_MAC()
+    assert len(caplog.records) == 3 and caplog.records[2].levelname == "WARNING"
 
 
 def test_load_genotypes_discard_multiallelic():
@@ -122,7 +122,7 @@ def test_load_genotypes_subset():
     assert gts.samples == tuple(samples)
 
 
-def test_load_phenotypes():
+def test_load_phenotypes(caplog):
     # create a phenotype vector with shape: num_samples x 1
     expected = np.array([1, 1, 2, 2, 0])
 
@@ -132,9 +132,9 @@ def test_load_phenotypes():
     np.testing.assert_allclose(phens.data, expected)
     assert phens.samples == ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101")
 
-    # try loading the data again - it should fail b/c we've already done it
-    with pytest.raises(AssertionError):
-        phens.read()
+    # try loading the data again - it should warn b/c we've already done it
+    phens.read()
+    assert len(caplog.records) == 1 and caplog.records[0].levelname == "WARNING"
 
     expected = (expected - np.mean(expected)) / np.std(expected)
     phens.standardize()
@@ -156,7 +156,7 @@ def test_load_phenotypes_subset():
     assert phens.samples == tuple(samples)
 
 
-def test_load_covariates():
+def test_load_covariates(caplog):
     # create a covariate vector with shape: num_samples x num_covars
     expected = np.array([(0, 4), (1, 20), (1, 33), (0, 15), (0, 78)])
 
@@ -167,9 +167,9 @@ def test_load_covariates():
     assert covars.samples == ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101")
     assert covars.names == ("sex", "age")
 
-    # try loading the data again - it should fail b/c we've already done it
-    with pytest.raises(AssertionError):
-        covars.read()
+    # try loading the data again - it should warn b/c we've already done it
+    covars.read()
+    assert len(caplog.records) == 1 and caplog.records[0].levelname == "WARNING"
 
 
 def test_load_covariates_subset():