feat: allow multiallelic variants in transform (#232)

CAST-genomics · Jan 12, 2024 · 371415c · 371415c
1 parent 06cc273
commit 371415c
Show file tree

Hide file tree

Showing 6 changed files with 104 additions and 26 deletions.
diff --git a/docs/commands/transform.rst b/docs/commands/transform.rst
@@ -81,15 +81,6 @@ To match haplotypes as well as their ancestral population labels, use the ``--an
 
 	haptools transform --ancestry tests/data/simple-ancestry.vcf tests/data/simple.hap
 
-.. warning::
-	If your VCF has multi-allelic variants, they must be split into bi-allelic records before you can use ``transform``. After splitting, you should rename the IDs in your file to ensure they remain unique:
-
-	.. code-block:: bash
-
-		bcftools norm -m- -Ou input.vcf.gz | \
-		bcftools annotate -Ov --set-id +'%CHROM\_%POS\_%REF\_%FIRST_ALT' | \
-		haptools transform -o output.vcf.gz /dev/stdin file.hap
-
 All files used in these examples are described :doc:`here </project_info/example_files>`.
 
 

diff --git a/docs/formats/genotypes.rst b/docs/formats/genotypes.rst
@@ -13,6 +13,8 @@ VCF/BCF
 
 Genotype files must be specified as VCF or BCF files. They can be bgzip-compressed.
 
+To be loaded properly, VCFs must follow the VCF specification. VCFs with duplicate variant IDs do not follow the specification; the IDs must be unique. Please validate your VCF using a tool like `gatk ValidateVariants <https://gatk.broadinstitute.org/hc/en-us/articles/360037057272-ValidateVariants>`_ before using haptools.
+
 .. _formats-genotypesplink:
 
 PLINK2 PGEN
@@ -24,11 +26,11 @@ If you run out memory when using PGEN files, consider reading/writing variants f
 
 Converting from VCF to PGEN
 ---------------------------
-To convert a VCF containing only biallelic SNPs to PGEN, use the following command.
+To convert a VCF containing only SNPs to PGEN, use the following command.
 
 .. code-block:: bash
 
-	plink2 --snps-only 'just-acgt' --max-alleles 2 --vcf input.vcf --make-pgen --out output
+	plink2 --snps-only 'just-acgt' --vcf input.vcf --make-pgen --out output
 
 To convert a VCF containing tandem repeats to PGEN, use this command, instead.
 

diff --git a/haptools/data/haplotypes.py b/haptools/data/haplotypes.py
@@ -478,6 +478,7 @@ def transform(self, genotypes: GenotypesVCF) -> npt.NDArray[bool]:
             denotes the presence of the haplotype in one chromosome of a sample
         """
         var_IDs = self.varIDs
+        # ensure the variants in the Genotypes object are ordered according to var_IDs
         gts = genotypes.subset(variants=var_IDs)
         # check: were any of the variants absent from the genotypes?
         if len(gts.variants) < len(var_IDs):
@@ -490,14 +491,17 @@ def transform(self, genotypes: GenotypesVCF) -> npt.NDArray[bool]:
         # note: the excessive use of square-brackets gives us shape (1, p, 1)
         # where p denotes the number of alleles in this haplotype
         # That shape is broadcastable with gts.data which has shape (n, p, 2)
-        allele_arr = np.array(
-            [
+        try:
+            allele_arr = np.array(
                 [
-                    [int(var.allele != gts.variants[i]["alleles"][0])]
-                    for i, var in enumerate(self.variants)
+                    [
+                        [gts.variants[i]["alleles"].index(var.allele)]
+                        for i, var in enumerate(self.variants)
+                    ]
                 ]
-            ]
-        )
+            )
+        except ValueError:
+            raise ValueError("Some alleles were not present in the genotypes")
         # look for the presence of each allele in each chromosomal strand
         # and then just AND them together
         return np.all(allele_arr == gts.data[:, :, :2], axis=1)
@@ -1367,13 +1371,16 @@ def transform(
         self.log.debug(f"Creating array denoting alt allele status")
         # initialize a np array denoting the allele integer in each haplotype
         # with shape (1, gts.data.shape[1], 1) for broadcasting later
-        allele_arr = np.array(
-            [
-                int(allele != gts.variants[i]["alleles"][0])
-                for i, (vID, allele) in enumerate(alleles)
-            ],
-            dtype=gts.data.dtype,
-        )[np.newaxis, :, np.newaxis]
+        try:
+            allele_arr = np.array(
+                [
+                    gts.variants[i]["alleles"].index(allele)
+                    for i, (vID, allele) in enumerate(alleles)
+                ],
+                dtype=gts.data.dtype,
+            )[np.newaxis, :, np.newaxis]
+        except ValueError:
+            raise ValueError("Some alleles were not present in the genotypes")
         # finally, obtain and merge the haplotype genotypes
         self.log.info(f"Transforming genotypes for {len(haps)} haplotypes")
         equality_arr = np.equal(allele_arr, gts.data[:, :, :2])

diff --git a/haptools/transform.py b/haptools/transform.py
@@ -615,7 +615,6 @@ def transform_haps(
     # gt._prephased = True
     gt.read(region=region, samples=samples, variants=variants)
     gt.check_missing(discard_also=discard_missing)
-    gt.check_biallelic()
     gt.check_phase()
 
     # check that all of the variants were loaded successfully and warn otherwise

diff --git a/tests/test_data.py b/tests/test_data.py
@@ -1172,6 +1172,16 @@ def _get_dummy_haps(self):
         haps.data = haplotypes
         return haps
 
+    def _get_dummy_haps_multiallelic(self):
+        haps = self._get_dummy_haps()
+        h1_vars = list(haps.data["H1"].variants)
+        h1_vars[1] = Variant(start=10116, end=10117, id="1:10116:A:G", allele="T")
+        haps.data["H1"].variants = tuple(h1_vars)
+        h3_vars = list(haps.data["H3"].variants)
+        h3_vars[1] = Variant(start=10122, end=10123, id="1:10122:A:G", allele="C")
+        haps.data["H3"].variants = tuple(h3_vars)
+        return haps
+
     def test_load(self):
         expected = self._basic_haps()
 
@@ -1553,6 +1563,23 @@ def test_hap_transform(self):
         hap_gt = hap.transform(gens)
         np.testing.assert_allclose(hap_gt, expected)
 
+    def test_hap_transform_multiallelic(self):
+        expected = np.array(
+            [
+                [0, 0],
+                [0, 0],
+                [1, 0],
+                [0, 0],
+                [0, 1],
+            ],
+            dtype=np.uint8,
+        )
+
+        hap = list(self._get_dummy_haps_multiallelic().data.values())[0]
+        gens = TestGenotypesVCF()._get_fake_genotypes_multiallelic()
+        hap_gt = hap.transform(gens)
+        np.testing.assert_allclose(hap_gt, expected)
+
     def test_haps_transform(self, return_also=False):
         expected = np.array(
             [
@@ -1576,6 +1603,30 @@ def test_haps_transform(self, return_also=False):
         if return_also:
             return hap_gt
 
+    def test_haps_transform_multiallelic(self, return_also=False):
+        expected = np.array(
+            [
+                [[0, 0], [0, 0], [0, 0]],
+                [[0, 0], [0, 0], [1, 0]],
+                [[1, 0], [0, 1], [0, 0]],
+                [[0, 0], [0, 0], [0, 0]],
+                [[0, 0], [0, 1], [1, 0]],
+            ],
+            dtype=np.uint8,
+        )
+
+        haps = self._get_dummy_haps_multiallelic()
+        gens = TestGenotypesVCF()._get_fake_genotypes_multiallelic()
+        gens.data[[2, 4], 0, 1] = 1
+        gens.data[[1, 4], 2, 0] = 1
+        gens.data[[1, 4], 3, 0] = 2
+        hap_gt = GenotypesVCF(fname=None)
+        haps.transform(gens, hap_gt)
+        np.testing.assert_allclose(hap_gt.data, expected)
+
+        if return_also:
+            return hap_gt
+
     def test_hap_gt_write(self):
         fname = DATADIR / "simple_haps.vcf"
 

diff --git a/tests/test_transform.py b/tests/test_transform.py
@@ -12,7 +12,7 @@
     GenotypesAncestry,
 )
 from haptools.__main__ import main
-from .test_data import TestGenotypesVCF
+from .test_data import TestGenotypesVCF, TestHaplotypes
 from haptools.data import Variant, GenotypesVCF, GenotypesPLINK
 
 
@@ -234,6 +234,34 @@ def test_basic(capfd):
     assert result.exit_code == 0
 
 
+def test_basic_multiallelic(capfd):
+    expected = """##fileformat=VCFv4.2
+##FILTER=<ID=PASS,Description="All filters passed">
+##contig=<ID=1>
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tHG00096\tHG00097\tHG00099\tHG00100\tHG00101
+1\t10114\tH1\tA\tT\t.\t.\t.\tGT\t0|0\t0|0\t1|0\t0|0\t0|1
+1\t10114\tH2\tA\tT\t.\t.\t.\tGT\t0|0\t0|0\t0|0\t0|0\t0|0
+1\t10116\tH3\tA\tT\t.\t.\t.\tGT\t0|0\t0|0\t0|0\t0|0\t0|0
+"""
+    gt_file = DATADIR / "simple-multiallelic.vcf"
+    hp_file = DATADIR / "simple-multiallelic.hap"
+
+    # first, create the multiallelic hap file
+    hp = TestHaplotypes()._get_dummy_haps_multiallelic()
+    hp.fname = hp_file
+    hp.write()
+
+    cmd = f"transform {gt_file} {hp_file}"
+    runner = CliRunner()
+    result = runner.invoke(main, cmd.split(" "), catch_exceptions=False)
+    captured = capfd.readouterr()
+    assert captured.out == expected
+    assert result.exit_code == 0
+
+    hp.fname.unlink()
+
+
 def test_basic_subset(capfd):
     expected = """##fileformat=VCFv4.2
 ##FILTER=<ID=PASS,Description="All filters passed">