Skip to content

Commit

Permalink
feat: allow multiallelic variants in transform (#232)
Browse files Browse the repository at this point in the history
  • Loading branch information
aryarm authored Jan 12, 2024
1 parent 06cc273 commit 371415c
Show file tree
Hide file tree
Showing 6 changed files with 104 additions and 26 deletions.
9 changes: 0 additions & 9 deletions docs/commands/transform.rst
Original file line number Diff line number Diff line change
Expand Up @@ -81,15 +81,6 @@ To match haplotypes as well as their ancestral population labels, use the ``--an
haptools transform --ancestry tests/data/simple-ancestry.vcf tests/data/simple.hap
.. warning::
If your VCF has multi-allelic variants, they must be split into bi-allelic records before you can use ``transform``. After splitting, you should rename the IDs in your file to ensure they remain unique:

.. code-block:: bash
bcftools norm -m- -Ou input.vcf.gz | \
bcftools annotate -Ov --set-id +'%CHROM\_%POS\_%REF\_%FIRST_ALT' | \
haptools transform -o output.vcf.gz /dev/stdin file.hap
All files used in these examples are described :doc:`here </project_info/example_files>`.


Expand Down
6 changes: 4 additions & 2 deletions docs/formats/genotypes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ VCF/BCF

Genotype files must be specified as VCF or BCF files. They can be bgzip-compressed.

To be loaded properly, VCFs must follow the VCF specification. VCFs with duplicate variant IDs do not follow the specification; the IDs must be unique. Please validate your VCF using a tool like `gatk ValidateVariants <https://gatk.broadinstitute.org/hc/en-us/articles/360037057272-ValidateVariants>`_ before using haptools.

.. _formats-genotypesplink:

PLINK2 PGEN
Expand All @@ -24,11 +26,11 @@ If you run out memory when using PGEN files, consider reading/writing variants f

Converting from VCF to PGEN
---------------------------
To convert a VCF containing only biallelic SNPs to PGEN, use the following command.
To convert a VCF containing only SNPs to PGEN, use the following command.

.. code-block:: bash
plink2 --snps-only 'just-acgt' --max-alleles 2 --vcf input.vcf --make-pgen --out output
plink2 --snps-only 'just-acgt' --vcf input.vcf --make-pgen --out output
To convert a VCF containing tandem repeats to PGEN, use this command, instead.

Expand Down
33 changes: 20 additions & 13 deletions haptools/data/haplotypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,6 +478,7 @@ def transform(self, genotypes: GenotypesVCF) -> npt.NDArray[bool]:
denotes the presence of the haplotype in one chromosome of a sample
"""
var_IDs = self.varIDs
# ensure the variants in the Genotypes object are ordered according to var_IDs
gts = genotypes.subset(variants=var_IDs)
# check: were any of the variants absent from the genotypes?
if len(gts.variants) < len(var_IDs):
Expand All @@ -490,14 +491,17 @@ def transform(self, genotypes: GenotypesVCF) -> npt.NDArray[bool]:
# note: the excessive use of square-brackets gives us shape (1, p, 1)
# where p denotes the number of alleles in this haplotype
# That shape is broadcastable with gts.data which has shape (n, p, 2)
allele_arr = np.array(
[
try:
allele_arr = np.array(
[
[int(var.allele != gts.variants[i]["alleles"][0])]
for i, var in enumerate(self.variants)
[
[gts.variants[i]["alleles"].index(var.allele)]
for i, var in enumerate(self.variants)
]
]
]
)
)
except ValueError:
raise ValueError("Some alleles were not present in the genotypes")
# look for the presence of each allele in each chromosomal strand
# and then just AND them together
return np.all(allele_arr == gts.data[:, :, :2], axis=1)
Expand Down Expand Up @@ -1367,13 +1371,16 @@ def transform(
self.log.debug(f"Creating array denoting alt allele status")
# initialize a np array denoting the allele integer in each haplotype
# with shape (1, gts.data.shape[1], 1) for broadcasting later
allele_arr = np.array(
[
int(allele != gts.variants[i]["alleles"][0])
for i, (vID, allele) in enumerate(alleles)
],
dtype=gts.data.dtype,
)[np.newaxis, :, np.newaxis]
try:
allele_arr = np.array(
[
gts.variants[i]["alleles"].index(allele)
for i, (vID, allele) in enumerate(alleles)
],
dtype=gts.data.dtype,
)[np.newaxis, :, np.newaxis]
except ValueError:
raise ValueError("Some alleles were not present in the genotypes")
# finally, obtain and merge the haplotype genotypes
self.log.info(f"Transforming genotypes for {len(haps)} haplotypes")
equality_arr = np.equal(allele_arr, gts.data[:, :, :2])
Expand Down
1 change: 0 additions & 1 deletion haptools/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -615,7 +615,6 @@ def transform_haps(
# gt._prephased = True
gt.read(region=region, samples=samples, variants=variants)
gt.check_missing(discard_also=discard_missing)
gt.check_biallelic()
gt.check_phase()

# check that all of the variants were loaded successfully and warn otherwise
Expand Down
51 changes: 51 additions & 0 deletions tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -1172,6 +1172,16 @@ def _get_dummy_haps(self):
haps.data = haplotypes
return haps

def _get_dummy_haps_multiallelic(self):
haps = self._get_dummy_haps()
h1_vars = list(haps.data["H1"].variants)
h1_vars[1] = Variant(start=10116, end=10117, id="1:10116:A:G", allele="T")
haps.data["H1"].variants = tuple(h1_vars)
h3_vars = list(haps.data["H3"].variants)
h3_vars[1] = Variant(start=10122, end=10123, id="1:10122:A:G", allele="C")
haps.data["H3"].variants = tuple(h3_vars)
return haps

def test_load(self):
expected = self._basic_haps()

Expand Down Expand Up @@ -1553,6 +1563,23 @@ def test_hap_transform(self):
hap_gt = hap.transform(gens)
np.testing.assert_allclose(hap_gt, expected)

def test_hap_transform_multiallelic(self):
expected = np.array(
[
[0, 0],
[0, 0],
[1, 0],
[0, 0],
[0, 1],
],
dtype=np.uint8,
)

hap = list(self._get_dummy_haps_multiallelic().data.values())[0]
gens = TestGenotypesVCF()._get_fake_genotypes_multiallelic()
hap_gt = hap.transform(gens)
np.testing.assert_allclose(hap_gt, expected)

def test_haps_transform(self, return_also=False):
expected = np.array(
[
Expand All @@ -1576,6 +1603,30 @@ def test_haps_transform(self, return_also=False):
if return_also:
return hap_gt

def test_haps_transform_multiallelic(self, return_also=False):
expected = np.array(
[
[[0, 0], [0, 0], [0, 0]],
[[0, 0], [0, 0], [1, 0]],
[[1, 0], [0, 1], [0, 0]],
[[0, 0], [0, 0], [0, 0]],
[[0, 0], [0, 1], [1, 0]],
],
dtype=np.uint8,
)

haps = self._get_dummy_haps_multiallelic()
gens = TestGenotypesVCF()._get_fake_genotypes_multiallelic()
gens.data[[2, 4], 0, 1] = 1
gens.data[[1, 4], 2, 0] = 1
gens.data[[1, 4], 3, 0] = 2
hap_gt = GenotypesVCF(fname=None)
haps.transform(gens, hap_gt)
np.testing.assert_allclose(hap_gt.data, expected)

if return_also:
return hap_gt

def test_hap_gt_write(self):
fname = DATADIR / "simple_haps.vcf"

Expand Down
30 changes: 29 additions & 1 deletion tests/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
GenotypesAncestry,
)
from haptools.__main__ import main
from .test_data import TestGenotypesVCF
from .test_data import TestGenotypesVCF, TestHaplotypes
from haptools.data import Variant, GenotypesVCF, GenotypesPLINK


Expand Down Expand Up @@ -234,6 +234,34 @@ def test_basic(capfd):
assert result.exit_code == 0


def test_basic_multiallelic(capfd):
expected = """##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##contig=<ID=1>
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tHG00096\tHG00097\tHG00099\tHG00100\tHG00101
1\t10114\tH1\tA\tT\t.\t.\t.\tGT\t0|0\t0|0\t1|0\t0|0\t0|1
1\t10114\tH2\tA\tT\t.\t.\t.\tGT\t0|0\t0|0\t0|0\t0|0\t0|0
1\t10116\tH3\tA\tT\t.\t.\t.\tGT\t0|0\t0|0\t0|0\t0|0\t0|0
"""
gt_file = DATADIR / "simple-multiallelic.vcf"
hp_file = DATADIR / "simple-multiallelic.hap"

# first, create the multiallelic hap file
hp = TestHaplotypes()._get_dummy_haps_multiallelic()
hp.fname = hp_file
hp.write()

cmd = f"transform {gt_file} {hp_file}"
runner = CliRunner()
result = runner.invoke(main, cmd.split(" "), catch_exceptions=False)
captured = capfd.readouterr()
assert captured.out == expected
assert result.exit_code == 0

hp.fname.unlink()


def test_basic_subset(capfd):
expected = """##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
Expand Down

0 comments on commit 371415c

Please sign in to comment.