Skip to content

Commit

Permalink
docs: fix example of .blocks.det to .hap conversion in API docs (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
aryarm authored Jan 12, 2024
1 parent 371415c commit 1ed9139
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 14 deletions.
30 changes: 16 additions & 14 deletions docs/api/examples.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,38 +8,35 @@ Converting a ``.blocks.det`` file into a ``.hap`` file
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
You can use the :ref:`data API <api-data>` to easily convert `a PLINK 1.9 .blocks.det file <https://www.cog-genomics.org/plink/1.9/formats#blocks>`_ into a ``.hap`` file.

As an example, let's say we would like to convert the following ``.blocks.det`` file.
As an example, let's say we would like to convert `the following .blocks.det file <https://github.com/cast-genomics/haptools/blob/main/tests/data/simple.blocks.det>`_.

.. code-block::
CHR BP1 BP2 KB NSNPS SNPS
1 2313888 2331789 17.902 3 rs7527871|rs2840528|rs7545940
1 2462779 2482556 19.778 2 rs2296442|rs2246732
1 2867411 2869431 2.021 2 rs10752728|rs897635
1 2974991 2979823 4.833 3 rs10489588|rs9661525|rs2993510
.. include:: ../../tests/data/simple.blocks.det
:literal:

.. code-block:: python
from haptools import data
# load the genotypes file
# you can use either a VCF or PGEN file
gt = data.GenotypesVCF.load("input.vcf.gz")
gt = data.GenotypesPLINK.load("input.pgen")
gt = data.GenotypesVCF.load("tests/data/simple.vcf.gz")
gt = data.GenotypesPLINK.load("tests/data/simple.pgen")
# load the haplotypes
hp = data.Haplotypes("output.hap")
hp.data = {}
# iterate through lines of the .blocks.det file
with open("input.blocks.det") as blocks_file:
for idx, line in enumerate(blocks_file.readlines()):
with open("tests/data/simple.blocks.det") as blocks_file:
for idx, line in enumerate(blocks_file.read().splitlines()[1:]):
# initialize variables and parse line from the blocks file
hap_id = f"H{idx}"
chrom, bp1, bp2, kb, nsnps, snps = line.split("\t")
# create a haplotype line in the .hap file
hp.data[hap_id] = data.Haplotype(chrom=chrom, start=bp1, end=bp2, id=hap_id)
hp.data[hap_id] = data.Haplotype(
chrom=chrom, start=int(bp1), end=int(bp2), id=hap_id
)
# fetch alleles from the genotypes file
snp_gts = gt.subset(variants=tuple(snps.split("|")))
Expand All @@ -48,7 +45,12 @@ As an example, let's say we would like to convert the following ``.blocks.det``
# Note that the .blocks.det file doesn't specify an allele, so
# we simply choose the first allele (ie the REF allele) for this example
hp.data[hap_id].variants = tuple(
data.Variant(start=v["pos"], end=v["pos"]+len(v["alleles"][0]), id=v["id"], allele=v["alleles"][0])
data.Variant(
start=v["pos"],
end=v["pos"] + len(v["alleles"][0]),
id=v["id"],
allele=v["alleles"][0],
)
for v in snp_gts.variants
)
Expand Down
4 changes: 4 additions & 0 deletions tests/data/simple.blocks.det
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
CHR BP1 BP2 KB NSNPS SNPS
1 10114 10117 2.001 2 1:10114:T:C|1:10116:A:G
1 10114 10119 2.007 2 1:10114:T:C|1:10117:C:A
1 10116 10119 2.011 2 1:10116:A:G|1:10117:C:A
10 changes: 10 additions & 0 deletions tests/data/simple.blocks.det.hap
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# version 0.2.0
H 1 10114 10117 H0
H 1 10114 10119 H1
H 1 10116 10119 H2
V H0 10114 10115 1:10114:T:C T
V H0 10116 10117 1:10116:A:G A
V H1 10114 10115 1:10114:T:C T
V H1 10117 10118 1:10117:C:A C
V H2 10116 10117 1:10116:A:G A
V H2 10117 10118 1:10117:C:A C
46 changes: 46 additions & 0 deletions tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2114,6 +2114,52 @@ def test_breakpoints_to_pop_array_chrom_no_match(self):


class TestDocExamples:
def test_blocks2hap(self):
# load the genotypes file
# you can use either a VCF or PGEN file
gt = GenotypesVCF.load(DATADIR / "simple.vcf.gz")
gt = GenotypesPLINK.load(DATADIR / "simple.pgen")

# load the haplotypes
hp = Haplotypes("output.hap")
hp.data = {}

# iterate through lines of the .blocks.det file
with open(DATADIR / "simple.blocks.det") as blocks_file:
for idx, line in enumerate(blocks_file.read().splitlines()[1:]):
# initialize variables and parse line from the blocks file
hap_id = f"H{idx}"
chrom, bp1, bp2, kb, nsnps, snps = line.split("\t")

# create a haplotype line in the .hap file
hp.data[hap_id] = Haplotype(
chrom=chrom, start=int(bp1), end=int(bp2), id=hap_id
)

# fetch alleles from the genotypes file
snp_gts = gt.subset(variants=tuple(snps.split("|")))

# create variant lines for each haplotype
# Note that the .blocks.det file doesn't specify an allele, so
# we simply choose the first allele (ie the REF allele) for this example
hp.data[hap_id].variants = tuple(
Variant(
start=v["pos"],
end=v["pos"] + len(v["alleles"][0]),
id=v["id"],
allele=v["alleles"][0],
)
for v in snp_gts.variants
)

hp.write()

# validate the output and clean up afterwards
with open("output.hap") as hp_file:
with open(DATADIR / "simple.blocks.det.hap") as expected:
assert hp_file.read() == expected.read()
hp.fname.unlink()

def test_gts2hap(self):
# load variants from the snplist file
variants = {}
Expand Down

0 comments on commit 1ed9139

Please sign in to comment.