docs: fix example of .blocks.det to .hap conversion in API docs (#…

…236)
CAST-genomics · Jan 12, 2024 · 1ed9139 · 1ed9139
1 parent 371415c
commit 1ed9139
Show file tree

Hide file tree

Showing 4 changed files with 76 additions and 14 deletions.
diff --git a/docs/api/examples.rst b/docs/api/examples.rst
@@ -8,38 +8,35 @@ Converting a ``.blocks.det`` file into a ``.hap`` file
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 You can use the :ref:`data API <api-data>` to easily convert `a PLINK 1.9 .blocks.det file <https://www.cog-genomics.org/plink/1.9/formats#blocks>`_ into a ``.hap`` file.
 
-As an example, let's say we would like to convert the following ``.blocks.det`` file.
+As an example, let's say we would like to convert `the following .blocks.det file <https://github.com/cast-genomics/haptools/blob/main/tests/data/simple.blocks.det>`_.
 
-.. code-block::
-
-  CHR          BP1          BP2           KB  NSNPS SNPS
-   1      2313888      2331789       17.902      3 rs7527871|rs2840528|rs7545940
-   1      2462779      2482556       19.778      2 rs2296442|rs2246732
-   1      2867411      2869431        2.021      2 rs10752728|rs897635
-   1      2974991      2979823        4.833      3 rs10489588|rs9661525|rs2993510
+.. include:: ../../tests/data/simple.blocks.det
+  :literal:
 
 .. code-block:: python
 
     from haptools import data
 
     # load the genotypes file
     # you can use either a VCF or PGEN file
-    gt = data.GenotypesVCF.load("input.vcf.gz")
-    gt = data.GenotypesPLINK.load("input.pgen")
+    gt = data.GenotypesVCF.load("tests/data/simple.vcf.gz")
+    gt = data.GenotypesPLINK.load("tests/data/simple.pgen")
 
     # load the haplotypes
     hp = data.Haplotypes("output.hap")
     hp.data = {}
 
     # iterate through lines of the .blocks.det file
-    with open("input.blocks.det") as blocks_file:
-        for idx, line in enumerate(blocks_file.readlines()):
+    with open("tests/data/simple.blocks.det") as blocks_file:
+        for idx, line in enumerate(blocks_file.read().splitlines()[1:]):
             # initialize variables and parse line from the blocks file
             hap_id = f"H{idx}"
             chrom, bp1, bp2, kb, nsnps, snps = line.split("\t")
 
             # create a haplotype line in the .hap file
-            hp.data[hap_id] = data.Haplotype(chrom=chrom, start=bp1, end=bp2, id=hap_id)
+            hp.data[hap_id] = data.Haplotype(
+                chrom=chrom, start=int(bp1), end=int(bp2), id=hap_id
+            )
 
             # fetch alleles from the genotypes file
             snp_gts = gt.subset(variants=tuple(snps.split("|")))
@@ -48,7 +45,12 @@ As an example, let's say we would like to convert the following ``.blocks.det``
             # Note that the .blocks.det file doesn't specify an allele, so
             # we simply choose the first allele (ie the REF allele) for this example
             hp.data[hap_id].variants = tuple(
-                data.Variant(start=v["pos"], end=v["pos"]+len(v["alleles"][0]), id=v["id"], allele=v["alleles"][0])
+                data.Variant(
+                    start=v["pos"],
+                    end=v["pos"] + len(v["alleles"][0]),
+                    id=v["id"],
+                    allele=v["alleles"][0],
+                )
                 for v in snp_gts.variants
             )
 

diff --git a/tests/data/simple.blocks.det b/tests/data/simple.blocks.det
@@ -0,0 +1,4 @@
+CHR	BP1	BP2	KB	NSNPS	SNPS
+1	10114	10117	2.001	2	1:10114:T:C|1:10116:A:G
+1	10114	10119	2.007	2	1:10114:T:C|1:10117:C:A
+1	10116	10119	2.011	2	1:10116:A:G|1:10117:C:A
diff --git a/tests/data/simple.blocks.det.hap b/tests/data/simple.blocks.det.hap
@@ -0,0 +1,10 @@
+#	version	0.2.0
+H	1	10114	10117	H0
+H	1	10114	10119	H1
+H	1	10116	10119	H2
+V	H0	10114	10115	1:10114:T:C	T
+V	H0	10116	10117	1:10116:A:G	A
+V	H1	10114	10115	1:10114:T:C	T
+V	H1	10117	10118	1:10117:C:A	C
+V	H2	10116	10117	1:10116:A:G	A
+V	H2	10117	10118	1:10117:C:A	C
diff --git a/tests/test_data.py b/tests/test_data.py
@@ -2114,6 +2114,52 @@ def test_breakpoints_to_pop_array_chrom_no_match(self):
 
 
 class TestDocExamples:
+    def test_blocks2hap(self):
+        # load the genotypes file
+        # you can use either a VCF or PGEN file
+        gt = GenotypesVCF.load(DATADIR / "simple.vcf.gz")
+        gt = GenotypesPLINK.load(DATADIR / "simple.pgen")
+
+        # load the haplotypes
+        hp = Haplotypes("output.hap")
+        hp.data = {}
+
+        # iterate through lines of the .blocks.det file
+        with open(DATADIR / "simple.blocks.det") as blocks_file:
+            for idx, line in enumerate(blocks_file.read().splitlines()[1:]):
+                # initialize variables and parse line from the blocks file
+                hap_id = f"H{idx}"
+                chrom, bp1, bp2, kb, nsnps, snps = line.split("\t")
+
+                # create a haplotype line in the .hap file
+                hp.data[hap_id] = Haplotype(
+                    chrom=chrom, start=int(bp1), end=int(bp2), id=hap_id
+                )
+
+                # fetch alleles from the genotypes file
+                snp_gts = gt.subset(variants=tuple(snps.split("|")))
+
+                # create variant lines for each haplotype
+                # Note that the .blocks.det file doesn't specify an allele, so
+                # we simply choose the first allele (ie the REF allele) for this example
+                hp.data[hap_id].variants = tuple(
+                    Variant(
+                        start=v["pos"],
+                        end=v["pos"] + len(v["alleles"][0]),
+                        id=v["id"],
+                        allele=v["alleles"][0],
+                    )
+                    for v in snp_gts.variants
+                )
+
+        hp.write()
+
+        # validate the output and clean up afterwards
+        with open("output.hap") as hp_file:
+            with open(DATADIR / "simple.blocks.det.hap") as expected:
+                assert hp_file.read() == expected.read()
+        hp.fname.unlink()
+
     def test_gts2hap(self):
         # load variants from the snplist file
         variants = {}