feat: do not require sorting .hap lines by line type (#208)

CAST-genomics · Apr 14, 2023 · f221397 · f221397
1 parent ce2337b
commit f221397
Show file tree

Hide file tree

Showing 8 changed files with 139 additions and 37 deletions.
diff --git a/docs/commands/index.rst b/docs/commands/index.rst
@@ -42,7 +42,7 @@ You can use the ``--no-sort`` flag to skip the sorting step if your file is alre
 
   .. code-block:: bash
 
-    LC_ALL=C sort -k1,4 tests/data/simphenotype.hap | \
+    awk '$0 ~ /^#/ {print; next} {print | "sort -k2,4"}' tests/data/simphenotype.hap | \
     haptools index --no-sort --output tests/data/simphenotype.hap.gz /dev/stdin
 
 All files used in these examples are described :doc:`here </project_info/example_files>`.

diff --git a/docs/formats/haplotypes.rst b/docs/formats/haplotypes.rst
@@ -176,11 +176,10 @@ We encourage you to sort, bgzip compress, and index your ``.hap`` file whenever
 
 .. code-block:: bash
 
-  LC_ALL=C sort -k1,4 -o file.hap file.hap
-  bgzip file.hap
-  tabix -s 2 -b 3 -e 4 file.hap.gz
+  awk '$0 ~ /^#/ {print; next} {print | "sort -k2,4"}' file.hap | bgzip > sorted.hap.gz
+  tabix -s 2 -b 3 -e 4 sorted.hap.gz
 
-In order to properly index the file, the set of IDs in the haplotype lines must be distinct from the set of chromosome names. This is a best practice in unindexed ``.hap`` files but a requirement for indexed ones. In addition, you must sort on the first field (ie the line type symbol) in addition to the latter three.
+In order to properly index the file, the set of IDs in the haplotype lines must be distinct from the set of chromosome names. This is a best practice in unindexed ``.hap`` files but a requirement for indexed ones.
 
 Querying an indexed file
 ~~~~~~~~~~~~~~~~~~~~~~~~
@@ -277,6 +276,12 @@ No extra fields are required here.
 
 Changelog
 ~~~~~~~~~
+v0.2.0
+------
+Support for tandem repeats in the specification via a new 'R' line type that has similar fields to the 'H' line type.
+
+Also, ``.hap`` files no longer need to be sorted by their first field in order to be indexed. We have updated the recommended ``sort`` command to reflect this. The new command wraps ``sort`` in a call to ``awk`` to ensure header lines are kept at the beginning of the file.
+
 v0.1.0
 ------
 Updates to the header lines in the specification. See `PR #80 <https://github.com/cast-genomics/haptools/pull/80>`_.

diff --git a/haptools/data/haplotypes.py b/haptools/data/haplotypes.py
@@ -584,6 +584,7 @@ def __init__(
         # note: it's important that self.types is created such that its keys are sorted
         # otherwise, the write() method might create unsorted files
         self.types = {"H": haplotype, "V": variant}
+        self.type_ids = None
         self.version = "0.1.0"
 
     @classmethod
@@ -773,6 +774,20 @@ def _line_type(self, line: str) -> type:
             # if none of the lines matched, return None
             return None
 
+    def index(self, force=False):
+        """
+        Reset the type_ids parameter
+
+        You should call this method after any changes to the data property
+        """
+        if not (force or self.type_ids is None):
+            # do not remap IDs if they've already been mapped
+            return
+        self.type_ids = {"H": []}
+        for key, value in self.data.items():
+            if isinstance(value, Haplotype):
+                self.type_ids["H"].append(key)
+
     def read(self, region: str = None, haplotypes: set[str] = None):
         """
         Read haplotypes from a .hap file into a list stored in :py:attr:`~.Haplotypes.data`
@@ -804,7 +819,9 @@ def read(self, region: str = None, haplotypes: set[str] = None):
                 var_haps.setdefault(hap_id, []).append(line)
         for hap in var_haps:
             self.data[hap].variants = tuple(var_haps[hap])
-        self.log.info(f"Loaded {len(self.data)} haplotypes from .hap file")
+        self.index()
+        num_haps = len(self.type_ids["H"])
+        self.log.info(f"Loaded {num_haps} haplotypes from .hap file")
 
     def _get_field_types(
         self,
@@ -908,17 +925,17 @@ def _iter_haps(
                     continue
                 yield hap
         else:
+            count = 0
             for line in haps_file.fetch(multiple_iterators=True):
                 # we only want lines that start with an H
                 line_type = self._line_type(line)
                 if line_type == "H":
                     hap = self.types["H"].from_hap_spec(line, types=line_types)
                     if hap.id in haplotypes:
+                        count += 1
                         yield hap
-                elif line_type > "H":
-                    # if we've already passed all of the H's, we can just exit
-                    # We assume the file has been sorted so that all of the H lines
-                    # come before the V lines
+                # exit prematurely if we've seen all of the requested haplotypes
+                if count == len(haplotypes):
                     break
 
     def __iter__(
@@ -1052,26 +1069,27 @@ def to_str(self, sort: bool = True) -> Generator[str, None, None]:
         Parameters
         ----------
         sort: bool, optional
-            Whether to attempt to output lines in sorted order
+            Whether to attempt to sort variant lines by their haplotype IDs
 
         Yields
         ------
         Generator[str, None, None]
             A list of lines (strings) to include in the output
         """
+        self.index()
         for symbol, line_instance in self.types.items():
             extras_order = line_instance.extras_order()
             if extras_order:
                 yield f"#\torder{symbol}\t" + "\t".join(extras_order)
         yield "#\tversion\t" + self.version
         for line_instance in self.types.values():
             yield from sorted(line_instance.extras_head())
-        for hap in self.data.values():
-            yield self.types["H"].to_hap_spec(hap)
-        sorted_hap_ids = sorted(self.data.keys()) if sort else self.data.keys()
-        for hap_id in sorted_hap_ids:
-            for var in self.data[hap_id].variants:
-                yield self.types["V"].to_hap_spec(var, hap_id)
+        for hap in self.type_ids["H"]:
+            yield self.types["H"].to_hap_spec(self.data[hap])
+        sorted_hap_ids = sorted(self.type_ids["H"]) if sort else self.type_ids["H"]
+        for hap in sorted_hap_ids:
+            for var in self.data[hap].variants:
+                yield self.types["V"].to_hap_spec(var, hap)
 
     def __repr__(self):
         return "\n".join(self.to_str())
@@ -1121,21 +1139,23 @@ def transform(
         GenotypesVCF
             A Genotypes object composed of haplotypes instead of regular variants.
         """
+        self.index()
+        haps = [self.data[hap] for hap in self.type_ids["H"]]
         # Initialize GenotypesVCF return value
         if hap_gts is None:
             hap_gts = GenotypesVCF(fname=None, log=self.log)
         hap_gts.samples = gts.samples
         hap_gts.variants = np.array(
-            [(hap.id, hap.chrom, hap.start, ("A", "T")) for hap in self.data.values()],
+            [(hap.id, hap.chrom, hap.start, ("A", "T")) for hap in haps],
             dtype=hap_gts.variants.dtype,
         )
         # build a fast data structure for querying the alleles in each haplotype:
         # a dict mapping (variant ID, allele) -> a unique index
         alleles = {}
         # and a list of arrays containing the indices of each hap's alleles
-        idxs = [None] * len(self.data)
+        idxs = [None] * len(haps)
         count = 0
-        for i, hap in enumerate(self.data.values()):
+        for i, hap in enumerate(haps):
             idxs[i] = np.empty(len(hap.variants), dtype=np.uintc)
             for j, variant in enumerate(hap.variants):
                 key = (variant.id, variant.allele)
@@ -1156,15 +1176,15 @@ def transform(
             dtype=gts.data.dtype,
         )[np.newaxis, :, np.newaxis]
         # finally, obtain and merge the haplotype genotypes
-        self.log.info(f"Transforming genotypes for {len(self.data)} haplotypes")
+        self.log.info(f"Transforming genotypes for {len(haps)} haplotypes")
         equality_arr = np.equal(allele_arr, gts.data)
         self.log.debug(
             f"Allocating array with dtype {gts.data.dtype} and size "
-            f"{(len(gts.samples), len(self.data), 2)}"
+            f"{(len(gts.samples), len(haps), 2)}"
         )
-        hap_gts.data = np.empty((gts.data.shape[0], len(self.data), 2), dtype=np.bool_)
+        hap_gts.data = np.empty((gts.data.shape[0], len(haps), 2), dtype=np.bool_)
         self.log.debug("Computing haplotype genotypes. This may take a while")
-        for i in range(len(self.data)):
+        for i in range(len(haps)):
             hap_gts.data[:, i] = np.all(equality_arr[:, idxs[i]], axis=1)
         return hap_gts
 
@@ -1175,8 +1195,9 @@ def sort(self):
         Also sorts the variants within each haplotype
         """
         self.data = dict(sorted(self.data.items(), key=lambda item: item[1]))
-        for hap in self.data.values():
-            hap.sort()
+        self.index(force=True)
+        for hap in self.type_ids["H"]:
+            self.data[hap].sort()
 
     def subset(self, haplotypes: tuple[str], inplace: bool = False):
         """
@@ -1216,5 +1237,6 @@ def subset(self, haplotypes: tuple[str], inplace: bool = False):
                 f"{len(hps.data)} haplotypes."
             )
         hps.data = data
+        hps.index(force=True)
         if not inplace:
             return hps
diff --git a/haptools/ld.py b/haptools/ld.py
@@ -120,17 +120,21 @@ def calc_ld(
             variants.update(var.id for var in hp.data[target].variants)
     else:
         log.info("Extracting variants from haplotypes")
-        variants = {var.id for hap in hp.data.values() for var in hap.variants}
+        variants = {var.id for h in hp.type_ids["H"] for var in hp.data[h].variants}
 
     # check to see whether the target was a haplotype
-    if target in hp.data:
+    if target in hp.type_ids["H"]:
         log.info(f"Identified target '{target}' as a haplotype")
         target = hp.data.pop(target)
+        hp.index(force=True)
         if len(hp.data) == 0 and not from_gts:
             log.error(
                 "There must be at least one more haplotype in the .hap file "
                 "than the TARGET haplotype specified."
             )
+    else:
+        # TODO: handle other line types
+        pass
 
     # check that all of the haplotypes were loaded successfully and warn otherwise
     if ids is not None and not from_gts and len(ids) > len(hp.data):
@@ -206,7 +210,7 @@ def calc_ld(
         # construct a new Haplotypes object that also stores the LD values
         hp_out = data.Haplotypes(fname=output, haplotype=Haplotype, log=log)
         hp_out.data = {}
-        for hap_id in hp.data:
+        for hap_id in hp.type_ids["H"]:
             # break the BaseHaplotype instance up into its properties
             hapd = hp.data[hap_id].__dict__
             hapd_variants = hapd.pop("variants")

diff --git a/haptools/transform.py b/haptools/transform.py
@@ -93,23 +93,25 @@ def transform(
         gts: data.GenotypesAncestry,
         hap_gts: data.GenotypesVCF = None,
     ) -> data.GenotypesVCF:
+        self.index()
+        haps = [self.data[hap] for hap in self.type_ids["H"]]
         # Initialize GenotypesVCF return value
         if hap_gts is None:
             hap_gts = data.GenotypesVCF(fname=None, log=self.log)
         hap_gts.samples = gts.samples
         hap_gts.variants = np.array(
-            [(hap.id, hap.chrom, hap.start, ("A", "T")) for hap in self.data.values()],
+            [(hap.id, hap.chrom, hap.start, ("A", "T")) for hap in haps],
             dtype=hap_gts.variants.dtype,
         )
         # build a fast data structure for querying the alleles in each haplotype:
         # a dict mapping (variant ID, allele) -> a unique index
         alleles = {}
         # and a list of arrays containing the indices of each hap's alleles
-        idxs = [None] * len(self.data)
+        idxs = [None] * len(haps)
         # and lastly, a list of ancestral population labels for each hap
-        ancestries = np.empty(len(self.data), dtype=np.uint8)
+        ancestries = np.empty(len(haps), dtype=np.uint8)
         count = 0
-        for i, hap in enumerate(self.data.values()):
+        for i, hap in enumerate(haps):
             ancestries[i] = gts.ancestry_labels.get(hap.ancestry, -1)
             idxs[i] = np.empty(len(hap.variants), dtype=np.uintc)
             for j, variant in enumerate(hap.variants):
@@ -131,15 +133,15 @@ def transform(
             dtype=gts.data.dtype,
         )[np.newaxis, :, np.newaxis]
         # finally, obtain and merge the haplotype genotypes
-        self.log.info(f"Transforming genotypes for {len(self.data)} haplotypes")
+        self.log.info(f"Transforming genotypes for {len(haps)} haplotypes")
         equality_arr = np.equal(allele_arr, gts.data)
         self.log.debug(
             f"Allocating array with dtype {gts.data.dtype} and size "
-            f"{(len(gts.samples), len(self.data), 2)}"
+            f"{(len(gts.samples), len(haps), 2)}"
         )
-        hap_gts.data = np.empty((gts.data.shape[0], len(self.data), 2), dtype=np.bool_)
+        hap_gts.data = np.empty((gts.data.shape[0], len(haps), 2), dtype=np.bool_)
         self.log.debug("Computing haplotype genotypes. This may take a while")
-        for i in range(len(self.data)):
+        for i in range(len(haps)):
             hap_gts.data[:, i] = np.logical_and(
                 np.all(gts.ancestry[:, idxs[i]] == ancestries[i], axis=1),
                 np.all(equality_arr[:, idxs[i]], axis=1),

diff --git a/tests/data/unordered_first_field.hap.gz b/tests/data/unordered_first_field.hap.gz
diff --git a/tests/data/unordered_first_field.hap.gz.tbi b/tests/data/unordered_first_field.hap.gz.tbi
diff --git a/tests/test_data.py b/tests/test_data.py
@@ -834,6 +834,30 @@ def _basic_haps(self):
         )
         return expected
 
+    def _basic_unordered_first_field_haps(self):
+        # what do we expect to see from the basic.hap file?
+        expected = {
+            "21.q.3365*1": Haplotype("21", 26928472, 26941960, "21.q.3365*1"),
+            "22.q.3365*10": Haplotype("22", 26938989, 26941960, "22.q.3365*10"),
+            "22.q.3365*11": Haplotype("22", 26938353, 26938989, "22.q.3365*11"),
+        }
+        expected["21.q.3365*1"].variants = (
+            Variant(26928472, 26928472, "21_26928472_C_A", "C"),
+            Variant(26938353, 26938353, "21_26938353_T_C", "T"),
+            Variant(26940815, 26940815, "21_26940815_T_C", "C"),
+            Variant(26941960, 26941960, "21_26941960_A_G", "G"),
+        )
+        expected["22.q.3365*10"].variants = (
+            Variant(26938989, 26938989, "21_26938989_G_A", "A"),
+            Variant(26940815, 26940815, "21_26940815_T_C", "T"),
+            Variant(26941960, 26941960, "21_26941960_A_G", "A"),
+        )
+        expected["22.q.3365*11"].variants = (
+            Variant(26938353, 26938353, "21_26938353_T_C", "T"),
+            Variant(26938989, 26938989, "21_26938989_G_A", "A"),
+        )
+        return expected
+
     def _get_dummy_haps(self):
         # create three haplotypes
         haplotypes = {
@@ -919,6 +943,51 @@ def test_iterate(self):
         for exp_hap, line in zip(exp_single_hap, haps_iter):
             assert exp_hap == line
 
+    def test_iterate_unordered_first_field(self):
+        exp_full = self._basic_unordered_first_field_haps()
+
+        exp_single_hap = [exp_full["21.q.3365*1"]]
+        exp_single_hap += exp_single_hap[0].variants
+        exp_single_hap2 = [exp_full["22.q.3365*11"]]
+        exp_single_hap2 += exp_single_hap2[0].variants
+
+        expected = [hap for hap in exp_full.values()]
+        for hap in tuple(expected):
+            expected += hap.variants
+            hap.variants = ()
+
+        # also check whether it works when we pass function params
+        haps = Haplotypes(DATADIR.joinpath("unordered_first_field.hap.gz"))
+        haps_iter = list(haps.__iter__(region="21:26928472-26941960"))
+        assert len(haps_iter) == len(exp_single_hap)
+        assert all(line in exp_single_hap for line in haps_iter)
+
+        haps_iter = list(haps.__iter__(region="21"))
+        assert len(haps_iter) == len(exp_single_hap)
+        assert all(line in exp_single_hap for line in haps_iter)
+
+        haps_iter = list(haps.__iter__(region="21:"))
+        assert len(haps_iter) == len(exp_single_hap)
+        assert all(line in exp_single_hap for line in haps_iter)
+
+        haps_iter = list(haps.__iter__(region="21:26928472-"))
+        assert len(haps_iter) == len(exp_single_hap)
+        assert all(line in exp_single_hap for line in haps_iter)
+
+        haps_iter = list(haps.__iter__(region="22:26928472-26938989"))
+        assert len(haps_iter) == len(exp_single_hap2)
+        assert all(line in exp_single_hap2 for line in haps_iter)
+
+        # also, try adding the hap ID
+        i = haps.__iter__(region="21:26928472-26941960", haplotypes={"21.q.3365*1"})
+        for exp_hap, line in zip(exp_single_hap, i):
+            assert exp_hap == line
+
+        # also, try adding the hap ID
+        haps_iter = haps.__iter__(haplotypes={"22.q.3365*11"})
+        for exp_hap, line in zip(exp_single_hap2, haps_iter):
+            assert exp_hap == line
+
     def test_read_subset(self):
         expected = {}
         expected["chr21.q.3365*1"] = self._basic_haps()["chr21.q.3365*1"]