Skip to content

Commit

Permalink
feat: do not require sorting .hap lines by line type (#208)
Browse files Browse the repository at this point in the history
  • Loading branch information
aryarm authored Apr 14, 2023
1 parent ce2337b commit f221397
Show file tree
Hide file tree
Showing 8 changed files with 139 additions and 37 deletions.
2 changes: 1 addition & 1 deletion docs/commands/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ You can use the ``--no-sort`` flag to skip the sorting step if your file is alre

.. code-block:: bash
LC_ALL=C sort -k1,4 tests/data/simphenotype.hap | \
awk '$0 ~ /^#/ {print; next} {print | "sort -k2,4"}' tests/data/simphenotype.hap | \
haptools index --no-sort --output tests/data/simphenotype.hap.gz /dev/stdin
All files used in these examples are described :doc:`here </project_info/example_files>`.
Expand Down
13 changes: 9 additions & 4 deletions docs/formats/haplotypes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -176,11 +176,10 @@ We encourage you to sort, bgzip compress, and index your ``.hap`` file whenever

.. code-block:: bash
LC_ALL=C sort -k1,4 -o file.hap file.hap
bgzip file.hap
tabix -s 2 -b 3 -e 4 file.hap.gz
awk '$0 ~ /^#/ {print; next} {print | "sort -k2,4"}' file.hap | bgzip > sorted.hap.gz
tabix -s 2 -b 3 -e 4 sorted.hap.gz
In order to properly index the file, the set of IDs in the haplotype lines must be distinct from the set of chromosome names. This is a best practice in unindexed ``.hap`` files but a requirement for indexed ones. In addition, you must sort on the first field (ie the line type symbol) in addition to the latter three.
In order to properly index the file, the set of IDs in the haplotype lines must be distinct from the set of chromosome names. This is a best practice in unindexed ``.hap`` files but a requirement for indexed ones.

Querying an indexed file
~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down Expand Up @@ -277,6 +276,12 @@ No extra fields are required here.

Changelog
~~~~~~~~~
v0.2.0
------
Support for tandem repeats in the specification via a new 'R' line type that has similar fields to the 'H' line type.

Also, ``.hap`` files no longer need to be sorted by their first field in order to be indexed. We have updated the recommended ``sort`` command to reflect this. The new command wraps ``sort`` in a call to ``awk`` to ensure header lines are kept at the beginning of the file.

v0.1.0
------
Updates to the header lines in the specification. See `PR #80 <https://github.com/cast-genomics/haptools/pull/80>`_.
Expand Down
64 changes: 43 additions & 21 deletions haptools/data/haplotypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -584,6 +584,7 @@ def __init__(
# note: it's important that self.types is created such that its keys are sorted
# otherwise, the write() method might create unsorted files
self.types = {"H": haplotype, "V": variant}
self.type_ids = None
self.version = "0.1.0"

@classmethod
Expand Down Expand Up @@ -773,6 +774,20 @@ def _line_type(self, line: str) -> type:
# if none of the lines matched, return None
return None

def index(self, force=False):
"""
Reset the type_ids parameter
You should call this method after any changes to the data property
"""
if not (force or self.type_ids is None):
# do not remap IDs if they've already been mapped
return
self.type_ids = {"H": []}
for key, value in self.data.items():
if isinstance(value, Haplotype):
self.type_ids["H"].append(key)

def read(self, region: str = None, haplotypes: set[str] = None):
"""
Read haplotypes from a .hap file into a list stored in :py:attr:`~.Haplotypes.data`
Expand Down Expand Up @@ -804,7 +819,9 @@ def read(self, region: str = None, haplotypes: set[str] = None):
var_haps.setdefault(hap_id, []).append(line)
for hap in var_haps:
self.data[hap].variants = tuple(var_haps[hap])
self.log.info(f"Loaded {len(self.data)} haplotypes from .hap file")
self.index()
num_haps = len(self.type_ids["H"])
self.log.info(f"Loaded {num_haps} haplotypes from .hap file")

def _get_field_types(
self,
Expand Down Expand Up @@ -908,17 +925,17 @@ def _iter_haps(
continue
yield hap
else:
count = 0
for line in haps_file.fetch(multiple_iterators=True):
# we only want lines that start with an H
line_type = self._line_type(line)
if line_type == "H":
hap = self.types["H"].from_hap_spec(line, types=line_types)
if hap.id in haplotypes:
count += 1
yield hap
elif line_type > "H":
# if we've already passed all of the H's, we can just exit
# We assume the file has been sorted so that all of the H lines
# come before the V lines
# exit prematurely if we've seen all of the requested haplotypes
if count == len(haplotypes):
break

def __iter__(
Expand Down Expand Up @@ -1052,26 +1069,27 @@ def to_str(self, sort: bool = True) -> Generator[str, None, None]:
Parameters
----------
sort: bool, optional
Whether to attempt to output lines in sorted order
Whether to attempt to sort variant lines by their haplotype IDs
Yields
------
Generator[str, None, None]
A list of lines (strings) to include in the output
"""
self.index()
for symbol, line_instance in self.types.items():
extras_order = line_instance.extras_order()
if extras_order:
yield f"#\torder{symbol}\t" + "\t".join(extras_order)
yield "#\tversion\t" + self.version
for line_instance in self.types.values():
yield from sorted(line_instance.extras_head())
for hap in self.data.values():
yield self.types["H"].to_hap_spec(hap)
sorted_hap_ids = sorted(self.data.keys()) if sort else self.data.keys()
for hap_id in sorted_hap_ids:
for var in self.data[hap_id].variants:
yield self.types["V"].to_hap_spec(var, hap_id)
for hap in self.type_ids["H"]:
yield self.types["H"].to_hap_spec(self.data[hap])
sorted_hap_ids = sorted(self.type_ids["H"]) if sort else self.type_ids["H"]
for hap in sorted_hap_ids:
for var in self.data[hap].variants:
yield self.types["V"].to_hap_spec(var, hap)

def __repr__(self):
return "\n".join(self.to_str())
Expand Down Expand Up @@ -1121,21 +1139,23 @@ def transform(
GenotypesVCF
A Genotypes object composed of haplotypes instead of regular variants.
"""
self.index()
haps = [self.data[hap] for hap in self.type_ids["H"]]
# Initialize GenotypesVCF return value
if hap_gts is None:
hap_gts = GenotypesVCF(fname=None, log=self.log)
hap_gts.samples = gts.samples
hap_gts.variants = np.array(
[(hap.id, hap.chrom, hap.start, ("A", "T")) for hap in self.data.values()],
[(hap.id, hap.chrom, hap.start, ("A", "T")) for hap in haps],
dtype=hap_gts.variants.dtype,
)
# build a fast data structure for querying the alleles in each haplotype:
# a dict mapping (variant ID, allele) -> a unique index
alleles = {}
# and a list of arrays containing the indices of each hap's alleles
idxs = [None] * len(self.data)
idxs = [None] * len(haps)
count = 0
for i, hap in enumerate(self.data.values()):
for i, hap in enumerate(haps):
idxs[i] = np.empty(len(hap.variants), dtype=np.uintc)
for j, variant in enumerate(hap.variants):
key = (variant.id, variant.allele)
Expand All @@ -1156,15 +1176,15 @@ def transform(
dtype=gts.data.dtype,
)[np.newaxis, :, np.newaxis]
# finally, obtain and merge the haplotype genotypes
self.log.info(f"Transforming genotypes for {len(self.data)} haplotypes")
self.log.info(f"Transforming genotypes for {len(haps)} haplotypes")
equality_arr = np.equal(allele_arr, gts.data)
self.log.debug(
f"Allocating array with dtype {gts.data.dtype} and size "
f"{(len(gts.samples), len(self.data), 2)}"
f"{(len(gts.samples), len(haps), 2)}"
)
hap_gts.data = np.empty((gts.data.shape[0], len(self.data), 2), dtype=np.bool_)
hap_gts.data = np.empty((gts.data.shape[0], len(haps), 2), dtype=np.bool_)
self.log.debug("Computing haplotype genotypes. This may take a while")
for i in range(len(self.data)):
for i in range(len(haps)):
hap_gts.data[:, i] = np.all(equality_arr[:, idxs[i]], axis=1)
return hap_gts

Expand All @@ -1175,8 +1195,9 @@ def sort(self):
Also sorts the variants within each haplotype
"""
self.data = dict(sorted(self.data.items(), key=lambda item: item[1]))
for hap in self.data.values():
hap.sort()
self.index(force=True)
for hap in self.type_ids["H"]:
self.data[hap].sort()

def subset(self, haplotypes: tuple[str], inplace: bool = False):
"""
Expand Down Expand Up @@ -1216,5 +1237,6 @@ def subset(self, haplotypes: tuple[str], inplace: bool = False):
f"{len(hps.data)} haplotypes."
)
hps.data = data
hps.index(force=True)
if not inplace:
return hps
10 changes: 7 additions & 3 deletions haptools/ld.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,17 +120,21 @@ def calc_ld(
variants.update(var.id for var in hp.data[target].variants)
else:
log.info("Extracting variants from haplotypes")
variants = {var.id for hap in hp.data.values() for var in hap.variants}
variants = {var.id for h in hp.type_ids["H"] for var in hp.data[h].variants}

# check to see whether the target was a haplotype
if target in hp.data:
if target in hp.type_ids["H"]:
log.info(f"Identified target '{target}' as a haplotype")
target = hp.data.pop(target)
hp.index(force=True)
if len(hp.data) == 0 and not from_gts:
log.error(
"There must be at least one more haplotype in the .hap file "
"than the TARGET haplotype specified."
)
else:
# TODO: handle other line types
pass

# check that all of the haplotypes were loaded successfully and warn otherwise
if ids is not None and not from_gts and len(ids) > len(hp.data):
Expand Down Expand Up @@ -206,7 +210,7 @@ def calc_ld(
# construct a new Haplotypes object that also stores the LD values
hp_out = data.Haplotypes(fname=output, haplotype=Haplotype, log=log)
hp_out.data = {}
for hap_id in hp.data:
for hap_id in hp.type_ids["H"]:
# break the BaseHaplotype instance up into its properties
hapd = hp.data[hap_id].__dict__
hapd_variants = hapd.pop("variants")
Expand Down
18 changes: 10 additions & 8 deletions haptools/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,23 +93,25 @@ def transform(
gts: data.GenotypesAncestry,
hap_gts: data.GenotypesVCF = None,
) -> data.GenotypesVCF:
self.index()
haps = [self.data[hap] for hap in self.type_ids["H"]]
# Initialize GenotypesVCF return value
if hap_gts is None:
hap_gts = data.GenotypesVCF(fname=None, log=self.log)
hap_gts.samples = gts.samples
hap_gts.variants = np.array(
[(hap.id, hap.chrom, hap.start, ("A", "T")) for hap in self.data.values()],
[(hap.id, hap.chrom, hap.start, ("A", "T")) for hap in haps],
dtype=hap_gts.variants.dtype,
)
# build a fast data structure for querying the alleles in each haplotype:
# a dict mapping (variant ID, allele) -> a unique index
alleles = {}
# and a list of arrays containing the indices of each hap's alleles
idxs = [None] * len(self.data)
idxs = [None] * len(haps)
# and lastly, a list of ancestral population labels for each hap
ancestries = np.empty(len(self.data), dtype=np.uint8)
ancestries = np.empty(len(haps), dtype=np.uint8)
count = 0
for i, hap in enumerate(self.data.values()):
for i, hap in enumerate(haps):
ancestries[i] = gts.ancestry_labels.get(hap.ancestry, -1)
idxs[i] = np.empty(len(hap.variants), dtype=np.uintc)
for j, variant in enumerate(hap.variants):
Expand All @@ -131,15 +133,15 @@ def transform(
dtype=gts.data.dtype,
)[np.newaxis, :, np.newaxis]
# finally, obtain and merge the haplotype genotypes
self.log.info(f"Transforming genotypes for {len(self.data)} haplotypes")
self.log.info(f"Transforming genotypes for {len(haps)} haplotypes")
equality_arr = np.equal(allele_arr, gts.data)
self.log.debug(
f"Allocating array with dtype {gts.data.dtype} and size "
f"{(len(gts.samples), len(self.data), 2)}"
f"{(len(gts.samples), len(haps), 2)}"
)
hap_gts.data = np.empty((gts.data.shape[0], len(self.data), 2), dtype=np.bool_)
hap_gts.data = np.empty((gts.data.shape[0], len(haps), 2), dtype=np.bool_)
self.log.debug("Computing haplotype genotypes. This may take a while")
for i in range(len(self.data)):
for i in range(len(haps)):
hap_gts.data[:, i] = np.logical_and(
np.all(gts.ancestry[:, idxs[i]] == ancestries[i], axis=1),
np.all(equality_arr[:, idxs[i]], axis=1),
Expand Down
Binary file added tests/data/unordered_first_field.hap.gz
Binary file not shown.
Binary file added tests/data/unordered_first_field.hap.gz.tbi
Binary file not shown.
69 changes: 69 additions & 0 deletions tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -834,6 +834,30 @@ def _basic_haps(self):
)
return expected

def _basic_unordered_first_field_haps(self):
# what do we expect to see from the basic.hap file?
expected = {
"21.q.3365*1": Haplotype("21", 26928472, 26941960, "21.q.3365*1"),
"22.q.3365*10": Haplotype("22", 26938989, 26941960, "22.q.3365*10"),
"22.q.3365*11": Haplotype("22", 26938353, 26938989, "22.q.3365*11"),
}
expected["21.q.3365*1"].variants = (
Variant(26928472, 26928472, "21_26928472_C_A", "C"),
Variant(26938353, 26938353, "21_26938353_T_C", "T"),
Variant(26940815, 26940815, "21_26940815_T_C", "C"),
Variant(26941960, 26941960, "21_26941960_A_G", "G"),
)
expected["22.q.3365*10"].variants = (
Variant(26938989, 26938989, "21_26938989_G_A", "A"),
Variant(26940815, 26940815, "21_26940815_T_C", "T"),
Variant(26941960, 26941960, "21_26941960_A_G", "A"),
)
expected["22.q.3365*11"].variants = (
Variant(26938353, 26938353, "21_26938353_T_C", "T"),
Variant(26938989, 26938989, "21_26938989_G_A", "A"),
)
return expected

def _get_dummy_haps(self):
# create three haplotypes
haplotypes = {
Expand Down Expand Up @@ -919,6 +943,51 @@ def test_iterate(self):
for exp_hap, line in zip(exp_single_hap, haps_iter):
assert exp_hap == line

def test_iterate_unordered_first_field(self):
exp_full = self._basic_unordered_first_field_haps()

exp_single_hap = [exp_full["21.q.3365*1"]]
exp_single_hap += exp_single_hap[0].variants
exp_single_hap2 = [exp_full["22.q.3365*11"]]
exp_single_hap2 += exp_single_hap2[0].variants

expected = [hap for hap in exp_full.values()]
for hap in tuple(expected):
expected += hap.variants
hap.variants = ()

# also check whether it works when we pass function params
haps = Haplotypes(DATADIR.joinpath("unordered_first_field.hap.gz"))
haps_iter = list(haps.__iter__(region="21:26928472-26941960"))
assert len(haps_iter) == len(exp_single_hap)
assert all(line in exp_single_hap for line in haps_iter)

haps_iter = list(haps.__iter__(region="21"))
assert len(haps_iter) == len(exp_single_hap)
assert all(line in exp_single_hap for line in haps_iter)

haps_iter = list(haps.__iter__(region="21:"))
assert len(haps_iter) == len(exp_single_hap)
assert all(line in exp_single_hap for line in haps_iter)

haps_iter = list(haps.__iter__(region="21:26928472-"))
assert len(haps_iter) == len(exp_single_hap)
assert all(line in exp_single_hap for line in haps_iter)

haps_iter = list(haps.__iter__(region="22:26928472-26938989"))
assert len(haps_iter) == len(exp_single_hap2)
assert all(line in exp_single_hap2 for line in haps_iter)

# also, try adding the hap ID
i = haps.__iter__(region="21:26928472-26941960", haplotypes={"21.q.3365*1"})
for exp_hap, line in zip(exp_single_hap, i):
assert exp_hap == line

# also, try adding the hap ID
haps_iter = haps.__iter__(haplotypes={"22.q.3365*11"})
for exp_hap, line in zip(exp_single_hap2, haps_iter):
assert exp_hap == line

def test_read_subset(self):
expected = {}
expected["chr21.q.3365*1"] = self._basic_haps()["chr21.q.3365*1"]
Expand Down

0 comments on commit f221397

Please sign in to comment.