Skip to content

Commit bf62ea0

Browse files
committed
v2.0.2 - updating code for hiding taxa. now takes a file with species as in tree (gtdb issue). will now take root level in species tree by default.
1 parent a5fa800 commit bf62ea0

File tree

6 files changed

+71
-36
lines changed

6 files changed

+71
-36
lines changed

README.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -120,8 +120,8 @@ Required arguments: ``--db``, ``--oma_path``
120120
| [``--min_fam_size``](#markdown-header--min_fam_size)|6|Only root-HOGs with a protein count passing this threshold are used.
121121
| [``--min_fam_completeness``](#markdown-header--min_fam_completeness)|0.5|Only root-HOGs passing this threshold are used. The completeness of a HOG is defined as the number of observed species divided by the expected number of species at the HOG taxonomic level.
122122
| [``--logic``](#markdown-header--logic)|OR|Logic used between the two above arguments to filter root-HOGs. Options are "AND" or "OR".
123-
| [``--root_taxon``](#markdown-header--root_taxon)|LUCA|HOGs defined at, or descending from, this taxon are uses as root-HOGs.
124-
| [``--hidden_taxa``](#markdown-header--hidden_taxa)||The proteins from these taxa are removed before the database computation. Usage: a list of comma-separated taxa (scientific name) with underscore replacing spaces (_e.g._ Bacteria,Homo_sapiens).
123+
| [``--root_taxon``](#markdown-header--root_taxon)|root of speciestree.nwk|HOGs defined at, or descending from, this taxon are uses as root-HOGs.
124+
| [``--hidden_taxa``](#markdown-header--hidden_taxa)||The proteins from these taxa are removed before the database computation. Usage: a file containing taxa on seperate lines (scientific name). These must match EXACTLY with the node name in the tree given.
125125
| [``--reduced_alphabet``](#markdown-header--reduced_alphabet)||Use reduced alphabet from Linclust paper
126126
| [``--k``](#markdown-header--k)|6|_k_-mer length
127127
| [``--oma_path``](#markdown-header--oma_path)||Path to a directory with both OmaServer.h5 and speciestree.nwk

omamer/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
from datetime import date
2424

2525
__packagename__ = "omamer"
26-
__version__ = "2.0.1"
26+
__version__ = "2.0.2"
2727
__copyright__ = "(C) 2019-{:d} Victor Rossier <[email protected]> and Alex Warwick Vesztrocy <[email protected]>".format(
2828
date.today().year
2929
)

omamer/_runners.py

+33-10
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
def mkdb_oma(args):
2727
from .database import DatabaseFromOMA
2828
from .index import Index
29+
30+
from ete3 import Tree
2931
import os
3032

3133
_ensure_db_build_dependencies_available()
@@ -35,32 +37,53 @@ def mkdb_oma(args):
3537
LOG.info("arguments for build:")
3638
for (k, v) in vars(args).items():
3739
LOG.info(" - {}: {}".format(k, v))
40+
41+
oma_db_fn = os.path.join(args.oma_path, "OmaServer.h5")
42+
nwk_fn = os.path.join(args.oma_path, "speciestree.nwk")
43+
44+
# check if root_taxon in tree
45+
t = Tree(nwk_fn, format=1, quoted_node_names=True)
46+
if args.root_taxon is not None:
47+
pruned_t = t.search_nodes(name=args.root_taxon)
48+
if len(pruned_t) == 0:
49+
raise ValueError("Unable to find {} as root taxon".format(args.root_taxon))
50+
elif len(pruned_t) > 1:
51+
raise ValueError("Ambiguous root taxon {} ({})".format(args.root_taxon, ", ".join(map(lambda x: x.name, pruned_t))))
52+
t = pruned_t[0]
53+
root_taxon = t.name
54+
55+
if args.hidden_taxa:
56+
# load and check for hidden taxa
57+
hidden_taxa = list(map(lambda x: x.rstrip(), args.hidden_taxa))
58+
59+
for ht in hidden_taxa:
60+
r = t.search_nodes(name=ht)
61+
if len(r) == 0:
62+
raise ValueError("Unable to find {} as hidden taxon".format(ht))
63+
elif len(r) > 1:
64+
raise ValueError("Ambiguous hidden taxon {} ({})".format(ht, ", ".join(map(lambda x: x.name, r))))
65+
else:
66+
hidden_taxa = []
3867

3968
db = DatabaseFromOMA(
4069
args.db,
41-
root_taxon=args.root_taxon,
70+
root_taxon=root_taxon,
4271
min_fam_size=args.min_fam_size,
4372
logic=args.logic,
4473
min_fam_completeness=args.min_fam_completeness,
4574
include_younger_fams=True,
4675
mode="w",
4776
)
4877

49-
oma_db_fn = os.path.join(args.oma_path, "OmaServer.h5")
50-
nwk_fn = os.path.join(args.oma_path, "speciestree.nwk")
51-
5278
# add sequences from database
53-
LOG.info("Adding sequences")
79+
LOG.info("Loading sequences")
5480
seq_buff = db.build_database(oma_db_fn, nwk_fn)
5581

56-
hidden_taxa = []
57-
if args.hidden_taxa:
58-
hidden_taxa = [" ".join(x.split("_")) for x in args.hidden_taxa.split(",")]
59-
6082
LOG.info("Building index")
6183
db.ki = Index(
6284
db, k=args.k, reduced_alphabet=args.reduced_alphabet, hidden_taxa=hidden_taxa
6385
)
86+
db.ki.sp_filter
6487
db.ki.build_kmer_table(seq_buff)
6588
db.add_metadata()
6689
db.add_md5_hash()
@@ -207,7 +230,7 @@ def search(args):
207230

208231
def _ensure_db_build_dependencies_available():
209232
try:
210-
from pysais import sais
233+
from PySAIS import sais
211234
except ImportError:
212235
LOG.error("To build OMAmer databases, pysais must be installed. Please ensure you installed omamer with the 'build' extra, e.g. `pip install omamer[build]`")
213236
import sys

omamer/database.py

+1
Original file line numberDiff line numberDiff line change
@@ -696,6 +696,7 @@ def get_metadata(self):
696696
meta["k-mer length"] = self.ki.k
697697
meta["alphabet size"] = self.ki.alphabet.n
698698
meta["nr species"] = len(self._db_Species)
699+
meta["nr indexed species"] = (len(self._db_Species) - np.sum(self.ki.sp_filter))
699700
meta["hidden taxa"] = self.ki.hidden_taxa
700701
return meta
701702

omamer/index.py

+29-18
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
You should have received a copy of the GNU Lesser General Public License
2121
along with OMAmer. If not, see <http://www.gnu.org/licenses/>.
2222
"""
23+
from property_manager import lazy_property
2324
import numba
2425
import numpy as np
2526

@@ -46,26 +47,36 @@ def __init__(self, db, k=6, reduced_alphabet=False, hidden_taxa=[]):
4647

4748
self.alphabet = Alphabet(n=alphabet_n)
4849

49-
@property
50+
@lazy_property
5051
def sp_filter(self):
51-
tax_tab = self.db._db_Taxonomy[:]
5252
sp_filter = np.full((len(self.db._db_Species),), False)
53-
for hidden_taxon in self.hidden_taxa:
54-
descendant_species = get_leaves(
55-
np.searchsorted(tax_tab["ID"], hidden_taxon.encode("ascii")),
56-
tax_tab,
57-
self.db._db_ChildrenTax[:],
58-
)
59-
# case where hidden taxon is species
60-
if descendant_species.size == 0:
61-
sp_filter[
62-
np.searchsorted(
63-
self.db._db_Species.col("ID"), hidden_taxon.encode("ascii")
64-
)
65-
] = True
66-
else:
67-
for sp_off in descendant_species:
68-
sp_filter[tax_tab["SpeOff"][sp_off]] = True
53+
if len(self.hidden_taxa) > 0:
54+
tax_tab = self.db._db_Taxonomy[:]
55+
child_tax = self.db._db_ChildrenTax[:]
56+
57+
if len(self.hidden_taxa) > 0:
58+
LOG.debug(' - creating species filter to hide declared taxa')
59+
60+
for hidden_taxon in self.hidden_taxa:
61+
LOG.debug(' - identifying: {}'.format(hidden_taxon))
62+
taxon = np.argwhere(tax_tab["ID"] == hidden_taxon.encode("ascii")).flatten()
63+
if len(taxon) == 0:
64+
raise ValueError("Can't find {} in taxonomy.".format(hidden_taxon))
65+
elif len(taxon) > 1:
66+
raise ValueError("Ambiguous taxon {}.".format(hidden_taxon))
67+
68+
tax_ii = taxon[0]
69+
sp_ii = tax_tab[tax_ii]['SpeOff']
70+
71+
if sp_ii >= 0:
72+
# leaf (i.e., extant species listed)
73+
LOG.debug(' - hiding {}'.format(self.db._db_Species[sp_ii]['ID'].decode('ascii')))
74+
sp_filter[sp_ii] = True
75+
else:
76+
# filter all leaves below declared taxon
77+
for sp_jj in tax_tab["SpeOff"][get_leaves(tax_ii, tax_tab, child_tax)]:
78+
LOG.debug(' - hiding {}'.format(self.db._db_Species[sp_jj]['ID'].decode('ascii')))
79+
sp_filter[sp_jj] = True
6980
return sp_filter
7081

7182
@property

omamer/main.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -128,14 +128,14 @@ def get_thread_count():
128128
)
129129
mkdb_parser.add_argument(
130130
"--root_taxon",
131-
default="LUCA",
132-
help="HOGs defined at, or descending from, this taxon are uses as root-HOGs.",
131+
required=False,
132+
help="HOGs defined at, or descending from, this taxon are uses as root-HOGs. Default is the top level in species tree.",
133133
)
134134
mkdb_parser.add_argument(
135135
"--hidden_taxa",
136-
default="",
137-
help="The proteins from these taxa are removed before the database computation. Usage: a list of comma-separated taxa (scientific name) with underscore replacing spaces (e.g. Bacteria,Homo_sapiens).",
138-
type=str,
136+
required=False,
137+
help="Optional -- path to a file giving a list of taxa to hide the proteins during index creation only. HOGs will still exist in the database, but it will not be possible to place to them. Names must match EXACTLY to those in the given newick species tree.",
138+
type=FileType("r"),
139139
)
140140
# mkdb_parser.add_argument(
141141
# "--species", default="", help="Alternatively to --hidden_taxa, provide a file with species offsets in sp_tab (tmp option for scaling experiment)", type=str

0 commit comments

Comments
 (0)