Skip to content

Commit

Permalink
Add taxonomic filtering during prefilter with --taxon-list
Browse files Browse the repository at this point in the history
This happens before the ungapped alignment stage, thus is not affected by --max-seqs
  • Loading branch information
milot-mirdita committed Aug 4, 2022
1 parent 3b9cf88 commit 1d63172
Show file tree
Hide file tree
Showing 8 changed files with 99 additions and 2 deletions.
1 change: 1 addition & 0 deletions src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,7 @@ Parameters::Parameters():
prefilter.push_back(&PARAM_MASK_PROBABILTY);
prefilter.push_back(&PARAM_MASK_LOWER_CASE);
prefilter.push_back(&PARAM_MIN_DIAG_SCORE);
prefilter.push_back(&PARAM_TAXON_LIST);
prefilter.push_back(&PARAM_INCLUDE_IDENTITY);
prefilter.push_back(&PARAM_SPACED_KMER_MODE);
prefilter.push_back(&PARAM_PRELOAD_MODE);
Expand Down
1 change: 1 addition & 0 deletions src/prefiltering/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ set(prefiltering_header_files
prefiltering/Prefiltering.h
prefiltering/PrefilteringIndexReader.h
prefiltering/QueryMatcher.h
prefiltering/QueryMatcherTaxonomyHook.h
prefiltering/ReducedMatrix.h
prefiltering/SequenceLookup.h
prefiltering/UngappedAlignment.h
Expand Down
18 changes: 18 additions & 0 deletions src/prefiltering/Prefiltering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "ExtendedSubstitutionMatrix.h"
#include "SubstitutionMatrixProfileStates.h"
#include "DBWriter.h"
#include "QueryMatcherTaxonomyHook.h"

#include "PatternCompiler.h"
#include "FileUtil.h"
Expand Down Expand Up @@ -219,9 +220,19 @@ Prefiltering::Prefiltering(const std::string &queryDB,
_3merSubMatrix = getScoreMatrix(*kmerSubMat, 3);
kmerSubMat->alphabetSize = alphabetSize;
}

if (par.taxonList.length() > 0) {
taxonomyHook = new QueryMatcherTaxonomyHook(targetDB, tdbr, par.taxonList);
} else {
taxonomyHook = NULL;
}
}

Prefiltering::~Prefiltering() {
if (taxonomyHook != NULL) {
delete taxonomyHook;
}

if (sameQTDB == false) {
qdbr->close();
delete qdbr;
Expand Down Expand Up @@ -790,6 +801,10 @@ bool Prefiltering::runSplit(const std::string &resultDB, const std::string &resu
matcher.setSubstitutionMatrix(NULL, NULL);
}

if (taxonomyHook != NULL) {
matcher.setQueryMatcherHook(taxonomyHook);
}

char buffer[128];
std::string result;
result.reserve(1000000);
Expand All @@ -816,6 +831,9 @@ bool Prefiltering::runSplit(const std::string &resultDB, const std::string &resu
}
}
// calculate prefiltering results
if (taxonomyHook != NULL) {
taxonomyHook->setDbFrom(dbFrom);
}
std::pair<hit_t *, size_t> prefResults = matcher.matchQuery(&seq, targetSeqId, targetSeqType==Parameters::DBTYPE_NUCLEOTIDES);
size_t resultSize = prefResults.second;
const float queryLength = static_cast<float>(qdbr->getSeqLen(id));
Expand Down
3 changes: 3 additions & 0 deletions src/prefiltering/Prefiltering.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
#include <list>
#include <utility>

class QueryMatcherTaxonomyHook;

class Prefiltering {
public:
Prefiltering(
Expand Down Expand Up @@ -99,6 +101,7 @@ class Prefiltering {
int preloadMode;
const unsigned int threads;
int compressed;
QueryMatcherTaxonomyHook* taxonomyHook;

bool runSplit(const std::string &resultDB, const std::string &resultDBIndex, size_t split, bool merge);

Expand Down
5 changes: 4 additions & 1 deletion src/prefiltering/QueryMatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ QueryMatcher::QueryMatcher(IndexTable *indexTable, SequenceLookup *sequenceLooku
short kmerThr, int kmerSize, size_t dbSize,
unsigned int maxSeqLen, size_t maxHitsPerQuery, bool aaBiasCorrection, float aaBiasCorrectionScale,
bool diagonalScoring, unsigned int minDiagScoreThr, bool takeOnlyBestKmer, bool isNucleotide)
: idx(indexTable->getAlphabetSize(), kmerSize), isNucleotide(isNucleotide)
: idx(indexTable->getAlphabetSize(), kmerSize), isNucleotide(isNucleotide), hook(NULL)
{
this->kmerSubMat = kmerSubMat;
this->ungappedAlignmentSubMat = ungappedAlignmentSubMat;
Expand Down Expand Up @@ -99,6 +99,9 @@ std::pair<hit_t*, size_t> QueryMatcher::matchQuery(Sequence *querySeq, unsigned
}

size_t resultSize = match(querySeq, compositionBias);
if (hook != NULL) {
resultSize = hook->afterDiagonalMatchingHook(*this, resultSize);
}
std::pair<hit_t *, size_t> queryResult;
if (diagonalScoring) {
// write diagonal scores in count value
Expand Down
16 changes: 16 additions & 0 deletions src/prefiltering/QueryMatcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ struct hit_t {
}
};

class QueryMatcherHook;

class QueryMatcher {
public:
QueryMatcher(IndexTable *indexTable, SequenceLookup *sequenceLookup,
Expand All @@ -63,6 +65,10 @@ class QueryMatcher {
// identityId is the id of the identitical sequence in the target database if there is any, UINT_MAX otherwise
std::pair<hit_t*, size_t> matchQuery(Sequence *querySeq, unsigned int identityId, bool isNucleotide);

void setQueryMatcherHook(QueryMatcherHook* hook) {
this->hook = hook;
}

// set substituion matrix for KmerGenerator
void setProfileMatrix(ScoreMatrix **matrix){
kmerGenerator->setDivideStrategy(matrix);
Expand Down Expand Up @@ -195,6 +201,8 @@ class QueryMatcher {

const static size_t SCORE_RANGE = 256;

QueryMatcherHook* hook;

void updateScoreBins(CounterResult *result, size_t elementCount);

static unsigned int computeScoreThreshold(unsigned int * scoreSizes, size_t maxHitsPerQuery) {
Expand Down Expand Up @@ -256,6 +264,14 @@ class QueryMatcher {
size_t mergeElements(CounterResult *foundDiagonals, size_t hitCounter);

size_t keepMaxScoreElementOnly(CounterResult *foundDiagonals, size_t resultSize);

friend class QueryMatcherTaxonomyHook;
};

class QueryMatcherHook {
public:
virtual ~QueryMatcherHook() {};
virtual size_t afterDiagonalMatchingHook(QueryMatcher& matcher, size_t resultSize) = 0;
};

#endif //MMSEQS_QUERYTEMPLATEMATCHEREXACTMATCH_H
55 changes: 55 additions & 0 deletions src/prefiltering/QueryMatcherTaxonomyHook.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#ifndef QUERY_MATCHER_TAXONOMY_HOOK_H
#define QUERY_MATCHER_TAXONOMY_HOOK_H

#include "QueryMatcher.h"
#include "NcbiTaxonomy.h"
#include "MappingReader.h"
#include "DBReader.h"
#include "PrefilteringIndexReader.h"
#include "TaxonomyExpression.h"

class QueryMatcherTaxonomyHook : public QueryMatcherHook {
public:
QueryMatcherTaxonomyHook(std::string targetPath, DBReader<unsigned int>* targetReader, const std::string& expressionString)
: targetReader(targetReader), dbFrom(0) {
std::string targetName = PrefilteringIndexReader::dbPathWithoutIndex(targetPath);
taxonomy = NcbiTaxonomy::openTaxonomy(targetName);
taxonomyMapping = new MappingReader(targetName);
expression = new TaxonomyExpression(expressionString, *taxonomy);
}

~QueryMatcherTaxonomyHook() {
delete taxonomy;
delete taxonomyMapping;
delete expression;
}

void setDbFrom(unsigned int from) {
dbFrom = from;
}

size_t afterDiagonalMatchingHook(QueryMatcher& matcher, size_t resultSize) {
size_t writePos = 0;
for (size_t i = 0; i < resultSize; i++) {
unsigned int currId = matcher.foundDiagonals[i].id;
unsigned int key = targetReader->getDbKey(dbFrom + currId);
TaxID currTax = taxonomyMapping->lookup(key);
if (expression->isAncestor(currTax)) {
if (i != writePos) {
matcher.foundDiagonals[writePos] = matcher.foundDiagonals[i];
}
writePos++;
}
}
return writePos;
}

NcbiTaxonomy* taxonomy;
MappingReader* taxonomyMapping;
DBReader<unsigned int>* targetReader;
TaxonomyExpression* expression;

unsigned int dbFrom;
};

#endif
2 changes: 1 addition & 1 deletion util/regression

0 comments on commit 1d63172

Please sign in to comment.