Skip to content

Commit

Permalink
Add filter min enable
Browse files Browse the repository at this point in the history
  • Loading branch information
martin-steinegger committed Aug 19, 2021
1 parent 7aade9d commit c6d8ae0
Show file tree
Hide file tree
Showing 7 changed files with 28 additions and 11 deletions.
8 changes: 8 additions & 0 deletions src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,8 @@ Parameters::Parameters():
PARAM_FILTER_QID(PARAM_FILTER_QID_ID, "--qid", "Minimum seq. id.", "Reduce diversity of output MSAs using min.seq. identity with query sequences [0.0,1.0]", typeid(float), (void *) &qid, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
PARAM_FILTER_COV(PARAM_FILTER_COV_ID, "--cov", "Minimum coverage", "Filter output MSAs using min. fraction of query residues covered by matched sequences [0.0,1.0]", typeid(float), (void *) &covMSAThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
PARAM_FILTER_NDIFF(PARAM_FILTER_NDIFF_ID, "--diff", "Select N most diverse seqs", "Filter MSAs by selecting most diverse set of sequences, keeping at least this many seqs in each MSA block of length 50", typeid(int), (void *) &Ndiff, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
PARAM_FILTER_MIN_ENABLE(PARAM_FILTER_MIN_ENABLE_ID, "--filter-min-enable", "Use filter only at N seqs", "Filter MSAs only with more than N sequences, 0 is filters all", typeid(int), (void *) &filterMinEnable, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
PARAM_WG(PARAM_WG_ID, "--wg", "Global sequence weighting", "Use global sequence weighting for profile calculation", typeid(bool), (void *) &wg, "", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
PARAM_PCA(PARAM_PCA_ID, "--pca", "Pseudo count a", "Pseudo count admixture strength", typeid(float), (void *) &pca, "^[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
PARAM_PCB(PARAM_PCB_ID, "--pcb", "Pseudo count b", "Pseudo counts: Neff at half of maximum admixture (range 0.0-inf)", typeid(float), (void *) &pcb, "^[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
Expand Down Expand Up @@ -489,6 +491,7 @@ Parameters::Parameters():
result2profile.push_back(&PARAM_FILTER_QSC);
result2profile.push_back(&PARAM_FILTER_COV);
result2profile.push_back(&PARAM_FILTER_NDIFF);
result2profile.push_back(&PARAM_FILTER_MIN_ENABLE);
result2profile.push_back(&PARAM_PCA);
result2profile.push_back(&PARAM_PCB);
result2profile.push_back(&PARAM_PRELOAD_MODE);
Expand Down Expand Up @@ -552,6 +555,7 @@ Parameters::Parameters():
result2msa.push_back(&PARAM_FILTER_QSC);
result2msa.push_back(&PARAM_FILTER_COV);
result2msa.push_back(&PARAM_FILTER_NDIFF);
result2msa.push_back(&PARAM_FILTER_MIN_ENABLE);
result2msa.push_back(&PARAM_THREADS);
result2msa.push_back(&PARAM_COMPRESSED);
result2msa.push_back(&PARAM_V);
Expand All @@ -574,6 +578,7 @@ Parameters::Parameters():
filterresult.push_back(&PARAM_FILTER_QSC);
filterresult.push_back(&PARAM_FILTER_COV);
filterresult.push_back(&PARAM_FILTER_NDIFF);
filterresult.push_back(&PARAM_FILTER_MIN_ENABLE);
filterresult.push_back(&PARAM_THREADS);
filterresult.push_back(&PARAM_COMPRESSED);
filterresult.push_back(&PARAM_V);
Expand All @@ -598,6 +603,7 @@ Parameters::Parameters():
msa2profile.push_back(&PARAM_FILTER_QSC);
msa2profile.push_back(&PARAM_FILTER_MAX_SEQ_ID);
msa2profile.push_back(&PARAM_FILTER_NDIFF);
msa2profile.push_back(&PARAM_FILTER_MIN_ENABLE);
msa2profile.push_back(&PARAM_GAP_OPEN);
msa2profile.push_back(&PARAM_GAP_EXTEND);
msa2profile.push_back(&PARAM_SKIP_QUERY);
Expand Down Expand Up @@ -1111,6 +1117,7 @@ Parameters::Parameters():
expand2profile.push_back(&PARAM_FILTER_QSC);
expand2profile.push_back(&PARAM_FILTER_COV);
expand2profile.push_back(&PARAM_FILTER_NDIFF);
expand2profile.push_back(&PARAM_FILTER_MIN_ENABLE);
expand2profile.push_back(&PARAM_PCA);
expand2profile.push_back(&PARAM_PCB);
expand2profile.push_back(&PARAM_PRELOAD_MODE);
Expand Down Expand Up @@ -2210,6 +2217,7 @@ void Parameters::setDefaults() {
qsc = -20.0f; // default for minimum score per column with query
covMSAThr = 0.0; // default for minimum coverage threshold
Ndiff = 1000; // pick Ndiff most different sequences from alignment
filterMinEnable = 0;
wg = false;
pca = 1.0;
pcb = 1.5;
Expand Down
2 changes: 2 additions & 0 deletions src/commons/Parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -482,6 +482,7 @@ class Parameters {
float qid;
float covMSAThr;
int Ndiff;
int filterMinEnable;
bool wg;
float pca;
float pcb;
Expand Down Expand Up @@ -776,6 +777,7 @@ class Parameters {
PARAMETER(PARAM_FILTER_QID)
PARAMETER(PARAM_FILTER_COV)
PARAMETER(PARAM_FILTER_NDIFF)
PARAMETER(PARAM_FILTER_MIN_ENABLE)
PARAMETER(PARAM_WG)
PARAMETER(PARAM_PCA)
PARAMETER(PARAM_PCB)
Expand Down
9 changes: 7 additions & 2 deletions src/util/expandaln.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -313,8 +313,13 @@ int expandaln(int argc, const char **argv, const Command& command, bool returnAl
} else {
MultipleAlignment::MSAResult res = aligner->computeMSA(&aSeq, seqSet, resultsAc, true);
resultsAc.clear();
size_t filteredSetSize = par.filterMsa == false ? res.setSize
: filter->filter(res.setSize, res.centerLength, (int)(par.covMSAThr * 100), (int)(par.qid * 100), par.qsc, (int)(par.filterMaxSeqId * 100), par.Ndiff, (const char **) res.msaSequence, true);
size_t filteredSetSize = par.filterMsa == true && res.setSize > par.filterMinEnable ?
filter->filter(res.setSize, res.centerLength,
(int)(par.covMSAThr * 100),
(int)(par.qid * 100), par.qsc,
(int)(par.filterMaxSeqId * 100), par.Ndiff,
(const char **) res.msaSequence, true)
: res.setSize;
PSSMCalculator::Profile pssmRes = calculator->computePSSMFromMSA(filteredSetSize, aSeq.L, (const char **) res.msaSequence, par.wg);
if (par.maskProfile == true) {
masker->mask(aSeq, pssmRes);
Expand Down
2 changes: 1 addition & 1 deletion src/util/msa2profile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,7 @@ int msa2profile(int argc, const char **argv, const Command &command) {
unsigned int centerLength = centerLengthWithGaps - maskedCount;

size_t filteredSetSize = setSize;
if (par.filterMsa == 1) {
if (par.filterMsa == 1 && setSize > par.filterMinEnable) {
filteredSetSize = filter.filter(setSize, centerLength, static_cast<int>(par.covMSAThr * 100),
static_cast<int>(par.qid * 100), par.qsc,
static_cast<int>(par.filterMaxSeqId * 100), par.Ndiff,
Expand Down
2 changes: 1 addition & 1 deletion src/util/msa2result.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,7 @@ int msa2result(int argc, const char **argv, const Command &command) {

unsigned int centerLength = centerLengthWithGaps - maskedCount;
size_t filteredSetSize = setSize;
if (par.filterMsa == 1) {
if (par.filterMsa == 1 && setSize > par.filterMinEnable) {
filteredSetSize = filter.filter(setSize, centerLength, static_cast<int>(par.covMSAThr * 100),
static_cast<int>(par.qid * 100), par.qsc,
static_cast<int>(par.filterMaxSeqId * 100), par.Ndiff,
Expand Down
8 changes: 4 additions & 4 deletions src/util/result2msa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ int result2msa(int argc, const char **argv, const Command &command) {
//MultipleAlignment::print(res, &subMat);

if (par.msaFormatMode == Parameters::FORMAT_MSA_FASTADB || par.msaFormatMode == Parameters::FORMAT_MSA_FASTADB_SUMMARY) {
if (isFiltering) {
if (isFiltering && res.setSize > par.filterMinEnable) {
filter.filter(res.setSize, res.centerLength, static_cast<int>(par.covMSAThr * 100), static_cast<int>(par.qid * 100), par.qsc, static_cast<int>(par.filterMaxSeqId * 100), par.Ndiff, (const char **) res.msaSequence, false);
filter.getKept(kept, res.setSize);
}
Expand Down Expand Up @@ -294,7 +294,7 @@ int result2msa(int argc, const char **argv, const Command &command) {
result.append(1, '\n');
}
} else if (par.msaFormatMode == Parameters::FORMAT_MSA_STOCKHOLM_FLAT) {
if (isFiltering) {
if (isFiltering && res.setSize > par.filterMinEnable)) {
filter.filter(res.setSize, res.centerLength, static_cast<int>(par.covMSAThr * 100), static_cast<int>(par.qid * 100), par.qsc, static_cast<int>(par.filterMaxSeqId * 100), par.Ndiff, (const char **) res.msaSequence, false);
filter.getKept(kept, res.setSize);
}
Expand Down Expand Up @@ -332,7 +332,7 @@ int result2msa(int argc, const char **argv, const Command &command) {
}
result.append("//\n");
} else if (par.msaFormatMode == Parameters::FORMAT_MSA_A3M) {
if (isFiltering) {
if (isFiltering && res.setSize > par.filterMinEnable) {
filter.filter(res.setSize, res.centerLength, static_cast<int>(par.covMSAThr * 100), static_cast<int>(par.qid * 100), par.qsc, static_cast<int>(par.filterMaxSeqId * 100), par.Ndiff, (const char **) res.msaSequence, false);
filter.getKept(kept, res.setSize);
}
Expand Down Expand Up @@ -394,7 +394,7 @@ int result2msa(int argc, const char **argv, const Command &command) {
}
} else if (isCA3M == true) {
size_t filteredSetSize = res.setSize;
if (isFiltering) {
if (isFiltering && res.setSize > par.filterMinEnable) {
filteredSetSize = filter.filter(res, alnResults, static_cast<int>(par.covMSAThr * 100), static_cast<int>(par.qid * 100), par.qsc, static_cast<int>(par.filterMaxSeqId * 100), par.Ndiff);
}
if (par.formatAlignmentMode == Parameters::FORMAT_MSA_CA3M_CONSENSUS) {
Expand Down
8 changes: 5 additions & 3 deletions src/util/result2profile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -204,9 +204,11 @@ int result2profile(int argc, const char **argv, const Command &command, bool ret
if (returnAlnRes == false) {
alnResults.clear();
}
size_t filteredSetSize = isFiltering == false ? res.setSize
: filter.filter(res, alnResults, (int)(par.covMSAThr * 100), (int)(par.qid * 100), par.qsc, (int)(par.filterMaxSeqId * 100), par.Ndiff);
//MultipleAlignment::print(res, &subMat);
size_t filteredSetSize = (isFiltering == true && res.setSize > par.filterMinEnable) ?
filter.filter(res, alnResults, (int)(par.covMSAThr * 100), (int)(par.qid * 100), par.qsc, (int)(par.filterMaxSeqId * 100), par.Ndiff)
:
res.setSize;
//MultipleAlignment::print(res, &subMat);

if (returnAlnRes) {
// do not count query
Expand Down

0 comments on commit c6d8ae0

Please sign in to comment.