diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index bf3875f17..697d6d116 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -120,6 +120,8 @@ Parameters::Parameters(): PARAM_FILTER_QID(PARAM_FILTER_QID_ID, "--qid", "Minimum seq. id.", "Reduce diversity of output MSAs using min.seq. identity with query sequences [0.0,1.0]", typeid(float), (void *) &qid, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT), PARAM_FILTER_COV(PARAM_FILTER_COV_ID, "--cov", "Minimum coverage", "Filter output MSAs using min. fraction of query residues covered by matched sequences [0.0,1.0]", typeid(float), (void *) &covMSAThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT), PARAM_FILTER_NDIFF(PARAM_FILTER_NDIFF_ID, "--diff", "Select N most diverse seqs", "Filter MSAs by selecting most diverse set of sequences, keeping at least this many seqs in each MSA block of length 50", typeid(int), (void *) &Ndiff, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT), + PARAM_FILTER_MIN_ENABLE(PARAM_FILTER_MIN_ENABLE_ID, "--filter-min-enable", "Use filter only at N seqs", "Filter MSAs only with more than N sequences, 0 is filters all", typeid(int), (void *) &filterMinEnable, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT), + PARAM_WG(PARAM_WG_ID, "--wg", "Global sequence weighting", "Use global sequence weighting for profile calculation", typeid(bool), (void *) &wg, "", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT), PARAM_PCA(PARAM_PCA_ID, "--pca", "Pseudo count a", "Pseudo count admixture strength", typeid(float), (void *) &pca, "^[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT), PARAM_PCB(PARAM_PCB_ID, "--pcb", "Pseudo count b", "Pseudo counts: Neff at half of maximum admixture (range 0.0-inf)", typeid(float), (void *) &pcb, "^[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT), @@ -489,6 +491,7 @@ Parameters::Parameters(): result2profile.push_back(&PARAM_FILTER_QSC); result2profile.push_back(&PARAM_FILTER_COV); result2profile.push_back(&PARAM_FILTER_NDIFF); + result2profile.push_back(&PARAM_FILTER_MIN_ENABLE); result2profile.push_back(&PARAM_PCA); result2profile.push_back(&PARAM_PCB); result2profile.push_back(&PARAM_PRELOAD_MODE); @@ -552,6 +555,7 @@ Parameters::Parameters(): result2msa.push_back(&PARAM_FILTER_QSC); result2msa.push_back(&PARAM_FILTER_COV); result2msa.push_back(&PARAM_FILTER_NDIFF); + result2msa.push_back(&PARAM_FILTER_MIN_ENABLE); result2msa.push_back(&PARAM_THREADS); result2msa.push_back(&PARAM_COMPRESSED); result2msa.push_back(&PARAM_V); @@ -574,6 +578,7 @@ Parameters::Parameters(): filterresult.push_back(&PARAM_FILTER_QSC); filterresult.push_back(&PARAM_FILTER_COV); filterresult.push_back(&PARAM_FILTER_NDIFF); + filterresult.push_back(&PARAM_FILTER_MIN_ENABLE); filterresult.push_back(&PARAM_THREADS); filterresult.push_back(&PARAM_COMPRESSED); filterresult.push_back(&PARAM_V); @@ -598,6 +603,7 @@ Parameters::Parameters(): msa2profile.push_back(&PARAM_FILTER_QSC); msa2profile.push_back(&PARAM_FILTER_MAX_SEQ_ID); msa2profile.push_back(&PARAM_FILTER_NDIFF); + msa2profile.push_back(&PARAM_FILTER_MIN_ENABLE); msa2profile.push_back(&PARAM_GAP_OPEN); msa2profile.push_back(&PARAM_GAP_EXTEND); msa2profile.push_back(&PARAM_SKIP_QUERY); @@ -1111,6 +1117,7 @@ Parameters::Parameters(): expand2profile.push_back(&PARAM_FILTER_QSC); expand2profile.push_back(&PARAM_FILTER_COV); expand2profile.push_back(&PARAM_FILTER_NDIFF); + expand2profile.push_back(&PARAM_FILTER_MIN_ENABLE); expand2profile.push_back(&PARAM_PCA); expand2profile.push_back(&PARAM_PCB); expand2profile.push_back(&PARAM_PRELOAD_MODE); @@ -2210,6 +2217,7 @@ void Parameters::setDefaults() { qsc = -20.0f; // default for minimum score per column with query covMSAThr = 0.0; // default for minimum coverage threshold Ndiff = 1000; // pick Ndiff most different sequences from alignment + filterMinEnable = 0; wg = false; pca = 1.0; pcb = 1.5; diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h index 0eb0d775f..c1063f192 100644 --- a/src/commons/Parameters.h +++ b/src/commons/Parameters.h @@ -482,6 +482,7 @@ class Parameters { float qid; float covMSAThr; int Ndiff; + int filterMinEnable; bool wg; float pca; float pcb; @@ -776,6 +777,7 @@ class Parameters { PARAMETER(PARAM_FILTER_QID) PARAMETER(PARAM_FILTER_COV) PARAMETER(PARAM_FILTER_NDIFF) + PARAMETER(PARAM_FILTER_MIN_ENABLE) PARAMETER(PARAM_WG) PARAMETER(PARAM_PCA) PARAMETER(PARAM_PCB) diff --git a/src/util/expandaln.cpp b/src/util/expandaln.cpp index 60759e4ab..6763dfd61 100644 --- a/src/util/expandaln.cpp +++ b/src/util/expandaln.cpp @@ -313,8 +313,13 @@ int expandaln(int argc, const char **argv, const Command& command, bool returnAl } else { MultipleAlignment::MSAResult res = aligner->computeMSA(&aSeq, seqSet, resultsAc, true); resultsAc.clear(); - size_t filteredSetSize = par.filterMsa == false ? res.setSize - : filter->filter(res.setSize, res.centerLength, (int)(par.covMSAThr * 100), (int)(par.qid * 100), par.qsc, (int)(par.filterMaxSeqId * 100), par.Ndiff, (const char **) res.msaSequence, true); + size_t filteredSetSize = par.filterMsa == true && res.setSize > par.filterMinEnable ? + filter->filter(res.setSize, res.centerLength, + (int)(par.covMSAThr * 100), + (int)(par.qid * 100), par.qsc, + (int)(par.filterMaxSeqId * 100), par.Ndiff, + (const char **) res.msaSequence, true) + : res.setSize; PSSMCalculator::Profile pssmRes = calculator->computePSSMFromMSA(filteredSetSize, aSeq.L, (const char **) res.msaSequence, par.wg); if (par.maskProfile == true) { masker->mask(aSeq, pssmRes); diff --git a/src/util/msa2profile.cpp b/src/util/msa2profile.cpp index abfc66b14..9d0b5ff82 100644 --- a/src/util/msa2profile.cpp +++ b/src/util/msa2profile.cpp @@ -325,7 +325,7 @@ int msa2profile(int argc, const char **argv, const Command &command) { unsigned int centerLength = centerLengthWithGaps - maskedCount; size_t filteredSetSize = setSize; - if (par.filterMsa == 1) { + if (par.filterMsa == 1 && setSize > par.filterMinEnable) { filteredSetSize = filter.filter(setSize, centerLength, static_cast(par.covMSAThr * 100), static_cast(par.qid * 100), par.qsc, static_cast(par.filterMaxSeqId * 100), par.Ndiff, diff --git a/src/util/msa2result.cpp b/src/util/msa2result.cpp index 902111fee..98cb3d04f 100644 --- a/src/util/msa2result.cpp +++ b/src/util/msa2result.cpp @@ -364,7 +364,7 @@ int msa2result(int argc, const char **argv, const Command &command) { unsigned int centerLength = centerLengthWithGaps - maskedCount; size_t filteredSetSize = setSize; - if (par.filterMsa == 1) { + if (par.filterMsa == 1 && setSize > par.filterMinEnable) { filteredSetSize = filter.filter(setSize, centerLength, static_cast(par.covMSAThr * 100), static_cast(par.qid * 100), par.qsc, static_cast(par.filterMaxSeqId * 100), par.Ndiff, diff --git a/src/util/result2msa.cpp b/src/util/result2msa.cpp index 5e8f02900..5ea230475 100644 --- a/src/util/result2msa.cpp +++ b/src/util/result2msa.cpp @@ -239,7 +239,7 @@ int result2msa(int argc, const char **argv, const Command &command) { //MultipleAlignment::print(res, &subMat); if (par.msaFormatMode == Parameters::FORMAT_MSA_FASTADB || par.msaFormatMode == Parameters::FORMAT_MSA_FASTADB_SUMMARY) { - if (isFiltering) { + if (isFiltering && res.setSize > par.filterMinEnable) { filter.filter(res.setSize, res.centerLength, static_cast(par.covMSAThr * 100), static_cast(par.qid * 100), par.qsc, static_cast(par.filterMaxSeqId * 100), par.Ndiff, (const char **) res.msaSequence, false); filter.getKept(kept, res.setSize); } @@ -294,7 +294,7 @@ int result2msa(int argc, const char **argv, const Command &command) { result.append(1, '\n'); } } else if (par.msaFormatMode == Parameters::FORMAT_MSA_STOCKHOLM_FLAT) { - if (isFiltering) { + if (isFiltering && res.setSize > par.filterMinEnable)) { filter.filter(res.setSize, res.centerLength, static_cast(par.covMSAThr * 100), static_cast(par.qid * 100), par.qsc, static_cast(par.filterMaxSeqId * 100), par.Ndiff, (const char **) res.msaSequence, false); filter.getKept(kept, res.setSize); } @@ -332,7 +332,7 @@ int result2msa(int argc, const char **argv, const Command &command) { } result.append("//\n"); } else if (par.msaFormatMode == Parameters::FORMAT_MSA_A3M) { - if (isFiltering) { + if (isFiltering && res.setSize > par.filterMinEnable) { filter.filter(res.setSize, res.centerLength, static_cast(par.covMSAThr * 100), static_cast(par.qid * 100), par.qsc, static_cast(par.filterMaxSeqId * 100), par.Ndiff, (const char **) res.msaSequence, false); filter.getKept(kept, res.setSize); } @@ -394,7 +394,7 @@ int result2msa(int argc, const char **argv, const Command &command) { } } else if (isCA3M == true) { size_t filteredSetSize = res.setSize; - if (isFiltering) { + if (isFiltering && res.setSize > par.filterMinEnable) { filteredSetSize = filter.filter(res, alnResults, static_cast(par.covMSAThr * 100), static_cast(par.qid * 100), par.qsc, static_cast(par.filterMaxSeqId * 100), par.Ndiff); } if (par.formatAlignmentMode == Parameters::FORMAT_MSA_CA3M_CONSENSUS) { diff --git a/src/util/result2profile.cpp b/src/util/result2profile.cpp index 31119c9f8..6c4574226 100644 --- a/src/util/result2profile.cpp +++ b/src/util/result2profile.cpp @@ -204,9 +204,11 @@ int result2profile(int argc, const char **argv, const Command &command, bool ret if (returnAlnRes == false) { alnResults.clear(); } - size_t filteredSetSize = isFiltering == false ? res.setSize - : filter.filter(res, alnResults, (int)(par.covMSAThr * 100), (int)(par.qid * 100), par.qsc, (int)(par.filterMaxSeqId * 100), par.Ndiff); - //MultipleAlignment::print(res, &subMat); + size_t filteredSetSize = (isFiltering == true && res.setSize > par.filterMinEnable) ? + filter.filter(res, alnResults, (int)(par.covMSAThr * 100), (int)(par.qid * 100), par.qsc, (int)(par.filterMaxSeqId * 100), par.Ndiff) + : + res.setSize; + //MultipleAlignment::print(res, &subMat); if (returnAlnRes) { // do not count query