From 167bbd122be0c795a077d0dfd2b90c789910d577 Mon Sep 17 00:00:00 2001 From: Milot Mirdita Date: Tue, 11 Apr 2023 16:29:57 +0900 Subject: [PATCH] Introduce experimental a3m filtering module (hidden by default) --- src/CommandDeclarations.h | 1 + src/MMseqsBase.cpp | 7 +++ src/commons/Parameters.cpp | 13 +++++ src/commons/Parameters.h | 1 + src/util/CMakeLists.txt | 1 + src/util/filtera3m.cpp | 104 +++++++++++++++++++++++++++++++++++++ 6 files changed, 127 insertions(+) create mode 100644 src/util/filtera3m.cpp diff --git a/src/CommandDeclarations.h b/src/CommandDeclarations.h index c8c479c6c..eed98e629 100644 --- a/src/CommandDeclarations.h +++ b/src/CommandDeclarations.h @@ -54,6 +54,7 @@ extern int extractalignedregion(int argc, const char **argv, const Command& comm extern int extractdomains(int argc, const char **argv, const Command& command); extern int extractorfs(int argc, const char **argv, const Command& command); extern int extractframes(int argc, const char **argv, const Command& command); +extern int filtera3m(int argc, const char **argv, const Command& command); extern int filterdb(int argc, const char **argv, const Command& command); extern int filterresult(int argc, const char **argv, const Command& command); extern int gff2db(int argc, const char **argv, const Command& command); diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp index ef427dfe9..67812d74a 100644 --- a/src/MMseqsBase.cpp +++ b/src/MMseqsBase.cpp @@ -979,6 +979,13 @@ std::vector baseCommands = { {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb }, {"statsDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::genericDb }}}, + {"filtera3m", filtera3m, &par.filtera3m, COMMAND_HIDDEN, + "DB filtering by given conditions", + "", + "Milot Mirdita ", + " ", + CITATION_MMSEQS2, {{"a3mFile", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfileAndStdin }, + {"a3mFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile }}}, {"filterresult", filterresult, &par.filterresult, COMMAND_RESULT, "Pairwise alignment result filter", NULL, diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index 561b38d9b..7786dbf67 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -584,6 +584,19 @@ Parameters::Parameters(): //result2msa.push_back(&PARAM_FIRST_SEQ_REP_SEQ); result2dnamsa.push_back(&PARAM_V); + + // filtera3m + filtera3m.push_back(&PARAM_SUB_MAT); + filtera3m.push_back(&PARAM_GAP_OPEN); + filtera3m.push_back(&PARAM_GAP_EXTEND); + filtera3m.push_back(&PARAM_FILTER_MIN_ENABLE); + filtera3m.push_back(&PARAM_FILTER_MAX_SEQ_ID); + filtera3m.push_back(&PARAM_FILTER_QID); + filtera3m.push_back(&PARAM_FILTER_QSC); + filtera3m.push_back(&PARAM_FILTER_COV); + filtera3m.push_back(&PARAM_FILTER_NDIFF); + filtera3m.push_back(&PARAM_V); + // filterresult filterresult.push_back(&PARAM_SUB_MAT); filterresult.push_back(&PARAM_GAP_OPEN); diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h index 0f2792a30..85cf4f9c3 100644 --- a/src/commons/Parameters.h +++ b/src/commons/Parameters.h @@ -1039,6 +1039,7 @@ class Parameters { std::vector result2profile; std::vector result2msa; std::vector result2dnamsa; + std::vector filtera3m; std::vector filterresult; std::vector convertmsa; std::vector msa2profile; diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index c323980c6..5adbbdc0c 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -27,6 +27,7 @@ set(util_source_files util/extractorfs.cpp util/orftocontig.cpp util/touchdb.cpp + util/filtera3m.cpp util/filterdb.cpp util/gff2db.cpp util/renamedbkeys.cpp diff --git a/src/util/filtera3m.cpp b/src/util/filtera3m.cpp new file mode 100644 index 000000000..3ef27fc59 --- /dev/null +++ b/src/util/filtera3m.cpp @@ -0,0 +1,104 @@ +#include "Debug.h" +#include "Util.h" +#include "FileUtil.h" +#include "KSeqWrapper.h" +#include "MsaFilter.h" +#include "MultipleAlignment.h" + +MultipleAlignment::MSAResult readMSA(const std::string &a3mPath) { + KSeqWrapper* kseq = KSeqFactory(a3mPath.c_str()); + MultipleAlignment::MSAResult msa(0, 0, 0, NULL); + msa.msaSequenceLength = 0; + msa.centerLength = 0; + msa.setSize = 0; + std::vector sequences; + std::string seq; + Debug::Progress progress; + while (kseq->ReadEntry()) { + progress.updateProgress(); + const KSeqWrapper::KSeqEntry &e = kseq->entry; + seq.reserve(e.sequence.l); + if (msa.setSize == 0) { + msa.msaSequenceLength = e.sequence.l; + msa.centerLength = e.sequence.l; + } + + for (size_t i = 0; i < e.sequence.l; i++) { + char c = e.sequence.s[i]; + if (!(c >= 'a' && c <= 'z')) { + seq.push_back(c); + } + } + sequences.push_back(seq); + seq.clear(); + msa.setSize++; + } + msa.msaSequence = new char*[msa.setSize]; + for (size_t i = 0; i < msa.setSize; i++) { + msa.msaSequence[i] = &(sequences[i])[0]; + } + delete kseq; + return msa; +} + +int filtera3m(int argc, const char **argv, const Command& command) { + Parameters &par = Parameters::getInstance(); + par.parseParameters(argc, argv, command, true, 0, 0); + + std::vector qid_str_vec = Util::split(par.qid, ","); + std::vector qid_vec; + for (size_t qid_idx = 0; qid_idx < qid_str_vec.size(); qid_idx++) { + float qid_float = strtod(qid_str_vec[qid_idx].c_str(), NULL); + qid_vec.push_back(static_cast(qid_float*100)); + } + std::sort(qid_vec.begin(), qid_vec.end()); + + MultipleAlignment::MSAResult msa = readMSA(par.db1.c_str()); + FILE* handle = FileUtil::openFileOrDie(par.db2.c_str(), "w", false); + + SubstitutionMatrix subMat(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0f, -0.2f); + MsaFilter filter(msa.centerLength + 1, msa.setSize, &subMat, par.gapOpen.values.aminoacid(), par.gapExtend.values.aminoacid()); + filter.filter( + msa.setSize, + msa.centerLength, + static_cast(par.covMSAThr * 100), + qid_vec, + par.qsc, + static_cast(par.filterMaxSeqId * 100), + par.Ndiff, + par.filterMinEnable, + (const char **) msa.msaSequence, + false + ); + bool* kept = new bool[msa.setSize]; + filter.getKept(kept, msa.setSize); + + KSeqWrapper* kseq2 = KSeqFactory(par.db1.c_str()); + size_t i = 0; + while (kseq2->ReadEntry()) { + if (kept[i] == false) { + i++; + continue; + } + fwrite(">", sizeof(char), 1, handle); + fwrite(kseq2->entry.name.s, sizeof(char), kseq2->entry.name.l, handle); + if (kseq2->entry.comment.l > 0) { + fwrite(" ", sizeof(char), 1, handle); + fwrite(kseq2->entry.comment.s, sizeof(char), kseq2->entry.comment.l, handle); + } + fwrite("\n", sizeof(char), 1, handle); + fwrite(kseq2->entry.sequence.s, sizeof(char), kseq2->entry.sequence.l, handle); + fwrite("\n", sizeof(char), 1, handle); + i++; + } + if (fclose(handle) != 0) { + Debug(Debug::ERROR) << "Cannot close file " << par.db2 << "\n"; + return EXIT_FAILURE; + } + + delete kseq2; + delete[] kept; + delete[] msa.msaSequence; + + return EXIT_SUCCESS; +}