From 15140153511cdd4e2b9a7ec5769e4c3e460ef9ad Mon Sep 17 00:00:00 2001 From: Martin Steinegger Date: Sun, 28 May 2023 12:12:30 +0900 Subject: [PATCH] Rework pairaln to support different pairing modes. Add support for dummy sequences to result2msa --- src/alignment/MultipleAlignment.cpp | 3 +- src/commons/Parameters.cpp | 9 ++++ src/commons/Parameters.h | 16 ++++++++ src/util/pairaln.cpp | 55 ++++++++++++++++++++----- src/util/result2msa.cpp | 64 +++++++++++++++++++++++++---- 5 files changed, 128 insertions(+), 19 deletions(-) diff --git a/src/alignment/MultipleAlignment.cpp b/src/alignment/MultipleAlignment.cpp index a2d7882b0..7206f8838 100644 --- a/src/alignment/MultipleAlignment.cpp +++ b/src/alignment/MultipleAlignment.cpp @@ -105,8 +105,9 @@ void MultipleAlignment::updateGapsInSequenceSet(char **msaSequence, size_t cente unsigned int queryPos = result.qStartPos; unsigned int targetPos = result.dbStartPos; // HACK: score was 0 and sequence was rejected, so we fill in an empty gap sequence + // Needed for pairaln with dummy sequences if(targetPos == UINT_MAX) { - Debug(Debug::WARNING) << "Edge sequence " << i << " was not aligned." << "\n"; + //Debug(Debug::WARNING) << "Edge sequence " << i << " was not aligned." << "\n"; // fill up with gaps for(size_t pos = 0; pos < centerSeqSize; pos++){ edgeSeqMSA[pos] = '-'; diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index bc14ef1e8..ab6af6624 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -270,6 +270,9 @@ Parameters::Parameters(): // aggregatetax PARAM_MAJORITY(PARAM_MAJORITY_ID, "--majority", "Majority threshold", "minimal fraction of agreement among taxonomically assigned sequences of a set", typeid(float), (void *) &majorityThr, "^0(\\.[0-9]+)?|^1(\\.0+)?$"), PARAM_VOTE_MODE(PARAM_VOTE_MODE_ID, "--vote-mode", "Vote mode", "Mode of assigning weights to compute majority. 0: uniform, 1: minus log E-value, 2: score", typeid(int), (void *) &voteMode, "^[0-2]{1}$"), + // pairaln + PARAM_PAIRING_DUMMY_MODE(PARAM_PAIRING_DUMMY_MODE_ID, "--pairing-dummy-mode", "Include dummy pairing", "0: dont include, 1: include - an entry that will cause result2msa to write a gap only line", typeid(int), (void *) &pairdummymode, "^[0-1]{1}$", MMseqsParameter::COMMAND_EXPERT), + PARAM_PAIRING_MODE(PARAM_PAIRING_MODE_ID, "--pairing-mode", "Pairing mode", "0: pair maximal per species, 1: pair only if all chains are covered per species", typeid(int), (void *) &pairmode, "^[0-1]{1}$", MMseqsParameter::COMMAND_EXPERT), // taxonomyreport PARAM_REPORT_MODE(PARAM_REPORT_MODE_ID, "--report-mode", "Report mode", "Taxonomy report mode 0: Kraken 1: Krona", typeid(int), (void *) &reportMode, "^[0-1]{1}$"), // createtaxdb @@ -1191,6 +1194,8 @@ Parameters::Parameters(): expand2profile.push_back(&PARAM_V); pairaln.push_back(&PARAM_PRELOAD_MODE); + pairaln.push_back(&PARAM_PAIRING_DUMMY_MODE); + pairaln.push_back(&PARAM_PAIRING_MODE); pairaln.push_back(&PARAM_COMPRESSED); pairaln.push_back(&PARAM_THREADS); pairaln.push_back(&PARAM_V); @@ -2512,6 +2517,10 @@ void Parameters::setDefaults() { majorityThr = 0.5; voteMode = AGG_TAX_MINUS_LOG_EVAL; + // pairaln + pairdummymode = PAIRALN_DUMMY_MODE_OFF; + pairmode = PAIRALN_MODE_ALL_PER_SPECIES; + // taxonomyreport reportMode = 0; diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h index 345b3ab62..963bbe378 100644 --- a/src/commons/Parameters.h +++ b/src/commons/Parameters.h @@ -213,6 +213,14 @@ class Parameters { static const int AGG_TAX_MINUS_LOG_EVAL = 1; static const int AGG_TAX_SCORE = 2; + // pairaln dummy mode + static const int PAIRALN_DUMMY_MODE_OFF = 0; + static const int PAIRALN_DUMMY_MODE_ON = 1; + + // pairaln mode + static const int PAIRALN_MODE_ALL_PER_SPECIES = 0; + static const int PAIRALN_MODE_COVER_ALL_CHAINS = 1; + // taxonomy search strategy static const int TAXONOMY_SINGLE_SEARCH = 1; static const int TAXONOMY_2BLCA = 2; @@ -644,6 +652,10 @@ class Parameters { float majorityThr; int voteMode; + // pairaln + int pairdummymode; + int pairmode; + // taxonomyreport int reportMode; @@ -983,6 +995,10 @@ class Parameters { PARAMETER(PARAM_MAJORITY) PARAMETER(PARAM_VOTE_MODE) + // pairaln + PARAMETER(PARAM_PAIRING_DUMMY_MODE) + PARAMETER(PARAM_PAIRING_MODE) + // taxonomyreport PARAMETER(PARAM_REPORT_MODE) diff --git a/src/util/pairaln.cpp b/src/util/pairaln.cpp index d400f9dba..7f680f1cd 100644 --- a/src/util/pairaln.cpp +++ b/src/util/pairaln.cpp @@ -57,23 +57,30 @@ int pairaln(int argc, const char **argv, const Command& command) { std::vector result; result.reserve(100000); std::unordered_map findPair; + std::vector taxonToPair; std::string output; output.reserve(100000); + bool hasBacktrace = false; #pragma omp for schedule(dynamic, 1) - for (size_t fileNumber = 0; fileNumber < fileToIds.size(); fileNumber++) { + for (size_t fileNumber = 0; fileNumber < fileToIds.size(); fileNumber++) { char buffer[1024 + 32768 * 4]; findPair.clear(); + taxonToPair.clear(); progress.updateProgress(); + unsigned int minResultDbKey = UINT_MAX; // find intersection between all proteins for (size_t i = 0; i < fileToIds[fileNumber].size(); i++) { result.clear(); size_t id = fileToIds[fileNumber][i]; Matcher::readAlignmentResults(result, alnDbr.getData(id, thread_idx), true); + for (size_t resIdx = 0; resIdx < result.size(); ++resIdx) { + hasBacktrace = result[resIdx].backtrace.size() > 0; unsigned int taxon = mapping->lookup(result[resIdx].dbKey); // we don't want to introduce a new field, reuse existing unused field here result[resIdx].dbOrfStartPos = taxon; + minResultDbKey = std::min(result[resIdx].dbKey, minResultDbKey); } std::stable_sort(result.begin(), result.end(), compareByTaxId); unsigned int prevTaxon = UINT_MAX; @@ -92,6 +99,16 @@ int pairaln(int argc, const char **argv, const Command& command) { } } + // fill taxonToPair vector + std::unordered_map::iterator it; + for (it = findPair.begin(); it != findPair.end(); ++it) { + int thresholdToPair = (par.pairmode == Parameters::PAIRALN_MODE_ALL_PER_SPECIES) ? 1 : fileToIds[fileNumber].size() - 1; + if (it->second > thresholdToPair) { + taxonToPair.emplace_back(it->first); + } + } + std::sort(taxonToPair.begin(), taxonToPair.end()); + for (size_t i = 0; i < fileToIds[fileNumber].size(); i++) { result.clear(); output.clear(); @@ -103,21 +120,39 @@ int pairaln(int argc, const char **argv, const Command& command) { // we don't want to introduce a new field, reuse existing unused field here result[resIdx].dbOrfStartPos = taxon; } + // stable sort is required to assure that best hit is first per taxon std::stable_sort(result.begin(), result.end(), compareByTaxId); unsigned int prevTaxon = UINT_MAX; - for (size_t resIdx = 0; resIdx < result.size(); ++resIdx) { - unsigned int taxon = result[resIdx].dbOrfStartPos; - // found pair! - if (taxon != prevTaxon - && findPair.find(taxon) != findPair.end() - && findPair[taxon] == fileToIds[fileNumber].size()) { - bool hasBacktrace = result[resIdx].backtrace.size() > 0; - size_t len = Matcher::resultToBuffer(buffer, result[resIdx], hasBacktrace, false, false); + // iterate over taxonToPair + size_t resIdxStart = 0; + for(size_t taxonInList : taxonToPair) { + bool taxonFound = false; + for (size_t resIdx = resIdxStart; resIdx < result.size(); ++resIdx) { + unsigned int taxon = result[resIdx].dbOrfStartPos; + // check if this taxon has enough information to pair + if(taxonInList != taxon){ + continue; + } + bool bestTaxonHit = (taxon != prevTaxon); + taxonFound = true; + if(bestTaxonHit){ + size_t len = Matcher::resultToBuffer(buffer, result[resIdx], hasBacktrace, false, false); + output.append(buffer, len); + resIdxStart = resIdx + 1; + break; + } + prevTaxon = taxon; + } + if(taxonFound == false && par.pairdummymode == Parameters::PAIRALN_DUMMY_MODE_ON){ // par.addDummyPairedAlignment + // write an Matcher::result_t with UINT_MAX as dbKey + Matcher::result_t emptyResult(minResultDbKey, 1, 1, 0, 1, 0, + 0, UINT_MAX, 0, 0, UINT_MAX, 0, 0, "1M"); + size_t len = Matcher::resultToBuffer(buffer, emptyResult, hasBacktrace, false, false); output.append(buffer, len); } - prevTaxon = taxon; } + resultWriter.writeData(output.c_str(), output.length(), alnDbr.getDbKey(id), thread_idx); } } diff --git a/src/util/result2msa.cpp b/src/util/result2msa.cpp index 4048c448e..ba98663e9 100644 --- a/src/util/result2msa.cpp +++ b/src/util/result2msa.cpp @@ -324,9 +324,21 @@ int result2msa(int argc, const char **argv, const Command &command) { header = targetHeaderReader->getData(id, thread_idx); length = targetHeaderReader->getEntryLen(id) - 1; } + bool isOnlyGap = true; + for (size_t pos = 0; pos < res.centerLength; pos++) { + char aa = res.msaSequence[i][pos]; + if (aa != MultipleAlignment::GAP) { + isOnlyGap = false; + break; + } + } result.append(1, '>'); - result.append(header, length); + if(isOnlyGap) { + result.append("DUMMY\n"); + }else{ + result.append(header, length); + } // need to allow insertion in the centerSequence for (size_t pos = 0; pos < res.centerLength; pos++) { char aa = res.msaSequence[i][pos]; @@ -353,13 +365,31 @@ int result2msa(int argc, const char **argv, const Command &command) { continue; } - char *header; + const char *header; + bool isOnlyGap = true; + for (size_t pos = 0; pos < res.centerLength; pos++) { + char aa = res.msaSequence[i][pos]; + if (aa != MultipleAlignment::GAP) { + isOnlyGap = false; + break; + } + } + + if (i == 0) { - header = centerSequenceHeader; + if(isOnlyGap) { + header = "DUMMY"; + }else{ + header = centerSequenceHeader; + } } else { - unsigned int key = seqKeys[i - 1]; - size_t id = targetHeaderReader->getId(key); - header = targetHeaderReader->getData(id, thread_idx); + if(isOnlyGap) { + header = "DUMMY"; + }else { + unsigned int key = seqKeys[i - 1]; + size_t id = targetHeaderReader->getId(key); + header = targetHeaderReader->getData(id, thread_idx); + } } accession = Util::parseFastaHeader(header); @@ -386,12 +416,30 @@ int result2msa(int argc, const char **argv, const Command &command) { } result.push_back('>'); + bool isOnlyGap = true; + for (size_t pos = 0; pos < res.centerLength; pos++) { + char aa = res.msaSequence[i][pos]; + if (aa != MultipleAlignment::GAP) { + isOnlyGap = false; + break; + } + } + + if (i == 0) { - result.append(Util::parseFastaHeader(centerSequenceHeader)); + if(isOnlyGap){ + result.append("DUMMY"); + }else{ + result.append(Util::parseFastaHeader(centerSequenceHeader)); + } } else { unsigned int key = seqKeys[i - 1]; size_t id = targetHeaderReader->getId(key); - result.append(Util::parseFastaHeader(targetHeaderReader->getData(id, thread_idx))); + if(isOnlyGap){ + result.append("DUMMY"); + }else { + result.append(Util::parseFastaHeader(targetHeaderReader->getData(id, thread_idx))); + } if (par.msaFormatMode == Parameters::FORMAT_MSA_A3M_ALN_INFO) { size_t len = Matcher::resultToBuffer(buffer, alnResults[i - 1], false); char* data = buffer;