From c967985e9254dadfcc1d2f349de697bf30803da7 Mon Sep 17 00:00:00 2001 From: AnnSeidel Date: Thu, 15 Apr 2021 00:24:55 +0200 Subject: [PATCH] changed rescoring for nucleotide sequences only in prefilter --- src/prefiltering/CacheFriendlyOperations.h | 9 ++++ src/prefiltering/Prefiltering.cpp | 4 +- src/prefiltering/QueryMatcher.cpp | 55 +++++++++++++++++----- src/prefiltering/QueryMatcher.h | 6 ++- 4 files changed, 58 insertions(+), 16 deletions(-) diff --git a/src/prefiltering/CacheFriendlyOperations.h b/src/prefiltering/CacheFriendlyOperations.h index 38d9f2ae3..ff46926e3 100644 --- a/src/prefiltering/CacheFriendlyOperations.h +++ b/src/prefiltering/CacheFriendlyOperations.h @@ -47,6 +47,15 @@ struct __attribute__((__packed__)) CounterResult { unsigned int id; unsigned short diagonal; unsigned char count; + + static bool sortById(const CounterResult &first, const CounterResult &second) { + if (first.id < second.id) + return true; + if (second.id < first.id) + return false; + return false; + } + }; template diff --git a/src/prefiltering/Prefiltering.cpp b/src/prefiltering/Prefiltering.cpp index 06d0504a7..7341fe8b4 100644 --- a/src/prefiltering/Prefiltering.cpp +++ b/src/prefiltering/Prefiltering.cpp @@ -782,7 +782,7 @@ bool Prefiltering::runSplit(const std::string &resultDB, const std::string &resu Sequence seq(qdbr->getMaxSeqLen(), querySeqType, kmerSubMat, kmerSize, spacedKmer, aaBiasCorrection, true, spacedKmerPattern); QueryMatcher matcher(indexTable, sequenceLookup, kmerSubMat, ungappedSubMat, kmerThr, kmerSize, dbSize, std::max(tdbr->getMaxSeqLen(),qdbr->getMaxSeqLen()), maxResListLen, aaBiasCorrection, - diagonalScoring, minDiagScoreThr, takeOnlyBestKmer); + diagonalScoring, minDiagScoreThr, takeOnlyBestKmer, targetSeqType==Parameters::DBTYPE_NUCLEOTIDES); if (seq.profile_matrix != NULL) { matcher.setProfileMatrix(seq.profile_matrix); @@ -818,7 +818,7 @@ bool Prefiltering::runSplit(const std::string &resultDB, const std::string &resu } } // calculate prefiltering results - std::pair prefResults = matcher.matchQuery(&seq, targetSeqId); + std::pair prefResults = matcher.matchQuery(&seq, targetSeqId, targetSeqType==Parameters::DBTYPE_NUCLEOTIDES); size_t resultSize = prefResults.second; const float queryLength = static_cast(qdbr->getSeqLen(id)); for (size_t i = 0; i < resultSize; i++) { diff --git a/src/prefiltering/QueryMatcher.cpp b/src/prefiltering/QueryMatcher.cpp index 9f690f28b..dde39d0ec 100644 --- a/src/prefiltering/QueryMatcher.cpp +++ b/src/prefiltering/QueryMatcher.cpp @@ -23,8 +23,8 @@ QueryMatcher::QueryMatcher(IndexTable *indexTable, SequenceLookup *sequenceLooku BaseMatrix *kmerSubMat, BaseMatrix *ungappedAlignmentSubMat, short kmerThr, int kmerSize, size_t dbSize, unsigned int maxSeqLen, size_t maxHitsPerQuery, bool aaBiasCorrection, - bool diagonalScoring, unsigned int minDiagScoreThr, bool takeOnlyBestKmer) - : idx(indexTable->getAlphabetSize(), kmerSize) + bool diagonalScoring, unsigned int minDiagScoreThr, bool takeOnlyBestKmer, bool isNucleotide) + : idx(indexTable->getAlphabetSize(), kmerSize), isNucleotide(isNucleotide) { this->kmerSubMat = kmerSubMat; this->ungappedAlignmentSubMat = ungappedAlignmentSubMat; @@ -81,7 +81,7 @@ QueryMatcher::~QueryMatcher(){ delete kmerGenerator; } -std::pair QueryMatcher::matchQuery(Sequence *querySeq, unsigned int identityId) { +std::pair QueryMatcher::matchQuery(Sequence *querySeq, unsigned int identityId, bool isNucleotide) { querySeq->resetCurrPos(); // std::cout << "Id: " << querySeq->getId() << std::endl; memset(scoreSizes, 0, SCORE_RANGE * sizeof(unsigned int)); @@ -103,10 +103,41 @@ std::pair QueryMatcher::matchQuery(Sequence *querySeq, unsigned // write diagonal scores in count value ungappedAlignment->processQuery(querySeq, compositionBias, foundDiagonals, resultSize); memset(scoreSizes, 0, SCORE_RANGE * sizeof(unsigned int)); + updateScoreBins(foundDiagonals, resultSize); + size_t elementsCntAboveMinDiagonalThr = radixSortByScoreSize(scoreSizes, foundDiagonals + resultSize, + minDiagScoreThr, foundDiagonals, resultSize); + if (isNucleotide) { + size_t len; + // only sort the 255 bucket + for (len = 0; len < elementsCntAboveMinDiagonalThr + && (foundDiagonals + resultSize)[len].count >= (UCHAR_MAX - ungappedAlignment->getQueryBias()); len++) { ; + } + SORT_SERIAL((foundDiagonals + resultSize), (foundDiagonals + resultSize) + len, CounterResult::sortById); + size_t prevId = UINT_MAX;//(foundDiagonals + resultSize)[0].id; + size_t max = 0; + size_t firstPos = 0; + for (size_t i = 0; i < len; i++) { + if (prevId == (foundDiagonals + resultSize)[i].id) { + unsigned int newScore = ungappedAlignment->scoreSingelSequenceByCounterResult( + (foundDiagonals + resultSize)[i]); + if (newScore > max) { + max = newScore; + (foundDiagonals + resultSize)[firstPos].diagonal = (foundDiagonals + resultSize)[i].diagonal; + } + } else { + max = i+1scoreSingelSequenceByCounterResult( + (foundDiagonals + resultSize)[i]) : 0 ; + firstPos = i; + } + prevId = (foundDiagonals + resultSize)[i].id; + } + } - resultSize = keepMaxScoreElementOnly(foundDiagonals, resultSize); + unsigned int maxScoreElementsCount = keepMaxScoreElementOnly(foundDiagonals + resultSize, elementsCntAboveMinDiagonalThr); + memset(scoreSizes, 0, SCORE_RANGE * sizeof(unsigned int)); + updateScoreBins(foundDiagonals + resultSize, maxScoreElementsCount); - updateScoreBins(foundDiagonals, resultSize); unsigned int diagonalThr = computeScoreThreshold(scoreSizes, this->maxHitsPerQuery); diagonalThr = std::max(minDiagScoreThr, diagonalThr); @@ -114,21 +145,21 @@ std::pair QueryMatcher::matchQuery(Sequence *querySeq, unsigned if(resultSize < foundDiagonalsSize / 2){ unsigned int maxDiagonalScoreThr = (UCHAR_MAX - ungappedAlignment->getQueryBias()); bool scoreIsTruncated = (diagonalThr >= maxDiagonalScoreThr) ? true : false; - size_t elementsCntAboveDiagonalThr = radixSortByScoreSize(scoreSizes, foundDiagonals + resultSize, diagonalThr, foundDiagonals, resultSize); + size_t elementsCntAboveDiagonalThr = radixSortByScoreSize(scoreSizes, foundDiagonals, diagonalThr, foundDiagonals+resultSize, maxScoreElementsCount); if (scoreIsTruncated == true) { memset(scoreSizes, 0, SCORE_RANGE * sizeof(unsigned int)); - std::pair rescoreResult = rescoreHits(querySeq, scoreSizes, foundDiagonals + resultSize, elementsCntAboveDiagonalThr, ungappedAlignment, maxDiagonalScoreThr); + std::pair rescoreResult = rescoreHits(querySeq, scoreSizes, foundDiagonals, elementsCntAboveDiagonalThr, ungappedAlignment, maxDiagonalScoreThr); size_t newResultSize = rescoreResult.first; unsigned int maxSelfScoreMinusDiag = rescoreResult.second; - elementsCntAboveDiagonalThr = radixSortByScoreSize(scoreSizes, foundDiagonals, 0, foundDiagonals + resultSize, newResultSize); - queryResult = getResult(foundDiagonals, elementsCntAboveDiagonalThr, identityId, 0, ungappedAlignment, maxSelfScoreMinusDiag); + elementsCntAboveDiagonalThr = radixSortByScoreSize(scoreSizes, foundDiagonals+newResultSize, 0, foundDiagonals, newResultSize); + queryResult = getResult(foundDiagonals+newResultSize, elementsCntAboveDiagonalThr, identityId, 0, ungappedAlignment, maxSelfScoreMinusDiag); }else{ - queryResult = getResult(foundDiagonals + resultSize, elementsCntAboveDiagonalThr, identityId, diagonalThr, ungappedAlignment, false); + queryResult = getResult(foundDiagonals, elementsCntAboveDiagonalThr, identityId, diagonalThr, ungappedAlignment, false); } stats->truncated = 0; }else{ //Debug(Debug::WARNING) << "Sequence " << querySeq->getDbKey() << " produces too many hits. Results might be truncated\n"; - queryResult = getResult(foundDiagonals, resultSize, identityId, diagonalThr, ungappedAlignment, false); + queryResult = getResult(foundDiagonals + resultSize, maxScoreElementsCount, identityId, diagonalThr, ungappedAlignment, false); stats->truncated = 1; } }else{ @@ -458,9 +489,9 @@ std::pair QueryMatcher::rescoreHits(Sequence * querySeq, u const unsigned char * query = querySeq->numSequence; int maxSelfScore = align->scoreSingleSequence(std::make_pair(query, querySeq->L), 0,0); - maxSelfScore = std::min(maxSelfScore, USHRT_MAX); maxSelfScore = (maxSelfScore-lowerBoundScore); maxSelfScore = std::max(1, maxSelfScore); + maxSelfScore = std::min(maxSelfScore, USHRT_MAX); float fltMaxSelfScore = static_cast(maxSelfScore); for (size_t i = 0; i < resultSize && results[i].count >= lowerBoundScore; i++) { unsigned int newScore = align->scoreSingelSequenceByCounterResult(results[i]); diff --git a/src/prefiltering/QueryMatcher.h b/src/prefiltering/QueryMatcher.h index bb003e240..13bc6d635 100644 --- a/src/prefiltering/QueryMatcher.h +++ b/src/prefiltering/QueryMatcher.h @@ -56,12 +56,12 @@ class QueryMatcher { BaseMatrix *kmerSubMat, BaseMatrix *ungappedAlignmentSubMat, short kmerThr, int kmerSize, size_t dbSize, unsigned int maxSeqLen, size_t maxHitsPerQuery, bool aaBiasCorrection, bool diagonalScoringMode, - unsigned int minDiagScoreThr, bool takeOnlyBestKmer); + unsigned int minDiagScoreThr, bool takeOnlyBestKmer, bool isNucleotide); ~QueryMatcher(); // returns result for the sequence // identityId is the id of the identitical sequence in the target database if there is any, UINT_MAX otherwise - std::pair matchQuery(Sequence *querySeq, unsigned int identityId); + std::pair matchQuery(Sequence *querySeq, unsigned int identityId, bool isNucleotide); // set substituion matrix for KmerGenerator void setProfileMatrix(ScoreMatrix **matrix){ @@ -190,6 +190,8 @@ class QueryMatcher { Indexer idx; + bool isNucleotide; + const static size_t SCORE_RANGE = 256; void updateScoreBins(CounterResult *result, size_t elementCount);