Skip to content

Commit

Permalink
changed rescoring for nucleotide sequences only in prefilter
Browse files Browse the repository at this point in the history
  • Loading branch information
AnnSeidel committed Apr 14, 2021
1 parent 19064f2 commit c967985
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 16 deletions.
9 changes: 9 additions & 0 deletions src/prefiltering/CacheFriendlyOperations.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,15 @@ struct __attribute__((__packed__)) CounterResult {
unsigned int id;
unsigned short diagonal;
unsigned char count;

static bool sortById(const CounterResult &first, const CounterResult &second) {
if (first.id < second.id)
return true;
if (second.id < first.id)
return false;
return false;
}

};

template<unsigned int BINSIZE>
Expand Down
4 changes: 2 additions & 2 deletions src/prefiltering/Prefiltering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -782,7 +782,7 @@ bool Prefiltering::runSplit(const std::string &resultDB, const std::string &resu
Sequence seq(qdbr->getMaxSeqLen(), querySeqType, kmerSubMat, kmerSize, spacedKmer, aaBiasCorrection, true, spacedKmerPattern);
QueryMatcher matcher(indexTable, sequenceLookup, kmerSubMat, ungappedSubMat,
kmerThr, kmerSize, dbSize, std::max(tdbr->getMaxSeqLen(),qdbr->getMaxSeqLen()), maxResListLen, aaBiasCorrection,
diagonalScoring, minDiagScoreThr, takeOnlyBestKmer);
diagonalScoring, minDiagScoreThr, takeOnlyBestKmer, targetSeqType==Parameters::DBTYPE_NUCLEOTIDES);

if (seq.profile_matrix != NULL) {
matcher.setProfileMatrix(seq.profile_matrix);
Expand Down Expand Up @@ -818,7 +818,7 @@ bool Prefiltering::runSplit(const std::string &resultDB, const std::string &resu
}
}
// calculate prefiltering results
std::pair<hit_t *, size_t> prefResults = matcher.matchQuery(&seq, targetSeqId);
std::pair<hit_t *, size_t> prefResults = matcher.matchQuery(&seq, targetSeqId, targetSeqType==Parameters::DBTYPE_NUCLEOTIDES);
size_t resultSize = prefResults.second;
const float queryLength = static_cast<float>(qdbr->getSeqLen(id));
for (size_t i = 0; i < resultSize; i++) {
Expand Down
55 changes: 43 additions & 12 deletions src/prefiltering/QueryMatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ QueryMatcher::QueryMatcher(IndexTable *indexTable, SequenceLookup *sequenceLooku
BaseMatrix *kmerSubMat, BaseMatrix *ungappedAlignmentSubMat,
short kmerThr, int kmerSize, size_t dbSize,
unsigned int maxSeqLen, size_t maxHitsPerQuery, bool aaBiasCorrection,
bool diagonalScoring, unsigned int minDiagScoreThr, bool takeOnlyBestKmer)
: idx(indexTable->getAlphabetSize(), kmerSize)
bool diagonalScoring, unsigned int minDiagScoreThr, bool takeOnlyBestKmer, bool isNucleotide)
: idx(indexTable->getAlphabetSize(), kmerSize), isNucleotide(isNucleotide)
{
this->kmerSubMat = kmerSubMat;
this->ungappedAlignmentSubMat = ungappedAlignmentSubMat;
Expand Down Expand Up @@ -81,7 +81,7 @@ QueryMatcher::~QueryMatcher(){
delete kmerGenerator;
}

std::pair<hit_t*, size_t> QueryMatcher::matchQuery(Sequence *querySeq, unsigned int identityId) {
std::pair<hit_t*, size_t> QueryMatcher::matchQuery(Sequence *querySeq, unsigned int identityId, bool isNucleotide) {
querySeq->resetCurrPos();
// std::cout << "Id: " << querySeq->getId() << std::endl;
memset(scoreSizes, 0, SCORE_RANGE * sizeof(unsigned int));
Expand All @@ -103,32 +103,63 @@ std::pair<hit_t*, size_t> QueryMatcher::matchQuery(Sequence *querySeq, unsigned
// write diagonal scores in count value
ungappedAlignment->processQuery(querySeq, compositionBias, foundDiagonals, resultSize);
memset(scoreSizes, 0, SCORE_RANGE * sizeof(unsigned int));
updateScoreBins(foundDiagonals, resultSize);
size_t elementsCntAboveMinDiagonalThr = radixSortByScoreSize(scoreSizes, foundDiagonals + resultSize,
minDiagScoreThr, foundDiagonals, resultSize);
if (isNucleotide) {
size_t len;
// only sort the 255 bucket
for (len = 0; len < elementsCntAboveMinDiagonalThr
&& (foundDiagonals + resultSize)[len].count >= (UCHAR_MAX - ungappedAlignment->getQueryBias()); len++) { ;
}
SORT_SERIAL((foundDiagonals + resultSize), (foundDiagonals + resultSize) + len, CounterResult::sortById);
size_t prevId = UINT_MAX;//(foundDiagonals + resultSize)[0].id;
size_t max = 0;
size_t firstPos = 0;
for (size_t i = 0; i < len; i++) {
if (prevId == (foundDiagonals + resultSize)[i].id) {
unsigned int newScore = ungappedAlignment->scoreSingelSequenceByCounterResult(
(foundDiagonals + resultSize)[i]);
if (newScore > max) {
max = newScore;
(foundDiagonals + resultSize)[firstPos].diagonal = (foundDiagonals + resultSize)[i].diagonal;
}
} else {
max = i+1<len && (foundDiagonals + resultSize)[i+1].id == (foundDiagonals + resultSize)[i].id ? \
ungappedAlignment->scoreSingelSequenceByCounterResult(
(foundDiagonals + resultSize)[i]) : 0 ;
firstPos = i;
}
prevId = (foundDiagonals + resultSize)[i].id;
}
}

resultSize = keepMaxScoreElementOnly(foundDiagonals, resultSize);
unsigned int maxScoreElementsCount = keepMaxScoreElementOnly(foundDiagonals + resultSize, elementsCntAboveMinDiagonalThr);
memset(scoreSizes, 0, SCORE_RANGE * sizeof(unsigned int));
updateScoreBins(foundDiagonals + resultSize, maxScoreElementsCount);

updateScoreBins(foundDiagonals, resultSize);
unsigned int diagonalThr = computeScoreThreshold(scoreSizes, this->maxHitsPerQuery);
diagonalThr = std::max(minDiagScoreThr, diagonalThr);

// sort to not lose highest scoring hits if > 150.000 hits are searched
if(resultSize < foundDiagonalsSize / 2){
unsigned int maxDiagonalScoreThr = (UCHAR_MAX - ungappedAlignment->getQueryBias());
bool scoreIsTruncated = (diagonalThr >= maxDiagonalScoreThr) ? true : false;
size_t elementsCntAboveDiagonalThr = radixSortByScoreSize(scoreSizes, foundDiagonals + resultSize, diagonalThr, foundDiagonals, resultSize);
size_t elementsCntAboveDiagonalThr = radixSortByScoreSize(scoreSizes, foundDiagonals, diagonalThr, foundDiagonals+resultSize, maxScoreElementsCount);
if (scoreIsTruncated == true) {
memset(scoreSizes, 0, SCORE_RANGE * sizeof(unsigned int));
std::pair<size_t, unsigned int> rescoreResult = rescoreHits(querySeq, scoreSizes, foundDiagonals + resultSize, elementsCntAboveDiagonalThr, ungappedAlignment, maxDiagonalScoreThr);
std::pair<size_t, unsigned int> rescoreResult = rescoreHits(querySeq, scoreSizes, foundDiagonals, elementsCntAboveDiagonalThr, ungappedAlignment, maxDiagonalScoreThr);
size_t newResultSize = rescoreResult.first;
unsigned int maxSelfScoreMinusDiag = rescoreResult.second;
elementsCntAboveDiagonalThr = radixSortByScoreSize(scoreSizes, foundDiagonals, 0, foundDiagonals + resultSize, newResultSize);
queryResult = getResult<UNGAPPED_DIAGONAL_SCORE>(foundDiagonals, elementsCntAboveDiagonalThr, identityId, 0, ungappedAlignment, maxSelfScoreMinusDiag);
elementsCntAboveDiagonalThr = radixSortByScoreSize(scoreSizes, foundDiagonals+newResultSize, 0, foundDiagonals, newResultSize);
queryResult = getResult<UNGAPPED_DIAGONAL_SCORE>(foundDiagonals+newResultSize, elementsCntAboveDiagonalThr, identityId, 0, ungappedAlignment, maxSelfScoreMinusDiag);
}else{
queryResult = getResult<UNGAPPED_DIAGONAL_SCORE>(foundDiagonals + resultSize, elementsCntAboveDiagonalThr, identityId, diagonalThr, ungappedAlignment, false);
queryResult = getResult<UNGAPPED_DIAGONAL_SCORE>(foundDiagonals, elementsCntAboveDiagonalThr, identityId, diagonalThr, ungappedAlignment, false);
}
stats->truncated = 0;
}else{
//Debug(Debug::WARNING) << "Sequence " << querySeq->getDbKey() << " produces too many hits. Results might be truncated\n";
queryResult = getResult<UNGAPPED_DIAGONAL_SCORE>(foundDiagonals, resultSize, identityId, diagonalThr, ungappedAlignment, false);
queryResult = getResult<UNGAPPED_DIAGONAL_SCORE>(foundDiagonals + resultSize, maxScoreElementsCount, identityId, diagonalThr, ungappedAlignment, false);
stats->truncated = 1;
}
}else{
Expand Down Expand Up @@ -458,9 +489,9 @@ std::pair<size_t, unsigned int> QueryMatcher::rescoreHits(Sequence * querySeq, u
const unsigned char * query = querySeq->numSequence;
int maxSelfScore = align->scoreSingleSequence(std::make_pair(query, querySeq->L), 0,0);

maxSelfScore = std::min(maxSelfScore, USHRT_MAX);
maxSelfScore = (maxSelfScore-lowerBoundScore);
maxSelfScore = std::max(1, maxSelfScore);
maxSelfScore = std::min(maxSelfScore, USHRT_MAX);
float fltMaxSelfScore = static_cast<float>(maxSelfScore);
for (size_t i = 0; i < resultSize && results[i].count >= lowerBoundScore; i++) {
unsigned int newScore = align->scoreSingelSequenceByCounterResult(results[i]);
Expand Down
6 changes: 4 additions & 2 deletions src/prefiltering/QueryMatcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,12 @@ class QueryMatcher {
BaseMatrix *kmerSubMat, BaseMatrix *ungappedAlignmentSubMat,
short kmerThr, int kmerSize, size_t dbSize, unsigned int maxSeqLen,
size_t maxHitsPerQuery, bool aaBiasCorrection, bool diagonalScoringMode,
unsigned int minDiagScoreThr, bool takeOnlyBestKmer);
unsigned int minDiagScoreThr, bool takeOnlyBestKmer, bool isNucleotide);
~QueryMatcher();

// returns result for the sequence
// identityId is the id of the identitical sequence in the target database if there is any, UINT_MAX otherwise
std::pair<hit_t*, size_t> matchQuery(Sequence *querySeq, unsigned int identityId);
std::pair<hit_t*, size_t> matchQuery(Sequence *querySeq, unsigned int identityId, bool isNucleotide);

// set substituion matrix for KmerGenerator
void setProfileMatrix(ScoreMatrix **matrix){
Expand Down Expand Up @@ -190,6 +190,8 @@ class QueryMatcher {

Indexer idx;

bool isNucleotide;

const static size_t SCORE_RANGE = 256;

void updateScoreBins(CounterResult *result, size_t elementCount);
Expand Down

0 comments on commit c967985

Please sign in to comment.