Skip to content

Commit

Permalink
Allow to use SequenceLookup in ungappedprefilter
Browse files Browse the repository at this point in the history
  • Loading branch information
martin-steinegger committed Apr 29, 2023
1 parent 8a89305 commit 7b0a47e
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 68 deletions.
2 changes: 1 addition & 1 deletion src/commons/IndexReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,9 +118,9 @@ class IndexReader {
}

DBReader<unsigned int> *sequenceReader;
DBReader<unsigned int> *index;

private:
DBReader<unsigned int> *index;
int seqType;
};

Expand Down
115 changes: 48 additions & 67 deletions src/prefiltering/ungappedprefilter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,28 +14,44 @@
#include "NucleotideMatrix.h"
#include "FastSort.h"
#include "SubstitutionMatrixProfileStates.h"
#include "IndexReader.h"

#ifdef OPENMP
#include <omp.h>
#endif


int doRescorealldiagonal(Parameters &par, DBReader<unsigned int> &qdbr, DBWriter &resultWriter, size_t dbStart, size_t dbSize) {
int querySeqType = qdbr.getDbtype();
DBReader<unsigned int> *tdbr = NULL;
bool sameDB = false;
if (par.db1.compare(par.db2) == 0) {
sameDB = true;
tdbr = &qdbr;
int ungappedprefilter(int argc, const char **argv, const Command &command) {
Parameters &par = Parameters::getInstance();
par.parseParameters(argc, argv, command, true, 0, 0);
DBWriter resultWriter(par.db3.c_str(), par.db3Index.c_str(), 1, par.compressed, Parameters::DBTYPE_PREFILTER_RES);
resultWriter.open();
bool sameDB = (par.db2.compare(par.db1) == 0);
bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP);
IndexReader tDbrIdx(par.db2, par.threads, IndexReader::SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0 );
IndexReader * qDbrIdx = NULL;
DBReader<unsigned int> * qdbr = NULL;
DBReader<unsigned int> * tdbr = tDbrIdx.sequenceReader;
const int targetSeqType = tdbr->getDbtype();
int querySeqType;
if (sameDB == true) {
qDbrIdx = &tDbrIdx;
qdbr = tdbr;
querySeqType = targetSeqType;
} else {
tdbr = new DBReader<unsigned int>(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX);
tdbr->open(DBReader<unsigned int>::NOSORT);
if (par.preloadMode != Parameters::PRELOAD_MODE_MMAP) {
tdbr->readMmapedDataInMemory();
}
// open the sequence, prefiltering and output databases
qDbrIdx = new IndexReader(par.db1, par.threads, IndexReader::SEQUENCES, (touch) ? IndexReader::PRELOAD_INDEX : 0);
qdbr = qDbrIdx->sequenceReader;
querySeqType = qdbr->getDbtype();
}
const int targetSeqType = tdbr->getDbtype();

SequenceLookup * sequenceLookup = NULL;
if(Parameters::isEqualDbtype(tDbrIdx.getDbtype(), Parameters::DBTYPE_INDEX_DB)){
PrefilteringIndexData data = PrefilteringIndexReader::getMetadata(tDbrIdx.index);
if(data.splits == 1){
sequenceLookup = PrefilteringIndexReader::getSequenceLookup(0, tDbrIdx.index, par.preloadMode);
}
}
BaseMatrix *subMat;
EvalueComputation * evaluer;
int8_t * tinySubMat;
Expand All @@ -61,8 +77,9 @@ int doRescorealldiagonal(Parameters &par, DBReader<unsigned int> &qdbr, DBWriter
}


Debug::Progress progress(dbSize);
Debug::Progress progress(qdbr->getSize());
std::vector<hit_t> shortResults;
shortResults.reserve(tdbr->getSize()/2);

#pragma omp parallel
{
Expand All @@ -79,12 +96,10 @@ int doRescorealldiagonal(Parameters &par, DBReader<unsigned int> &qdbr, DBWriter

std::string resultBuffer;
resultBuffer.reserve(262144);
for (size_t id = dbStart; id < (dbStart+dbSize); id++) {
#pragma omp master
progress.updateProgress();
char *querySeqData = qdbr.getData(id, thread_idx);
size_t queryKey = qdbr.getDbKey(id);
unsigned int querySeqLen = qdbr.getSeqLen(id);
for (size_t id = 0; id < qdbr->getSize(); id++) {
char *querySeqData = qdbr->getData(id, thread_idx);
size_t queryKey = qdbr->getDbKey(id);
unsigned int querySeqLen = qdbr->getSeqLen(id);

qSeq.mapSequence(id, queryKey, querySeqData, querySeqLen);
// qSeq.printProfileStatePSSM();
Expand All @@ -97,10 +112,13 @@ int doRescorealldiagonal(Parameters &par, DBReader<unsigned int> &qdbr, DBWriter
for (size_t tId = 0; tId < tdbr->getSize(); tId++) {
unsigned int targetKey = tdbr->getDbKey(tId);
const bool isIdentity = (queryKey == targetKey && (par.includeIdentity || sameDB))? true : false;
char * targetSeq = tdbr->getData(tId, thread_idx);
unsigned int targetSeqLen = tdbr->getSeqLen(tId);
tSeq.mapSequence(tId, targetKey, targetSeq, targetSeqLen);
// tSeq.print();
if(sequenceLookup == NULL){
char * targetSeq = tdbr->getData(tId, thread_idx);
unsigned int targetSeqLen = tdbr->getSeqLen(tId);
tSeq.mapSequence(tId, targetKey, targetSeq, targetSeqLen);
}else{
tSeq.mapSequence(tId, targetKey, sequenceLookup->getSequence(tId));
}
float queryLength = qSeq.L;
float targetLength = tSeq.L;
if(Util::canBeCovered(par.covThr, par.covMode, queryLength, targetLength)==false){
Expand Down Expand Up @@ -142,60 +160,23 @@ int doRescorealldiagonal(Parameters &par, DBReader<unsigned int> &qdbr, DBWriter
resultWriter.writeData(resultBuffer.c_str(), resultBuffer.length(), queryKey, 0);
resultBuffer.clear();
shortResults.clear();
progress.updateProgress();
}
}
}

qdbr.close();
if (sameDB == false) {
tdbr->close();
delete tdbr;


if(sameDB == false){
delete qDbrIdx;
}

delete [] tinySubMat;
delete subMat;
delete evaluer;
return 0;
}

int ungappedprefilter(int argc, const char **argv, const Command &command) {
MMseqsMPI::init(argc, argv);
Parameters &par = Parameters::getInstance();
par.parseParameters(argc, argv, command, true, 0, 0);
DBReader<unsigned int> qdbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX);
qdbr.open(DBReader<unsigned int>::NOSORT);
if (par.preloadMode != Parameters::PRELOAD_MODE_MMAP) {
qdbr.readMmapedDataInMemory();
}


#ifdef HAVE_MPI
size_t dbFrom = 0;
size_t dbSize = 0;

qdbr.decomposeDomainByAminoAcid(MMseqsMPI::rank, MMseqsMPI::numProc, &dbFrom, &dbSize);
std::pair<std::string, std::string> tmpOutput = Util::createTmpFileNames(par.db3, par.db3Index, MMseqsMPI::rank);
DBWriter resultWriter(tmpOutput.first.c_str(), tmpOutput.second.c_str(), 1 par.compressed, Parameters::DBTYPE_PREFILTER_RES);
resultWriter.open();
int status = doRescorealldiagonal(par, qdbr, resultWriter, dbFrom, dbSize);
resultWriter.close();

MPI_Barrier(MPI_COMM_WORLD);
if(MMseqsMPI::rank == 0) {
std::vector<std::pair<std::string, std::string>> splitFiles;
for(int proc = 0; proc < MMseqsMPI::numProc; ++proc){
std::pair<std::string, std::string> tmpFile = Util::createTmpFileNames(par.db3, par.db3Index, proc);
splitFiles.push_back(std::make_pair(tmpFile.first, tmpFile.second));
}
DBWriter::mergeResults(par.db3, par.db3Index, splitFiles);
}
#else
DBWriter resultWriter(par.db3.c_str(), par.db3Index.c_str(), 1, par.compressed, Parameters::DBTYPE_PREFILTER_RES);
resultWriter.open();
int status = doRescorealldiagonal(par, qdbr, resultWriter, 0, qdbr.getSize());
resultWriter.close();
#endif
return status;
return 0;
}


0 comments on commit 7b0a47e

Please sign in to comment.