From 314c1f0c2a2ea2f1156f2646bda5fbd34eff39fc Mon Sep 17 00:00:00 2001 From: Milot Mirdita Date: Wed, 2 Nov 2022 17:34:06 +0900 Subject: [PATCH] Add parameter --index-subset to create specialized subsets of precomputed indices --- src/commons/Parameters.cpp | 3 +++ src/commons/Parameters.h | 7 ++++-- src/prefiltering/PrefilteringIndexReader.cpp | 9 ++++--- src/prefiltering/PrefilteringIndexReader.h | 3 ++- src/util/indexdb.cpp | 26 ++++++++++++-------- 5 files changed, 32 insertions(+), 16 deletions(-) diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index d7c73df44..45baee13f 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -184,6 +184,7 @@ Parameters::Parameters(): // indexdb PARAM_CHECK_COMPATIBLE(PARAM_CHECK_COMPATIBLE_ID, "--check-compatible", "Check compatible", "0: Always recreate index, 1: Check if recreating index is needed, 2: Fail if index is incompatible", typeid(int), (void *) &checkCompatible, "^[0-2]{1}$", MMseqsParameter::COMMAND_MISC), PARAM_SEARCH_TYPE(PARAM_SEARCH_TYPE_ID, "--search-type", "Search type", "Search type 0: auto 1: amino acid, 2: translated, 3: nucleotide, 4: translated nucleotide alignment", typeid(int), (void *) &searchType, "^[0-4]{1}"), + PARAM_INDEX_SUBSET(PARAM_INDEX_SUBSET_ID, "--index-subset", "Index subset", "Create specialized index with subset of entries 0: normal index 1: index without headers 2: index without prefiltering data", typeid(int), (void *) &indexSubset, "^[0-2]{1}", MMseqsParameter::COMMAND_EXPERT), // createdb PARAM_USE_HEADER(PARAM_USE_HEADER_ID, "--use-fasta-header", "Use fasta header", "Use the id parsed from the fasta header as the index key instead of using incrementing numeric identifiers", typeid(bool), (void *) &useHeader, ""), PARAM_ID_OFFSET(PARAM_ID_OFFSET_ID, "--id-offset", "Offset of numeric ids", "Numeric ids in index file are offset by this value", typeid(int), (void *) &identifierOffset, "^(0|[1-9]{1}[0-9]*)$"), @@ -725,6 +726,7 @@ 
Parameters::Parameters(): indexdb.push_back(&PARAM_SEARCH_TYPE); indexdb.push_back(&PARAM_SPLIT); indexdb.push_back(&PARAM_SPLIT_MEMORY_LIMIT); + indexdb.push_back(&PARAM_INDEX_SUBSET); indexdb.push_back(&PARAM_V); indexdb.push_back(&PARAM_THREADS); @@ -2275,6 +2277,7 @@ void Parameters::setDefaults() { // indexdb checkCompatible = 0; searchType = SEARCH_TYPE_AUTO; + indexSubset = INDEX_SUBSET_NORMAL; // createdb createdbMode = SEQUENCE_SPLIT_MODE_HARD; diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h index 74aeedd0c..ba1e74ae8 100644 --- a/src/commons/Parameters.h +++ b/src/commons/Parameters.h @@ -182,8 +182,9 @@ class Parameters { static const int OUTFMT_TORFEND = 38; static const int OUTFMT_FIDENT = 39; - - + static const int INDEX_SUBSET_NORMAL = 0; + static const int INDEX_SUBSET_NO_HEADERS = 1; + static const int INDEX_SUBSET_NO_PREFILTER = 2; static std::vector getOutputFormat(int formatMode, const std::string &outformat, bool &needSequences, bool &needBacktrace, bool &needFullHeaders, bool &needLookup, bool &needSource, bool &needTaxonomyMapping, bool &needTaxonomy); @@ -529,6 +530,7 @@ class Parameters { // indexdb int checkCompatible; int searchType; + int indexSubset; // createdb int identifierOffset; @@ -861,6 +863,7 @@ class Parameters { // indexdb PARAMETER(PARAM_CHECK_COMPATIBLE) PARAMETER(PARAM_SEARCH_TYPE) + PARAMETER(PARAM_INDEX_SUBSET) // createdb PARAMETER(PARAM_USE_HEADER) // also used by extractorfs diff --git a/src/prefiltering/PrefilteringIndexReader.cpp b/src/prefiltering/PrefilteringIndexReader.cpp index 54d36b4f8..3116a925a 100644 --- a/src/prefiltering/PrefilteringIndexReader.cpp +++ b/src/prefiltering/PrefilteringIndexReader.cpp @@ -55,8 +55,8 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB, DBReader *alndbr, BaseMatrix *subMat, int maxSeqLen, bool hasSpacedKmer, const std::string &spacedKmerPattern, - bool compBiasCorrection, int alphabetSize, int kmerSize, - int maskMode, int maskLowerCase, 
float maskProb, int kmerThr, int splits) { + bool compBiasCorrection, int alphabetSize, int kmerSize, int maskMode, + int maskLowerCase, float maskProb, int kmerThr, int splits, int indexSubset) { const int SPLIT_META = splits > 1 ? 0 : 0; const int SPLIT_SEQS = splits > 1 ? 1 : 0; @@ -82,7 +82,7 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB, writer.writeData(metadataptr, sizeof(metadata), META, SPLIT_META); writer.alignToPageSize(SPLIT_META); - if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE) == false) { + if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE) == false && indexSubset != Parameters::INDEX_SUBSET_NO_PREFILTER) { int alphabetSize = subMat->alphabetSize; subMat->alphabetSize = subMat->alphabetSize-1; ScoreMatrix s3 = ExtendedSubstitutionMatrix::calcScoreMatrix(*subMat, 3); @@ -210,6 +210,9 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB, (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_NUCLEOTIDES) || Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_AMINO_ACIDS)) ? 
alphabetSize -1: alphabetSize; + if (indexSubset == Parameters::INDEX_SUBSET_NO_PREFILTER) { + splits = 0; + } for (int s = 0; s < splits; s++) { size_t dbFrom = 0; size_t dbSize = 0; diff --git a/src/prefiltering/PrefilteringIndexReader.h b/src/prefiltering/PrefilteringIndexReader.h index 82d7de1c3..a77f4546b 100644 --- a/src/prefiltering/PrefilteringIndexReader.h +++ b/src/prefiltering/PrefilteringIndexReader.h @@ -59,7 +59,8 @@ class PrefilteringIndexReader { DBReader *hdbr1, DBReader *hdbr2, DBReader *alndbr, BaseMatrix *seedSubMat, int maxSeqLen, bool spacedKmer, const std::string &spacedKmerPattern, - bool compBiasCorrection, int alphabetSize, int kmerSize, int maskMode, int maskLowerCase, float maskProb, int kmerThr, int splits); + bool compBiasCorrection, int alphabetSize, int kmerSize, int maskMode, + int maskLowerCase, float maskProb, int kmerThr, int splits, int indexSubset = 0); static DBReader *openNewHeaderReader(DBReader*dbr, unsigned int dataIdx, unsigned int indexIdx, int threads, bool touchIndex, bool touchData); diff --git a/src/util/indexdb.cpp b/src/util/indexdb.cpp index b603f8cd2..7d6c6cff0 100644 --- a/src/util/indexdb.cpp +++ b/src/util/indexdb.cpp @@ -136,11 +136,14 @@ int indexdb(int argc, const char **argv, const Command &command) { } if (recreate) { - DBReader hdbr1(hdr1.c_str(), hdr1Index.c_str(), par.threads, DBReader::USE_INDEX | DBReader::USE_DATA); - hdbr1.open(DBReader::NOSORT); + DBReader *hdbr1 = NULL; + if (par.indexSubset != Parameters::INDEX_SUBSET_NO_HEADERS) { + hdbr1 = new DBReader(hdr1.c_str(), hdr1Index.c_str(), par.threads, DBReader::USE_INDEX | DBReader::USE_DATA); + hdbr1->open(DBReader::NOSORT); + } DBReader *hdbr2 = NULL; - if (sameDB == false && ppDB == false) { + if (sameDB == false && ppDB == false && par.indexSubset != Parameters::INDEX_SUBSET_NO_HEADERS) { hdbr2 = new DBReader(par.hdr2.c_str(), par.hdr2Index.c_str(), par.threads, DBReader::USE_INDEX | DBReader::USE_DATA); hdbr2->open(DBReader::NOSORT); } @@ 
-152,22 +155,25 @@ int indexdb(int argc, const char **argv, const Command &command) { } DBReader::removeDb(indexDB); - PrefilteringIndexReader::createIndexFile(indexDB, &dbr, dbr2, &hdbr1, hdbr2, alndbr, seedSubMat, par.maxSeqLen, + PrefilteringIndexReader::createIndexFile(indexDB, &dbr, dbr2, hdbr1, hdbr2, alndbr, seedSubMat, par.maxSeqLen, par.spacedKmer, par.spacedKmerPattern, par.compBiasCorrection, seedSubMat->alphabetSize, par.kmerSize, par.maskMode, par.maskLowerCaseMode, - par.maskProb, kmerScore, par.split); + par.maskProb, kmerScore, par.split, par.indexSubset); + + if (alndbr != NULL) { + alndbr->close(); + delete alndbr; + } if (hdbr2 != NULL) { hdbr2->close(); delete hdbr2; } - if (alndbr != NULL) { - alndbr->close(); - delete alndbr; + if (hdbr1 != NULL) { + hdbr1->close(); + delete hdbr1; } - - hdbr1.close(); } if (dbr2 != NULL) {