From 314c1f0c2a2ea2f1156f2646bda5fbd34eff39fc Mon Sep 17 00:00:00 2001
From: Milot Mirdita <milot@mirdita.de>
Date: Wed, 2 Nov 2022 17:34:06 +0900
Subject: [PATCH] Add parameter --index-subset to create specialized subsets of
 precomputed indices

---
 src/commons/Parameters.cpp                   |  3 +++
 src/commons/Parameters.h                     |  7 ++++--
 src/prefiltering/PrefilteringIndexReader.cpp |  9 ++++---
 src/prefiltering/PrefilteringIndexReader.h   |  3 ++-
 src/util/indexdb.cpp                         | 26 ++++++++++++--------
 5 files changed, 32 insertions(+), 16 deletions(-)
diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp
index d7c73df44..45baee13f 100644
--- a/src/commons/Parameters.cpp
+++ b/src/commons/Parameters.cpp
@@ -184,6 +184,7 @@ Parameters::Parameters():
         // indexdb
         PARAM_CHECK_COMPATIBLE(PARAM_CHECK_COMPATIBLE_ID, "--check-compatible", "Check compatible", "0: Always recreate index, 1: Check if recreating index is needed, 2: Fail if index is incompatible", typeid(int), (void *) &checkCompatible, "^[0-2]{1}$", MMseqsParameter::COMMAND_MISC),
         PARAM_SEARCH_TYPE(PARAM_SEARCH_TYPE_ID, "--search-type", "Search type", "Search type 0: auto 1: amino acid, 2: translated, 3: nucleotide, 4: translated nucleotide alignment", typeid(int), (void *) &searchType, "^[0-4]{1}"),
+        PARAM_INDEX_SUBSET(PARAM_INDEX_SUBSET_ID, "--index-subset", "Index subset", "Create specialized index with subset of entries 0: normal index 1: index without headers 2: index without prefiltering data", typeid(int), (void *) &indexSubset, "^[0-2]{1}", MMseqsParameter::COMMAND_EXPERT),
         // createdb
         PARAM_USE_HEADER(PARAM_USE_HEADER_ID, "--use-fasta-header", "Use fasta header", "Use the id parsed from the fasta header as the index key instead of using incrementing numeric identifiers", typeid(bool), (void *) &useHeader, ""),
         PARAM_ID_OFFSET(PARAM_ID_OFFSET_ID, "--id-offset", "Offset of numeric ids", "Numeric ids in index file are offset by this value", typeid(int), (void *) &identifierOffset, "^(0|[1-9]{1}[0-9]*)$"),
@@ -725,6 +726,7 @@ Parameters::Parameters():
     indexdb.push_back(&PARAM_SEARCH_TYPE);
     indexdb.push_back(&PARAM_SPLIT);
     indexdb.push_back(&PARAM_SPLIT_MEMORY_LIMIT);
+    indexdb.push_back(&PARAM_INDEX_SUBSET);
     indexdb.push_back(&PARAM_V);
     indexdb.push_back(&PARAM_THREADS);
 
@@ -2275,6 +2277,7 @@ void Parameters::setDefaults() {
     // indexdb
     checkCompatible = 0;
     searchType = SEARCH_TYPE_AUTO;
+    indexSubset = INDEX_SUBSET_NORMAL;
 
     // createdb
     createdbMode = SEQUENCE_SPLIT_MODE_HARD;
diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h
index 74aeedd0c..ba1e74ae8 100644
--- a/src/commons/Parameters.h
+++ b/src/commons/Parameters.h
@@ -182,8 +182,9 @@ class Parameters {
     static const int OUTFMT_TORFEND = 38;
     static const int OUTFMT_FIDENT = 39;
 
-
-
+    static const int INDEX_SUBSET_NORMAL = 0;
+    static const int INDEX_SUBSET_NO_HEADERS = 1;
+    static const int INDEX_SUBSET_NO_PREFILTER = 2;
 
     static std::vector<int> getOutputFormat(int formatMode, const std::string &outformat, bool &needSequences, bool &needBacktrace, bool &needFullHeaders,
                                             bool &needLookup, bool &needSource, bool &needTaxonomyMapping, bool &needTaxonomy);
@@ -529,6 +530,7 @@ class Parameters {
     // indexdb
     int checkCompatible;
     int searchType;
+    int indexSubset;
 
     // createdb
     int identifierOffset;
@@ -861,6 +863,7 @@ class Parameters {
     // indexdb
     PARAMETER(PARAM_CHECK_COMPATIBLE)
     PARAMETER(PARAM_SEARCH_TYPE)
+    PARAMETER(PARAM_INDEX_SUBSET)
 
     // createdb
     PARAMETER(PARAM_USE_HEADER) // also used by extractorfs
diff --git a/src/prefiltering/PrefilteringIndexReader.cpp b/src/prefiltering/PrefilteringIndexReader.cpp
index 54d36b4f8..3116a925a 100644
--- a/src/prefiltering/PrefilteringIndexReader.cpp
+++ b/src/prefiltering/PrefilteringIndexReader.cpp
@@ -55,8 +55,8 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB,
                                               DBReader<unsigned int> *alndbr,
                                               BaseMatrix *subMat, int maxSeqLen,
                                               bool hasSpacedKmer, const std::string &spacedKmerPattern,
-                                              bool compBiasCorrection, int alphabetSize, int kmerSize,
-                                              int maskMode, int maskLowerCase, float maskProb, int kmerThr, int splits) {
+                                              bool compBiasCorrection, int alphabetSize, int kmerSize, int maskMode,
+                                              int maskLowerCase, float maskProb, int kmerThr, int splits, int indexSubset) {
 
     const int SPLIT_META = splits > 1 ? 0 : 0;
     const int SPLIT_SEQS = splits > 1 ? 1 : 0;
@@ -82,7 +82,7 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB,
     writer.writeData(metadataptr, sizeof(metadata), META, SPLIT_META);
     writer.alignToPageSize(SPLIT_META);
 
-    if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE) == false) {
+    if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE) == false && indexSubset != Parameters::INDEX_SUBSET_NO_PREFILTER) {
         int alphabetSize = subMat->alphabetSize;
         subMat->alphabetSize = subMat->alphabetSize-1;
         ScoreMatrix s3 = ExtendedSubstitutionMatrix::calcScoreMatrix(*subMat, 3);
@@ -210,6 +210,9 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB,
             (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_NUCLEOTIDES) || Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_AMINO_ACIDS))
                 ? alphabetSize -1: alphabetSize;
 
+    if (indexSubset == Parameters::INDEX_SUBSET_NO_PREFILTER) {
+        splits = 0;
+    }
     for (int s = 0; s < splits; s++) {
         size_t dbFrom = 0;
         size_t dbSize = 0;
diff --git a/src/prefiltering/PrefilteringIndexReader.h b/src/prefiltering/PrefilteringIndexReader.h
index 82d7de1c3..a77f4546b 100644
--- a/src/prefiltering/PrefilteringIndexReader.h
+++ b/src/prefiltering/PrefilteringIndexReader.h
@@ -59,7 +59,8 @@ class PrefilteringIndexReader {
                                 DBReader<unsigned int> *hdbr1, DBReader<unsigned int> *hdbr2,
                                 DBReader<unsigned int> *alndbr,
                                 BaseMatrix *seedSubMat, int maxSeqLen, bool spacedKmer, const std::string &spacedKmerPattern,
-                                bool compBiasCorrection, int alphabetSize, int kmerSize, int maskMode, int maskLowerCase, float maskProb, int kmerThr, int splits);
+                                bool compBiasCorrection, int alphabetSize, int kmerSize, int maskMode,
+                                int maskLowerCase, float maskProb, int kmerThr, int splits, int indexSubset = 0);
 
     static DBReader<unsigned int> *openNewHeaderReader(DBReader<unsigned int>*dbr, unsigned int dataIdx, unsigned int indexIdx, int threads, bool touchIndex, bool touchData);
 
diff --git a/src/util/indexdb.cpp b/src/util/indexdb.cpp
index b603f8cd2..7d6c6cff0 100644
--- a/src/util/indexdb.cpp
+++ b/src/util/indexdb.cpp
@@ -136,11 +136,14 @@ int indexdb(int argc, const char **argv, const Command &command) {
     }
 
     if (recreate) {
-        DBReader<unsigned int> hdbr1(hdr1.c_str(), hdr1Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA);
-        hdbr1.open(DBReader<unsigned int>::NOSORT);
+        DBReader<unsigned int> *hdbr1 = NULL;
+        if (par.indexSubset != Parameters::INDEX_SUBSET_NO_HEADERS) {
+            hdbr1 = new DBReader<unsigned int>(hdr1.c_str(), hdr1Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA);
+            hdbr1->open(DBReader<unsigned int>::NOSORT);
+        }
 
         DBReader<unsigned int> *hdbr2 = NULL;
-        if (sameDB == false && ppDB == false) {
+        if (sameDB == false && ppDB == false && par.indexSubset != Parameters::INDEX_SUBSET_NO_HEADERS) {
             hdbr2 = new DBReader<unsigned int>(par.hdr2.c_str(), par.hdr2Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA);
             hdbr2->open(DBReader<unsigned int>::NOSORT);
         }
@@ -152,22 +155,25 @@ int indexdb(int argc, const char **argv, const Command &command) {
         }
 
         DBReader<unsigned int>::removeDb(indexDB);
-        PrefilteringIndexReader::createIndexFile(indexDB, &dbr, dbr2, &hdbr1, hdbr2, alndbr, seedSubMat, par.maxSeqLen,
+        PrefilteringIndexReader::createIndexFile(indexDB, &dbr, dbr2, hdbr1, hdbr2, alndbr, seedSubMat, par.maxSeqLen,
                                                  par.spacedKmer, par.spacedKmerPattern, par.compBiasCorrection,
                                                  seedSubMat->alphabetSize, par.kmerSize, par.maskMode, par.maskLowerCaseMode,
-                                                 par.maskProb, kmerScore, par.split);
+                                                 par.maskProb, kmerScore, par.split, par.indexSubset);
+
+        if (alndbr != NULL) {
+            alndbr->close();
+            delete alndbr;
+        }
 
         if (hdbr2 != NULL) {
             hdbr2->close();
             delete hdbr2;
         }
 
-        if (alndbr != NULL) {
-            alndbr->close();
-            delete alndbr;
+        if (hdbr1 != NULL) {
+            hdbr1->close();
+            delete hdbr1;
         }
-
-        hdbr1.close();
     }
 
     if (dbr2 != NULL) {