Add appenddbtoindex to augment a precomputed index in sub-projects

soedinglab · Dec 25, 2021 · 75af0c8 · 75af0c8
1 parent 4f046dd
commit 75af0c8
Show file tree

Hide file tree

Showing 7 changed files with 170 additions and 3 deletions.
diff --git a/src/CommandDeclarations.h b/src/CommandDeclarations.h
@@ -5,6 +5,7 @@
 extern int align(int argc, const char **argv, const Command& command);
 extern int alignall(int argc, const char **argv, const Command& command);
 extern int alignbykmer(int argc, const char **argv, const Command& command);
+extern int appenddbtoindex(int argc, const char **argv, const Command& command);
 extern int apply(int argc, const char **argv, const Command& command);
 extern int besthitperset(int argc, const char **argv, const Command &command);
 extern int transitivealign(int argc, const char **argv, const Command &command);

diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp
@@ -128,7 +128,14 @@ std::vector<Command> baseCommands = {
                 "<i:fastaFile1[.gz|.bz2]> ... <i:fastaFileN[.gz|.bz2]>|<i:stdin> <o:sequenceDB>",
                 CITATION_MMSEQS2, {{"fast[a|q]File[.gz|bz2]|stdin", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfileStdinAndGeneric },
                                                            {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile }}},
-        {"indexdb",              indexdb,              &par.indexdb,              COMMAND_HIDDEN,
+        {"appenddbtoindex",      appenddbtoindex,      &par.appenddbtoindex,      COMMAND_HIDDEN,
+                NULL,
+                NULL,
+                "Milot Mirdita <[email protected]>",
+                "<i:DB1> ... <i:DBN> <o:DB>",
+                CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::allDb },
+                                   {"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
+       {"indexdb",               indexdb,              &par.indexdb,              COMMAND_HIDDEN,
                 NULL,
                 NULL,
                 "Martin Steinegger <[email protected]>",

diff --git a/src/commons/DBWriter.h b/src/commons/DBWriter.h
@@ -64,6 +64,8 @@ class DBWriter : public MemoryTracker  {
     bool isClosed(){
         return closed;
     }
+
+    static void sortIndex(const char *inFileNameIndex, const char *outFileNameIndex, const bool lexicographicOrder);
 private:
     size_t addToThreadBuffer(const void *data, size_t itmesize, size_t nitems, int threadIdx);
     void writeThreadBuffer(unsigned int idx, size_t dataSize);
@@ -77,8 +79,6 @@ class DBWriter : public MemoryTracker  {
 
     static void mergeIndex(const char** indexFilenames, unsigned int fileCount, const std::vector<size_t> &dataSizes);
 
-    static void sortIndex(const char *inFileNameIndex, const char *outFileNameIndex, const bool lexicographicOrder);
-
     char* dataFileName;
     char* indexFileName;
 

diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp
@@ -1301,6 +1301,10 @@ Parameters::Parameters():
     tar2db.push_back(&PARAM_THREADS);
     tar2db.push_back(&PARAM_V);
 
+    // appenddbtoindex
+    appenddbtoindex.push_back(&PARAM_ID_LIST);
+    appenddbtoindex.push_back(&PARAM_V);
+
     //checkSaneEnvironment();
     setDefaults();
 }

diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h
@@ -1110,6 +1110,7 @@ class Parameters {
     std::vector<MMseqsParameter*> enrichworkflow;
     std::vector<MMseqsParameter*> databases;
     std::vector<MMseqsParameter*> tar2db;
+    std::vector<MMseqsParameter*> appenddbtoindex;
 
     std::vector<MMseqsParameter*> combineList(const std::vector<MMseqsParameter*> &par1,
                                              const std::vector<MMseqsParameter*> &par2);

diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt
@@ -1,6 +1,7 @@
 set(util_source_files
         util/alignall.cpp
         util/alignbykmer.cpp
+        util/appenddbtoindex.cpp
         util/apply.cpp
         util/clusthash.cpp
         util/compress.cpp

diff --git a/src/util/appenddbtoindex.cpp b/src/util/appenddbtoindex.cpp
@@ -0,0 +1,153 @@
+#include "Parameters.h"
+#include "DBReader.h"
+#include "DBWriter.h"
+#include "Debug.h"
+#include "Util.h"
+
+// Writes `size` bytes from `data` to `handle`. On a short write it logs
+// "Cannot write to <kind> file <fileName>" and terminates the process via EXIT.
+static void appenddbWriteOrDie(const void *data, size_t size, FILE *handle, const char *kind, const std::string &fileName) {
+    const size_t written = fwrite(data, sizeof(char), size, handle);
+    if (written != size) {
+        Debug(Debug::ERROR) << "Cannot write to " << kind << " file " << fileName << "\n";
+        EXIT(EXIT_FAILURE);
+    }
+}
+
+// Appends whole databases as entries to an existing database.
+//
+// Positional arguments: <i:DB1> ... <i:DBN> <o:DB>. The last filename is the
+// database that is extended; par.idList holds one comma-separated key per
+// input database. Every input database occupies two consecutive keys in the
+// output database:
+//   key     -> the raw bytes of the input's .index file, followed by a NUL byte
+//   key + 1 -> the concatenated input data files, followed by a NUL byte
+//
+// Returns EXIT_SUCCESS on success and EXIT_FAILURE if the keys are malformed,
+// do not match the number of input databases, collide with each other, or
+// already exist in the output database. I/O failures terminate via EXIT.
+int appenddbtoindex(int argc, const char **argv, const Command &command) {
+    Parameters &par = Parameters::getInstance();
+    par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0);
+
+    // the last positional argument is the database that gets extended
+    std::string outDb = par.filenames.back();
+    par.filenames.pop_back();
+
+    // read in database keys for the new database entries and validate that we have enough
+    std::vector<unsigned int> keys;
+    {
+        std::vector<std::string> ids = Util::split(par.idList, ",");
+        keys.reserve(ids.size());
+        for (size_t i = 0; i < ids.size(); ++i) {
+            const char *start = ids[i].c_str();
+            char *rest = NULL;
+            // strtoul only sets errno on failure, so clear any stale value first
+            errno = 0;
+            const unsigned long parsed = strtoul(start, &rest, 10);
+            const bool noDigits = (rest == start);
+            const bool trailingGarbage = (*rest != '\0');
+            // key + 1 is used as well, so the key itself has to stay below UINT_MAX
+            const bool outOfRange = (errno == ERANGE) || (parsed >= UINT_MAX);
+            if (noDigits || trailingGarbage || outOfRange) {
+                Debug(Debug::ERROR) << "Could not read key " << ids[i] << "\n";
+                return EXIT_FAILURE;
+            }
+            keys.emplace_back(static_cast<unsigned int>(parsed));
+        }
+        if (keys.size() != par.filenames.size()) {
+            Debug(Debug::ERROR) << "Same number of databases and keys are needed\n";
+            return EXIT_FAILURE;
+        }
+        // fail early if duplicates are found; adjacent keys collide too,
+        // because every database also claims key + 1
+        std::vector<unsigned int> check(keys.begin(), keys.end());
+        std::sort(check.begin(), check.end());
+        for (size_t i = 1; i < check.size(); ++i) {
+            if (check[i - 1] == check[i] || (check[i - 1] + 1) == check[i]) {
+                Debug(Debug::ERROR) << "Duplicate ID given. Each database takes two consecutive IDs.\n";
+                return EXIT_FAILURE;
+            }
+        }
+    }
+
+    std::string outIndexName = outDb + ".index";
+    size_t offset = 0;
+    {
+        DBReader<unsigned int> outReader(outDb.c_str(), outIndexName.c_str(), 1, DBReader<unsigned int>::USE_DATA | DBReader<unsigned int>::USE_INDEX);
+        outReader.open(DBReader<unsigned int>::NOSORT);
+        // validate that given keys don't exist already
+        for (size_t i = 0; i < keys.size(); ++i) {
+            if (outReader.getId(keys[i]) != UINT_MAX) {
+                Debug(Debug::ERROR) << "Key " << keys[i] << " already exists in database\n";
+                outReader.close();
+                return EXIT_FAILURE;
+            }
+            if (outReader.getId(keys[i]+1) != UINT_MAX) {
+                Debug(Debug::ERROR) << "Key " << (keys[i]+1) << " already exists in database\n";
+                outReader.close();
+                return EXIT_FAILURE;
+            }
+        }
+        // new entries start right after the existing data
+        offset = outReader.getTotalDataSize();
+        outReader.close();
+    }
+
+    // if we have a split database, make one new split where we append the new files
+    // (opened only after all validation passed, so a rejected call neither leaks
+    // the handle nor leaves an empty split file behind)
+    FILE* outDataHandle = NULL;
+    {
+        std::string checkName = outDb + ".0";
+        size_t cnt = 0;
+        while (FileUtil::fileExists(checkName.c_str()) == true) {
+            cnt++;
+            checkName = outDb + "." + SSTR(cnt);
+        }
+        if (cnt == 0) {
+            outDataHandle = FileUtil::openFileOrDie(outDb.c_str(), "ab", true);
+        } else {
+            outDataHandle = FileUtil::openFileOrDie(checkName.c_str(), "wb", false);
+        }
+    }
+
+    const char nullbyte = '\0';
+    char buffer[8192];
+    FILE* outIndexHandle = FileUtil::openFileOrDie(outIndexName.c_str(), "a", true);
+    for (size_t i = 0; i < par.filenames.size(); ++i) {
+        const unsigned int key = keys[i];
+        const std::string& inDb = par.filenames[i];
+        const std::string inIndexName = inDb + ".index";
+
+        // first entry (key): the input's index file copied verbatim as entry data
+        size_t inSize = 0;
+        FILE* inIndexHandle = FileUtil::openFileOrDie(inIndexName.c_str(), "r", true);
+        size_t indexSize = 0;
+        while ((indexSize = fread(buffer, 1, sizeof(buffer), inIndexHandle)) > 0) {
+            appenddbWriteOrDie(buffer, indexSize, outDataHandle, "data", outDb);
+            inSize += indexSize;
+        }
+        // fread returns 0 on both EOF and error; make sure it was EOF
+        if (ferror(inIndexHandle)) {
+            Debug(Debug::ERROR) << "Cannot read index file " << inIndexName << "\n";
+            EXIT(EXIT_FAILURE);
+        }
+        fclose(inIndexHandle);
+        appenddbWriteOrDie(&nullbyte, 1, outDataHandle, "data", outDb);
+        inSize += 1;
+        // buffer is reused to hold the formatted index line
+        size_t len = DBWriter::indexToBuffer(buffer, key, offset, inSize);
+        appenddbWriteOrDie(buffer, len, outIndexHandle, "index", outIndexName);
+        offset += inSize;
+
+        // second entry (key + 1): all data files of the input database, back to back
+        DBReader<unsigned int> reader(inDb.c_str(), inIndexName.c_str(), 1, DBReader<unsigned int>::USE_DATA | DBReader<unsigned int>::USE_INDEX);
+        reader.open(DBReader<unsigned int>::HARDNOSORT);
+        inSize = reader.getTotalDataSize();
+        for (size_t idx = 0; idx < reader.getDataFileCnt(); idx++) {
+            char* data = reader.getDataForFile(idx);
+            size_t size = reader.getDataSizeForFile(idx);
+            appenddbWriteOrDie(data, size, outDataHandle, "data", outDb);
+        }
+        reader.close();
+
+        appenddbWriteOrDie(&nullbyte, 1, outDataHandle, "data", outDb);
+        inSize += 1;
+        len = DBWriter::indexToBuffer(buffer, key + 1, offset, inSize);
+        appenddbWriteOrDie(buffer, len, outIndexHandle, "index", outIndexName);
+        offset += inSize;
+    }
+
+    if (fclose(outDataHandle) != 0) {
+        Debug(Debug::ERROR) << "Cannot close data file " << outDb << "\n";
+        EXIT(EXIT_FAILURE);
+    }
+    if (fclose(outIndexHandle) != 0) {
+        Debug(Debug::ERROR) << "Cannot close index file " << outIndexName << "\n";
+        EXIT(EXIT_FAILURE);
+    }
+
+    // the new lines were appended at the end; sort the index in place
+    DBWriter::sortIndex(outIndexName.c_str(), outIndexName.c_str(), false);
+
+    return EXIT_SUCCESS;
+}