Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
martin-steinegger committed Aug 31, 2020
2 parents b7ec0e9 + f8b3f8b commit b15e95a
Show file tree
Hide file tree
Showing 17 changed files with 476 additions and 330 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ script:
if [ -e "/opt/intel/inteloneapi/setvars.sh" ]; then source /opt/intel/inteloneapi/setvars.sh; fi; \
mkdir build; cd build; \
cmake -DHAVE_MPI="$([[ -z "$MPI" ]]; echo $?)" -DENABLE_WERROR=1 -DHAVE_TESTS=1 ..; \
make -j $(nproc --all); \
make -j ${MMSEQS_NUM_THREADS:-$(nproc --all)}; \
mkdir path; \
printf '#!/bin/sh\n/usr/bin/tee "$@" | tail\n' > path/tee; \
chmod +x path/tee; \
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ MMseqs2 (Many-against-Many sequence searching) is a software suite to search and

[![BioConda Install](https://img.shields.io/conda/dn/bioconda/mmseqs2.svg?style=flag&label=BioConda%20install)](https://anaconda.org/bioconda/mmseqs2)
[![Github All Releases](https://img.shields.io/github/downloads/soedinglab/mmseqs2/total.svg)](https://github.com/soedinglab/mmseqs2/releases/latest)
[![Biocontainer Pulls](https://img.shields.io/endpoint?url=https%3A%2F%2Fmmseqs.com%2Fbiocontainer.php%3Fcontainer%3Dmmseqs2)](https://biocontainers.pro/#/tools/mmseqs2)
[![Docker Pulls](https://img.shields.io/docker/pulls/soedinglab/mmseqs2.svg)](https://hub.docker.com/r/soedinglab/mmseqs2)
[![Build Status](https://dev.azure.com/themartinsteinegger/mmseqs2/_apis/build/status/soedinglab.MMseqs2?branchName=master)](https://dev.azure.com/themartinsteinegger/mmseqs2/_build/latest?definitionId=2&branchName=master)
[![Travis CI](https://travis-ci.org/soedinglab/MMseqs2.svg?branch=master)](https://travis-ci.org/soedinglab/MMseqs2)
Expand Down
312 changes: 120 additions & 192 deletions data/workflow/update_clustering.sh

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/CommandDeclarations.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ extern int lca(int argc, const char **argv, const Command& command);
extern int taxonomyreport(int argc, const char **argv, const Command& command);
extern int linclust(int argc, const char **argv, const Command& command);
extern int map(int argc, const char **argv, const Command& command);
extern int renamedbkeys(int argc, const char **argv, const Command& command);
extern int maskbygff(int argc, const char **argv, const Command& command);
extern int mergeclusters(int argc, const char **argv, const Command& command);
extern int mergedbs(int argc, const char **argv, const Command& command);
Expand Down
9 changes: 8 additions & 1 deletion src/MMseqsBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -788,7 +788,14 @@ std::vector<Command> baseCommands = {
"<i:resultDB> <o:resultDB>",
CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
{"DB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb }}},

{"renamedbkeys", renamedbkeys, &par.renamedbkeys, COMMAND_DB,
"Create a new DB with original keys renamed",
NULL,
"Milot Mirdita <[email protected]>",
"<i:idMapFile|stdin> <i:DB> <o:DB>",
CITATION_MMSEQS2, {{"idMapFile", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfileAndStdin },
{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
{"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb }}},

{"extractorfs", extractorfs, &par.extractorfs, COMMAND_SEQUENCE,
"Six-frame extraction of open reading frames",
Expand Down
32 changes: 27 additions & 5 deletions src/commons/DBReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1090,12 +1090,15 @@ void DBReader<T>::removeDb(const std::string &databaseName){
}
}

template<typename T>
void DBReader<T>::softlinkDb(const std::string &databaseName, const std::string &outDb, DBFiles::Files dbFilesFlags) {
void copyLinkDb(const std::string &databaseName, const std::string &outDb, DBFiles::Files dbFilesFlags, bool link) {
if (dbFilesFlags & DBFiles::DATA) {
std::vector<std::string> names = FileUtil::findDatafiles(databaseName.c_str());
if (names.size() == 1) {
FileUtil::symlinkAbs(names[0], outDb);
if (link) {
FileUtil::symlinkAbs(names[0].c_str(), outDb.c_str());
} else {
FileUtil::copyFile(names[0].c_str(), outDb.c_str());
}
} else {
for (size_t i = 0; i < names.size(); i++) {
std::string::size_type idx = names[i].rfind('.');
Expand All @@ -1107,7 +1110,11 @@ void DBReader<T>::softlinkDb(const std::string &databaseName, const std::string
<< "Filename: " << names[i] << ".\n";
EXIT(EXIT_FAILURE);
}
FileUtil::symlinkAbs(names[i], outDb + ext);
if (link) {
FileUtil::symlinkAbs(names[i], outDb + ext);
} else {
FileUtil::copyFile(names[i].c_str(), (outDb + ext).c_str());
}
}
}
}
Expand Down Expand Up @@ -1140,11 +1147,26 @@ void DBReader<T>::softlinkDb(const std::string &databaseName, const std::string
for (size_t i = 0; i < ARRAY_SIZE(suffices); ++i) {
std::string file = databaseName + suffices[i].suffix;
if (dbFilesFlags & suffices[i].flag && FileUtil::fileExists(file.c_str())) {
FileUtil::symlinkAbs(file, outDb + suffices[i].suffix);
if (link) {
FileUtil::symlinkAbs(file, outDb + suffices[i].suffix);
} else {
FileUtil::copyFile(file.c_str(), (outDb + suffices[i].suffix).c_str());
}
}
}
}


// Soft-link variant: creates absolute symlinks from databaseName's files to
// outDb for every file category selected in dbFilesFlags (delegates to
// copyLinkDb with link == true, which uses FileUtil::symlinkAbs).
template<typename T>
void DBReader<T>::softlinkDb(const std::string &databaseName, const std::string &outDb, DBFiles::Files dbFilesFlags) {
    copyLinkDb(databaseName, outDb, dbFilesFlags, true);
}

// Copy variant: physically copies databaseName's files to outDb for every
// file category selected in dbFilesFlags (delegates to copyLinkDb with
// link == false, which uses FileUtil::copyFile).
template<typename T>
void DBReader<T>::copyDb(const std::string &databaseName, const std::string &outDb, DBFiles::Files dbFilesFlags) {
    copyLinkDb(databaseName, outDb, dbFilesFlags, false);
}

template<typename T>
void DBReader<T>::decomposeDomainByAminoAcid(size_t worldRank, size_t worldSize, size_t *startEntry, size_t *numEntries){
const size_t dataSize = getDataSize();
Expand Down
1 change: 1 addition & 0 deletions src/commons/DBReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,7 @@ class DBReader : public MemoryTracker {
static void removeDb(const std::string &databaseName);

static void softlinkDb(const std::string &databaseName, const std::string &outDb, DBFiles::Files dbFilesFlags = DBFiles::ALL);
static void copyDb(const std::string &databaseName, const std::string &outDb, DBFiles::Files dbFilesFlags = DBFiles::ALL);

char *mmapData(FILE *file, size_t *dataSize);

Expand Down
7 changes: 6 additions & 1 deletion src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ Parameters::Parameters():
PARAM_EXTRACT_MODE(PARAM_EXTRACT_MODE_ID, "--extract-mode", "Extract mode", "Extract from 1: Query, 2: Target", typeid(int), (void *) &extractMode, "^[1-2]{1}$"),
// convertkb
PARAM_KB_COLUMNS(PARAM_KB_COLUMNS_ID, "--kb-columns", "UniprotKB columns", "list of indices of UniprotKB columns to be extracted", typeid(std::string), (void *) &kbColumns, ""),
PARAM_RECOVER_DELETED(PARAM_RECOVER_DELETED_ID, "--recover-deleted", "Recover deleted", "Indicates if sequences are allowed to be be removed during updating", typeid(bool), (void *) &recoverDeleted, ""),
PARAM_RECOVER_DELETED(PARAM_RECOVER_DELETED_ID, "--recover-deleted", "Recover deleted", "Find and recover deleted sequences during updating of clustering", typeid(bool), (void *) &recoverDeleted, ""),
// filtertaxdb
PARAM_TAXON_LIST(PARAM_TAXON_LIST_ID, "--taxon-list", "Selected taxa", "Taxonomy ID, possibly multiple values separated by ','", typeid(std::string), (void *) &taxonList, ""),
// view
Expand Down Expand Up @@ -977,6 +977,11 @@ Parameters::Parameters():
createsubdb.push_back(&PARAM_SUBDB_MODE);
createsubdb.push_back(&PARAM_V);
// renamedbkeys
renamedbkeys.push_back(&PARAM_SUBDB_MODE);
renamedbkeys.push_back(&PARAM_THREADS);
renamedbkeys.push_back(&PARAM_V);
// createtaxdb
createtaxdb.push_back(&PARAM_NCBI_TAX_DUMP);
createtaxdb.push_back(&PARAM_TAX_MAPPING_FILE);
Expand Down
1 change: 1 addition & 0 deletions src/commons/Parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -1015,6 +1015,7 @@ class Parameters {
std::vector<MMseqsParameter*> taxpercontig;
std::vector<MMseqsParameter*> easytaxonomy;
std::vector<MMseqsParameter*> createsubdb;
std::vector<MMseqsParameter*> renamedbkeys;
std::vector<MMseqsParameter*> createtaxdb;
std::vector<MMseqsParameter*> profile2pssm;
std::vector<MMseqsParameter*> profile2seq;
Expand Down
4 changes: 1 addition & 3 deletions src/taxonomy/aggregatetax.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@ const int ROOT_RANK = INT_MAX;

struct taxHit {
void setByEntry(const TaxID & taxonInput, const bool useAln, const char ** taxHitData, const size_t numCols, const int voteMode) {
// plain format: 3+ tax columns: taxid, rank (can be more than one col), name (can be more than one col)
// taxid + aln format has 11 columns: taxid, tkey, bitscore, seqid, evalue, qs, qe, ql, ts, te, tl
taxon = taxonInput;
evalue = 1.0;
weight = 0.0;
Expand Down Expand Up @@ -291,7 +289,7 @@ int aggregate(const bool useAln, int argc, const char **argv, const Command& com
results = Util::skipLine(results);
}

// aggregate - the counters will be filled by the section function:
// aggregate - the counters will be filled by the selection function:
size_t numAssignedSeqs = 0;
size_t numUnassignedSeqs = 0;
size_t numSeqsAgreeWithSelectedTaxon = 0;
Expand Down
1 change: 1 addition & 0 deletions src/util/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ set(util_source_files
util/touchdb.cpp
util/filterdb.cpp
util/gff2db.cpp
util/renamedbkeys.cpp
util/masksequence.cpp
util/maskbygff.cpp
util/mergeclusters.cpp
Expand Down
190 changes: 190 additions & 0 deletions src/util/renamedbkeys.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
#include "Parameters.h"
#include "FileUtil.h"
#include "DBReader.h"
#include "DBWriter.h"
#include "Debug.h"
#include "Util.h"
#include "FastSort.h"

#include <climits>

// Orders key/value pairs by the key (first component), treating equal keys
// as "in order" (i.e. a non-strict '<=' comparison).
// NOTE(review): this is NOT a strict weak ordering, so it must not be handed
// to sorting algorithms (std::stable_sort / SORT_PARALLEL do so below, which
// is formally undefined behavior). Binary-search callers rely on the '<='
// so that std::upper_bound behaves like lower_bound — verify before changing.
static bool compareToFirst(const std::pair<unsigned int, unsigned int>& lhs, const std::pair<unsigned int, unsigned int>& rhs){
    return !(rhs.first < lhs.first);
}

// Copies the entry stored under oldKey in reader into writer under newKey.
// In SUBDB_MODE_SOFT only a new index entry pointing at the original data
// offset is written (the data file itself is soft-linked by the caller,
// see renamedbkeys below); in all other modes the entry bytes are copied.
// Exits the whole process if oldKey does not exist in reader.
void copyEntry(unsigned int oldKey, unsigned int newKey, DBReader<unsigned int>& reader, DBWriter& writer, bool isCompressed, int subDbMode) {
    const size_t id = reader.getId(oldKey);
    // NOTE(review): presumably getId returns UINT_MAX for a missing key — confirm
    if (id >= UINT_MAX) {
        Debug(Debug::ERROR) << "Key " << oldKey << " not found in database\n";
        EXIT(EXIT_FAILURE);
    }
    if (subDbMode == Parameters::SUBDB_MODE_SOFT) {
        // soft mode: reuse the original offset/length, no data is duplicated
        writer.writeIndexEntry(newKey, reader.getOffset(id), reader.getEntryLen(id), 0);
    } else {
        char *data = reader.getDataUncompressed(id);
        size_t originalLength = reader.getEntryLen(id);
        // strip the trailing null byte; the max() guards a zero-length entry
        // against size_t underflow
        size_t entryLength = std::max(originalLength, static_cast<size_t>(1)) - 1;

        if (isCompressed) {
            // copy also the null byte since it contains the information if compressed or not;
            // compressed entries prefix the payload with its size as an unsigned int
            entryLength = *(reinterpret_cast<unsigned int *>(data)) + sizeof(unsigned int) + 1;
            writer.writeData(data, entryLength, newKey, 0, false, false);
        } else {
            writer.writeData(data, entryLength, newKey, 0, true, false);
        }
        // write the index entry manually with the original on-disk length.
        // NOTE(review): the two bool arguments to writeData presumably toggle
        // appending the null terminator and automatic index writing — confirm
        // against DBWriter::writeData before relying on this
        writer.writeIndexEntry(newKey, writer.getStart(0), originalLength, 0);
    }
}

int renamedbkeys(int argc, const char **argv, const Command &command) {
Parameters &par = Parameters::getInstance();
par.parseParameters(argc, argv, command, true, 0, 0);

FILE *orderFile = NULL;
if (FileUtil::fileExists(par.db1.c_str())) {
orderFile = fopen(par.db1.c_str(), "r");
} else {
Debug(Debug::ERROR) << "File " << par.db1 << " does not exist.\n";
EXIT(EXIT_FAILURE);
}

FILE* newLookupFile = NULL;
unsigned int mode = DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA;
if (FileUtil::fileExists((par.db2 + ".lookup").c_str())) {
mode |= DBReader<unsigned int>::USE_LOOKUP;
newLookupFile = FileUtil::openAndDelete((par.db3 + ".lookup").c_str(), "w");
}
DBReader<unsigned int> reader(par.db2.c_str(), par.db2Index.c_str(), 1, mode);
reader.open(DBReader<unsigned int>::NOSORT);
const bool isCompressed = reader.isCompressed();

FILE* newMappingFile = NULL;
std::vector<std::pair<unsigned int, unsigned int>> mapping;
std::vector<std::pair<unsigned int, unsigned int>> newMapping;
if (FileUtil::fileExists((par.db2 + "_mapping").c_str())) {
mapping.reserve(reader.getSize());
newMapping.reserve(reader.getSize());
bool isSorted = Util::readMapping(par.db2 + "_mapping", mapping);
if (isSorted == false) {
std::stable_sort(mapping.begin(), mapping.end(), compareToFirst);
}
newMappingFile = FileUtil::openAndDelete((par.db3 + "_mapping").c_str(), "w");
}

bool isHeaderCompressed = false;
DBReader<unsigned int>* headerReader = NULL;
if (FileUtil::fileExists(par.hdr2dbtype.c_str())) {
headerReader = new DBReader<unsigned int>(par.hdr2.c_str(), par.hdr2Index.c_str(), 1, DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA);
headerReader->open(DBReader<unsigned int>::NOSORT);
isHeaderCompressed = headerReader->isCompressed();
}

DBWriter writer(par.db3.c_str(), par.db3Index.c_str(), 1, 0, Parameters::DBTYPE_OMIT_FILE);
writer.open();

DBWriter* headerWriter = NULL;
if (headerReader != NULL) {
headerWriter = new DBWriter(par.hdr3.c_str(), par.hdr3Index.c_str(), 1, 0, Parameters::DBTYPE_OMIT_FILE);
headerWriter->open();
}

DBReader<unsigned int>::LookupEntry* lookup = NULL;
std::vector<DBReader<unsigned int>::LookupEntry> newLookup;
if (newLookupFile != NULL) {
lookup = reader.getLookup();
newLookup.reserve(reader.getLookupSize());
}

char *line = NULL;
size_t len = 0;
const char *fields[2];
// getline malloc/reallocs automatically
while (getline(&line, &len, orderFile) != -1) {
const size_t columns = Util::getWordsOfLine(line, fields, 2);
if (columns < 2) {
Debug(Debug::WARNING) << "Not enough columns in mapping file\n";
continue;
}
const unsigned int oldKey = Util::fast_atoi<unsigned int>(fields[0]);
const unsigned int newKey = Util::fast_atoi<unsigned int>(fields[1]);

copyEntry(oldKey, newKey, reader, writer, isCompressed, par.subDbMode);
if (lookup != NULL) {
unsigned int lookupId = reader.getLookupIdByKey(oldKey);
DBReader<unsigned int>::LookupEntry entry = lookup[lookupId];
entry.id = newKey;
newLookup.emplace_back(entry);
}

if (mapping.size() > 0) {
std::pair<unsigned int, unsigned int> val;
val.first = oldKey;
std::vector<std::pair<unsigned int, unsigned int>>::iterator mappingIt;
mappingIt = std::upper_bound(mapping.begin(), mapping.end(), val, compareToFirst);
if (mappingIt != mapping.end() && mappingIt->first == val.first) {
val.first = newKey;
newMapping.emplace_back(val);
}
}

if (headerReader != NULL && headerWriter != NULL) {
copyEntry(oldKey, newKey, *headerReader, *headerWriter, isHeaderCompressed, par.subDbMode);
}
}
// merge any kind of sequence database
writer.close(headerWriter != NULL);
DBWriter::writeDbtypeFile(par.db3.c_str(), reader.getDbtype(), isCompressed);
if (par.subDbMode == Parameters::SUBDB_MODE_SOFT) {
DBReader<unsigned int>::softlinkDb(par.db2, par.db3, DBFiles::DATA);
}
if (newMappingFile != NULL) {
SORT_PARALLEL(newMapping.begin(), newMapping.end(), compareToFirst);
std::string buffer;
for (size_t i = 0; i < newMapping.size(); ++i) {
buffer.append(SSTR(newMapping[i].first));
buffer.append(1, '\t');
buffer.append(SSTR(newMapping[i].second));
buffer.append(1, '\n');
fwrite(buffer.c_str(), sizeof(char), buffer.size(), newMappingFile);
buffer.clear();
}
fclose(newMappingFile);
}

if (newLookupFile != NULL) {
SORT_PARALLEL(newLookup.begin(), newLookup.end(), DBReader<unsigned int>::LookupEntry::compareById);
std::string lookupBuffer;
lookupBuffer.reserve(2048);
for (size_t i = 0; i < newLookup.size(); ++i) {
reader.lookupEntryToBuffer(lookupBuffer, newLookup[i]);
fwrite(lookupBuffer.c_str(), sizeof(char), lookupBuffer.size(), newLookupFile);
lookupBuffer.clear();
}
fclose(newLookupFile);
}

if (headerWriter != NULL) {
headerWriter->close(true);
delete headerWriter;
DBWriter::writeDbtypeFile(par.hdr3.c_str(), headerReader->getDbtype(), isHeaderCompressed);
if (par.subDbMode == Parameters::SUBDB_MODE_SOFT) {
DBReader<unsigned int>::softlinkDb(par.db2, par.db3, DBFiles::HEADER);
}
}
if (par.subDbMode == Parameters::SUBDB_MODE_SOFT) {
DBReader<unsigned int>::softlinkDb(par.db2, par.db3, (DBFiles::Files) (DBFiles::SOURCE | DBFiles::TAX_MERGED | DBFiles::TAX_NAMES | DBFiles::TAX_NODES));
} else {
DBReader<unsigned int>::copyDb(par.db2, par.db3, (DBFiles::Files) (DBFiles::SOURCE | DBFiles::TAX_MERGED | DBFiles::TAX_NAMES | DBFiles::TAX_NODES));
}

free(line);
if (headerReader != NULL) {
headerReader->close();
delete headerReader;
}
reader.close();
fclose(orderFile);

return EXIT_SUCCESS;
}
2 changes: 1 addition & 1 deletion src/util/result2repseq.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ int result2repseq(int argc, const char **argv, const Command &command) {
resultWriter.close(true);
resultReader.close();
seqReader.close();
DBReader<unsigned int>::softlinkDb(par.db1, par.db2, DBFiles::SEQUENCE_ANCILLARY);
DBReader<unsigned int>::softlinkDb(par.db1, par.db3, DBFiles::SEQUENCE_ANCILLARY);

return EXIT_SUCCESS;
}
Loading

0 comments on commit b15e95a

Please sign in to comment.