Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
martin-steinegger committed Aug 31, 2020
2 parents b7ec0e9 + f8b3f8b commit b15e95a
Show file tree
Hide file tree
Showing 17 changed files with 476 additions and 330 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ script:
if [ -e "/opt/intel/inteloneapi/setvars.sh" ]; then source /opt/intel/inteloneapi/setvars.sh; fi; \
mkdir build; cd build; \
cmake -DHAVE_MPI="$([[ -z "$MPI" ]]; echo $?)" -DENABLE_WERROR=1 -DHAVE_TESTS=1 ..; \
make -j $(nproc --all); \
make -j ${MMSEQS_NUM_THREADS:-$(nproc --all)}; \
mkdir path; \
printf '#!/bin/sh\n/usr/bin/tee "$@" | tail\n' > path/tee; \
chmod +x path/tee; \
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ MMseqs2 (Many-against-Many sequence searching) is a software suite to search and

[![BioConda Install](https://img.shields.io/conda/dn/bioconda/mmseqs2.svg?style=flag&label=BioConda%20install)](https://anaconda.org/bioconda/mmseqs2)
[![Github All Releases](https://img.shields.io/github/downloads/soedinglab/mmseqs2/total.svg)](https://github.com/soedinglab/mmseqs2/releases/latest)
[![Biocontainer Pulls](https://img.shields.io/endpoint?url=https%3A%2F%2Fmmseqs.com%2Fbiocontainer.php%3Fcontainer%3Dmmseqs2)](https://biocontainers.pro/#/tools/mmseqs2)
[![Docker Pulls](https://img.shields.io/docker/pulls/soedinglab/mmseqs2.svg)](https://hub.docker.com/r/soedinglab/mmseqs2)
[![Build Status](https://dev.azure.com/themartinsteinegger/mmseqs2/_apis/build/status/soedinglab.MMseqs2?branchName=master)](https://dev.azure.com/themartinsteinegger/mmseqs2/_build/latest?definitionId=2&branchName=master)
[![Travis CI](https://travis-ci.org/soedinglab/MMseqs2.svg?branch=master)](https://travis-ci.org/soedinglab/MMseqs2)
Expand Down
312 changes: 120 additions & 192 deletions data/workflow/update_clustering.sh

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/CommandDeclarations.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ extern int lca(int argc, const char **argv, const Command& command);
extern int taxonomyreport(int argc, const char **argv, const Command& command);
extern int linclust(int argc, const char **argv, const Command& command);
extern int map(int argc, const char **argv, const Command& command);
extern int renamedbkeys(int argc, const char **argv, const Command& command);
extern int maskbygff(int argc, const char **argv, const Command& command);
extern int mergeclusters(int argc, const char **argv, const Command& command);
extern int mergedbs(int argc, const char **argv, const Command& command);
Expand Down
9 changes: 8 additions & 1 deletion src/MMseqsBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -788,7 +788,14 @@ std::vector<Command> baseCommands = {
"<i:resultDB> <o:resultDB>",
CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
{"DB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb }}},

{"renamedbkeys", renamedbkeys, &par.renamedbkeys, COMMAND_DB,
"Create a new DB with original keys renamed",
NULL,
"Milot Mirdita <[email protected]>",
"<i:idMapFile|stdin> <i:DB> <o:DB>",
CITATION_MMSEQS2, {{"idMapFile", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfileAndStdin },
{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
{"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb }}},

{"extractorfs", extractorfs, &par.extractorfs, COMMAND_SEQUENCE,
"Six-frame extraction of open reading frames",
Expand Down
32 changes: 27 additions & 5 deletions src/commons/DBReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1090,12 +1090,15 @@ void DBReader<T>::removeDb(const std::string &databaseName){
}
}

template<typename T>
void DBReader<T>::softlinkDb(const std::string &databaseName, const std::string &outDb, DBFiles::Files dbFilesFlags) {
void copyLinkDb(const std::string &databaseName, const std::string &outDb, DBFiles::Files dbFilesFlags, bool link) {
if (dbFilesFlags & DBFiles::DATA) {
std::vector<std::string> names = FileUtil::findDatafiles(databaseName.c_str());
if (names.size() == 1) {
FileUtil::symlinkAbs(names[0], outDb);
if (link) {
FileUtil::symlinkAbs(names[0].c_str(), outDb.c_str());
} else {
FileUtil::copyFile(names[0].c_str(), outDb.c_str());
}
} else {
for (size_t i = 0; i < names.size(); i++) {
std::string::size_type idx = names[i].rfind('.');
Expand All @@ -1107,7 +1110,11 @@ void DBReader<T>::softlinkDb(const std::string &databaseName, const std::string
<< "Filename: " << names[i] << ".\n";
EXIT(EXIT_FAILURE);
}
FileUtil::symlinkAbs(names[i], outDb + ext);
if (link) {
FileUtil::symlinkAbs(names[i], outDb + ext);
} else {
FileUtil::copyFile(names[i].c_str(), (outDb + ext).c_str());
}
}
}
}
Expand Down Expand Up @@ -1140,11 +1147,26 @@ void DBReader<T>::softlinkDb(const std::string &databaseName, const std::string
for (size_t i = 0; i < ARRAY_SIZE(suffices); ++i) {
std::string file = databaseName + suffices[i].suffix;
if (dbFilesFlags & suffices[i].flag && FileUtil::fileExists(file.c_str())) {
FileUtil::symlinkAbs(file, outDb + suffices[i].suffix);
if (link) {
FileUtil::symlinkAbs(file, outDb + suffices[i].suffix);
} else {
FileUtil::copyFile(file.c_str(), (outDb + suffices[i].suffix).c_str());
}
}
}
}


// Soft-link variant: creates absolute symlinks from databaseName's files to
// outDb for every file category selected in dbFilesFlags (delegates to
// copyLinkDb with link == true, which uses FileUtil::symlinkAbs).
template<typename T>
void DBReader<T>::softlinkDb(const std::string &databaseName, const std::string &outDb, DBFiles::Files dbFilesFlags) {
    copyLinkDb(databaseName, outDb, dbFilesFlags, true);
}

// Copy variant: physically copies databaseName's files to outDb for every
// file category selected in dbFilesFlags (delegates to copyLinkDb with
// link == false, which uses FileUtil::copyFile).
template<typename T>
void DBReader<T>::copyDb(const std::string &databaseName, const std::string &outDb, DBFiles::Files dbFilesFlags) {
    copyLinkDb(databaseName, outDb, dbFilesFlags, false);
}

template<typename T>
void DBReader<T>::decomposeDomainByAminoAcid(size_t worldRank, size_t worldSize, size_t *startEntry, size_t *numEntries){
const size_t dataSize = getDataSize();
Expand Down
1 change: 1 addition & 0 deletions src/commons/DBReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,7 @@ class DBReader : public MemoryTracker {
static void removeDb(const std::string &databaseName);

static void softlinkDb(const std::string &databaseName, const std::string &outDb, DBFiles::Files dbFilesFlags = DBFiles::ALL);
static void copyDb(const std::string &databaseName, const std::string &outDb, DBFiles::Files dbFilesFlags = DBFiles::ALL);

char *mmapData(FILE *file, size_t *dataSize);

Expand Down
7 changes: 6 additions & 1 deletion src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ Parameters::Parameters():
PARAM_EXTRACT_MODE(PARAM_EXTRACT_MODE_ID, "--extract-mode", "Extract mode", "Extract from 1: Query, 2: Target", typeid(int), (void *) &extractMode, "^[1-2]{1}$"),
// convertkb
PARAM_KB_COLUMNS(PARAM_KB_COLUMNS_ID, "--kb-columns", "UniprotKB columns", "list of indices of UniprotKB columns to be extracted", typeid(std::string), (void *) &kbColumns, ""),
PARAM_RECOVER_DELETED(PARAM_RECOVER_DELETED_ID, "--recover-deleted", "Recover deleted", "Indicates if sequences are allowed to be be removed during updating", typeid(bool), (void *) &recoverDeleted, ""),
PARAM_RECOVER_DELETED(PARAM_RECOVER_DELETED_ID, "--recover-deleted", "Recover deleted", "Find and recover deleted sequences during updating of clustering", typeid(bool), (void *) &recoverDeleted, ""),
// filtertaxdb
PARAM_TAXON_LIST(PARAM_TAXON_LIST_ID, "--taxon-list", "Selected taxa", "Taxonomy ID, possibly multiple values separated by ','", typeid(std::string), (void *) &taxonList, ""),
// view
Expand Down Expand Up @@ -977,6 +977,11 @@ Parameters::Parameters():
createsubdb.push_back(&PARAM_SUBDB_MODE);
createsubdb.push_back(&PARAM_V);
// renamedbkeys
renamedbkeys.push_back(&PARAM_SUBDB_MODE);
renamedbkeys.push_back(&PARAM_THREADS);
renamedbkeys.push_back(&PARAM_V);
// createtaxdb
createtaxdb.push_back(&PARAM_NCBI_TAX_DUMP);
createtaxdb.push_back(&PARAM_TAX_MAPPING_FILE);
Expand Down
1 change: 1 addition & 0 deletions src/commons/Parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -1015,6 +1015,7 @@ class Parameters {
std::vector<MMseqsParameter*> taxpercontig;
std::vector<MMseqsParameter*> easytaxonomy;
std::vector<MMseqsParameter*> createsubdb;
std::vector<MMseqsParameter*> renamedbkeys;
std::vector<MMseqsParameter*> createtaxdb;
std::vector<MMseqsParameter*> profile2pssm;
std::vector<MMseqsParameter*> profile2seq;
Expand Down
4 changes: 1 addition & 3 deletions src/taxonomy/aggregatetax.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@ const int ROOT_RANK = INT_MAX;

struct taxHit {
void setByEntry(const TaxID & taxonInput, const bool useAln, const char ** taxHitData, const size_t numCols, const int voteMode) {
// plain format: 3+ tax columns: taxid, rank (can be more than one col), name (can be more than one col)
// taxid + aln format has 11 columns: taxid, tkey, bitscore, seqid, evalue, qs, qe, ql, ts, te, tl
taxon = taxonInput;
evalue = 1.0;
weight = 0.0;
Expand Down Expand Up @@ -291,7 +289,7 @@ int aggregate(const bool useAln, int argc, const char **argv, const Command& com
results = Util::skipLine(results);
}

// aggregate - the counters will be filled by the section function:
// aggregate - the counters will be filled by the selection function:
size_t numAssignedSeqs = 0;
size_t numUnassignedSeqs = 0;
size_t numSeqsAgreeWithSelectedTaxon = 0;
Expand Down
1 change: 1 addition & 0 deletions src/util/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ set(util_source_files
util/touchdb.cpp
util/filterdb.cpp
util/gff2db.cpp
util/renamedbkeys.cpp
util/masksequence.cpp
util/maskbygff.cpp
util/mergeclusters.cpp
Expand Down
190 changes: 190 additions & 0 deletions src/util/renamedbkeys.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
#include "Parameters.h"
#include "FileUtil.h"
#include "DBReader.h"
#include "DBWriter.h"
#include "Debug.h"
#include "Util.h"
#include "FastSort.h"

#include <climits>

// Orders key/value pairs by the key (first component), treating equal keys
// as "in order" (i.e. a non-strict '<=' comparison).
// NOTE(review): this is NOT a strict weak ordering, so it must not be handed
// to sorting algorithms (std::stable_sort / SORT_PARALLEL do so below, which
// is formally undefined behavior). Binary-search callers rely on the '<='
// so that std::upper_bound behaves like lower_bound — verify before changing.
static bool compareToFirst(const std::pair<unsigned int, unsigned int>& lhs, const std::pair<unsigned int, unsigned int>& rhs){
    return !(rhs.first < lhs.first);
}

// Copies the entry stored under oldKey in reader into writer under newKey.
// In SUBDB_MODE_SOFT only a new index entry pointing at the original data
// offset is written (the data file itself is soft-linked by the caller,
// see renamedbkeys below); in all other modes the entry bytes are copied.
// Exits the whole process if oldKey does not exist in reader.
void copyEntry(unsigned int oldKey, unsigned int newKey, DBReader<unsigned int>& reader, DBWriter& writer, bool isCompressed, int subDbMode) {
    const size_t id = reader.getId(oldKey);
    // NOTE(review): presumably getId returns UINT_MAX for a missing key — confirm
    if (id >= UINT_MAX) {
        Debug(Debug::ERROR) << "Key " << oldKey << " not found in database\n";
        EXIT(EXIT_FAILURE);
    }
    if (subDbMode == Parameters::SUBDB_MODE_SOFT) {
        // soft mode: reuse the original offset/length, no data is duplicated
        writer.writeIndexEntry(newKey, reader.getOffset(id), reader.getEntryLen(id), 0);
    } else {
        char *data = reader.getDataUncompressed(id);
        size_t originalLength = reader.getEntryLen(id);
        // strip the trailing null byte; the max() guards a zero-length entry
        // against size_t underflow
        size_t entryLength = std::max(originalLength, static_cast<size_t>(1)) - 1;

        if (isCompressed) {
            // copy also the null byte since it contains the information if compressed or not;
            // compressed entries prefix the payload with its size as an unsigned int
            entryLength = *(reinterpret_cast<unsigned int *>(data)) + sizeof(unsigned int) + 1;
            writer.writeData(data, entryLength, newKey, 0, false, false);
        } else {
            writer.writeData(data, entryLength, newKey, 0, true, false);
        }
        // write the index entry manually with the original on-disk length.
        // NOTE(review): the two bool arguments to writeData presumably toggle
        // appending the null terminator and automatic index writing — confirm
        // against DBWriter::writeData before relying on this
        writer.writeIndexEntry(newKey, writer.getStart(0), originalLength, 0);
    }
}

int renamedbkeys(int argc, const char **argv, const Command &command) {
Parameters &par = Parameters::getInstance();
par.parseParameters(argc, argv, command, true, 0, 0);

FILE *orderFile = NULL;
if (FileUtil::fileExists(par.db1.c_str())) {
orderFile = fopen(par.db1.c_str(), "r");
} else {
Debug(Debug::ERROR) << "File " << par.db1 << " does not exist.\n";
EXIT(EXIT_FAILURE);
}

FILE* newLookupFile = NULL;
unsigned int mode = DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA;
if (FileUtil::fileExists((par.db2 + ".lookup").c_str())) {
mode |= DBReader<unsigned int>::USE_LOOKUP;
newLookupFile = FileUtil::openAndDelete((par.db3 + ".lookup").c_str(), "w");
}
DBReader<unsigned int> reader(par.db2.c_str(), par.db2Index.c_str(), 1, mode);
reader.open(DBReader<unsigned int>::NOSORT);
const bool isCompressed = reader.isCompressed();

FILE* newMappingFile = NULL;
std::vector<std::pair<unsigned int, unsigned int>> mapping;
std::vector<std::pair<unsigned int, unsigned int>> newMapping;
if (FileUtil::fileExists((par.db2 + "_mapping").c_str())) {
mapping.reserve(reader.getSize());
newMapping.reserve(reader.getSize());
bool isSorted = Util::readMapping(par.db2 + "_mapping", mapping);
if (isSorted == false) {
std::stable_sort(mapping.begin(), mapping.end(), compareToFirst);
}
newMappingFile = FileUtil::openAndDelete((par.db3 + "_mapping").c_str(), "w");
}

bool isHeaderCompressed = false;
DBReader<unsigned int>* headerReader = NULL;
if (FileUtil::fileExists(par.hdr2dbtype.c_str())) {
headerReader = new DBReader<unsigned int>(par.hdr2.c_str(), par.hdr2Index.c_str(), 1, DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA);
headerReader->open(DBReader<unsigned int>::NOSORT);
isHeaderCompressed = headerReader->isCompressed();
}

DBWriter writer(par.db3.c_str(), par.db3Index.c_str(), 1, 0, Parameters::DBTYPE_OMIT_FILE);
writer.open();

DBWriter* headerWriter = NULL;
if (headerReader != NULL) {
headerWriter = new DBWriter(par.hdr3.c_str(), par.hdr3Index.c_str(), 1, 0, Parameters::DBTYPE_OMIT_FILE);
headerWriter->open();
}

DBReader<unsigned int>::LookupEntry* lookup = NULL;
std::vector<DBReader<unsigned int>::LookupEntry> newLookup;
if (newLookupFile != NULL) {
lookup = reader.getLookup();
newLookup.reserve(reader.getLookupSize());
}

char *line = NULL;
size_t len = 0;
const char *fields[2];
// getline malloc/reallocs automatically
while (getline(&line, &len, orderFile) != -1) {
const size_t columns = Util::getWordsOfLine(line, fields, 2);
if (columns < 2) {
Debug(Debug::WARNING) << "Not enough columns in mapping file\n";
continue;
}
const unsigned int oldKey = Util::fast_atoi<unsigned int>(fields[0]);
const unsigned int newKey = Util::fast_atoi<unsigned int>(fields[1]);

copyEntry(oldKey, newKey, reader, writer, isCompressed, par.subDbMode);
if (lookup != NULL) {
unsigned int lookupId = reader.getLookupIdByKey(oldKey);
DBReader<unsigned int>::LookupEntry entry = lookup[lookupId];
entry.id = newKey;
newLookup.emplace_back(entry);
}

if (mapping.size() > 0) {
std::pair<unsigned int, unsigned int> val;
val.first = oldKey;
std::vector<std::pair<unsigned int, unsigned int>>::iterator mappingIt;
mappingIt = std::upper_bound(mapping.begin(), mapping.end(), val, compareToFirst);
if (mappingIt != mapping.end() && mappingIt->first == val.first) {
val.first = newKey;
newMapping.emplace_back(val);
}
}

if (headerReader != NULL && headerWriter != NULL) {
copyEntry(oldKey, newKey, *headerReader, *headerWriter, isHeaderCompressed, par.subDbMode);
}
}
// merge any kind of sequence database
writer.close(headerWriter != NULL);
DBWriter::writeDbtypeFile(par.db3.c_str(), reader.getDbtype(), isCompressed);
if (par.subDbMode == Parameters::SUBDB_MODE_SOFT) {
DBReader<unsigned int>::softlinkDb(par.db2, par.db3, DBFiles::DATA);
}
if (newMappingFile != NULL) {
SORT_PARALLEL(newMapping.begin(), newMapping.end(), compareToFirst);
std::string buffer;
for (size_t i = 0; i < newMapping.size(); ++i) {
buffer.append(SSTR(newMapping[i].first));
buffer.append(1, '\t');
buffer.append(SSTR(newMapping[i].second));
buffer.append(1, '\n');
fwrite(buffer.c_str(), sizeof(char), buffer.size(), newMappingFile);
buffer.clear();
}
fclose(newMappingFile);
}

if (newLookupFile != NULL) {
SORT_PARALLEL(newLookup.begin(), newLookup.end(), DBReader<unsigned int>::LookupEntry::compareById);
std::string lookupBuffer;
lookupBuffer.reserve(2048);
for (size_t i = 0; i < newLookup.size(); ++i) {
reader.lookupEntryToBuffer(lookupBuffer, newLookup[i]);
fwrite(lookupBuffer.c_str(), sizeof(char), lookupBuffer.size(), newLookupFile);
lookupBuffer.clear();
}
fclose(newLookupFile);
}

if (headerWriter != NULL) {
headerWriter->close(true);
delete headerWriter;
DBWriter::writeDbtypeFile(par.hdr3.c_str(), headerReader->getDbtype(), isHeaderCompressed);
if (par.subDbMode == Parameters::SUBDB_MODE_SOFT) {
DBReader<unsigned int>::softlinkDb(par.db2, par.db3, DBFiles::HEADER);
}
}
if (par.subDbMode == Parameters::SUBDB_MODE_SOFT) {
DBReader<unsigned int>::softlinkDb(par.db2, par.db3, (DBFiles::Files) (DBFiles::SOURCE | DBFiles::TAX_MERGED | DBFiles::TAX_NAMES | DBFiles::TAX_NODES));
} else {
DBReader<unsigned int>::copyDb(par.db2, par.db3, (DBFiles::Files) (DBFiles::SOURCE | DBFiles::TAX_MERGED | DBFiles::TAX_NAMES | DBFiles::TAX_NODES));
}

free(line);
if (headerReader != NULL) {
headerReader->close();
delete headerReader;
}
reader.close();
fclose(orderFile);

return EXIT_SUCCESS;
}
2 changes: 1 addition & 1 deletion src/util/result2repseq.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ int result2repseq(int argc, const char **argv, const Command &command) {
resultWriter.close(true);
resultReader.close();
seqReader.close();
DBReader<unsigned int>::softlinkDb(par.db1, par.db2, DBFiles::SEQUENCE_ANCILLARY);
DBReader<unsigned int>::softlinkDb(par.db1, par.db3, DBFiles::SEQUENCE_ANCILLARY);

return EXIT_SUCCESS;
}
Loading

0 comments on commit b15e95a

Please sign in to comment.