Skip to content

Commit

Permalink
Add appenddbtoindex to augment a precomputed index in sub-projects
Browse files Browse the repository at this point in the history
  • Loading branch information
milot-mirdita committed Dec 25, 2021
1 parent 4f046dd commit 75af0c8
Show file tree
Hide file tree
Showing 7 changed files with 170 additions and 3 deletions.
1 change: 1 addition & 0 deletions src/CommandDeclarations.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
// Forward declarations of command functions that are defined in other translation units.
// NOTE(review): all of them share the (argc, argv, command) signature, presumably so that
// they can be referenced from the command table — confirm against the Command struct.
extern int align(int argc, const char **argv, const Command& command);
extern int alignall(int argc, const char **argv, const Command& command);
extern int alignbykmer(int argc, const char **argv, const Command& command);
extern int appenddbtoindex(int argc, const char **argv, const Command& command);
extern int apply(int argc, const char **argv, const Command& command);
extern int besthitperset(int argc, const char **argv, const Command &command);
extern int transitivealign(int argc, const char **argv, const Command &command);
Expand Down
9 changes: 8 additions & 1 deletion src/MMseqsBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,14 @@ std::vector<Command> baseCommands = {
"<i:fastaFile1[.gz|.bz2]> ... <i:fastaFileN[.gz|.bz2]>|<i:stdin> <o:sequenceDB>",
CITATION_MMSEQS2, {{"fast[a|q]File[.gz|bz2]|stdin", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfileStdinAndGeneric },
{"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile }}},
{"indexdb", indexdb, &par.indexdb, COMMAND_HIDDEN,
{"appenddbtoindex", appenddbtoindex, &par.appenddbtoindex, COMMAND_HIDDEN,
NULL,
NULL,
"Milot Mirdita <[email protected]>",
"<i:DB1> ... <i:DBN> <o:DB>",
CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::allDb },
{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
{"indexdb", indexdb, &par.indexdb, COMMAND_HIDDEN,
NULL,
NULL,
"Martin Steinegger <[email protected]>",
Expand Down
4 changes: 2 additions & 2 deletions src/commons/DBWriter.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ class DBWriter : public MemoryTracker {
// Returns the current value of the `closed` member.
bool isClosed(){
return closed;
}

static void sortIndex(const char *inFileNameIndex, const char *outFileNameIndex, const bool lexicographicOrder);
private:
size_t addToThreadBuffer(const void *data, size_t itmesize, size_t nitems, int threadIdx);
void writeThreadBuffer(unsigned int idx, size_t dataSize);
Expand All @@ -77,8 +79,6 @@ class DBWriter : public MemoryTracker {

static void mergeIndex(const char** indexFilenames, unsigned int fileCount, const std::vector<size_t> &dataSizes);

static void sortIndex(const char *inFileNameIndex, const char *outFileNameIndex, const bool lexicographicOrder);

char* dataFileName;
char* indexFileName;

Expand Down
4 changes: 4 additions & 0 deletions src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1301,6 +1301,10 @@ Parameters::Parameters():
tar2db.push_back(&PARAM_THREADS);
tar2db.push_back(&PARAM_V);
// appenddbtoindex
appenddbtoindex.push_back(&PARAM_ID_LIST);
appenddbtoindex.push_back(&PARAM_V);
//checkSaneEnvironment();
setDefaults();
}
Expand Down
1 change: 1 addition & 0 deletions src/commons/Parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -1110,6 +1110,7 @@ class Parameters {
std::vector<MMseqsParameter*> enrichworkflow;
std::vector<MMseqsParameter*> databases;
std::vector<MMseqsParameter*> tar2db;
std::vector<MMseqsParameter*> appenddbtoindex;

std::vector<MMseqsParameter*> combineList(const std::vector<MMseqsParameter*> &par1,
const std::vector<MMseqsParameter*> &par2);
Expand Down
1 change: 1 addition & 0 deletions src/util/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
set(util_source_files
util/alignall.cpp
util/alignbykmer.cpp
util/appenddbtoindex.cpp
util/apply.cpp
util/clusthash.cpp
util/compress.cpp
Expand Down
153 changes: 153 additions & 0 deletions src/util/appenddbtoindex.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
#include "Parameters.h"
#include "DBReader.h"
#include "DBWriter.h"
#include "Debug.h"
#include "Util.h"

#include <algorithm>
#include <cerrno>
#include <climits>
#include <cstdio>
#include <cstdlib>
#include <string>
#include <vector>

int appenddbtoindex(int argc, const char **argv, const Command &command) {
Parameters &par = Parameters::getInstance();
par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0);

std::string outDb = par.filenames.back();
par.filenames.pop_back();

// read in database keys for the new database entries and validate that we have enough
std::vector<unsigned int> keys;
{
std::vector<std::string> ids = Util::split(par.idList, ",");
keys.reserve(ids.size());
for (size_t i = 0; i < ids.size(); ++i) {
char *rest;
unsigned int key = strtoul(ids[i].c_str(), &rest, 10);
if ((rest != ids[i].c_str() && *rest != '\0') || errno == ERANGE) {
Debug(Debug::ERROR) << "Could not read key " << ids[i] << "\n";
return EXIT_FAILURE;
}
keys.emplace_back(key);
}
if (keys.size() != par.filenames.size()) {
Debug(Debug::ERROR) << "Same number of databases and keys are needed\n";
return EXIT_FAILURE;
}
// fail early if duplicates are found
std::vector<unsigned int> check(keys.begin(), keys.end());
std::sort(check.begin(), check.end());
for (size_t i = 1; i < check.size(); ++i) {
if (check[i - 1] == check[i] || (check[i - 1] + 1) == check[i]) {
Debug(Debug::ERROR) << "Duplicate ID given. Each database takes two consecutive IDs.\n";
return EXIT_FAILURE;
}
}
}

// if we have a split database, make one new split where we append the new files
FILE* outDataHandle = NULL;
{
std::string checkName = outDb + ".0";
size_t cnt = 0;
while (FileUtil::fileExists(checkName.c_str()) == true) {
cnt++;
checkName = outDb + "." + SSTR(cnt);
}
if (cnt == 0) {
outDataHandle = FileUtil::openFileOrDie(outDb.c_str(), "ab", true);
} else {
outDataHandle = FileUtil::openFileOrDie(checkName.c_str(), "wb", false);
}
}

std::string outIndexName = outDb + ".index";
size_t offset = 0;
{
DBReader<unsigned int> outReader(outDb.c_str(), outIndexName.c_str(), 1, DBReader<unsigned int>::USE_DATA | DBReader<unsigned int>::USE_INDEX);
outReader.open(DBReader<unsigned int>::NOSORT);
// validate that given keys dont exist already
for (size_t i = 0; i < keys.size(); ++i) {
if (outReader.getId(keys[i]) != UINT_MAX) {
Debug(Debug::ERROR) << "Key " << keys[i] << " already exists in database\n";
return EXIT_FAILURE;
}
if (outReader.getId(keys[i]+1) != UINT_MAX) {
Debug(Debug::ERROR) << "Key " << (keys[i]+1) << " already exists in database\n";
return EXIT_FAILURE;
}
}
offset = outReader.getTotalDataSize();
outReader.close();
}

const char nullbyte = '\0';
char buffer[8192];
FILE* outIndexHandle = FileUtil::openFileOrDie(outIndexName.c_str(), "a", true);
for (size_t i = 0; i < par.filenames.size(); ++i) {
const unsigned int key = keys[i];
const std::string& inDb = par.filenames[i];
const std::string inIndexName = inDb + ".index";
size_t inSize = 0;
FILE* inIndexHandle = FileUtil::openFileOrDie(inIndexName.c_str(), "r", true);
size_t indexSize = 0;
while ((indexSize = fread(buffer, 1, 8192, inIndexHandle)) > 0) {
size_t written = fwrite(buffer, 1, indexSize, outDataHandle);
if (written != indexSize) {
Debug(Debug::ERROR) << "Cannot write to data file " << outDb << "\n";
EXIT(EXIT_FAILURE);
}
inSize += indexSize;
}
fclose(inIndexHandle);
size_t written = fwrite(&nullbyte, sizeof(char), 1, outDataHandle);
if (written != 1) {
Debug(Debug::ERROR) << "Cannot write to data file " << outDb << "\n";
EXIT(EXIT_FAILURE);
}
inSize += 1;
size_t len = DBWriter::indexToBuffer(buffer, key, offset, inSize);
written = fwrite(buffer, sizeof(char), len, outIndexHandle);
if (written != len) {
Debug(Debug::ERROR) << "Cannot write to index file " << outIndexName << "\n";
EXIT(EXIT_FAILURE);
}
offset += inSize;

DBReader<unsigned int> reader(inDb.c_str(), inIndexName.c_str(), 1, DBReader<unsigned int>::USE_DATA | DBReader<unsigned int>::USE_INDEX);
reader.open(DBReader<unsigned int>::HARDNOSORT);
inSize = reader.getTotalDataSize();
for (size_t idx = 0; idx < reader.getDataFileCnt(); idx++) {
char* data = reader.getDataForFile(idx);
size_t size = reader.getDataSizeForFile(idx);
written = fwrite(data, sizeof(char), size, outDataHandle);
if (written != size) {
Debug(Debug::ERROR) << "Cannot write to data file " << outDb << "\n";
EXIT(EXIT_FAILURE);
}
}
reader.close();

written = fwrite(&nullbyte, sizeof(char), 1, outDataHandle);
if (written != 1) {
Debug(Debug::ERROR) << "Cannot write to data file " << outDb << "\n";
EXIT(EXIT_FAILURE);
}
inSize += 1;
len = DBWriter::indexToBuffer(buffer, key + 1, offset, inSize);
written = fwrite(buffer, sizeof(char), len, outIndexHandle);
if (written != len) {
Debug(Debug::ERROR) << "Cannot write to index file " << outIndexName << "\n";
EXIT(EXIT_FAILURE);
}
offset += inSize;
}

if (fclose(outDataHandle) != 0) {
Debug(Debug::ERROR) << "Cannot close data file " << outDb << "\n";
EXIT(EXIT_FAILURE);
}
if (fclose(outIndexHandle) != 0) {
Debug(Debug::ERROR) << "Cannot close index file " << outIndexName << "\n";
EXIT(EXIT_FAILURE);
}

DBWriter::sortIndex(outIndexName.c_str(), outIndexName.c_str(), false);

return EXIT_SUCCESS;
}

0 comments on commit 75af0c8

Please sign in to comment.