Skip to content

Commit

Permalink
Allow result database input in taxonomyreport #401
Browse files Browse the repository at this point in the history
  • Loading branch information
milot-mirdita committed Feb 2, 2021
1 parent b31ebb6 commit 0828d86
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 24 deletions.
4 changes: 2 additions & 2 deletions src/MMseqsBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -394,9 +394,9 @@ std::vector<Command> baseCommands = {
"Create a taxonomy report in Kraken or Krona format",
NULL,
"Milot Mirdita <[email protected]> & Florian Breitwieser <[email protected]>",
"<i:targetDB> <i:taxDB> <o:taxonomyReport>",
"<i:targetDB> <i:taxDB/resultDB> <o:taxonomyReport>",
CITATION_TAXONOMY, {{"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_TAXONOMY, &DbValidator::taxSequenceDb },
{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::taxResult },
{"taxDB/resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::resultAndTaxDb },
{"taxonomyReport", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile }}},
{"filtertaxdb", filtertaxdb, &par.filtertaxdb, COMMAND_TAXONOMY,
"Filter taxonomy result database",
Expand Down
1 change: 1 addition & 0 deletions src/commons/Command.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,5 @@ std::vector<int> DbValidator::flatfile = {Parameters::DBTYPE_FLATFILE};
std::vector<int> DbValidator::flatfileAndStdin = {Parameters::DBTYPE_FLATFILE, Parameters::DBTYPE_STDIN};
std::vector<int> DbValidator::flatfileStdinAndGeneric = {Parameters::DBTYPE_FLATFILE, Parameters::DBTYPE_STDIN, Parameters::DBTYPE_GENERIC_DB};
std::vector<int> DbValidator::resultDb = {Parameters::DBTYPE_ALIGNMENT_RES, Parameters::DBTYPE_PREFILTER_RES, Parameters::DBTYPE_PREFILTER_REV_RES, Parameters::DBTYPE_CLUSTER_RES};
std::vector<int> DbValidator::resultAndTaxDb = {Parameters::DBTYPE_ALIGNMENT_RES, Parameters::DBTYPE_PREFILTER_RES, Parameters::DBTYPE_PREFILTER_REV_RES, Parameters::DBTYPE_CLUSTER_RES, Parameters::DBTYPE_TAXONOMICAL_RESULT};
std::vector<int> DbValidator::empty = {};
1 change: 1 addition & 0 deletions src/commons/Command.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ struct DbValidator {
static std::vector<int> allDb;
static std::vector<int> allDbAndFlat;
static std::vector<int> taxResult;
static std::vector<int> resultAndTaxDb;
static std::vector<int> directory;
static std::vector<int> flatfile;
static std::vector<int> flatfileAndStdin;
Expand Down
64 changes: 42 additions & 22 deletions src/taxonomy/taxonomyreport.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,53 +130,73 @@ int taxonomyreport(int argc, const char **argv, const Command &command) {
Parameters &par = Parameters::getInstance();
par.parseParameters(argc, argv, command, true, 0, 0);

// 1. Read taxonomy
NcbiTaxonomy *taxDB = NcbiTaxonomy::openTaxonomy(par.db1);

DBReader<unsigned int> reader(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA | DBReader<unsigned int>::USE_INDEX);
reader.open(DBReader<unsigned int>::LINEAR_ACCCESS);

// support reading both LCA databases and result databases (e.g. alignment)
const bool isTaxonomyInput = Parameters::isEqualDbtype(reader.getDbtype(), Parameters::DBTYPE_TAXONOMICAL_RESULT);
std::vector<std::pair<unsigned int, unsigned int>> mapping;
if (FileUtil::fileExists(std::string(par.db1 + "_mapping").c_str()) == false) {
Debug(Debug::ERROR) << par.db1 + "_mapping" << " does not exist. Please create the taxonomy mapping!\n";
EXIT(EXIT_FAILURE);
}
bool isSorted = Util::readMapping(par.db1 + "_mapping", mapping);
if (isSorted == false) {
std::stable_sort(mapping.begin(), mapping.end(), compareToFirstInt);
if (isTaxonomyInput == false) {
if (FileUtil::fileExists(std::string(par.db1 + "_mapping").c_str()) == false) {
Debug(Debug::ERROR) << par.db1 + "_mapping" << " does not exist. Please create the taxonomy mapping!\n";
EXIT(EXIT_FAILURE);
}
bool isSorted = Util::readMapping(par.db1 + "_mapping", mapping);
if (isSorted == false) {
std::stable_sort(mapping.begin(), mapping.end(), compareToFirstInt);
}
}

DBReader<unsigned int> reader(par.db2.c_str(), par.db2Index.c_str(), 1, DBReader<unsigned int>::USE_DATA | DBReader<unsigned int>::USE_INDEX);
reader.open(DBReader<unsigned int>::LINEAR_ACCCESS);

FILE *resultFP = FileUtil::openAndDelete(par.db3.c_str(), "w");

// 2. Read LCA file
std::unordered_map<TaxID, unsigned int> taxCounts;
Debug::Progress progress(reader.getSize());
// #pragma omp parallel
#pragma omp parallel
{
const char *entry[255];
unsigned int thread_idx = 0;
#ifdef OPENMP
thread_idx = (unsigned int) omp_get_thread_num();
#endif

// #pragma omp for schedule(dynamic, 10) reduction (+:taxonNotFound, found)
std::unordered_map<TaxID, unsigned int> localTaxCounts;
#pragma omp for schedule(dynamic, 10)
for (size_t i = 0; i < reader.getSize(); ++i) {
progress.updateProgress();

char *data = reader.getData(i, thread_idx);
while (*data != '\0') {
if (isTaxonomyInput) {
TaxID taxon = Util::fast_atoi<int>(data);
++localTaxCounts[taxon];
} else {
// match dbKey to its taxon based on mapping
std::pair<unsigned int, unsigned int> val;
val.first = Util::fast_atoi<unsigned int>(data);
std::vector<std::pair<unsigned int, unsigned int>>::iterator mappingIt;
mappingIt = std::upper_bound(mapping.begin(), mapping.end(), val, compareToFirstInt);
if (mappingIt != mapping.end() && mappingIt->first == val.first) {
++localTaxCounts[mappingIt->second];
}
}
data = Util::skipLine(data);
}
}

const size_t columns = Util::getWordsOfLine(data, entry, 255);
if (columns == 0) {
Debug(Debug::WARNING) << "Empty entry: " << i << "!";
// merge maps again
#pragma omp critical
for (std::unordered_map<TaxID, unsigned int>::const_iterator it = localTaxCounts.cbegin(); it != localTaxCounts.cend(); ++it) {
if (taxCounts[it->first]) {
taxCounts[it->first] += it->second;
} else {
int taxon = Util::fast_atoi<int>(entry[0]);
++taxCounts[taxon];
taxCounts[it->first] = it->second;
}
}
}
Debug(Debug::INFO) << "Found " << taxCounts.size() << " different taxa for " << reader.getSize() << " different reads.\n";
Debug(Debug::INFO) << "Found " << taxCounts.size() << " different taxa for " << reader.getSize() << " different reads\n";
unsigned int unknownCnt = (taxCounts.find(0) != taxCounts.end()) ? taxCounts.at(0) : 0;
Debug(Debug::INFO) << unknownCnt << " reads are unclassified.\n";
Debug(Debug::INFO) << unknownCnt << " reads are unclassified\n";
const size_t entryCount = reader.getSize();
reader.close();

Expand Down

0 comments on commit 0828d86

Please sign in to comment.