From 0828d86539a4b6d7f64bc369a5b29920862afc5a Mon Sep 17 00:00:00 2001 From: Milot Mirdita Date: Tue, 2 Feb 2021 21:04:33 +0100 Subject: [PATCH] Allow result database input in taxonomyreport #401 --- src/MMseqsBase.cpp | 4 +-- src/commons/Command.cpp | 1 + src/commons/Command.h | 1 + src/taxonomy/taxonomyreport.cpp | 64 +++++++++++++++++++++------------ 4 files changed, 46 insertions(+), 24 deletions(-) diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp index 7deabbaf7..70c06dd21 100644 --- a/src/MMseqsBase.cpp +++ b/src/MMseqsBase.cpp @@ -394,9 +394,9 @@ std::vector baseCommands = { "Create a taxonomy report in Kraken or Krona format", NULL, "Milot Mirdita & Florian Breitwieser ", - " ", + " ", CITATION_TAXONOMY, {{"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_TAXONOMY, &DbValidator::taxSequenceDb }, - {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::taxResult }, + {"taxDB/resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::resultAndTaxDb }, {"taxonomyReport", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile }}}, {"filtertaxdb", filtertaxdb, &par.filtertaxdb, COMMAND_TAXONOMY, "Filter taxonomy result database", diff --git a/src/commons/Command.cpp b/src/commons/Command.cpp index 3c8714e0d..2e6097767 100644 --- a/src/commons/Command.cpp +++ b/src/commons/Command.cpp @@ -54,4 +54,5 @@ std::vector DbValidator::flatfile = {Parameters::DBTYPE_FLATFILE}; std::vector DbValidator::flatfileAndStdin = {Parameters::DBTYPE_FLATFILE, Parameters::DBTYPE_STDIN}; std::vector DbValidator::flatfileStdinAndGeneric = {Parameters::DBTYPE_FLATFILE, Parameters::DBTYPE_STDIN, Parameters::DBTYPE_GENERIC_DB}; std::vector DbValidator::resultDb = {Parameters::DBTYPE_ALIGNMENT_RES, Parameters::DBTYPE_PREFILTER_RES, Parameters::DBTYPE_PREFILTER_REV_RES, Parameters::DBTYPE_CLUSTER_RES}; +std::vector DbValidator::resultAndTaxDb = {Parameters::DBTYPE_ALIGNMENT_RES, Parameters::DBTYPE_PREFILTER_RES, Parameters::DBTYPE_PREFILTER_REV_RES, Parameters::DBTYPE_CLUSTER_RES, Parameters::DBTYPE_TAXONOMICAL_RESULT}; std::vector DbValidator::empty = {}; diff --git a/src/commons/Command.h b/src/commons/Command.h index a6d56e255..9b96f5b28 100644 --- a/src/commons/Command.h +++ b/src/commons/Command.h @@ -62,6 +62,7 @@ struct DbValidator { static std::vector allDb; static std::vector allDbAndFlat; static std::vector taxResult; + static std::vector resultAndTaxDb; static std::vector directory; static std::vector flatfile; static std::vector flatfileAndStdin; diff --git a/src/taxonomy/taxonomyreport.cpp b/src/taxonomy/taxonomyreport.cpp index 2e01401c6..22f45a895 100644 --- a/src/taxonomy/taxonomyreport.cpp +++ b/src/taxonomy/taxonomyreport.cpp @@ -130,53 +130,73 @@ int taxonomyreport(int argc, const char **argv, const Command &command) { Parameters &par = Parameters::getInstance(); par.parseParameters(argc, argv, command, true, 0, 0); - // 1. Read taxonomy NcbiTaxonomy *taxDB = NcbiTaxonomy::openTaxonomy(par.db1); + DBReader reader(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader::USE_DATA | DBReader::USE_INDEX); + reader.open(DBReader::LINEAR_ACCCESS); + + // support reading both LCA databases and result databases (e.g. alignment) + const bool isTaxonomyInput = Parameters::isEqualDbtype(reader.getDbtype(), Parameters::DBTYPE_TAXONOMICAL_RESULT); std::vector> mapping; - if (FileUtil::fileExists(std::string(par.db1 + "_mapping").c_str()) == false) { - Debug(Debug::ERROR) << par.db1 + "_mapping" << " does not exist. Please create the taxonomy mapping!\n"; - EXIT(EXIT_FAILURE); - } - bool isSorted = Util::readMapping(par.db1 + "_mapping", mapping); - if (isSorted == false) { - std::stable_sort(mapping.begin(), mapping.end(), compareToFirstInt); + if (isTaxonomyInput == false) { + if (FileUtil::fileExists(std::string(par.db1 + "_mapping").c_str()) == false) { + Debug(Debug::ERROR) << par.db1 + "_mapping" << " does not exist. Please create the taxonomy mapping!\n"; + EXIT(EXIT_FAILURE); + } + bool isSorted = Util::readMapping(par.db1 + "_mapping", mapping); + if (isSorted == false) { + std::stable_sort(mapping.begin(), mapping.end(), compareToFirstInt); + } } - DBReader reader(par.db2.c_str(), par.db2Index.c_str(), 1, DBReader::USE_DATA | DBReader::USE_INDEX); - reader.open(DBReader::LINEAR_ACCCESS); - FILE *resultFP = FileUtil::openAndDelete(par.db3.c_str(), "w"); - // 2. Read LCA file std::unordered_map taxCounts; Debug::Progress progress(reader.getSize()); -// #pragma omp parallel +#pragma omp parallel { - const char *entry[255]; unsigned int thread_idx = 0; #ifdef OPENMP thread_idx = (unsigned int) omp_get_thread_num(); #endif -// #pragma omp for schedule(dynamic, 10) reduction (+:taxonNotFound, found) + std::unordered_map localTaxCounts; +#pragma omp for schedule(dynamic, 10) for (size_t i = 0; i < reader.getSize(); ++i) { progress.updateProgress(); char *data = reader.getData(i, thread_idx); + while (*data != '\0') { + if (isTaxonomyInput) { + TaxID taxon = Util::fast_atoi(data); + ++localTaxCounts[taxon]; + } else { + // match dbKey to its taxon based on mapping + std::pair val; + val.first = Util::fast_atoi(data); + std::vector>::iterator mappingIt; + mappingIt = std::upper_bound(mapping.begin(), mapping.end(), val, compareToFirstInt); + if (mappingIt != mapping.end() && mappingIt->first == val.first) { + ++localTaxCounts[mappingIt->second]; + } + } + data = Util::skipLine(data); + } + } - const size_t columns = Util::getWordsOfLine(data, entry, 255); - if (columns == 0) { - Debug(Debug::WARNING) << "Empty entry: " << i << "!"; + // merge maps again +#pragma omp critical + for (std::unordered_map::const_iterator it = localTaxCounts.cbegin(); it != localTaxCounts.cend(); ++it) { + if (taxCounts[it->first]) { + taxCounts[it->first] += it->second; } else { - int taxon = Util::fast_atoi(entry[0]); - ++taxCounts[taxon]; + taxCounts[it->first] = it->second; } } } - Debug(Debug::INFO) << "Found " << taxCounts.size() << " different taxa for " << reader.getSize() << " different reads.\n"; + Debug(Debug::INFO) << "Found " << taxCounts.size() << " different taxa for " << reader.getSize() << " different reads\n"; unsigned int unknownCnt = (taxCounts.find(0) != taxCounts.end()) ? taxCounts.at(0) : 0; - Debug(Debug::INFO) << unknownCnt << " reads are unclassified.\n"; + Debug(Debug::INFO) << unknownCnt << " reads are unclassified\n"; const size_t entryCount = reader.getSize(); reader.close();