From 028409241891fb2373d2eed3a852d241e53b1bc6 Mon Sep 17 00:00:00 2001 From: LunaJang Date: Thu, 4 Apr 2024 17:54:59 +0900 Subject: [PATCH 01/10] recover longest orf after fragment elimination --- data/workflow/taxpercontig.sh | 6 +- src/CommandDeclarations.h | 1 + src/MMseqsBase.cpp | 8 ++ src/util/CMakeLists.txt | 1 + src/util/recoverlongestorf.cpp | 149 +++++++++++++++++++++++++++++++++ src/workflow/Taxonomy.cpp | 1 + 6 files changed, 165 insertions(+), 1 deletion(-) create mode 100644 src/util/recoverlongestorf.cpp diff --git a/data/workflow/taxpercontig.sh b/data/workflow/taxpercontig.sh index 299ac3646..f350df220 100644 --- a/data/workflow/taxpercontig.sh +++ b/data/workflow/taxpercontig.sh @@ -45,7 +45,11 @@ if [ -n "${ORF_FILTER}" ]; then fi if notExists "${TMP_PATH}/orfs_aln.list"; then - awk '$3 > 1 { print $1 }' "${TMP_PATH}/orfs_aln.index" > "${TMP_PATH}/orfs_aln.list" + # shellcheck disable=SC2086 + "$MMSEQS" recoverlongestorf "${ORFS_DB}" "${TMP_PATH}/orfs_aln" "${TMP_PATH}/orfs_aln_recovered.list" ${THREADS_PAR} \ + || fail "recoverlongestorf died" + awk '$3 > 1 { print $1 }' "${TMP_PATH}/orfs_aln.index" > "${TMP_PATH}/orfs_aln_remain.list" + cat "${TMP_PATH}/orfs_aln_recovered.list" "${TMP_PATH}/orfs_aln_remain.list" > "${TMP_PATH}/orfs_aln.list" fi if notExists "${TMP_PATH}/orfs_filter.dbtype"; then diff --git a/src/CommandDeclarations.h b/src/CommandDeclarations.h index e8eac296c..3c4abb4e0 100644 --- a/src/CommandDeclarations.h +++ b/src/CommandDeclarations.h @@ -98,6 +98,7 @@ extern int ungappedprefilter(int argc, const char **argv, const Command& command extern int gappedprefilter(int argc, const char **argv, const Command& command); extern int unpackdb(int argc, const char **argv, const Command& command); extern int rbh(int argc, const char **argv, const Command& command); +extern int recoverlongestorf(int argc, const char **argv, const Command& command); extern int result2flat(int argc, const char **argv, const Command& command); extern int result2msa(int argc, const char **argv, const Command& command); extern int result2dnamsa(int argc, const char **argv, const Command& command); diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp index 282e740d1..720aeaccb 100644 --- a/src/MMseqsBase.cpp +++ b/src/MMseqsBase.cpp @@ -907,6 +907,14 @@ std::vector baseCommands = { "Eli Levy Karin ", " ", CITATION_MMSEQS2, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}}, + {"recoverlongestorf", recoverlongestorf, &par.onlythreads, COMMAND_EXPERT, + "Recover longest ORF for taxonomy annotation after elimination", + NULL, + "Sung-eun Jang", + " ", + CITATION_MMSEQS2, {{"orfDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb}, + {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb}, + {"tsvFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, {"reverseseq", reverseseq, &par.reverseseq, COMMAND_SEQUENCE, "Reverse (without complement) sequences", NULL, diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index 7fa7b0603..c43c3356d 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -47,6 +47,7 @@ set(util_source_files util/profile2pssm.cpp util/profile2neff.cpp util/profile2seq.cpp + util/recoverlongestorf.cpp util/result2dnamsa.cpp util/result2flat.cpp util/result2msa.cpp diff --git a/src/util/recoverlongestorf.cpp b/src/util/recoverlongestorf.cpp new file mode 100644 index 000000000..60fb3dcc1 --- /dev/null +++ b/src/util/recoverlongestorf.cpp @@ -0,0 +1,149 @@ +#include "Parameters.h" +#include "DBReader.h" +#include "DBWriter.h" +#include "Debug.h" +#include "Util.h" +#include "Orf.h" + +#include +#include + +#ifdef OPENMP +#include +#endif + +int recoverlongestorf(int argc, const char **argv, const Command &command) { + Parameters &par = Parameters::getInstance(); + par.parseParameters(argc, argv, command, true, 0, 0); + + DBReader headerReader(par.hdr1.c_str(), par.hdr1Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); + headerReader.open(DBReader::LINEAR_ACCCESS); + + DBReader resultReader(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); + resultReader.open(DBReader::LINEAR_ACCCESS); + + DBWriter writer(par.db3.c_str(), par.db3Index.c_str(), 1, false, Parameters::DBTYPE_OMIT_FILE); + writer.open(); + + Debug::Progress progress(resultReader.getSize()); + std::unordered_map> contigToLongest; +#pragma omp parallel + { + unsigned int thread_idx = 0; +#ifdef OPENMP + thread_idx = static_cast(omp_get_thread_num()); + +#endif + std::unordered_map> localContigToLongest; + +#pragma omp for schedule(dynamic, 100) + for (size_t id = 0; id < headerReader.getSize(); ++id) { + progress.updateProgress(); + unsigned int orfKey = headerReader.getDbKey(id); + char *orfHeader = headerReader.getData(id, thread_idx); + Orf::SequenceLocation orf = Orf::parseOrfHeader(orfHeader); + unsigned int contigKey = orf.id; + size_t orfLen = std::max(orf.from, orf.to) - std::min(orf.from, orf.to) + 1; + std::unordered_map>::iterator it = localContigToLongest.find(contigKey); + if (it != localContigToLongest.end()) { + std::pair orfKeyToLength = it->second; + if (orfLen > orfKeyToLength.second) { + it->second = std::make_pair(orfKey, orfLen); + } + } else { + localContigToLongest.emplace(contigKey, std::make_pair(orfKey, orfLen)); + } + } + +#pragma omp critical + { + for (auto &entry : localContigToLongest) { + auto &contigKey = entry.first; + auto &localData = entry.second; + auto it = contigToLongest.find(contigKey); + if (it != contigToLongest.end()) { + if (localData.second > it->second.second) { + it->second = localData; + } + } else { + contigToLongest[contigKey] = localData; + } + } + } + } + + progress.reset(resultReader.getSize()); + std::unordered_set acceptedContigs; + std::unordered_set eliminatedContigs; +#pragma omp parallel + { + int thread_idx = 0; +#ifdef OPENMP + thread_idx = omp_get_thread_num(); +#endif + char keyBuffer[1024]; + std::string resultBuffer; + resultBuffer.reserve(1024 * 1024); + + std::unordered_set localAcceptedContigs; + std::unordered_set localEliminatedContigs; +#pragma omp for schedule(dynamic, 5) + for (size_t i = 0; i < resultReader.getSize(); ++i) { + progress.updateProgress(); + + unsigned int key = resultReader.getDbKey(i); + size_t entryLength = resultReader.getEntryLen(i); + if (entryLength > 1) { + size_t id = headerReader.getId(key); + char *orfHeader = headerReader.getData(id, thread_idx); + Orf::SequenceLocation orf = Orf::parseOrfHeader(orfHeader); + unsigned int contigKey = orf.id; + localAcceptedContigs.emplace(contigKey); + } + + size_t id = headerReader.getId(key); + char *orfHeader = headerReader.getData(id, thread_idx); + Orf::SequenceLocation orf = Orf::parseOrfHeader(orfHeader); + unsigned int contigKey = orf.id; + localEliminatedContigs.emplace(contigKey); + } + +#pragma omp critical + { + acceptedContigs.insert(localAcceptedContigs.begin(), localAcceptedContigs.end()); + eliminatedContigs.insert(localEliminatedContigs.begin(), localEliminatedContigs.end()); + } + } + + for (auto it = eliminatedContigs.begin(); it != eliminatedContigs.end(); ) { + if (acceptedContigs.find(*it) != acceptedContigs.end()) { + it = eliminatedContigs.erase(it); + } else { + ++it; + } + } + + std::string resultBuffer; + resultBuffer.reserve(1024 * 1024); + for (auto contigIt = eliminatedContigs.begin(); contigIt != eliminatedContigs.end(); ++contigIt) { + unsigned int contigKey = *contigIt; + std::unordered_map>::iterator it = contigToLongest.find(contigKey); + if (it != contigToLongest.end()) { + unsigned int orfKey = it->second.first; + resultBuffer.append(SSTR(orfKey)); + resultBuffer.append(1, '\n'); + writer.writeData(resultBuffer.c_str(), resultBuffer.length(), orfKey, 0, false, false); + resultBuffer.clear(); + } else { + Debug(Debug::ERROR) << "Missing contig " << contigKey << "\n"; + EXIT(EXIT_FAILURE); + } + } + + writer.close(true); + FileUtil::remove(writer.getIndexFileName()); + headerReader.close(); + resultReader.close(); + + return EXIT_SUCCESS; +} diff --git a/src/workflow/Taxonomy.cpp b/src/workflow/Taxonomy.cpp index 4dbd34c73..78bb7606c 100644 --- a/src/workflow/Taxonomy.cpp +++ b/src/workflow/Taxonomy.cpp @@ -95,6 +95,7 @@ int taxonomy(int argc, const char **argv, const Command& command) { cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); cmd.addVariable("RUNNER", par.runner.c_str()); cmd.addVariable("THREADS_COMP_PAR", par.createParameterString(par.threadsandcompression).c_str()); + cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str()); cmd.addVariable("VERBOSITY", par.createParameterString(par.onlyverbosity).c_str()); if (searchMode & Parameters::SEARCH_MODE_FLAG_QUERY_TRANSLATED && !(searchMode & Parameters::SEARCH_MODE_FLAG_TARGET_TRANSLATED)) { From 4c6f74f28dbac6783ae58baca25f9279e72fd063 Mon Sep 17 00:00:00 2001 From: LunaJang Date: Thu, 4 Apr 2024 18:15:36 +0900 Subject: [PATCH 02/10] delete unused variable --- src/util/recoverlongestorf.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/util/recoverlongestorf.cpp b/src/util/recoverlongestorf.cpp index 60fb3dcc1..b25d89476 100644 --- a/src/util/recoverlongestorf.cpp +++ b/src/util/recoverlongestorf.cpp @@ -81,7 +81,6 @@ int recoverlongestorf(int argc, const char **argv, const Command &command) { #ifdef OPENMP thread_idx = omp_get_thread_num(); #endif - char keyBuffer[1024]; std::string resultBuffer; resultBuffer.reserve(1024 * 1024); From a035b87ede804380a7dd03a6083012f0a1a641b1 Mon Sep 17 00:00:00 2001 From: LunaJang Date: Mon, 2 Sep 2024 16:01:14 +0900 Subject: [PATCH 03/10] add ORF_SKIP option in translated_search --- data/workflow/translated_search.sh | 48 +++++++++++++++++++++++------- src/commons/Parameters.cpp | 4 +++ src/commons/Parameters.h | 2 ++ src/workflow/Search.cpp | 12 +++++++- 4 files changed, 55 insertions(+), 11 deletions(-) diff --git a/data/workflow/translated_search.sh b/data/workflow/translated_search.sh index b2e4252d4..61c8c4f3a 100755 --- a/data/workflow/translated_search.sh +++ b/data/workflow/translated_search.sh @@ -21,10 +21,24 @@ TMP_PATH="$4" QUERY="$1" QUERY_ORF="$1" if [ -n "$QUERY_NUCL" ]; then - if notExists "${TMP_PATH}/q_orfs_aa.dbtype"; then - # shellcheck disable=SC2086 - "$MMSEQS" extractorfs "$1" "${TMP_PATH}/q_orfs_aa" ${ORF_PAR} \ - || fail "extract orfs step died" + # introduce EXTRACT_FRAMES_PAR, TRANSLATE_PAR, ORF_SKIP to Search.cpp and to Parameters::Parameters + if [ "$ORF_SKIP" ]; then + if notExists "${TMP_PATH}/q_orfs_aa.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" extractframes "$1" "${TMP_PATH}/q_orfs_nucl" ${EXTRACT_FRAMES_PAR} \ + || fail "extractframes died" + # we want to avoid this \/ + # shellcheck disable=SC2086 + "$MMSEQS" translatenucs "${TMP_PATH}/q_orfs_nucl" "${TMP_PATH}/q_orfs_aa" ${TRANSLATE_PAR} \ + || fail "translatenucs died" + "$MMSEQS" rmdb "${TMP_PATH}/q_orfs_nucl" + fi + else + if notExists "${TMP_PATH}/q_orfs_aa.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" extractorfs "$1" "${TMP_PATH}/q_orfs_aa" ${ORF_PAR} \ + || fail "extract orfs step died" + fi fi QUERY="${TMP_PATH}/q_orfs_aa" QUERY_ORF="${TMP_PATH}/q_orfs_aa" @@ -34,13 +48,27 @@ TARGET="$2" TARGET_ORF="$2" if [ -n "$TARGET_NUCL" ]; then if [ -n "$NO_TARGET_INDEX" ]; then - if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then - # shellcheck disable=SC2086 - "$MMSEQS" extractorfs "$2" "${TMP_PATH}/t_orfs_aa" ${ORF_PAR} \ - || fail "extract target orfs step died" + if [ "$ORF_SKIP" ]; then + if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" extractframes "$1" "${TMP_PATH}/t_orfs_nucl" ${EXTRACT_FRAMES_PAR} \ + || fail "extractframes died" + # we want to avoid this \/ + # shellcheck disable=SC2086 + "$MMSEQS" translatenucs "${TMP_PATH}/t_orfs_nucl" "${TMP_PATH}/t_orfs_aa" ${TRANSLATE_PAR} \ + || fail "translatenucs died" + "$MMSEQS" rmdb "${TMP_PATH}/t_orfs_nucl" + fi + else + if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then + # same here + # shellcheck disable=SC2086 + "$MMSEQS" extractorfs "$2" "${TMP_PATH}/t_orfs_aa" ${ORF_PAR} \ + || fail "extract target orfs step died" + fi fi - TARGET="${TMP_PATH}/t_orfs_aa" - TARGET_ORF="${TMP_PATH}/t_orfs_aa" + TARGET="${TMP_PATH}/t_orfs_aa" + TARGET_ORF="${TMP_PATH}/t_orfs_aa" fi fi diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index 34e429811..f78711964 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -172,6 +172,7 @@ Parameters::Parameters(): PARAM_ORF_FILTER_S(PARAM_ORF_FILTER_S_ID, "--orf-filter-s", "ORF filter sensitivity", "Sensitivity used for query ORF prefiltering", typeid(float), (void *) &orfFilterSens, "^[0-9]*(\\.[0-9]+)?$"), PARAM_ORF_FILTER_E(PARAM_ORF_FILTER_E_ID, "--orf-filter-e", "ORF filter e-value", "E-value threshold used for query ORF prefiltering", typeid(double), (void *) &orfFilterEval, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|[0-9]*(\\.[0-9]+)?$"), PARAM_LCA_SEARCH(PARAM_LCA_SEARCH_ID, "--lca-search", "LCA search mode", "Efficient search for LCA candidates", typeid(bool), (void *) &lcaSearch, "", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT), + PARAM_ORF_SKIP(PARAM_ORF_SKIP_ID, "--orf-skip", "ORF skip mode", "???", typeid(bool), (void *) &orfSkip, ""), // easysearch PARAM_GREEDY_BEST_HITS(PARAM_GREEDY_BEST_HITS_ID, "--greedy-best-hits", "Greedy best hits", "Choose the best hits greedily to cover the query", typeid(bool), (void *) &greedyBestHits, ""), // extractorfs @@ -1251,6 +1252,7 @@ Parameters::Parameters(): searchworkflow = combineList(searchworkflow, rescorediagonal); searchworkflow = combineList(searchworkflow, result2profile); searchworkflow = combineList(searchworkflow, extractorfs); + searchworkflow = combineList(searchworkflow, extractframes); searchworkflow = combineList(searchworkflow, translatenucs); searchworkflow = combineList(searchworkflow, splitsequence); searchworkflow = combineList(searchworkflow, offsetalignment); @@ -1268,6 +1270,7 @@ Parameters::Parameters(): searchworkflow.push_back(&PARAM_RUNNER); searchworkflow.push_back(&PARAM_REUSELATEST); searchworkflow.push_back(&PARAM_REMOVE_TMP_FILES); + searchworkflow.push_back(&PARAM_ORF_SKIP); linsearchworkflow = combineList(align, kmersearch); linsearchworkflow = combineList(linsearchworkflow, swapresult); @@ -2277,6 +2280,7 @@ void Parameters::setDefaults() { orfFilterSens = 2.0; orfFilterEval = 100; lcaSearch = false; + orfSkip = false; greedyBestHits = false; diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h index 745d2f809..d8e06eabd 100644 --- a/src/commons/Parameters.h +++ b/src/commons/Parameters.h @@ -467,6 +467,7 @@ class Parameters { float orfFilterSens; double orfFilterEval; bool lcaSearch; + bool orfSkip; // easysearch bool greedyBestHits; @@ -886,6 +887,7 @@ class Parameters { PARAMETER(PARAM_ORF_FILTER_S) PARAMETER(PARAM_ORF_FILTER_E) PARAMETER(PARAM_LCA_SEARCH) + PARAMETER(PARAM_ORF_SKIP) // easysearch PARAMETER(PARAM_GREEDY_BEST_HITS) diff --git a/src/workflow/Search.cpp b/src/workflow/Search.cpp index 62ec132e2..e6f350797 100644 --- a/src/workflow/Search.cpp +++ b/src/workflow/Search.cpp @@ -208,6 +208,9 @@ int search(int argc, const char **argv, const Command& command) { for (size_t i = 0; i < par.extractorfs.size(); i++) { par.extractorfs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); } + for (size_t i = 0; i < par.extractframes.size(); i++) { + par.extractframes[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); + } for (size_t i = 0; i < par.translatenucs.size(); i++) { par.translatenucs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); } @@ -541,9 +544,16 @@ int search(int argc, const char **argv, const Command& command) { par.subDbMode = 1; cmd.addVariable("CREATESUBDB_PAR", par.createParameterString(par.createsubdb).c_str()); par.translate = 1; - cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str()); cmd.addVariable("OFFSETALIGNMENT_PAR", par.createParameterString(par.offsetalignment).c_str()); cmd.addVariable("SEARCH", program.c_str()); + if (par.orfSkip) { + cmd.addVariable("ORF_SKIP", "TRUE"); + cmd.addVariable("TRANSLATE_PAR", par.createParameterString(par.translatenucs).c_str()); + cmd.addVariable("EXTRACT_FRAMES_PAR", par.createParameterString(par.extractframes).c_str()); + } + else{ + cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str()); + } program = std::string(tmpDir + "/translated_search.sh"); }else if(searchMode & Parameters::SEARCH_MODE_FLAG_QUERY_NUCLEOTIDE && searchMode & Parameters::SEARCH_MODE_FLAG_TARGET_NUCLEOTIDE){ From 9dce49a7bf41accc6e3903a18432f9600457a2e2 Mon Sep 17 00:00:00 2001 From: LunaJang Date: Tue, 3 Sep 2024 15:54:07 +0900 Subject: [PATCH 04/10] change extractframes function to consider multiple frames in forward/reverse --- src/util/extractframes.cpp | 76 ++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 35 deletions(-) diff --git a/src/util/extractframes.cpp b/src/util/extractframes.cpp index 348d40bf7..94f253c8d 100644 --- a/src/util/extractframes.cpp +++ b/src/util/extractframes.cpp @@ -31,8 +31,9 @@ int extractframes(int argc, const char **argv, const Command& command) { unsigned int forwardFrames = Orf::getFrames(par.forwardFrames); unsigned int reverseFrames = Orf::getFrames(par.reverseFrames); - Debug::Progress progress(reader.getSize()); + unsigned int totalFrames = __builtin_popcount(forwardFrames) + __builtin_popcount(reverseFrames); + Debug::Progress progress(reader.getSize()); #pragma omp parallel { int thread_idx = 0; @@ -52,28 +53,29 @@ int extractframes(int argc, const char **argv, const Command& command) { for (unsigned int i = queryFrom; i < (queryFrom + querySize); ++i){ progress.updateProgress(); - unsigned int key = reader.getDbKey(i); + unsigned int key = reader.getDbKey(i) * totalFrames; const char* data = reader.getData(i, thread_idx); size_t dataLength = reader.getEntryLen(i); size_t bufferLen; - switch (forwardFrames){ - case Orf::FRAME_1: - // -1 to ignore the null byte copy the new line - sequenceWriter.writeData(data, dataLength - 1, key, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, key, static_cast(0), dataLength - 3, 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); - break; - case Orf::FRAME_2: - sequenceWriter.writeData(data + 1, dataLength - 2, key, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, key, static_cast(1), dataLength - 4, 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); - break; - case Orf::FRAME_3: - sequenceWriter.writeData(data + 2, dataLength - 3, key, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, key, static_cast(2), dataLength - 5, 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); - break; + if (forwardFrames & Orf::FRAME_1) { + unsigned int curKey = key++; + // -1 to ignore the null byte copy the new line + sequenceWriter.writeData(data, dataLength - 1, curKey, thread_idx); + bufferLen = Orf::writeOrfHeader(buffer, curKey, static_cast(0), dataLength - 3, 0, 0); + headerWriter.writeData(buffer, bufferLen, curKey, thread_idx); + } + if (forwardFrames & Orf::FRAME_2) { + unsigned int curKey = key++; + sequenceWriter.writeData(data + 1, dataLength - 2, curKey, thread_idx); + bufferLen = Orf::writeOrfHeader(buffer, curKey, static_cast(1), dataLength - 4, 0, 0); + headerWriter.writeData(buffer, bufferLen, curKey, thread_idx); + } + if (forwardFrames & Orf::FRAME_3) { + unsigned int curKey = key++; + sequenceWriter.writeData(data + 2, dataLength - 3, curKey, thread_idx); + bufferLen = Orf::writeOrfHeader(buffer, curKey, static_cast(2), dataLength - 5, 0, 0); + headerWriter.writeData(buffer, bufferLen, curKey, thread_idx); } if(reverseFrames != 0){ @@ -91,23 +93,27 @@ int extractframes(int argc, const char **argv, const Command& command) { reverseComplementStr.push_back('\n'); } - switch (reverseFrames){ - case Orf::FRAME_1: - sequenceWriter.writeData(reverseComplementStr.c_str(), reverseComplementStr.size(), key, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 2, static_cast(0), 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); - break; - case Orf::FRAME_2: - sequenceWriter.writeData(reverseComplementStr.c_str()+1, reverseComplementStr.size()-1, key, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 3, static_cast(1), 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); - break; - case Orf::FRAME_3: - sequenceWriter.writeData(reverseComplementStr.c_str()+2, reverseComplementStr.size()-2, key, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 4, static_cast(2), 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); - break; + if (reverseFrames & Orf::FRAME_1) { + unsigned int curKey = key++; + sequenceWriter.writeData(reverseComplementStr.c_str(), reverseComplementStr.size(), curKey, thread_idx); + bufferLen = Orf::writeOrfHeader(buffer, curKey, reverseComplementStr.size() - 2, static_cast(0), 0, 0); + headerWriter.writeData(buffer, bufferLen, curKey, thread_idx); } + + if (reverseFrames & Orf::FRAME_2) { + unsigned int curKey = key++; + sequenceWriter.writeData(reverseComplementStr.c_str()+1, reverseComplementStr.size()-1, curKey, thread_idx); + bufferLen = Orf::writeOrfHeader(buffer, curKey, reverseComplementStr.size() - 3, static_cast(1), 0, 0); + headerWriter.writeData(buffer, bufferLen, curKey, thread_idx); + } + + if (reverseFrames & Orf::FRAME_3) { + unsigned int curKey = key++; + sequenceWriter.writeData(reverseComplementStr.c_str()+2, reverseComplementStr.size()-2, curKey, thread_idx); + bufferLen = Orf::writeOrfHeader(buffer, curKey, reverseComplementStr.size() - 4, static_cast(2), 0, 0); + headerWriter.writeData(buffer, bufferLen, curKey, thread_idx); + } + reverseComplementStr.clear(); } } From b1e513993cf2bde7333255a1b405d012b2d00857 Mon Sep 17 00:00:00 2001 From: LunaJang Date: Thu, 5 Sep 2024 18:25:09 +0900 Subject: [PATCH 05/10] fix error with DB key --- src/util/extractframes.cpp | 45 ++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/src/util/extractframes.cpp b/src/util/extractframes.cpp index 94f253c8d..5b530ee8e 100644 --- a/src/util/extractframes.cpp +++ b/src/util/extractframes.cpp @@ -31,7 +31,6 @@ int extractframes(int argc, const char **argv, const Command& command) { unsigned int forwardFrames = Orf::getFrames(par.forwardFrames); unsigned int reverseFrames = Orf::getFrames(par.reverseFrames); - unsigned int totalFrames = __builtin_popcount(forwardFrames) + __builtin_popcount(reverseFrames); Debug::Progress progress(reader.getSize()); #pragma omp parallel @@ -53,29 +52,26 @@ int extractframes(int argc, const char **argv, const Command& command) { for (unsigned int i = queryFrom; i < (queryFrom + querySize); ++i){ progress.updateProgress(); - unsigned int key = reader.getDbKey(i) * totalFrames; + unsigned int key = reader.getDbKey(i); const char* data = reader.getData(i, thread_idx); size_t dataLength = reader.getEntryLen(i); size_t bufferLen; if (forwardFrames & Orf::FRAME_1) { - unsigned int curKey = key++; // -1 to ignore the null byte copy the new line - sequenceWriter.writeData(data, dataLength - 1, curKey, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, curKey, static_cast(0), dataLength - 3, 0, 0); - headerWriter.writeData(buffer, bufferLen, curKey, thread_idx); + sequenceWriter.writeData(data, dataLength - 1, key, thread_idx); + bufferLen = Orf::writeOrfHeader(buffer, key, static_cast(0), dataLength - 3, 0, 0); + headerWriter.writeData(buffer, bufferLen, key, thread_idx); } if (forwardFrames & Orf::FRAME_2) { - unsigned int curKey = key++; - sequenceWriter.writeData(data + 1, dataLength - 2, curKey, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, curKey, static_cast(1), dataLength - 4, 0, 0); - headerWriter.writeData(buffer, bufferLen, curKey, thread_idx); + sequenceWriter.writeData(data + 1, dataLength - 2, key, thread_idx); + bufferLen = Orf::writeOrfHeader(buffer, key, static_cast(1), dataLength - 4, 0, 0); + headerWriter.writeData(buffer, bufferLen, key, thread_idx); } if (forwardFrames & Orf::FRAME_3) { - unsigned int curKey = key++; - sequenceWriter.writeData(data + 2, dataLength - 3, curKey, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, curKey, static_cast(2), dataLength - 5, 0, 0); - headerWriter.writeData(buffer, bufferLen, curKey, thread_idx); + sequenceWriter.writeData(data + 2, dataLength - 3, key, thread_idx); + bufferLen = Orf::writeOrfHeader(buffer, key, static_cast(2), dataLength - 5, 0, 0); + headerWriter.writeData(buffer, bufferLen, key, thread_idx); } if(reverseFrames != 0){ @@ -94,24 +90,21 @@ int extractframes(int argc, const char **argv, const Command& command) { } if (reverseFrames & Orf::FRAME_1) { - unsigned int curKey = key++; - sequenceWriter.writeData(reverseComplementStr.c_str(), reverseComplementStr.size(), curKey, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, curKey, reverseComplementStr.size() - 2, static_cast(0), 0, 0); - headerWriter.writeData(buffer, bufferLen, curKey, thread_idx); + sequenceWriter.writeData(reverseComplementStr.c_str(), reverseComplementStr.size(), key, thread_idx); + bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 2, static_cast(0), 0, 0); + headerWriter.writeData(buffer, bufferLen, key, thread_idx); } if (reverseFrames & Orf::FRAME_2) { - unsigned int curKey = key++; - sequenceWriter.writeData(reverseComplementStr.c_str()+1, reverseComplementStr.size()-1, curKey, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, curKey, reverseComplementStr.size() - 3, static_cast(1), 0, 0); - headerWriter.writeData(buffer, bufferLen, curKey, thread_idx); + sequenceWriter.writeData(reverseComplementStr.c_str()+1, reverseComplementStr.size()-1, key, thread_idx); + bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 3, static_cast(1), 0, 0); + headerWriter.writeData(buffer, bufferLen, key, thread_idx); } if (reverseFrames & Orf::FRAME_3) { - unsigned int curKey = key++; - sequenceWriter.writeData(reverseComplementStr.c_str()+2, reverseComplementStr.size()-2, curKey, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, curKey, reverseComplementStr.size() - 4, static_cast(2), 0, 0); - headerWriter.writeData(buffer, bufferLen, curKey, thread_idx); + sequenceWriter.writeData(reverseComplementStr.c_str()+2, reverseComplementStr.size()-2, key, thread_idx); + bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 4, static_cast(2), 0, 0); + headerWriter.writeData(buffer, bufferLen, key, thread_idx); } reverseComplementStr.clear(); From 2c1f500c3915c7b7f8690ac71a488c586072ce49 Mon Sep 17 00:00:00 2001 From: LunaJang Date: Thu, 5 Sep 2024 18:32:17 +0900 Subject: [PATCH 06/10] make sure translated_search with skip_orf option (extractorfs->extractframes & translatenucs) works --- data/workflow/translated_search.sh | 13 ++++++------- src/workflow/Search.cpp | 3 +-- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/data/workflow/translated_search.sh b/data/workflow/translated_search.sh index 61c8c4f3a..d2f942d00 100755 --- a/data/workflow/translated_search.sh +++ b/data/workflow/translated_search.sh @@ -21,8 +21,7 @@ TMP_PATH="$4" QUERY="$1" QUERY_ORF="$1" if [ -n "$QUERY_NUCL" ]; then - # introduce EXTRACT_FRAMES_PAR, TRANSLATE_PAR, ORF_SKIP to Search.cpp and to Parameters::Parameters - if [ "$ORF_SKIP" ]; then + if [ "$ORF_SKIP" = "TRUE" ]; then if notExists "${TMP_PATH}/q_orfs_aa.dbtype"; then # shellcheck disable=SC2086 "$MMSEQS" extractframes "$1" "${TMP_PATH}/q_orfs_nucl" ${EXTRACT_FRAMES_PAR} \ @@ -31,7 +30,7 @@ if [ -n "$QUERY_NUCL" ]; then # shellcheck disable=SC2086 "$MMSEQS" translatenucs "${TMP_PATH}/q_orfs_nucl" "${TMP_PATH}/q_orfs_aa" ${TRANSLATE_PAR} \ || fail "translatenucs died" - "$MMSEQS" rmdb "${TMP_PATH}/q_orfs_nucl" + "$MMSEQS" rmdb "${TMP_PATH}/q_orfs_nucl" ${VERBOSITY} fi else if notExists "${TMP_PATH}/q_orfs_aa.dbtype"; then @@ -51,13 +50,13 @@ if [ -n "$NO_TARGET_INDEX" ]; then if [ "$ORF_SKIP" ]; then if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" extractframes "$1" "${TMP_PATH}/t_orfs_nucl" ${EXTRACT_FRAMES_PAR} \ + "$MMSEQS" extractframes "$2" "${TMP_PATH}/t_orfs_nucl" ${EXTRACT_FRAMES_PAR} \ || fail "extractframes died" # we want to avoid this \/ # shellcheck disable=SC2086 "$MMSEQS" translatenucs "${TMP_PATH}/t_orfs_nucl" "${TMP_PATH}/t_orfs_aa" ${TRANSLATE_PAR} \ || fail "translatenucs died" - "$MMSEQS" rmdb "${TMP_PATH}/t_orfs_nucl" + "$MMSEQS" rmdb "${TMP_PATH}/t_orfs_nucl" ${VERBOSITY} fi else if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then @@ -67,8 +66,8 @@ if [ -n "$NO_TARGET_INDEX" ]; then || fail "extract target orfs step died" fi fi - TARGET="${TMP_PATH}/t_orfs_aa" - TARGET_ORF="${TMP_PATH}/t_orfs_aa" + TARGET="${TMP_PATH}/t_orfs_aa" + TARGET_ORF="${TMP_PATH}/t_orfs_aa" fi fi diff --git a/src/workflow/Search.cpp b/src/workflow/Search.cpp index e6f350797..136732e5a 100644 --- a/src/workflow/Search.cpp +++ b/src/workflow/Search.cpp @@ -550,8 +550,7 @@ int search(int argc, const char **argv, const Command& command) { cmd.addVariable("ORF_SKIP", "TRUE"); cmd.addVariable("TRANSLATE_PAR", par.createParameterString(par.translatenucs).c_str()); cmd.addVariable("EXTRACT_FRAMES_PAR", par.createParameterString(par.extractframes).c_str()); - } - else{ + } else { cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str()); } program = std::string(tmpDir + "/translated_search.sh"); From 88b367051481d55dd265a11403eca34a2c21dbe0 Mon Sep 17 00:00:00 2001 From: LunaJang Date: Fri, 6 Sep 2024 15:10:59 +0900 Subject: [PATCH 07/10] add translation in extract frames --- data/workflow/translated_search.sh | 14 +-- src/commons/Parameters.cpp | 4 +- src/util/extractframes.cpp | 163 +++++++++++++++++++++++++++-- src/workflow/Search.cpp | 4 - 4 files changed, 158 insertions(+), 27 deletions(-) diff --git a/data/workflow/translated_search.sh b/data/workflow/translated_search.sh index d2f942d00..5b8266042 100755 --- a/data/workflow/translated_search.sh +++ b/data/workflow/translated_search.sh @@ -24,13 +24,8 @@ if [ -n "$QUERY_NUCL" ]; then if [ "$ORF_SKIP" = "TRUE" ]; then if notExists "${TMP_PATH}/q_orfs_aa.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" extractframes "$1" "${TMP_PATH}/q_orfs_nucl" ${EXTRACT_FRAMES_PAR} \ + "$MMSEQS" extractframes "$1" "${TMP_PATH}/q_orfs_aa" ${EXTRACT_FRAMES_PAR} \ || fail "extractframes died" - # we want to avoid this \/ - # shellcheck disable=SC2086 - "$MMSEQS" translatenucs "${TMP_PATH}/q_orfs_nucl" "${TMP_PATH}/q_orfs_aa" ${TRANSLATE_PAR} \ - || fail "translatenucs died" - "$MMSEQS" rmdb "${TMP_PATH}/q_orfs_nucl" ${VERBOSITY} fi else if notExists "${TMP_PATH}/q_orfs_aa.dbtype"; then @@ -50,13 +45,8 @@ if [ -n "$NO_TARGET_INDEX" ]; then if [ "$ORF_SKIP" ]; then if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" extractframes "$2" "${TMP_PATH}/t_orfs_nucl" ${EXTRACT_FRAMES_PAR} \ + "$MMSEQS" extractframes "$2" "${TMP_PATH}/t_orfs_aa" ${EXTRACT_FRAMES_PAR} \ || fail "extractframes died" - # we want to avoid this \/ - # shellcheck disable=SC2086 - "$MMSEQS" translatenucs "${TMP_PATH}/t_orfs_nucl" "${TMP_PATH}/t_orfs_aa" ${TRANSLATE_PAR} \ - || fail "translatenucs died" - "$MMSEQS" rmdb "${TMP_PATH}/t_orfs_nucl" ${VERBOSITY} fi else if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index f78711964..fc6cf75f5 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -728,7 +728,8 @@ Parameters::Parameters(): // extract frames extractframes.push_back(&PARAM_ORF_FORWARD_FRAMES); - extractframes.push_back(&PARAM_ORF_REVERSE_FRAMES); + extractframes.push_back(&PARAM_ORF_REVERSE_FRAMES); + extractframes.push_back(&PARAM_TRANSLATION_TABLE); extractframes.push_back(&PARAM_CREATE_LOOKUP); extractframes.push_back(&PARAM_THREADS); extractframes.push_back(&PARAM_COMPRESSED); @@ -1253,7 +1254,6 @@ Parameters::Parameters(): searchworkflow = combineList(searchworkflow, result2profile); searchworkflow = combineList(searchworkflow, extractorfs); searchworkflow = combineList(searchworkflow, extractframes); - searchworkflow = combineList(searchworkflow, translatenucs); searchworkflow = combineList(searchworkflow, splitsequence); searchworkflow = combineList(searchworkflow, offsetalignment); // needed for slice search, however all its parameters are already present in searchworkflow diff --git a/src/util/extractframes.cpp b/src/util/extractframes.cpp index 5b530ee8e..b9a0d04ff 100644 --- a/src/util/extractframes.cpp +++ b/src/util/extractframes.cpp @@ -4,6 +4,7 @@ #include "DBWriter.h" #include "Matcher.h" #include "Util.h" +#include "TranslateNucl.h" #include "itoa.h" #include "Orf.h" @@ -23,7 +24,11 @@ int extractframes(int argc, const char **argv, const Command& command) { DBReader reader(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); reader.open(DBReader::NOSORT); - DBWriter sequenceWriter(par.db2.c_str(), par.db2Index.c_str(), par.threads, par.compressed, reader.getDbtype()); + int outputDbtype = reader.getDbtype(); + if(par.translate) { + outputDbtype = Parameters::DBTYPE_AMINO_ACIDS; + } + DBWriter sequenceWriter(par.db2.c_str(), par.db2Index.c_str(), par.threads, par.compressed, outputDbtype); sequenceWriter.open(); DBWriter headerWriter(par.hdr2.c_str(), par.hdr2Index.c_str(), par.threads, false, Parameters::DBTYPE_GENERIC_DB); @@ -33,6 +38,7 @@ int extractframes(int argc, const char **argv, const Command& command) { unsigned int reverseFrames = Orf::getFrames(par.reverseFrames); Debug::Progress progress(reader.getSize()); + TranslateNucl translateNucl(static_cast(par.translationTable)); #pragma omp parallel { int thread_idx = 0; @@ -46,6 +52,7 @@ int extractframes(int argc, const char **argv, const Command& command) { queryFrom = 0; } + char* aa = new char[par.maxSeqLen + 3 + 1]; char buffer[1024]; std::string reverseComplementStr; reverseComplementStr.reserve(32000); @@ -58,22 +65,91 @@ int extractframes(int argc, const char **argv, const Command& command) { size_t bufferLen; if (forwardFrames & Orf::FRAME_1) { - // -1 to ignore the null byte copy the new line - sequenceWriter.writeData(data, dataLength - 1, key, thread_idx); + size_t cur_dataLength = dataLength-1; + const char* cur_data = data; + + sequenceWriter.writeStart(thread_idx); + if(par.translate){ + if ((cur_data[cur_dataLength] != '\n' && cur_dataLength % 3 != 0) && (cur_data[cur_dataLength - 1] == '\n' && (cur_dataLength - 1) % 3 != 0)) { + cur_dataLength = cur_dataLength - (cur_dataLength % 3); + } + + if (cur_dataLength < 3) { + continue; + } + + if (cur_dataLength > (3 * par.maxSeqLen)) { + cur_dataLength = (3 * par.maxSeqLen); + } + translateNucl.translate(aa, cur_data, cur_dataLength); + sequenceWriter.writeAdd(aa, (cur_dataLength / 3), thread_idx); + + }else{ + sequenceWriter.writeAdd(cur_data, cur_dataLength, thread_idx); + } + sequenceWriter.writeEnd(key, thread_idx); + bufferLen = Orf::writeOrfHeader(buffer, key, static_cast(0), dataLength - 3, 0, 0); headerWriter.writeData(buffer, bufferLen, key, thread_idx); } if (forwardFrames & Orf::FRAME_2) { - sequenceWriter.writeData(data + 1, dataLength - 2, key, thread_idx); + size_t cur_dataLength = dataLength - 2; + const char* cur_data = data + 1; + + sequenceWriter.writeStart(thread_idx); + if(par.translate){ + if ((cur_data[cur_dataLength] != '\n' && cur_dataLength % 3 != 0) && (cur_data[cur_dataLength - 1] == '\n' && (cur_dataLength - 1) % 3 != 0)) { + cur_dataLength = cur_dataLength - (cur_dataLength % 3); + } + + if (cur_dataLength < 3) { + continue; + } + + if (cur_dataLength > (3 * par.maxSeqLen)) { + cur_dataLength = (3 * par.maxSeqLen); + } + translateNucl.translate(aa, cur_data, cur_dataLength); + sequenceWriter.writeAdd(aa, (cur_dataLength / 3), thread_idx); + + }else{ + sequenceWriter.writeAdd(cur_data, cur_dataLength, thread_idx); + } + sequenceWriter.writeEnd(key, thread_idx); + bufferLen = Orf::writeOrfHeader(buffer, key, static_cast(1), dataLength - 4, 0, 0); headerWriter.writeData(buffer, bufferLen, key, thread_idx); } if (forwardFrames & Orf::FRAME_3) { - sequenceWriter.writeData(data + 2, dataLength - 3, key, thread_idx); + size_t cur_dataLength = dataLength - 3; + const char* cur_data = data + 2; + + sequenceWriter.writeStart(thread_idx); + if(par.translate){ + if ((cur_data[cur_dataLength] != '\n' && cur_dataLength % 3 != 0) && (cur_data[cur_dataLength - 1] == '\n' && (cur_dataLength - 1) % 3 != 0)) { + cur_dataLength = cur_dataLength - (cur_dataLength % 3); + } + + if (cur_dataLength < 3) { + continue; + } + + if (cur_dataLength > (3 * par.maxSeqLen)) { + cur_dataLength = (3 * par.maxSeqLen); + } + translateNucl.translate(aa, cur_data, cur_dataLength); + sequenceWriter.writeAdd(aa, (cur_dataLength / 3), thread_idx); + + }else{ + sequenceWriter.writeAdd(cur_data, cur_dataLength, thread_idx); + } + sequenceWriter.writeEnd(key, thread_idx); + bufferLen = Orf::writeOrfHeader(buffer, key, static_cast(2), dataLength - 5, 0, 0); headerWriter.writeData(buffer, bufferLen, key, thread_idx); } + if(reverseFrames != 0){ size_t sequenceLength = dataLength -2; // bool hasWrongChar = false; @@ -90,25 +166,94 @@ int extractframes(int argc, const char **argv, const Command& command) { } if (reverseFrames & Orf::FRAME_1) { - sequenceWriter.writeData(reverseComplementStr.c_str(), reverseComplementStr.size(), key, thread_idx); + size_t cur_dataLength = reverseComplementStr.size(); + const char* cur_data = reverseComplementStr.c_str(); + + sequenceWriter.writeStart(thread_idx); + if(par.translate){ + if ((cur_data[cur_dataLength] != '\n' && cur_dataLength % 3 != 0) && (cur_data[cur_dataLength - 1] == '\n' && (cur_dataLength - 1) % 3 != 0)) { + cur_dataLength = cur_dataLength - (cur_dataLength % 3); + } + + if (cur_dataLength < 3) { + continue; + } + + if (cur_dataLength > (3 * par.maxSeqLen)) { + cur_dataLength = (3 * par.maxSeqLen); + } + translateNucl.translate(aa, cur_data, cur_dataLength); + sequenceWriter.writeAdd(aa, (cur_dataLength / 3), thread_idx); + + }else{ + sequenceWriter.writeAdd(cur_data, cur_dataLength, thread_idx); + } + sequenceWriter.writeEnd(key, thread_idx); + bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 2, static_cast(0), 0, 0); headerWriter.writeData(buffer, bufferLen, key, thread_idx); } if (reverseFrames & Orf::FRAME_2) { - sequenceWriter.writeData(reverseComplementStr.c_str()+1, reverseComplementStr.size()-1, key, thread_idx); + size_t cur_dataLength = reverseComplementStr.size() - 1; + const char* cur_data = reverseComplementStr.c_str() + 1; + + sequenceWriter.writeStart(thread_idx); + if(par.translate){ + if ((cur_data[cur_dataLength] != '\n' && cur_dataLength % 3 != 0) && (cur_data[cur_dataLength - 1] == '\n' && (cur_dataLength - 1) % 3 != 0)) { + cur_dataLength = cur_dataLength - (cur_dataLength % 3); + } + + if (cur_dataLength < 3) { + continue; + } + + if (cur_dataLength > (3 * par.maxSeqLen)) { + cur_dataLength = (3 * par.maxSeqLen); + } + translateNucl.translate(aa, cur_data, cur_dataLength); + sequenceWriter.writeAdd(aa, (cur_dataLength / 3), thread_idx); + + }else{ + sequenceWriter.writeAdd(cur_data, cur_dataLength, thread_idx); + } + sequenceWriter.writeEnd(key, thread_idx); + bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 3, static_cast(1), 0, 0); headerWriter.writeData(buffer, bufferLen, key, thread_idx); } if (reverseFrames & Orf::FRAME_3) { - sequenceWriter.writeData(reverseComplementStr.c_str()+2, reverseComplementStr.size()-2, key, thread_idx); + size_t cur_dataLength = reverseComplementStr.size() - 2; + const char* cur_data = reverseComplementStr.c_str() + 2; + + sequenceWriter.writeStart(thread_idx); + if(par.translate){ + if ((cur_data[cur_dataLength] != '\n' && cur_dataLength % 3 != 0) && (cur_data[cur_dataLength - 1] == '\n' && (cur_dataLength - 1) % 3 != 0)) { + cur_dataLength = cur_dataLength - (cur_dataLength % 3); + } + + if (cur_dataLength < 3) { + continue; + } + + if (cur_dataLength > (3 * par.maxSeqLen)) { + cur_dataLength = (3 * par.maxSeqLen); + } + translateNucl.translate(aa, cur_data, cur_dataLength); + sequenceWriter.writeAdd(aa, (cur_dataLength / 3), thread_idx); + + }else{ + sequenceWriter.writeAdd(cur_data, cur_dataLength, thread_idx); + } + sequenceWriter.writeEnd(key, thread_idx); + bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 4, static_cast(2), 0, 0); headerWriter.writeData(buffer, bufferLen, key, thread_idx); } - reverseComplementStr.clear(); } + delete[] aa; } headerWriter.close(true); sequenceWriter.close(true); diff --git a/src/workflow/Search.cpp b/src/workflow/Search.cpp index 136732e5a..17873f768 100644 --- a/src/workflow/Search.cpp +++ b/src/workflow/Search.cpp @@ -211,9 +211,6 @@ int search(int argc, const char **argv, const Command& command) { for (size_t i = 0; i < par.extractframes.size(); i++) { par.extractframes[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); } - for (size_t i = 0; i < par.translatenucs.size(); i++) { - par.translatenucs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); - } for (size_t i = 0; i < par.splitsequence.size(); i++) { par.splitsequence[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); } @@ -548,7 +545,6 @@ int search(int argc, const char **argv, const Command& command) { cmd.addVariable("SEARCH", program.c_str()); if (par.orfSkip) { cmd.addVariable("ORF_SKIP", "TRUE"); - cmd.addVariable("TRANSLATE_PAR", par.createParameterString(par.translatenucs).c_str()); cmd.addVariable("EXTRACT_FRAMES_PAR", par.createParameterString(par.extractframes).c_str()); } else { cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str()); From 94eae3aeb709ef62a59a9d72975d689e56b867be Mon Sep 17 00:00:00 2001 From: LunaJang Date: Fri, 6 Sep 2024 15:35:19 +0900 Subject: [PATCH 08/10] add PARAM_TRANSLATE into extractframes --- src/commons/Parameters.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index fc6cf75f5..52638a180 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -730,6 +730,7 @@ Parameters::Parameters(): extractframes.push_back(&PARAM_ORF_FORWARD_FRAMES); extractframes.push_back(&PARAM_ORF_REVERSE_FRAMES); extractframes.push_back(&PARAM_TRANSLATION_TABLE); + extractframes.push_back(&PARAM_TRANSLATE); extractframes.push_back(&PARAM_CREATE_LOOKUP); extractframes.push_back(&PARAM_THREADS); extractframes.push_back(&PARAM_COMPRESSED); From 964bfaa2d46231dd19533271820fb0bf3a74c9f8 Mon Sep 17 00:00:00 2001 From: LunaJang Date: Mon, 9 Sep 2024 23:10:04 +0900 Subject: [PATCH 09/10] change extracframes to have same output with extractframes+tranlatenucs --- data/workflow/translated_search.sh | 2 +- src/util/extractframes.cpp | 255 ++++++++++++----------------- 2 files changed, 103 insertions(+), 154 deletions(-) diff --git a/data/workflow/translated_search.sh b/data/workflow/translated_search.sh index 5b8266042..7efdfcb50 100755 --- a/data/workflow/translated_search.sh +++ b/data/workflow/translated_search.sh @@ -42,7 +42,7 @@ TARGET="$2" TARGET_ORF="$2" if [ -n "$TARGET_NUCL" ]; then if [ -n "$NO_TARGET_INDEX" ]; then - if [ "$ORF_SKIP" ]; then + if [ "$ORF_SKIP" = "TRUE" ]; then if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then # shellcheck disable=SC2086 "$MMSEQS" extractframes "$2" "${TMP_PATH}/t_orfs_aa" ${EXTRACT_FRAMES_PAR} \ diff --git a/src/util/extractframes.cpp b/src/util/extractframes.cpp index b9a0d04ff..6f2760bf7 100644 --- a/src/util/extractframes.cpp +++ b/src/util/extractframes.cpp @@ -61,97 +61,70 @@ int extractframes(int argc, const char **argv, const Command& command) { unsigned int key = reader.getDbKey(i); const char* data = reader.getData(i, thread_idx); - size_t dataLength = reader.getEntryLen(i); + size_t seqLen = reader.getSeqLen(i); size_t bufferLen; if (forwardFrames & Orf::FRAME_1) { - size_t cur_dataLength = dataLength-1; - const char* cur_data = data; - - sequenceWriter.writeStart(thread_idx); - if(par.translate){ - if ((cur_data[cur_dataLength] != '\n' && cur_dataLength % 3 != 0) && (cur_data[cur_dataLength - 1] == '\n' && (cur_dataLength - 1) % 3 != 0)) { - cur_dataLength = cur_dataLength - (cur_dataLength % 3); - } - - if (cur_dataLength < 3) { - continue; - } - - if (cur_dataLength > (3 * par.maxSeqLen)) { - cur_dataLength = (3 * par.maxSeqLen); - } - translateNucl.translate(aa, cur_data, cur_dataLength); - sequenceWriter.writeAdd(aa, (cur_dataLength / 3), thread_idx); - - }else{ - sequenceWriter.writeAdd(cur_data, cur_dataLength, thread_idx); + if (par.translate) { + size_t currSeqLen = seqLen + 1; + if (currSeqLen >= 3) { + if (currSeqLen > (3 * par.maxSeqLen)) { + currSeqLen = (3 * par.maxSeqLen); + } + size_t condonLength = currSeqLen / 3 * 3; + translateNucl.translate(aa, data, condonLength); + sequenceWriter.writeData(aa, (condonLength / 3), key, thread_idx); + bufferLen = Orf::writeOrfHeader(buffer, key, static_cast(0), seqLen - 1, 0, 0); + headerWriter.writeData(buffer, bufferLen, key, thread_idx); + } + } else { + sequenceWriter.writeData(data, seqLen + 1, key, thread_idx); + bufferLen = Orf::writeOrfHeader(buffer, key, static_cast(0), seqLen - 1, 0, 0); + headerWriter.writeData(buffer, bufferLen, key, thread_idx); } - sequenceWriter.writeEnd(key, thread_idx); - - bufferLen = Orf::writeOrfHeader(buffer, key, static_cast(0), dataLength - 3, 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); } if (forwardFrames & Orf::FRAME_2) { - size_t cur_dataLength = dataLength - 2; - const char* cur_data = data + 1; - - sequenceWriter.writeStart(thread_idx); - if(par.translate){ - if ((cur_data[cur_dataLength] != '\n' && cur_dataLength % 3 != 0) && (cur_data[cur_dataLength - 1] == '\n' && (cur_dataLength - 1) % 3 != 0)) { - cur_dataLength = cur_dataLength - (cur_dataLength % 3); - } - - if (cur_dataLength < 3) { - continue; - } - - if (cur_dataLength > (3 * par.maxSeqLen)) { - cur_dataLength = (3 * par.maxSeqLen); - } - translateNucl.translate(aa, cur_data, cur_dataLength); - sequenceWriter.writeAdd(aa, (cur_dataLength / 3), thread_idx); - - }else{ - sequenceWriter.writeAdd(cur_data, cur_dataLength, thread_idx); + if (par.translate) { + size_t currSeqLen = seqLen; + if (currSeqLen >= 3) { + if (currSeqLen > (3 * par.maxSeqLen)) { + currSeqLen = (3 * par.maxSeqLen); + } + size_t condonLength = currSeqLen / 3 * 3; + translateNucl.translate(aa, data + 1, condonLength); + sequenceWriter.writeData(aa, (condonLength / 3), key, thread_idx); + bufferLen = Orf::writeOrfHeader(buffer, key, static_cast(1), seqLen - 2, 0, 0); + headerWriter.writeData(buffer, bufferLen, key, thread_idx); + } + } else { + sequenceWriter.writeData(data + 1, seqLen, key, thread_idx); + bufferLen = Orf::writeOrfHeader(buffer, key, static_cast(1), seqLen - 2, 0, 0); + headerWriter.writeData(buffer, bufferLen, key, thread_idx); } - sequenceWriter.writeEnd(key, thread_idx); - - bufferLen = Orf::writeOrfHeader(buffer, key, static_cast(1), dataLength - 4, 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); } if (forwardFrames & Orf::FRAME_3) { - size_t cur_dataLength = dataLength - 3; - const char* cur_data = data + 2; - - sequenceWriter.writeStart(thread_idx); - if(par.translate){ - if ((cur_data[cur_dataLength] != '\n' && cur_dataLength % 3 != 0) && (cur_data[cur_dataLength - 1] == '\n' && (cur_dataLength - 1) % 3 != 0)) { - cur_dataLength = cur_dataLength - (cur_dataLength % 3); - } - - if (cur_dataLength < 3) { - continue; - } - - if (cur_dataLength > (3 * par.maxSeqLen)) { - cur_dataLength = (3 * par.maxSeqLen); - } - translateNucl.translate(aa, cur_data, cur_dataLength); - sequenceWriter.writeAdd(aa, (cur_dataLength / 3), thread_idx); - - }else{ - sequenceWriter.writeAdd(cur_data, cur_dataLength, thread_idx); + if (par.translate) { + size_t currSeqLen = seqLen - 1; + if (currSeqLen >= 3) { + if (currSeqLen > (3 * par.maxSeqLen)) { + currSeqLen = (3 * par.maxSeqLen); + } + size_t condonLength = currSeqLen / 3 * 3; + translateNucl.translate(aa, data + 2, condonLength); + sequenceWriter.writeData(aa, (condonLength / 3), key, thread_idx); + bufferLen = Orf::writeOrfHeader(buffer, key, static_cast(2), seqLen - 3, 0, 0); + headerWriter.writeData(buffer, bufferLen, key, thread_idx); + } + } else { + sequenceWriter.writeData(data + 2, seqLen - 1, key, thread_idx); + bufferLen = Orf::writeOrfHeader(buffer, key, static_cast(2), seqLen - 3, 0, 0); + headerWriter.writeData(buffer, bufferLen, key, thread_idx); } - sequenceWriter.writeEnd(key, thread_idx); - - bufferLen = Orf::writeOrfHeader(buffer, key, static_cast(2), dataLength - 5, 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); } if(reverseFrames != 0){ - size_t sequenceLength = dataLength -2; + size_t sequenceLength = seqLen; // bool hasWrongChar = false; for(size_t pos = 0; pos < sequenceLength; ++pos) { char reverseComplement = Orf::complement(data[sequenceLength - pos - 1]); @@ -163,93 +136,69 @@ int extractframes(int argc, const char **argv, const Command& command) { // continue; // } reverseComplementStr.push_back('\n'); + + seqLen = reverseComplementStr.size(); + data = reverseComplementStr.c_str(); } if (reverseFrames & Orf::FRAME_1) { - size_t cur_dataLength = reverseComplementStr.size(); - const char* cur_data = reverseComplementStr.c_str(); - - sequenceWriter.writeStart(thread_idx); - if(par.translate){ - if ((cur_data[cur_dataLength] != '\n' && cur_dataLength % 3 != 0) && (cur_data[cur_dataLength - 1] == '\n' && (cur_dataLength - 1) % 3 != 0)) { - cur_dataLength = cur_dataLength - (cur_dataLength % 3); - } - - if (cur_dataLength < 3) { - continue; - } - - if (cur_dataLength > (3 * par.maxSeqLen)) { - cur_dataLength = (3 * par.maxSeqLen); - } - translateNucl.translate(aa, cur_data, cur_dataLength); - sequenceWriter.writeAdd(aa, (cur_dataLength / 3), thread_idx); - - }else{ - sequenceWriter.writeAdd(cur_data, cur_dataLength, thread_idx); + if (par.translate) { + size_t currSeqLen = seqLen; + if (currSeqLen >= 3) { + if (currSeqLen > (3 * par.maxSeqLen)) { + currSeqLen = (3 * par.maxSeqLen); + } + size_t condonLength = currSeqLen / 3 * 3; + translateNucl.translate(aa, data, condonLength); + sequenceWriter.writeData(aa, (condonLength / 3), key, thread_idx); + bufferLen = Orf::writeOrfHeader(buffer, key, seqLen - 2, static_cast(0), 0, 0); + headerWriter.writeData(buffer, bufferLen, key, thread_idx); + } + } else { + sequenceWriter.writeData(data, seqLen, key, thread_idx); + bufferLen = Orf::writeOrfHeader(buffer, key, seqLen - 2, static_cast(0), 0, 0); + headerWriter.writeData(buffer, bufferLen, key, thread_idx); } - sequenceWriter.writeEnd(key, thread_idx); - - bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 2, static_cast(0), 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); } if (reverseFrames & Orf::FRAME_2) { - size_t cur_dataLength = reverseComplementStr.size() - 1; - const char* cur_data = reverseComplementStr.c_str() + 1; - - sequenceWriter.writeStart(thread_idx); - if(par.translate){ - if ((cur_data[cur_dataLength] != '\n' && cur_dataLength % 3 != 0) && (cur_data[cur_dataLength - 1] == '\n' && (cur_dataLength - 1) % 3 != 0)) { - cur_dataLength = cur_dataLength - (cur_dataLength % 3); - } - - if (cur_dataLength < 3) { - continue; - } - - if (cur_dataLength > (3 * par.maxSeqLen)) { - cur_dataLength = (3 * par.maxSeqLen); - } - translateNucl.translate(aa, cur_data, cur_dataLength); - sequenceWriter.writeAdd(aa, (cur_dataLength / 3), thread_idx); - - }else{ - sequenceWriter.writeAdd(cur_data, cur_dataLength, thread_idx); + if (par.translate) { + size_t currSeqLen = seqLen - 1; + if (currSeqLen >= 3) { + if (currSeqLen > (3 * par.maxSeqLen)) { + currSeqLen = (3 * par.maxSeqLen); + } + size_t condonLength = currSeqLen / 3 * 3; + translateNucl.translate(aa, data + 1, condonLength); + sequenceWriter.writeData(aa, (condonLength / 3), key, thread_idx); + bufferLen = Orf::writeOrfHeader(buffer, key, seqLen - 3, static_cast(1), 0, 0); + headerWriter.writeData(buffer, bufferLen, key, thread_idx); + } + } else { + sequenceWriter.writeData(data + 1, seqLen - 1, key, thread_idx); + bufferLen = Orf::writeOrfHeader(buffer, key, seqLen - 3, static_cast(1), 0, 0); + headerWriter.writeData(buffer, bufferLen, key, thread_idx); } - sequenceWriter.writeEnd(key, thread_idx); - - bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 3, static_cast(1), 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); } if (reverseFrames & Orf::FRAME_3) { - size_t cur_dataLength = reverseComplementStr.size() - 2; - const char* cur_data = reverseComplementStr.c_str() + 2; - - sequenceWriter.writeStart(thread_idx); - if(par.translate){ - if ((cur_data[cur_dataLength] != '\n' && cur_dataLength % 3 != 0) && (cur_data[cur_dataLength - 1] == '\n' && (cur_dataLength - 1) % 3 != 0)) { - cur_dataLength = cur_dataLength - (cur_dataLength % 3); - } - - if (cur_dataLength < 3) { - continue; - } - - if (cur_dataLength > (3 * par.maxSeqLen)) { - cur_dataLength = (3 * par.maxSeqLen); - } - translateNucl.translate(aa, cur_data, cur_dataLength); - sequenceWriter.writeAdd(aa, (cur_dataLength / 3), thread_idx); - - }else{ - sequenceWriter.writeAdd(cur_data, cur_dataLength, thread_idx); - } - sequenceWriter.writeEnd(key, thread_idx); - - bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 4, static_cast(2), 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); + if (par.translate) { + size_t currSeqLen = seqLen - 2; + if (currSeqLen >= 3) { + if (currSeqLen > (3 * par.maxSeqLen)) { + currSeqLen = (3 * par.maxSeqLen); + } + size_t condonLength = currSeqLen / 3 * 3; + translateNucl.translate(aa, data + 2, condonLength); + sequenceWriter.writeData(aa, (condonLength / 3), key, thread_idx); + bufferLen = Orf::writeOrfHeader(buffer, key, seqLen - 4, static_cast(2), 0, 0); + headerWriter.writeData(buffer, bufferLen, key, thread_idx); + } + } else { + sequenceWriter.writeData(data + 2, seqLen - 2, key, thread_idx); + bufferLen = Orf::writeOrfHeader(buffer, key, seqLen - 4, static_cast(2), 0, 0); + headerWriter.writeData(buffer, bufferLen, key, thread_idx); + } } reverseComplementStr.clear(); } From a804518bb2d09bae74882771267d582bc7a353d5 Mon Sep 17 00:00:00 2001 From: Milot Mirdita Date: Mon, 30 Sep 2024 16:57:44 +0900 Subject: [PATCH 10/10] make full frame mode available through --translation-mode --- data/workflow/createindex.sh | 14 ++- data/workflow/translated_search.sh | 4 +- src/commons/Parameters.cpp | 9 +- src/commons/Parameters.h | 8 +- src/util/extractframes.cpp | 178 ++++++++++------------------- src/workflow/CreateIndex.cpp | 8 +- src/workflow/Search.cpp | 10 +- 7 files changed, 94 insertions(+), 137 deletions(-) diff --git a/data/workflow/createindex.sh b/data/workflow/createindex.sh index d75127381..2b02b1174 100755 --- a/data/workflow/createindex.sh +++ b/data/workflow/createindex.sh @@ -18,9 +18,15 @@ INPUT="$1" if [ -n "$TRANSLATED" ]; then # 1. extract orf if notExists "$2/orfs_aa.dbtype"; then - # shellcheck disable=SC2086 - "$MMSEQS" extractorfs "$INPUT" "$2/orfs_aa" ${ORF_PAR} \ - || fail "extractorfs died" + if [ -n "$ORF_SKIP" ]; then + # shellcheck disable=SC2086 + "$MMSEQS" extractframes "$INPUT" "$2/orfs_aa" ${EXTRACT_FRAMES_PAR} \ + || fail "extractframes died" + else + # shellcheck disable=SC2086 + "$MMSEQS" extractorfs "$INPUT" "$2/orfs_aa" ${ORF_PAR} \ + || fail "extractorfs died" + fi fi # shellcheck disable=SC2086 @@ -33,7 +39,7 @@ if [ -n "$TRANSLATED" ]; then rm -f "$2/createindex.sh" fi elif [ -n "$LIN_NUCL" ] || [ -n "$NUCL" ]; then - # 1. extract orf + # 1. extract orf if notExists "$2/nucl_split_seq.dbtype"; then # shellcheck disable=SC2086 "$MMSEQS" splitsequence "$INPUT" "$2/nucl_split_seq" ${SPLIT_SEQ_PAR} \ diff --git a/data/workflow/translated_search.sh b/data/workflow/translated_search.sh index 7efdfcb50..f39c5978f 100755 --- a/data/workflow/translated_search.sh +++ b/data/workflow/translated_search.sh @@ -21,7 +21,7 @@ TMP_PATH="$4" QUERY="$1" QUERY_ORF="$1" if [ -n "$QUERY_NUCL" ]; then - if [ "$ORF_SKIP" = "TRUE" ]; then + if [ -n "$ORF_SKIP" ]; then if notExists "${TMP_PATH}/q_orfs_aa.dbtype"; then # shellcheck disable=SC2086 "$MMSEQS" extractframes "$1" "${TMP_PATH}/q_orfs_aa" ${EXTRACT_FRAMES_PAR} \ @@ -42,7 +42,7 @@ TARGET="$2" TARGET_ORF="$2" if [ -n "$TARGET_NUCL" ]; then if [ -n "$NO_TARGET_INDEX" ]; then - if [ "$ORF_SKIP" = "TRUE" ]; then + if [ -n "$ORF_SKIP" ]; then if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then # shellcheck disable=SC2086 "$MMSEQS" extractframes "$2" "${TMP_PATH}/t_orfs_aa" ${EXTRACT_FRAMES_PAR} \ diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index 52638a180..b8157625d 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -172,7 +172,7 @@ Parameters::Parameters(): PARAM_ORF_FILTER_S(PARAM_ORF_FILTER_S_ID, "--orf-filter-s", "ORF filter sensitivity", "Sensitivity used for query ORF prefiltering", typeid(float), (void *) &orfFilterSens, "^[0-9]*(\\.[0-9]+)?$"), PARAM_ORF_FILTER_E(PARAM_ORF_FILTER_E_ID, "--orf-filter-e", "ORF filter e-value", "E-value threshold used for query ORF prefiltering", typeid(double), (void *) &orfFilterEval, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|[0-9]*(\\.[0-9]+)?$"), PARAM_LCA_SEARCH(PARAM_LCA_SEARCH_ID, "--lca-search", "LCA search mode", "Efficient search for LCA candidates", typeid(bool), (void *) &lcaSearch, "", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT), - PARAM_ORF_SKIP(PARAM_ORF_SKIP_ID, "--orf-skip", "ORF skip mode", "???", typeid(bool), (void *) &orfSkip, ""), + PARAM_TRANSLATION_MODE(PARAM_TRANSLATION_MODE_ID, "--translation-mode", "Translation mode", "Translation AA seq from nucleotide by 0: ORFs, 1: full reading frames", typeid(int), (void *) &translationMode, "^[0-1]{1}$"), // easysearch PARAM_GREEDY_BEST_HITS(PARAM_GREEDY_BEST_HITS_ID, "--greedy-best-hits", "Greedy best hits", "Choose the best hits greedily to cover the query", typeid(bool), (void *) &greedyBestHits, ""), // extractorfs @@ -1271,7 +1271,7 @@ Parameters::Parameters(): searchworkflow.push_back(&PARAM_RUNNER); searchworkflow.push_back(&PARAM_REUSELATEST); searchworkflow.push_back(&PARAM_REMOVE_TMP_FILES); - searchworkflow.push_back(&PARAM_ORF_SKIP); + searchworkflow.push_back(&PARAM_TRANSLATION_MODE); linsearchworkflow = combineList(align, kmersearch); linsearchworkflow = combineList(linsearchworkflow, swapresult); @@ -1294,8 +1294,9 @@ Parameters::Parameters(): // createindex workflow createindex = combineList(indexdb, extractorfs); - createindex = combineList(createindex, translatenucs); + createindex = combineList(createindex, extractframes); createindex = combineList(createindex, splitsequence); + createindex.push_back(&PARAM_TRANSLATION_MODE); createindex.push_back(&PARAM_STRAND); createindex.push_back(&PARAM_REMOVE_TMP_FILES); @@ -2281,7 +2282,7 @@ void Parameters::setDefaults() { orfFilterSens = 2.0; orfFilterEval = 100; lcaSearch = false; - orfSkip = false; + translationMode = PARAM_TRANSLATION_MODE_ORF; greedyBestHits = false; diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h index d8e06eabd..4bfb0bafe 100644 --- a/src/commons/Parameters.h +++ b/src/commons/Parameters.h @@ -321,6 +321,10 @@ class Parameters { static const int PARAM_RESULT_DIRECTION_QUERY = 0; static const int PARAM_RESULT_DIRECTION_TARGET = 1; + // translation mode + static const int PARAM_TRANSLATION_MODE_ORF = 0; + static const int PARAM_TRANSLATION_MODE_FRAME = 1; + // path to databases std::string db1; std::string db1Index; @@ -467,7 +471,7 @@ class Parameters { float orfFilterSens; double orfFilterEval; bool lcaSearch; - bool orfSkip; + int translationMode; // easysearch bool greedyBestHits; @@ -887,7 +891,7 @@ class Parameters { PARAMETER(PARAM_ORF_FILTER_S) PARAMETER(PARAM_ORF_FILTER_E) PARAMETER(PARAM_LCA_SEARCH) - PARAMETER(PARAM_ORF_SKIP) + PARAMETER(PARAM_TRANSLATION_MODE) // easysearch PARAMETER(PARAM_GREEDY_BEST_HITS) diff --git a/src/util/extractframes.cpp b/src/util/extractframes.cpp index 6f2760bf7..0196acb5a 100644 --- a/src/util/extractframes.cpp +++ b/src/util/extractframes.cpp @@ -5,8 +5,6 @@ #include "Matcher.h" #include "Util.h" #include "TranslateNucl.h" -#include "itoa.h" - #include "Orf.h" #include @@ -17,6 +15,41 @@ #include #endif +void handleSingleFrame(TranslateNucl& translateNucl, DBWriter& sequenceWriter, DBWriter& headerWriter, unsigned int key, char* headerBuffer, const char* data, size_t seqLen, int frame, bool reverse, bool translate, char*& aaBuffer, size_t& aaBufferSize, int thread_idx) { + data = data + frame; + seqLen = seqLen - frame; + if (translate == true) { + if (seqLen < 3) { + return; + } + size_t codonLength = (seqLen / 3) * 3; + if ((codonLength + 1) > aaBufferSize) { + aaBufferSize = codonLength * 1.5 + 1; + aaBuffer = (char*)realloc(aaBuffer, aaBufferSize * sizeof(char)); + } + translateNucl.translate(aaBuffer, data, codonLength); + aaBuffer[codonLength / 3] = '\n'; + sequenceWriter.writeData(aaBuffer, (codonLength / 3) + 1, key, thread_idx); + size_t bufferLen; + if (reverse) { + bufferLen = Orf::writeOrfHeader(headerBuffer, key, frame + codonLength, static_cast(frame), 0, 0); + } else { + bufferLen = Orf::writeOrfHeader(headerBuffer, key, static_cast(frame), frame + codonLength, 0, 0); + } + headerWriter.writeData(headerBuffer, bufferLen, key, thread_idx); + } else { + // +1: add newline, but remove it from the end pos + sequenceWriter.writeData(data, seqLen + 1, key, thread_idx); + size_t bufferLen; + if (reverse) { + bufferLen = Orf::writeOrfHeader(headerBuffer, key, seqLen - 1, static_cast(frame), 0, 0); + } else { + bufferLen = Orf::writeOrfHeader(headerBuffer, key, static_cast(frame), seqLen - 1, 0, 0); + } + headerWriter.writeData(headerBuffer, bufferLen, key, thread_idx); + } +} + int extractframes(int argc, const char **argv, const Command& command) { Parameters& par = Parameters::getInstance(); par.parseParameters(argc, argv, command, true, 0, 0); @@ -25,7 +58,7 @@ int extractframes(int argc, const char **argv, const Command& command) { reader.open(DBReader::NOSORT); int outputDbtype = reader.getDbtype(); - if(par.translate) { + if (par.translate) { outputDbtype = Parameters::DBTYPE_AMINO_ACIDS; } DBWriter sequenceWriter(par.db2.c_str(), par.db2Index.c_str(), par.threads, par.compressed, outputDbtype); @@ -52,10 +85,17 @@ int extractframes(int argc, const char **argv, const Command& command) { queryFrom = 0; } - char* aa = new char[par.maxSeqLen + 3 + 1]; + size_t aaBufferSize = par.maxSeqLen + 3 + 1; + char* aa = NULL; + if (par.translate == true) { + aa = (char*)malloc(aaBufferSize * sizeof(char)); + } + char buffer[1024]; + std::string reverseComplementStr; reverseComplementStr.reserve(32000); + for (unsigned int i = queryFrom; i < (queryFrom + querySize); ++i){ progress.updateProgress(); @@ -63,146 +103,48 @@ int extractframes(int argc, const char **argv, const Command& command) { const char* data = reader.getData(i, thread_idx); size_t seqLen = reader.getSeqLen(i); - size_t bufferLen; if (forwardFrames & Orf::FRAME_1) { - if (par.translate) { - size_t currSeqLen = seqLen + 1; - if (currSeqLen >= 3) { - if (currSeqLen > (3 * par.maxSeqLen)) { - currSeqLen = (3 * par.maxSeqLen); - } - size_t condonLength = currSeqLen / 3 * 3; - translateNucl.translate(aa, data, condonLength); - sequenceWriter.writeData(aa, (condonLength / 3), key, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, key, static_cast(0), seqLen - 1, 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); - } - } else { - sequenceWriter.writeData(data, seqLen + 1, key, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, key, static_cast(0), seqLen - 1, 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); - } + handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 0, false, par.translate, aa, aaBufferSize, thread_idx); } if (forwardFrames & Orf::FRAME_2) { - if (par.translate) { - size_t currSeqLen = seqLen; - if (currSeqLen >= 3) { - if (currSeqLen > (3 * par.maxSeqLen)) { - currSeqLen = (3 * par.maxSeqLen); - } - size_t condonLength = currSeqLen / 3 * 3; - translateNucl.translate(aa, data + 1, condonLength); - sequenceWriter.writeData(aa, (condonLength / 3), key, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, key, static_cast(1), seqLen - 2, 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); - } - } else { - sequenceWriter.writeData(data + 1, seqLen, key, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, key, static_cast(1), seqLen - 2, 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); - } + handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 1, false, par.translate, aa, aaBufferSize, thread_idx); } if (forwardFrames & Orf::FRAME_3) { - if (par.translate) { - size_t currSeqLen = seqLen - 1; - if (currSeqLen >= 3) { - if (currSeqLen > (3 * par.maxSeqLen)) { - currSeqLen = (3 * par.maxSeqLen); - } - size_t condonLength = currSeqLen / 3 * 3; - translateNucl.translate(aa, data + 2, condonLength); - sequenceWriter.writeData(aa, (condonLength / 3), key, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, key, static_cast(2), seqLen - 3, 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); - } - } else { - sequenceWriter.writeData(data + 2, seqLen - 1, key, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, key, static_cast(2), seqLen - 3, 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); - } + handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 2, false, par.translate, aa, aaBufferSize, thread_idx); } - - if(reverseFrames != 0){ - size_t sequenceLength = seqLen; + if (reverseFrames != 0) { // bool hasWrongChar = false; - for(size_t pos = 0; pos < sequenceLength; ++pos) { - char reverseComplement = Orf::complement(data[sequenceLength - pos - 1]); + for (size_t pos = 0; pos < seqLen; ++pos) { + char reverseComplement = Orf::complement(data[seqLen - pos - 1]); reverseComplement = (reverseComplement == '.') ? 'N' : reverseComplement; reverseComplementStr.push_back(reverseComplement); // hasWrongChar |= (reverseComplement == '.'); } -// if(hasWrongChar == true){ -// continue; -// } + // if (hasWrongChar == true) { + // continue; + // } reverseComplementStr.push_back('\n'); - - seqLen = reverseComplementStr.size(); + seqLen = reverseComplementStr.size() - 1; data = reverseComplementStr.c_str(); } if (reverseFrames & Orf::FRAME_1) { - if (par.translate) { - size_t currSeqLen = seqLen; - if (currSeqLen >= 3) { - if (currSeqLen > (3 * par.maxSeqLen)) { - currSeqLen = (3 * par.maxSeqLen); - } - size_t condonLength = currSeqLen / 3 * 3; - translateNucl.translate(aa, data, condonLength); - sequenceWriter.writeData(aa, (condonLength / 3), key, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, key, seqLen - 2, static_cast(0), 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); - } - } else { - sequenceWriter.writeData(data, seqLen, key, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, key, seqLen - 2, static_cast(0), 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); - } + handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 0, true, par.translate, aa, aaBufferSize, thread_idx); } if (reverseFrames & Orf::FRAME_2) { - if (par.translate) { - size_t currSeqLen = seqLen - 1; - if (currSeqLen >= 3) { - if (currSeqLen > (3 * par.maxSeqLen)) { - currSeqLen = (3 * par.maxSeqLen); - } - size_t condonLength = currSeqLen / 3 * 3; - translateNucl.translate(aa, data + 1, condonLength); - sequenceWriter.writeData(aa, (condonLength / 3), key, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, key, seqLen - 3, static_cast(1), 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); - } - } else { - sequenceWriter.writeData(data + 1, seqLen - 1, key, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, key, seqLen - 3, static_cast(1), 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); - } + handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 1, true, par.translate, aa, aaBufferSize, thread_idx); } if (reverseFrames & Orf::FRAME_3) { - if (par.translate) { - size_t currSeqLen = seqLen - 2; - if (currSeqLen >= 3) { - if (currSeqLen > (3 * par.maxSeqLen)) { - currSeqLen = (3 * par.maxSeqLen); - } - size_t condonLength = currSeqLen / 3 * 3; - translateNucl.translate(aa, data + 2, condonLength); - sequenceWriter.writeData(aa, (condonLength / 3), key, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, key, seqLen - 4, static_cast(2), 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); - } - } else { - sequenceWriter.writeData(data + 2, seqLen - 2, key, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, key, seqLen - 4, static_cast(2), 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); - } + handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 2, true, par.translate, aa, aaBufferSize, thread_idx); } reverseComplementStr.clear(); } - delete[] aa; + if (aa != NULL) { + free(aa); + } } headerWriter.close(true); sequenceWriter.close(true); diff --git a/src/workflow/CreateIndex.cpp b/src/workflow/CreateIndex.cpp index fc913f09b..58b7dd167 100644 --- a/src/workflow/CreateIndex.cpp +++ b/src/workflow/CreateIndex.cpp @@ -42,8 +42,12 @@ int createindex(Parameters &par, const Command &command, const std::string &inde cmd.addVariable("INDEXER", indexerModule.c_str()); cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); par.translate = 1; - cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str()); - cmd.addVariable("EXTRACT_FRAMES_PAR", par.createParameterString(par.extractframes).c_str()); + cmd.addVariable("ORF_SKIP", par.translationMode == Parameters::PARAM_TRANSLATION_MODE_FRAME ? "TRUE" : NULL); + if (par.translationMode == Parameters::PARAM_TRANSLATION_MODE_FRAME) { + cmd.addVariable("EXTRACT_FRAMES_PAR", par.createParameterString(par.extractframes).c_str()); + } else { + cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str()); + } cmd.addVariable("SPLIT_SEQ_PAR", par.createParameterString(par.splitsequence).c_str()); if(indexerModule == "kmerindexdb"){ cmd.addVariable("INDEX_PAR", par.createParameterString(par.kmerindexdb).c_str()); diff --git a/src/workflow/Search.cpp b/src/workflow/Search.cpp index 17873f768..5e6786d3c 100644 --- a/src/workflow/Search.cpp +++ b/src/workflow/Search.cpp @@ -534,22 +534,22 @@ int search(int argc, const char **argv, const Command& command) { if (searchMode & (Parameters::SEARCH_MODE_FLAG_QUERY_TRANSLATED|Parameters::SEARCH_MODE_FLAG_TARGET_TRANSLATED)) { cmd.addVariable("NO_TARGET_INDEX", (indexStr == "") ? "TRUE" : NULL); - FileUtil::writeFile(tmpDir + "/translated_search.sh", translated_search_sh, translated_search_sh_len); cmd.addVariable("QUERY_NUCL", (searchMode & Parameters::SEARCH_MODE_FLAG_QUERY_TRANSLATED) ? "TRUE" : NULL); - cmd.addVariable("TARGET_NUCL", (searchMode & Parameters::SEARCH_MODE_FLAG_TARGET_TRANSLATED) ? "TRUE" : NULL); + cmd.addVariable("TARGET_NUCL", (searchMode & Parameters::SEARCH_MODE_FLAG_TARGET_TRANSLATED) ? "TRUE" : NULL); cmd.addVariable("THREAD_COMP_PAR", par.createParameterString(par.threadsandcompression).c_str()); par.subDbMode = 1; cmd.addVariable("CREATESUBDB_PAR", par.createParameterString(par.createsubdb).c_str()); par.translate = 1; cmd.addVariable("OFFSETALIGNMENT_PAR", par.createParameterString(par.offsetalignment).c_str()); - cmd.addVariable("SEARCH", program.c_str()); - if (par.orfSkip) { - cmd.addVariable("ORF_SKIP", "TRUE"); + cmd.addVariable("ORF_SKIP", par.translationMode == Parameters::PARAM_TRANSLATION_MODE_FRAME ? "TRUE" : NULL); + if (par.translationMode == Parameters::PARAM_TRANSLATION_MODE_FRAME) { cmd.addVariable("EXTRACT_FRAMES_PAR", par.createParameterString(par.extractframes).c_str()); } else { cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str()); } + cmd.addVariable("SEARCH", program.c_str()); program = std::string(tmpDir + "/translated_search.sh"); + FileUtil::writeFile(program.c_str(), translated_search_sh, translated_search_sh_len); }else if(searchMode & Parameters::SEARCH_MODE_FLAG_QUERY_NUCLEOTIDE && searchMode & Parameters::SEARCH_MODE_FLAG_TARGET_NUCLEOTIDE){ FileUtil::writeFile(tmpDir + "/blastn.sh", blastn_sh, blastn_sh_len);