diff --git a/data/workflow/createindex.sh b/data/workflow/createindex.sh index d75127381..2b02b1174 100755 --- a/data/workflow/createindex.sh +++ b/data/workflow/createindex.sh @@ -18,9 +18,15 @@ INPUT="$1" if [ -n "$TRANSLATED" ]; then # 1. extract orf if notExists "$2/orfs_aa.dbtype"; then - # shellcheck disable=SC2086 - "$MMSEQS" extractorfs "$INPUT" "$2/orfs_aa" ${ORF_PAR} \ - || fail "extractorfs died" + if [ -n "$ORF_SKIP" ]; then + # shellcheck disable=SC2086 + "$MMSEQS" extractframes "$INPUT" "$2/orfs_aa" ${EXTRACT_FRAMES_PAR} \ + || fail "extractframes died" + else + # shellcheck disable=SC2086 + "$MMSEQS" extractorfs "$INPUT" "$2/orfs_aa" ${ORF_PAR} \ + || fail "extractorfs died" + fi fi # shellcheck disable=SC2086 @@ -33,7 +39,7 @@ if [ -n "$TRANSLATED" ]; then rm -f "$2/createindex.sh" fi elif [ -n "$LIN_NUCL" ] || [ -n "$NUCL" ]; then - # 1. extract orf + # 1. extract orf if notExists "$2/nucl_split_seq.dbtype"; then # shellcheck disable=SC2086 "$MMSEQS" splitsequence "$INPUT" "$2/nucl_split_seq" ${SPLIT_SEQ_PAR} \ diff --git a/data/workflow/translated_search.sh b/data/workflow/translated_search.sh index b2e4252d4..f39c5978f 100755 --- a/data/workflow/translated_search.sh +++ b/data/workflow/translated_search.sh @@ -21,10 +21,18 @@ TMP_PATH="$4" QUERY="$1" QUERY_ORF="$1" if [ -n "$QUERY_NUCL" ]; then - if notExists "${TMP_PATH}/q_orfs_aa.dbtype"; then - # shellcheck disable=SC2086 - "$MMSEQS" extractorfs "$1" "${TMP_PATH}/q_orfs_aa" ${ORF_PAR} \ - || fail "extract orfs step died" + if [ -n "$ORF_SKIP" ]; then + if notExists "${TMP_PATH}/q_orfs_aa.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" extractframes "$1" "${TMP_PATH}/q_orfs_aa" ${EXTRACT_FRAMES_PAR} \ + || fail "extractframes died" + fi + else + if notExists "${TMP_PATH}/q_orfs_aa.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" extractorfs "$1" "${TMP_PATH}/q_orfs_aa" ${ORF_PAR} \ + || fail "extract orfs step died" + fi fi QUERY="${TMP_PATH}/q_orfs_aa" QUERY_ORF="${TMP_PATH}/q_orfs_aa" @@ -34,10 +42,19 @@ TARGET="$2" TARGET_ORF="$2" if [ -n "$TARGET_NUCL" ]; then if [ -n "$NO_TARGET_INDEX" ]; then - if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then - # shellcheck disable=SC2086 - "$MMSEQS" extractorfs "$2" "${TMP_PATH}/t_orfs_aa" ${ORF_PAR} \ - || fail "extract target orfs step died" + if [ -n "$ORF_SKIP" ]; then + if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" extractframes "$2" "${TMP_PATH}/t_orfs_aa" ${EXTRACT_FRAMES_PAR} \ + || fail "extractframes died" + fi + else + if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then + # same here + # shellcheck disable=SC2086 + "$MMSEQS" extractorfs "$2" "${TMP_PATH}/t_orfs_aa" ${ORF_PAR} \ + || fail "extract target orfs step died" + fi fi TARGET="${TMP_PATH}/t_orfs_aa" TARGET_ORF="${TMP_PATH}/t_orfs_aa" diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index 34e429811..b8157625d 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -172,6 +172,7 @@ Parameters::Parameters(): PARAM_ORF_FILTER_S(PARAM_ORF_FILTER_S_ID, "--orf-filter-s", "ORF filter sensitivity", "Sensitivity used for query ORF prefiltering", typeid(float), (void *) &orfFilterSens, "^[0-9]*(\\.[0-9]+)?$"), PARAM_ORF_FILTER_E(PARAM_ORF_FILTER_E_ID, "--orf-filter-e", "ORF filter e-value", "E-value threshold used for query ORF prefiltering", typeid(double), (void *) &orfFilterEval, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|[0-9]*(\\.[0-9]+)?$"), PARAM_LCA_SEARCH(PARAM_LCA_SEARCH_ID, "--lca-search", "LCA search mode", "Efficient search for LCA candidates", typeid(bool), (void *) &lcaSearch, "", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT), + PARAM_TRANSLATION_MODE(PARAM_TRANSLATION_MODE_ID, "--translation-mode", "Translation mode", "Translation AA seq from nucleotide by 0: ORFs, 1: full reading frames", typeid(int), (void *) &translationMode, "^[0-1]{1}$"), // easysearch PARAM_GREEDY_BEST_HITS(PARAM_GREEDY_BEST_HITS_ID, "--greedy-best-hits", "Greedy best hits", "Choose the best hits greedily to cover the query", typeid(bool), (void *) &greedyBestHits, ""), // extractorfs @@ -727,7 +728,9 @@ Parameters::Parameters(): // extract frames extractframes.push_back(&PARAM_ORF_FORWARD_FRAMES); - extractframes.push_back(&PARAM_ORF_REVERSE_FRAMES); + extractframes.push_back(&PARAM_ORF_REVERSE_FRAMES); + extractframes.push_back(&PARAM_TRANSLATION_TABLE); + extractframes.push_back(&PARAM_TRANSLATE); extractframes.push_back(&PARAM_CREATE_LOOKUP); extractframes.push_back(&PARAM_THREADS); extractframes.push_back(&PARAM_COMPRESSED); @@ -1251,7 +1254,7 @@ Parameters::Parameters(): searchworkflow = combineList(searchworkflow, rescorediagonal); searchworkflow = combineList(searchworkflow, result2profile); searchworkflow = combineList(searchworkflow, extractorfs); - searchworkflow = combineList(searchworkflow, translatenucs); + searchworkflow = combineList(searchworkflow, extractframes); searchworkflow = combineList(searchworkflow, splitsequence); searchworkflow = combineList(searchworkflow, offsetalignment); // needed for slice search, however all its parameters are already present in searchworkflow @@ -1268,6 +1271,7 @@ Parameters::Parameters(): searchworkflow.push_back(&PARAM_RUNNER); searchworkflow.push_back(&PARAM_REUSELATEST); searchworkflow.push_back(&PARAM_REMOVE_TMP_FILES); + searchworkflow.push_back(&PARAM_TRANSLATION_MODE); linsearchworkflow = combineList(align, kmersearch); linsearchworkflow = combineList(linsearchworkflow, swapresult); @@ -1290,8 +1294,9 @@ Parameters::Parameters(): // createindex workflow createindex = combineList(indexdb, extractorfs); - createindex = combineList(createindex, translatenucs); + createindex = combineList(createindex, extractframes); createindex = combineList(createindex, splitsequence); + createindex.push_back(&PARAM_TRANSLATION_MODE); createindex.push_back(&PARAM_STRAND); createindex.push_back(&PARAM_REMOVE_TMP_FILES); @@ -2277,6 +2282,7 @@ void Parameters::setDefaults() { orfFilterSens = 2.0; orfFilterEval = 100; lcaSearch = false; + translationMode = PARAM_TRANSLATION_MODE_ORF; greedyBestHits = false; diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h index 745d2f809..4bfb0bafe 100644 --- a/src/commons/Parameters.h +++ b/src/commons/Parameters.h @@ -321,6 +321,10 @@ class Parameters { static const int PARAM_RESULT_DIRECTION_QUERY = 0; static const int PARAM_RESULT_DIRECTION_TARGET = 1; + // translation mode + static const int PARAM_TRANSLATION_MODE_ORF = 0; + static const int PARAM_TRANSLATION_MODE_FRAME = 1; + // path to databases std::string db1; std::string db1Index; @@ -467,6 +471,7 @@ class Parameters { float orfFilterSens; double orfFilterEval; bool lcaSearch; + int translationMode; // easysearch bool greedyBestHits; @@ -886,6 +891,7 @@ class Parameters { PARAMETER(PARAM_ORF_FILTER_S) PARAMETER(PARAM_ORF_FILTER_E) PARAMETER(PARAM_LCA_SEARCH) + PARAMETER(PARAM_TRANSLATION_MODE) // easysearch PARAMETER(PARAM_GREEDY_BEST_HITS) diff --git a/src/util/extractframes.cpp b/src/util/extractframes.cpp index 348d40bf7..0196acb5a 100644 --- a/src/util/extractframes.cpp +++ b/src/util/extractframes.cpp @@ -4,8 +4,7 @@ #include "DBWriter.h" #include "Matcher.h" #include "Util.h" -#include "itoa.h" - +#include "TranslateNucl.h" #include "Orf.h" #include @@ -16,6 +15,41 @@ #include #endif +void handleSingleFrame(TranslateNucl& translateNucl, DBWriter& sequenceWriter, DBWriter& headerWriter, unsigned int key, char* headerBuffer, const char* data, size_t seqLen, int frame, bool reverse, bool translate, char*& aaBuffer, size_t& aaBufferSize, int thread_idx) { + data = data + frame; + seqLen = seqLen - frame; + if (translate == true) { + if (seqLen < 3) { + return; + } + size_t codonLength = (seqLen / 3) * 3; + if ((codonLength + 1) > aaBufferSize) { + aaBufferSize = codonLength * 1.5 + 1; + aaBuffer = (char*)realloc(aaBuffer, aaBufferSize * sizeof(char)); + } + translateNucl.translate(aaBuffer, data, codonLength); + aaBuffer[codonLength / 3] = '\n'; + sequenceWriter.writeData(aaBuffer, (codonLength / 3) + 1, key, thread_idx); + size_t bufferLen; + if (reverse) { + bufferLen = Orf::writeOrfHeader(headerBuffer, key, frame + codonLength, static_cast(frame), 0, 0); + } else { + bufferLen = Orf::writeOrfHeader(headerBuffer, key, static_cast(frame), frame + codonLength, 0, 0); + } + headerWriter.writeData(headerBuffer, bufferLen, key, thread_idx); + } else { + // +1: add newline, but remove it from the end pos + sequenceWriter.writeData(data, seqLen + 1, key, thread_idx); + size_t bufferLen; + if (reverse) { + bufferLen = Orf::writeOrfHeader(headerBuffer, key, seqLen - 1, static_cast(frame), 0, 0); + } else { + bufferLen = Orf::writeOrfHeader(headerBuffer, key, static_cast(frame), seqLen - 1, 0, 0); + } + headerWriter.writeData(headerBuffer, bufferLen, key, thread_idx); + } +} + int extractframes(int argc, const char **argv, const Command& command) { Parameters& par = Parameters::getInstance(); par.parseParameters(argc, argv, command, true, 0, 0); @@ -23,7 +57,11 @@ int extractframes(int argc, const char **argv, const Command& command) { DBReader reader(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); reader.open(DBReader::NOSORT); - DBWriter sequenceWriter(par.db2.c_str(), par.db2Index.c_str(), par.threads, par.compressed, reader.getDbtype()); + int outputDbtype = reader.getDbtype(); + if (par.translate) { + outputDbtype = Parameters::DBTYPE_AMINO_ACIDS; + } + DBWriter sequenceWriter(par.db2.c_str(), par.db2Index.c_str(), par.threads, par.compressed, outputDbtype); sequenceWriter.open(); DBWriter headerWriter(par.hdr2.c_str(), par.hdr2Index.c_str(), par.threads, false, Parameters::DBTYPE_GENERIC_DB); @@ -31,8 +69,9 @@ int extractframes(int argc, const char **argv, const Command& command) { unsigned int forwardFrames = Orf::getFrames(par.forwardFrames); unsigned int reverseFrames = Orf::getFrames(par.reverseFrames); - Debug::Progress progress(reader.getSize()); + Debug::Progress progress(reader.getSize()); + TranslateNucl translateNucl(static_cast(par.translationTable)); #pragma omp parallel { int thread_idx = 0; @@ -46,70 +85,66 @@ int extractframes(int argc, const char **argv, const Command& command) { queryFrom = 0; } + size_t aaBufferSize = par.maxSeqLen + 3 + 1; + char* aa = NULL; + if (par.translate == true) { + aa = (char*)malloc(aaBufferSize * sizeof(char)); + } + char buffer[1024]; + std::string reverseComplementStr; reverseComplementStr.reserve(32000); + for (unsigned int i = queryFrom; i < (queryFrom + querySize); ++i){ progress.updateProgress(); unsigned int key = reader.getDbKey(i); const char* data = reader.getData(i, thread_idx); - size_t dataLength = reader.getEntryLen(i); - - size_t bufferLen; - switch (forwardFrames){ - case Orf::FRAME_1: - // -1 to ignore the null byte copy the new line - sequenceWriter.writeData(data, dataLength - 1, key, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, key, static_cast(0), dataLength - 3, 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); - break; - case Orf::FRAME_2: - sequenceWriter.writeData(data + 1, dataLength - 2, key, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, key, static_cast(1), dataLength - 4, 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); - break; - case Orf::FRAME_3: - sequenceWriter.writeData(data + 2, dataLength - 3, key, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, key, static_cast(2), dataLength - 5, 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); - break; + size_t seqLen = reader.getSeqLen(i); + + if (forwardFrames & Orf::FRAME_1) { + handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 0, false, par.translate, aa, aaBufferSize, thread_idx); + } + if (forwardFrames & Orf::FRAME_2) { + handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 1, false, par.translate, aa, aaBufferSize, thread_idx); + } + if (forwardFrames & Orf::FRAME_3) { + handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 2, false, par.translate, aa, aaBufferSize, thread_idx); } - if(reverseFrames != 0){ - size_t sequenceLength = dataLength -2; + if (reverseFrames != 0) { // bool hasWrongChar = false; - for(size_t pos = 0; pos < sequenceLength; ++pos) { - char reverseComplement = Orf::complement(data[sequenceLength - pos - 1]); + for (size_t pos = 0; pos < seqLen; ++pos) { + char reverseComplement = Orf::complement(data[seqLen - pos - 1]); reverseComplement = (reverseComplement == '.') ? 'N' : reverseComplement; reverseComplementStr.push_back(reverseComplement); // hasWrongChar |= (reverseComplement == '.'); } -// if(hasWrongChar == true){ -// continue; -// } + // if (hasWrongChar == true) { + // continue; + // } reverseComplementStr.push_back('\n'); + seqLen = reverseComplementStr.size() - 1; + data = reverseComplementStr.c_str(); } - switch (reverseFrames){ - case Orf::FRAME_1: - sequenceWriter.writeData(reverseComplementStr.c_str(), reverseComplementStr.size(), key, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 2, static_cast(0), 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); - break; - case Orf::FRAME_2: - sequenceWriter.writeData(reverseComplementStr.c_str()+1, reverseComplementStr.size()-1, key, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 3, static_cast(1), 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); - break; - case Orf::FRAME_3: - sequenceWriter.writeData(reverseComplementStr.c_str()+2, reverseComplementStr.size()-2, key, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 4, static_cast(2), 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); - break; + if (reverseFrames & Orf::FRAME_1) { + handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 0, true, par.translate, aa, aaBufferSize, thread_idx); + } + + if (reverseFrames & Orf::FRAME_2) { + handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 1, true, par.translate, aa, aaBufferSize, thread_idx); + } + + if (reverseFrames & Orf::FRAME_3) { + handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 2, true, par.translate, aa, aaBufferSize, thread_idx); } reverseComplementStr.clear(); } + if (aa != NULL) { + free(aa); + } } headerWriter.close(true); sequenceWriter.close(true); diff --git a/src/workflow/CreateIndex.cpp b/src/workflow/CreateIndex.cpp index fc913f09b..58b7dd167 100644 --- a/src/workflow/CreateIndex.cpp +++ b/src/workflow/CreateIndex.cpp @@ -42,8 +42,12 @@ int createindex(Parameters &par, const Command &command, const std::string &inde cmd.addVariable("INDEXER", indexerModule.c_str()); cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); par.translate = 1; - cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str()); - cmd.addVariable("EXTRACT_FRAMES_PAR", par.createParameterString(par.extractframes).c_str()); + cmd.addVariable("ORF_SKIP", par.translationMode == Parameters::PARAM_TRANSLATION_MODE_FRAME ? "TRUE" : NULL); + if (par.translationMode == Parameters::PARAM_TRANSLATION_MODE_FRAME) { + cmd.addVariable("EXTRACT_FRAMES_PAR", par.createParameterString(par.extractframes).c_str()); + } else { + cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str()); + } cmd.addVariable("SPLIT_SEQ_PAR", par.createParameterString(par.splitsequence).c_str()); if(indexerModule == "kmerindexdb"){ cmd.addVariable("INDEX_PAR", par.createParameterString(par.kmerindexdb).c_str()); diff --git a/src/workflow/Search.cpp b/src/workflow/Search.cpp index 62ec132e2..5e6786d3c 100644 --- a/src/workflow/Search.cpp +++ b/src/workflow/Search.cpp @@ -208,8 +208,8 @@ int search(int argc, const char **argv, const Command& command) { for (size_t i = 0; i < par.extractorfs.size(); i++) { par.extractorfs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); } - for (size_t i = 0; i < par.translatenucs.size(); i++) { - par.translatenucs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); + for (size_t i = 0; i < par.extractframes.size(); i++) { + par.extractframes[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); } for (size_t i = 0; i < par.splitsequence.size(); i++) { par.splitsequence[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); @@ -534,17 +534,22 @@ int search(int argc, const char **argv, const Command& command) { if (searchMode & (Parameters::SEARCH_MODE_FLAG_QUERY_TRANSLATED|Parameters::SEARCH_MODE_FLAG_TARGET_TRANSLATED)) { cmd.addVariable("NO_TARGET_INDEX", (indexStr == "") ? "TRUE" : NULL); - FileUtil::writeFile(tmpDir + "/translated_search.sh", translated_search_sh, translated_search_sh_len); cmd.addVariable("QUERY_NUCL", (searchMode & Parameters::SEARCH_MODE_FLAG_QUERY_TRANSLATED) ? "TRUE" : NULL); - cmd.addVariable("TARGET_NUCL", (searchMode & Parameters::SEARCH_MODE_FLAG_TARGET_TRANSLATED) ? "TRUE" : NULL); + cmd.addVariable("TARGET_NUCL", (searchMode & Parameters::SEARCH_MODE_FLAG_TARGET_TRANSLATED) ? "TRUE" : NULL); cmd.addVariable("THREAD_COMP_PAR", par.createParameterString(par.threadsandcompression).c_str()); par.subDbMode = 1; cmd.addVariable("CREATESUBDB_PAR", par.createParameterString(par.createsubdb).c_str()); par.translate = 1; - cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str()); cmd.addVariable("OFFSETALIGNMENT_PAR", par.createParameterString(par.offsetalignment).c_str()); + cmd.addVariable("ORF_SKIP", par.translationMode == Parameters::PARAM_TRANSLATION_MODE_FRAME ? "TRUE" : NULL); + if (par.translationMode == Parameters::PARAM_TRANSLATION_MODE_FRAME) { + cmd.addVariable("EXTRACT_FRAMES_PAR", par.createParameterString(par.extractframes).c_str()); + } else { + cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str()); + } cmd.addVariable("SEARCH", program.c_str()); program = std::string(tmpDir + "/translated_search.sh"); + FileUtil::writeFile(program.c_str(), translated_search_sh, translated_search_sh_len); }else if(searchMode & Parameters::SEARCH_MODE_FLAG_QUERY_NUCLEOTIDE && searchMode & Parameters::SEARCH_MODE_FLAG_TARGET_NUCLEOTIDE){ FileUtil::writeFile(tmpDir + "/blastn.sh", blastn_sh, blastn_sh_len);