Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SKIP_ORF option in translated_search #885

Merged
merged 11 commits into from
Nov 22, 2024
14 changes: 10 additions & 4 deletions data/workflow/createindex.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,15 @@ INPUT="$1"
if [ -n "$TRANSLATED" ]; then
# 1. extract orf
if notExists "$2/orfs_aa.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" extractorfs "$INPUT" "$2/orfs_aa" ${ORF_PAR} \
|| fail "extractorfs died"
if [ -n "$ORF_SKIP" ]; then
# shellcheck disable=SC2086
"$MMSEQS" extractframes "$INPUT" "$2/orfs_aa" ${EXTRACT_FRAMES_PAR} \
|| fail "extractframes died"
else
# shellcheck disable=SC2086
"$MMSEQS" extractorfs "$INPUT" "$2/orfs_aa" ${ORF_PAR} \
|| fail "extractorfs died"
fi
fi

# shellcheck disable=SC2086
Expand All @@ -33,7 +39,7 @@ if [ -n "$TRANSLATED" ]; then
rm -f "$2/createindex.sh"
fi
elif [ -n "$LIN_NUCL" ] || [ -n "$NUCL" ]; then
# 1. extract orf
# 1. extract orf
if notExists "$2/nucl_split_seq.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" splitsequence "$INPUT" "$2/nucl_split_seq" ${SPLIT_SEQ_PAR} \
Expand Down
33 changes: 25 additions & 8 deletions data/workflow/translated_search.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,18 @@ TMP_PATH="$4"
QUERY="$1"
QUERY_ORF="$1"
if [ -n "$QUERY_NUCL" ]; then
if notExists "${TMP_PATH}/q_orfs_aa.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" extractorfs "$1" "${TMP_PATH}/q_orfs_aa" ${ORF_PAR} \
|| fail "extract orfs step died"
if [ -n "$ORF_SKIP" ]; then
if notExists "${TMP_PATH}/q_orfs_aa.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" extractframes "$1" "${TMP_PATH}/q_orfs_aa" ${EXTRACT_FRAMES_PAR} \
|| fail "extractframes died"
fi
else
if notExists "${TMP_PATH}/q_orfs_aa.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" extractorfs "$1" "${TMP_PATH}/q_orfs_aa" ${ORF_PAR} \
|| fail "extract orfs step died"
fi
fi
QUERY="${TMP_PATH}/q_orfs_aa"
QUERY_ORF="${TMP_PATH}/q_orfs_aa"
Expand All @@ -34,10 +42,19 @@ TARGET="$2"
TARGET_ORF="$2"
if [ -n "$TARGET_NUCL" ]; then
if [ -n "$NO_TARGET_INDEX" ]; then
if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" extractorfs "$2" "${TMP_PATH}/t_orfs_aa" ${ORF_PAR} \
|| fail "extract target orfs step died"
if [ -n "$ORF_SKIP" ]; then
if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" extractframes "$2" "${TMP_PATH}/t_orfs_aa" ${EXTRACT_FRAMES_PAR} \
|| fail "extractframes died"
fi
else
if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then
# same here
# shellcheck disable=SC2086
"$MMSEQS" extractorfs "$2" "${TMP_PATH}/t_orfs_aa" ${ORF_PAR} \
|| fail "extract target orfs step died"
fi
fi
TARGET="${TMP_PATH}/t_orfs_aa"
TARGET_ORF="${TMP_PATH}/t_orfs_aa"
Expand Down
12 changes: 9 additions & 3 deletions src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ Parameters::Parameters():
PARAM_ORF_FILTER_S(PARAM_ORF_FILTER_S_ID, "--orf-filter-s", "ORF filter sensitivity", "Sensitivity used for query ORF prefiltering", typeid(float), (void *) &orfFilterSens, "^[0-9]*(\\.[0-9]+)?$"),
PARAM_ORF_FILTER_E(PARAM_ORF_FILTER_E_ID, "--orf-filter-e", "ORF filter e-value", "E-value threshold used for query ORF prefiltering", typeid(double), (void *) &orfFilterEval, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|[0-9]*(\\.[0-9]+)?$"),
PARAM_LCA_SEARCH(PARAM_LCA_SEARCH_ID, "--lca-search", "LCA search mode", "Efficient search for LCA candidates", typeid(bool), (void *) &lcaSearch, "", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
PARAM_TRANSLATION_MODE(PARAM_TRANSLATION_MODE_ID, "--translation-mode", "Translation mode", "Translation AA seq from nucleotide by 0: ORFs, 1: full reading frames", typeid(int), (void *) &translationMode, "^[0-1]{1}$"),
// easysearch
PARAM_GREEDY_BEST_HITS(PARAM_GREEDY_BEST_HITS_ID, "--greedy-best-hits", "Greedy best hits", "Choose the best hits greedily to cover the query", typeid(bool), (void *) &greedyBestHits, ""),
// extractorfs
Expand Down Expand Up @@ -727,7 +728,9 @@ Parameters::Parameters():

// extract frames
extractframes.push_back(&PARAM_ORF_FORWARD_FRAMES);
extractframes.push_back(&PARAM_ORF_REVERSE_FRAMES);
extractframes.push_back(&PARAM_ORF_REVERSE_FRAMES);
extractframes.push_back(&PARAM_TRANSLATION_TABLE);
extractframes.push_back(&PARAM_TRANSLATE);
extractframes.push_back(&PARAM_CREATE_LOOKUP);
extractframes.push_back(&PARAM_THREADS);
extractframes.push_back(&PARAM_COMPRESSED);
Expand Down Expand Up @@ -1251,7 +1254,7 @@ Parameters::Parameters():
searchworkflow = combineList(searchworkflow, rescorediagonal);
searchworkflow = combineList(searchworkflow, result2profile);
searchworkflow = combineList(searchworkflow, extractorfs);
searchworkflow = combineList(searchworkflow, translatenucs);
searchworkflow = combineList(searchworkflow, extractframes);
searchworkflow = combineList(searchworkflow, splitsequence);
searchworkflow = combineList(searchworkflow, offsetalignment);
// needed for slice search, however all its parameters are already present in searchworkflow
Expand All @@ -1268,6 +1271,7 @@ Parameters::Parameters():
searchworkflow.push_back(&PARAM_RUNNER);
searchworkflow.push_back(&PARAM_REUSELATEST);
searchworkflow.push_back(&PARAM_REMOVE_TMP_FILES);
searchworkflow.push_back(&PARAM_TRANSLATION_MODE);

linsearchworkflow = combineList(align, kmersearch);
linsearchworkflow = combineList(linsearchworkflow, swapresult);
Expand All @@ -1290,8 +1294,9 @@ Parameters::Parameters():

// createindex workflow
createindex = combineList(indexdb, extractorfs);
createindex = combineList(createindex, translatenucs);
createindex = combineList(createindex, extractframes);
createindex = combineList(createindex, splitsequence);
createindex.push_back(&PARAM_TRANSLATION_MODE);
createindex.push_back(&PARAM_STRAND);
createindex.push_back(&PARAM_REMOVE_TMP_FILES);

Expand Down Expand Up @@ -2277,6 +2282,7 @@ void Parameters::setDefaults() {
orfFilterSens = 2.0;
orfFilterEval = 100;
lcaSearch = false;
translationMode = PARAM_TRANSLATION_MODE_ORF;

greedyBestHits = false;

Expand Down
6 changes: 6 additions & 0 deletions src/commons/Parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,10 @@ class Parameters {
static const int PARAM_RESULT_DIRECTION_QUERY = 0;
static const int PARAM_RESULT_DIRECTION_TARGET = 1;

// translation mode
static const int PARAM_TRANSLATION_MODE_ORF = 0;
static const int PARAM_TRANSLATION_MODE_FRAME = 1;

// path to databases
std::string db1;
std::string db1Index;
Expand Down Expand Up @@ -467,6 +471,7 @@ class Parameters {
float orfFilterSens;
double orfFilterEval;
bool lcaSearch;
int translationMode;

// easysearch
bool greedyBestHits;
Expand Down Expand Up @@ -886,6 +891,7 @@ class Parameters {
PARAMETER(PARAM_ORF_FILTER_S)
PARAMETER(PARAM_ORF_FILTER_E)
PARAMETER(PARAM_LCA_SEARCH)
PARAMETER(PARAM_TRANSLATION_MODE)

// easysearch
PARAMETER(PARAM_GREEDY_BEST_HITS)
Expand Down
129 changes: 82 additions & 47 deletions src/util/extractframes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@
#include "DBWriter.h"
#include "Matcher.h"
#include "Util.h"
#include "itoa.h"

#include "TranslateNucl.h"
#include "Orf.h"

#include <unistd.h>
Expand All @@ -16,23 +15,63 @@
#include <omp.h>
#endif

void handleSingleFrame(TranslateNucl& translateNucl, DBWriter& sequenceWriter, DBWriter& headerWriter, unsigned int key, char* headerBuffer, const char* data, size_t seqLen, int frame, bool reverse, bool translate, char*& aaBuffer, size_t& aaBufferSize, int thread_idx) {
data = data + frame;
seqLen = seqLen - frame;
if (translate == true) {
if (seqLen < 3) {
return;
}
size_t codonLength = (seqLen / 3) * 3;
if ((codonLength + 1) > aaBufferSize) {
aaBufferSize = codonLength * 1.5 + 1;
aaBuffer = (char*)realloc(aaBuffer, aaBufferSize * sizeof(char));
}
translateNucl.translate(aaBuffer, data, codonLength);
aaBuffer[codonLength / 3] = '\n';
sequenceWriter.writeData(aaBuffer, (codonLength / 3) + 1, key, thread_idx);
size_t bufferLen;
if (reverse) {
bufferLen = Orf::writeOrfHeader(headerBuffer, key, frame + codonLength, static_cast<size_t>(frame), 0, 0);
} else {
bufferLen = Orf::writeOrfHeader(headerBuffer, key, static_cast<size_t>(frame), frame + codonLength, 0, 0);
}
headerWriter.writeData(headerBuffer, bufferLen, key, thread_idx);
} else {
// +1: add newline, but remove it from the end pos
sequenceWriter.writeData(data, seqLen + 1, key, thread_idx);
size_t bufferLen;
if (reverse) {
bufferLen = Orf::writeOrfHeader(headerBuffer, key, seqLen - 1, static_cast<size_t>(frame), 0, 0);
} else {
bufferLen = Orf::writeOrfHeader(headerBuffer, key, static_cast<size_t>(frame), seqLen - 1, 0, 0);
}
headerWriter.writeData(headerBuffer, bufferLen, key, thread_idx);
}
}

int extractframes(int argc, const char **argv, const Command& command) {
Parameters& par = Parameters::getInstance();
par.parseParameters(argc, argv, command, true, 0, 0);

DBReader<unsigned int> reader(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
reader.open(DBReader<unsigned int>::NOSORT);

DBWriter sequenceWriter(par.db2.c_str(), par.db2Index.c_str(), par.threads, par.compressed, reader.getDbtype());
int outputDbtype = reader.getDbtype();
if (par.translate) {
outputDbtype = Parameters::DBTYPE_AMINO_ACIDS;
}
DBWriter sequenceWriter(par.db2.c_str(), par.db2Index.c_str(), par.threads, par.compressed, outputDbtype);
sequenceWriter.open();

DBWriter headerWriter(par.hdr2.c_str(), par.hdr2Index.c_str(), par.threads, false, Parameters::DBTYPE_GENERIC_DB);
headerWriter.open();

unsigned int forwardFrames = Orf::getFrames(par.forwardFrames);
unsigned int reverseFrames = Orf::getFrames(par.reverseFrames);
Debug::Progress progress(reader.getSize());

Debug::Progress progress(reader.getSize());
TranslateNucl translateNucl(static_cast<TranslateNucl::GenCode>(par.translationTable));
#pragma omp parallel
{
int thread_idx = 0;
Expand All @@ -46,70 +85,66 @@ int extractframes(int argc, const char **argv, const Command& command) {
queryFrom = 0;
}

size_t aaBufferSize = par.maxSeqLen + 3 + 1;
char* aa = NULL;
if (par.translate == true) {
aa = (char*)malloc(aaBufferSize * sizeof(char));
}

char buffer[1024];

std::string reverseComplementStr;
reverseComplementStr.reserve(32000);

for (unsigned int i = queryFrom; i < (queryFrom + querySize); ++i){
progress.updateProgress();

unsigned int key = reader.getDbKey(i);
const char* data = reader.getData(i, thread_idx);
size_t dataLength = reader.getEntryLen(i);

size_t bufferLen;
switch (forwardFrames){
case Orf::FRAME_1:
// -1 to ignore the null byte copy the new line
sequenceWriter.writeData(data, dataLength - 1, key, thread_idx);
bufferLen = Orf::writeOrfHeader(buffer, key, static_cast<size_t >(0), dataLength - 3, 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
break;
case Orf::FRAME_2:
sequenceWriter.writeData(data + 1, dataLength - 2, key, thread_idx);
bufferLen = Orf::writeOrfHeader(buffer, key, static_cast<size_t >(1), dataLength - 4, 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
break;
case Orf::FRAME_3:
sequenceWriter.writeData(data + 2, dataLength - 3, key, thread_idx);
bufferLen = Orf::writeOrfHeader(buffer, key, static_cast<size_t >(2), dataLength - 5, 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
break;
size_t seqLen = reader.getSeqLen(i);

if (forwardFrames & Orf::FRAME_1) {
handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 0, false, par.translate, aa, aaBufferSize, thread_idx);
}
if (forwardFrames & Orf::FRAME_2) {
handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 1, false, par.translate, aa, aaBufferSize, thread_idx);
}
if (forwardFrames & Orf::FRAME_3) {
handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 2, false, par.translate, aa, aaBufferSize, thread_idx);
}

if(reverseFrames != 0){
size_t sequenceLength = dataLength -2;
if (reverseFrames != 0) {
// bool hasWrongChar = false;
for(size_t pos = 0; pos < sequenceLength; ++pos) {
char reverseComplement = Orf::complement(data[sequenceLength - pos - 1]);
for (size_t pos = 0; pos < seqLen; ++pos) {
char reverseComplement = Orf::complement(data[seqLen - pos - 1]);
reverseComplement = (reverseComplement == '.') ? 'N' : reverseComplement;
reverseComplementStr.push_back(reverseComplement);
// hasWrongChar |= (reverseComplement == '.');
}
// if(hasWrongChar == true){
// continue;
// }
// if (hasWrongChar == true) {
// continue;
// }
reverseComplementStr.push_back('\n');
seqLen = reverseComplementStr.size() - 1;
data = reverseComplementStr.c_str();
}

switch (reverseFrames){
case Orf::FRAME_1:
sequenceWriter.writeData(reverseComplementStr.c_str(), reverseComplementStr.size(), key, thread_idx);
bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 2, static_cast<size_t >(0), 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
break;
case Orf::FRAME_2:
sequenceWriter.writeData(reverseComplementStr.c_str()+1, reverseComplementStr.size()-1, key, thread_idx);
bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 3, static_cast<size_t >(1), 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
break;
case Orf::FRAME_3:
sequenceWriter.writeData(reverseComplementStr.c_str()+2, reverseComplementStr.size()-2, key, thread_idx);
bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 4, static_cast<size_t >(2), 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
break;
if (reverseFrames & Orf::FRAME_1) {
handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 0, true, par.translate, aa, aaBufferSize, thread_idx);
}

if (reverseFrames & Orf::FRAME_2) {
handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 1, true, par.translate, aa, aaBufferSize, thread_idx);
}

if (reverseFrames & Orf::FRAME_3) {
handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 2, true, par.translate, aa, aaBufferSize, thread_idx);
}
reverseComplementStr.clear();
}
if (aa != NULL) {
free(aa);
}
}
headerWriter.close(true);
sequenceWriter.close(true);
Expand Down
8 changes: 6 additions & 2 deletions src/workflow/CreateIndex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,12 @@ int createindex(Parameters &par, const Command &command, const std::string &inde
cmd.addVariable("INDEXER", indexerModule.c_str());
cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL);
par.translate = 1;
cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str());
cmd.addVariable("EXTRACT_FRAMES_PAR", par.createParameterString(par.extractframes).c_str());
cmd.addVariable("ORF_SKIP", par.translationMode == Parameters::PARAM_TRANSLATION_MODE_FRAME ? "TRUE" : NULL);
if (par.translationMode == Parameters::PARAM_TRANSLATION_MODE_FRAME) {
cmd.addVariable("EXTRACT_FRAMES_PAR", par.createParameterString(par.extractframes).c_str());
} else {
cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str());
}
cmd.addVariable("SPLIT_SEQ_PAR", par.createParameterString(par.splitsequence).c_str());
if(indexerModule == "kmerindexdb"){
cmd.addVariable("INDEX_PAR", par.createParameterString(par.kmerindexdb).c_str());
Expand Down
15 changes: 10 additions & 5 deletions src/workflow/Search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -208,8 +208,8 @@ int search(int argc, const char **argv, const Command& command) {
for (size_t i = 0; i < par.extractorfs.size(); i++) {
par.extractorfs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
}
for (size_t i = 0; i < par.translatenucs.size(); i++) {
par.translatenucs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
for (size_t i = 0; i < par.extractframes.size(); i++) {
par.extractframes[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
}
for (size_t i = 0; i < par.splitsequence.size(); i++) {
par.splitsequence[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
Expand Down Expand Up @@ -534,17 +534,22 @@ int search(int argc, const char **argv, const Command& command) {

if (searchMode & (Parameters::SEARCH_MODE_FLAG_QUERY_TRANSLATED|Parameters::SEARCH_MODE_FLAG_TARGET_TRANSLATED)) {
cmd.addVariable("NO_TARGET_INDEX", (indexStr == "") ? "TRUE" : NULL);
FileUtil::writeFile(tmpDir + "/translated_search.sh", translated_search_sh, translated_search_sh_len);
cmd.addVariable("QUERY_NUCL", (searchMode & Parameters::SEARCH_MODE_FLAG_QUERY_TRANSLATED) ? "TRUE" : NULL);
cmd.addVariable("TARGET_NUCL", (searchMode & Parameters::SEARCH_MODE_FLAG_TARGET_TRANSLATED) ? "TRUE" : NULL);
cmd.addVariable("TARGET_NUCL", (searchMode & Parameters::SEARCH_MODE_FLAG_TARGET_TRANSLATED) ? "TRUE" : NULL);
cmd.addVariable("THREAD_COMP_PAR", par.createParameterString(par.threadsandcompression).c_str());
par.subDbMode = 1;
cmd.addVariable("CREATESUBDB_PAR", par.createParameterString(par.createsubdb).c_str());
par.translate = 1;
cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str());
cmd.addVariable("OFFSETALIGNMENT_PAR", par.createParameterString(par.offsetalignment).c_str());
cmd.addVariable("ORF_SKIP", par.translationMode == Parameters::PARAM_TRANSLATION_MODE_FRAME ? "TRUE" : NULL);
if (par.translationMode == Parameters::PARAM_TRANSLATION_MODE_FRAME) {
cmd.addVariable("EXTRACT_FRAMES_PAR", par.createParameterString(par.extractframes).c_str());
LunaJang marked this conversation as resolved.
Show resolved Hide resolved
} else {
cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str());
}
cmd.addVariable("SEARCH", program.c_str());
program = std::string(tmpDir + "/translated_search.sh");
FileUtil::writeFile(program.c_str(), translated_search_sh, translated_search_sh_len);
}else if(searchMode & Parameters::SEARCH_MODE_FLAG_QUERY_NUCLEOTIDE &&
searchMode & Parameters::SEARCH_MODE_FLAG_TARGET_NUCLEOTIDE){
FileUtil::writeFile(tmpDir + "/blastn.sh", blastn_sh, blastn_sh_len);
Expand Down