Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SKIP_ORF option in translated_search #885

Merged
merged 11 commits into from
Nov 22, 2024
33 changes: 25 additions & 8 deletions data/workflow/translated_search.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,18 @@ TMP_PATH="$4"
QUERY="$1"
QUERY_ORF="$1"
if [ -n "$QUERY_NUCL" ]; then
if notExists "${TMP_PATH}/q_orfs_aa.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" extractorfs "$1" "${TMP_PATH}/q_orfs_aa" ${ORF_PAR} \
|| fail "extract orfs step died"
if [ "$ORF_SKIP" = "TRUE" ]; then
if notExists "${TMP_PATH}/q_orfs_aa.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" extractframes "$1" "${TMP_PATH}/q_orfs_aa" ${EXTRACT_FRAMES_PAR} \
|| fail "extractframes died"
fi
else
if notExists "${TMP_PATH}/q_orfs_aa.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" extractorfs "$1" "${TMP_PATH}/q_orfs_aa" ${ORF_PAR} \
|| fail "extract orfs step died"
fi
fi
QUERY="${TMP_PATH}/q_orfs_aa"
QUERY_ORF="${TMP_PATH}/q_orfs_aa"
Expand All @@ -34,10 +42,19 @@ TARGET="$2"
TARGET_ORF="$2"
if [ -n "$TARGET_NUCL" ]; then
if [ -n "$NO_TARGET_INDEX" ]; then
if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" extractorfs "$2" "${TMP_PATH}/t_orfs_aa" ${ORF_PAR} \
|| fail "extract target orfs step died"
if [ "$ORF_SKIP" ]; then
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same as above

if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" extractframes "$2" "${TMP_PATH}/t_orfs_aa" ${EXTRACT_FRAMES_PAR} \
|| fail "extractframes died"
fi
else
if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then
# same here
# shellcheck disable=SC2086
"$MMSEQS" extractorfs "$2" "${TMP_PATH}/t_orfs_aa" ${ORF_PAR} \
|| fail "extract target orfs step died"
fi
fi
TARGET="${TMP_PATH}/t_orfs_aa"
TARGET_ORF="${TMP_PATH}/t_orfs_aa"
Expand Down
9 changes: 7 additions & 2 deletions src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ Parameters::Parameters():
PARAM_ORF_FILTER_S(PARAM_ORF_FILTER_S_ID, "--orf-filter-s", "ORF filter sensitivity", "Sensitivity used for query ORF prefiltering", typeid(float), (void *) &orfFilterSens, "^[0-9]*(\\.[0-9]+)?$"),
PARAM_ORF_FILTER_E(PARAM_ORF_FILTER_E_ID, "--orf-filter-e", "ORF filter e-value", "E-value threshold used for query ORF prefiltering", typeid(double), (void *) &orfFilterEval, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|[0-9]*(\\.[0-9]+)?$"),
PARAM_LCA_SEARCH(PARAM_LCA_SEARCH_ID, "--lca-search", "LCA search mode", "Efficient search for LCA candidates", typeid(bool), (void *) &lcaSearch, "", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
PARAM_ORF_SKIP(PARAM_ORF_SKIP_ID, "--orf-skip", "ORF skip mode", "???", typeid(bool), (void *) &orfSkip, ""),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Still TODO

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PARAM_ORF_SKIP(PARAM_ORF_SKIP_ID, "--orf-skip", "Extract frames instead of ORFs", "Extract frames instead of ORFs during translated search", typeid(bool), (void *) &orfSkip, ""),

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@martin-steinegger any better ideas for a parameter name?

// easysearch
PARAM_GREEDY_BEST_HITS(PARAM_GREEDY_BEST_HITS_ID, "--greedy-best-hits", "Greedy best hits", "Choose the best hits greedily to cover the query", typeid(bool), (void *) &greedyBestHits, ""),
// extractorfs
Expand Down Expand Up @@ -727,7 +728,9 @@ Parameters::Parameters():

// extract frames
extractframes.push_back(&PARAM_ORF_FORWARD_FRAMES);
extractframes.push_back(&PARAM_ORF_REVERSE_FRAMES);
extractframes.push_back(&PARAM_ORF_REVERSE_FRAMES);
extractframes.push_back(&PARAM_TRANSLATION_TABLE);
extractframes.push_back(&PARAM_TRANSLATE);
extractframes.push_back(&PARAM_CREATE_LOOKUP);
extractframes.push_back(&PARAM_THREADS);
extractframes.push_back(&PARAM_COMPRESSED);
Expand Down Expand Up @@ -1251,7 +1254,7 @@ Parameters::Parameters():
searchworkflow = combineList(searchworkflow, rescorediagonal);
searchworkflow = combineList(searchworkflow, result2profile);
searchworkflow = combineList(searchworkflow, extractorfs);
searchworkflow = combineList(searchworkflow, translatenucs);
searchworkflow = combineList(searchworkflow, extractframes);
searchworkflow = combineList(searchworkflow, splitsequence);
searchworkflow = combineList(searchworkflow, offsetalignment);
// needed for slice search, however all its parameters are already present in searchworkflow
Expand All @@ -1268,6 +1271,7 @@ Parameters::Parameters():
searchworkflow.push_back(&PARAM_RUNNER);
searchworkflow.push_back(&PARAM_REUSELATEST);
searchworkflow.push_back(&PARAM_REMOVE_TMP_FILES);
searchworkflow.push_back(&PARAM_ORF_SKIP);

linsearchworkflow = combineList(align, kmersearch);
linsearchworkflow = combineList(linsearchworkflow, swapresult);
Expand Down Expand Up @@ -2277,6 +2281,7 @@ void Parameters::setDefaults() {
orfFilterSens = 2.0;
orfFilterEval = 100;
lcaSearch = false;
orfSkip = false;

greedyBestHits = false;

Expand Down
2 changes: 2 additions & 0 deletions src/commons/Parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,7 @@ class Parameters {
float orfFilterSens;
double orfFilterEval;
bool lcaSearch;
bool orfSkip;

// easysearch
bool greedyBestHits;
Expand Down Expand Up @@ -886,6 +887,7 @@ class Parameters {
PARAMETER(PARAM_ORF_FILTER_S)
PARAMETER(PARAM_ORF_FILTER_E)
PARAMETER(PARAM_LCA_SEARCH)
PARAMETER(PARAM_ORF_SKIP)

// easysearch
PARAMETER(PARAM_GREEDY_BEST_HITS)
Expand Down
214 changes: 179 additions & 35 deletions src/util/extractframes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "DBWriter.h"
#include "Matcher.h"
#include "Util.h"
#include "TranslateNucl.h"
#include "itoa.h"

#include "Orf.h"
Expand All @@ -23,16 +24,21 @@ int extractframes(int argc, const char **argv, const Command& command) {
DBReader<unsigned int> reader(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
reader.open(DBReader<unsigned int>::NOSORT);

DBWriter sequenceWriter(par.db2.c_str(), par.db2Index.c_str(), par.threads, par.compressed, reader.getDbtype());
int outputDbtype = reader.getDbtype();
if(par.translate) {
outputDbtype = Parameters::DBTYPE_AMINO_ACIDS;
}
DBWriter sequenceWriter(par.db2.c_str(), par.db2Index.c_str(), par.threads, par.compressed, outputDbtype);
sequenceWriter.open();

DBWriter headerWriter(par.hdr2.c_str(), par.hdr2Index.c_str(), par.threads, false, Parameters::DBTYPE_GENERIC_DB);
headerWriter.open();

unsigned int forwardFrames = Orf::getFrames(par.forwardFrames);
unsigned int reverseFrames = Orf::getFrames(par.reverseFrames);
Debug::Progress progress(reader.getSize());

Debug::Progress progress(reader.getSize());
TranslateNucl translateNucl(static_cast<TranslateNucl::GenCode>(par.translationTable));
#pragma omp parallel
{
int thread_idx = 0;
Expand All @@ -46,6 +52,7 @@ int extractframes(int argc, const char **argv, const Command& command) {
queryFrom = 0;
}

char* aa = new char[par.maxSeqLen + 3 + 1];
char buffer[1024];
std::string reverseComplementStr;
reverseComplementStr.reserve(32000);
Expand All @@ -57,24 +64,91 @@ int extractframes(int argc, const char **argv, const Command& command) {
size_t dataLength = reader.getEntryLen(i);

size_t bufferLen;
switch (forwardFrames){
case Orf::FRAME_1:
// -1 to ignore the null byte copy the new line
sequenceWriter.writeData(data, dataLength - 1, key, thread_idx);
bufferLen = Orf::writeOrfHeader(buffer, key, static_cast<size_t >(0), dataLength - 3, 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
break;
case Orf::FRAME_2:
sequenceWriter.writeData(data + 1, dataLength - 2, key, thread_idx);
bufferLen = Orf::writeOrfHeader(buffer, key, static_cast<size_t >(1), dataLength - 4, 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
break;
case Orf::FRAME_3:
sequenceWriter.writeData(data + 2, dataLength - 3, key, thread_idx);
bufferLen = Orf::writeOrfHeader(buffer, key, static_cast<size_t >(2), dataLength - 5, 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
break;
if (forwardFrames & Orf::FRAME_1) {
size_t cur_dataLength = dataLength-1;
const char* cur_data = data;

sequenceWriter.writeStart(thread_idx);
if(par.translate){
if ((cur_data[cur_dataLength] != '\n' && cur_dataLength % 3 != 0) && (cur_data[cur_dataLength - 1] == '\n' && (cur_dataLength - 1) % 3 != 0)) {
cur_dataLength = cur_dataLength - (cur_dataLength % 3);
}

if (cur_dataLength < 3) {
continue;
}

if (cur_dataLength > (3 * par.maxSeqLen)) {
cur_dataLength = (3 * par.maxSeqLen);
}
translateNucl.translate(aa, cur_data, cur_dataLength);
sequenceWriter.writeAdd(aa, (cur_dataLength / 3), thread_idx);

}else{
sequenceWriter.writeAdd(cur_data, cur_dataLength, thread_idx);
}
sequenceWriter.writeEnd(key, thread_idx);

bufferLen = Orf::writeOrfHeader(buffer, key, static_cast<size_t >(0), dataLength - 3, 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
}
if (forwardFrames & Orf::FRAME_2) {
size_t cur_dataLength = dataLength - 2;
const char* cur_data = data + 1;

sequenceWriter.writeStart(thread_idx);
if(par.translate){
if ((cur_data[cur_dataLength] != '\n' && cur_dataLength % 3 != 0) && (cur_data[cur_dataLength - 1] == '\n' && (cur_dataLength - 1) % 3 != 0)) {
cur_dataLength = cur_dataLength - (cur_dataLength % 3);
}

if (cur_dataLength < 3) {
continue;
}

if (cur_dataLength > (3 * par.maxSeqLen)) {
cur_dataLength = (3 * par.maxSeqLen);
}
translateNucl.translate(aa, cur_data, cur_dataLength);
sequenceWriter.writeAdd(aa, (cur_dataLength / 3), thread_idx);

}else{
sequenceWriter.writeAdd(cur_data, cur_dataLength, thread_idx);
}
sequenceWriter.writeEnd(key, thread_idx);

bufferLen = Orf::writeOrfHeader(buffer, key, static_cast<size_t >(1), dataLength - 4, 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
}
if (forwardFrames & Orf::FRAME_3) {
size_t cur_dataLength = dataLength - 3;
const char* cur_data = data + 2;

sequenceWriter.writeStart(thread_idx);
if(par.translate){
if ((cur_data[cur_dataLength] != '\n' && cur_dataLength % 3 != 0) && (cur_data[cur_dataLength - 1] == '\n' && (cur_dataLength - 1) % 3 != 0)) {
cur_dataLength = cur_dataLength - (cur_dataLength % 3);
}

if (cur_dataLength < 3) {
continue;
}

if (cur_dataLength > (3 * par.maxSeqLen)) {
cur_dataLength = (3 * par.maxSeqLen);
}
translateNucl.translate(aa, cur_data, cur_dataLength);
sequenceWriter.writeAdd(aa, (cur_dataLength / 3), thread_idx);

}else{
sequenceWriter.writeAdd(cur_data, cur_dataLength, thread_idx);
}
sequenceWriter.writeEnd(key, thread_idx);

bufferLen = Orf::writeOrfHeader(buffer, key, static_cast<size_t >(2), dataLength - 5, 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
}


if(reverseFrames != 0){
size_t sequenceLength = dataLength -2;
Expand All @@ -91,25 +165,95 @@ int extractframes(int argc, const char **argv, const Command& command) {
reverseComplementStr.push_back('\n');
}

switch (reverseFrames){
case Orf::FRAME_1:
sequenceWriter.writeData(reverseComplementStr.c_str(), reverseComplementStr.size(), key, thread_idx);
bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 2, static_cast<size_t >(0), 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
break;
case Orf::FRAME_2:
sequenceWriter.writeData(reverseComplementStr.c_str()+1, reverseComplementStr.size()-1, key, thread_idx);
bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 3, static_cast<size_t >(1), 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
break;
case Orf::FRAME_3:
sequenceWriter.writeData(reverseComplementStr.c_str()+2, reverseComplementStr.size()-2, key, thread_idx);
bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 4, static_cast<size_t >(2), 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
break;
if (reverseFrames & Orf::FRAME_1) {
size_t cur_dataLength = reverseComplementStr.size();
const char* cur_data = reverseComplementStr.c_str();

sequenceWriter.writeStart(thread_idx);
if(par.translate){
if ((cur_data[cur_dataLength] != '\n' && cur_dataLength % 3 != 0) && (cur_data[cur_dataLength - 1] == '\n' && (cur_dataLength - 1) % 3 != 0)) {
cur_dataLength = cur_dataLength - (cur_dataLength % 3);
}

if (cur_dataLength < 3) {
continue;
}

if (cur_dataLength > (3 * par.maxSeqLen)) {
cur_dataLength = (3 * par.maxSeqLen);
}
translateNucl.translate(aa, cur_data, cur_dataLength);
sequenceWriter.writeAdd(aa, (cur_dataLength / 3), thread_idx);

}else{
sequenceWriter.writeAdd(cur_data, cur_dataLength, thread_idx);
}
sequenceWriter.writeEnd(key, thread_idx);

bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 2, static_cast<size_t >(0), 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
}

if (reverseFrames & Orf::FRAME_2) {
size_t cur_dataLength = reverseComplementStr.size() - 1;
const char* cur_data = reverseComplementStr.c_str() + 1;

sequenceWriter.writeStart(thread_idx);
if(par.translate){
if ((cur_data[cur_dataLength] != '\n' && cur_dataLength % 3 != 0) && (cur_data[cur_dataLength - 1] == '\n' && (cur_dataLength - 1) % 3 != 0)) {
cur_dataLength = cur_dataLength - (cur_dataLength % 3);
}

if (cur_dataLength < 3) {
continue;
}

if (cur_dataLength > (3 * par.maxSeqLen)) {
cur_dataLength = (3 * par.maxSeqLen);
}
translateNucl.translate(aa, cur_data, cur_dataLength);
sequenceWriter.writeAdd(aa, (cur_dataLength / 3), thread_idx);

}else{
sequenceWriter.writeAdd(cur_data, cur_dataLength, thread_idx);
}
sequenceWriter.writeEnd(key, thread_idx);

bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 3, static_cast<size_t >(1), 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
}

if (reverseFrames & Orf::FRAME_3) {
size_t cur_dataLength = reverseComplementStr.size() - 2;
const char* cur_data = reverseComplementStr.c_str() + 2;

sequenceWriter.writeStart(thread_idx);
if(par.translate){
if ((cur_data[cur_dataLength] != '\n' && cur_dataLength % 3 != 0) && (cur_data[cur_dataLength - 1] == '\n' && (cur_dataLength - 1) % 3 != 0)) {
cur_dataLength = cur_dataLength - (cur_dataLength % 3);
}

if (cur_dataLength < 3) {
continue;
}

if (cur_dataLength > (3 * par.maxSeqLen)) {
cur_dataLength = (3 * par.maxSeqLen);
}
translateNucl.translate(aa, cur_data, cur_dataLength);
sequenceWriter.writeAdd(aa, (cur_dataLength / 3), thread_idx);

}else{
sequenceWriter.writeAdd(cur_data, cur_dataLength, thread_idx);
}
sequenceWriter.writeEnd(key, thread_idx);

bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 4, static_cast<size_t >(2), 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
}
reverseComplementStr.clear();
}
delete[] aa;
}
headerWriter.close(true);
sequenceWriter.close(true);
Expand Down
11 changes: 8 additions & 3 deletions src/workflow/Search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -208,8 +208,8 @@ int search(int argc, const char **argv, const Command& command) {
for (size_t i = 0; i < par.extractorfs.size(); i++) {
par.extractorfs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
}
for (size_t i = 0; i < par.translatenucs.size(); i++) {
par.translatenucs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
for (size_t i = 0; i < par.extractframes.size(); i++) {
par.extractframes[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
}
for (size_t i = 0; i < par.splitsequence.size(); i++) {
par.splitsequence[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
Expand Down Expand Up @@ -541,9 +541,14 @@ int search(int argc, const char **argv, const Command& command) {
par.subDbMode = 1;
cmd.addVariable("CREATESUBDB_PAR", par.createParameterString(par.createsubdb).c_str());
par.translate = 1;
cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str());
cmd.addVariable("OFFSETALIGNMENT_PAR", par.createParameterString(par.offsetalignment).c_str());
cmd.addVariable("SEARCH", program.c_str());
if (par.orfSkip) {
cmd.addVariable("ORF_SKIP", "TRUE");
cmd.addVariable("EXTRACT_FRAMES_PAR", par.createParameterString(par.extractframes).c_str());
LunaJang marked this conversation as resolved.
Show resolved Hide resolved
} else {
cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str());
}
program = std::string(tmpDir + "/translated_search.sh");
}else if(searchMode & Parameters::SEARCH_MODE_FLAG_QUERY_NUCLEOTIDE &&
searchMode & Parameters::SEARCH_MODE_FLAG_TARGET_NUCLEOTIDE){
Expand Down