Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SKIP_ORF option in translated_search #885

Merged
merged 11 commits into from
Nov 22, 2024
48 changes: 38 additions & 10 deletions data/workflow/translated_search.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,24 @@ TMP_PATH="$4"
QUERY="$1"
QUERY_ORF="$1"
if [ -n "$QUERY_NUCL" ]; then
if notExists "${TMP_PATH}/q_orfs_aa.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" extractorfs "$1" "${TMP_PATH}/q_orfs_aa" ${ORF_PAR} \
|| fail "extract orfs step died"
# introduce EXTRACT_FRAMES_PAR, TRANSLATE_PAR, ORF_SKIP to Search.cpp and to Parameters::Parameters
LunaJang marked this conversation as resolved.
Show resolved Hide resolved
if [ "$ORF_SKIP" ]; then
LunaJang marked this conversation as resolved.
Show resolved Hide resolved
if notExists "${TMP_PATH}/q_orfs_aa.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" extractframes "$1" "${TMP_PATH}/q_orfs_nucl" ${EXTRACT_FRAMES_PAR} \
|| fail "extractframes died"
# we want to avoid this \/
# shellcheck disable=SC2086
"$MMSEQS" translatenucs "${TMP_PATH}/q_orfs_nucl" "${TMP_PATH}/q_orfs_aa" ${TRANSLATE_PAR} \
|| fail "translatenucs died"
"$MMSEQS" rmdb "${TMP_PATH}/q_orfs_nucl"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

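See the comment below on the matching target-side rmdb call: this rmdb call also needs ${VERBOSITY} (preceded by a "# shellcheck disable=SC2086" line).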

fi
else
if notExists "${TMP_PATH}/q_orfs_aa.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" extractorfs "$1" "${TMP_PATH}/q_orfs_aa" ${ORF_PAR} \
|| fail "extract orfs step died"
fi
fi
QUERY="${TMP_PATH}/q_orfs_aa"
QUERY_ORF="${TMP_PATH}/q_orfs_aa"
Expand All @@ -34,13 +48,27 @@ TARGET="$2"
TARGET_ORF="$2"
if [ -n "$TARGET_NUCL" ]; then
if [ -n "$NO_TARGET_INDEX" ]; then
if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" extractorfs "$2" "${TMP_PATH}/t_orfs_aa" ${ORF_PAR} \
|| fail "extract target orfs step died"
if [ "$ORF_SKIP" ]; then
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same as above

if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" extractframes "$1" "${TMP_PATH}/t_orfs_nucl" ${EXTRACT_FRAMES_PAR} \
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

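The input database here should be "$2" (the target), not "$1" (the query): this is the target branch, and the extractorfs call it mirrors also reads from "$2".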

|| fail "extractframes died"
# we want to avoid this \/
# shellcheck disable=SC2086
"$MMSEQS" translatenucs "${TMP_PATH}/t_orfs_nucl" "${TMP_PATH}/t_orfs_aa" ${TRANSLATE_PAR} \
|| fail "translatenucs died"
"$MMSEQS" rmdb "${TMP_PATH}/t_orfs_nucl"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The rmdb calls need ${VERBOSITY}:

# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/t_orfs_nucl" ${VERBOSITY}

fi
else
if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then
# same here
# shellcheck disable=SC2086
"$MMSEQS" extractorfs "$2" "${TMP_PATH}/t_orfs_aa" ${ORF_PAR} \
|| fail "extract target orfs step died"
fi
fi
TARGET="${TMP_PATH}/t_orfs_aa"
TARGET_ORF="${TMP_PATH}/t_orfs_aa"
TARGET="${TMP_PATH}/t_orfs_aa"
TARGET_ORF="${TMP_PATH}/t_orfs_aa"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please check if this whitespace should be here. From the diff, this looks wrong.

fi
fi

Expand Down
4 changes: 4 additions & 0 deletions src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ Parameters::Parameters():
PARAM_ORF_FILTER_S(PARAM_ORF_FILTER_S_ID, "--orf-filter-s", "ORF filter sensitivity", "Sensitivity used for query ORF prefiltering", typeid(float), (void *) &orfFilterSens, "^[0-9]*(\\.[0-9]+)?$"),
PARAM_ORF_FILTER_E(PARAM_ORF_FILTER_E_ID, "--orf-filter-e", "ORF filter e-value", "E-value threshold used for query ORF prefiltering", typeid(double), (void *) &orfFilterEval, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|[0-9]*(\\.[0-9]+)?$"),
PARAM_LCA_SEARCH(PARAM_LCA_SEARCH_ID, "--lca-search", "LCA search mode", "Efficient search for LCA candidates", typeid(bool), (void *) &lcaSearch, "", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
PARAM_ORF_SKIP(PARAM_ORF_SKIP_ID, "--orf-skip", "ORF skip mode", "???", typeid(bool), (void *) &orfSkip, ""),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Still TODO: the help text for --orf-skip is still the placeholder "???"; it needs a real description.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PARAM_ORF_SKIP(PARAM_ORF_SKIP_ID, "--orf-skip", "Extract frames instead of ORFs", "Extract frames instead of ORFs during translated search", typeid(bool), (void *) &orfSkip, ""),

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@martin-steinegger any better ideas for a parameter name?

// easysearch
PARAM_GREEDY_BEST_HITS(PARAM_GREEDY_BEST_HITS_ID, "--greedy-best-hits", "Greedy best hits", "Choose the best hits greedily to cover the query", typeid(bool), (void *) &greedyBestHits, ""),
// extractorfs
Expand Down Expand Up @@ -1251,6 +1252,7 @@ Parameters::Parameters():
searchworkflow = combineList(searchworkflow, rescorediagonal);
searchworkflow = combineList(searchworkflow, result2profile);
searchworkflow = combineList(searchworkflow, extractorfs);
searchworkflow = combineList(searchworkflow, extractframes);
searchworkflow = combineList(searchworkflow, translatenucs);
searchworkflow = combineList(searchworkflow, splitsequence);
searchworkflow = combineList(searchworkflow, offsetalignment);
Expand All @@ -1268,6 +1270,7 @@ Parameters::Parameters():
searchworkflow.push_back(&PARAM_RUNNER);
searchworkflow.push_back(&PARAM_REUSELATEST);
searchworkflow.push_back(&PARAM_REMOVE_TMP_FILES);
searchworkflow.push_back(&PARAM_ORF_SKIP);

linsearchworkflow = combineList(align, kmersearch);
linsearchworkflow = combineList(linsearchworkflow, swapresult);
Expand Down Expand Up @@ -2277,6 +2280,7 @@ void Parameters::setDefaults() {
orfFilterSens = 2.0;
orfFilterEval = 100;
lcaSearch = false;
orfSkip = false;

greedyBestHits = false;

Expand Down
2 changes: 2 additions & 0 deletions src/commons/Parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,7 @@ class Parameters {
float orfFilterSens;
double orfFilterEval;
bool lcaSearch;
bool orfSkip;

// easysearch
bool greedyBestHits;
Expand Down Expand Up @@ -886,6 +887,7 @@ class Parameters {
PARAMETER(PARAM_ORF_FILTER_S)
PARAMETER(PARAM_ORF_FILTER_E)
PARAMETER(PARAM_LCA_SEARCH)
PARAMETER(PARAM_ORF_SKIP)

// easysearch
PARAMETER(PARAM_GREEDY_BEST_HITS)
Expand Down
12 changes: 11 additions & 1 deletion src/workflow/Search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,9 @@ int search(int argc, const char **argv, const Command& command) {
for (size_t i = 0; i < par.extractorfs.size(); i++) {
par.extractorfs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
}
for (size_t i = 0; i < par.extractframes.size(); i++) {
par.extractframes[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
}
for (size_t i = 0; i < par.translatenucs.size(); i++) {
par.translatenucs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
}
Expand Down Expand Up @@ -541,9 +544,16 @@ int search(int argc, const char **argv, const Command& command) {
par.subDbMode = 1;
cmd.addVariable("CREATESUBDB_PAR", par.createParameterString(par.createsubdb).c_str());
par.translate = 1;
cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str());
cmd.addVariable("OFFSETALIGNMENT_PAR", par.createParameterString(par.offsetalignment).c_str());
cmd.addVariable("SEARCH", program.c_str());
if (par.orfSkip) {
cmd.addVariable("ORF_SKIP", "TRUE");
cmd.addVariable("TRANSLATE_PAR", par.createParameterString(par.translatenucs).c_str());
cmd.addVariable("EXTRACT_FRAMES_PAR", par.createParameterString(par.extractframes).c_str());
LunaJang marked this conversation as resolved.
Show resolved Hide resolved
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

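Code/whitespace style: please put the closing brace and the else on one line, i.e. write "} else {" instead of "}" followed by "else{" on the next line.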

else{
cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str());
}
program = std::string(tmpDir + "/translated_search.sh");
}else if(searchMode & Parameters::SEARCH_MODE_FLAG_QUERY_NUCLEOTIDE &&
searchMode & Parameters::SEARCH_MODE_FLAG_TARGET_NUCLEOTIDE){
Expand Down