Skip to content

Commit

Permalink
Merge branch 'skip_orf'
Browse files Browse the repository at this point in the history
  • Loading branch information
milot-mirdita committed Nov 22, 2024
2 parents 266c894 + a804518 commit 0310e6b
Show file tree
Hide file tree
Showing 7 changed files with 148 additions and 69 deletions.
14 changes: 10 additions & 4 deletions data/workflow/createindex.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,15 @@ INPUT="$1"
if [ -n "$TRANSLATED" ]; then
# 1. extract orf
if notExists "$2/orfs_aa.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" extractorfs "$INPUT" "$2/orfs_aa" ${ORF_PAR} \
|| fail "extractorfs died"
if [ -n "$ORF_SKIP" ]; then
# shellcheck disable=SC2086
"$MMSEQS" extractframes "$INPUT" "$2/orfs_aa" ${EXTRACT_FRAMES_PAR} \
|| fail "extractframes died"
else
# shellcheck disable=SC2086
"$MMSEQS" extractorfs "$INPUT" "$2/orfs_aa" ${ORF_PAR} \
|| fail "extractorfs died"
fi
fi

# shellcheck disable=SC2086
Expand All @@ -33,7 +39,7 @@ if [ -n "$TRANSLATED" ]; then
rm -f "$2/createindex.sh"
fi
elif [ -n "$LIN_NUCL" ] || [ -n "$NUCL" ]; then
# 1. extract orf
# 1. extract orf
if notExists "$2/nucl_split_seq.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" splitsequence "$INPUT" "$2/nucl_split_seq" ${SPLIT_SEQ_PAR} \
Expand Down
33 changes: 25 additions & 8 deletions data/workflow/translated_search.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,18 @@ TMP_PATH="$4"
QUERY="$1"
QUERY_ORF="$1"
if [ -n "$QUERY_NUCL" ]; then
if notExists "${TMP_PATH}/q_orfs_aa.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" extractorfs "$1" "${TMP_PATH}/q_orfs_aa" ${ORF_PAR} \
|| fail "extract orfs step died"
if [ -n "$ORF_SKIP" ]; then
if notExists "${TMP_PATH}/q_orfs_aa.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" extractframes "$1" "${TMP_PATH}/q_orfs_aa" ${EXTRACT_FRAMES_PAR} \
|| fail "extractframes died"
fi
else
if notExists "${TMP_PATH}/q_orfs_aa.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" extractorfs "$1" "${TMP_PATH}/q_orfs_aa" ${ORF_PAR} \
|| fail "extract orfs step died"
fi
fi
QUERY="${TMP_PATH}/q_orfs_aa"
QUERY_ORF="${TMP_PATH}/q_orfs_aa"
Expand All @@ -34,10 +42,19 @@ TARGET="$2"
TARGET_ORF="$2"
if [ -n "$TARGET_NUCL" ]; then
if [ -n "$NO_TARGET_INDEX" ]; then
if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" extractorfs "$2" "${TMP_PATH}/t_orfs_aa" ${ORF_PAR} \
|| fail "extract target orfs step died"
if [ -n "$ORF_SKIP" ]; then
if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" extractframes "$2" "${TMP_PATH}/t_orfs_aa" ${EXTRACT_FRAMES_PAR} \
|| fail "extractframes died"
fi
else
if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then
# same here
# shellcheck disable=SC2086
"$MMSEQS" extractorfs "$2" "${TMP_PATH}/t_orfs_aa" ${ORF_PAR} \
|| fail "extract target orfs step died"
fi
fi
TARGET="${TMP_PATH}/t_orfs_aa"
TARGET_ORF="${TMP_PATH}/t_orfs_aa"
Expand Down
12 changes: 9 additions & 3 deletions src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@ Parameters::Parameters():
PARAM_ORF_FILTER_S(PARAM_ORF_FILTER_S_ID, "--orf-filter-s", "ORF filter sensitivity", "Sensitivity used for query ORF prefiltering", typeid(float), (void *) &orfFilterSens, "^[0-9]*(\\.[0-9]+)?$"),
PARAM_ORF_FILTER_E(PARAM_ORF_FILTER_E_ID, "--orf-filter-e", "ORF filter e-value", "E-value threshold used for query ORF prefiltering", typeid(double), (void *) &orfFilterEval, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|[0-9]*(\\.[0-9]+)?$"),
PARAM_LCA_SEARCH(PARAM_LCA_SEARCH_ID, "--lca-search", "LCA search mode", "Efficient search for LCA candidates", typeid(bool), (void *) &lcaSearch, "", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
PARAM_TRANSLATION_MODE(PARAM_TRANSLATION_MODE_ID, "--translation-mode", "Translation mode", "Translation AA seq from nucleotide by 0: ORFs, 1: full reading frames", typeid(int), (void *) &translationMode, "^[0-1]{1}$"),
// easysearch
PARAM_GREEDY_BEST_HITS(PARAM_GREEDY_BEST_HITS_ID, "--greedy-best-hits", "Greedy best hits", "Choose the best hits greedily to cover the query", typeid(bool), (void *) &greedyBestHits, ""),
// extractorfs
Expand Down Expand Up @@ -733,7 +734,9 @@ Parameters::Parameters():
// extract frames
extractframes.push_back(&PARAM_ORF_FORWARD_FRAMES);
extractframes.push_back(&PARAM_ORF_REVERSE_FRAMES);
extractframes.push_back(&PARAM_ORF_REVERSE_FRAMES);
extractframes.push_back(&PARAM_TRANSLATION_TABLE);
extractframes.push_back(&PARAM_TRANSLATE);
extractframes.push_back(&PARAM_CREATE_LOOKUP);
extractframes.push_back(&PARAM_THREADS);
extractframes.push_back(&PARAM_COMPRESSED);
Expand Down Expand Up @@ -1268,7 +1271,7 @@ Parameters::Parameters():
searchworkflow = combineList(searchworkflow, rescorediagonal);
searchworkflow = combineList(searchworkflow, result2profile);
searchworkflow = combineList(searchworkflow, extractorfs);
searchworkflow = combineList(searchworkflow, translatenucs);
searchworkflow = combineList(searchworkflow, extractframes);
searchworkflow = combineList(searchworkflow, splitsequence);
searchworkflow = combineList(searchworkflow, offsetalignment);
// needed for slice search, however all its parameters are already present in searchworkflow
Expand All @@ -1284,6 +1287,7 @@ Parameters::Parameters():
searchworkflow.push_back(&PARAM_RUNNER);
searchworkflow.push_back(&PARAM_REUSELATEST);
searchworkflow.push_back(&PARAM_REMOVE_TMP_FILES);
searchworkflow.push_back(&PARAM_TRANSLATION_MODE);
linsearchworkflow = combineList(align, kmersearch);
linsearchworkflow = combineList(linsearchworkflow, swapresult);
Expand All @@ -1307,8 +1311,9 @@ Parameters::Parameters():
// createindex workflow
createindex = combineList(indexdb, extractorfs);
createindex = combineList(createindex, translatenucs);
createindex = combineList(createindex, extractframes);
createindex = combineList(createindex, splitsequence);
createindex.push_back(&PARAM_TRANSLATION_MODE);
createindex.push_back(&PARAM_STRAND);
createindex.push_back(&PARAM_REMOVE_TMP_FILES);
Expand Down Expand Up @@ -2318,6 +2323,7 @@ void Parameters::setDefaults() {
orfFilterSens = 2.0;
orfFilterEval = 100;
lcaSearch = false;
translationMode = PARAM_TRANSLATION_MODE_ORF;

greedyBestHits = false;

Expand Down
6 changes: 6 additions & 0 deletions src/commons/Parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,10 @@ class Parameters {
static const int PARAM_RESULT_DIRECTION_QUERY = 0;
static const int PARAM_RESULT_DIRECTION_TARGET = 1;

// translation mode
static const int PARAM_TRANSLATION_MODE_ORF = 0;
static const int PARAM_TRANSLATION_MODE_FRAME = 1;

// path to databases
std::string db1;
std::string db1Index;
Expand Down Expand Up @@ -471,6 +475,7 @@ class Parameters {
float orfFilterSens;
double orfFilterEval;
bool lcaSearch;
int translationMode;

// easysearch
bool greedyBestHits;
Expand Down Expand Up @@ -892,6 +897,7 @@ class Parameters {
PARAMETER(PARAM_ORF_FILTER_S)
PARAMETER(PARAM_ORF_FILTER_E)
PARAMETER(PARAM_LCA_SEARCH)
PARAMETER(PARAM_TRANSLATION_MODE)

// easysearch
PARAMETER(PARAM_GREEDY_BEST_HITS)
Expand Down
129 changes: 82 additions & 47 deletions src/util/extractframes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@
#include "DBWriter.h"
#include "Matcher.h"
#include "Util.h"
#include "itoa.h"

#include "TranslateNucl.h"
#include "Orf.h"

#include <unistd.h>
Expand All @@ -16,23 +15,63 @@
#include <omp.h>
#endif

void handleSingleFrame(TranslateNucl& translateNucl, DBWriter& sequenceWriter, DBWriter& headerWriter, unsigned int key, char* headerBuffer, const char* data, size_t seqLen, int frame, bool reverse, bool translate, char*& aaBuffer, size_t& aaBufferSize, int thread_idx) {
data = data + frame;
seqLen = seqLen - frame;
if (translate == true) {
if (seqLen < 3) {
return;
}
size_t codonLength = (seqLen / 3) * 3;
if ((codonLength + 1) > aaBufferSize) {
aaBufferSize = codonLength * 1.5 + 1;
aaBuffer = (char*)realloc(aaBuffer, aaBufferSize * sizeof(char));
}
translateNucl.translate(aaBuffer, data, codonLength);
aaBuffer[codonLength / 3] = '\n';
sequenceWriter.writeData(aaBuffer, (codonLength / 3) + 1, key, thread_idx);
size_t bufferLen;
if (reverse) {
bufferLen = Orf::writeOrfHeader(headerBuffer, key, frame + codonLength, static_cast<size_t>(frame), 0, 0);
} else {
bufferLen = Orf::writeOrfHeader(headerBuffer, key, static_cast<size_t>(frame), frame + codonLength, 0, 0);
}
headerWriter.writeData(headerBuffer, bufferLen, key, thread_idx);
} else {
// +1: add newline, but remove it from the end pos
sequenceWriter.writeData(data, seqLen + 1, key, thread_idx);
size_t bufferLen;
if (reverse) {
bufferLen = Orf::writeOrfHeader(headerBuffer, key, seqLen - 1, static_cast<size_t>(frame), 0, 0);
} else {
bufferLen = Orf::writeOrfHeader(headerBuffer, key, static_cast<size_t>(frame), seqLen - 1, 0, 0);
}
headerWriter.writeData(headerBuffer, bufferLen, key, thread_idx);
}
}

int extractframes(int argc, const char **argv, const Command& command) {
Parameters& par = Parameters::getInstance();
par.parseParameters(argc, argv, command, true, 0, 0);

DBReader<unsigned int> reader(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
reader.open(DBReader<unsigned int>::NOSORT);

DBWriter sequenceWriter(par.db2.c_str(), par.db2Index.c_str(), par.threads, par.compressed, reader.getDbtype());
int outputDbtype = reader.getDbtype();
if (par.translate) {
outputDbtype = Parameters::DBTYPE_AMINO_ACIDS;
}
DBWriter sequenceWriter(par.db2.c_str(), par.db2Index.c_str(), par.threads, par.compressed, outputDbtype);
sequenceWriter.open();

DBWriter headerWriter(par.hdr2.c_str(), par.hdr2Index.c_str(), par.threads, false, Parameters::DBTYPE_GENERIC_DB);
headerWriter.open();

unsigned int forwardFrames = Orf::getFrames(par.forwardFrames);
unsigned int reverseFrames = Orf::getFrames(par.reverseFrames);
Debug::Progress progress(reader.getSize());

Debug::Progress progress(reader.getSize());
TranslateNucl translateNucl(static_cast<TranslateNucl::GenCode>(par.translationTable));
#pragma omp parallel
{
int thread_idx = 0;
Expand All @@ -46,70 +85,66 @@ int extractframes(int argc, const char **argv, const Command& command) {
queryFrom = 0;
}

size_t aaBufferSize = par.maxSeqLen + 3 + 1;
char* aa = NULL;
if (par.translate == true) {
aa = (char*)malloc(aaBufferSize * sizeof(char));
}

char buffer[1024];

std::string reverseComplementStr;
reverseComplementStr.reserve(32000);

for (unsigned int i = queryFrom; i < (queryFrom + querySize); ++i){
progress.updateProgress();

unsigned int key = reader.getDbKey(i);
const char* data = reader.getData(i, thread_idx);
size_t dataLength = reader.getEntryLen(i);

size_t bufferLen;
switch (forwardFrames){
case Orf::FRAME_1:
// -1 to ignore the null byte copy the new line
sequenceWriter.writeData(data, dataLength - 1, key, thread_idx);
bufferLen = Orf::writeOrfHeader(buffer, key, static_cast<size_t >(0), dataLength - 3, 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
break;
case Orf::FRAME_2:
sequenceWriter.writeData(data + 1, dataLength - 2, key, thread_idx);
bufferLen = Orf::writeOrfHeader(buffer, key, static_cast<size_t >(1), dataLength - 4, 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
break;
case Orf::FRAME_3:
sequenceWriter.writeData(data + 2, dataLength - 3, key, thread_idx);
bufferLen = Orf::writeOrfHeader(buffer, key, static_cast<size_t >(2), dataLength - 5, 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
break;
size_t seqLen = reader.getSeqLen(i);

if (forwardFrames & Orf::FRAME_1) {
handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 0, false, par.translate, aa, aaBufferSize, thread_idx);
}
if (forwardFrames & Orf::FRAME_2) {
handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 1, false, par.translate, aa, aaBufferSize, thread_idx);
}
if (forwardFrames & Orf::FRAME_3) {
handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 2, false, par.translate, aa, aaBufferSize, thread_idx);
}

if(reverseFrames != 0){
size_t sequenceLength = dataLength -2;
if (reverseFrames != 0) {
// bool hasWrongChar = false;
for(size_t pos = 0; pos < sequenceLength; ++pos) {
char reverseComplement = Orf::complement(data[sequenceLength - pos - 1]);
for (size_t pos = 0; pos < seqLen; ++pos) {
char reverseComplement = Orf::complement(data[seqLen - pos - 1]);
reverseComplement = (reverseComplement == '.') ? 'N' : reverseComplement;
reverseComplementStr.push_back(reverseComplement);
// hasWrongChar |= (reverseComplement == '.');
}
// if(hasWrongChar == true){
// continue;
// }
// if (hasWrongChar == true) {
// continue;
// }
reverseComplementStr.push_back('\n');
seqLen = reverseComplementStr.size() - 1;
data = reverseComplementStr.c_str();
}

switch (reverseFrames){
case Orf::FRAME_1:
sequenceWriter.writeData(reverseComplementStr.c_str(), reverseComplementStr.size(), key, thread_idx);
bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 2, static_cast<size_t >(0), 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
break;
case Orf::FRAME_2:
sequenceWriter.writeData(reverseComplementStr.c_str()+1, reverseComplementStr.size()-1, key, thread_idx);
bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 3, static_cast<size_t >(1), 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
break;
case Orf::FRAME_3:
sequenceWriter.writeData(reverseComplementStr.c_str()+2, reverseComplementStr.size()-2, key, thread_idx);
bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 4, static_cast<size_t >(2), 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
break;
if (reverseFrames & Orf::FRAME_1) {
handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 0, true, par.translate, aa, aaBufferSize, thread_idx);
}

if (reverseFrames & Orf::FRAME_2) {
handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 1, true, par.translate, aa, aaBufferSize, thread_idx);
}

if (reverseFrames & Orf::FRAME_3) {
handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 2, true, par.translate, aa, aaBufferSize, thread_idx);
}
reverseComplementStr.clear();
}
if (aa != NULL) {
free(aa);
}
}
headerWriter.close(true);
sequenceWriter.close(true);
Expand Down
8 changes: 6 additions & 2 deletions src/workflow/CreateIndex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,12 @@ int createindex(Parameters &par, const Command &command, const std::string &inde
cmd.addVariable("INDEXER", indexerModule.c_str());
cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL);
par.translate = 1;
cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str());
cmd.addVariable("EXTRACT_FRAMES_PAR", par.createParameterString(par.extractframes).c_str());
cmd.addVariable("ORF_SKIP", par.translationMode == Parameters::PARAM_TRANSLATION_MODE_FRAME ? "TRUE" : NULL);
if (par.translationMode == Parameters::PARAM_TRANSLATION_MODE_FRAME) {
cmd.addVariable("EXTRACT_FRAMES_PAR", par.createParameterString(par.extractframes).c_str());
} else {
cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str());
}
cmd.addVariable("SPLIT_SEQ_PAR", par.createParameterString(par.splitsequence).c_str());
if(indexerModule == "kmerindexdb"){
cmd.addVariable("INDEX_PAR", par.createParameterString(par.kmerindexdb).c_str());
Expand Down
15 changes: 10 additions & 5 deletions src/workflow/Search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -208,8 +208,8 @@ int search(int argc, const char **argv, const Command& command) {
for (size_t i = 0; i < par.extractorfs.size(); i++) {
par.extractorfs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
}
for (size_t i = 0; i < par.translatenucs.size(); i++) {
par.translatenucs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
for (size_t i = 0; i < par.extractframes.size(); i++) {
par.extractframes[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
}
for (size_t i = 0; i < par.splitsequence.size(); i++) {
par.splitsequence[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
Expand Down Expand Up @@ -564,17 +564,22 @@ int search(int argc, const char **argv, const Command& command) {

if (searchMode & (Parameters::SEARCH_MODE_FLAG_QUERY_TRANSLATED|Parameters::SEARCH_MODE_FLAG_TARGET_TRANSLATED)) {
cmd.addVariable("NO_TARGET_INDEX", (indexStr == "") ? "TRUE" : NULL);
FileUtil::writeFile(tmpDir + "/translated_search.sh", translated_search_sh, translated_search_sh_len);
cmd.addVariable("QUERY_NUCL", (searchMode & Parameters::SEARCH_MODE_FLAG_QUERY_TRANSLATED) ? "TRUE" : NULL);
cmd.addVariable("TARGET_NUCL", (searchMode & Parameters::SEARCH_MODE_FLAG_TARGET_TRANSLATED) ? "TRUE" : NULL);
cmd.addVariable("TARGET_NUCL", (searchMode & Parameters::SEARCH_MODE_FLAG_TARGET_TRANSLATED) ? "TRUE" : NULL);
cmd.addVariable("THREAD_COMP_PAR", par.createParameterString(par.threadsandcompression).c_str());
par.subDbMode = 1;
cmd.addVariable("CREATESUBDB_PAR", par.createParameterString(par.createsubdb).c_str());
par.translate = 1;
cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str());
cmd.addVariable("OFFSETALIGNMENT_PAR", par.createParameterString(par.offsetalignment).c_str());
cmd.addVariable("ORF_SKIP", par.translationMode == Parameters::PARAM_TRANSLATION_MODE_FRAME ? "TRUE" : NULL);
if (par.translationMode == Parameters::PARAM_TRANSLATION_MODE_FRAME) {
cmd.addVariable("EXTRACT_FRAMES_PAR", par.createParameterString(par.extractframes).c_str());
} else {
cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str());
}
cmd.addVariable("SEARCH", program.c_str());
program = std::string(tmpDir + "/translated_search.sh");
FileUtil::writeFile(program.c_str(), translated_search_sh, translated_search_sh_len);
}else if(searchMode & Parameters::SEARCH_MODE_FLAG_QUERY_NUCLEOTIDE &&
searchMode & Parameters::SEARCH_MODE_FLAG_TARGET_NUCLEOTIDE){
if (par.gpu != 0) {
Expand Down

0 comments on commit 0310e6b

Please sign in to comment.