-
Notifications
You must be signed in to change notification settings - Fork 220
alignproteome added #875
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
alignproteome added #875
Changes from 1 commit
1cb2074
12c0c81
df80f07
1073ec9
3617ea7
3fc7fcd
0487a1f
3f4fc91
eeabad3
9996821
27d9f68
e2ef229
fd1f5d6
0d7db0f
1fe489e
86f6b15
8344e5b
be5a53d
8d3a38f
bfa1911
7e912d2
618e2bf
e990773
e9ffe22
d14217f
5e6ee6d
743370d
44a6fca
910bb7f
e24b9e3
ad1acdf
e026260
d9213ef
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
#!/bin/sh -e | ||
# Abort the workflow: print "Error: <message>" on stdout and exit with status 1.
#   $1 - human-readable description of what went wrong
fail() {
    echo "Error: ${1}"
    exit 1
}
|
||
# Succeed (status 0) when $1 is NOT an existing regular file.
# Used to skip workflow steps whose output is already present.
notExists() {
    test ! -f "$1"
}
|
||
|
||
# Step 1: build one sequence DB from all input FASTA files passed as "$@".
# Each step is skipped when its output already exists, so an interrupted run
# can be resumed from the same TMP_PATH.
if notExists "${TMP_PATH}/input.dbtype"; then
    # shellcheck disable=SC2086
    "$MMSEQS" createdb "$@" "${TMP_PATH}/input" ${CREATEDB_PAR} \
        || fail "createdb died"
fi

# Step 2: cluster the proteins with linclust.
if notExists "${TMP_PATH}/clu.dbtype"; then
    # shellcheck disable=SC2086
    "$MMSEQS" linclust "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/clu_tmp" ${CLUSTER_PAR} \
        || fail "linclust died"
fi

# Step 3: proteome alignment. alignproteome writes aln_protein and
# aln_proteome (there is no plain "aln" DB), so the resume check looks at the
# proteome DB that the createtsv step below consumes.
if notExists "${TMP_PATH}/aln_proteome.dbtype"; then
    # shellcheck disable=SC2086
    "$MMSEQS" alignproteome "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/aln_protein" "${TMP_PATH}/aln_proteome" ${ALIGNPROTEOME_PAR} \
        || fail "alignproteome died"
fi

# Step 4: export the protein-level clustering as TSV.
if notExists "${RESULTS}_protein_cluster.tsv"; then
    # shellcheck disable=SC2086
    "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/clu" "${RESULTS}_protein_cluster.tsv" ${THREADS_PAR} \
        || fail "createtsv protein died"
fi

# Step 5: export the proteome-level result as TSV.
if notExists "${RESULTS}_proteome_cluster.tsv"; then
    # shellcheck disable=SC2086
    "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/aln_proteome" "${RESULTS}_proteome_cluster.tsv" ${THREADS_PAR} \
        || fail "createtsv proteome died"
fi

# Optional cleanup of every intermediate database created above.
if [ -n "${REMOVE_TMP}" ]; then
    # shellcheck disable=SC2086
    "$MMSEQS" rmdb "${TMP_PATH}/input" ${VERBOSITY_PAR}
    # shellcheck disable=SC2086
    "$MMSEQS" rmdb "${TMP_PATH}/input_h" ${VERBOSITY_PAR}
    # shellcheck disable=SC2086
    "$MMSEQS" rmdb "${TMP_PATH}/clu" ${VERBOSITY_PAR}
    # shellcheck disable=SC2086
    "$MMSEQS" rmdb "${TMP_PATH}/aln_protein" ${VERBOSITY_PAR}
    # shellcheck disable=SC2086
    "$MMSEQS" rmdb "${TMP_PATH}/aln_proteome" ${VERBOSITY_PAR}
    rm -rf "${TMP_PATH}/clu_tmp"
    # The driver writes this script as easyalignproteome.sh into TMP_PATH.
    rm -f "${TMP_PATH}/easyalignproteome.sh"
fi
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -79,6 +79,12 @@ std::vector<Command> baseCommands = { | |
CITATION_MMSEQS2|CITATION_LINCLUST, {{"fastaFile[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &DbValidator::flatfileAndStdin }, | ||
{"clusterPrefix", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile }, | ||
{"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}}, | ||
{"easy-alignproteome", easyalignproteome, &par.easyalignproteome, COMMAND_EASY, | ||
"Proteome clustering and alignemnt", | ||
"<i:fastaFile1[.gz|.bz2]> ... <i:fastaFileN[.gz|.bz2]>" | ||
// CITATION_MMSEQS2, | ||
|
||
}, | ||
{"easy-taxonomy", easytaxonomy, &par.easytaxonomy, COMMAND_EASY, | ||
"Taxonomic classification", | ||
"# Assign taxonomic labels to FASTA sequences\n" | ||
|
@@ -633,9 +639,9 @@ std::vector<Command> baseCommands = { | |
{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb }, | ||
{"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}}, | ||
{"alignproteome", alignproteome, &par.alignproteome, COMMAND_ALIGNMENT, | ||
"Within-result all-vs-all gapped local alignment", | ||
"Proteome clustering and alignment", | ||
NULL, | ||
"Martin Steinegger <[email protected]>", | ||
"Martin Steinegger <[email protected]> & Gyuri Kim <[email protected]>", | ||
"<i:sequenceDB> <i:resultDB> <o:alignmentDB>", | ||
CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, | ||
{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb }, | ||
|
This file was deleted.
|
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -187,7 +187,7 @@ class DBReader : public MemoryTracker { | |
|
||
size_t getSize() const; | ||
|
||
unsigned int getProteomeTotalLen(size_t id); //gyuri | ||
unsigned int getProteomeTotalLen(size_t id); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I would name this |
||
|
||
unsigned int getMaxSeqLen(){ | ||
return (Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_HMM_PROFILE ) ) ? | ||
|
@@ -446,8 +446,6 @@ class DBReader : public MemoryTracker { | |
|
||
size_t findNextOffsetid(size_t id); | ||
|
||
size_t getIndexLen(size_t id); | ||
|
||
int isCompressed(){ | ||
return isCompressed(dbtype); | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,7 +27,6 @@ int alignall(int argc, const char **argv, const Command &command) { | |
} | ||
unsigned int swMode = Alignment::initSWMode(par.alignmentMode, par.covThr, par.seqIdThr); | ||
|
||
// DBReader<unsigned int> tdbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX); | ||
DBReader<unsigned int> tdbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_LOOKUP); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I assume this is not needed anymore. |
||
|
||
tdbr.open(DBReader<unsigned int>::NOSORT); | ||
|
@@ -58,7 +57,6 @@ int alignall(int argc, const char **argv, const Command &command) { | |
EvalueComputation evaluer(tdbr.getAminoAcidDBSize(), subMat, gapOpen, gapExtend); | ||
const size_t flushSize = 100000000; | ||
size_t iterations = static_cast<int>(ceil(static_cast<double>(dbr_res.getSize()) / static_cast<double>(flushSize))); | ||
Debug(Debug::INFO) << "Number of iterations: " << iterations << " Size of linclust dbr_res : " << dbr_res.getSize() << '\n'; | ||
|
||
for (size_t i = 0; i < iterations; i++) { | ||
size_t start = (i * flushSize); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
#include <cassert> | ||
|
||
#include "FileUtil.h" | ||
#include "CommandCaller.h" | ||
#include "Util.h" | ||
#include "Debug.h" | ||
#include "Parameters.h" | ||
|
||
#include "easyalignproteome.sh.h" | ||
|
||
// Overrides parameter defaults for the easy-alignproteome workflow:
// sets the proteome similarity threshold (proteomeSimThr) to 0.9.
void setEasyAlignproteomeDefaults(Parameters *p) {
    Parameters &workflowDefaults = *p;
    workflowDefaults.proteomeSimThr = 0.9;
}
|
||
// void setEasyAlignproteomeMustPassAlong(Parameters *p){ | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You need to implement this whenever you use the |
||
// } | ||
|
||
int easyalignproteome(int argc, const char **argv, const Command &command) { | ||
Parameters &par = Parameters::getInstance(); | ||
setEasyAlignproteomeDefaults(&par); | ||
par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); | ||
// setEasyAlignproteomeMustPassAlong(&par); | ||
std::string tmpDir = par.filenames.back(); | ||
std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, *command.params)); | ||
if (par.reuseLatest) { | ||
hash = FileUtil::getHashFromSymLink(tmpDir + "/latest"); | ||
} | ||
tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash); | ||
par.filenames.pop_back(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You need to pop_back() the output path too and pass it as an environment variable. Or else it will also be passed to |
||
|
||
CommandCaller cmd; | ||
cmd.addVariable("ALIGNPROTEOME_PAR", par.createParameterString(par.alignproteome,true).c_str()); // what? | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. CLUSTER_PAR, THREADS_PAR etc are missing |
||
std::string program = tmpDir + "/easyalignproteome.sh"; | ||
FileUtil::writeFile(program, easyalignproteome_sh, easyalignproteome_sh_len); | ||
cmd.execProgram(program.c_str(), par.filenames); | ||
|
||
// Should never get here | ||
assert(false); | ||
return 0; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This file looks incomplete