Skip to content

Commit

Permalink
Rebuild foldseek createclusearch workflow in clusterdb
Browse files Browse the repository at this point in the history
  • Loading branch information
RuoshiZhang committed Feb 7, 2025
1 parent 886bc67 commit 56a7f5f
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 2 deletions.
68 changes: 66 additions & 2 deletions data/clusterdb.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@
[ -z "$MMSEQS" ] && echo "Please set the environment variable \$MMSEQS to your MMSEQS binary." && exit 1;
[ "$#" -ne 2 ] && echo "Please provide <inputDB> <tmpDir>" && exit 1

# Report a fatal error and abort the script with exit status 1.
#   $1 - human-readable description of what failed
fail() {
    # Diagnostics go to stderr so they are not mixed into regular output.
    echo "Error: $1" >&2
    exit 1
}

# Return success (0) when "$1" is not an existing regular file, failure (1) otherwise.
#   $1 - path to probe
notExists() {
    if [ -f "$1" ]; then
        return 1
    fi
    return 0
}
Expand All @@ -24,10 +29,69 @@ if [ -n "${USE_FOLDSEEK}" ]; then
|| fail "foldseek cluster failed"
fi

# Copy the input database to ${IN}_clu_seq unless its .dbtype file is already present.
if notExists "${IN}_clu_seq.dbtype"; then
    # VERBOSITY is intentionally unquoted so it expands to separate arguments.
    # shellcheck disable=SC2086
    "${FOLDSEEK}" cpdb "${IN}" "${IN}_clu_seq" ${VERBOSITY} \
        || fail "foldseek cpdb failed"
fi

# Copy the ${IN}_h database to ${IN}_clu_seq_h unless its .dbtype file is already present.
if notExists "${IN}_clu_seq_h.dbtype"; then
    # VERBOSITY is intentionally unquoted so it expands to separate arguments.
    # shellcheck disable=SC2086
    "${FOLDSEEK}" cpdb "${IN}_h" "${IN}_clu_seq_h" ${VERBOSITY} \
        || fail "foldseek cpdb failed"
fi

# Copy the ${IN}_ss database to ${IN}_clu_seq_ss unless its .dbtype file is already present.
if notExists "${IN}_clu_seq_ss.dbtype"; then
    # VERBOSITY is intentionally unquoted so it expands to separate arguments.
    # shellcheck disable=SC2086
    "${FOLDSEEK}" cpdb "${IN}_ss" "${IN}_clu_seq_ss" ${VERBOSITY} \
        || fail "foldseek cpdb failed"
fi

# The ${IN}_ca database is optional: copy it to ${IN}_clu_seq_ca only when the
# source exists and the copy has not been made yet.
if [ -f "${IN}_ca.dbtype" ] && notExists "${IN}_clu_seq_ca.dbtype"; then
    # VERBOSITY is intentionally unquoted so it expands to separate arguments.
    # shellcheck disable=SC2086
    "${FOLDSEEK}" cpdb "${IN}_ca" "${IN}_clu_seq_ca" ${VERBOSITY} \
        || fail "foldseek cpdb failed"
fi

# Copy the clustering result from the tmp directory to ${IN}_clu_clu unless it
# is already present.
if notExists "${IN}_clu_clu.dbtype"; then
    # The tmp path is quoted so directories containing whitespace are not
    # word-split; only VERBOSITY is intentionally left unquoted.
    # shellcheck disable=SC2086
    "${FOLDSEEK}" cpdb "${TMP_PATH}/cluster_foldseek" "${IN}_clu_clu" ${VERBOSITY} \
        || fail "foldseek cpdb failed"
fi

# When a ${IN}_clu_seq_ca database exists, build ${IN}_clu_ca from it with
# createsubdb, using the clustering result as the selection, unless it is
# already present.
if [ -f "${IN}_clu_seq_ca.dbtype" ] && notExists "${IN}_clu_ca.dbtype"; then
    # The tmp path is quoted so directories containing whitespace are not
    # word-split; only VERBOSITY is intentionally left unquoted.
    # shellcheck disable=SC2086
    "${FOLDSEEK}" createsubdb "${TMP_PATH}/cluster_foldseek" "${IN}_clu_seq_ca" "${IN}_clu_ca" ${VERBOSITY} \
        || fail "foldseek createsubdb failed"
fi

# Run structurealign of ${IN}_clu_seq against itself over the clustering result
# and write the alignments to ${IN}_clu_aln, unless they already exist.
if notExists "${IN}_clu_aln.dbtype"; then
    # MERGECLU_PAR is intentionally unquoted so it expands to separate arguments.
    # shellcheck disable=SC2086
    "${FOLDSEEK}" structurealign "${IN}_clu_seq" "${IN}_clu_seq" "${TMP_PATH}/cluster_foldseek" "${IN}_clu_aln" -a -e 0.1 --sort-by-structure-bits 0 ${MERGECLU_PAR} \
        || fail "foldseek structurealign failed"
fi

# Build ${IN}_clu_profile from ${IN}_clu_seq and the ${IN}_clu_aln alignments,
# unless the profile database already exists.
if notExists "${IN}_clu_profile.dbtype"; then
    # PROFILE_PAR is intentionally unquoted so it expands to separate arguments.
    # shellcheck disable=SC2086
    "${MMSEQS}" result2profile "${IN}_clu_seq" "${IN}_clu_seq" "${IN}_clu_aln" "${IN}_clu_profile" ${PROFILE_PAR} \
        || fail "mmseqs result2profile failed"
fi

# Create the ${IN}_clu search database and then derive consensus sequences from
# ${IN}_clu_profile, unless ${IN}_clu already exists.
# NOTE(review): both commands name "${IN}_clu" as their output; presumably
# profile2consensus is meant to replace the entries written by
# createclusearchdb - confirm against the tool documentation.
if notExists "${IN}_clu.dbtype"; then
    # THREADS_PAR is intentionally unquoted so it expands to separate arguments.
    # shellcheck disable=SC2086
    "${FOLDSEEK}" createclusearchdb "${IN}" "${TMP_PATH}/cluster_foldseek" "${IN}_clu" ${THREADS_PAR} \
        || fail "foldseek createclusearchdb failed"
    # MERGECLU_PAR is intentionally unquoted so it expands to separate arguments.
    # shellcheck disable=SC2086
    "${FOLDSEEK}" profile2consensus "${IN}_clu_profile" "${IN}_clu" ${MERGECLU_PAR} \
        || fail "foldseek profile2consensus failed"
    # Optional cleanup of the intermediate profile database (best effort).
    if [ -n "$REMOVE_TMP" ]; then
        # shellcheck disable=SC2086
        "${FOLDSEEK}" rmdb "${IN}_clu_profile" ${VERBOSITY}
    fi
fi

# Build ${IN}_clu_profile_ss from ${IN}_clu_seq_ss and the ${IN}_clu_aln
# alignments, unless the profile database already exists.
if notExists "${IN}_clu_profile_ss.dbtype"; then
    # PROFILE_SS_PAR is intentionally unquoted so it expands to separate arguments.
    # shellcheck disable=SC2086
    "${FOLDSEEK}" result2profile "${IN}_clu_seq_ss" "${IN}_clu_seq_ss" "${IN}_clu_aln" "${IN}_clu_profile_ss" ${PROFILE_SS_PAR} \
        || fail "foldseek result2profile failed"
fi

# Derive ${IN}_clu_ss from ${IN}_clu_profile_ss with profile2consensus, unless
# it already exists, then optionally remove the intermediate databases.
if notExists "${IN}_clu_ss.dbtype"; then
    # MERGECLU_PAR is intentionally unquoted so it expands to separate arguments.
    # shellcheck disable=SC2086
    "${FOLDSEEK}" profile2consensus "${IN}_clu_profile_ss" "${IN}_clu_ss" ${MERGECLU_PAR} \
        || fail "foldseek profile2consensus failed"
    # Optional cleanup (best effort): the profile and alignment databases are
    # not used again after this point in the visible part of the script.
    if [ -n "$REMOVE_TMP" ]; then
        # shellcheck disable=SC2086
        "${FOLDSEEK}" rmdb "${IN}_clu_profile_ss" ${VERBOSITY}
        # shellcheck disable=SC2086
        "${FOLDSEEK}" rmdb "${IN}_clu_aln" ${VERBOSITY}
    fi
fi
else
if notExists "${TMP_PATH}/cluster_mmseqs.dbtype"; then
Expand Down
18 changes: 18 additions & 0 deletions src/workflow/clusterdb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,27 @@ int clusterdb(int argc, const char **argv, const Command &command) {
// Export the variables consumed by clusterdb.sh. USE_FOLDSEEK is passed as
// NULL when useFoldseek is false; the script tests it with [ -n ... ].
cmd.addVariable("USE_FOLDSEEK", useFoldseek ? "TRUE" : NULL);
cmd.addVariable("CLUSTER_PAR", par.createParameterString(par.clusterworkflow).c_str());
cmd.addVariable("CONSENSUS_PAR", par.createParameterString(par.profile2seq).c_str());
// PROFILE_PAR is serialized here, i.e. BEFORE the parameter overrides below,
// so it reflects the values par held up to this point.
cmd.addVariable("PROFILE_PAR", par.createParameterString(par.result2profile).c_str());
cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str());
cmd.addVariable("VERBOSITY", par.createParameterString(par.onlyverbosity).c_str());
cmd.addVariable("MERGECLU_PAR", par.createParameterString(par.threadsandcompression).c_str());
// Override profile-related settings before building PROFILE_SS_PAR.
// NOTE(review): presumably these are the values intended for the 3Di
// (structure-state) profile, given the "3di.out" matrix - confirm.
// NOTE(review): the overrides are applied regardless of useFoldseek and stay
// in par for any later code in this function - confirm that is intended.
par.pca = 1.4;
par.pcb = 1.5;
par.scoringMatrixFile = "3di.out";
par.seedScoringMatrixFile = "3di.out";
par.maskProfile = 0;
par.compBiasCorrection = 0;
// Only replace the e-value thresholds when PARAM_E_PROFILE.wasSet is false
// (presumably: the user did not supply it explicitly).
if(par.PARAM_E_PROFILE.wasSet == false){
par.evalProfile = 0.1;
par.evalThr = 0.1;
}
// Copy the result2profile parameter list, leaving out the entry whose uniqid
// matches PARAM_GAP_PSEUDOCOUNT, so that parameter is not serialized into
// PROFILE_SS_PAR.
std::vector<MMseqsParameter*> result2profile_ss;
for (size_t i = 0; i < par.result2profile.size(); i++) {
if (par.result2profile[i]->uniqid != par.PARAM_GAP_PSEUDOCOUNT.uniqid) {
result2profile_ss.push_back(par.result2profile[i]);
}
}
// Serialized AFTER the overrides above; clusterdb.sh passes this to the
// result2profile call that builds the *_clu_profile_ss database.
cmd.addVariable("PROFILE_SS_PAR", par.createParameterString(result2profile_ss).c_str());

FileUtil::writeFile(tmpDir + "/clusterdb.sh", clusterdb_sh, clusterdb_sh_len);
std::string program(tmpDir + "/clusterdb.sh");
Expand Down

0 comments on commit 56a7f5f

Please sign in to comment.