Skip to content

Commit

Permalink
Fix #272: remove deleted sequences from the old clustering during cluster update
Browse files Browse the repository at this point in the history
  • Loading branch information
milot-mirdita committed Aug 30, 2020
1 parent 66f77ce commit b5a0883
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 26 deletions.
69 changes: 44 additions & 25 deletions data/workflow/update_clustering.sh
Original file line number Diff line number Diff line change
Expand Up @@ -64,33 +64,47 @@ WARN
exit 1
fi

if [ -n "${RECOVER_DELETED}" ] && [ -s "${TMP_PATH}/removedSeqs" ]; then
log "=== Recover removed sequences"
if notExists "${TMP_PATH}/OLDDB.removedMapping"; then
HIGHESTID="$(awk '$1 > max { max = $1 } END { print max }' "${NEWDB}.index")"
awk -v highest="$HIGHESTID" 'BEGIN { start=highest+1 } { print $1"\t"start; start=start+1; }' \
"${TMP_PATH}/removedSeqs" > "${TMP_PATH}/OLDDB.removedMapping"
cat "${TMP_PATH}/OLDDB.removedMapping" >> "${TMP_PATH}/mappingSeqs"
fi
if [ -s "${TMP_PATH}/removedSeqs" ]; then
if [ -n "${RECOVER_DELETED}" ]; then
log "=== Recover removed sequences"
if notExists "${TMP_PATH}/OLDDB.removedMapping"; then
HIGHESTID="$(awk '$1 > max { max = $1 } END { print max }' "${NEWDB}.index")"
awk -v highest="$HIGHESTID" 'BEGIN { start=highest+1 } { print $1"\t"start; start=start+1; }' \
"${TMP_PATH}/removedSeqs" > "${TMP_PATH}/OLDDB.removedMapping"
cat "${TMP_PATH}/OLDDB.removedMapping" >> "${TMP_PATH}/mappingSeqs"
fi

if notExists "${TMP_PATH}/NEWDB.withOld.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" renamedbkeys "${TMP_PATH}/OLDDB.removedMapping" "${OLDDB}" "${TMP_PATH}/OLDDB.removedDb" --subdb-mode 1 ${VERBOSITY} \
|| fail "renamedbkeys died"
# shellcheck disable=SC2086
"$MMSEQS" concatdbs "$NEWDB" "${TMP_PATH}/OLDDB.removedDb" "${TMP_PATH}/NEWDB.withOld" --preserve-keys --threads 1 ${VERBOSITY} \
|| fail "concatdbs died"
# shellcheck disable=SC2086
"$MMSEQS" concatdbs "${NEWDB}_h" "${TMP_PATH}/OLDDB.removedDb_h" "${TMP_PATH}/NEWDB.withOld_h" --preserve-keys --threads 1 ${VERBOSITY} \
|| fail "concatdbs died"
fi
NEWDB="${TMP_PATH}/NEWDB.withOld"
if notExists "${TMP_PATH}/NEWDB.withOld.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" renamedbkeys "${TMP_PATH}/OLDDB.removedMapping" "${OLDDB}" "${TMP_PATH}/OLDDB.removedDb" --subdb-mode 1 ${VERBOSITY} \
|| fail "renamedbkeys died"
# shellcheck disable=SC2086
"$MMSEQS" concatdbs "$NEWDB" "${TMP_PATH}/OLDDB.removedDb" "${TMP_PATH}/NEWDB.withOld" --preserve-keys --threads 1 ${VERBOSITY} \
|| fail "concatdbs died"
# shellcheck disable=SC2086
"$MMSEQS" concatdbs "${NEWDB}_h" "${TMP_PATH}/OLDDB.removedDb_h" "${TMP_PATH}/NEWDB.withOld_h" --preserve-keys --threads 1 ${VERBOSITY} \
|| fail "concatdbs died"
fi
NEWDB="${TMP_PATH}/NEWDB.withOld"

if [ -n "$REMOVE_TMP" ]; then
echo "Remove temporary files 1/3"
rm -f "${TMP_PATH}/OLDDB.removedMapping"
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/OLDDB.removedDb" ${VERBOSITY}
if [ -n "$REMOVE_TMP" ]; then
echo "Remove temporary files 1/3"
rm -f "${TMP_PATH}/OLDDB.removedMapping"
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/OLDDB.removedDb" ${VERBOSITY}
fi
else
if notExists "${TMP_PATH}/OLCLUST.withoutDeletedKeys.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" createsubdb "${TMP_PATH}/mappingSeqs" "${OLDCLUST}" "${TMP_PATH}/OLCLUST.withoutDeletedKeys" --subdb-mode 1 ${VERBOSITY} \
|| fail "createsubdb died"
fi
if notExists "${TMP_PATH}/OLCLUST.withoutDeleted.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" filterdb "${TMP_PATH}/OLCLUST.withoutDeletedKeys" "${TMP_PATH}/OLCLUST.withoutDeleted" --filter-file "${TMP_PATH}/removedSeqs" --positive-filter ${THREADS_PAR} \
|| fail "filterdb died"
fi
OLDCLUST="${TMP_PATH}/OLCLUST.withoutDeleted"
fi
fi

Expand Down Expand Up @@ -194,6 +208,11 @@ if [ -n "$REMOVE_TMP" ]; then
"$MMSEQS" rmdb "${TMP_PATH}/NEWDB.withOld" ${VERBOSITY}
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/NEWDB.withOld_h" ${VERBOSITY}
else
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/OLCLUST.withoutDeletedKeys" ${VERBOSITY}
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/OLCLUST.withoutDeleted" ${VERBOSITY}
fi

# shellcheck disable=SC2086
Expand Down
2 changes: 1 addition & 1 deletion src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ Parameters::Parameters():
PARAM_EXTRACT_MODE(PARAM_EXTRACT_MODE_ID, "--extract-mode", "Extract mode", "Extract from 1: Query, 2: Target", typeid(int), (void *) &extractMode, "^[1-2]{1}$"),
// convertkb
PARAM_KB_COLUMNS(PARAM_KB_COLUMNS_ID, "--kb-columns", "UniprotKB columns", "list of indices of UniprotKB columns to be extracted", typeid(std::string), (void *) &kbColumns, ""),
PARAM_RECOVER_DELETED(PARAM_RECOVER_DELETED_ID, "--recover-deleted", "Recover deleted", "Indicates if sequences are allowed to be be removed during updating", typeid(bool), (void *) &recoverDeleted, ""),
PARAM_RECOVER_DELETED(PARAM_RECOVER_DELETED_ID, "--recover-deleted", "Recover deleted", "Find and recover deleted sequences during updating of clustering", typeid(bool), (void *) &recoverDeleted, ""),
// filtertaxdb
PARAM_TAXON_LIST(PARAM_TAXON_LIST_ID, "--taxon-list", "Selected taxa", "Taxonomy ID, possibly multiple values separated by ','", typeid(std::string), (void *) &taxonList, ""),
// view
Expand Down

0 comments on commit b5a0883

Please sign in to comment.