Skip to content

Commit

Permalink
Squashed 'lib/mmseqs/' changes from 7281baf9..b0b8e85f
Browse files — browse the repository at this point in the history
b0b8e85f Fix truncated profile sequences in convertalis #567
96b20099 Fix broken badges in README (and remove travis)
407b315e Fix multi-threading issues in pairaln
92deb92f Fix unpackdb parameter
be8c278c Progress update fix
58593ec0 Merge branch 'master' of https://github.com/soedinglab/mmseqs2
3f8695ea Add multi-thread support to pairaln
e9e829c7 Fix seg. fault in realign
ce7bf53b Point Kalamari3.7v to a fixed commit soedinglab/MMseqs2#531
fcf52600 Remove a level of indirection to access compatible index version
922e2691 Fix failing utility tests
74c3aa65 Fix typo (violoations -> violations) (#526)

git-subtree-dir: lib/mmseqs
git-subtree-split: b0b8e85f3b8437c10a666e3ea35c78c0ad0d7ec2
  • Loading branch information
martin-steinegger committed Jun 14, 2022
1 parent b72a421 commit 785a05e
Show file tree
Hide file tree
Showing 21 changed files with 118 additions and 91 deletions.
8 changes: 1 addition & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,7 @@ MMseqs2 (Many-against-Many sequence searching) is a software suite to search and

[Mirdita M, Steinegger M, Breitwieser F, Soding J, Levy Karin E: Fast and sensitive taxonomic assignment to metagenomic contigs. Bioinformatics, doi: 10.1093/bioinformatics/btab184 (2021)](https://doi.org/10.1093/bioinformatics/btab184).

[![BioConda Install](https://img.shields.io/conda/dn/bioconda/mmseqs2.svg?style=flag&label=BioConda%20install)](https://anaconda.org/bioconda/mmseqs2)
[![Github All Releases](https://img.shields.io/github/downloads/soedinglab/mmseqs2/total.svg)](https://github.com/soedinglab/mmseqs2/releases/latest)
[![Biocontainer Pulls](https://img.shields.io/endpoint?url=https%3A%2F%2Fmmseqs.com%2Fbiocontainer.php%3Fcontainer%3Dmmseqs2)](https://biocontainers.pro/#/tools/mmseqs2)
[![Docker Pulls](https://img.shields.io/docker/pulls/soedinglab/mmseqs2.svg)](https://hub.docker.com/r/soedinglab/mmseqs2)
[![Build Status](https://dev.azure.com/themartinsteinegger/mmseqs2/_apis/build/status/soedinglab.MMseqs2?branchName=master)](https://dev.azure.com/themartinsteinegger/mmseqs2/_build/latest?definitionId=2&branchName=master)
[![Travis CI](https://travis-ci.org/soedinglab/MMseqs2.svg?branch=master)](https://travis-ci.org/soedinglab/MMseqs2)
<a href="https://chat.mmseqs.com/"><img src="https://chat.mmseqs.com/api/v1/shield.svg?type=online&name=chat&icon=false" /></a>
[![BioConda Install](https://img.shields.io/conda/dn/bioconda/mmseqs2.svg?style=flag&label=BioConda%20install)](https://anaconda.org/bioconda/mmseqs2) [![Github All Releases](https://img.shields.io/github/downloads/soedinglab/mmseqs2/total.svg)](https://github.com/soedinglab/mmseqs2/releases/latest) [![Biocontainer Pulls](https://img.shields.io/endpoint?url=https%3A%2F%2Fmmseqs.com%2Fbiocontainer.php%3Fcontainer%3Dmmseqs2)](https://biocontainers.pro/#/tools/mmseqs2) [![Docker Pulls](https://img.shields.io/docker/pulls/soedinglab/mmseqs2.svg)](https://hub.docker.com/r/soedinglab/mmseqs2) [![Build Status](https://dev.azure.com/themartinsteinegger/mmseqs2/_apis/build/status/soedinglab.MMseqs2?branchName=master)](https://dev.azure.com/themartinsteinegger/mmseqs2/_build/latest?definitionId=2&branchName=master) <a href="https://chat.mmseqs.com/"><img src="https://chat.mmseqs.com/api/v1/shield.svg?type=online&name=chat&icon=false" /></a>

<p align="center"><img src="https://raw.githubusercontent.com/soedinglab/mmseqs2/master/.github/mmseqs2_logo.png" height="256" /></p>

Expand Down
2 changes: 1 addition & 1 deletion data/workflow/databases.sh
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ case "${SELECTION}" in
"Kalamari")
if notExists "${TMP_PATH}/kalamari.tsv"; then
printf "3.7 %s\n" "$(date "+%s")" > "${TMP_PATH}/version"
downloadFile "https://raw.githubusercontent.com/lskatz/Kalamari/master/src/Kalamari_v3.7.tsv" "${TMP_PATH}/kalamari.tsv"
downloadFile "https://raw.githubusercontent.com/lskatz/Kalamari/18d71da740546ba4a5117682e1ae2a037379afe0/src/Kalamari_v3.7.tsv" "${TMP_PATH}/kalamari.tsv"
fi
ACCESSIONS=""
# shellcheck disable=SC2034
Expand Down
6 changes: 3 additions & 3 deletions src/MMseqsBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ std::vector<Command> baseCommands = {
"# -c 0.7 - + -\n"
"# -c 0.6 + + +\n\n"
"# Cascaded clustering with reassignment\n"
"# - Corrects criteria-violoations of cascaded merging\n"
"# - Corrects criteria-violations of cascaded merging\n"
"# - Produces more clusters and is a bit slower\n"
"mmseqs easy-cluster examples/DB.fasta result tmp --cluster-reassign\n",
"Martin Steinegger <[email protected]>",
Expand Down Expand Up @@ -264,7 +264,7 @@ std::vector<Command> baseCommands = {
"# Cutoff -c 0.7 - + -\n"
"# -c 0.6 + + +\n\n"
"# Cascaded clustering with reassignment\n"
"# - Corrects criteria-violoations of cascaded merging\n"
"# - Corrects criteria-violations of cascaded merging\n"
"# - Produces more clusters and is a bit slower\n"
"mmseqs cluster sequenceDB clusterDB tmp --cluster-reassign\n",
"Martin Steinegger <[email protected]> & Lars von den Driesch",
Expand Down Expand Up @@ -722,7 +722,7 @@ std::vector<Command> baseCommands = {
"<i:srcDB> <o:dstDB>",
CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL },
{"DB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
{"unpackdb", unpackdb, &par.onlyverbosity, COMMAND_STORAGE,
{"unpackdb", unpackdb, &par.unpackdbs, COMMAND_STORAGE,
"Unpack a DB into separate files",
NULL,
"Milot Mirdita <[email protected]>",
Expand Down
2 changes: 1 addition & 1 deletion src/alignment/Alignment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -405,7 +405,7 @@ void Alignment::run(const std::string &outDB, const std::string &outDBIndex, con
}

std::vector<Matcher::result_t> *returnRes = &swResults;
if (realign == true) {
if (realign == true && *origData != '\0') {
realigner->initQuery(&qSeq);
int realignAccepted = 0;
for (size_t result = 0; result < swResults.size() && realignAccepted < realignMaxSeqs; result++) {
Expand Down
6 changes: 6 additions & 0 deletions src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1317,6 +1317,12 @@ Parameters::Parameters():
tar2db.push_back(&PARAM_THREADS);
tar2db.push_back(&PARAM_V);
// unpackdb
unpackdbs.push_back(&PARAM_UNPACK_NAME_MODE);
unpackdbs.push_back(&PARAM_UNPACK_SUFFIX);
unpackdbs.push_back(&PARAM_THREADS);
unpackdbs.push_back(&PARAM_V);
// appenddbtoindex
appenddbtoindex.push_back(&PARAM_ID_LIST);
appenddbtoindex.push_back(&PARAM_V);
Expand Down
1 change: 1 addition & 0 deletions src/commons/Parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -1113,6 +1113,7 @@ class Parameters {
std::vector<MMseqsParameter*> enrichworkflow;
std::vector<MMseqsParameter*> databases;
std::vector<MMseqsParameter*> tar2db;
std::vector<MMseqsParameter*> unpackdbs;
std::vector<MMseqsParameter*> appenddbtoindex;

std::vector<MMseqsParameter*> combineList(const std::vector<MMseqsParameter*> &par1,
Expand Down
12 changes: 6 additions & 6 deletions src/commons/Sequence.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -375,18 +375,18 @@ void Sequence::print() {
std::cout << std::endl;
}

void extractProfileData(const char* data, const BaseMatrix &subMat, const int offset, std::string &result) {
void extractProfileData(const char* data, size_t dataSize, const BaseMatrix &subMat, const int offset, std::string &result) {
size_t i = 0;
while (data[i] != '\0'){
while (i < dataSize){
result.append(1, subMat.num2aa[(int)data[i + Sequence::PROFILE_AA_SIZE + offset]]);
i += Sequence::PROFILE_READIN_SIZE;
}
}

void Sequence::extractProfileSequence(const char* data, const BaseMatrix &submat, std::string &result) {
extractProfileData(data, submat, 0, result);
void Sequence::extractProfileSequence(const char* data, size_t dataSize, const BaseMatrix &submat, std::string &result) {
extractProfileData(data, dataSize, submat, 0, result);
}

void Sequence::extractProfileConsensus(const char* data, const BaseMatrix &submat, std::string &result) {
extractProfileData(data, submat, 1, result);
void Sequence::extractProfileConsensus(const char* data, size_t dataSize, const BaseMatrix &submat, std::string &result) {
extractProfileData(data, dataSize, submat, 1, result);
}
4 changes: 2 additions & 2 deletions src/commons/Sequence.h
Original file line number Diff line number Diff line change
Expand Up @@ -417,8 +417,8 @@ class Sequence {

void print(); // for debugging

static void extractProfileSequence(const char* data, const BaseMatrix &submat, std::string &result);
static void extractProfileConsensus(const char* data, const BaseMatrix &submat, std::string &result);
static void extractProfileSequence(const char* data, size_t dataSize, const BaseMatrix &submat, std::string &result);
static void extractProfileConsensus(const char* data, size_t dataSize, const BaseMatrix &submat, std::string &result);

int getId() const { return id; }

Expand Down
3 changes: 2 additions & 1 deletion src/linclust/LinsearchIndexReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#define SIZE_T_MAX ((size_t) -1)
#endif

extern const char* index_version_compatible;

template <int TYPE>
size_t LinsearchIndexReader::pickCenterKmer(KmerPosition<short> *hashSeqPair, size_t splitKmerCount) {
Expand Down Expand Up @@ -241,7 +242,7 @@ bool LinsearchIndexReader::checkIfIndexFile(DBReader<unsigned int> *pReader) {
if(version == NULL){
return false;
}
return (strncmp(version, PrefilteringIndexReader::CURRENT_VERSION, strlen(PrefilteringIndexReader::CURRENT_VERSION)) == 0 ) ? true : false;
return (strncmp(version, index_version_compatible, strlen(index_version_compatible)) == 0 ) ? true : false;
}

void LinsearchIndexReader::writeKmerIndexToDisk(std::string fileName, KmerPosition<short> *kmers, size_t kmerCnt){
Expand Down
3 changes: 2 additions & 1 deletion src/linclust/kmerindexdb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#endif

extern const char* version;
extern const char* index_version_compatible;

int kmerindexdb(int argc, const char **argv, const Command &command) {
MMseqsMPI::init(argc, argv);
Expand Down Expand Up @@ -169,7 +170,7 @@ int kmerindexdb(int argc, const char **argv, const Command &command) {
dbw.open();

Debug(Debug::INFO) << "Write VERSION (" << PrefilteringIndexReader::VERSION << ")\n";
dbw.writeData((char *) PrefilteringIndexReader::CURRENT_VERSION, strlen(PrefilteringIndexReader::CURRENT_VERSION) * sizeof(char), PrefilteringIndexReader::VERSION, 0);
dbw.writeData((char *) index_version_compatible, strlen(index_version_compatible) * sizeof(char), PrefilteringIndexReader::VERSION, 0);
dbw.alignToPageSize();

Debug(Debug::INFO) << "Write META (" << PrefilteringIndexReader::META << ")\n";
Expand Down
5 changes: 2 additions & 3 deletions src/prefiltering/PrefilteringIndexReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
#include "Parameters.h"

extern const char* index_version_compatible;
const char* PrefilteringIndexReader::CURRENT_VERSION = index_version_compatible;
unsigned int PrefilteringIndexReader::VERSION = 0;
unsigned int PrefilteringIndexReader::META = 1;
unsigned int PrefilteringIndexReader::SCOREMATRIXNAME = 2;
Expand Down Expand Up @@ -41,7 +40,7 @@ bool PrefilteringIndexReader::checkIfIndexFile(DBReader<unsigned int>* reader) {
if(version == NULL){
return false;
}
return (strncmp(version, CURRENT_VERSION, strlen(CURRENT_VERSION)) == 0 ) ? true : false;
return (strncmp(version, index_version_compatible, strlen(index_version_compatible)) == 0 ) ? true : false;
}

std::string PrefilteringIndexReader::indexName(const std::string &outDB) {
Expand All @@ -67,7 +66,7 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB,
writer.open();

Debug(Debug::INFO) << "Write VERSION (" << VERSION << ")\n";
writer.writeData((char *) CURRENT_VERSION, strlen(CURRENT_VERSION) * sizeof(char), VERSION, SPLIT_META);
writer.writeData((char *) index_version_compatible, strlen(index_version_compatible) * sizeof(char), VERSION, SPLIT_META);
writer.alignToPageSize(SPLIT_META);

Debug(Debug::INFO) << "Write META (" << META << ")\n";
Expand Down
1 change: 0 additions & 1 deletion src/prefiltering/PrefilteringIndexReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ struct PrefilteringIndexData {

class PrefilteringIndexReader {
public:
static const char* CURRENT_VERSION;
static unsigned int VERSION;
static unsigned int ENTRIES;
static unsigned int ENTRIESOFFSETS;
Expand Down
2 changes: 1 addition & 1 deletion src/test/TestAlignment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ int main (int, const char**) {
Sequence* dbSeq = new Sequence(10000, 0, &subMat, kmer_size, true, false);
//dbSeq->mapSequence(1,"lala2",ref_seq);
dbSeq->mapSequence(1,1,tim2.c_str(), tim2.size());
SmithWaterman aligner(15000, subMat.alphabetSize, true, Parameters::DBTYPE_AMINO_ACIDS);
SmithWaterman aligner(15000, subMat.alphabetSize, true, 1.0, Parameters::DBTYPE_AMINO_ACIDS);
int8_t * tinySubMat = new int8_t[subMat.alphabetSize*subMat.alphabetSize];
for (int i = 0; i < subMat.alphabetSize; i++) {
for (int j = 0; j < subMat.alphabetSize; j++) {
Expand Down
2 changes: 1 addition & 1 deletion src/test/TestAlignmentPerformance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ int main (int, const char**) {
Sequence* query = new Sequence(10000, 0, &subMat, kmer_size, true, false);
Sequence* dbSeq = new Sequence(10000, 0, &subMat, kmer_size, true, false);
//dbSeq->mapSequence(1,"lala2",ref_seq);
SmithWaterman aligner(15000, subMat.alphabetSize, false, Parameters::DBTYPE_AMINO_ACIDS);
SmithWaterman aligner(15000, subMat.alphabetSize, false, 1.0, Parameters::DBTYPE_AMINO_ACIDS);
int8_t * tinySubMat = new int8_t[subMat.alphabetSize*subMat.alphabetSize];
for (int i = 0; i < subMat.alphabetSize; i++) {
for (int j = 0; j < subMat.alphabetSize; j++) {
Expand Down
2 changes: 1 addition & 1 deletion src/test/TestAlignmentTraceback.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ int main(int, const char**) {
Sequence* dbSeq = new Sequence(10000, 0, &subMat, kmer_size, true, false);
//dbSeq->mapSequence(1,"lala2",ref_seq);
dbSeq->mapSequence(1,1,tim2.c_str(), tim2.size());
SmithWaterman aligner(15000, subMat.alphabetSize, false, Parameters::DBTYPE_AMINO_ACIDS);
SmithWaterman aligner(15000, subMat.alphabetSize, false, 1.0, Parameters::DBTYPE_AMINO_ACIDS);
int8_t * tinySubMat = new int8_t[subMat.alphabetSize*subMat.alphabetSize];
for (int i = 0; i < subMat.alphabetSize; i++) {
for (int j = 0; j < subMat.alphabetSize; j++) {
Expand Down
4 changes: 2 additions & 2 deletions src/test/TestDiagonalScoring.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ int main (int, const char**) {
CounterResult hits[32];
UngappedAlignment matcher(10000, &subMat, &lookup);

SubstitutionMatrix::calcLocalAaBiasCorrection(&subMat, s5.numSequence, s5.L, compositionBias);
SubstitutionMatrix::calcLocalAaBiasCorrection(&subMat, s5.numSequence, s5.L, compositionBias, 1.0);
memset(compositionBias, 0.0, sizeof(float)*s5.L);
// std::cout << compositionBias[74] << std::endl;
// std::cout << compositionBias[79] << std::endl;
Expand All @@ -116,7 +116,7 @@ int main (int, const char**) {



SubstitutionMatrix::calcLocalAaBiasCorrection(&subMat, s1.numSequence, s1.L, compositionBias);
SubstitutionMatrix::calcLocalAaBiasCorrection(&subMat, s1.numSequence, s1.L, compositionBias, 1.0);

hits[0].id = s1.getId();
hits[0].diagonal = 0;
Expand Down
2 changes: 1 addition & 1 deletion src/test/TestDiagonalScoringPerformance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ int main (int, const char**) {
}

float * compositionBias = new float[s1.L];
SubstitutionMatrix::calcLocalAaBiasCorrection(&subMat, s1.numSequence, s1.L, compositionBias);
SubstitutionMatrix::calcLocalAaBiasCorrection(&subMat, s1.numSequence, s1.L, compositionBias, 1.0);



Expand Down
2 changes: 1 addition & 1 deletion src/test/TestMultipleAlignment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ int main(int, const char**) {
// BaseMatrix::print(subMat.subMatrix, subMat.alphabetSize);
std::cout << "\n";
EvalueComputation evaluer(100000, &subMat, par.gapOpen.values.aminoacid(), par.gapExtend.values.aminoacid());
Matcher * aligner = new Matcher(Parameters::DBTYPE_AMINO_ACIDS, Parameters::DBTYPE_AMINO_ACIDS, 10000, &subMat, &evaluer, false, par.gapOpen.values.aminoacid(), par.gapExtend.values.aminoacid(), 0.0);
Matcher * aligner = new Matcher(Parameters::DBTYPE_AMINO_ACIDS, Parameters::DBTYPE_AMINO_ACIDS, 10000, &subMat, &evaluer, false, 1.0, par.gapOpen.values.aminoacid(), par.gapExtend.values.aminoacid(), 0.0, 40);
std::vector<Matcher::result_t> alnResults;
std::vector<std::vector<unsigned char>> seqSet;
std::cout << "Sequence (id 0):\n";
Expand Down
2 changes: 1 addition & 1 deletion src/test/TestProfileAlignment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -788,7 +788,7 @@ int main (int, const char**) {
const char* sequence2 = "LFILNIISMNKQTKVKGYLLLLLVISSLFISLVGHGYTANKVSAPNPAKEYPQDNLSVIDMKNLPGTQIKSMVKDELQQFLEEQGFRRLKNKSLVDLRRIWLGFMYEDFFYTMHKKTDLPISVIYAFFIIEATNAGIESKLMAKALNPGGIKYRGTGKKMKAMDDCY";

dbSeq->mapSequence(1,1,sequence2, strlen(sequence2));
SmithWaterman aligner(15000, subMat.alphabetSize, false, Parameters::DBTYPE_AMINO_ACIDS);
SmithWaterman aligner(15000, subMat.alphabetSize, false, 1.0, Parameters::DBTYPE_AMINO_ACIDS);
int8_t * tinySubMat = new int8_t[subMat.alphabetSize*subMat.alphabetSize];
aligner.ssw_init(s, s->getAlignmentProfile(), &subMat);
int32_t maskLen = s->L / 2;
Expand Down
9 changes: 6 additions & 3 deletions src/util/convertalignments.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,8 @@ int convertalignments(int argc, const char **argv, const Command &command) {
querySeqData = (char*) queryBuffer.c_str();
}
if (queryProfile) {
Sequence::extractProfileConsensus(querySeqData, *subMat, queryProfData);
size_t queryEntryLen = qDbr.sequenceReader->getEntryLen(qId);
Sequence::extractProfileConsensus(querySeqData, queryEntryLen, *subMat, queryProfData);
}
}

Expand Down Expand Up @@ -481,7 +482,8 @@ int convertalignments(int argc, const char **argv, const Command &command) {
size_t tId = tDbr->sequenceReader->getId(res.dbKey);
targetSeqData = tDbr->sequenceReader->getData(tId, thread_idx);
if (targetProfile) {
Sequence::extractProfileConsensus(targetSeqData, *subMat, targetProfData);
size_t targetEntryLen = tDbr->sequenceReader->getEntryLen(tId);
Sequence::extractProfileConsensus(targetSeqData, targetEntryLen, *subMat, targetProfData);
}
}
for(size_t i = 0; i < outcodes.size(); i++) {
Expand Down Expand Up @@ -723,7 +725,8 @@ int convertalignments(int argc, const char **argv, const Command &command) {
size_t tId = tDbr->sequenceReader->getId(res.dbKey);
char* targetSeqData = tDbr->sequenceReader->getData(tId, thread_idx);
if (targetProfile) {
Sequence::extractProfileConsensus(targetSeqData, *subMat, targetProfData);
size_t targetEntryLen = tDbr->sequenceReader->getEntryLen(tId);
Sequence::extractProfileConsensus(targetSeqData, targetEntryLen, *subMat, targetProfData);
printSeqBasedOnAln(result, targetProfData.c_str(), res.dbStartPos,
Matcher::uncompressAlignment(res.backtrace), true,
(res.dbStartPos > res.dbEndPos),
Expand Down
Loading

0 comments on commit 785a05e

Please sign in to comment.