Skip to content

Commit

Permalink
Squashed 'lib/mmseqs/' changes from 24f6b52a38..f349118312
Browse files Browse the repository at this point in the history
f349118312 Make sure very large database don't overflow localThreads
ad5837b344 Revert "result2msa now supports reading from index"
7ee3e79453 Fix wrong database name printed for variadic input when creating a tmp directory
15fdf48e73 result2msa now supports reading from index
7aade9df74 Change deep copies to const references in result2msa
ce7cf754e5 Merge branch 'master' of https://github.com/soedinglab/mmseqs2
31eb67ae0e Add A3M support to result2msa
56f7685b56 Add symlinks/copies for _taxonomy file #474
904d0c6d1d Transition old compiler tests from travis to CirrusCI
442d898306 Fix memory issues in QueryMatcher
17c8028e18 Move fixRlimitNoFile to Application
c6634976c8 Fix the forbidden symbols when using unpackdb (#467)
488df8634e Refactoring of gff2db
d822533f2e Build update function for DbType validators
a09a704eb9 Remove bash dependency in regression to fix FreeBSD in CirrusCI
4f1996a472 Fix FreeBSD on CirrusCI samtools issue
a2e2129c13 Add CirrusCI to test FreeBSD
15ace29a27 Fix posix_madvise on FreeBSD returning error if size=0 (See #460)
139e4502a9 Get rid of MathUtil::popCount in favor of __builtin_popcount
bbfd6e26db Add preload mode to expand(aln/2profile)
b14d0136e8 Fix a few more tests
635911ecf4 Increase sortresult buffer for matcher result
d6c19db94b Fix exhaustive search parameter in examples
e86afeab17 Move substitution matrix init code out of Parameters::parseParameters to fix tests
62f7aba12a Replace biorxiv citation for taxonomy paper

git-subtree-dir: lib/mmseqs
git-subtree-split: f349118312919c4fcc448f4595ca3b3a387018e2
  • Loading branch information
elileka committed Aug 17, 2021
1 parent 39afb6a commit 0e8c2a0
Show file tree
Hide file tree
Showing 44 changed files with 496 additions and 358 deletions.
36 changes: 36 additions & 0 deletions .cirrus.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Cirrus CI configuration: one FreeBSD build and a matrix of builds with old
# Linux compilers. Every task configures a Release build with tests enabled
# and then runs the regression script against the freshly built binary.

# Environment shared by all tasks: also clone git submodules.
env:
CIRRUS_CLONE_SUBMODULES: true

# FreeBSD 13 task: installs cmake, git and samtools with pkg, builds with one
# make job per CPU reported by sysctl, then runs the regression script.
task:
name: FreeBSD-13
freebsd_instance:
image_family: freebsd-13-0
install_script: pkg install -y cmake git samtools
compile_script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DHAVE_TESTS=1 -DENABLE_WERROR=1 -DHAVE_SSE4_1=1 ..
make -j $(sysctl -n hw.ncpu)
test_script: ./util/regression/run_regression.sh ./build/src/mmseqs SCRATCH

# Old-compiler task: runs in a Debian jessie container with 8G of memory.
# The build is configured with -DREQUIRE_OPENMP=0 and the regression script
# is invoked with the extra argument SEARCH.
task:
name: "Old compilers"
container:
image: debian:jessie-slim
memory: 8G
# One job per matrix entry; each entry installs its own toolchain and selects
# it through the CC/CXX environment variables.
matrix:
- name: Clang-4
install_script: apt update --yes && apt install cmake clang-4.0 libc++-dev make git ca-certificates --yes --no-install-suggests --no-install-recommends
env:
CC: clang-4.0
CXX: clang++-4.0
- name: GCC-4.9
install_script: apt update --yes && apt install cmake gcc-4.9 g++-4.9 make git ca-certificates --yes --no-install-suggests --no-install-recommends
env:
CC: gcc-4.9
CXX: g++-4.9
compile_script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DHAVE_TESTS=1 -DENABLE_WERROR=1 -DHAVE_SSE4_1=1 -DREQUIRE_OPENMP=0 ..
make -j $(nproc --all)
test_script: ./util/regression/run_regression.sh ./build/src/mmseqs SCRATCH SEARCH

130 changes: 0 additions & 130 deletions .travis.yml

This file was deleted.

2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ MMseqs2 (Many-against-Many sequence searching) is a software suite to search and

[Mirdita M, Steinegger M and Soeding J. MMseqs2 desktop and local web server app for fast, interactive sequence searches. Bioinformatics, doi: 10.1093/bioinformatics/bty1057 (2019)](https://academic.oup.com/bioinformatics/article/35/16/2856/5280135).

[Mirdita M, Steinegger M, Breitwieser F, Soding J, Levy Karin E: Fast and sensitive taxonomic assignment to metagenomic contigs. bioRxiv, doi: 10.1101/2020.11.27.401018 (2020)](https://www.biorxiv.org/content/10.1101/2020.11.27.401018v1).
[Mirdita M, Steinegger M, Breitwieser F, Soding J, Levy Karin E: Fast and sensitive taxonomic assignment to metagenomic contigs. Bioinformatics, doi: 10.1093/bioinformatics/btab184 (2021)](https://doi.org/10.1093/bioinformatics/btab184).

[![BioConda Install](https://img.shields.io/conda/dn/bioconda/mmseqs2.svg?style=flag&label=BioConda%20install)](https://anaconda.org/bioconda/mmseqs2)
[![Github All Releases](https://img.shields.io/github/downloads/soedinglab/mmseqs2/total.svg)](https://github.com/soedinglab/mmseqs2/releases/latest)
Expand Down
8 changes: 4 additions & 4 deletions src/MMseqsBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ std::vector<Command> baseCommands = {
"mmseqs databases PFAM pfam_db tmp\n"
"mmseqs easy-search examples/QUERY.fasta pfam_db res.m8 tmp\n\n"
"# Exhaustive search against sequences or profiles (works for large DBs)\n"
"mmseqs easy-search examples/QUERY.fasta targetProfiles res.m8 tmp --slice-search\n\n"
"mmseqs easy-search examples/QUERY.fasta targetProfiles res.m8 tmp --exhaustive-search\n\n"
"# Increasing sensitivity search (from 2 to 7 in 3 steps)\n"
"mmseqs easy-search examples/QUERY.fasta examples/DB.fasta result.m8 tmp --start-sens 2 -s 7 --sens-steps 3\n",
"Milot Mirdita <[email protected]> & Martin Steinegger <[email protected]>",
Expand Down Expand Up @@ -190,7 +190,7 @@ std::vector<Command> baseCommands = {
"mmseqs databases PFAM pfam_db tmp\n"
"mmseqs search queryDB pfam_db resultDB tmp\n\n"
"# Exhaustive search against sequences or profiles (works for large DBs)\n"
"mmseqs search queryDB targetDB resultDB tmp --slice-search\n\n"
"mmseqs search queryDB targetDB resultDB tmp --exhaustive-search\n\n"
"# Increasing sensitivity search (from 2 to 7 in 3 steps)\n"
"mmseqs search queryDB targetDB resultDB --start-sens 2 -s 7 --sens-steps 3\n",
"Martin Steinegger <[email protected]>",
Expand Down Expand Up @@ -1146,8 +1146,8 @@ std::vector<Command> baseCommands = {
"Extract regions from a sequence database based on a GFF3 file",
NULL,
"Milot Mirdita <[email protected]>",
"<i:gff3File> <i:sequenceDB> <o:sequenceDB>",
CITATION_MMSEQS2, {{"gff3File", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile },
"<i:gff3File1> ... <i:gff3FileN> <i:sequenceDB> <o:sequenceDB>",
CITATION_MMSEQS2, {{"gff3File", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfile },
{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
{"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}},
{"maskbygff", maskbygff, &par.gff2db, COMMAND_SPECIAL,
Expand Down
2 changes: 1 addition & 1 deletion src/alignment/DistanceCalculator.h
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ class DistanceCalculator {
// the 16 signed or unsigned 8-bit integers in a and zero-extends the upper bits.
simd_int seqComparision = simdi8_eq(seq1vec, seq2vec);
int res = simdi8_movemask(seqComparision);
diff += MathUtil::popCount(res); // subtract positions that should not contribute to coverage
diff += __builtin_popcount(res); // subtract positions that should not contribute to coverage
}
// compute missing rest
for (unsigned int pos = simdBlock*(VECSIZE_INT*4); pos < length; pos++ ) {
Expand Down
5 changes: 2 additions & 3 deletions src/alignment/MsaFilter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -417,15 +417,14 @@ size_t MsaFilter::filter(const int N_in, const int L, const int coverage, const
// printf("%02d:%02d ", (int) ((char*)&XK[i])[u], (int) ((char*)&XK[i])[u]);
// }
// std::cout << std::endl;
cov_kj -= MathUtil::popCount(res); // subtract positions that should not contribute to coverage
cov_kj -= __builtin_popcount(res); // subtract positions that should not contribute to coverage

// Compute 16 bit mask that indicates positions where k and j have identical residues
int c = simdi8_movemask(simdi8_eq(XK[i], XJ[i]));

// Count positions where k and j have different amino acids, which is equal to 16 minus the
// number of positions for which either j and k are equal or which contain ANY, GAP, or ENDGAP
diff += (VECSIZE_INT * 4) - MathUtil::popCount(c | res);

diff += (VECSIZE_INT * 4) - __builtin_popcount(c | res);
}
// // DEBUG
// printf("%20.20s with %20.20s: diff=%i diff_min_frac*cov_kj=%f diff_suff=%i nres=%i cov_kj=%i\n",sname[k],sname[j],diff,diff_min_frac*cov_kj,diff_suff,nres[k],cov_kj);
Expand Down
7 changes: 7 additions & 0 deletions src/commons/Application.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include "Util.h"
#include "Command.h"
#include "DistanceCalculator.h"
#include "FileUtil.h"
#include "Timer.h"

#include <iomanip>
Expand All @@ -18,6 +19,7 @@ extern bool hide_base_commands;
extern std::vector<Command> commands;
extern std::vector<Command> baseCommands;
extern std::vector<Categories> categories;
extern void (*validatorUpdate)(void);

Command *getCommandByName(const char *s) {
for (size_t i = 0; i < commands.size(); i++) {
Expand Down Expand Up @@ -188,6 +190,11 @@ int main(int argc, const char **argv) {
return EXIT_SUCCESS;
}

if(validatorUpdate != NULL){
(*validatorUpdate)();
}
FileUtil::fixRlimitNoFile();

setenv("MMSEQS", argv[0], true);
Command *c = NULL;
if (strncmp(argv[1], "shellcompletion", strlen("shellcompletion")) == 0) {
Expand Down
11 changes: 4 additions & 7 deletions src/commons/DBReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,7 @@ threads(threads), dataMode(dataMode), dataFileName(strdup(dataFileName_)),
totalDataSize(0), dataSize(0), lastKey(T()), closed(1), dbtype(Parameters::DBTYPE_GENERIC_DB),
compressedBuffers(NULL), compressedBufferSizes(NULL), index(NULL), id2local(NULL), local2id(NULL),
dataMapped(false), accessType(0), externalData(false), didMlock(false)
{
if (threads > 1) {
FileUtil::fixRlimitNoFile();
}
}
{}

template <typename T>
DBReader<T>::DBReader(DBReader<T>::Index *index, size_t size, size_t dataSize, T lastKey,
Expand Down Expand Up @@ -721,7 +717,7 @@ template <typename T> size_t DBReader<T>::maxCount(char c) {
if (compression == COMPRESSED) {
size_t entries = getSize();
#ifdef OPENMP
size_t localThreads = std::min(entries, static_cast<size_t>(threads));
size_t localThreads = std::max(std::min(entries, static_cast<size_t>(threads)), (size_t)1);
#endif
#pragma omp parallel num_threads(localThreads)
{
Expand Down Expand Up @@ -1000,7 +996,7 @@ void DBReader<T>::setSequentialAdvice() {
#ifdef HAVE_POSIX_MADVISE
for(size_t i = 0; i < dataFileCnt; i++){
size_t dataSize = dataSizeOffset[i+1] - dataSizeOffset[i];
if (posix_madvise (dataFiles[i], dataSize, POSIX_MADV_SEQUENTIAL) != 0){
if (dataSize > 0 && posix_madvise (dataFiles[i], dataSize, POSIX_MADV_SEQUENTIAL) != 0){
Debug(Debug::ERROR) << "posix_madvise returned an error " << dataFileName << "\n";
}
}
Expand Down Expand Up @@ -1136,6 +1132,7 @@ void copyLinkDb(const std::string &databaseName, const std::string &outDb, DBFil
{ DBFiles::TAX_NAMES, "_names.dmp" },
{ DBFiles::TAX_NODES, "_nodes.dmp" },
{ DBFiles::TAX_MERGED, "_merged.dmp" },
{ DBFiles::TAX_MERGED, "_taxonomy" },
{ DBFiles::CA3M_DATA, "_ca3m.ffdata" },
{ DBFiles::CA3M_INDEX, "_ca3m.ffindex" },
{ DBFiles::CA3M_SEQ, "_sequence.ffdata" },
Expand Down
3 changes: 2 additions & 1 deletion src/commons/DBReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,12 @@ struct DBFiles {
CA3M_SEQ_IDX = (1ull << 15),
CA3M_HDR = (1ull << 16),
CA3M_HDR_IDX = (1ull << 17),
TAX_BINARY = (1ull << 18),


GENERIC = DATA | DATA_INDEX | DATA_DBTYPE,
HEADERS = HEADER | HEADER_INDEX | HEADER_DBTYPE,
TAXONOMY = TAX_MAPPING | TAX_NAMES | TAX_NODES | TAX_MERGED,
TAXONOMY = TAX_MAPPING | TAX_NAMES | TAX_NODES | TAX_MERGED | TAX_BINARY,
SEQUENCE_DB = GENERIC | HEADERS | TAXONOMY | LOOKUP | SOURCE,
SEQUENCE_ANCILLARY= SEQUENCE_DB & (~GENERIC),
SEQUENCE_NO_DATA_INDEX = SEQUENCE_DB & (~DATA_INDEX),
Expand Down
1 change: 1 addition & 0 deletions src/commons/FastSort.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#include <algorithm>
#ifdef ENABLE_IPS4O
# include "simde/hedley.h"
# if defined(HEDLEY_GCC_VERSION) && HEDLEY_GCC_VERSION_CHECK(0,0,0) && !HEDLEY_GCC_VERSION_CHECK(5,1,0) && defined(__cplusplus)
Expand Down
18 changes: 18 additions & 0 deletions src/commons/FileUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -465,3 +465,21 @@ void FileUtil::fixRlimitNoFile() {
}
}
}

// Returns a copy of `name` in which every symbol treated as forbidden in a
// filename is swapped for a placeholder: '\\', '/', ':', '*', '?', '<' and '>'
// become '@', while '|' becomes '!'. All other characters are left untouched.
std::string FileUtil::sanitizeFilename(std::string name){
    for (size_t pos = 0; pos < name.size(); ++pos) {
        switch (name[pos]) {
            case '\\':
            case '/':
            case ':':
            case '*':
            case '?':
            case '<':
            case '>':
                name[pos] = '@';
                break;
            case '|':
                name[pos] = '!';
                break;
            default:
                // regular character, keep as is
                break;
        }
    }
    return name;
}
3 changes: 3 additions & 0 deletions src/commons/FileUtil.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ class FileUtil {
static std::string createTemporaryDirectory(const std::string& basePath, const std::string& subDirectory);

static void fixRlimitNoFile();

// replace symbols that are forbidden in filenames with placeholder characters
static std::string sanitizeFilename(std::string name);
};


Expand Down
7 changes: 0 additions & 7 deletions src/commons/MathUtil.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,13 +160,6 @@ class MathUtil {
return (a > 0xFFFF - b) ? 0xFFFF : a + b;
}

// Count the number of set bits (population count) in the bit pattern of i.
// Assumes a 32-bit int, as the masks below do.
//
// The arithmetic is carried out on an unsigned copy of i. With signed int the
// subtraction overflows for INT_MIN and the final multiplication overflows for
// inputs such as -1, which is undefined behaviour; unsigned arithmetic wraps
// modulo 2^32, which is exactly what the algorithm relies on.
static inline int popCount(int i) {
    unsigned int bits = static_cast<unsigned int>(i);
    // add up neighbouring bits into 2-bit fields
    bits = bits - ((bits >> 1) & 0x55555555u);
    // add up neighbouring 2-bit fields into 4-bit fields
    bits = (bits & 0x33333333u) + ((bits >> 2) & 0x33333333u);
    // add up neighbouring 4-bit fields into bytes
    bits = (bits + (bits >> 4)) & 0x0F0F0F0Fu;
    // the multiplication accumulates all four byte sums in the top byte
    return static_cast<int>((bits * 0x01010101u) >> 24);
}

// Ratio between the number of positions in the inclusive range [start, end]
// and `length`, computed in single precision.
static inline float getCoverage(size_t start, size_t end, size_t length) {
    const size_t coveredPositions = end - start + 1;
    const float covered = static_cast<float>(coveredPositions);
    const float total = static_cast<float>(length);
    return covered / total;
}
Expand Down
Loading

0 comments on commit 0e8c2a0

Please sign in to comment.