Skip to content

Commit

Permalink
Add support to write compressed files in unpackdb with `--unpack-suff…
Browse files Browse the repository at this point in the history
…ix .gz`
  • Loading branch information
milot-mirdita committed Apr 24, 2023
1 parent e379831 commit 570e3ed
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 5 deletions.
2 changes: 1 addition & 1 deletion src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ Parameters::Parameters():
PARAM_TAR_INCLUDE(PARAM_TAR_INCLUDE_ID, "--tar-include", "Tar Inclusion Regex", "Include file names based on this regex", typeid(std::string), (void *) &tarInclude, "^.*$"),
PARAM_TAR_EXCLUDE(PARAM_TAR_EXCLUDE_ID, "--tar-exclude", "Tar Exclusion Regex", "Exclude file names based on this regex", typeid(std::string), (void *) &tarExclude, "^.*$"),
// unpackdb
PARAM_UNPACK_SUFFIX(PARAM_UNPACK_SUFFIX_ID, "--unpack-suffix", "Unpack suffix", "File suffix for unpacked files", typeid(std::string), (void *) &unpackSuffix, "^.*$"),
PARAM_UNPACK_SUFFIX(PARAM_UNPACK_SUFFIX_ID, "--unpack-suffix", "Unpack suffix", "File suffix for unpacked files.\nAdd .gz suffix to write compressed files.", typeid(std::string), (void *) &unpackSuffix, "^.*$"),
PARAM_UNPACK_NAME_MODE(PARAM_UNPACK_NAME_MODE_ID, "--unpack-name-mode", "Unpack name mode", "Name unpacked files by 0: DB key, 1: accession (through .lookup)", typeid(int), (void *) &unpackNameMode, "^[0-1]{1}$"),
// for modules that should handle -h themselves
PARAM_HELP(PARAM_HELP_ID, "-h", "Help", "Help", typeid(bool), (void *) &help, "", MMseqsParameter::COMMAND_HIDDEN),
Expand Down
56 changes: 52 additions & 4 deletions src/util/unpackdb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
#include "Util.h"
#include "FileUtil.h"
#include "Debug.h"
#ifdef HAVE_ZLIB
#include <zlib.h>
#endif

#ifdef OPENMP
#include <omp.h>
Expand All @@ -27,7 +30,6 @@ int unpackdb(int argc, const char **argv, const Command& command) {

size_t entries = reader.getSize();
Debug::Progress progress(entries);

#pragma omp parallel
{
unsigned int thread_idx = 0;
Expand All @@ -50,9 +52,55 @@ int unpackdb(int argc, const char **argv, const Command& command) {
name.append(SSTR(key));
}
name.append(par.unpackSuffix);
FILE* handle = FileUtil::openAndDelete(name.c_str(), "w");
fwrite(reader.getData(i, thread_idx), sizeof(char), reader.getEntryLen(i) - 1, handle);
fclose(handle);

// Write the payload of entry i to the file `name`. A ".gz" suffix selects
// gzip-compressed output (requires zlib); anything else is written verbatim.
// Failures for a single entry are reported and the loop moves on to the next entry.
const char* cname = name.c_str();

// Replace a pre-existing file, but refuse to touch a directory of the same name.
if (FileUtil::fileExists(cname) == true) {
    if (FileUtil::directoryExists(cname) == true) {
        Debug(Debug::ERROR) << "Cannot open directory " << name << " for writing\n";
        continue;
    }
    FileUtil::remove(cname);
}

if (Util::endsWith(".gz", name.c_str()) == true) {
#ifdef HAVE_ZLIB
    gzFile handle = gzopen(cname, "w");
    if (handle == NULL) {
        Debug(Debug::ERROR) << "Cannot open " << name << " for writing\n";
        continue;
    }
    // The last byte of the entry is not written.
    // NOTE(review): presumably that byte is the entry's null terminator and
    // getEntryLen(i) >= 1 always holds — confirm against DBReader.
    const char* data = reader.getData(i, thread_idx);
    size_t remaining = reader.getEntryLen(i) - 1;
    // gzwrite takes an unsigned length and returns an int, so a single call
    // cannot handle payloads of 2 GiB or more; feed it bounded chunks instead.
    const size_t maxChunk = static_cast<size_t>(1) << 30;
    bool writeOk = true;
    while (remaining > 0) {
        const unsigned int chunk = static_cast<unsigned int>(remaining < maxChunk ? remaining : maxChunk);
        const int written = gzwrite(handle, data, chunk);
        if (written <= 0 || static_cast<unsigned int>(written) != chunk) {
            writeOk = false;
            break;
        }
        data += chunk;
        remaining -= chunk;
    }
    if (writeOk == false) {
        Debug(Debug::ERROR) << "Cannot write " << name << "\n";
        // Release the handle even though the write failed; otherwise every
        // failed entry leaks a gzFile and its file descriptor.
        gzclose(handle);
        continue;
    }
    if (gzclose(handle) != Z_OK) {
        Debug(Debug::ERROR) << "Cannot close " << name << "\n";
        continue;
    }
#else
    Debug(Debug::ERROR) << "MMseqs2 was not compiled with zlib support. Cannot write compressed output\n";
    EXIT(EXIT_FAILURE);
#endif
} else {
    FILE* handle = fopen(cname, "w");
    if (handle == NULL) {
        Debug(Debug::ERROR) << "Cannot open " << name << " for writing\n";
        continue;
    }
    // The last byte of the entry is not written (see NOTE in the gzip branch).
    const size_t len = reader.getEntryLen(i) - 1;
    // fwrite returns size_t; keep it unsigned so large entries are not
    // misreported through a narrowing conversion to int.
    const size_t written = fwrite(reader.getData(i, thread_idx), sizeof(char), len, handle);
    if (written != len) {
        Debug(Debug::ERROR) << "Cannot write " << name << "\n";
        // Close on the failure path too, so the FILE* is not leaked.
        fclose(handle);
        continue;
    }
    if (fclose(handle) != 0) {
        Debug(Debug::ERROR) << "Cannot close " << name << "\n";
        continue;
    }
}
}
}
reader.close();
Expand Down

0 comments on commit 570e3ed

Please sign in to comment.