From 28b0088317127294fe767d2819a08b0f57b999a7 Mon Sep 17 00:00:00 2001 From: Milot Mirdita Date: Wed, 12 Oct 2022 15:37:41 +0900 Subject: [PATCH] Fix FASTA input not ending with a newline resulting in invalid sequence db with --createdb-mode 1 (#617) --- lib/ksw2/kseq.h | 4 +- src/commons/KSeqWrapper.cpp | 8 ++-- src/commons/KSeqWrapper.h | 2 +- src/util/createdb.cpp | 74 ++++++++++++++++++++++++++++--------- 4 files changed, 63 insertions(+), 25 deletions(-) diff --git a/lib/ksw2/kseq.h b/lib/ksw2/kseq.h index de3cfeb35..35562c9ba 100644 --- a/lib/ksw2/kseq.h +++ b/lib/ksw2/kseq.h @@ -208,7 +208,7 @@ typedef struct __kstring_t { seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ } \ - seq->multiline = (ks->newline > 1); \ + seq->newlineCount = ks->newline; \ if (c == '>' || c == '@') { /* the first header char has been read */ \ seq->last_char = c; \ } \ @@ -238,7 +238,7 @@ typedef struct __kstring_t { int last_char; \ size_t headerOffset; \ size_t sequenceOffset; \ - bool multiline; \ + int newlineCount; \ kstream_t *f; \ } kseq_t; diff --git a/src/commons/KSeqWrapper.cpp b/src/commons/KSeqWrapper.cpp index bec3a5d56..9e1158773 100644 --- a/src/commons/KSeqWrapper.cpp +++ b/src/commons/KSeqWrapper.cpp @@ -22,7 +22,7 @@ bool KSeqFile::ReadEntry() { return false; entry.headerOffset = s->headerOffset; entry.sequenceOffset = s->sequenceOffset; - entry.multiline = s->multiline; + entry.newlineCount = s->newlineCount; entry.name = s->name; entry.comment = s->comment; entry.sequence = s->seq; @@ -100,7 +100,7 @@ bool KSeqGzip::ReadEntry() { entry.qual = s->qual; entry.headerOffset = 0; entry.sequenceOffset = 0; - entry.multiline = s->multiline; + entry.newlineCount = s->newlineCount; return true; } @@ -145,7 +145,7 @@ bool KSeqBzip::ReadEntry() { entry.qual = s->qual; entry.headerOffset = 0; entry.sequenceOffset = 0; - entry.multiline = s->multiline; + entry.newlineCount = s->newlineCount; return true; } @@ -214,7 +214,7 @@ bool KSeqBuffer::ReadEntry() { return false; entry.headerOffset = s->headerOffset; entry.sequenceOffset = s->sequenceOffset; - entry.multiline = s->multiline; + entry.newlineCount = s->newlineCount; entry.name = s->name; entry.comment = s->comment; entry.sequence = s->seq; diff --git a/src/commons/KSeqWrapper.h b/src/commons/KSeqWrapper.h index 47bf7b8a7..0e8d06766 100644 --- a/src/commons/KSeqWrapper.h +++ b/src/commons/KSeqWrapper.h @@ -15,7 +15,7 @@ class KSeqWrapper { kstring_t qual; size_t headerOffset; size_t sequenceOffset; - bool multiline; + int newlineCount; } entry; enum kseq_type { diff --git a/src/util/createdb.cpp b/src/util/createdb.cpp index dca9dfb7b..ea371bfe7 100644 --- a/src/util/createdb.cpp +++ b/src/util/createdb.cpp @@ -131,9 +131,41 @@ int createdb(int argc, const char **argv, const Command& command) { } else { kseq = KSeqFactory(filenames[fileIdx].c_str()); } - if (par.createdbMode == Parameters::SEQUENCE_SPLIT_MODE_SOFT && kseq->type != KSeqWrapper::KSEQ_FILE) { - Debug(Debug::WARNING) << "Only uncompressed fasta files can be used with --createdb-mode 0.\n"; - Debug(Debug::WARNING) << "We recompute with --createdb-mode 1.\n"; + + bool resetNotFile = par.createdbMode == Parameters::SEQUENCE_SPLIT_MODE_SOFT && kseq->type != KSeqWrapper::KSEQ_FILE; + if (resetNotFile) { + Debug(Debug::WARNING) << "Only uncompressed fasta files can be used with --createdb-mode 0\n"; + Debug(Debug::WARNING) << "We recompute with --createdb-mode 1\n"; + } + + bool resetIncorrectNewline = false; + if (par.createdbMode == Parameters::SEQUENCE_SPLIT_MODE_SOFT && kseq->type == KSeqWrapper::KSEQ_FILE) { + // get last byte from filenames[fileIdx].c_str() + FILE* fp = fopen(filenames[fileIdx].c_str(), "rb"); + if (fp == NULL) { + Debug(Debug::ERROR) << "Cannot open file " << filenames[fileIdx] << "\n"; + EXIT(EXIT_FAILURE); + } + int res = fseek(fp, -1, SEEK_END); + if (res != 0) { + Debug(Debug::ERROR) << "Cannot seek at the end of file " << filenames[fileIdx] << "\n"; + EXIT(EXIT_FAILURE); + } + int lastChar = fgetc(fp); + if (lastChar == EOF) { + Debug(Debug::ERROR) << "Error reading from " << filenames[fileIdx] << "\n"; + EXIT(EXIT_FAILURE); + } + if (fclose(fp) != 0) { + Debug(Debug::ERROR) << "Error closing " << filenames[fileIdx] << "\n"; + EXIT(EXIT_FAILURE); + } + if (lastChar != '\n') { + Debug(Debug::WARNING) << "Last byte is not a newline. We recompute with --createdb-mode 1\n"; + resetIncorrectNewline = true; + } + } + if (resetNotFile || resetIncorrectNewline) { par.createdbMode = Parameters::SEQUENCE_SPLIT_MODE_HARD; progress.reset(SIZE_MAX); hdrWriter.close(); @@ -196,22 +228,28 @@ int createdb(int argc, const char **argv, const Command& command) { } sampleCount++; } - if (par.createdbMode == Parameters::SEQUENCE_SPLIT_MODE_SOFT && e.multiline == true) { - Debug(Debug::WARNING) << "Multiline fasta can not be combined with --createdb-mode 0\n"; - Debug(Debug::WARNING) << "We recompute with --createdb-mode 1\n"; - par.createdbMode = Parameters::SEQUENCE_SPLIT_MODE_HARD; - progress.reset(SIZE_MAX); - hdrWriter.close(); - seqWriter.close(); - delete kseq; - if (fclose(source) != 0) { - Debug(Debug::ERROR) << "Cannot close file " << sourceFile << "\n"; - EXIT(EXIT_FAILURE); - } - for (size_t i = 0; i < shuffleSplits; ++i) { - sourceLookup[i].clear(); + if (par.createdbMode == Parameters::SEQUENCE_SPLIT_MODE_SOFT) { + if (e.newlineCount != 1) { + if (e.newlineCount == 0) { + Debug(Debug::WARNING) << "Fasta entry " << entries_num << " has no newline character\n"; + } else if (e.newlineCount > 1) { + Debug(Debug::WARNING) << "Multiline fasta can not be combined with --createdb-mode 0\n"; + } + Debug(Debug::WARNING) << "We recompute with --createdb-mode 1\n"; + par.createdbMode = Parameters::SEQUENCE_SPLIT_MODE_HARD; + progress.reset(SIZE_MAX); + hdrWriter.close(); + seqWriter.close(); + delete kseq; + if (fclose(source) != 0) { + Debug(Debug::ERROR) << "Cannot close file " << sourceFile << "\n"; + EXIT(EXIT_FAILURE); + } + for (size_t i = 0; i < shuffleSplits; ++i) { + sourceLookup[i].clear(); + } + goto redoComputation; } - goto redoComputation; } }