Skip to content

Commit

Permalink
Fix FASTA input not ending with a newline resulting in invalid sequen…
Browse files Browse the repository at this point in the history
…ce db with --createdb-mode 1 (#617)
  • Loading branch information
milot-mirdita committed Oct 12, 2022
1 parent a81d9e7 commit 28b0088
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 25 deletions.
4 changes: 2 additions & 2 deletions lib/ksw2/kseq.h
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ typedef struct __kstring_t {
seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
} \
seq->multiline = (ks->newline > 1); \
seq->newlineCount = ks->newline; \
if (c == '>' || c == '@') { /* the first header char has been read */ \
seq->last_char = c; \
} \
Expand Down Expand Up @@ -238,7 +238,7 @@ typedef struct __kstring_t {
int last_char; \
size_t headerOffset; \
size_t sequenceOffset; \
bool multiline; \
int newlineCount; \
kstream_t *f; \
} kseq_t;

Expand Down
8 changes: 4 additions & 4 deletions src/commons/KSeqWrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ bool KSeqFile::ReadEntry() {
return false;
entry.headerOffset = s->headerOffset;
entry.sequenceOffset = s->sequenceOffset;
entry.multiline = s->multiline;
entry.newlineCount = s->newlineCount;
entry.name = s->name;
entry.comment = s->comment;
entry.sequence = s->seq;
Expand Down Expand Up @@ -100,7 +100,7 @@ bool KSeqGzip::ReadEntry() {
entry.qual = s->qual;
entry.headerOffset = 0;
entry.sequenceOffset = 0;
entry.multiline = s->multiline;
entry.newlineCount = s->newlineCount;

return true;
}
Expand Down Expand Up @@ -145,7 +145,7 @@ bool KSeqBzip::ReadEntry() {
entry.qual = s->qual;
entry.headerOffset = 0;
entry.sequenceOffset = 0;
entry.multiline = s->multiline;
entry.newlineCount = s->newlineCount;

return true;
}
Expand Down Expand Up @@ -214,7 +214,7 @@ bool KSeqBuffer::ReadEntry() {
return false;
entry.headerOffset = s->headerOffset;
entry.sequenceOffset = s->sequenceOffset;
entry.multiline = s->multiline;
entry.newlineCount = s->newlineCount;
entry.name = s->name;
entry.comment = s->comment;
entry.sequence = s->seq;
Expand Down
2 changes: 1 addition & 1 deletion src/commons/KSeqWrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class KSeqWrapper {
kstring_t qual;
size_t headerOffset;
size_t sequenceOffset;
bool multiline;
int newlineCount;
} entry;

enum kseq_type {
Expand Down
74 changes: 56 additions & 18 deletions src/util/createdb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,9 +131,41 @@ int createdb(int argc, const char **argv, const Command& command) {
} else {
kseq = KSeqFactory(filenames[fileIdx].c_str());
}
if (par.createdbMode == Parameters::SEQUENCE_SPLIT_MODE_SOFT && kseq->type != KSeqWrapper::KSEQ_FILE) {
Debug(Debug::WARNING) << "Only uncompressed fasta files can be used with --createdb-mode 0.\n";
Debug(Debug::WARNING) << "We recompute with --createdb-mode 1.\n";

bool resetNotFile = par.createdbMode == Parameters::SEQUENCE_SPLIT_MODE_SOFT && kseq->type != KSeqWrapper::KSEQ_FILE;
if (resetNotFile) {
Debug(Debug::WARNING) << "Only uncompressed fasta files can be used with --createdb-mode 0\n";
Debug(Debug::WARNING) << "We recompute with --createdb-mode 1\n";
}

bool resetIncorrectNewline = false;
if (par.createdbMode == Parameters::SEQUENCE_SPLIT_MODE_SOFT && kseq->type == KSeqWrapper::KSEQ_FILE) {
// get last byte from filenames[fileIdx].c_str()
FILE* fp = fopen(filenames[fileIdx].c_str(), "rb");
if (fp == NULL) {
Debug(Debug::ERROR) << "Cannot open file " << filenames[fileIdx] << "\n";
EXIT(EXIT_FAILURE);
}
int res = fseek(fp, -1, SEEK_END);
if (res != 0) {
Debug(Debug::ERROR) << "Cannot seek at the end of file " << filenames[fileIdx] << "\n";
EXIT(EXIT_FAILURE);
}
int lastChar = fgetc(fp);
if (lastChar == EOF) {
Debug(Debug::ERROR) << "Error reading from " << filenames[fileIdx] << "\n";
EXIT(EXIT_FAILURE);
}
if (fclose(fp) != 0) {
Debug(Debug::ERROR) << "Error closing " << filenames[fileIdx] << "\n";
EXIT(EXIT_FAILURE);
}
if (lastChar != '\n') {
Debug(Debug::WARNING) << "Last byte is not a newline. We recompute with --createdb-mode 1\n";
resetIncorrectNewline = true;
}
}
if (resetNotFile || resetIncorrectNewline) {
par.createdbMode = Parameters::SEQUENCE_SPLIT_MODE_HARD;
progress.reset(SIZE_MAX);
hdrWriter.close();
Expand Down Expand Up @@ -196,22 +228,28 @@ int createdb(int argc, const char **argv, const Command& command) {
}
sampleCount++;
}
if (par.createdbMode == Parameters::SEQUENCE_SPLIT_MODE_SOFT && e.multiline == true) {
Debug(Debug::WARNING) << "Multiline fasta can not be combined with --createdb-mode 0\n";
Debug(Debug::WARNING) << "We recompute with --createdb-mode 1\n";
par.createdbMode = Parameters::SEQUENCE_SPLIT_MODE_HARD;
progress.reset(SIZE_MAX);
hdrWriter.close();
seqWriter.close();
delete kseq;
if (fclose(source) != 0) {
Debug(Debug::ERROR) << "Cannot close file " << sourceFile << "\n";
EXIT(EXIT_FAILURE);
}
for (size_t i = 0; i < shuffleSplits; ++i) {
sourceLookup[i].clear();
if (par.createdbMode == Parameters::SEQUENCE_SPLIT_MODE_SOFT) {
if (e.newlineCount != 1) {
if (e.newlineCount == 0) {
Debug(Debug::WARNING) << "Fasta entry " << entries_num << " has no newline character\n";
} else if (e.newlineCount > 1) {
Debug(Debug::WARNING) << "Multiline fasta can not be combined with --createdb-mode 0\n";
}
Debug(Debug::WARNING) << "We recompute with --createdb-mode 1\n";
par.createdbMode = Parameters::SEQUENCE_SPLIT_MODE_HARD;
progress.reset(SIZE_MAX);
hdrWriter.close();
seqWriter.close();
delete kseq;
if (fclose(source) != 0) {
Debug(Debug::ERROR) << "Cannot close file " << sourceFile << "\n";
EXIT(EXIT_FAILURE);
}
for (size_t i = 0; i < shuffleSplits; ++i) {
sourceLookup[i].clear();
}
goto redoComputation;
}
goto redoComputation;
}
}

Expand Down

0 comments on commit 28b0088

Please sign in to comment.