Skip to content

Commit

Permalink
Filter based on homopoly compressed length (if enabled); compute the …
Browse files Browse the repository at this point in the history
…length of a read when sequence is added to the read, instead of when the read is added to the store.
  • Loading branch information
brianwalenz committed Sep 5, 2020
1 parent 1241bb7 commit 258941d
Show file tree
Hide file tree
Showing 6 changed files with 232 additions and 103 deletions.
9 changes: 8 additions & 1 deletion src/pipelines/canu/SequenceStore.pm
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,10 @@ sub createSequenceStore ($@) {
print F " -bias " . getGlobal("readSamplingBias") . " \\\n";
}

if (getGlobal("homoPolyCompress") > 0) { # Tell sqStoreCreate that these reads should
print F " -homopolycompress \\\n"; # be homopoly compressed, and so length filtering
} # should be based on that length.

foreach my $iii (@inputs) {
if ($iii =~ m/^(.*)\0(.*)$/) {
my $tech = $1;
Expand Down Expand Up @@ -341,7 +345,10 @@ sub createSequenceStore ($@) {

createSequenceStoreMetaDataFiles($asm);

# Set or unset the homopolymer compression flag.
# Set or unset the homopolymer compression flag. This is also initially
# created by sqStoreCreate (also based on the homoPolyCompress flag) but
# we allow canu to explicitly change later. Though doing that isn't
# recommended.

if (defined(getGlobal("homoPolyCompress"))) {
touch("./$asm.seqStore/homopolymerCompression");
Expand Down
22 changes: 21 additions & 1 deletion src/stores/sqRead.H
Original file line number Diff line number Diff line change
Expand Up @@ -562,6 +562,12 @@ public:
_rawBases[Slen] = 0;

_rawBasesLen = Slen + 1; // Length INCLUDING NUL, remember?

assert(_rawU->sqReadSeq_valid() == false);
assert(_rawC->sqReadSeq_valid() == false);

_rawU->sqReadSeq_setLength(_rawBases, _rawBasesLen-1, false);
_rawC->sqReadSeq_setLength(_rawBases, _rawBasesLen-1, true);
};

void sqReadDataWriter_setCorrectedBases(const char *S, uint32 Slen) {
Expand All @@ -570,11 +576,25 @@ public:
memcpy(_corBases, S, sizeof(char) * Slen);
_corBases[Slen] = 0;

_corBasesLen = Slen + 1;
_corBasesLen = Slen + 1; // Length INCLUDING NUL, remember?

assert(_corU->sqReadSeq_valid() == false);
assert(_corC->sqReadSeq_valid() == false);

_corU->sqReadSeq_setLength(_corBases, _corBasesLen-1, false);
_corC->sqReadSeq_setLength(_corBases, _corBasesLen-1, true);
};

void sqReadDataWriter_writeBlob(writeBuffer *buffer);

uint32 sqReadDataWriter_getRawLength(bool compressed) {
return((compressed == false) ? _rawU->sqReadSeq_length() : _rawC->sqReadSeq_length());
};

uint32 sqReadDataWriter_getCorrectedLength(bool compressed) {
return((compressed == false) ? _corU->sqReadSeq_length() : _corC->sqReadSeq_length());
};

private:
sqReadMeta *_meta = nullptr; // Pointer to the read metadata.
sqReadSeq *_rawU = nullptr; // In sqStoreCreate, these pointers are
Expand Down
8 changes: 4 additions & 4 deletions src/stores/sqReadDataWriter.C
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,8 @@ sqReadDataWriter::sqReadDataWriter_writeBlob(writeBuffer *buffer) {
if ((_rawBases != NULL) && (_rawBases[0] != 0)) {
assert(_rawBasesLen > 0);

if ((_rawU) && (_rawU->sqReadSeq_valid() == false)) _rawU->sqReadSeq_setLength(_rawBases, _rawBasesLen-1, false);
if ((_rawC) && (_rawC->sqReadSeq_valid() == false)) _rawC->sqReadSeq_setLength(_rawBases, _rawBasesLen-1, true);
assert(_rawU->sqReadSeq_valid() == true); // setRawBases() should be setting this true.
assert(_rawC->sqReadSeq_valid() == true);

rseq = NULL;
rseq2Len = encode2bitSequence(rseq, _rawBases, _rawU->sqReadSeq_length());
Expand All @@ -87,8 +87,8 @@ sqReadDataWriter::sqReadDataWriter_writeBlob(writeBuffer *buffer) {
if ((_corBases != NULL) && (_corBases[0] != 0)) {
assert(_corBasesLen > 0);

if ((_corU) && (_corU->sqReadSeq_valid() == false)) _corU->sqReadSeq_setLength(_corBases, _corBasesLen-1, false);
if ((_corC) && (_corC->sqReadSeq_valid() == false)) _corC->sqReadSeq_setLength(_corBases, _corBasesLen-1, true);
assert(_corU->sqReadSeq_valid() == true); // setCorrectedBases should be setting this true.
assert(_corC->sqReadSeq_valid() == true);

cseq = NULL;
cseq2Len = encode2bitSequence(cseq, _corBases, _corU->sqReadSeq_length());
Expand Down
78 changes: 60 additions & 18 deletions src/stores/sqStore.C
Original file line number Diff line number Diff line change
Expand Up @@ -180,41 +180,54 @@ sqStore::sqStore_addEmptyLibrary(char const *name, sqLibrary_tech techType) {




// Allocate and return a new sqReadDataWriter that can be used to add a new
// read to the store. This function does NOT actually add the read to the
// store; the sqReadDataWriter is used to collect all the info about the
// read (read ID, read name, bases, quals, trim points) and then that object
// is added to the store after all info is added.
//
// Because the read isn't added until later, two consecutive calls to
// createEmptyRead() will result in both sqReadDataWriter objects referring
// to the same sqRead.
//
// (The reason for this annoyance is so that sqStoreCreate can test that the
// _homopoly_compressed_ length is big enough, and that length is only
// computed by sqReadDataWriter::setRawBases(). Thus, we need to populate
// the sqReadDataWriter object, test it, then discard it if too short.)
//
sqReadDataWriter *
sqStore::sqStore_addEmptyRead(sqLibrary *lib, const char *name) {
sqStore::sqStore_createEmptyRead(sqLibrary *lib, const char *name) {

assert(_info.sqInfo_lastReadID() < _readsAlloc);
assert(_mode != sqStore_readOnly);

// We reserve the zeroth read for "null". This is easy to accomplish
// here, just pre-increment the number of reads. However, we need to be sure
// to iterate up to and including _info.sqInfo_lastReadID().
// The zeroth read is "null". To get the next valid read ID, just
// add one to the current last read ID.

_info.sqInfo_addRead();
uint32 rID = _info.sqInfo_lastReadID() + 1;
uint32 lID = lib->sqLibrary_libraryID();

if (_readsAlloc <= _info.sqInfo_lastReadID()) {
uint32 newMax = _readsAlloc + _info.sqInfo_lastReadID() / 2;
// Grow the metadata arrays if they're too small.

setArraySize(_meta, _info.sqInfo_lastReadID(), _readsAlloc, newMax);
setArraySize(_rawU, _info.sqInfo_lastReadID(), _readsAlloc, newMax);
setArraySize(_rawC, _info.sqInfo_lastReadID(), _readsAlloc, newMax);
setArraySize(_corU, _info.sqInfo_lastReadID(), _readsAlloc, newMax);
setArraySize(_corC, _info.sqInfo_lastReadID(), _readsAlloc, newMax);
}
if (_readsAlloc <= rID) {
uint32 newMax = _readsAlloc + rID / 2;

// Initialize the new read.
setArraySize(_meta, rID, _readsAlloc, newMax);
setArraySize(_rawU, rID, _readsAlloc, newMax);
setArraySize(_rawC, rID, _readsAlloc, newMax);
setArraySize(_corU, rID, _readsAlloc, newMax);
setArraySize(_corC, rID, _readsAlloc, newMax);
}

uint32 rID = _info.sqInfo_lastReadID();
uint32 lID = lib->sqLibrary_libraryID();
// Initialize the metadata.

_meta[rID].sqReadMeta_initialize(rID, lID);
_rawU[rID].sqReadSeq_initialize();
_rawC[rID].sqReadSeq_initialize();
_corU[rID].sqReadSeq_initialize();
_corC[rID].sqReadSeq_initialize();

// With the read set up, set pointers in the readData. Whatever data is in there can stay.
// Make a new writer object, and initialize what we can.

sqReadDataWriter *rdw = new sqReadDataWriter(&_meta[rID],
&_rawU[rID],
Expand All @@ -229,6 +242,35 @@ sqStore::sqStore_addEmptyRead(sqLibrary *lib, const char *name) {



// Add a fully initialized read in sqReadDataWriter to the store.
//
// This is inherently dangerous. It assumes that the supplied sRDW is
// actually for the next read.
//
// If we're adding a new read - the ID _must_ be one more than the number
// of reads in the store - increment the number of reads in the metadata.
//
// If we're trying to add/modify a read that doesn't exist, blow up.
//
// Otherwise, we're modifying an existing read, and need to only write the
// read data.
//
void
sqStore::sqStore_addRead(sqReadDataWriter *rdw) {

if (rdw->_meta->sqRead_readID() == _info.sqInfo_lastReadID() + 1)
_info.sqInfo_addRead();

if (rdw->_meta->sqRead_readID() > _info.sqInfo_lastReadID()) {
fprintf(stderr, "ERROR: Attempt to add/modify read %u in a store with only %u reads.\n",
rdw->_meta->sqRead_readID(), _info.sqInfo_lastReadID());
assert(0);
}

_blobWriter->writeData(rdw);
}



void
sqStore::sqStore_setIgnored(uint32 id,
Expand Down
18 changes: 12 additions & 6 deletions src/stores/sqStore.H
Original file line number Diff line number Diff line change
Expand Up @@ -290,16 +290,22 @@ public:
bool sqStore_isTrimmedRead(uint32 id, sqRead_which w=sqRead_defaultVersion);

// For use ONLY by sqStoreCreate, to add new libraries and reads to a
// store. The first two allocate a new metadata object in the store,
// while the last loads read sequence data.
// store.
//
// addEmptyLibrary() adds a new library to the store.
//
// createEmptyRead() allocates a sRDW, but does not add a new read to
// the store. It must be followed by addRead() to load the data to the
// store. Of note, the sRDW is assigned a read ID for the next -
// currently non-existent - read in the store. Calling
// createEmptyRead() twice with no addRead() between will create two
// reads with the same ID and Bad Things will result.
//
public:
sqLibrary *sqStore_addEmptyLibrary(char const *name, sqLibrary_tech techType);
sqReadDataWriter *sqStore_addEmptyRead(sqLibrary *lib, const char *name);

void sqStore_addRead(sqReadDataWriter *rdw) {
_blobWriter->writeData(rdw);
};
sqReadDataWriter *sqStore_createEmptyRead(sqLibrary *lib, const char *name);
void sqStore_addRead(sqReadDataWriter *rdw);

// Used when initially loading reads into seqStore, and when loading
// trimmed reads. It sets the ignore flag in both the normal and
Expand Down
Loading

0 comments on commit 258941d

Please sign in to comment.