Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Work-around for CIGARs with >64k operations in BAM files #560

Closed
wants to merge 15 commits into from
109 changes: 95 additions & 14 deletions sam.c
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,18 @@ bam1_t *bam_dup1(const bam1_t *bsrc)
return bam_copy1(bdst, bsrc);
}

void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar, int *rlen, int *qlen)
{
int k;
*rlen = *qlen = 0;
for (k = 0; k < n_cigar; ++k) {
int type = bam_cigar_type(bam_cigar_op(cigar[k]));
int len = bam_cigar_oplen(cigar[k]);
if (type & 1) *qlen += len;
if (type & 2) *rlen += len;
}
}

int bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
{
int k, l;
Expand All @@ -352,6 +364,49 @@ int32_t bam_endpos(const bam1_t *b)
return b->core.pos + 1;
}

static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG
{
bam1_core_t *c = &b->core;
uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, *cigar0, CG_len, fake_bytes;
uint8_t *CG;

// test where there is a real CIGAR in the CG tag to move
Copy link

@TransGirlCodes TransGirlCodes Jun 1, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @lh3, I'm adding support for cigars in CG tags into BioJulia/BioAlignments.jl, and am currently implementing the test for the fake cigar, which is defined as [k<<4|4,m<<4|3] in the spec, where k is l_seq and m is the length of the reference in the alignment. I don't see a field I can say obviously corresponds to m in BAM records.

In your C of bam_tag2cigar, if I understand it correctly, I see you check whether the first op is a softclip, and if it's length == l_seq: if (bam_cigar_op(cigar0[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(cigar0[0]) != c->l_qseq) return 0;
But I don't see in the C where the second cigar op of "mN" is checked, is it therefore something that is a given if kS is present then mN will be present?

Thanks,
Ben.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There isn't a field directly in the BAM structure to define 'm' as it is the number of reference bases spanned by the sequence and not query (so not l_qseq). The corresponding code for writing it is also in sam.c:

        cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP;
        cigar[1] = (uint32_t)bam_cigar2rlen(c->n_cigar, bam_get_cigar(b)) << 4 | BAM_CREF_SKIP;

You'll see the bam_cigar2rlen function is the appropriate code for working out 'm'.

Copy link

@TransGirlCodes TransGirlCodes Jun 1, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jkbonfield Ah I see, so if I'm properly understanding, since m is computed with bam_cigar2rlen, using the actual long cigar, then when reading in a BAM record, in bam_tag2cigar, only the first cigar op is checked for k and BAM_CSOFT_CLIP here:

// test where there is a real CIGAR in the CG tag to move
    if (c->n_cigar == 0 || c->tid < 0 || c->pos < 0) return 0;
    cigar0 = bam_get_cigar(b);
    if (bam_cigar_op(cigar0[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(cigar0[0]) != c->l_qseq) return 0;
    fake_bytes = c->n_cigar * 4;

Preciely because you can't yet compute m as the real cigar has not been read yet, in order to do so?

Copy link
Contributor

@jkbonfield jkbonfield Jun 1, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That would be the generous interpretation, but it probably boils down to not bothering to check. :-) Note it's not solely entire-soft-clip, but also the presence of the CG tag. The two combined are pretty specific, although it's not a bad idea to also validate that CG and cigar are consistent with each other.

Note the main reason for the existence of the m part of kSmN was so that tools that wanted to do coverage graphs could still use the minimal cigar string to derive the alignment end coordinate. The suggestion started with just kS and so the htslib implementation probably shows some of that history in how it is structured.

if (c->n_cigar == 0 || c->tid < 0 || c->pos < 0) return 0;
cigar0 = bam_get_cigar(b);
if (bam_cigar_op(cigar0[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(cigar0[0]) != c->l_qseq) return 0;
fake_bytes = c->n_cigar * 4;
if ((CG = bam_aux_get(b, "CG")) == 0) return 0; // no CG tag
if (CG[0] != 'B' || CG[1] != 'I') return 0; // not of type B,I
CG_len = le_to_u32(CG + 2);
if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0; // don't move if the real CIGAR length is shorter than the fake cigar length

// move from the CG tag to the right position
cigar_st = (uint8_t*)cigar0 - b->data;
c->n_cigar = CG_len;
n_cigar4 = c->n_cigar * 4;
CG_st = CG - b->data - 2;
CG_en = CG_st + 8 + n_cigar4;
b->l_data = b->l_data - fake_bytes + n_cigar4; // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place
if (b->m_data < b->l_data) {
uint8_t *new_data;
uint32_t new_max = b->l_data;
kroundup32(new_max);
new_data = (uint8_t*)realloc(b->data, new_max);
if (new_data == 0) return -1;
b->m_data = new_max, b->data = new_data;
}
memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes)); // insert c->n_cigar-fake_bytes empty space to make room
memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR
if (ori_len > CG_en) // move data after the CG tag
memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en);
b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4)
if (recal_bin)
b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)), 14, 5);
if (give_warning)
hts_log_error("%s encodes a CIGAR with %d operators at the CG tag", bam_get_qname(b), c->n_cigar);
return 1;
}

static inline int aux_type2size(uint8_t type)
{
switch (type) {
Expand Down Expand Up @@ -423,13 +478,19 @@ int bam_read1(BGZF *fp, bam1_t *b)
bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
return -4;
if (fp->is_be) swap_data(c, b->l_data, b->data, 0);

// Sanity check for broken CIGAR alignments
if (c->n_cigar > 0 && c->l_qseq > 0 && !(c->flag & BAM_FUNMAP)
&& bam_cigar2qlen(c->n_cigar, bam_get_cigar(b)) != c->l_qseq) {
hts_log_error("CIGAR and query sequence lengths differ for %s",
bam_get_qname(b));
if (bam_tag2cigar(b, 0, 0) < 0)
return -4;

if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency
int rlen, qlen;
bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen);
b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5);
// Sanity check for broken CIGAR alignments
if (c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) && qlen != c->l_qseq) {
hts_log_error("CIGAR and query sequence lengths differ for %s",
bam_get_qname(b));
return -4;
}
}

return 4 + block_len;
Expand All @@ -440,15 +501,12 @@ int bam_write1(BGZF *fp, const bam1_t *b)
const bam1_core_t *c = &b->core;
uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y;
int i, ok;
if (c->n_cigar >= 65536) {
hts_log_error("Too many CIGAR operations (%d >= 64K for QNAME \"%s\")", c->n_cigar, bam_get_qname(b));
errno = EOVERFLOW;
return -1;
}
if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR
x[0] = c->tid;
x[1] = c->pos;
x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul);
x[3] = (uint32_t)c->flag<<16 | c->n_cigar;
if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 2;
else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff);
x[4] = c->l_qseq;
x[5] = c->mtid;
x[6] = c->mpos;
Expand All @@ -464,7 +522,24 @@ int bam_write1(BGZF *fp, const bam1_t *b)
}
if (ok) ok = (bgzf_write(fp, x, 32) >= 0);
if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0);
if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally
if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
} else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag
uint8_t buf[8];
uint32_t cigar_st, cigar_en, cigar[2];
cigar_st = (uint8_t*)bam_get_cigar(b) - b->data;
cigar_en = cigar_st + c->n_cigar * 4;
cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP;
cigar[1] = (uint32_t)bam_cigar2rlen(c->n_cigar, bam_get_cigar(b)) << 4 | BAM_CREF_SKIP;
u32_to_le(cigar[0], buf);
u32_to_le(cigar[1], buf + 4);
if (ok) ok = (bgzf_write(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N
if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR
if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I
u32_to_le(c->n_cigar, buf);
if (ok) ok = (bgzf_write(fp, buf, 4) >= 0); // write the true CIGAR length
if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR
}
if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
return ok? 4 + block_len : -1;
}
Expand Down Expand Up @@ -577,10 +652,11 @@ static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg,
htsFile *fp = fpv;
bam1_t *b = bv;
int ret = cram_get_bam_seq(fp->fp.cram, &b);
if (bam_tag2cigar(b, 1, 1) < 0)
return -2;
*tid = b->core.tid;
*beg = b->core.pos;
*end = bam_endpos(b);

return ret >= 0
? ret
: (cram_eof(fp->fp.cram) ? -1 : -2);
Expand Down Expand Up @@ -651,6 +727,8 @@ static int sam_bam_cram_readrec(BGZF *bgzfp, void *fpv, void *bv, int *tid, int
case bam: return bam_read1(bgzfp, b);
case cram: {
int ret = cram_get_bam_seq(fp->fp.cram, &b);
if (bam_tag2cigar(b, 1, 1) < 0)
return -2;
return ret >= 0
? ret
: (cram_eof(fp->fp.cram) ? -1 : -2);
Expand Down Expand Up @@ -1233,6 +1311,8 @@ int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b)
} else _parse_err_param(1, "unrecognized type %c", type);
}
b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m;
if (bam_tag2cigar(b, 1, 1) < 0)
return -2;
return 0;

#undef _parse_warn
Expand Down Expand Up @@ -1261,6 +1341,7 @@ int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b)

case cram: {
int ret = cram_get_bam_seq(fp->fp.cram, &b);
bam_tag2cigar(b, 1, 1);
return ret >= 0
? ret
: (cram_eof(fp->fp.cram) ? -1 : -2);
Expand Down