-
Notifications
You must be signed in to change notification settings - Fork 446
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Work-around for CIGARs with >64k operations in BAM files #560
Changes from 10 commits
71f06db
860ebdd
0227406
6cb3dcd
9d2d1a8
fce8639
f923193
1026c56
821c2ac
963091e
27bba42
79466f8
5cae643
08acf13
d57d155
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -352,6 +352,46 @@ int32_t bam_endpos(const bam1_t *b) | |
return b->core.pos + 1; | ||
} | ||
|
||
static int bam_tag2cigar(bam1_t *b) | ||
{ | ||
bam1_core_t *c = &b->core; | ||
uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, *cigar0, CG_len, fake_bytes; | ||
uint8_t *CG; | ||
|
||
// test where there is a real CIGAR in the CG tag to move | ||
if (c->n_cigar == 0 || c->tid < 0 || c->pos < 0) return 0; | ||
cigar0 = bam_get_cigar(b); | ||
if (bam_cigar_op(cigar0[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(cigar0[0]) != c->l_qseq) return 0; | ||
fake_bytes = c->n_cigar * 4; | ||
if ((CG = bam_aux_get(b, "CG")) == 0) return 0; // no CG tag | ||
if (CG[0] != 'B' || CG[1] != 'I') return 0; // not of type B,I | ||
CG_len = le_to_u32(CG + 2); | ||
if (CG_len == 0) return 0; // nothing to move | ||
|
||
// move from the CG tag to the right position | ||
cigar_st = (uint8_t*)cigar0 - b->data; | ||
c->n_cigar = CG_len; | ||
n_cigar4 = c->n_cigar * 4; | ||
CG_st = CG - b->data - 2; | ||
CG_en = CG_st + 8 + n_cigar4; | ||
b->l_data += n_cigar4 - fake_bytes; // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place | ||
if (b->m_data < b->l_data) { | ||
uint8_t *new_data; | ||
uint32_t new_max = b->l_data; | ||
kroundup32(new_max); | ||
new_data = (uint8_t*)realloc(b->data, new_max); | ||
if (new_data == 0) return -1; | ||
b->m_data = new_max, b->data = new_data; | ||
} | ||
memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes)); // insert c->n_cigar-fake_bytes empty space to make room | ||
memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR | ||
if (ori_len > CG_en) // move data after the CG tag | ||
memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en); | ||
b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4) | ||
b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)), 14, 5); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Given the main decoding loop has a bam_cigar2qlen check always performed, it makes sense to combine these into a (hypothetical) This can be done in the main loop, rather than |
||
return 1; | ||
} | ||
|
||
static inline int aux_type2size(uint8_t type) | ||
{ | ||
switch (type) { | ||
|
@@ -423,6 +463,7 @@ int bam_read1(BGZF *fp, bam1_t *b) | |
bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname) | ||
return -4; | ||
if (fp->is_be) swap_data(c, b->l_data, b->data, 0); | ||
if (bam_tag2cigar(b) < 0) return -4; | ||
|
||
// Sanity check for broken CIGAR alignments | ||
if (c->n_cigar > 0 && c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) | ||
|
@@ -440,15 +481,12 @@ int bam_write1(BGZF *fp, const bam1_t *b) | |
const bam1_core_t *c = &b->core; | ||
uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y; | ||
int i, ok; | ||
if (c->n_cigar >= 65536) { | ||
hts_log_error("Too many CIGAR operations (%d >= 64K for QNAME \"%s\")", c->n_cigar, bam_get_qname(b)); | ||
errno = EOVERFLOW; | ||
return -1; | ||
} | ||
if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR | ||
x[0] = c->tid; | ||
x[1] = c->pos; | ||
x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul); | ||
x[3] = (uint32_t)c->flag<<16 | c->n_cigar; | ||
if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 2; | ||
else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff); | ||
x[4] = c->l_qseq; | ||
x[5] = c->mtid; | ||
x[6] = c->mpos; | ||
|
@@ -464,7 +502,24 @@ int bam_write1(BGZF *fp, const bam1_t *b) | |
} | ||
if (ok) ok = (bgzf_write(fp, x, 32) >= 0); | ||
if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0); | ||
if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0); | ||
if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally | ||
if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0); | ||
} else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag | ||
uint8_t buf[8]; | ||
uint32_t cigar_st, cigar_en, cigar[2]; | ||
cigar_st = (uint8_t*)bam_get_cigar(b) - b->data; | ||
cigar_en = cigar_st + c->n_cigar * 4; | ||
cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP; | ||
cigar[1] = (uint32_t)bam_cigar2rlen(c->n_cigar, bam_get_cigar(b)) << 4 | BAM_CREF_SKIP; | ||
u32_to_le(cigar[0], buf); | ||
u32_to_le(cigar[1], buf + 4); | ||
if (ok) ok = (bgzf_write(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N | ||
if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR | ||
if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I | ||
u32_to_le(c->n_cigar, buf); | ||
if (ok) ok = (bgzf_write(fp, buf, 4) >= 0); // write the true CIGAR length | ||
if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR | ||
} | ||
if (fp->is_be) swap_data(c, b->l_data, b->data, 0); | ||
return ok? 4 + block_len : -1; | ||
} | ||
|
@@ -1162,6 +1217,7 @@ int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b) | |
} else _parse_err_param(1, "unrecognized type %c", type); | ||
} | ||
b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m; | ||
if (bam_tag2cigar(b) < 0) return -2; | ||
return 0; | ||
|
||
#undef _parse_warn | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi @lh3, I'm adding support for cigars in CG tags into BioJulia/BioAlignments.jl, and am currently implementing the test for the fake cigar, which is defined as
[k<<4|4,m<<4|3]
in the spec, where k is l_seq and m is the length of the reference in the alignment. I don't see a field I can say obviously corresponds to m in BAM records.In your C of bam_tag2cigar, if I understand it correctly, I see you check whether the first op is a softclip, and if it's length == l_seq:
if (bam_cigar_op(cigar0[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(cigar0[0]) != c->l_qseq) return 0;
But I don't see in the C where the second cigar op of "mN" is checked, is it therefore something that is a given if kS is present then mN will be present?
Thanks,
Ben.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There isn't a field directly in the BAM structure to define 'm' as it is the number of reference bases spanned by the sequence and not query (so not l_qseq). The corresponding code for writing it is also in sam.c:
You'll see the bam_cigar2rlen function is the appropriate code for working out 'm'.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jkbonfield Ah I see, so if I'm properly understanding, since
m
is computed withbam_cigar2rlen
, using the actual long cigar, then when reading in a BAM record, inbam_tag2cigar
, only the first cigar op is checked fork
andBAM_CSOFT_CLIP
here:Preciely because you can't yet compute
m
as the real cigar has not been read yet, in order to do so?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That would be the generous interpretation, but it probably boils down to not bothering to check. :-) Note it's not solely entire-soft-clip, but also the presence of the CG tag. The two combined are pretty specific, although it's not a bad idea to also validate that CG and cigar are consistent with each other.
Note the main reason for the existence of the m part of kSmN was so that tools that wanted to do coverage graphs could still use the minimal cigar string to derive the alignment end coordinate. The suggestion started with just kS and so the htslib implementation probably shows some of that history in how it is structured.