From 71f06db720b81e14076e1af5bf97708f16c11a2b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 6 Jul 2017 11:20:10 -0400 Subject: [PATCH 01/12] support CIGARs with >64k operations See also samtools/hts-spec#40. --- sam.c | 51 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/sam.c b/sam.c index 5e9c20de2..f1cd8800c 100644 --- a/sam.c +++ b/sam.c @@ -352,6 +352,33 @@ int32_t bam_endpos(const bam1_t *b) return b->core.pos + 1; } +static void bam_tag2cigar(bam1_t *b) +{ + bam1_core_t *c = &b->core; + uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data; + uint8_t *CG; + if (c->n_cigar > 0 || !(c->flag & BAM_FUNMAP) || c->tid < 0 || c->pos < 0) return; + if ((CG = bam_aux_get(b, "CG")) == 0) return; + if (CG[0] != 'B' || CG[1] != 'I') return; + cigar_st = (uint8_t*)bam_get_cigar(b) - b->data; + c->n_cigar = *(uint32_t*)(CG + 2); + n_cigar4 = c->n_cigar * 4; + CG_st = CG - b->data - 2; + CG_en = CG_st + 8 + n_cigar4; + b->l_data += n_cigar4; + if (b->m_data < b->l_data) { + b->m_data = b->l_data; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + } + memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st, ori_len - cigar_st); // make room for the real CIGAR + memcpy(b->data + cigar_st, b->data + n_cigar4 + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place + if (ori_len > CG_en) + memmove(b->data + CG_st + n_cigar4, b->data + CG_en + n_cigar4, ori_len - CG_en); + b->l_data -= n_cigar4 + 8; + c->flag &= ~BAM_FUNMAP; +} + static inline int aux_type2size(uint8_t type) { switch (type) { @@ -421,6 +448,7 @@ int bam_read1(BGZF *fp, bam1_t *b) bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname) return -4; if (fp->is_be) swap_data(c, b->l_data, b->data, 0); + bam_tag2cigar(b); return 4 + block_len; } @@ -429,15 +457,12 @@ int bam_write1(BGZF *fp, const bam1_t *b) const bam1_core_t *c = &b->core; uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y; int i, ok; - if (c->n_cigar >= 65536) { - hts_log_error("Too many CIGAR operations (%d >= 64K for QNAME \"%s\")", c->n_cigar, bam_get_qname(b)); - errno = EOVERFLOW; - return -1; - } + if (c->n_cigar > 0xffff) block_len += 8; // "8" for "CGBI" plus 4-byte tag length x[0] = c->tid; x[1] = c->pos; x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul); - x[3] = (uint32_t)c->flag<<16 | c->n_cigar; + if (c->n_cigar > 0xffff) x[3] = (uint32_t)(c->flag | BAM_FUNMAP) << 16; + else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff); x[4] = c->l_qseq; x[5] = c->mtid; x[6] = c->mpos; @@ -453,7 +478,18 @@ int bam_write1(BGZF *fp, const bam1_t *b) } if (ok) ok = (bgzf_write(fp, x, 32) >= 0); if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0); - if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0); + if (c->n_cigar <= 0xffff) { + if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0); + } else { + uint32_t cigar_st, cigar_en; + cigar_st = (uint8_t*)bam_get_cigar(b) - b->data; + cigar_en = cigar_st + c->n_cigar * 4; + if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR + memcpy(&x[0], "CGBI", 4); + x[1] = c->n_cigar; + if (ok) ok = (bgzf_write(fp, x, 8) >= 0); // write CG:B:I and length + if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR + } if (fp->is_be) swap_data(c, b->l_data, b->data, 0); return ok? 4 + block_len : -1; } @@ -1143,6 +1179,7 @@ int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b) } else _parse_err_param(1, "unrecognized type %c", type); } b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m; + bam_tag2cigar(b); return 0; #undef _parse_warn From 860ebdd796c0c19bb7bf96f476b99a292f796480 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 6 Jul 2017 11:49:36 -0400 Subject: [PATCH 02/12] fixed wrong byte-order on big endian --- sam.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/sam.c b/sam.c index f1cd8800c..d649b2e62 100644 --- a/sam.c +++ b/sam.c @@ -485,9 +485,13 @@ int bam_write1(BGZF *fp, const bam1_t *b) cigar_st = (uint8_t*)bam_get_cigar(b) - b->data; cigar_en = cigar_st + c->n_cigar * 4; if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR - memcpy(&x[0], "CGBI", 4); - x[1] = c->n_cigar; - if (ok) ok = (bgzf_write(fp, x, 8) >= 0); // write CG:B:I and length + if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I + if (fp->is_be) { + y = c->n_cigar; + if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0); + } else { + if (ok) ok = (bgzf_write(fp, &c->n_cigar, 4) >= 0); + } if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR } if (fp->is_be) swap_data(c, b->l_data, b->data, 0); From 02274067ab222c4197869fd2bfaf8162e8b0195f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 6 Jul 2017 11:51:00 -0400 Subject: [PATCH 03/12] changed TAB to 4 spaces --- sam.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sam.c b/sam.c index d649b2e62..fa97e9357 100644 --- a/sam.c +++ b/sam.c @@ -485,13 +485,13 @@ int bam_write1(BGZF *fp, const bam1_t *b) cigar_st = (uint8_t*)bam_get_cigar(b) - b->data; cigar_en = cigar_st + c->n_cigar * 4; if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR - if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I - if (fp->is_be) { - y = c->n_cigar; - if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0); - } else { - if (ok) ok = (bgzf_write(fp, &c->n_cigar, 4) >= 0); - } + if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I + if (fp->is_be) { + y = c->n_cigar; + if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0); + } else { + if (ok) ok = (bgzf_write(fp, &c->n_cigar, 4) >= 0); + } if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR } if (fp->is_be) swap_data(c, b->l_data, b->data, 0); From 6cb3dcd48144553c613e065a2b41423f7511fb13 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 6 Jul 2017 12:28:02 -0400 Subject: [PATCH 04/12] check realloc return; fixed an endianness issue --- sam.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/sam.c b/sam.c index fa97e9357..fdef51aa3 100644 --- a/sam.c +++ b/sam.c @@ -352,24 +352,27 @@ int32_t bam_endpos(const bam1_t *b) return b->core.pos + 1; } -static void bam_tag2cigar(bam1_t *b) +static int bam_tag2cigar(bam1_t *b) { bam1_core_t *c = &b->core; uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data; uint8_t *CG; - if (c->n_cigar > 0 || !(c->flag & BAM_FUNMAP) || c->tid < 0 || c->pos < 0) return; - if ((CG = bam_aux_get(b, "CG")) == 0) return; - if (CG[0] != 'B' || CG[1] != 'I') return; + if (c->n_cigar > 0 || !(c->flag & BAM_FUNMAP) || c->tid < 0 || c->pos < 0) return 0; + if ((CG = bam_aux_get(b, "CG")) == 0) return 0; + if (CG[0] != 'B' || CG[1] != 'I') return 0; cigar_st = (uint8_t*)bam_get_cigar(b) - b->data; - c->n_cigar = *(uint32_t*)(CG + 2); + c->n_cigar = le_to_u32(CG + 2); n_cigar4 = c->n_cigar * 4; CG_st = CG - b->data - 2; CG_en = CG_st + 8 + n_cigar4; b->l_data += n_cigar4; if (b->m_data < b->l_data) { - b->m_data = b->l_data; - kroundup32(b->m_data); - b->data = (uint8_t*)realloc(b->data, b->m_data); + uint8_t *new_data; + uint32_t new_max = b->l_data; + kroundup32(new_max); + new_data = (uint8_t*)realloc(b->data, new_max); + if (new_data == 0) return -1; + b->m_data = new_max, b->data = new_data; } memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st, ori_len - cigar_st); // make room for the real CIGAR memcpy(b->data + cigar_st, b->data + n_cigar4 + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place @@ -377,6 +380,7 @@ static void bam_tag2cigar(bam1_t *b) memmove(b->data + CG_st + n_cigar4, b->data + CG_en + n_cigar4, ori_len - CG_en); b->l_data -= n_cigar4 + 8; c->flag &= ~BAM_FUNMAP; + return 1; } static inline int aux_type2size(uint8_t type) @@ -448,7 +452,7 @@ int bam_read1(BGZF *fp, bam1_t *b) bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname) return -4; if (fp->is_be) swap_data(c, b->l_data, b->data, 0); - bam_tag2cigar(b); + if (bam_tag2cigar(b) < 0) return -4; return 4 + block_len; } @@ -1183,7 +1187,7 @@ int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b) } else _parse_err_param(1, "unrecognized type %c", type); } b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m; - bam_tag2cigar(b); + if (bam_tag2cigar(b) < 0) return -2; return 0; #undef _parse_warn From 9d2d1a83c3a9ced1c091aecae809a94cd82eaf33 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 13 Jul 2017 17:48:26 -0400 Subject: [PATCH 05/12] Long-cigar solution with soft-clipping --- sam.c | 55 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/sam.c b/sam.c index fdef51aa3..68d27b81d 100644 --- a/sam.c +++ b/sam.c @@ -355,17 +355,25 @@ int32_t bam_endpos(const bam1_t *b) static int bam_tag2cigar(bam1_t *b) { bam1_core_t *c = &b->core; - uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data; + uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, *cigar0, CG_len; uint8_t *CG; - if (c->n_cigar > 0 || !(c->flag & BAM_FUNMAP) || c->tid < 0 || c->pos < 0) return 0; - if ((CG = bam_aux_get(b, "CG")) == 0) return 0; - if (CG[0] != 'B' || CG[1] != 'I') return 0; - cigar_st = (uint8_t*)bam_get_cigar(b) - b->data; - c->n_cigar = le_to_u32(CG + 2); + + // test where there is a real CIGAR in the CG tag to move + if (c->n_cigar != 1 || c->tid < 0 || c->pos < 0) return 0; + cigar0 = bam_get_cigar(b); + if (bam_cigar_op(cigar0[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(cigar0[0]) != c->l_qseq) return 0; + if ((CG = bam_aux_get(b, "CG")) == 0) return 0; // no CG tag + if (CG[0] != 'B' || CG[1] != 'I') return 0; // not of type B,I + CG_len = le_to_u32(CG + 2); + if (CG_len == 0) return 0; // nothing to move + + // move from the CG tag to the right position + cigar_st = (uint8_t*)cigar0 - b->data; + c->n_cigar = CG_len; n_cigar4 = c->n_cigar * 4; CG_st = CG - b->data - 2; CG_en = CG_st + 8 + n_cigar4; - b->l_data += n_cigar4; + b->l_data += n_cigar4 - 4; // we need (c->n_cigar-1)*4 bytes to swap CIGAR to the right place if (b->m_data < b->l_data) { uint8_t *new_data; uint32_t new_max = b->l_data; @@ -374,12 +382,11 @@ static int bam_tag2cigar(bam1_t *b) if (new_data == 0) return -1; b->m_data = new_max, b->data = new_data; } - memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st, ori_len - cigar_st); // make room for the real CIGAR - memcpy(b->data + cigar_st, b->data + n_cigar4 + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place - if (ori_len > CG_en) - memmove(b->data + CG_st + n_cigar4, b->data + CG_en + n_cigar4, ori_len - CG_en); - b->l_data -= n_cigar4 + 8; - c->flag &= ~BAM_FUNMAP; + memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + 4, ori_len - (cigar_st + 4)); // insert 4*(c->n_cigar-1) empty space to make room + memcpy(b->data + cigar_st, b->data + (n_cigar4 - 4) + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place; -4 for the fake CIGAR + if (ori_len > CG_en) // move data after the CG tag + memmove(b->data + CG_st + n_cigar4 - 4, b->data + CG_en + n_cigar4 - 4, ori_len - CG_en); + b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4) return 1; } @@ -461,11 +468,11 @@ int bam_write1(BGZF *fp, const bam1_t *b) const bam1_core_t *c = &b->core; uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y; int i, ok; - if (c->n_cigar > 0xffff) block_len += 8; // "8" for "CGBI" plus 4-byte tag length + if (c->n_cigar > 0xffff) block_len += 12; // "12" for "CGBI", 4-byte tag length and 4-byte fake CIGAR x[0] = c->tid; x[1] = c->pos; x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul); - if (c->n_cigar > 0xffff) x[3] = (uint32_t)(c->flag | BAM_FUNMAP) << 16; + if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 1; else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff); x[4] = c->l_qseq; x[5] = c->mtid; @@ -482,20 +489,20 @@ int bam_write1(BGZF *fp, const bam1_t *b) } if (ok) ok = (bgzf_write(fp, x, 32) >= 0); if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0); - if (c->n_cigar <= 0xffff) { + if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0); - } else { - uint32_t cigar_st, cigar_en; + } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag + uint8_t buf[4]; + uint32_t cigar_st, cigar_en, cigar1; cigar_st = (uint8_t*)bam_get_cigar(b) - b->data; cigar_en = cigar_st + c->n_cigar * 4; + cigar1 = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP; + u32_to_le(cigar1, buf); + if (ok) ok = (bgzf_write(fp, buf, 4) >= 0); // write cigar: S if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I - if (fp->is_be) { - y = c->n_cigar; - if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0); - } else { - if (ok) ok = (bgzf_write(fp, &c->n_cigar, 4) >= 0); - } + u32_to_le(c->n_cigar, buf); + if (ok) ok = (bgzf_write(fp, buf, 4) >= 0); // write the true CIGAR length if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR } if (fp->is_be) swap_data(c, b->l_data, b->data, 0); From f923193f3fd6165d22901663061e303a4a13f771 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 18 Oct 2017 08:22:55 -0400 Subject: [PATCH 06/12] recompute bin for long-cigar records because older tools may put a wrong bin in BAM. Although htslib does not care, 3rd-party tools may have problems. --- sam.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sam.c b/sam.c index b26c7e0e8..c2990418e 100644 --- a/sam.c +++ b/sam.c @@ -387,6 +387,7 @@ static int bam_tag2cigar(bam1_t *b) if (ori_len > CG_en) // move data after the CG tag memmove(b->data + CG_st + n_cigar4 - 4, b->data + CG_en + n_cigar4 - 4, ori_len - CG_en); b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4) + b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)), 14, 5); return 1; } From 1026c566ec9cfa3fb2c8bc9ec51cbf98e496c80d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 18 Oct 2017 09:49:24 -0400 Subject: [PATCH 07/12] convert TAB to 4 SPACEs --- sam.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sam.c b/sam.c index c2990418e..a1c923e6e 100644 --- a/sam.c +++ b/sam.c @@ -387,7 +387,7 @@ static int bam_tag2cigar(bam1_t *b) if (ori_len > CG_en) // move data after the CG tag memmove(b->data + CG_st + n_cigar4 - 4, b->data + CG_en + n_cigar4 - 4, ori_len - CG_en); b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4) - b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)), 14, 5); + b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)), 14, 5); return 1; } From 821c2ac405d48dcf2b877c0861509c463f213fa2 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 2 Nov 2017 22:27:40 -0400 Subject: [PATCH 08/12] support both fake cigar xS and xSyN --- sam.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/sam.c b/sam.c index a1c923e6e..c60733927 100644 --- a/sam.c +++ b/sam.c @@ -355,13 +355,14 @@ int32_t bam_endpos(const bam1_t *b) static int bam_tag2cigar(bam1_t *b) { bam1_core_t *c = &b->core; - uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, *cigar0, CG_len; + uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, *cigar0, CG_len, fake_bytes; uint8_t *CG; // test where there is a real CIGAR in the CG tag to move - if (c->n_cigar != 1 || c->tid < 0 || c->pos < 0) return 0; + if (c->n_cigar == 0 || c->tid < 0 || c->pos < 0) return 0; cigar0 = bam_get_cigar(b); if (bam_cigar_op(cigar0[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(cigar0[0]) != c->l_qseq) return 0; + fake_bytes = c->n_cigar * 4; if ((CG = bam_aux_get(b, "CG")) == 0) return 0; // no CG tag if (CG[0] != 'B' || CG[1] != 'I') return 0; // not of type B,I CG_len = le_to_u32(CG + 2); @@ -373,7 +374,7 @@ static int bam_tag2cigar(bam1_t *b) n_cigar4 = c->n_cigar * 4; CG_st = CG - b->data - 2; CG_en = CG_st + 8 + n_cigar4; - b->l_data += n_cigar4 - 4; // we need (c->n_cigar-1)*4 bytes to swap CIGAR to the right place + b->l_data += n_cigar4 - fake_bytes; // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place if (b->m_data < b->l_data) { uint8_t *new_data; uint32_t new_max = b->l_data; @@ -382,10 +383,10 @@ static int bam_tag2cigar(bam1_t *b) if (new_data == 0) return -1; b->m_data = new_max, b->data = new_data; } - memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + 4, ori_len - (cigar_st + 4)); // insert 4*(c->n_cigar-1) empty space to make room - memcpy(b->data + cigar_st, b->data + (n_cigar4 - 4) + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place; -4 for the fake CIGAR + memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes)); // insert c->n_cigar-fake_bytes empty space to make room + memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR if (ori_len > CG_en) // move data after the CG tag - memmove(b->data + CG_st + n_cigar4 - 4, b->data + CG_en + n_cigar4 - 4, ori_len - CG_en); + memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en); b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4) b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)), 14, 5); return 1; @@ -480,11 +481,11 @@ int bam_write1(BGZF *fp, const bam1_t *b) const bam1_core_t *c = &b->core; uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y; int i, ok; - if (c->n_cigar > 0xffff) block_len += 12; // "12" for "CGBI", 4-byte tag length and 4-byte fake CIGAR + if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR x[0] = c->tid; x[1] = c->pos; x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul); - if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 1; + if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 2; else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff); x[4] = c->l_qseq; x[5] = c->mtid; @@ -504,13 +505,15 @@ int bam_write1(BGZF *fp, const bam1_t *b) if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0); } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag - uint8_t buf[4]; - uint32_t cigar_st, cigar_en, cigar1; + uint8_t buf[8]; + uint32_t cigar_st, cigar_en, cigar[2]; cigar_st = (uint8_t*)bam_get_cigar(b) - b->data; cigar_en = cigar_st + c->n_cigar * 4; - cigar1 = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP; - u32_to_le(cigar1, buf); - if (ok) ok = (bgzf_write(fp, buf, 4) >= 0); // write cigar: S + cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP; + cigar[1] = (uint32_t)bam_cigar2rlen(c->n_cigar, bam_get_cigar(b)) << 4 | BAM_CREF_SKIP; + u32_to_le(cigar[0], buf); + u32_to_le(cigar[1], buf + 4); + if (ok) ok = (bgzf_write(fp, buf, 8) >= 0); // write cigar: SN if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I u32_to_le(c->n_cigar, buf); From 963091ed9f2a1e0123850cbd3893332e37bfd70a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 2 Nov 2017 23:31:07 -0400 Subject: [PATCH 09/12] replaced TAB with SPACEs --- sam.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sam.c b/sam.c index c60733927..7ebbd2f55 100644 --- a/sam.c +++ b/sam.c @@ -362,7 +362,7 @@ static int bam_tag2cigar(bam1_t *b) if (c->n_cigar == 0 || c->tid < 0 || c->pos < 0) return 0; cigar0 = bam_get_cigar(b); if (bam_cigar_op(cigar0[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(cigar0[0]) != c->l_qseq) return 0; - fake_bytes = c->n_cigar * 4; + fake_bytes = c->n_cigar * 4; if ((CG = bam_aux_get(b, "CG")) == 0) return 0; // no CG tag if (CG[0] != 'B' || CG[1] != 'I') return 0; // not of type B,I CG_len = le_to_u32(CG + 2); From 27bba42190b41cf8ed55213c5e26da43b33ae980 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 3 Nov 2017 10:25:10 -0400 Subject: [PATCH 10/12] reduce unnecessary cigar parsing; recompute bin --- sam.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/sam.c b/sam.c index 7ebbd2f55..24535fbd5 100644 --- a/sam.c +++ b/sam.c @@ -326,6 +326,18 @@ bam1_t *bam_dup1(const bam1_t *bsrc) return bam_copy1(bdst, bsrc); } +void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar, int *rlen, int *qlen) +{ + int k; + *rlen = *qlen = 0; + for (k = 0; k < n_cigar; ++k) { + int type = bam_cigar_type(bam_cigar_op(cigar[k])); + int len = bam_cigar_oplen(cigar[k]); + if (type & 1) *qlen += len; + if (type & 2) *rlen += len; + } +} + int bam_cigar2qlen(int n_cigar, const uint32_t *cigar) { int k, l; @@ -352,7 +364,7 @@ int32_t bam_endpos(const bam1_t *b) return b->core.pos + 1; } -static int bam_tag2cigar(bam1_t *b) +int bam_tag2cigar(bam1_t *b) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG { bam1_core_t *c = &b->core; uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, *cigar0, CG_len, fake_bytes; @@ -388,7 +400,6 @@ static int bam_tag2cigar(bam1_t *b) if (ori_len > CG_en) // move data after the CG tag memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en); b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4) - b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)), 14, 5); return 1; } @@ -463,14 +474,18 @@ int bam_read1(BGZF *fp, bam1_t *b) bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname) return -4; if (fp->is_be) swap_data(c, b->l_data, b->data, 0); - if (bam_tag2cigar(b) < 0) return -4; - - // Sanity check for broken CIGAR alignments - if (c->n_cigar > 0 && c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) - && bam_cigar2qlen(c->n_cigar, bam_get_cigar(b)) != c->l_qseq) { - hts_log_error("CIGAR and query sequence lengths differ for %s", - bam_get_qname(b)); - return -4; + bam_tag2cigar(b); + + if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency + int rlen, qlen; + bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen); + b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5); + // Sanity check for broken CIGAR alignments + if (c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) && qlen != c->l_qseq) { + hts_log_error("CIGAR and query sequence lengths differ for %s", + bam_get_qname(b)); + return -4; + } } return 4 + block_len; @@ -1217,7 +1232,8 @@ int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b) } else _parse_err_param(1, "unrecognized type %c", type); } b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m; - if (bam_tag2cigar(b) < 0) return -2; + if (bam_tag2cigar(b)) // if the CIGAR is replaced with the CG tag, recompute bin + b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)), 14, 5); return 0; #undef _parse_warn From 79466f8da480d078d8363baa26784ae5a4381961 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 3 Nov 2017 12:38:18 -0400 Subject: [PATCH 11/12] lift the CG tag to CIGAR for CRAM and give warning --- sam.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/sam.c b/sam.c index 24535fbd5..57af88ab3 100644 --- a/sam.c +++ b/sam.c @@ -331,7 +331,7 @@ void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar, int *rlen, int *qlen) int k; *rlen = *qlen = 0; for (k = 0; k < n_cigar; ++k) { - int type = bam_cigar_type(bam_cigar_op(cigar[k])); + int type = bam_cigar_type(bam_cigar_op(cigar[k])); int len = bam_cigar_oplen(cigar[k]); if (type & 1) *qlen += len; if (type & 2) *rlen += len; @@ -364,7 +364,7 @@ int32_t bam_endpos(const bam1_t *b) return b->core.pos + 1; } -int bam_tag2cigar(bam1_t *b) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG +static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG { bam1_core_t *c = &b->core; uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, *cigar0, CG_len, fake_bytes; @@ -398,8 +398,12 @@ int bam_tag2cigar(bam1_t *b) // return 0 if CIGAR is untouched; 1 if CIGAR is up memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes)); // insert c->n_cigar-fake_bytes empty space to make room memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR if (ori_len > CG_en) // move data after the CG tag - memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en); + memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en); b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4) + if (recal_bin) + b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)), 14, 5); + if (give_warning) + hts_log_error("%s encodes a CIGAR with %d operators at the CG tag", bam_get_qname(b), c->n_cigar); return 1; } @@ -474,7 +478,7 @@ int bam_read1(BGZF *fp, bam1_t *b) bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname) return -4; if (fp->is_be) swap_data(c, b->l_data, b->data, 0); - bam_tag2cigar(b); + bam_tag2cigar(b, 0, 0); if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency int rlen, qlen; @@ -647,6 +651,7 @@ static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, htsFile *fp = fpv; bam1_t *b = bv; int ret = cram_get_bam_seq(fp->fp.cram, &b); + bam_tag2cigar(b, 1, 1); return ret >= 0 ? ret : (cram_eof(fp->fp.cram) ? -1 : -2); @@ -661,6 +666,7 @@ static int sam_bam_cram_readrec(BGZF *bgzfp, void *fpv, void *bv, int *tid, int case bam: return bam_read1(bgzfp, b); case cram: { int ret = cram_get_bam_seq(fp->fp.cram, &b); + bam_tag2cigar(b, 1, 1); return ret >= 0 ? ret : (cram_eof(fp->fp.cram) ? -1 : -2); @@ -1232,8 +1238,7 @@ int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b) } else _parse_err_param(1, "unrecognized type %c", type); } b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m; - if (bam_tag2cigar(b)) // if the CIGAR is replaced with the CG tag, recompute bin - b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)), 14, 5); + bam_tag2cigar(b, 1, 1); return 0; #undef _parse_warn @@ -1262,6 +1267,7 @@ int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b) case cram: { int ret = cram_get_bam_seq(fp->fp.cram, &b); + bam_tag2cigar(b, 1, 1); return ret >= 0 ? ret : (cram_eof(fp->fp.cram) ? -1 : -2); From d57d1552e08a297e2d44e01b8bb99def983013fd Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 9 Nov 2017 14:10:58 -0500 Subject: [PATCH 12/12] check tag2cigar() return code and real cigar len --- sam.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/sam.c b/sam.c index 4cddeb55e..34b6f5389 100644 --- a/sam.c +++ b/sam.c @@ -378,7 +378,7 @@ static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if ((CG = bam_aux_get(b, "CG")) == 0) return 0; // no CG tag if (CG[0] != 'B' || CG[1] != 'I') return 0; // not of type B,I CG_len = le_to_u32(CG + 2); - if (CG_len == 0) return 0; // nothing to move + if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0; // don't move if the real CIGAR length is shorter than the fake cigar length // move from the CG tag to the right position cigar_st = (uint8_t*)cigar0 - b->data; @@ -386,7 +386,7 @@ static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 n_cigar4 = c->n_cigar * 4; CG_st = CG - b->data - 2; CG_en = CG_st + 8 + n_cigar4; - b->l_data += n_cigar4 - fake_bytes; // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place + b->l_data = b->l_data - fake_bytes + n_cigar4; // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place if (b->m_data < b->l_data) { uint8_t *new_data; uint32_t new_max = b->l_data; @@ -478,7 +478,8 @@ int bam_read1(BGZF *fp, bam1_t *b) bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname) return -4; if (fp->is_be) swap_data(c, b->l_data, b->data, 0); - bam_tag2cigar(b, 0, 0); + if (bam_tag2cigar(b, 0, 0) < 0) + return -4; if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency int rlen, qlen; @@ -651,7 +652,8 @@ static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, htsFile *fp = fpv; bam1_t *b = bv; int ret = cram_get_bam_seq(fp->fp.cram, &b); - bam_tag2cigar(b, 1, 1); + if (bam_tag2cigar(b, 1, 1) < 0) + return -2; *tid = b->core.tid; *beg = b->core.pos; *end = bam_endpos(b); @@ -725,7 +727,8 @@ static int sam_bam_cram_readrec(BGZF *bgzfp, void *fpv, void *bv, int *tid, int case bam: return bam_read1(bgzfp, b); case cram: { int ret = cram_get_bam_seq(fp->fp.cram, &b); - bam_tag2cigar(b, 1, 1); + if (bam_tag2cigar(b, 1, 1) < 0) + return -2; return ret >= 0 ? ret : (cram_eof(fp->fp.cram) ? -1 : -2); @@ -1308,7 +1311,8 @@ int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b) } else _parse_err_param(1, "unrecognized type %c", type); } b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m; - bam_tag2cigar(b, 1, 1); + if (bam_tag2cigar(b, 1, 1) < 0) + return -2; return 0; #undef _parse_warn