Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update to the depth filtering for mpileup. #1084

Open
wants to merge 2 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions htslib/sam.h
Original file line number Diff line number Diff line change
Expand Up @@ -1650,6 +1650,17 @@ typedef struct __bam_mplp_t *bam_mplp_t;
HTSLIB_EXPORT
void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt);

/**
* bam_plp_set_cntpos() - indicate which end of reach to apply maxcnt.
* @plp: The bam_plp_t initialised using bam_plp_init.
* @end: A fraction along the read from 0.0 (start) to 1.0 (end).
* Defaults to 0.0 which makes maxcnt a true upper-bound value.
* Using 1.0 makes maxcnt a lower-bound, with upper-bound
* varying based on length and coordinate distribution.
*/
HTSLIB_EXPORT
void bam_plp_set_cntpos(bam_plp_t iter, double end);

HTSLIB_EXPORT
void bam_plp_reset(bam_plp_t iter);

Expand Down Expand Up @@ -1712,6 +1723,18 @@ typedef struct __bam_mplp_t *bam_mplp_t;
HTSLIB_EXPORT
void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt);

/**
* bam_mplp_set_cntpos() - indicate which end of reach to apply maxcnt.
* @mplp: The bam_mplp_t initialised using bam_mplp_init.
* @end: A fraction along the read from 0.0 (start) to 1.0 (end).
* Defaults to 0.0 which makes maxcnt a true upper-bound value.
* Using 1.0 makes maxcnt a lower-bound, with upper-bound
* varying based on length and coordinate distribution.
*/
HTSLIB_EXPORT
void bam_mplp_set_cntpos(bam_mplp_t iter, double end);


HTSLIB_EXPORT
int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp);

Expand Down
96 changes: 93 additions & 3 deletions sam.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ DEALINGS IN THE SOFTWARE. */
#include <assert.h>
#include <signal.h>
#include <inttypes.h>
#include <math.h>

// Suppress deprecation message for cigar_tab, which we initialise
#include "htslib/hts_defs.h"
Expand Down Expand Up @@ -4139,6 +4140,8 @@ int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) {
KHASH_MAP_INIT_STR(olap_hash, lbnode_t *)
typedef khash_t(olap_hash) olap_hash_t;

#define MAX_AHEAD (65536*8)

struct __bam_plp_t {
mempool_t *mp;
lbnode_t *head, *tail;
Expand All @@ -4157,6 +4160,12 @@ struct __bam_plp_t {
// and associated client-owned pointer.
int (*plp_construct)(void *data, const bam1_t *b, bam_pileup_cd *cd);
int (*plp_destruct )(void *data, const bam1_t *b, bam_pileup_cd *cd);

// Field used for automatic depth reduction
uint32_t depth[MAX_AHEAD]; // circular depth buffer from b->core.pos on.
uint64_t last_depth_pos; // array filed out to this pos
uint64_t max_depth_pos; // >= max depth up to this pos
double depth_pos_fract; // (Simple method)
};

bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
Expand All @@ -4172,6 +4181,9 @@ bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
iter->data = data;
iter->b = bam_init1();
}
memset(iter->depth, 0, MAX_AHEAD * sizeof(*iter->depth));
iter->depth_pos_fract = 0; // fraction between 0 = left and 1 = right
iter->last_depth_pos = 0;
return iter;
}

Expand Down Expand Up @@ -4494,21 +4506,87 @@ const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_

int bam_plp_push(bam_plp_t iter, const bam1_t *b)
{
uint64_t endpos = 0;

if (iter->error) return -1;
if (b) {
if (b->core.tid < 0) { overlap_remove(iter, b); return 0; }
// Skip only unmapped reads here, any additional filtering must be done in iter->func
if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; }
if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt)
{
if (iter->depth_pos_fract == 0
&& iter->tid == b->core.tid
&& iter->pos == b->core.pos
&& iter->mp->cnt > iter->maxcnt) {
// Removal from left-end depth; mp->cnt
overlap_remove(iter, b);
return 0;
} else if (iter->depth_pos_fract > 0
&& iter->tid == b->core.tid
&& b->core.pos - iter->pos <= b->core.l_qseq / 16
) {
// Removal from depth somewhere else along read
endpos = bam_endpos(b);
uint32_t len = (uint32_t)(endpos - b->core.pos - 1);

#if 1
len = len * iter->depth_pos_fract + 0.4999;
if (b->core.pos + len < iter->last_depth_pos
&& iter->depth[(b->core.pos+len)%MAX_AHEAD] > iter->maxcnt) {
// NB this means read longer than MAX_AHEAD (after downsizing
// by depth_pos fraction) will be retained as by definition
// it'll have zero coverage that far out.
overlap_remove(iter, b);
return 0;
}
#else
len = len * iter->depth_pos_fract + 0.4999;
if (len >= MAX_AHEAD) len = MAX_AHEAD-1;
int64_t pos_delta = (b->core.pos + len) - iter->max_depth_pos -1;
int last_depth = endpos > iter->last_depth_pos
? 0
: iter->depth[(iter->last_depth_pos-1)%MAX_AHEAD];
int pos_depth = iter->depth[b->core.pos % MAX_AHEAD];

#ifndef MIN
#define MIN(a,b) ((a)<(b)?(a):(b))
#endif
if (drand48() >
//(double)h / 0xffffffff >
MIN(1.0, pow((double)pos_delta / (len+.01), 2))
* MIN(1.0, pow((1-(last_depth+.1)/iter->maxcnt), 2))
* MIN(1.0, pow((iter->maxcnt+.1)/pos_depth, 2)) *2) {
overlap_remove(iter, b);
return 0;
}
#endif

// Zero newly observed iter depth elemtns.
uint64_t end_clipped = endpos, p;
if (end_clipped > iter->pos + MAX_AHEAD)
end_clipped = iter->pos + MAX_AHEAD;
if (iter->last_depth_pos < end_clipped) {
//iter->last_depth_pos = end_clipped;
if (iter->last_depth_pos < end_clipped-MAX_AHEAD)
iter->last_depth_pos = end_clipped-MAX_AHEAD;
for (p = iter->last_depth_pos; p < end_clipped; p++)
iter->depth[p % MAX_AHEAD] = 0;
iter->last_depth_pos = end_clipped;
}

// Increment depth
for (p = b->core.pos; p < end_clipped; p++) {
if (++iter->depth[p % MAX_AHEAD] >= iter->maxcnt)
if (iter->max_depth_pos < p)
iter->max_depth_pos = p;
}
}
if (bam_copy1(&iter->tail->b, b) == NULL)
return -1;
iter->tail->b.id = iter->id++;
iter->tail->beg = b->core.pos;
iter->tail->end = bam_endpos(b);
if (!endpos)
endpos = bam_endpos(b);
iter->tail->end = endpos;
iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t
if (b->core.tid < iter->max_tid) {
hts_log_error("The input is not sorted (chromosomes out of order)");
Expand Down Expand Up @@ -4602,6 +4680,11 @@ void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
iter->maxcnt = maxcnt;
}

void bam_plp_set_cntpos(bam_plp_t iter, double end)
{
iter->depth_pos_fract = end;
}

/************************
*** Mpileup iterator ***
************************/
Expand Down Expand Up @@ -4651,6 +4734,13 @@ void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
iter->iter[i]->maxcnt = maxcnt;
}

void bam_mplp_set_cntpos(bam_mplp_t iter, double end)
{
int i;
for (i = 0; i < iter->n; ++i)
iter->iter[i]->depth_pos_fract = end;
}

void bam_mplp_destroy(bam_mplp_t iter)
{
int i;
Expand Down