Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unscaled cumulative plots, and cumulative with a null curve via Monte #19

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
147 changes: 132 additions & 15 deletions micov/_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ def ordered_coverage(coverage, grp, target):
.filter(pl.col(COLUMN_GENOME_ID) == target)
.sort(COLUMN_PERCENT_COVERED)
.with_row_index()
.with_columns(x=pl.col('index') / pl.len())).collect()
.with_columns(x=pl.col('index') / pl.len(),
x_unscaled=pl.col('index'))).collect()


def slice_positions(positions, id_):
Expand All @@ -40,6 +41,119 @@ def per_sample_plots(all_coverage, all_covered_positions, metadata,
sample_metadata_column, output, scale=10000)


def per_sample_plots_monte(all_coverage, all_covered_positions, metadata,
sample_metadata_column, output, target_lookup, iters):
for genome in all_coverage[COLUMN_GENOME_ID].unique():
target_name = target_lookup[genome]
cumulative_monte(metadata, all_coverage, all_covered_positions, genome,
sample_metadata_column, output, target_name, iters)


def compute_cumulative(coverage, grp, target, target_positions, lengths):
current = pl.DataFrame([], schema=BED_COV_SCHEMA.dtypes_flat)
grp_coverage = ordered_coverage(coverage, grp, target)

if len(grp_coverage) == 0:
return None, None

cur_y = []
cur_x = grp_coverage['x_unscaled']
for id_ in grp_coverage[COLUMN_SAMPLE_ID]:
next_ = slice_positions(target_positions, id_).collect()
current = compress(pl.concat([current, next_]))
per_cov = coverage_percent(current, lengths).collect()
cur_y.append(per_cov[COLUMN_PERCENT_COVERED].item(0))
return cur_x, cur_y


def cumulative_monte(metadata, coverage, positions, target, variable, output,
target_name, iters):
plt.figure(figsize=(12, 8))
labels = []

target_positions = positions.filter(pl.col(COLUMN_GENOME_ID) == target)
coverage = coverage.filter(pl.col(COLUMN_GENOME_ID) == target)
cov_samples = coverage.select(pl.col(COLUMN_SAMPLE_ID).unique())
metadata = metadata.filter(pl.col(COLUMN_SAMPLE_ID).is_in(cov_samples))

if len(target_positions) == 0:
raise ValueError()

if len(coverage) == 0:
raise ValueError()

lengths = coverage[[COLUMN_GENOME_ID, COLUMN_LENGTH]].unique()

if len(lengths) > 1:
raise ValueError()

length = lengths[COLUMN_LENGTH].item(0)
value_order = metadata.select(pl.col(variable)
.unique()
.sort())[variable]
max_n = 0
for name, color in zip(value_order, range(0, 10)):
color = f'C{color}'

grp = metadata.filter(pl.col(variable) == name)

n = len(grp)
if n < 10:
continue
max_n = max(n, max_n)

cur_x, cur_y = compute_cumulative(coverage, grp, target,
target_positions, lengths)
if cur_x is None:
continue
else:
labels.append(f"{name} (n={len(cur_x)})")
plt.plot(cur_x, cur_y, color=color)

if not labels:
return

monte_y = []
for it in range(iters):
monte = (metadata.select(pl.col(COLUMN_SAMPLE_ID)
.shuffle())
.head(max_n))
grp_monte = metadata.filter(pl.col(COLUMN_SAMPLE_ID)
.is_in(monte))
monte_x, cur_y = compute_cumulative(coverage, grp_monte, target,
target_positions, lengths)
monte_y.append(cur_y)

monte_y = np.asarray(monte_y)
median = np.median(monte_y, axis=0)
std = np.std(monte_y, axis=0)

plt.plot(monte_x, median, color='k', linestyle='dotted', linewidth=1,
alpha=0.6)
plt.plot(monte_x, median + std, color='k', linestyle='--', linewidth=1,
alpha=0.6)
plt.plot(monte_x, median - std, color='k', linestyle='--', linewidth=1,
alpha=0.6)
plt.fill_between(monte_x, median - std, median + std, color='k',
alpha=0.1)
labels.append(f'Monte Carlo (n={len(monte_x)})')

ax = plt.gca()
ax.set_ylabel('Percent genome covered', fontsize=16)
ax.set_xlabel('Within group sample rank by coverage', fontsize=16)
ax.tick_params(axis='both', which='major', labelsize=16)
ax.tick_params(axis='both', which='minor', labelsize=16)
ax.set_xlim(0, max_n - 1)
ax.set_ylim(0, 100)
ax.set_title((f'Cumulative: {target_name}({target}) '
f'({length}bp)'), fontsize=16)
plt.legend(labels, fontsize=14)

plt.tight_layout()
plt.savefig(f'{output}.{target_name}.{target}.{variable}.cumulative-monte.png')
plt.close()


def cumulative(metadata, coverage, positions, target, variable, output):
plt.figure(figsize=(12, 8))
labels = []
Expand All @@ -61,23 +175,19 @@ def cumulative(metadata, coverage, positions, target, variable, output):

length = lengths[COLUMN_LENGTH].item(0)

for (name, ), grp in metadata.group_by([variable, ], maintain_order=True):
value_order = metadata.select(pl.col(variable).unique().sort())[variable]
for name in value_order:
grp = metadata.filter(pl.col(variable) == name)
current = pl.DataFrame([], schema=BED_COV_SCHEMA.dtypes_flat)

grp_coverage = ordered_coverage(coverage, grp, target)

if len(grp_coverage) == 0:
cur_x, cur_y = compute_cumulative(coverage, grp, target,
target_positions, lengths)
if cur_x is None:
continue
labels.append(name)

cur_y = []
cur_x = grp_coverage['x']
for id_ in grp_coverage[COLUMN_SAMPLE_ID]:
next_ = slice_positions(target_positions, id_).collect()
current = compress(pl.concat([current, next_]))
per_cov = coverage_percent(current, lengths).collect()
cur_y.append(per_cov[COLUMN_PERCENT_COVERED].item(0))

covs.append(cur_y)
plt.plot(cur_x, cur_y)

Expand Down Expand Up @@ -108,7 +218,11 @@ def non_cumulative(metadata, coverage, target, variable, output):
labels = []
covs = []

for (name, ), grp in metadata.group_by([variable, ], maintain_order=True):
value_order = metadata.select(pl.col(variable)
.unique()
.sort())[variable]
for name in value_order:
grp = metadata.filter(pl.col(variable) == name)
grp_coverage = ordered_coverage(coverage, grp, target)

if len(grp_coverage) == 0:
Expand Down Expand Up @@ -190,9 +304,11 @@ def position_plot(metadata, coverage, positions, target, variable, output, scale
colors = []
target_positions = positions.filter(pl.col(COLUMN_GENOME_ID) == target).lazy()

for ((name, ), grp), color in zip(metadata.group_by([variable, ],
maintain_order=True),
range(0, 10)):
value_order = metadata.select(pl.col(variable)
.unique()
.sort())[variable]
for name, color in zip(value_order, range(0, 10)):
grp = metadata.filter(pl.col(variable) == name)
color = f'C{color}'
grp_coverage = ordered_coverage(coverage, grp, target)

Expand Down Expand Up @@ -254,6 +370,7 @@ def position_plot(metadata, coverage, positions, target, variable, output, scale
leg = ax.get_legend()
for i, lh in enumerate(leg.legendHandles):
lh.set_color(colors[i])
lh._sizes = [5.0, ]

ax.tick_params(axis='both', which='major', labelsize=16)
ax.tick_params(axis='both', which='minor', labelsize=16)
Expand Down
60 changes: 58 additions & 2 deletions micov/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,12 @@
from ._cov import coverage_percent
from ._convert import cigar_to_lens
from ._per_sample import per_sample_coverage
from ._plot import per_sample_plots, single_sample_position_plot
from ._plot import (per_sample_plots, per_sample_plots_monte,
single_sample_position_plot)
from ._utils import logger
from ._constants import COLUMN_SAMPLE_ID, COLUMN_GENOME_ID, BED_COV_SAMPLEID_SCHEMA
from ._constants import (COLUMN_SAMPLE_ID, COLUMN_GENOME_ID,
BED_COV_SAMPLEID_SCHEMA,
COLUMN_START, COLUMN_CIGAR, COLUMN_STOP)
from ._quant import pos_to_bins, make_csv_ready


Expand Down Expand Up @@ -243,6 +246,59 @@ def per_sample_group(parquet_coverage, sample_metadata, sample_metadata_column,
sample_metadata_column, output)


@cli.command()
@click.option('--parquet-coverage', type=click.Path(exists=False),
required=True, help=('Pre-computed coverage data as parquet. '
'This should be the basename used, i.e. '
'for "foo.coverage.parquet", please use '
'"foo"'))
@click.option('--sample-metadata', type=click.Path(exists=True),
required=True,
help='A metadata file with the sample metadata')
@click.option('--sample-metadata-column', type=str,
required=True,
help='The column to consider in the sample metadata')
@click.option('--features-to-keep', type=click.Path(exists=True),
required=False,
help='A metadata file with the features to keep')
@click.option('--iters', type=int, default=10, required=False)
@click.option('--target-names', type=str, required=False)
@click.option('--output', type=click.Path(exists=False), required=True)
@click.option('--plot', is_flag=True, default=False,
help='Generate plots from features')
def per_sample_monte(parquet_coverage, sample_metadata, sample_metadata_column,
features_to_keep, output, plot, iters, target_names):
"""Generate sample group plots and coverage data with a null curve."""
_load_db(parquet_coverage, sample_metadata, features_to_keep)

all_covered_positions = duckdb.sql("SELECT * from covered_positions").pl()
all_coverage = duckdb.sql("SELECT * FROM coverage").pl()
metadata_pl = duckdb.sql("SELECT * FROM metadata").pl()

if target_names is not None:
target_names = dict(pl.scan_csv(target_names,
separator='\t',
new_columns=['feature-id', 'lineage'],
has_header=False)
.with_columns(pl.col('lineage')
.str
.split(';')
.list
.get(-1)
.str
.replace_all(r" |\[|\]", "_")
.alias('species'))
.select('feature-id', 'species')
.collect()
.iter_rows())
else:
sql = "SELECT DISTINCT genome_id FROM coverage"
target_names = {k[0]: k[0] for k in duckdb.sql(sql).fetchall()}

per_sample_plots_monte(all_coverage, all_covered_positions, metadata_pl,
sample_metadata_column, output, target_names, iters)


def _load_db(dbbase, sample_metadata, features_to_keep):
metadata_pl = parse_sample_metadata(sample_metadata)
sample_column = metadata_pl.columns[0]
Expand Down
Loading