Skip to content

Commit b81e2d2

Browse files
committed
improved report summary script
1 parent b7ff4a2 commit b81e2d2

File tree

1 file changed

+76
-7
lines changed

1 file changed

+76
-7
lines changed

src/talos/SummariseReport.py

+76-7
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,78 @@
2323

2424
from talos.utils import read_json_from_path
2525

26+
# dictionary key under which each category's mean variant count per sample is reported
MEAN_SLASH_SAMPLE = 'Mean/sample'
27+
28+
29+
class NoVariantsFoundError(Exception):
    """
    Signals that a report, or a subset of one, held no data to summarise
    """
31+
32+
33+
def get_variant_summary(results: ResultData) -> dict:
    """
    Run the numbers across all variant categories
    Treat each primary-secondary comp-het pairing as one event
    i.e. the thing being counted here is the number of events
    which passed through the MOI process, not the absolute number
    of variants in the report

    Args:
        results (ResultData): the results object in full

    Raises:
        NoVariantsFoundError: if the report contains no samples, so
            per-sample statistics (peak, mean) cannot be calculated

    Returns:
        a dictionary summarising the categorised variants: one entry per
        category ('any' first, then each category named in the report
        metadata), plus a 'samples_no_variants' count
    """

    # without any samples, max() and the mean calculation below would fail
    if not results.results:
        raise NoVariantsFoundError('No categorised variants found')

    # 'any' is a catch-all collecting every variant, followed by
    # the categories this report was aware of
    ordered_categories = ['any', *results.metadata.categories]

    # per category: one variant count for each sample
    category_count: dict[str, list[int]] = {key: [] for key in ordered_categories}

    # per category: every distinct variant string seen across all samples
    unique_variants: dict[str, set[str]] = {key: set() for key in ordered_categories}

    for sample_data in results.results.values():
        # sets, so a variant is counted once per sample per category
        sample_variants: dict[str, set[str]] = {key: set() for key in ordered_categories}

        # iterate over the list of variants
        for variant in sample_data.variants:
            var_string = variant.var_data.coordinates.string_format

            # record against 'any', and each category assigned to this variant
            for category_value in ('any', *variant.categories):
                sample_variants[category_value].add(var_string)
                unique_variants[category_value].add(var_string)

        # update the global lists with per-sample counts
        for key, key_list in category_count.items():
            key_list.append(len(sample_variants[key]))

    summary_dicts: dict = {}
    for key in ordered_categories:
        counts = category_count[key]
        total = sum(counts)
        summary_dicts[key] = {
            'Description': results.metadata.categories.get(key, 'All Variants'),
            'Total': total,
            # distinct variants across the cohort, not distinct per-sample counts
            'Unique': len(unique_variants[key]),
            'Peak #/sample': max(counts),
            MEAN_SLASH_SAMPLE: total / len(counts),
        }

    # samples whose 'any' count is zero had no variants at all
    summary_dicts['samples_no_variants'] = category_count['any'].count(0)

    return summary_dicts
90+
2691

2792
def main(input_path: str, output_path: str | None = None, prefix: int | None = None):
2893
"""
29-
read the target report, and summarise the number of affected samples involved
94+
read the target report, and summarise the content:
95+
- the number of affected samples involved
96+
- the number of variants in each category
97+
- the number of samples with no variants
3098
3199
Args:
32100
input_path (str): where to read the report from
@@ -37,21 +105,22 @@ def main(input_path: str, output_path: str | None = None, prefix: int | None = N
37105
# read the report file, local or cloud
38106
report = read_json_from_path(input_path, return_model=ResultData)
39107

40-
# this is a simple overview
41-
family_breakdown = report.metadata.family_breakdown
108+
summarised_content: dict = {'family_breakdown': report.metadata.family_breakdown}
42109

43110
if prefix:
44111
# set up a section in the dictionary for this
45-
family_breakdown['grouped_by_prefix'] = defaultdict(int)
112+
summarised_content['family_breakdown']['grouped_by_prefix'] = defaultdict(int)
46113
for proband in report.results.values():
47-
family_breakdown['grouped_by_prefix'][proband.metadata.ext_id[:prefix]] += 1
114+
summarised_content['family_breakdown']['grouped_by_prefix'][proband.metadata.ext_id[:prefix]] += 1
115+
116+
summarised_content['variant_summary'] = get_variant_summary(report)
48117

49-
print(json.dumps(family_breakdown, indent=4))
118+
print(json.dumps(summarised_content, indent=4))
50119

51120
if output_path:
52121
# write the output to file
53122
with to_path(output_path).open('w') as handle:
54-
json.dump(family_breakdown, handle, indent=4)
123+
json.dump(summarised_content, handle, indent=4)
55124

56125

57126
def cli_main():

0 commit comments

Comments
 (0)