-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path2023-08-30--combined-vb-examples.py
executable file
·78 lines (63 loc) · 2.75 KB
/
2023-08-30--combined-vb-examples.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/env python3
import os
import glob
import json
import random
from collections import defaultdict
MGS_PIPELINE_DIR = "/home/ec2-user/mgs-pipeline"
MGS_RESTRICTED_DIR = "/home/ec2-user/mgs-restricted"


def _read_dashboard_json(repo_dir, basename):
    """Return the parsed JSON stored at ``<repo_dir>/dashboard/<basename>``."""
    with open(os.path.join(repo_dir, "dashboard", basename)) as handle:
        return json.load(handle)


# Public metadata is loaded first; every bioproject listed there is mapped
# to the "nao-mgs" bucket.
metadata_papers = _read_dashboard_json(
    MGS_PIPELINE_DIR, "metadata_papers.json")
metadata_bioprojects = _read_dashboard_json(
    MGS_PIPELINE_DIR, "metadata_bioprojects.json")
bioproject_to_s3_bucket = dict.fromkeys(metadata_bioprojects, "nao-mgs")
metadata_samples = _read_dashboard_json(
    MGS_PIPELINE_DIR, "metadata_samples.json")

# Restricted metadata is merged on top of the public metadata.  On a key
# collision the restricted entry wins, and restricted bioprojects are mapped
# to the "nao-restricted" bucket.
metadata_papers.update(
    _read_dashboard_json(MGS_RESTRICTED_DIR, "metadata_papers.json"))
restricted_metadata_bioprojects = _read_dashboard_json(
    MGS_RESTRICTED_DIR, "metadata_bioprojects.json")
metadata_bioprojects.update(restricted_metadata_bioprojects)
bioproject_to_s3_bucket.update(
    dict.fromkeys(restricted_metadata_bioprojects, "nao-restricted"))
metadata_samples.update(
    _read_dashboard_json(MGS_RESTRICTED_DIR, "metadata_samples.json"))
# Walk paper -> bioproject -> sample and record, for every sample, the
# bioproject it belongs to and a paper label.  The label is the paper name,
# with " panel" appended when the sample's metadata has
# enrichment == "panel".
sample_to_paper = {}
sample_to_bioproject = {}
for paper, paper_info in metadata_papers.items():
    for bioproject in paper_info["projects"]:
        for sample in metadata_bioprojects[bioproject]:
            is_panel = metadata_samples[sample].get("enrichment") == "panel"
            enrichment_suffix = " panel" if is_panel else ""
            sample_to_bioproject[sample] = bioproject
            sample_to_paper[sample] = paper + enrichment_suffix
# Gather the entries under the "both" key of each combined-vb/*.json file,
# grouped by the paper label of the sample the file belongs to.
combined_by_paper = defaultdict(list)
for fname in glob.glob("combined-vb/*.json"):
    with open(fname) as inf:
        # The sample name is the part of the file name before the first ".".
        sample = os.path.basename(fname).split(".")[0]
        paper = sample_to_paper.get(sample)
        if not paper:
            # No paper label is known for this sample; skip the file.
            continue
        both_entries = json.load(inf)["both"]
    # sample_to_paper and sample_to_bioproject are filled together, so a
    # sample with a paper label also has a bioproject.
    sample_bioproject = sample_to_bioproject[sample]
    for seq_id, kraken_info in both_entries:
        combined_by_paper[paper].append(
            (sample_bioproject, sample, seq_id, kraken_info))
# Write targets.tsv: for each paper label (in sorted order), up to 10
# randomly chosen entries, one tab-separated row each:
#   s3 bucket, bioproject, sample, seq_id, kraken_info
with open("targets.tsv", "w") as outf:
    for paper in sorted(combined_by_paper):
        combined = combined_by_paper[paper]
        # Shuffle in place, then take the first 10 as the random selection.
        random.shuffle(combined)
        for bioproject, sample, seq_id, kraken_info in combined[:10]:
            row = (
                bioproject_to_s3_bucket[bioproject],
                bioproject,
                sample,
                seq_id,
                kraken_info,
            )
            outf.write("\t".join(map(str, row)) + "\n")