|
7 | 7 | import re
|
8 | 8 | import io
|
9 | 9 |
|
| 10 | + |
10 | 11 | def main():
|
11 | 12 | """ Main function """
|
12 | 13 | __doc__ = "Evaluate the run based on expected values"
|
13 | 14 | outpath = os.path.dirname(os.path.realpath(__file__))
|
14 | 15 | os.chdir(outpath)
|
15 |
| - a = pd.read_csv( |
16 |
| - 'results/GN/PUM2_K562_ENCSR661ICQ_2/PUM2_K562_ENCSR661ICQ_2_total_peaks.csv', |
17 |
| - sep='\t', header=0, index_col=None) |
18 |
| - expected = pd.read_csv( |
19 |
| - 'means_crosslink.tsv', |
20 |
| - sep='\t', header=0, index_col=None) |
21 |
| - a['center'] = a['start'] + a['crosslink'] |
22 |
| - |
23 |
| - for index, row in a.iterrows(): |
24 |
| - distance = ( |
25 |
| - row['center'] - expected['crosslink'][expected['strand'] == row['strand']]).abs().min() |
26 |
| - if distance <= 50: |
27 |
| - a.loc[index, 'distance'] = distance |
28 |
| - else: |
29 |
| - a.loc[index, 'distance'] = np.nan |
30 |
| - rmsd = np.sqrt(np.sum((a["distance"]**2)) / len(a[~a.distance.isna()])) |
31 |
| - outlier_percentage = len(a[a.distance.isna()])/len(a) * 100 |
32 |
| - |
33 |
| - |
34 |
| - # expected = pd.read_csv( |
35 |
| - # 'means_peak_center.tsv', |
36 |
| - # sep='\t', header=0, index_col=None) |
37 |
| - # a['center'] = a['start'] + a['mi'] |
38 |
| - |
39 |
| - # for index, row in a.iterrows(): |
40 |
| - # distance = (row['center'] - expected['peak_center'][expected['strand'] == row['strand']]).abs().min() |
41 |
| - # if distance <= 50: |
42 |
| - # a.loc[index, 'distance'] = distance |
43 |
| - # else: |
44 |
| - # a.loc[index, 'distance'] = np.nan |
45 |
| - # sys.stdout.write(f'rmsd of peak_center as center: {np.sqrt(np.sum((a["distance"]**2)) / len(a[~a.distance.isna()]))}\n' |
46 |
| - # ) |
47 |
| - # sys.stdout.write(f'percentage of outlier peaks: {len(a[a.distance.isna()])/len(a) * 100} %\n' |
48 |
| - # ) |
49 |
| - |
50 |
| - # add the new test best pwm |
51 |
| - path = f'results/GN/PUM2_K562_ENCSR661ICQ_2/motif_analysis_final/peak_center/motif_enrichment.tsv' |
52 |
| - a = pd.read_csv(path, sep='\t', header=0, index_col=0) |
53 |
| - b = a.loc[a['mean_enrichment'][a['motif'].str.contains('trimmed')].idxmax(), 'motif'] |
54 |
| - run = b.split('_')[-1].split('.')[0] |
55 |
| - name = path.split('/')[-4] |
56 |
| - path_new = path.replace( |
57 |
| - 'motif_analysis_final', |
58 |
| - f'motif_analysis_crossvalidation/{run}').replace('motif_enrichment.tsv', f'wlib/{b}') |
59 |
| - shutil.copy(path_new, f'wlib/{name}') |
60 |
| - |
61 |
| - df = pd.DataFrame() |
62 |
| - for path1 in glob.glob('wlib/*'): |
63 |
| - name1 = path1.split('/')[-1] |
64 |
| - for path2 in glob.glob('wlib/*'): |
65 |
| - name2 = path2.split('/')[-1] |
66 |
| - sim = getsimilarity(path1, path2) |
67 |
| - df.loc[name1, name2] = sim |
68 |
| - mean_sim = np.mean([df.loc['PUM2_K562_ENCSR661ICQ_2', :].mean(), df.loc[:, 'PUM2_K562_ENCSR661ICQ_2'].mean()]) |
69 |
| - |
70 |
| - sys.stdout.write(f'rmsd of crosslink centers: {rmsd}\n') |
71 |
| - sys.stdout.write(f'percentage of outlier peaks: {outlier_percentage} % \n') |
72 |
| - sys.stdout.write(f'motif similarity: {mean_sim}\n') |
73 |
| - if rmsd < 1.5: |
74 |
| - sys.stdout.write('Rmsd is low. Test passed. 1/3\n') |
75 |
| - else: |
76 |
| - sys.stdout.write('Rmsd seems to be too high 1/3\n') |
77 |
| - if outlier_percentage < 5: |
78 |
| - sys.stdout.write('Few outliers detected. Test passed. 2/3\n') |
79 |
| - else: |
80 |
| - sys.stdout.write('Rmsd seems to be too high 2/3\n') |
81 |
| - if mean_sim > 0.75: |
82 |
| - sys.stdout.write('Motif similarity is high.Test passed 3/3\n') |
83 |
| - else: |
84 |
| - sys.stdout.write('Similarity seems to be low 3/3\n') |
| 16 | + for runtype in ['GN', 'TR']: |
| 17 | + for sample in ["PUM2_K562_ENCSR661ICQ_2_paired_end", "PUM2_K562_ENCSR661ICQ_2_single_end"]: |
| 18 | + a = pd.read_csv( |
| 19 | + f'results/{runtype}/{sample}/{sample}_total_peaks.csv', |
| 20 | + sep='\t', header=0, index_col=None) |
| 21 | + expected = pd.read_csv( |
| 22 | + 'means_crosslink.tsv', |
| 23 | + sep='\t', header=0, index_col=None) |
| 24 | + a['center'] = a['start'] + a['crosslink'] |
| 25 | + |
| 26 | + for index, row in a.iterrows(): |
| 27 | + distance = ( |
| 28 | + row['center'] - expected['crosslink'][expected['strand'] == row['strand']]).abs().min() |
| 29 | + if distance <= 50: |
| 30 | + a.loc[index, 'distance'] = distance |
| 31 | + else: |
| 32 | + a.loc[index, 'distance'] = np.nan |
| 33 | + if runtype == "GN": |
| 34 | + rmsd = np.sqrt(np.sum((a["distance"]**2)) / len(a[~a.distance.isna()])) |
| 35 | + outlier_percentage = len(a[a.distance.isna()])/len(a) * 100 |
| 36 | + |
| 37 | + |
| 38 | + # expected = pd.read_csv( |
| 39 | + # 'means_peak_center.tsv', |
| 40 | + # sep='\t', header=0, index_col=None) |
| 41 | + # a['center'] = a['start'] + a['mi'] |
| 42 | + |
| 43 | + # for index, row in a.iterrows(): |
| 44 | + # distance = (row['center'] - expected['peak_center'][expected['strand'] == row['strand']]).abs().min() |
| 45 | + # if distance <= 50: |
| 46 | + # a.loc[index, 'distance'] = distance |
| 47 | + # else: |
| 48 | + # a.loc[index, 'distance'] = np.nan |
| 49 | + # sys.stdout.write(f'rmsd of peak_center as center: {np.sqrt(np.sum((a["distance"]**2)) / len(a[~a.distance.isna()]))}\n' |
| 50 | + # ) |
| 51 | + # sys.stdout.write(f'percentage of outlier peaks: {len(a[a.distance.isna()])/len(a) * 100} %\n' |
| 52 | + # ) |
| 53 | + |
| 54 | + # add the new test best pwm |
| 55 | + path = f'results/{runtype}/{sample}/motif_analysis_final/peak_center/motif_enrichment.tsv' |
| 56 | + a = pd.read_csv(path, sep='\t', header=0, index_col=0) |
| 57 | + b = a.loc[a['mean_enrichment'][a['motif'].str.contains('trimmed')].idxmax(), 'motif'] |
| 58 | + run = b.split('_')[-1].split('.')[0] |
| 59 | + name = path.split('/')[-4] |
| 60 | + path_new = path.replace( |
| 61 | + 'motif_analysis_final', |
| 62 | + f'motif_analysis_crossvalidation/{run}').replace('motif_enrichment.tsv', f'wlib/{b}') |
| 63 | + shutil.copy(path_new, f'wlib/PUM2_K562_ENCSR661ICQ_2') |
| 64 | + |
| 65 | + df = pd.DataFrame() |
| 66 | + for path1 in glob.glob('wlib/*'): |
| 67 | + name1 = path1.split('/')[-1] |
| 68 | + for path2 in glob.glob('wlib/*'): |
| 69 | + name2 = path2.split('/')[-1] |
| 70 | + sim = getsimilarity(path1, path2) |
| 71 | + df.loc[name1, name2] = sim |
| 72 | + mean_sim = np.mean([df.loc['PUM2_K562_ENCSR661ICQ_2', :].mean(), |
| 73 | + df.loc[:, 'PUM2_K562_ENCSR661ICQ_2'].mean()]) |
| 74 | + sys.stdout.write(f'''{runtype.replace("GN","Genomic").replace("TR","Transcriptomic")} {"-".join(sample.split("_")[-2:])} evaluation\n''') |
| 75 | + sys.stdout.write(f'motif similarity: {mean_sim}\n') |
| 76 | + if runtype == "GN": |
| 77 | + sys.stdout.write(f'rmsd of crosslink centers: {rmsd}\n') |
| 78 | + sys.stdout.write(f'percentage of outlier peaks: {outlier_percentage} % \n') |
| 79 | + |
| 80 | + |
| 81 | + if mean_sim > 0.75: |
| 82 | + sys.stdout.write('Motif similarity is high.Test passed 1/3\n') |
| 83 | + else: |
| 84 | + sys.stdout.write('Motif similarity seems to be low 1/3\n') |
| 85 | + if runtype == "GN": |
| 86 | + if rmsd < 1.5: |
| 87 | + sys.stdout.write('Rmsd is low. Peaks are highly overlapping. Test passed. 2/3\n') |
| 88 | + else: |
| 89 | + sys.stdout.write('Rmsd seems to be too high. Peaks are far apart. 2/3\n') |
| 90 | + if outlier_percentage < 5: |
| 91 | + sys.stdout.write('Few outliers detected. Test passed. 3/3\n') |
| 92 | + else: |
| 93 | + sys.stdout.write('Too many peaks are not found in the test runs 3/3\n') |
| 94 | + sys.stdout.write('\n') |
85 | 95 | return
|
86 | 96 |
|
87 | 97 |
|
88 |
| - |
89 | 98 | def getSimilarityScore(wm1, wm2):
|
90 | 99 | wm1 = np.array(wm1)
|
91 | 100 | wm2 = np.array(wm2)
|
@@ -119,7 +128,7 @@ def get_wm(path):
|
119 | 128 | def getsimilarity(wm1_path, wm2_path):
|
120 | 129 | wm1 = get_wm(wm1_path)
|
121 | 130 | wm2 = get_wm(wm2_path)
|
122 |
| - similarity = ( |
| 131 | + similarity = ( |
123 | 132 | (2 * getSimilarityScore(wm1, wm2)) / (
|
124 | 133 | getSimilarityScore(wm1, wm1) + getSimilarityScore(wm2, wm2)))
|
125 | 134 | return similarity
|
|
0 commit comments