Skip to content

Commit d638d2c

Browse files
authored
feat: support single-stranded clip data (#16)
* feat: add single-strand preprocessing subworkflow
* feat: single and paired-end implementation
* feat: addition of testing for single, paired end and TR in addition to GN
* feat: remove paired specification from config and infer from input
1 parent d805807 commit d638d2c

14 files changed

+5136
-4316
lines changed

config.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@
105105
# if file name is ENCFF041KJT.fastq.gz then add ENCFF041KJT
106106
# files are assumed to be in fastq.gz format
107107
mate2: "ENCFF462SCV",
108-
# if file name is ENCFF041KJT.fastq.gz then add ENCFF041KJT
108+
# if file name is ENCFF462SCV.fastq.gz then add ENCFF462SCV
109109
# files are assumed to be in fastq.gz format.
110110
paired: 2,
111111
# leave as is. used for future support of single-end data

test/test_singularity_execution/config.yaml

+15-7
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929

3030
# ------------------------------------------
3131
# Experiment specific info - REQUIRED
32-
EXPERIMENT_SET : ["PUM2_K562_ENCSR661ICQ_2"]
32+
EXPERIMENT_SET : ["PUM2_K562_ENCSR661ICQ_2_paired_end", "PUM2_K562_ENCSR661ICQ_2_single_end"]
3333

3434
# Experiment specific info - REQUIRED
3535
# ***Note: an experiment refers to the foreground (replicates) and background (smis) samples
@@ -38,22 +38,31 @@
3838
# as one sample (This functionality is useful for treating technical replicates as one sample)
3939
# Make sure that the samples you merge have the same sample features (e.g., paired, sense, dup_type)
4040
# Choose an informative experiment name
41-
PUM2_K562_ENCSR661ICQ_2: {
41+
PUM2_K562_ENCSR661ICQ_2_paired_end: {
4242
rbp: "PUM2",
4343
replicates: ["ENCFF041KJT_ENCFF462SCV"],
4444
smis: ["ENCFF616FCF_ENCFF495ZPY"],
4545
window_f: 300,
4646
window_b: 300,
4747
step_size: 150,
4848
background_type: "standard"}
49+
PUM2_K562_ENCSR661ICQ_2_single_end: {
50+
rbp: "PUM2",
51+
replicates: ["ENCFF462SCV"],
52+
smis: ["ENCFF495ZPY"],
53+
window_f: 300,
54+
window_b: 300,
55+
step_size: 150,
56+
background_type: "standard"}
4957

5058
# Sample specific info - REQUIRED
51-
ENCFF041KJT_ENCFF462SCV: { mate1: "ENCFF041KJT.chr20", mate2: "ENCFF462SCV.chr20", paired: 2, sense: 2, format: "encode", dup_type: "umis", mate1_3p: "../input_files_test/mate1_3p.fasta", mate1_5p: "../input_files_test/mate1_5p.fasta", mate2_3p: "../input_files_test/mate2_3p.fasta", mate2_5p: "../input_files_test/mate2_5p.fasta" }
52-
ENCFF616FCF_ENCFF495ZPY: { mate1: "ENCFF616FCF.chr20", mate2: "ENCFF495ZPY.chr20", paired: 2, sense: 2, format: "encode", dup_type: "umis", mate1_3p: "../input_files_test/mate1_3p.fasta", mate1_5p: "../input_files_test/mate1_5p.fasta", mate2_3p: "../input_files_test/mate2_3p.fasta", mate2_5p: "../input_files_test/mate2_5p.fasta" }
53-
59+
ENCFF462SCV: { mate1: "ENCFF462SCV.chr20", sense: 1, format: "encode", dup_type: "umis", mate1_3p: "../input_files_test/mate2_3p.fasta", mate1_5p: "../input_files_test/mate2_5p.fasta" }
60+
ENCFF495ZPY: { mate1: "ENCFF495ZPY.chr20", sense: 1, format: "encode", dup_type: "umis", mate1_3p: "../input_files_test/mate2_3p.fasta", mate1_5p: "../input_files_test/mate2_5p.fasta" }
61+
ENCFF041KJT_ENCFF462SCV: { mate1: "ENCFF041KJT.chr20", mate2: "ENCFF462SCV.chr20", sense: 2, format: "encode", dup_type: "umis", mate1_3p: "../input_files_test/mate1_3p.fasta", mate1_5p: "../input_files_test/mate1_5p.fasta", mate2_3p: "../input_files_test/mate2_3p.fasta", mate2_5p: "../input_files_test/mate2_5p.fasta" }
62+
ENCFF616FCF_ENCFF495ZPY: { mate1: "ENCFF616FCF.chr20", mate2: "ENCFF495ZPY.chr20", sense: 2, format: "encode", dup_type: "umis", mate1_3p: "../input_files_test/mate1_3p.fasta", mate1_5p: "../input_files_test/mate1_5p.fasta", mate2_3p: "../input_files_test/mate2_3p.fasta", mate2_5p: "../input_files_test/mate2_5p.fasta" }
5463
# ----------------------------------------------------------------------------------------------
5564
# RCRUNCH specific options - REQUIRED - DEFAULT VALUES SUGGESTED
56-
method_types : ["GN"] # options: GN, TR if you want BOTH methods, you can specify ['GN', 'TR']
65+
method_types : ["GN", "TR"] # options: GN, TR if you want BOTH methods, you can specify ['GN', 'TR']
5766
# This will create both transcriptomic and genomic predictions
5867
multimappers: 1 # change if you want to accept reads that map to multiple regions
5968
# (the number indicates the number of regions a read can map to at
@@ -65,7 +74,6 @@
6574

6675

6776
## RCRUNCH specific options - DEFAULTS - Use with care
68-
seq_type: "pe" # paired end - currently only paired-end seq are allowed
6977
motif_lengths: [6, 8] # Motif sizes to be considered.
7078
peak_center : ["crosslink", "peak_center"] # whether to use 'crosslink' (position within where most reads start)
7179
# or 'peak_center' as the center of the peak

test/test_singularity_execution/performance_evaluation.py

+81-72
Original file line numberDiff line numberDiff line change
@@ -7,85 +7,94 @@
77
import re
88
import io
99

10+
1011
def main():
1112
""" Main function """
1213
__doc__ = "Evaluate the run based on expected values"
1314
outpath = os.path.dirname(os.path.realpath(__file__))
1415
os.chdir(outpath)
15-
a = pd.read_csv(
16-
'results/GN/PUM2_K562_ENCSR661ICQ_2/PUM2_K562_ENCSR661ICQ_2_total_peaks.csv',
17-
sep='\t', header=0, index_col=None)
18-
expected = pd.read_csv(
19-
'means_crosslink.tsv',
20-
sep='\t', header=0, index_col=None)
21-
a['center'] = a['start'] + a['crosslink']
22-
23-
for index, row in a.iterrows():
24-
distance = (
25-
row['center'] - expected['crosslink'][expected['strand'] == row['strand']]).abs().min()
26-
if distance <= 50:
27-
a.loc[index, 'distance'] = distance
28-
else:
29-
a.loc[index, 'distance'] = np.nan
30-
rmsd = np.sqrt(np.sum((a["distance"]**2)) / len(a[~a.distance.isna()]))
31-
outlier_percentage = len(a[a.distance.isna()])/len(a) * 100
32-
33-
34-
# expected = pd.read_csv(
35-
# 'means_peak_center.tsv',
36-
# sep='\t', header=0, index_col=None)
37-
# a['center'] = a['start'] + a['mi']
38-
39-
# for index, row in a.iterrows():
40-
# distance = (row['center'] - expected['peak_center'][expected['strand'] == row['strand']]).abs().min()
41-
# if distance <= 50:
42-
# a.loc[index, 'distance'] = distance
43-
# else:
44-
# a.loc[index, 'distance'] = np.nan
45-
# sys.stdout.write(f'rmsd of peak_center as center: {np.sqrt(np.sum((a["distance"]**2)) / len(a[~a.distance.isna()]))}\n'
46-
# )
47-
# sys.stdout.write(f'percentage of outlier peaks: {len(a[a.distance.isna()])/len(a) * 100} %\n'
48-
# )
49-
50-
# add the new test best pwm
51-
path = f'results/GN/PUM2_K562_ENCSR661ICQ_2/motif_analysis_final/peak_center/motif_enrichment.tsv'
52-
a = pd.read_csv(path, sep='\t', header=0, index_col=0)
53-
b = a.loc[a['mean_enrichment'][a['motif'].str.contains('trimmed')].idxmax(), 'motif']
54-
run = b.split('_')[-1].split('.')[0]
55-
name = path.split('/')[-4]
56-
path_new = path.replace(
57-
'motif_analysis_final',
58-
f'motif_analysis_crossvalidation/{run}').replace('motif_enrichment.tsv', f'wlib/{b}')
59-
shutil.copy(path_new, f'wlib/{name}')
60-
61-
df = pd.DataFrame()
62-
for path1 in glob.glob('wlib/*'):
63-
name1 = path1.split('/')[-1]
64-
for path2 in glob.glob('wlib/*'):
65-
name2 = path2.split('/')[-1]
66-
sim = getsimilarity(path1, path2)
67-
df.loc[name1, name2] = sim
68-
mean_sim = np.mean([df.loc['PUM2_K562_ENCSR661ICQ_2', :].mean(), df.loc[:, 'PUM2_K562_ENCSR661ICQ_2'].mean()])
69-
70-
sys.stdout.write(f'rmsd of crosslink centers: {rmsd}\n')
71-
sys.stdout.write(f'percentage of outlier peaks: {outlier_percentage} % \n')
72-
sys.stdout.write(f'motif similarity: {mean_sim}\n')
73-
if rmsd < 1.5:
74-
sys.stdout.write('Rmsd is low. Test passed. 1/3\n')
75-
else:
76-
sys.stdout.write('Rmsd seems to be too high 1/3\n')
77-
if outlier_percentage < 5:
78-
sys.stdout.write('Few outliers detected. Test passed. 2/3\n')
79-
else:
80-
sys.stdout.write('Rmsd seems to be too high 2/3\n')
81-
if mean_sim > 0.75:
82-
sys.stdout.write('Motif similarity is high.Test passed 3/3\n')
83-
else:
84-
sys.stdout.write('Similarity seems to be low 3/3\n')
16+
for runtype in ['GN', 'TR']:
17+
for sample in ["PUM2_K562_ENCSR661ICQ_2_paired_end", "PUM2_K562_ENCSR661ICQ_2_single_end"]:
18+
a = pd.read_csv(
19+
f'results/{runtype}/{sample}/{sample}_total_peaks.csv',
20+
sep='\t', header=0, index_col=None)
21+
expected = pd.read_csv(
22+
'means_crosslink.tsv',
23+
sep='\t', header=0, index_col=None)
24+
a['center'] = a['start'] + a['crosslink']
25+
26+
for index, row in a.iterrows():
27+
distance = (
28+
row['center'] - expected['crosslink'][expected['strand'] == row['strand']]).abs().min()
29+
if distance <= 50:
30+
a.loc[index, 'distance'] = distance
31+
else:
32+
a.loc[index, 'distance'] = np.nan
33+
if runtype == "GN":
34+
rmsd = np.sqrt(np.sum((a["distance"]**2)) / len(a[~a.distance.isna()]))
35+
outlier_percentage = len(a[a.distance.isna()])/len(a) * 100
36+
37+
38+
# expected = pd.read_csv(
39+
# 'means_peak_center.tsv',
40+
# sep='\t', header=0, index_col=None)
41+
# a['center'] = a['start'] + a['mi']
42+
43+
# for index, row in a.iterrows():
44+
# distance = (row['center'] - expected['peak_center'][expected['strand'] == row['strand']]).abs().min()
45+
# if distance <= 50:
46+
# a.loc[index, 'distance'] = distance
47+
# else:
48+
# a.loc[index, 'distance'] = np.nan
49+
# sys.stdout.write(f'rmsd of peak_center as center: {np.sqrt(np.sum((a["distance"]**2)) / len(a[~a.distance.isna()]))}\n'
50+
# )
51+
# sys.stdout.write(f'percentage of outlier peaks: {len(a[a.distance.isna()])/len(a) * 100} %\n'
52+
# )
53+
54+
# add the new test best pwm
55+
path = f'results/{runtype}/{sample}/motif_analysis_final/peak_center/motif_enrichment.tsv'
56+
a = pd.read_csv(path, sep='\t', header=0, index_col=0)
57+
b = a.loc[a['mean_enrichment'][a['motif'].str.contains('trimmed')].idxmax(), 'motif']
58+
run = b.split('_')[-1].split('.')[0]
59+
name = path.split('/')[-4]
60+
path_new = path.replace(
61+
'motif_analysis_final',
62+
f'motif_analysis_crossvalidation/{run}').replace('motif_enrichment.tsv', f'wlib/{b}')
63+
shutil.copy(path_new, f'wlib/PUM2_K562_ENCSR661ICQ_2')
64+
65+
df = pd.DataFrame()
66+
for path1 in glob.glob('wlib/*'):
67+
name1 = path1.split('/')[-1]
68+
for path2 in glob.glob('wlib/*'):
69+
name2 = path2.split('/')[-1]
70+
sim = getsimilarity(path1, path2)
71+
df.loc[name1, name2] = sim
72+
mean_sim = np.mean([df.loc['PUM2_K562_ENCSR661ICQ_2', :].mean(),
73+
df.loc[:, 'PUM2_K562_ENCSR661ICQ_2'].mean()])
74+
sys.stdout.write(f'''{runtype.replace("GN","Genomic").replace("TR","Transcriptomic")} {"-".join(sample.split("_")[-2:])} evaluation\n''')
75+
sys.stdout.write(f'motif similarity: {mean_sim}\n')
76+
if runtype == "GN":
77+
sys.stdout.write(f'rmsd of crosslink centers: {rmsd}\n')
78+
sys.stdout.write(f'percentage of outlier peaks: {outlier_percentage} % \n')
79+
80+
81+
if mean_sim > 0.75:
82+
sys.stdout.write('Motif similarity is high.Test passed 1/3\n')
83+
else:
84+
sys.stdout.write('Motif similarity seems to be low 1/3\n')
85+
if runtype == "GN":
86+
if rmsd < 1.5:
87+
sys.stdout.write('Rmsd is low. Peaks are highly overlapping. Test passed. 2/3\n')
88+
else:
89+
sys.stdout.write('Rmsd seems to be too high. Peaks are far apart. 2/3\n')
90+
if outlier_percentage < 5:
91+
sys.stdout.write('Few outliers detected. Test passed. 3/3\n')
92+
else:
93+
sys.stdout.write('Too many peaks are not found in the test runs 3/3\n')
94+
sys.stdout.write('\n')
8595
return
8696

8797

88-
8998
def getSimilarityScore(wm1, wm2):
9099
wm1 = np.array(wm1)
91100
wm2 = np.array(wm2)
@@ -119,7 +128,7 @@ def get_wm(path):
119128
def getsimilarity(wm1_path, wm2_path):
    """Return the normalised similarity of two weight matrices given by path.

    Both matrices are loaded with ``get_wm``. The result is twice the
    cross score of the two matrices divided by the sum of their two
    self scores, all computed with ``getSimilarityScore``.
    """
    first = get_wm(wm1_path)
    second = get_wm(wm2_path)
    # Evaluate the scores in the same order as the single-expression form:
    # cross score first, then the two self scores used for normalisation.
    cross_score = getSimilarityScore(first, second)
    self_first = getSimilarityScore(first, first)
    self_second = getSimilarityScore(second, second)
    return (2 * cross_score) / (self_first + self_second)

workflow/Snakefile

+11-15
Original file line numberDiff line numberDiff line change
@@ -46,20 +46,16 @@ rule finish:
4646
# experiment=config[config["EXPERIMENT_SET"][0]]["replicates"])
4747

4848

49-
50-
if config['seq_type'] == 'pe':
51-
if ('TR' in config['method_types']) & ('GN' in config['method_types']):
52-
include: 'rules/rcrunch_preprocessing_pe.smk'
53-
include: 'rules/rcrunch_genomic_pe.smk'
54-
include: 'rules/rcrunch_transcriptomic_pe.smk'
55-
include: 'rules/rcrunch_motif_analysis.smk'
56-
elif 'TR' in config['method_types']:
57-
include: 'rules/rcrunch_preprocessing_pe.smk'
58-
include: 'rules/rcrunch_transcriptomic_pe.smk'
59-
include: 'rules/rcrunch_motif_analysis.smk'
60-
elif 'GN' in config['method_types']:
61-
include: 'rules/rcrunch_preprocessing_pe.smk'
62-
include: 'rules/rcrunch_genomic_pe.smk'
63-
include: 'rules/rcrunch_motif_analysis.smk'
49+
include: 'rules/rcrunch_preprocessing_pe.smk'
50+
include: 'rules/rcrunch_preprocessing_se.smk'
51+
include: 'rules/rcrunch_preprocessing.smk'
52+
include: 'rules/rcrunch_genomic_pe.smk'
53+
include: 'rules/rcrunch_genomic_se.smk'
54+
include: 'rules/rcrunch_genomic.smk'
55+
include: 'rules/rcrunch_transcriptomic.smk'
56+
include: 'rules/rcrunch_transcriptomic_pe.smk'
57+
include: 'rules/rcrunch_transcriptomic_se.smk'
58+
include: 'rules/rcrunch_motif_analysis.smk'
59+
include: 'rules/common.smk'
6460

6561

workflow/rules/common.smk

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
def get_mates(sample):
    """Return the sequencing layout of a sample: "pe" or "se".

    The layout is inferred from the workflow ``config``: a sample whose
    entry has a non-empty ``mate2`` value is paired-end ("pe"). A missing
    sample entry, a missing ``mate2`` key or an empty ``mate2`` value all
    yield single-end ("se").
    """
    try:
        second_mate = config[sample]['mate2']
    except KeyError:
        # Either the sample or its 'mate2' key is absent: single-end.
        return "se"
    return "pe" if second_mate else "se"
10+
11+
def get_mates_number(sample):
    """Return the number of mates of a sample as a string: '2' or '1'.

    A sample whose ``config`` entry has a non-empty ``mate2`` value has
    two mates. A missing sample entry, a missing ``mate2`` key or an
    empty ``mate2`` value all yield '1'.
    """
    try:
        second_mate = config[sample]['mate2']
    except KeyError:
        # Either the sample or its 'mate2' key is absent: one mate only.
        return '1'
    return '2' if second_mate else '1'
20+
21+
def get_library_type(sample, sense):
    """Return the library type code for a sample.

    The code combines the sequencing layout, inferred from ``config``,
    with the ``sense`` setting:

    * paired-end (non-empty ``mate2``): "ISF" for sense 1, "ISR" for sense 2
    * single-end (no or empty ``mate2``, or unknown sample): "SF" for
      sense 1, "SR" for sense 2

    NOTE(review): these look like Salmon-style library type strings;
    confirm against the rules that consume the value.

    Args:
        sample: key of the sample entry in the workflow ``config``.
        sense: 1 or 2, as int, float or numeric string (it is passed
            through ``int``).

    Returns:
        One of "ISF", "ISR", "SF", "SR".

    Raises:
        ValueError: if ``sense`` is not numeric, or is a number other
            than 1 or 2. An unsupported number previously fell through
            and returned ``None`` silently.
    """
    try:
        paired = bool(config[sample]['mate2'])
    except KeyError:
        # Missing sample entry or missing 'mate2' key: single-end.
        paired = False

    orientation = {'1': 'F', '2': 'R'}.get(str(int(sense)))
    if orientation is None:
        raise ValueError(
            f"Unsupported sense value {sense!r} for sample {sample!r}; "
            "expected 1 or 2")
    return ('IS' if paired else 'S') + orientation
38+

0 commit comments

Comments
 (0)