-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathcmds.txt
288 lines (170 loc) · 26.4 KB
/
cmds.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
zcat all_SRA_introns.tsv.gz | head | perl -ne 'BEGIN { $counter=0;} chomp; ($c,$s,$e,$st,$s1,$s2,$f1,$f2)=split(/\t/,$_); @f1=split(/,/,$f1); @f2=split(/,/,$f2); $score1=scalar @f1; $score2=scalar @f2; $st=($st eq "-"?18:2); $size=($e-$s)+1; $cigar=length("$f1:$f2"); print "i$counter\t$st\t$c\t$s\t$score2\t$cigar"."M\t*\t0\t$size\t$f1:$f2\t$f1:$f2\n"; $counter++;' >> t1
zcat all_SRA_introns.tsv.gz | perl -ne 'BEGIN { $counter=0;} chomp; ($c,$s,$e,$st,$s1,$s2,$f1,$f2)=split(/\t/,$_); @f1=split(/,/,$f1); @f2=split(/,/,$f2); $score1=scalar @f1; $score2=scalar @f2; $st=($st eq "-"?18:2); $size=($e-$s)+1; $cigar=length("$f1:$f2"); print "i$counter\t$st\t$c\t$s\t$score2\t$cigar"."M\t*\t0\t$size\t$f1:$f2\t$f1:$f2\n"; $counter++;' >> t1
#sums and counts last 2 columns
cat all_SRA_introns.tsv.100000 | perl -ne 'chomp; $s=$_; @f=split(/\t/,$_); @f1=split(/,/,$f[6]); @f2=split(/,/,$f[7]); $f1=scalar @f1; $f2=scalar @f2; pop(@f); pop(@f); $s=join("\t",@f); $s1=0; $s2=0; map { $s1+=$_; } @f1; map { $s2+=$_; } @f2; print "$s\t$f1\t$f2\t$s1\t$s2\n";'
cat all_SRA_introns_ids.tsv | perl -ne 'BEGIN {$data_source_id=0;} chomp; $s=$_; @f=split(/\t/,$s); @f1=split(/,/,$f[7]); @f2=split(/,/,$f[8]); $f1=scalar @f1; $f2=scalar @f2; $s2=0; map { $s2+=$_; } @f2; $avg=$s2/$f2; $median=-1; $m_=$f2/2; $m_=int($m_); @f2a=sort {$a<=>$b} @f2; if(($f2 % 2)==0) { $m2_=$f2/2; $m2_-=1; $median=($f2a[$m_]+$f2a[$m2_])/2; } else { $median=$f2a[$m_]; } printf("$s\t$f1\t$f2\t$s2\t%.3f\t%.3f\n",$avg,$median,$data_source_id);'
time python ./joiner.py 'chr6:1-10000000|samples_count_i:5 AND chromosome_s:chr6' > t1
time python ./joiner.py 'chr6:1-10000000|samples_count_i:5' > t1
time python ./joiner.py 'chr6:1-10000000|samples_count_i:[5 TO 200000] AND chromosome_s:chr6' > t1
curl 'http://10.161.159.186:1555/snaptron?rquery=chr6$1-10000000*samples_countEQ5ANDcoverage_sumGT5*' | wc -l
time curl 'http://10.161.159.186:1555/snaptron?rquery=chr6$1-10000000*samples_countEQ5*' | wc -l
time curl 'http://10.161.159.186:1555/snaptron?rquery=chr6$1-10000000**' | wc -l
time python ./snaptron.py 'chr11:82970135-82997450|samples_count>=100,coverage_sum>=1000|' > /data2/gigatron2/ccdc90b.range2.sc100_cs1000
python ./snaptron_new.py 'regions=chr11:82970135-82997450&rfilter=samples_count>:100&rfilter=coverage_sum>:1000' | wc -l
curl --data 'fields="[{"intervals":["chr11:82970135-82997450"],"samples_count":[{"op":">:","val":100}],"coverage_sum":[{"op":">:","val":1000}]}]"' http://stingray.cs.jhu.edu:8443/snaptron | wc -l
source /data/gigatron/snaptron/python/bin/activate
./snaptron_server --no-daemon
curl "http://128.220.35.129:8443/snaptron?" --data-urlencode "chr6:1-10000000|samples_count=5|"
curl -vv -G "http://128.220.35.129:8443/snaptron/?rquery=chr6:1-10000000|samples_countEQ5|"
curl -vv -G "http://stingray.cs.jhu.edu:8443/snaptron/?rquery=chr6:1-10000000|samples_countEQ5|"
curl http://stingray.cs.jhu.edu:8443/snaptron/?rquery='chr6:1-10000000|samples_count=5||'
#REL finding project
cat all_SRA_introns_ids_stats_length_annotated.tsv.gtf | head -50 | perl -ne 'chomp; $s=$_; @f=split(/\t/,$s); $iid=$f[0]; @s1=split(/,/,$f[12]); @s2=split(/,/,$f[13]); $i=0; foreach $sample (@s1) { $samples{$sample}->[0]++; $samples{$sample}->[1]+=$s2[$i]; $i++; print STDERR "$sample\t$iid\n"; } END { foreach $sample (sort {$a <=> $b} keys %samples) { ($count,$cov) = @{$samples{$sample}}; print "$sample\t$count\t$cov\n";}}'
python find_gene-rp_overlaps.py knownGene_28-Jun-2015.clusterid.tsv ucsc_repeatmasker_2015_12_22.bed.sorted.gtf.no_chr_prefix all_SRA_introns_ids_stats_length_annotated.tsv.gtf all_SRA_introns_ids_stats_length_annotated.tsv.gtf.by_sample_count_coverate.tsv > annotated_repeated_introns.strands.tsv.new 2> errors &
cat all_illumina_sra_for_human_ids.tsv | perl -ne 'chomp; @f=split(/\t/,$_); $read=join("\t",($f[11],$f[12],$f[28])); $library=join("\t",($f[22],$f[23],$f[24],$f[25],$f[26],$f[27])); $instrument=$f[31]; $s=join("\t",($f[0],$library,$instrument,$read)); print "$s\n";' > sarvens/relevant_sample_md_fields.tsv
curl http://stingray.cs.jhu.edu:8443/snaptron/?rquery='chr6:1-10000000|samples_count=5||'
curl --data 'fields="[{"intervals":["chr6:1-10000000"],"samples_count":[{"op":"=","val":5}]}]"' http://stingray.cs.jhu.edu:8443/snaptron
python snaptron.py '"[{"intervals":["chr6:1-10000000"],"samples_count":[{"op":"=","val":5}]}]"'
python snample.py "sfilter=description:cortex" 1
zcat all_SRA_introns.tsv.gz | head | perl -ne 'BEGIN { $counter=0;} chomp; ($c,$s,$e,$st,$s1,$s2,$f1,$f2)=split(/\t/,$_); @f1=split(/,/,$f1); @f2=split(/,/,$f2); $score1=scalar @f1; $score2=scalar @f2; $st=($st eq "-"?18:2); $size=($e-$s)+1; $cigar=length("$f1:$f2"); print "i$counter\t$st\t$c\t$s\t$score2\t$cigar"."M\t*\t0\t$size\t$f1:$f2\t$f1:$f2\n"; $counter++;' >> t1
zcat all_SRA_introns.tsv.gz | perl -ne 'BEGIN { $counter=0;} chomp; ($c,$s,$e,$st,$s1,$s2,$f1,$f2)=split(/\t/,$_); @f1=split(/,/,$f1); @f2=split(/,/,$f2); $score1=scalar @f1; $score2=scalar @f2; $st=($st eq "-"?18:2); $size=($e-$s)+1; $cigar=length("$f1:$f2"); print "i$counter\t$st\t$c\t$s\t$score2\t$cigar"."M\t*\t0\t$size\t$f1:$f2\t$f1:$f2\n"; $counter++;' >> t1
#sums and counts last 2 columns
cat all_SRA_introns.tsv.100000 | perl -ne 'chomp; $s=$_; @f=split(/\t/,$_); @f1=split(/,/,$f[6]); @f2=split(/,/,$f[7]); $f1=scalar @f1; $f2=scalar @f2; pop(@f); pop(@f); $s=join("\t",@f); $s1=0; $s2=0; map { $s1+=$_; } @f1; map { $s2+=$_; } @f2; print "$s\t$f1\t$f2\t$s1\t$s2\n";'
cat all_SRA_introns_ids.tsv | perl -ne 'chomp; $s=$_; @f=split(/\t/,$s); @f1=split(/,/,$f[7]); @f2=split(/,/,$f[8]); $f1=scalar @f1; $f2=scalar @f2; $s2=0; map { $s2+=$_; } @f2; $avg=$s2/$f2; $median=-1; $m_=$f2/2; $m_=int($m_); @f2a=sort {$a<=>$b} @f2; if(($f2 % 2)==0) { $m2_=$f2/2; $m2_-=1; $median=($f2a[$m_]+$f2a[$m2_])/2; } else { $median=$f2a[$m_]; } printf("$s\t$f1\t$f2\t$s2\t%.3f\t%.3f\n",$avg,$median);'
time python ./joiner.py 'chr6:1-10000000|samples_count_i:5 AND chromosome_s:chr6' > t1
time python ./joiner.py 'chr6:1-10000000|samples_count_i:5' > t1
time python ./joiner.py 'chr6:1-10000000|samples_count_i:[5 TO 200000] AND chromosome_s:chr6' > t1
curl 'http://10.161.159.186:1555/snaptron?rquery=chr6$1-10000000*samples_countEQ5ANDcoverage_sumGT5*' | wc -l
time curl 'http://10.161.159.186:1555/snaptron?rquery=chr6$1-10000000*samples_countEQ5*' | wc -l
time curl 'http://10.161.159.186:1555/snaptron?rquery=chr6$1-10000000**' | wc -l
time python ./snaptron.py 'chr11:82970135-82997450|samples_count>=100,coverage_sum>=1000|' > /data2/gigatron2/ccdc90b.range2.sc100_cs1000
python ./snaptron_new.py 'regions=chr11:82970135-82997450&rfilter=samples_count>:100&rfilter=coverage_sum>:1000' | wc -l
curl --data 'fields="[{"intervals":["chr11:82970135-82997450"],"samples_count":[{"op":">:","val":100}],"coverage_sum":[{"op":">:","val":1000}]}]"' http://stingray.cs.jhu.edu:8443/snaptron | wc -l
source /data/gigatron/snaptron/python/bin/activate
./snaptron_server --no-daemon
curl "http://128.220.35.129:8443/snaptron?" --data-urlencode "chr6:1-10000000|samples_count=5|"
curl -vv -G "http://128.220.35.129:8443/snaptron/?rquery=chr6:1-10000000|samples_countEQ5|"
curl -vv -G "http://stingray.cs.jhu.edu:8443/snaptron/?rquery=chr6:1-10000000|samples_countEQ5|"
curl http://stingray.cs.jhu.edu:8443/snaptron/?rquery='chr6:1-10000000|samples_count=5||'
#REL finding project
cat all_SRA_introns_ids_stats_length_annotated.tsv.gtf | head -50 | perl -ne 'chomp; $s=$_; @f=split(/\t/,$s); $iid=$f[0]; @s1=split(/,/,$f[12]); @s2=split(/,/,$f[13]); $i=0; foreach $sample (@s1) { $samples{$sample}->[0]++; $samples{$sample}->[1]+=$s2[$i]; $i++; print STDERR "$sample\t$iid\n"; } END { foreach $sample (sort {$a <=> $b} keys %samples) { ($count,$cov) = @{$samples{$sample}}; print "$sample\t$count\t$cov\n";}}'
python find_gene-rp_overlaps.py knownGene_28-Jun-2015.clusterid.tsv ucsc_repeatmasker_2015_12_22.bed.sorted.gtf.no_chr_prefix all_SRA_introns_ids_stats_length_annotated.tsv.gtf all_SRA_introns_ids_stats_length_annotated.tsv.gtf.by_sample_count_coverate.tsv > annotated_repeated_introns.strands.tsv.new 2> errors &
cat all_illumina_sra_for_human_ids.tsv | perl -ne 'chomp; @f=split(/\t/,$_); $read=join("\t",($f[11],$f[12],$f[28])); $library=join("\t",($f[22],$f[23],$f[24],$f[25],$f[26],$f[27])); $instrument=$f[31]; $s=join("\t",($f[0],$library,$instrument,$read)); print "$s\n";' > sarvens/relevant_sample_md_fields.tsv
curl http://stingray.cs.jhu.edu:8443/snaptron/?rquery='chr6:1-10000000|samples_count=5||'
curl --data 'fields="[{"intervals":["chr6:1-10000000"],"samples_count":[{"op":"=","val":5}]}]"' http://stingray.cs.jhu.edu:8443/snaptron
python snaptron.py '"[{"intervals":["chr6:1-10000000"],"samples_count":[{"op":"=","val":5}]}]"'
python snample.py "sfilter=description:cortex" 1
#redo creation of individially sorted indices including length
cat all_SRA_introns_ids_stats.tsv.new2_w_header_w_sourcedb | perl -ne 'chomp; $s=$_; next if($s=~/gigatron_id/); print "1\t$s\n";' | sort -s -k6,6n | tee by_length | sort -s -k15,15n | tee by_sample_count | sort -s -k16,16n | tee by_coverage_sum | sort -s -k17,17n | tee by_coverage_avg | sort -s -k18,18n > by_coverage_m
cut -f1,2,3,5 all_illumina_sra_for_human_ids.tsv | perl -ne 'chomp; ($id,$srr,$srs,$srp)=split(/\t/,$_); $ids{$id}=[$srr,$srs,$srp]; push(@{$srps{$srp}},$id); push(@{$srss{$srs}},$id); END { $idx=0; foreach $srp (sort {$a cmp $b} keys %srps) { @i=@{$srps{$srp}}; $ic = scalar @i; $srps{$srp}=[$ic,$idx++]; } $idx=0; foreach $srs (sort {$a cmp $b} keys %srss) { @j=@{$srss{$srs}}; $jc = scalar @j; $srss{$srs}=[$jc,$idx++]; } foreach $id (sort {$a <=> $b} keys %ids) { next if($id=~/intropolis_sample_id_i/); ($srr,$srs,$srp)=@{$ids{$id}}; ($srp_sz,$srp_id)=@{$srps{$srp}}; ($srs_sz,$srs_id)=@{$srss{$srs}}; print "$id\t$srs_sz\t$srp_sz\t$srp\t$srp_id\t$srs\t$srs_id\n";}}' > all_illumina_sra_for_human_ids.tsv.id_mapping
zcat /data2/gigatron2/all_SRA_introns_ids_stats.tsv.new2_w_sourcedb2.gz | perl add_stats.pl ../all_illumina_sra_for_human_ids.tsv.id_mapping2 | bgzip > /data2/gigatron2/all_SRA_introns_ids_stats.tsv.new2_w_sourcedb2_extended.gz
python snanalysis.py "ids_a=4&ids_b=5,6&compute=jir&ratio=cov&order=T:5"
zcat /data/gigatron/intropolis.v2.hg38.tsv.snaptron.bgzip | cut -f1,12 | perl -ne 'chomp; ($tid,$sid)=split(/\t/,$_); @sids=split(/,/,$sid); foreach $s (@sids) { print "$s\t$tid\n";}' | sort -k1,1 -n | perl -ne 'chomp; $s=$_; ($sid,$tid)=split(/\t/,$s); if(defined($pid) && $pid != $sid) { print "$pid\t"; foreach $s (sort keys %h) {print "$s,";} print "\n"; %h=();} $pid=$sid; $h{$tid}=1; END { if(defined($pid)) { print "$pid\t"; foreach $s (sort keys %h) {print "$s,";} print "\n";}}' | bzip2 > intropolis.v2.hg38.tsv.snaptron.sample2intron_ids.bz2
cat /data2/gigatron2/all_illumina_sra_for_human_ids.tsv | perl -ne 'BEGIN { open(IN,"</data2/gigatron2/intropolis.idmap.v2.hg38.tsv"); %h; $h{'run_accession_s'}='intropolis_sample_id_i'; while($line=<IN>) {chomp($line); @f=split(/\t/,$line); $h{$f[4]}=$f[0]; } close(IN);} chomp; $s=$_; @f=split(/\t/,$s); $newid=$h{$f[1]}; shift @f; print "$newid\t".(join("\t",@f))."\n";' > /data2/gigatron2/first_half_illumina_sra_for_human_ids.v2.tsv
echo "SELECT run_accession,sample_accession,experiment_accession,study_accession,submission_accession,sra_ID,run_ID,run_alias,run_date,updated_date,spots,bases,run_center,experiment_name,run_attribute,experiment_ID,experiment_alias,experiment_title,study_name,sample_name,design_description,library_name,library_strategy,library_source,library_selection,library_layout,library_construction_protocol,read_spec,platform,instrument_model,platform_parameters,experiment_url_link,experiment_attribute,sample_ID,sample_alias,taxon_id,common_name,description,sample_url_link,sample_attribute,study_ID,study_alias,study_title,study_type,study_abstract,center_project_name,study_description,study_url_link,study_attribute,related_studies,primary_study,submission_ID,submission_comment,submission_center,submission_lab,submission_date,sradb_updated,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL FROM SRA;" | sqlite3 -cmd '.separator "\t"' SRAmetadb.sqlite > all_sra_related_md
cat all_illumina_sra_for_human_v2.tsv | perl -ne 'BEGIN { open(IN,"</data/gigatron/intropolis.idmap.v2.hg38.tsv"); %h; $h{'run_accession_s'}='intropolis_sample_id_i'; while($line=<IN>) {chomp($line); @f=split(/\t/,$line); $h{$f[4]}=$f[0]; } close(IN);} chomp; $s=$_; @f=split(/\t/,$s); $newid=$h{$f[1]}; shift @f; print "$newid\t".(join("\t",@f))."\n";' > illumina_sra_for_human_ids.v2.tsv
#get sample total junction counts
bzcat all_SRA_introns_ids_stats_length_annotated.tsv.gtf.sample2intron.tsv.0based.sorted.online.bz2 | perl -ne 'chomp; ($sid,$tids)=split(/\t/,$_); @tids=split(/,/,$tids); $c=scalar @tids; print "$sid\t$c\n";' > all_SRA_introns_ids_stats_length_annotated.tsv.gtf.sample2intron.tsv.0based.sorted.online.bz2.counts_per_sample
#infer types for lucene from raw metadata fields
cat /data2/gigatron2/gtex/gtex_pheno_table.w_ids.tsv | perl -ne 'chomp; $n++; @f=split(/\t/,$_); foreach $idx (0 .. (scalar @f)-1) { $type="t"; $type="f" if($f[$idx]=~/^-?\d+?\.?\d+$/); $type="i" if($f[$idx]=~/^-?\d+$/); $counts{$idx}->{$type}++; } END { foreach $idx (sort { $a<=>$b} keys %counts) { print "$idx"; foreach $t (sort {$counts{$idx}->{$b}<=>$counts{$idx}->{$a}} keys %{$counts{$idx}}) { print "\t$t,".$counts{$idx}->{$t}; } print "\n";}}' > /data2/gigatron2/gtex/gtex_pheno_table.w_ids.tsv.inferred_type_counts
#to find Sarven's ~3 genes with REL-exons
cat sarven_splices.tsv | perl -ne 'chomp; ($f,$t)=split(/\s+/,$_); $f1=$f; $f1=~s/chr\w+://; print "R:$f "; $f1=~s/-/\t/; $s=`python ./snaptron.py "regions=$f" 2>/dev/null| grep "$f1" | cut -f 15,16`; $t1=$t; $t1=~s/chr\w+://; $t1=~s/-/\t/; print "$s"; $s=""; print "R:$t "; $s=`python ./snaptron.py "regions=$t" 2>/dev/null | grep "$t1" | cut -f 15,16`; print "$s";'
curl "http://stingray.cs.jhu.edu:8090/srav1/snaptron?regions=chr11:82970135-82997450&rfilter=samples_count>:100&rfilter=coverage_sum>:1000&fields=snaptron_id"
curl "http://stingray.cs.jhu.edu:8090/srav1/density?bigwig_db=snps®ions=chr1:1-100000"
#direct link to custom track
http://genome.ucsc.edu/cgi-bin/hgTracks?db=hg18&position=chr21:33038447-33041505&hgct_customText=track%20type=bigBed%20name=myBigBedTrack%20description=%22a%20bigBed%20track%22%20visibility=full%20bigDataUrl=http://genome.ucsc.edu/goldenPath/help/examples/bigBedExample.bb
http://genome.ucsc.edu/cgi-bin/hgTracks?db=hg19&position=chr11:82970135-82997450&hgct_customText=name=snaptron%20description=snaptron_exported_splice_junctions%20visibility=full%20bigDataUrl=%22http://127.0.0.1:1300/snaptron?regions=chr11:82970135-82997450%26rfilter=samples_count>:100%26rfilter=coverage_sum>:1000%26return_format=1%22
#works
http://genome.ucsc.edu/cgi-bin/hgTracks?db=hg19&position=chr11:82970135-82997450&hgct_customText=http://stingray.cs.jhu.edu:8090/srav1/snaptron?regions=chr11:82970135-82997450%26rfilter=samples_count>:100%26rfilter=coverage_sum>:1000%26return_format=1%22
#filter/limit for gene models
zcat /data2/gigatron2/gensemrefg.hg19_annotations.sorted.gtf.gz | perl -ne 'chomp; @f=split(/\t/,$_); $f[5]=0.0; @f1=split(/;/,$f[8]); $boost=0; $boost=100000 if($f1[1]!~/"NA"/); @f2=split(/,/,$f1[2]); $s1=$f1[2]; print "$s1\n"; $f[5]=(scalar @f2)+$boost; print "".(join("\t",@f))."\n";' | sort -t" " -k6,6nr | head -10
#find the gaps in the sample id metadata
cut -f 1 data/illumina_sra_for_human_ids.v2.tsv | sort -n | perl -ne 'chomp; $s=$_; if($p) { $d=abs($s-$p); if($d>1) { print "$s-$p:\t"; for($i=$p+1; $i<=$p+($d-1); $i++) { print "$i,"; } print "\n"; } } $p=$s;' > data/illumina_sra_for_human_ids.v2.tsv.missing_sample_ids
#look for total top covered unannotated junctions (Ben's idea):
time python ./snaptron.py "rfilter=annotated:0,length<:10000" | sort -t' ' -k15,15nr -k16,16nr | bgzip > /data3/unannotated_short_srav1_sorted_samplescount_coverage.tsv.bgz
#client
python query_snaptron.py --query-file alk.tsv --function jir
python ./sbreakpoint.py '1271' > CCDC6_RET_breakpoint.tsv
python ./sbreakpoint.py 'CCDC6-RET' > CCDC6_RET_breakpoint.tsv
curl "http://stingray.cs.jhu.edu:8090/srav1/breakpoint?eml4-alk"
<<<<<<< Updated upstream
sqlite3 data/by_sample_ids.v2.sqlite3 "select * from by_sample_id;" | python pack_samples.py
#dump per-sample # of snaptron ids
sqlite3 ./by_sample_ids.sqlite3 "select * from by_sample_id;" | perl -ne 'chomp; ($sid,$sids)=split(/\|/,$_); @sids=split(/,/,$sids); print "$sid\t".scalar(@sids)."\n";' > /data3/snaptron/gtex_sample_snaptron_id_mapping_counts.tsv
#get lenths per intron, count,coverage per sample
zcat consolidated_gtex_junctions.tsv.snaptron.no_extra.gz | cut -f2,6,13,14 | perl -ne 'BEGIN { %h; } chomp; ($snid,$length,$samples,$covs)=split(/\t/,$_); print "$snid\t$length\n"; @samples_=split(/,/,$samples); @covs_=split(/,/,$covs); for($i=0;$i<scalar(@samples_);$i++) { if($covs_[$i] != 0) { $s=$samples_[$i]; $c=$covs_[$i]; $h{$s}->[0]+=1; $h{$s}->[1]+=$c;} } END { open(OUT,">/data3/snaptron/gtex_samples_counts_covs.tsv"); for $s (keys %h) { print OUT "$s\t".$h{$s}->[0]."\t".$h{$s}->[1]."\n";}}' > /data3/snaptron/gtex_junction_lengths.tsv
#same for hg38/srav2
zcat intropolis.v2.hg38.tsv.snaptron2.bgzip | cut -f 1,5,12,13 | perl -ne 'BEGIN { %h; } chomp; ($snid,$length,$samples,$covs)=split(/\t/,$_); print "$snid\t$length\n"; @samples_=split(/,/,$samples); @covs_=split(/,/,$covs); for($i=0;$i<scalar(@samples_);$i++) { $s=$samples_[$i]; $c=$covs_[$i]; $h{$s}->[0]+=1; $h{$s}->[1]+=$c; } END { open(OUT,">srav2_samples_counts_covs.tsv"); for $s (keys %h) { print OUT "$s\t".$h{$s}->[0]."\t".$h{$s}->[1]."\n";}}' > srav2_junction_lengths.tsv
#plotting sample2junction count/coverage
#do a log+1 transform in the case of the junction_counts since log(0) is -Inf
ggplot(gs1, aes(x = junction_count)) + geom_histogram() + scale_x_log10() + scale_y_continuous(trans="log1p")
ggplot(gs1, aes(x = junction_coverage_sum)) + geom_histogram() + scale_y_log10() + scale_x_log10()
#transparent blue version
gplot(ss1, aes(x = junction_coverage_sum)) + geom_histogram(color="black", fill="blue", alpha=0.3) + scale_y_log10() + scale_x_log10() + theme_bw()
#experimental
ggplot(gs1, aes(x = junction_count)) + stat_density(aes(y=..count..), color="black", fill="blue", alpha=0.3) + theme_bw() + scale_y_continuous(trans="log1p", expand=c(0,0)) + scale_x_continuous(breaks=round(seq(min(ss1$junction_count), max(ss1$junction_count), by=100000),1))
ggplot(ss1, aes(x = junction_count)) + geom_histogram() + theme_bw() + scale_y_log10(expand=c(0,0)) + scale_x_continuous(breaks=round(seq(min(ss1$junction_count), max(ss1$junction_count), by=100000),1))
#similar for length of intron
ggplot(lengths, aes(x=junction_length)) + geom_histogram(color="black", fill="blue", alpha=0.3) + theme_bw() + scale_y_log10()
#lucene indexer
cat data/all_illumina_sra_for_human_ids.tsv | python lucene_indexer.py
#convert to BED for bedToBigBed formatting
zcat junctions.bgzip | perl -ne 'chomp; @f=split(/\t/,$_); @f1=@f; ($c,$s,$e)=splice(@f1,2,4); $s--; $jid=$f[1]; $scount=int(log($f[14])*200); $scount=1000 if($scount > 1000); $st=$f[6]; print "$c\t$s\t$e\t$jid\t$scount\t$st\n";' > junctions.bed
/data/kent_tools/bedToBigBed -tab -type=bed6 junctions.bed /data/kent_tools/hg19.chrom.sizes ./srav1_hg19_junctions.bb
python ./snaptron.py "regions=CD99&sfilter=description:tissue&sfilter=design_description:brain"
#reformat sample ids and covs together for Ahocoarsick
zcat junctions.bgz | perl -ne 'chomp; @f=split(/\t/,$_); @ids=split(/,/,$f[11]); @covs=split(/,/,$f[12]); @new=(); for($i=0;$i<scalar(@ids);$i++) { push(@new,$ids[$i].":".$covs[$i]); } @f1=splice(@f,0,11); @f2=splice(@f,2); print "".join("\t",@f1)."\t".join(",",@new)."\t".join("\t",@f2)."\n";' | bgzip > /data3/snaptron/srav2.junctions.sids_covs_together.bgz
#both sqlite3 and tabix
zcat junctions.bgz | perl -ne 'chomp; @f=split(/\t/,$_); @ids=split(/,/,$f[11]); @covs=split(/,/,$f[12]); @new=(); for($i=0;$i<scalar(@ids);$i++) { push(@new,$ids[$i].":".$covs[$i]); } @f1=splice(@f,0,11); @f2=splice(@f,2); print "".join("\t",@f1)."\t,".join(",",@new)."\t".join("\t",@f2)."\n";' | tee /data/import | bgzip > /data3/snaptron/srav2.junctions.sids_covs_together2.bgz
#redone metadata indexing
cat /data2/gigatron2/gtex/gtex_pheno_table.w_ids.tsv | perl -ne 'chomp; $n++; @f=split(/\t/,$_); foreach $idx (0 .. (scalar @f)-1) { $type="t"; $type="n" if($f[$idx] eq "NA"); if($f[$idx]=~/^-?\d+?\.?\d+$/) { $type="f"; } $type="i" if($f[$idx]=~/^-?\d+$/); $counts{$idx}->{$type}++; } END { foreach $idx (sort { $a<=>$b} keys %counts) { print "$idx"; $counts{$idx}->{"n"}=-$counts{$idx}->{"n"} if(defined($counts{$idx}->{"n"})); $counts{$idx}->{"i"}=-$counts{$idx}->{"i"} if(defined($counts{$idx}->{"f"}) && defined($counts{$idx}->{"i"}) > 0); foreach $t (sort {$counts{$idx}->{$b}<=>$counts{$idx}->{$a}} keys %{$counts{$idx}}) { print "\t$t,".$counts{$idx}->{$t}; } print "\n";}}' > /data2/gigatron2/gtex/gtex_pheno_table.w_ids.tsv.type_inferences_w_na_floats
cat gtex_pheno_table.w_ids.tsv | python lucene_indexer.py.gtex gtex_pheno_table.w_ids.tsv.type_inferences_w_na_floats > gtex_lucene_indexer_run 2>&1 &
zcat junctions.bgz | cut -f 14 | perl -ne 'chomp; $c++; $s+=$_; END { $a=$s/$c; print "$c\t$s\t$a\n";}' > junctions.bgz.avg_samples_per_junction
/data/kent_tools/bedToBigBed -tab -type=bed6 junctions.bed /data/kent_tools/hg19.chrom.sizes ./srav1_hg19_junctions.bb
#get over-represented tissues TS analysis
cut -f 1,3,4 client/ts_list.tsv | egrep -e ' 1 ' | sort | uniq -c | sort -k1,1nr
python -mcProfile -s "cumtime" ./snaptron.py 'regions=chr11:82970135-82997450&rfilter=samples_count>:100&rfilter=coverage_sum>:1000&sfilter=SMTSD:Brain' > prof1a 2>&1 &
#latest version of sampleid2intronid mapping (fixed to check if coverage was 0 or not)
zcat /data2/gigatron2/gtex/consolidated_gtex_junctions.tsv.snaptron.no_extra.gz | cut -f2,13,14 | perl -ne 'chomp; ($tid,$sids,$covs)=split(/\t/,$_); @sids=split(/,/,$sid); @covs=split(/,/,$covs); for($i=0;$i<scalar(@sids);$i++) { $s=$sids[$i]; $cov=$covs[$i]; if($cov > 0) { print "$s\t$tid\n";}}' | sort -k1,1 -n | perl -ne 'chomp; $s=$_; ($sid,$tid)=split(/\t/,$s); if(defined($pid) && $pid != $sid) { print "$pid\t"; foreach $s (sort keys %h) {print "$s,";} print "\n"; %h=();} $pid=$sid; $h{$tid}=1; END { if(defined($pid)) { print "$pid\t"; foreach $s (sort keys %h) {print "$s,";} print "\n";}}' | bzip2 > /data3/snaptron/gtex_by_sample_id.bgzip
#speed testing AHOC
time python ./snaptron.py "regions=chr1:1-25000000&rfilter=samples_count>:1&sfilter=all:tissue" | wc -l
cut -f 63 data/samples.tsv | perl -ne 'chomp; $s=$_; if($s>=8) { print "$s\n";}' | wc -l
python ./snaptron.py "regions=ALK&rfilter=samples_count>:10&sfilter=SMRIN>:8" | wc -l
cat lucene_indexed_numeric_types.tsv | perl -ne 'BEGIN { %h=("t"=>"text","s"=>"string","i"=>"integer","f"=>"float"); $i=0;} chomp; ($n,$t)=split(/\t/,$_); $t1=$h{$t}; print "".$i++."\t$n\t$t1\n";' > data/samples.fields.tsv
#source for recalculation rfilter test
python ./snaptron.py "regions=ALK&rfilter=coverage_avg>:5&rfilter=samples_count:10" > h4
python ./snaptron.py "regions=ALK&rfilter=coverage_avg>:6&sfilter=intropolis_sample_id:47852" > h5
s1="groups=`base64 -w 0 <(echo -n 'group=r2®ions=CD99|||group=r3&contains=1®ions=chr2:29446395-30142858&rfilter=samples_count>:100&rfilter=annotated:1')`"
#file w/o groups= at start
curl --data-urlencode groups@bulk_test_50.txt.b64 http://localhost:1580//snaptron
#large query test (1200 samples)
perl -e '$s="regions=chr8:46210817-46210817&sids="; $i=0; $s.=join(",",(map {++$i} split(//,"a"x1200))); print "group=r0&$s"; for $x (1..100) { print "|||group=r$x&$s";} END { print "\n";}' > bulk_test_large_samples.txt.100
base64 -w 0 bulk_test_large_samples.txt.100 > bulk_test_large_samples.txt.100.b64
curl --data-urlencode groups@bulk_test_large_samples.txt.100.b64 http://localhost:1580//snaptron
#selection of junctions limiting by multiple samples (ORing the sample IDs)
#TCGA
curl 'http://localhost:1591/genes?regions=ALK&sids=50099,50100'
#SRAV2
curl 'http://localhost:1580/genes?regions=ALK&sids=40099,40100'
curl --data-urlencode groups@jons_breaking_bulk_query.b64 http://localhost:1580//snaptron
#get total: # of junctions, #coverage sum, and average coverage for each sample
time zcat junctions.bgz | cut -f 12 | perl -ne 'chomp; @f=split(/,/,$_); shift(@f); for $f (@f) { ($f,$c)=split(/:/,$f); $count{$f}++; $cov{$f}+=$c; } END { for $f (keys %count) { $avg=$cov{$f}/$count{$f}; print "$f\t".$count{$f}."\t".$cov{$f}."\t$avg\n";}}' > sample_summary_stats.tsv &
#look for client side injection possibilities in stderr
grep "stderr" snquery.py snaputil.py snaptron.py snannotation.py snample.py > stderrs
#create chromosome2region map
ls x* | perl -ne 'chomp; $f=$_; open(IN,"<$f"); while($line=<IN>) { chomp($line); ($chrm,$s,$e)=split(/\t/,$line); if(!$h{$chrm}) { $h{$chrm}->{$f}=[0,$e]; } if(!$h{$chrm}->{$f}) { $h{$chrm}->{$f}=[$s,$e]; } $h{$chrm}->{$f}->[1]=$e; } close(IN); END { for $chrm (sort {$a cmp $b} keys %h) { print "\"$chrm\":["; for $f (sort {$a cmp $b} keys %{$h{$chrm}}) { print "[\"$f\",".$h{$chrm}->{$f}->[0].",".$h{$chrm}->{$f}->[1]."],"; } print "],"; } print "\n";}' | perl -ne 'chomp; $s=$_; $s=~s/\],\]/\]\]/g; $s=~s/,$//; print "$s\n";'
#find gaps between junctions
time zcat junctions.bgz | perl -ne 'chomp; $f=$_; $f=~/^\d+\t(chr[^\t]+)\t(\d+)\t(\d+)\t/; $c=$1; $s=$2; $e=$3; if($pc eq $c) { if($s > $pe) { $d=($s-$pe)+1; print "$c\t$pe\t$s\t$d\n"; $ps=$s; $pe=$e; next; } if($s <= $pe && $e > $pe) { $pe=$e; next; }} $pc=$c; $ps=$s; $pe=$e;' | sort -k4,4nr > junctions.bgz.gaps.sorted.tsv &
git submodule update --init --recursive
time tabix_zstd ./data/bases/b chr11:82973133-83071923 | ./scripts/groupby/calc
curl "http://snaptron.cs.jhu.edu/test/bases?regions%3Dchr11%3A82970135-82997450%26return_format%3D3"
#column-wise sum
time tabix_zstd ./data/bases/b chr11:82973133-83071923 | cut -f 4- | perl -F'\t' -ane '$sum=0; map { $sum+=$_; } @F; print "$sum\n";' > t1
#column-wise mean
time tabix_zstd ./data/bases/b chr11:82973133-83071923 | cut -f 4- | perl -F'\t' -ane '$sum=0; $c=0; map { $sum+=$_; $c++; } @F; printf("%.3f\n",$sum/$c);' > t1m
#row-wise sum
time tabix_zstd ./data/bases/b chr11:82973133-83071923 | cut -f 4- | perl -F'\t' -ane '$c++; $i=0; map { $sums[$i++]+=$_; } @F; END { for $sum (@sums) { printf("\t%.0f",$sum); } print "\n";}' > a1
#row-wise mean
time tabix_zstd ./data/bases/b chr11:82973133-83071923 | cut -f 4- | perl -F'\t' -ane '$c++; $i=0; map { $sums[$i++]+=$_; } @F; END { for $sum (@sums) { printf("\t%.3f",$sum/$c); } print "\n";}' > a1m
time curl -s "http://snaptron.cs.jhu.edu/test/bases?regions=chr11:82973133-83071923&label=bob&calc=1&calc_axis=0&calc_op=mean" | head
docker run --rm -p 21656:1656 -i -t --name snaptron --volume /data7/stest:/deploy:rw --entrypoint "/bin/bash" snaptron