cmds.txt

zcat all_SRA_introns.tsv.gz | head | perl -ne 'BEGIN { $counter=0;} chomp; ($c,$s,$e,$st,$s1,$s2,$f1,$f2)=split(/\t/,$_); @f1=split(/,/,$f1); @f2=split(/,/,$f2); $score1=scalar @f1; $score2=scalar @f2; $st=($st eq "-"?18:2); $size=($e-$s)+1; $cigar=length("$f1:$f2"); print "i$counter\t$st\t$c\t$s\t$score2\t$cigar"."M\t*\t0\t$size\t$f1:$f2\t$f1:$f2\n"; $counter++;' >> t1


zcat all_SRA_introns.tsv.gz | perl -ne 'BEGIN { $counter=0;} chomp; ($c,$s,$e,$st,$s1,$s2,$f1,$f2)=split(/\t/,$_); @f1=split(/,/,$f1); @f2=split(/,/,$f2); $score1=scalar @f1; $score2=scalar @f2; $st=($st eq "-"?18:2); $size=($e-$s)+1; $cigar=length("$f1:$f2"); print "i$counter\t$st\t$c\t$s\t$score2\t$cigar"."M\t*\t0\t$size\t$f1:$f2\t$f1:$f2\n"; $counter++;' >> t1

#sums and counts last 2 columns
cat all_SRA_introns.tsv.100000 | perl -ne 'chomp; $s=$_; @f=split(/\t/,$_); @f1=split(/,/,$f[6]); @f2=split(/,/,$f[7]); $f1=scalar @f1; $f2=scalar @f2; pop(@f); pop(@f); $s=join("\t",@f); $s1=0; $s2=0; map { $s1+=$_; } @f1; map { $s2+=$_; } @f2; print "$s\t$f1\t$f2\t$s1\t$s2\n";'

cat all_SRA_introns_ids.tsv | perl -ne 'BEGIN {$data_source_id=0;} chomp; $s=$_; @f=split(/\t/,$s); @f1=split(/,/,$f[7]); @f2=split(/,/,$f[8]); $f1=scalar @f1; $f2=scalar @f2; $s2=0; map { $s2+=$_; } @f2; $avg=$s2/$f2; $median=-1; $m_=$f2/2; $m_=int($m_); @f2a=sort {$a<=>$b} @f2; if(($f2 % 2)==0) { $m2_=$f2/2; $m2_-=1; $median=($f2a[$m_]+$f2a[$m2_])/2; } else { $median=$f2a[$m_]; }  printf("$s\t$f1\t$f2\t$s2\t%.3f\t%.3f\n",$avg,$median,$data_source_id);'


time python ./joiner.py 'chr6:1-10000000|samples_count_i:5 AND chromosome_s:chr6' > t1
time python ./joiner.py 'chr6:1-10000000|samples_count_i:5' > t1
time python ./joiner.py 'chr6:1-10000000|samples_count_i:[5 TO 200000] AND chromosome_s:chr6' > t1

curl 'http://10.161.159.186:1555/snaptron?rquery=chr6$1-10000000*samples_countEQ5ANDcoverage_sumGT5*' | wc -l
time curl 'http://10.161.159.186:1555/snaptron?rquery=chr6$1-10000000*samples_countEQ5*' | wc -l
time curl 'http://10.161.159.186:1555/snaptron?rquery=chr6$1-10000000**' | wc -l

time python ./snaptron.py 'chr11:82970135-82997450|samples_count>=100,coverage_sum>=1000|' > /data2/gigatron2/ccdc90b.range2.sc100_cs1000
python ./snaptron_new.py 'regions=chr11:82970135-82997450&rfilter=samples_count>:100&rfilter=coverage_sum>:1000' | wc -l
curl --data 'fields="[{"intervals":["chr11:82970135-82997450"],"samples_count":[{"op":">:","val":100}],"coverage_sum":[{"op":">:","val":1000}]}]"' http://stingray.cs.jhu.edu:8443/snaptron | wc -l

source /data/gigatron/snaptron/python/bin/activate
./snaptron_server --no-daemon

curl "http://128.220.35.129:8443/snaptron?" --data-urlencode "chr6:1-10000000|samples_count=5|"
curl -vv -G "http://128.220.35.129:8443/snaptron/?rquery=chr6:1-10000000|samples_countEQ5|"
curl -vv -G "http://stingray.cs.jhu.edu:8443/snaptron/?rquery=chr6:1-10000000|samples_countEQ5|"

curl http://stingray.cs.jhu.edu:8443/snaptron/?rquery='chr6:1-10000000|samples_count=5||'


#REL finding project

cat all_SRA_introns_ids_stats_length_annotated.tsv.gtf | head -50 | perl -ne 'chomp; $s=$_; @f=split(/\t/,$s); $iid=$f[0]; @s1=split(/,/,$f[12]); @s2=split(/,/,$f[13]); $i=0; foreach $sample (@s1) { $samples{$sample}->[0]++; $samples{$sample}->[1]+=$s2[$i]; $i++; print STDERR "$sample\t$iid\n"; }  END { foreach $sample (sort {$a <=> $b} keys %samples) { ($count,$cov) = @{$samples{$sample}}; print "$sample\t$count\t$cov\n";}}'


python find_gene-rp_overlaps.py knownGene_28-Jun-2015.clusterid.tsv ucsc_repeatmasker_2015_12_22.bed.sorted.gtf.no_chr_prefix all_SRA_introns_ids_stats_length_annotated.tsv.gtf all_SRA_introns_ids_stats_length_annotated.tsv.gtf.by_sample_count_coverate.tsv > annotated_repeated_introns.strands.tsv.new 2> errors &

cat all_illumina_sra_for_human_ids.tsv | perl -ne 'chomp; @f=split(/\t/,$_); $read=join("\t",($f[11],$f[12],$f[28])); $library=join("\t",($f[22],$f[23],$f[24],$f[25],$f[26],$f[27])); $instrument=$f[31]; $s=join("\t",($f[0],$library,$instrument,$read)); print "$s\n";' > sarvens/relevant_sample_md_fields.tsv

curl http://stingray.cs.jhu.edu:8443/snaptron/?rquery='chr6:1-10000000|samples_count=5||'

curl --data 'fields="[{"intervals":["chr6:1-10000000"],"samples_count":[{"op":"=","val":5}]}]"' http://stingray.cs.jhu.edu:8443/snaptron

python snaptron.py '"[{"intervals":["chr6:1-10000000"],"samples_count":[{"op":"=","val":5}]}]"'

python snample.py "sfilter=description:cortex" 1

zcat all_SRA_introns.tsv.gz | head | perl -ne 'BEGIN { $counter=0;} chomp; ($c,$s,$e,$st,$s1,$s2,$f1,$f2)=split(/\t/,$_); @f1=split(/,/,$f1); @f2=split(/,/,$f2); $score1=scalar @f1; $score2=scalar @f2; $st=($st eq "-"?18:2); $size=($e-$s)+1; $cigar=length("$f1:$f2"); print "i$counter\t$st\t$c\t$s\t$score2\t$cigar"."M\t*\t0\t$size\t$f1:$f2\t$f1:$f2\n"; $counter++;' >> t1


zcat all_SRA_introns.tsv.gz | perl -ne 'BEGIN { $counter=0;} chomp; ($c,$s,$e,$st,$s1,$s2,$f1,$f2)=split(/\t/,$_); @f1=split(/,/,$f1); @f2=split(/,/,$f2); $score1=scalar @f1; $score2=scalar @f2; $st=($st eq "-"?18:2); $size=($e-$s)+1; $cigar=length("$f1:$f2"); print "i$counter\t$st\t$c\t$s\t$score2\t$cigar"."M\t*\t0\t$size\t$f1:$f2\t$f1:$f2\n"; $counter++;' >> t1

#sums and counts last 2 columns
cat all_SRA_introns.tsv.100000 | perl -ne 'chomp; $s=$_; @f=split(/\t/,$_); @f1=split(/,/,$f[6]); @f2=split(/,/,$f[7]); $f1=scalar @f1; $f2=scalar @f2; pop(@f); pop(@f); $s=join("\t",@f); $s1=0; $s2=0; map { $s1+=$_; } @f1; map { $s2+=$_; } @f2; print "$s\t$f1\t$f2\t$s1\t$s2\n";'

cat all_SRA_introns_ids.tsv | perl -ne 'chomp; $s=$_; @f=split(/\t/,$s); @f1=split(/,/,$f[7]); @f2=split(/,/,$f[8]); $f1=scalar @f1; $f2=scalar @f2; $s2=0; map { $s2+=$_; } @f2; $avg=$s2/$f2; $median=-1; $m_=$f2/2; $m_=int($m_); @f2a=sort {$a<=>$b} @f2; if(($f2 % 2)==0) { $m2_=$f2/2; $m2_-=1; $median=($f2a[$m_]+$f2a[$m2_])/2; } else { $median=$f2a[$m_]; }  printf("$s\t$f1\t$f2\t$s2\t%.3f\t%.3f\n",$avg,$median);'


time python ./joiner.py 'chr6:1-10000000|samples_count_i:5 AND chromosome_s:chr6' > t1
time python ./joiner.py 'chr6:1-10000000|samples_count_i:5' > t1
time python ./joiner.py 'chr6:1-10000000|samples_count_i:[5 TO 200000] AND chromosome_s:chr6' > t1

curl 'http://10.161.159.186:1555/snaptron?rquery=chr6$1-10000000*samples_countEQ5ANDcoverage_sumGT5*' | wc -l
time curl 'http://10.161.159.186:1555/snaptron?rquery=chr6$1-10000000*samples_countEQ5*' | wc -l
time curl 'http://10.161.159.186:1555/snaptron?rquery=chr6$1-10000000**' | wc -l

time python ./snaptron.py 'chr11:82970135-82997450|samples_count>=100,coverage_sum>=1000|' > /data2/gigatron2/ccdc90b.range2.sc100_cs1000
python ./snaptron_new.py 'regions=chr11:82970135-82997450&rfilter=samples_count>:100&rfilter=coverage_sum>:1000' | wc -l
curl --data 'fields="[{"intervals":["chr11:82970135-82997450"],"samples_count":[{"op":">:","val":100}],"coverage_sum":[{"op":">:","val":1000}]}]"' http://stingray.cs.jhu.edu:8443/snaptron | wc -l

source /data/gigatron/snaptron/python/bin/activate
./snaptron_server --no-daemon

curl "http://128.220.35.129:8443/snaptron?" --data-urlencode "chr6:1-10000000|samples_count=5|"
curl -vv -G "http://128.220.35.129:8443/snaptron/?rquery=chr6:1-10000000|samples_countEQ5|"
curl -vv -G "http://stingray.cs.jhu.edu:8443/snaptron/?rquery=chr6:1-10000000|samples_countEQ5|"

curl http://stingray.cs.jhu.edu:8443/snaptron/?rquery='chr6:1-10000000|samples_count=5||'


#REL finding project

cat all_SRA_introns_ids_stats_length_annotated.tsv.gtf | head -50 | perl -ne 'chomp; $s=$_; @f=split(/\t/,$s); $iid=$f[0]; @s1=split(/,/,$f[12]); @s2=split(/,/,$f[13]); $i=0; foreach $sample (@s1) { $samples{$sample}->[0]++; $samples{$sample}->[1]+=$s2[$i]; $i++; print STDERR "$sample\t$iid\n"; }  END { foreach $sample (sort {$a <=> $b} keys %samples) { ($count,$cov) = @{$samples{$sample}}; print "$sample\t$count\t$cov\n";}}'


python find_gene-rp_overlaps.py knownGene_28-Jun-2015.clusterid.tsv ucsc_repeatmasker_2015_12_22.bed.sorted.gtf.no_chr_prefix all_SRA_introns_ids_stats_length_annotated.tsv.gtf all_SRA_introns_ids_stats_length_annotated.tsv.gtf.by_sample_count_coverate.tsv > annotated_repeated_introns.strands.tsv.new 2> errors &

cat all_illumina_sra_for_human_ids.tsv | perl -ne 'chomp; @f=split(/\t/,$_); $read=join("\t",($f[11],$f[12],$f[28])); $library=join("\t",($f[22],$f[23],$f[24],$f[25],$f[26],$f[27])); $instrument=$f[31]; $s=join("\t",($f[0],$library,$instrument,$read)); print "$s\n";' > sarvens/relevant_sample_md_fields.tsv

curl http://stingray.cs.jhu.edu:8443/snaptron/?rquery='chr6:1-10000000|samples_count=5||'

curl --data 'fields="[{"intervals":["chr6:1-10000000"],"samples_count":[{"op":"=","val":5}]}]"' http://stingray.cs.jhu.edu:8443/snaptron

python snaptron.py '"[{"intervals":["chr6:1-10000000"],"samples_count":[{"op":"=","val":5}]}]"'

python snample.py "sfilter=description:cortex" 1

#redo creation of individially sorted indices including length
cat all_SRA_introns_ids_stats.tsv.new2_w_header_w_sourcedb | perl -ne 'chomp; $s=$_; next if($s=~/gigatron_id/); print "1\t$s\n";' | sort -s -k6,6n | tee by_length | sort -s -k15,15n | tee by_sample_count | sort -s -k16,16n | tee by_coverage_sum | sort -s -k17,17n | tee  by_coverage_avg | sort -s -k18,18n > by_coverage_m

cut -f1,2,3,5  all_illumina_sra_for_human_ids.tsv | perl -ne 'chomp; ($id,$srr,$srs,$srp)=split(/\t/,$_); $ids{$id}=[$srr,$srs,$srp]; push(@{$srps{$srp}},$id); push(@{$srss{$srs}},$id); END { $idx=0; foreach $srp (sort {$a cmp $b} keys %srps) { @i=@{$srps{$srp}}; $ic = scalar @i; $srps{$srp}=[$ic,$idx++]; } $idx=0; foreach $srs (sort {$a cmp $b} keys %srss) { @j=@{$srss{$srs}}; $jc = scalar @j; $srss{$srs}=[$jc,$idx++]; } foreach $id (sort {$a <=> $b} keys %ids) { next if($id=~/intropolis_sample_id_i/); ($srr,$srs,$srp)=@{$ids{$id}}; ($srp_sz,$srp_id)=@{$srps{$srp}}; ($srs_sz,$srs_id)=@{$srss{$srs}}; print "$id\t$srs_sz\t$srp_sz\t$srp\t$srp_id\t$srs\t$srs_id\n";}}' > all_illumina_sra_for_human_ids.tsv.id_mapping

zcat /data2/gigatron2/all_SRA_introns_ids_stats.tsv.new2_w_sourcedb2.gz | perl add_stats.pl ../all_illumina_sra_for_human_ids.tsv.id_mapping2 | bgzip > /data2/gigatron2/all_SRA_introns_ids_stats.tsv.new2_w_sourcedb2_extended.gz

python snanalysis.py "ids_a=4&ids_b=5,6&compute=jir&ratio=cov&order=T:5"

zcat /data/gigatron/intropolis.v2.hg38.tsv.snaptron.bgzip | cut -f1,12 | perl -ne 'chomp; ($tid,$sid)=split(/\t/,$_); @sids=split(/,/,$sid); foreach $s (@sids) { print "$s\t$tid\n";}' | sort -k1,1 -n | perl -ne 'chomp; $s=$_; ($sid,$tid)=split(/\t/,$s); if(defined($pid) && $pid != $sid) { print "$pid\t"; foreach $s (sort keys %h) {print "$s,";} print "\n"; %h=();} $pid=$sid; $h{$tid}=1; END { if(defined($pid)) { print "$pid\t"; foreach $s (sort keys %h) {print "$s,";} print "\n";}}' | bzip2 > intropolis.v2.hg38.tsv.snaptron.sample2intron_ids.bz2


cat /data2/gigatron2/all_illumina_sra_for_human_ids.tsv | perl -ne 'BEGIN { open(IN,"</data2/gigatron2/intropolis.idmap.v2.hg38.tsv"); %h; $h{'run_accession_s'}='intropolis_sample_id_i'; while($line=<IN>) {chomp($line); @f=split(/\t/,$line); $h{$f[4]}=$f[0]; } close(IN);}  chomp; $s=$_; @f=split(/\t/,$s); $newid=$h{$f[1]}; shift @f; print "$newid\t".(join("\t",@f))."\n";' > /data2/gigatron2/first_half_illumina_sra_for_human_ids.v2.tsv


echo "SELECT run_accession,sample_accession,experiment_accession,study_accession,submission_accession,sra_ID,run_ID,run_alias,run_date,updated_date,spots,bases,run_center,experiment_name,run_attribute,experiment_ID,experiment_alias,experiment_title,study_name,sample_name,design_description,library_name,library_strategy,library_source,library_selection,library_layout,library_construction_protocol,read_spec,platform,instrument_model,platform_parameters,experiment_url_link,experiment_attribute,sample_ID,sample_alias,taxon_id,common_name,description,sample_url_link,sample_attribute,study_ID,study_alias,study_title,study_type,study_abstract,center_project_name,study_description,study_url_link,study_attribute,related_studies,primary_study,submission_ID,submission_comment,submission_center,submission_lab,submission_date,sradb_updated,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL FROM SRA;" | sqlite3 -cmd '.separator "\t"' SRAmetadb.sqlite > all_sra_related_md

cat all_illumina_sra_for_human_v2.tsv | perl -ne 'BEGIN { open(IN,"</data/gigatron/intropolis.idmap.v2.hg38.tsv"); %h; $h{'run_accession_s'}='intropolis_sample_id_i'; while($line=<IN>) {chomp($line); @f=split(/\t/,$line); $h{$f[4]}=$f[0]; } close(IN);}  chomp; $s=$_; @f=split(/\t/,$s); $newid=$h{$f[1]}; shift @f; print "$newid\t".(join("\t",@f))."\n";' > illumina_sra_for_human_ids.v2.tsv


#get sample total junction counts
bzcat all_SRA_introns_ids_stats_length_annotated.tsv.gtf.sample2intron.tsv.0based.sorted.online.bz2 | perl -ne 'chomp; ($sid,$tids)=split(/\t/,$_); @tids=split(/,/,$tids); $c=scalar @tids; print "$sid\t$c\n";' > all_SRA_introns_ids_stats_length_annotated.tsv.gtf.sample2intron.tsv.0based.sorted.online.bz2.counts_per_sample

#infer types for lucene from raw metadata fields
cat /data2/gigatron2/gtex/gtex_pheno_table.w_ids.tsv | perl -ne 'chomp; $n++; @f=split(/\t/,$_); foreach $idx (0 .. (scalar @f)-1) { $type="t"; $type="f" if($f[$idx]=~/^-?\d+?\.?\d+$/); $type="i" if($f[$idx]=~/^-?\d+$/); $counts{$idx}->{$type}++; } END { foreach $idx (sort { $a<=>$b} keys %counts) { print "$idx"; foreach $t (sort {$counts{$idx}->{$b}<=>$counts{$idx}->{$a}} keys %{$counts{$idx}}) { print "\t$t,".$counts{$idx}->{$t}; } print "\n";}}' > /data2/gigatron2/gtex/gtex_pheno_table.w_ids.tsv.inferred_type_counts

#to find Sarven's ~3 genes with REL-exons
cat sarven_splices.tsv | perl -ne 'chomp; ($f,$t)=split(/\s+/,$_); $f1=$f; $f1=~s/chr\w+://; print "R:$f "; $f1=~s/-/\t/; $s=`python ./snaptron.py "regions=$f" 2>/dev/null| grep "$f1" | cut -f 15,16`; $t1=$t; $t1=~s/chr\w+://; $t1=~s/-/\t/; print "$s"; $s=""; print "R:$t "; $s=`python ./snaptron.py "regions=$t" 2>/dev/null | grep "$t1" | cut -f 15,16`; print "$s";'

curl "http://stingray.cs.jhu.edu:8090/srav1/snaptron?regions=chr11:82970135-82997450&rfilter=samples_count>:100&rfilter=coverage_sum>:1000&fields=snaptron_id"

curl "http://stingray.cs.jhu.edu:8090/srav1/density?bigwig_db=snps&regions=chr1:1-100000"

#direct link to custom track
http://genome.ucsc.edu/cgi-bin/hgTracks?db=hg18&position=chr21:33038447-33041505&hgct_customText=track%20type=bigBed%20name=myBigBedTrack%20description=%22a%20bigBed%20track%22%20visibility=full%20bigDataUrl=http://genome.ucsc.edu/goldenPath/help/examples/bigBedExample.bb 

http://genome.ucsc.edu/cgi-bin/hgTracks?db=hg19&position=chr11:82970135-82997450&hgct_customText=name=snaptron%20description=snaptron_exported_splice_junctions%20visibility=full%20bigDataUrl=%22http://127.0.0.1:1300/snaptron?regions=chr11:82970135-82997450%26rfilter=samples_count>:100%26rfilter=coverage_sum>:1000%26return_format=1%22

#works
http://genome.ucsc.edu/cgi-bin/hgTracks?db=hg19&position=chr11:82970135-82997450&hgct_customText=http://stingray.cs.jhu.edu:8090/srav1/snaptron?regions=chr11:82970135-82997450%26rfilter=samples_count>:100%26rfilter=coverage_sum>:1000%26return_format=1%22

#filter/limit for gene models
zcat /data2/gigatron2/gensemrefg.hg19_annotations.sorted.gtf.gz | perl -ne 'chomp; @f=split(/\t/,$_); $f[5]=0.0; @f1=split(/;/,$f[8]); $boost=0; $boost=100000 if($f1[1]!~/"NA"/); @f2=split(/,/,$f1[2]); $s1=$f1[2]; print "$s1\n"; $f[5]=(scalar @f2)+$boost; print "".(join("\t",@f))."\n";' | sort -t"   " -k6,6nr | head -10

#find the gaps in the sample id metadata
cut -f 1 data/illumina_sra_for_human_ids.v2.tsv | sort -n | perl -ne 'chomp; $s=$_; if($p) { $d=abs($s-$p); if($d>1) { print "$s-$p:\t"; for($i=$p+1; $i<=$p+($d-1); $i++) { print "$i,"; } print "\n"; } } $p=$s;' > data/illumina_sra_for_human_ids.v2.tsv.missing_sample_ids

#look for total top covered unannotated junctions (Ben's idea):
time python ./snaptron.py "rfilter=annotated:0,length<:10000" | sort -t'       ' -k15,15nr -k16,16nr | bgzip > /data3/unannotated_short_srav1_sorted_samplescount_coverage.tsv.bgz

#client
python query_snaptron.py --query-file alk.tsv --function jir

python ./sbreakpoint.py '1271' > CCDC6_RET_breakpoint.tsv
python ./sbreakpoint.py 'CCDC6-RET' > CCDC6_RET_breakpoint.tsv

curl "http://stingray.cs.jhu.edu:8090/srav1/breakpoint?eml4-alk"

<<<<<<< Updated upstream
sqlite3 data/by_sample_ids.v2.sqlite3 "select * from by_sample_id;" | python pack_samples.py

#dump per-sample # of snaptron ids
sqlite3 ./by_sample_ids.sqlite3 "select * from by_sample_id;" | perl -ne 'chomp; ($sid,$sids)=split(/\|/,$_); @sids=split(/,/,$sids); print "$sid\t".scalar(@sids)."\n";' > /data3/snaptron/gtex_sample_snaptron_id_mapping_counts.tsv


#get lenths per intron, count,coverage per sample
zcat consolidated_gtex_junctions.tsv.snaptron.no_extra.gz | cut -f2,6,13,14 |  perl -ne 'BEGIN { %h; } chomp; ($snid,$length,$samples,$covs)=split(/\t/,$_); print "$snid\t$length\n"; @samples_=split(/,/,$samples); @covs_=split(/,/,$covs); for($i=0;$i<scalar(@samples_);$i++) { if($covs_[$i] != 0) { $s=$samples_[$i]; $c=$covs_[$i]; $h{$s}->[0]+=1; $h{$s}->[1]+=$c;} } END { open(OUT,">/data3/snaptron/gtex_samples_counts_covs.tsv"); for $s (keys %h) { print OUT "$s\t".$h{$s}->[0]."\t".$h{$s}->[1]."\n";}}' > /data3/snaptron/gtex_junction_lengths.tsv

#same for hg38/srav2
zcat intropolis.v2.hg38.tsv.snaptron2.bgzip | cut -f 1,5,12,13 | perl -ne 'BEGIN { %h; } chomp; ($snid,$length,$samples,$covs)=split(/\t/,$_); print "$snid\t$length\n"; @samples_=split(/,/,$samples); @covs_=split(/,/,$covs); for($i=0;$i<scalar(@samples_);$i++) { $s=$samples_[$i]; $c=$covs_[$i]; $h{$s}->[0]+=1; $h{$s}->[1]+=$c; } END { open(OUT,">srav2_samples_counts_covs.tsv"); for $s (keys %h) { print OUT "$s\t".$h{$s}->[0]."\t".$h{$s}->[1]."\n";}}' > srav2_junction_lengths.tsv

#plotting sample2junction count/coverage
#do a log+1 transform in the case of the junction_counts since log(0) is -Inf
ggplot(gs1, aes(x = junction_count)) + geom_histogram() + scale_x_log10() + scale_y_continuous(trans="log1p")
ggplot(gs1, aes(x = junction_coverage_sum)) + geom_histogram() + scale_y_log10() + scale_x_log10()
#transparent blue version
gplot(ss1, aes(x = junction_coverage_sum)) + geom_histogram(color="black", fill="blue", alpha=0.3) + scale_y_log10() + scale_x_log10() + theme_bw()

#experimental
ggplot(gs1, aes(x = junction_count)) + stat_density(aes(y=..count..), color="black", fill="blue", alpha=0.3) + theme_bw() + scale_y_continuous(trans="log1p", expand=c(0,0)) + scale_x_continuous(breaks=round(seq(min(ss1$junction_count), max(ss1$junction_count), by=100000),1))

ggplot(ss1, aes(x = junction_count)) + geom_histogram() + theme_bw() + scale_y_log10(expand=c(0,0)) + scale_x_continuous(breaks=round(seq(min(ss1$junction_count), max(ss1$junction_count), by=100000),1))


#similar for length of intron
ggplot(lengths, aes(x=junction_length)) + geom_histogram(color="black", fill="blue", alpha=0.3) + theme_bw() + scale_y_log10()


#lucene indexer
cat data/all_illumina_sra_for_human_ids.tsv | python lucene_indexer.py

#convert to BED for bedToBigBed formatting
zcat junctions.bgzip | perl -ne 'chomp; @f=split(/\t/,$_); @f1=@f; ($c,$s,$e)=splice(@f1,2,4); $s--; $jid=$f[1]; $scount=int(log($f[14])*200); $scount=1000 if($scount > 1000); $st=$f[6]; print "$c\t$s\t$e\t$jid\t$scount\t$st\n";' > junctions.bed

/data/kent_tools/bedToBigBed -tab -type=bed6 junctions.bed /data/kent_tools/hg19.chrom.sizes ./srav1_hg19_junctions.bb

python ./snaptron.py "regions=CD99&sfilter=description:tissue&sfilter=design_description:brain"


#reformat sample ids and covs together for Ahocoarsick
zcat junctions.bgz | perl -ne 'chomp; @f=split(/\t/,$_); @ids=split(/,/,$f[11]); @covs=split(/,/,$f[12]); @new=(); for($i=0;$i<scalar(@ids);$i++) { push(@new,$ids[$i].":".$covs[$i]); } @f1=splice(@f,0,11); @f2=splice(@f,2); print "".join("\t",@f1)."\t".join(",",@new)."\t".join("\t",@f2)."\n";' | bgzip > /data3/snaptron/srav2.junctions.sids_covs_together.bgz

#both sqlite3 and tabix
zcat junctions.bgz | perl -ne 'chomp; @f=split(/\t/,$_); @ids=split(/,/,$f[11]); @covs=split(/,/,$f[12]); @new=(); for($i=0;$i<scalar(@ids);$i++) { push(@new,$ids[$i].":".$covs[$i]); } @f1=splice(@f,0,11); @f2=splice(@f,2); print "".join("\t",@f1)."\t,".join(",",@new)."\t".join("\t",@f2)."\n";' | tee /data/import | bgzip > /data3/snaptron/srav2.junctions.sids_covs_together2.bgz

#redone metadata indexing
cat /data2/gigatron2/gtex/gtex_pheno_table.w_ids.tsv | perl -ne 'chomp; $n++; @f=split(/\t/,$_); foreach $idx (0 .. (scalar @f)-1) { $type="t"; $type="n" if($f[$idx] eq "NA"); if($f[$idx]=~/^-?\d+?\.?\d+$/) { $type="f"; } $type="i" if($f[$idx]=~/^-?\d+$/); $counts{$idx}->{$type}++; } END { foreach $idx (sort { $a<=>$b} keys %counts) { print "$idx"; $counts{$idx}->{"n"}=-$counts{$idx}->{"n"} if(defined($counts{$idx}->{"n"})); $counts{$idx}->{"i"}=-$counts{$idx}->{"i"} if(defined($counts{$idx}->{"f"}) && defined($counts{$idx}->{"i"}) > 0); foreach $t (sort {$counts{$idx}->{$b}<=>$counts{$idx}->{$a}} keys %{$counts{$idx}}) { print "\t$t,".$counts{$idx}->{$t}; } print "\n";}}' > /data2/gigatron2/gtex/gtex_pheno_table.w_ids.tsv.type_inferences_w_na_floats

cat gtex_pheno_table.w_ids.tsv | python lucene_indexer.py.gtex gtex_pheno_table.w_ids.tsv.type_inferences_w_na_floats > gtex_lucene_indexer_run 2>&1 &

zcat junctions.bgz | cut -f 14 | perl -ne 'chomp; $c++; $s+=$_; END { $a=$s/$c; print "$c\t$s\t$a\n";}' > junctions.bgz.avg_samples_per_junction

/data/kent_tools/bedToBigBed -tab -type=bed6 junctions.bed /data/kent_tools/hg19.chrom.sizes ./srav1_hg19_junctions.bb

#get over-represented tissues TS analysis
cut -f 1,3,4 client/ts_list.tsv | egrep -e '        1       ' | sort | uniq -c | sort -k1,1nr

python -mcProfile -s "cumtime" ./snaptron.py 'regions=chr11:82970135-82997450&rfilter=samples_count>:100&rfilter=coverage_sum>:1000&sfilter=SMTSD:Brain' > prof1a 2>&1 &

#latest version of sampleid2intronid mapping (fixed to check if coverage was 0 or not)
zcat /data2/gigatron2/gtex/consolidated_gtex_junctions.tsv.snaptron.no_extra.gz | cut -f2,13,14 | perl -ne 'chomp; ($tid,$sids,$covs)=split(/\t/,$_); @sids=split(/,/,$sid); @covs=split(/,/,$covs); for($i=0;$i<scalar(@sids);$i++) { $s=$sids[$i]; $cov=$covs[$i]; if($cov > 0) { print "$s\t$tid\n";}}' | sort -k1,1 -n | perl -ne 'chomp; $s=$_; ($sid,$tid)=split(/\t/,$s); if(defined($pid) && $pid != $sid) { print "$pid\t"; foreach $s (sort keys %h) {print "$s,";} print "\n"; %h=();} $pid=$sid; $h{$tid}=1; END { if(defined($pid)) { print "$pid\t"; foreach $s (sort keys %h) {print "$s,";} print "\n";}}' | bzip2 > /data3/snaptron/gtex_by_sample_id.bgzip

#speed testing AHOC
time python ./snaptron.py "regions=chr1:1-25000000&rfilter=samples_count>:1&sfilter=all:tissue" | wc -l
cut -f 63 data/samples.tsv | perl -ne 'chomp; $s=$_; if($s>=8) { print "$s\n";}' | wc -l

python ./snaptron.py "regions=ALK&rfilter=samples_count>:10&sfilter=SMRIN>:8" | wc -l

cat lucene_indexed_numeric_types.tsv | perl -ne 'BEGIN { %h=("t"=>"text","s"=>"string","i"=>"integer","f"=>"float"); $i=0;} chomp; ($n,$t)=split(/\t/,$_); $t1=$h{$t}; print "".$i++."\t$n\t$t1\n";' > data/samples.fields.tsv

#source for recalculation rfilter test
python ./snaptron.py "regions=ALK&rfilter=coverage_avg>:5&rfilter=samples_count:10" > h4
python ./snaptron.py "regions=ALK&rfilter=coverage_avg>:6&sfilter=intropolis_sample_id:47852" > h5

s1="groups=`base64 -w 0 <(echo -n 'group=r2&regions=CD99|||group=r3&contains=1&regions=chr2:29446395-30142858&rfilter=samples_count>:100&rfilter=annotated:1')`"

#file w/o groups= at start
curl --data-urlencode groups@bulk_test_50.txt.b64 http://localhost:1580//snaptron

#large query test (1200 samples)
perl -e '$s="regions=chr8:46210817-46210817&sids="; $i=0; $s.=join(",",(map {++$i} split(//,"a"x1200))); print "group=r0&$s"; for $x (1..100) { print "|||group=r$x&$s";} END { print "\n";}' > bulk_test_large_samples.txt.100
base64 -w 0 bulk_test_large_samples.txt.100 > bulk_test_large_samples.txt.100.b64
curl --data-urlencode groups@bulk_test_large_samples.txt.100.b64 http://localhost:1580//snaptron

#selection of junctions limiting by multiple samples (ORing the sample IDs)
#TCGA
curl 'http://localhost:1591/genes?regions=ALK&sids=50099,50100'
#SRAV2
curl 'http://localhost:1580/genes?regions=ALK&sids=40099,40100'

curl --data-urlencode groups@jons_breaking_bulk_query.b64 http://localhost:1580//snaptron

#get total: # of junctions, #coverage sum, and average coverage for each sample
time zcat junctions.bgz | cut -f 12 | perl -ne 'chomp; @f=split(/,/,$_); shift(@f); for $f (@f) { ($f,$c)=split(/:/,$f); $count{$f}++; $cov{$f}+=$c; } END { for $f (keys %count) { $avg=$cov{$f}/$count{$f}; print "$f\t".$count{$f}."\t".$cov{$f}."\t$avg\n";}}' > sample_summary_stats.tsv &

#look for client side injection possibilities in stderr
grep "stderr" snquery.py snaputil.py snaptron.py snannotation.py snample.py > stderrs


#create chromosome2region map
ls x* | perl -ne 'chomp; $f=$_; open(IN,"<$f"); while($line=<IN>) { chomp($line); ($chrm,$s,$e)=split(/\t/,$line); if(!$h{$chrm}) { $h{$chrm}->{$f}=[0,$e]; } if(!$h{$chrm}->{$f}) { $h{$chrm}->{$f}=[$s,$e]; } $h{$chrm}->{$f}->[1]=$e; } close(IN); END { for $chrm (sort {$a cmp $b} keys %h) { print "\"$chrm\":["; for $f (sort {$a cmp $b} keys %{$h{$chrm}}) { print "[\"$f\",".$h{$chrm}->{$f}->[0].",".$h{$chrm}->{$f}->[1]."],"; } print "],"; } print "\n";}' | perl -ne 'chomp; $s=$_; $s=~s/\],\]/\]\]/g; $s=~s/,$//; print "$s\n";'

#find gaps between junctions
time zcat junctions.bgz | perl -ne 'chomp; $f=$_; $f=~/^\d+\t(chr[^\t]+)\t(\d+)\t(\d+)\t/; $c=$1; $s=$2; $e=$3; if($pc eq $c) { if($s > $pe) { $d=($s-$pe)+1; print "$c\t$pe\t$s\t$d\n"; $ps=$s; $pe=$e; next; } if($s <= $pe && $e > $pe) { $pe=$e; next; }} $pc=$c; $ps=$s; $pe=$e;' | sort -k4,4nr > junctions.bgz.gaps.sorted.tsv &


git submodule update --init --recursive


time tabix_zstd ./data/bases/b chr11:82973133-83071923 | ./scripts/groupby/calc
curl "http://snaptron.cs.jhu.edu/test/bases?regions%3Dchr11%3A82970135-82997450%26return_format%3D3"

#column-wise sum
time tabix_zstd ./data/bases/b chr11:82973133-83071923 | cut -f 4- | perl -F'\t' -ane '$sum=0; map { $sum+=$_; } @F; print "$sum\n";' > t1
#column-wise mean
time tabix_zstd ./data/bases/b chr11:82973133-83071923 | cut -f 4- | perl -F'\t' -ane '$sum=0; $c=0; map { $sum+=$_; $c++; } @F; printf("%.3f\n",$sum/$c);' > t1m
#row-wise sum
time tabix_zstd ./data/bases/b chr11:82973133-83071923 | cut -f 4- | perl -F'\t' -ane '$c++; $i=0; map { $sums[$i++]+=$_; } @F; END { for $sum (@sums) { printf("\t%.0f",$sum); } print "\n";}' > a1
#row-wise mean
time tabix_zstd ./data/bases/b chr11:82973133-83071923 | cut -f 4- | perl -F'\t' -ane '$c++; $i=0; map { $sums[$i++]+=$_; } @F; END { for $sum (@sums) { printf("\t%.3f",$sum/$c); } print "\n";}' > a1m

time curl -s "http://snaptron.cs.jhu.edu/test/bases?regions=chr11:82973133-83071923&label=bob&calc=1&calc_axis=0&calc_op=mean" | head

docker run --rm -p 21656:1656 -i -t --name snaptron --volume /data7/stest:/deploy:rw --entrypoint "/bin/bash" snaptron