#Snakefile.study

#start
import sys
import os
import glob

#Directory (relative to the working directory) that receives the final per-study files.
STUDYDIR='study'
#exist_ok=True replaces the former exists()-then-makedirs() sequence, which could fail
#with FileExistsError when two processes evaluating this file created the directory
#at the same moment.
os.makedirs(STUDYDIR, exist_ok=True)

#Use bash as the executable for the shell commands of this workflow.
shell.executable("/usr/bin/bash")

#Helper shell scripts live in the scripts/ directory next to this Snakefile.
main_script_path=os.path.join(workflow.basedir,'scripts')

#Short name -> path (or full command prefix) of every helper tool used by the rules.
SCRIPTS={
	#shell helpers under scripts/
	'find_done':os.path.join(main_script_path,'find_done.sh'),
	'find':os.path.join(main_script_path,'find_new_files.sh'),
	'decompress':os.path.join(main_script_path,'decompress_sums.sh'),
	'paste':os.path.join(main_script_path,'paste_sums.sh'),
	'filter':os.path.join(main_script_path,'filter_new_sjs.sh'),
	#tools located in sibling directories of the workflow
	'merge':os.path.join(workflow.basedir, 'merge', 'merge.py'),
	'annotate':os.path.join(workflow.basedir, 'annotate', 'annotate_sjs.py'),
	'rejoin':os.path.join(workflow.basedir, 'rejoin', 'rejoin'),
	'sum_counts':os.path.join(workflow.basedir, 'merge', 'sum_counts'),
	#this entry is a complete command line (interpreter + script), not just a path
	'QC':"python3 %s" % os.path.join(workflow.basedir, 'log_qc', 'parse_logs_for_qc.py'),
	'perbase':os.path.join(workflow.basedir, 'merge', 'perbase'),
	'rejoin_sh':os.path.join(main_script_path,'rejoin_genes.sh'),
	#also a complete command line; the script is run with pypy
	'rejoin_genes':"pypy %s" % os.path.join(workflow.basedir, 'rejoin', 'rejoin_genes.py'),
}

#Validate the configuration up front. Each entry pairs a group of mandatory config keys
#with the part of the pipeline that needs them; the first group with a missing key aborts
#the run with exit status -1 and a message naming exactly the keys that were not passed.
#'input' and 'staging' are included because they are read unconditionally further down
#(study discovery and rule input/output paths) and would otherwise fail with a bare KeyError.
_REQUIRED_CONFIG=[
	(('gene_rejoin_mapping','exon_rejoin_mapping','gene_mapping_final','sample_ids_file','num_samples'), "the rejoining part of the pipeline"),
	(('ref_sizes','ref_fasta'), "the jx motif extraction part of the pipeline"),
	(('input','staging'), "locating the input files and the staging directory"),
]
for _required_keys, _purpose in _REQUIRED_CONFIG:
	_missing=[_key for _key in _required_keys if _key not in config]
	if _missing:
		sys.stderr.write("need to pass values for %s for %s!\n" % (", ".join("'%s'" % _key for _key in _missing), _purpose))
		sys.exit(-1)

#Optional settings: fall back to an empty string / 0 when not passed.
config.setdefault('existing_sj_db', "")
config.setdefault('existing_sums', "")
config.setdefault('compilation_id', 0)

#Comma-separated annotation short names, e.g. G026,G029,R109,ERCC,SIRV,F006
if 'annotation_list' not in config:
	sys.stderr.write("need to pass value(s) for 'annotation_list' (e.g. 'G026,G029,R109,ERCC,SIRV,F006') for final gene sums!\n")
	sys.exit(-1)

#Study names are the entries found two levels below the input directory
#(<input>/<two-character directory>/<study>). glob returns matches in arbitrary order,
#so the names are sorted to keep the target list stable from one invocation to the next.
studies = sorted(os.path.basename(f) for f in glob.glob(config['input']+'/??/*'))

#setup the final set of target files: per study, the pasted exon sums plus the
#gene-level and exon-level rejoined counts
FILES=["%s/all.exon_bw_count.pasted.%s.tsv.gz" % (STUDYDIR, s) for s in studies]
FILES.extend(["%s/all.gene_counts.rejoined.%s.tsv.gz" % (STUDYDIR, s) for s in studies])
FILES.extend(["%s/all.exon_counts.rejoined.%s.tsv.gz" % (STUDYDIR, s) for s in studies])

annotations = config['annotation_list'].split(',')
#the first annotation in the list is treated as the main one (typically G026)
main_annotation=annotations[0]
#one compressed gene sums file per (annotation, study) pair
gene_annotations=['%s/%s.gene.sums.%s.tsv.gz' % (STUDYDIR, annotation, study) for annotation in annotations for study in studies]
FILES.extend(gene_annotations)


#Global wildcard constraint: {run_group_num} must be exactly two alphanumeric characters.
#This mirrors the two-character ("??") directory names globbed in get_pasted_sum_files.
wildcard_constraints:
	run_group_num="[0-9a-zA-Z]{2}"
	#type="(all)|(unique)"

#Aggregate target rule: its inputs are all final per-study files collected in FILES.
rule all:
	input:
		FILES

###exon SUM pasting rules
#Runs the find_new_files.sh helper (SCRIPTS['find']) on the input directory and the
#sample IDs file; the declared result is the groups manifest in the staging directory.
#NOTE(review): the per-study/per-group "*.manifest" files read by decompress_sums and
#paste_sums_per_group are not declared as outputs of any rule in this file; presumably
#this script writes them as a side effect - confirm against find_new_files.sh.
rule find_sums:
	input: 
		config['input'],
		config['sample_ids_file']
	output:
		config['staging'] + '/all.exon_bw_count.groups.manifest'
	params:
		staging=config['staging'],
		script_path=SCRIPTS['find']
	shell:
		#arguments: <input dir> <sample IDs file> <staging dir>, then three literals
		#(presumably file prefix, compressed-file suffix and grouping mode - TODO confirm)
		"{params.script_path} {input[0]} {input[1]} {params.staging} all.exon_bw_count .zst per_study"

#Runs the decompress_sums.sh helper (SCRIPTS['decompress']) once per
#{study}/{run_group_num} combination.
#The only declared input is the groups manifest from find_sums; the command itself
#reads the per-group manifest <staging>/all.exon_bw_count.<study>.<run_group_num>.manifest,
#which is not a declared input (presumably created alongside the groups manifest - confirm).
rule decompress_sums:
	input:
		config['staging'] + '/all.exon_bw_count.groups.manifest'
	output:
		config['staging'] + '/all.exon_bw_count.{study}.{run_group_num}.decompressed'
	params:
		#wildcard values re-exposed as params for use in the shell command
		study=lambda wildcards: wildcards.study,
		run_group_num=lambda wildcards: wildcards.run_group_num,
		staging=config['staging'],
		script_path=SCRIPTS['decompress']
	shell:
		#arguments: <per-group manifest> <output file>
		"{params.script_path} {params.staging}/all.exon_bw_count.{params.study}.{params.run_group_num}.manifest {output}"


#First level of the hierarchical paste: one rule instantiation per study and per *run*
#low-order name grouping ({run_group_num}).
#The ".decompressed" file from decompress_sums is the declared input and only orders the
#two steps; the paste script itself is given the per-group manifest, not {input}.
rule paste_sums_per_group:
	input:
		config['staging'] + '/all.exon_bw_count.{study}.{run_group_num}.decompressed'
	output:
		config['staging'] + '/all.exon_bw_count.{study}.{run_group_num}.pasted'
	params:
		#wildcard values re-exposed as params for use in the shell command
		study=lambda wildcards: wildcards.study,
		run_group_num=lambda wildcards: wildcards.run_group_num,
		staging=config['staging'],
		script_path=SCRIPTS['paste']
	shell:
		#acc_header=1 is passed to paste_sums.sh through the environment
		#(presumably switches on header handling - TODO confirm in the script)
		"acc_header=1 {params.script_path} {params.staging}/all.exon_bw_count.{params.study}.{params.run_group_num}.manifest {output}"

def get_pasted_sum_files(wildcards):
	"""Input function: list the per-group pasted files expected for one study.

	Globs <input>/??/<study>/?? and maps every match to
	<staging>/all.exon_bw_count.<study>.<match basename>.pasted.

	wildcards: object with a `study` attribute.
	Returns the paths as a sorted list (glob order is arbitrary, so sorting keeps
	the rule's input order stable between runs); empty if nothing matches.
	"""
	study = wildcards.study
	group_dirs = glob.glob(config['input']+'/??/%s/??' % (study))
	return sorted(config['staging']+"/all.exon_bw_count.%s.%s.pasted" % (study, os.path.basename(d)) for d in group_dirs)

#Writes the list of a study's per-group pasted files into a single ".pasted.files.list"
#file in the staging directory.
#The inputs (from get_pasted_sum_files) make Snakemake build every per-group paste first.
#NOTE(review): the list itself comes from an `ls` glob over the staging directory rather
#than from {input}, so any other file matching the pattern there would be listed as well.
rule collect_pasted_sums:
	input:
		get_pasted_sum_files
	output:
		config['staging'] + '/all.exon_bw_count.{study}.pasted.files.list'
	params:
		study=lambda wildcards: wildcards.study,
		staging=config['staging']
	shell:
		"ls {params.staging}/all.exon_bw_count.{params.study}.??.pasted > {output}"

#Second level of the hierarchical paste: runs paste_sums.sh (SCRIPTS['paste']) on the
#list of per-group pasted files to produce one gzipped TSV per study under STUDYDIR.
rule paste_sums_per_study:
	input:
		config['staging'] + '/all.exon_bw_count.{study}.pasted.files.list'
	output:
		os.path.join(STUDYDIR, 'all.exon_bw_count.pasted.{study}.tsv.gz')
	params:
		#study and staging are not referenced by the shell command below
		study=lambda wildcards: wildcards.study,
		staging=config['staging'],
		script_path=SCRIPTS['paste'],
		#empty string unless 'existing_sums' was passed in the config
		existing_sums=config['existing_sums']
	shell:
		#arguments: <file list> <output> the literal "dont_get_ids" [<existing sums>]
		"{params.script_path} {input} {output} dont_get_ids {params.existing_sums}"

###Rejoin of exon/gene coverage into original annotation rows
#Gene-level rejoin: runs rejoin_genes.sh (traced with `bash -x`) on a study's pasted
#exon sums, using the 'gene_rejoin_mapping' file and the `rejoin` tool, with 8 threads.
rule rejoin_genes:
	input:
		os.path.join(STUDYDIR, 'all.exon_bw_count.pasted.{study}.tsv.gz')
	output:
		os.path.join(STUDYDIR, 'all.gene_counts.rejoined.{study}.tsv.gz')
	threads: 8
	params:
		#staging and num_samples are not referenced by the shell command below
		staging=config['staging'],
		script_path=SCRIPTS['rejoin'],
		script_path2=SCRIPTS['rejoin_sh'],
		gene_mapping_file=config['gene_rejoin_mapping'],
		num_samples=config['num_samples']
	shell:
		#arguments: <pasted sums> <output> <mapping file> <rejoin tool> gene <threads>
		"""
		/bin/bash -x {params.script_path2} {input} {output} {params.gene_mapping_file} {params.script_path} gene {threads}
		"""

#Exon-level rejoin: same invocation as rejoin_genes, but with the 'exon_rejoin_mapping'
#file and the mode argument "exon"; produces the per-study exon counts file.
rule rejoin_exons:
	input:
		os.path.join(STUDYDIR, 'all.exon_bw_count.pasted.{study}.tsv.gz')
	output:
		os.path.join(STUDYDIR, 'all.exon_counts.rejoined.{study}.tsv.gz')
	threads: 8
	params:
		#staging and num_samples are not referenced by the shell command below
		staging=config['staging'],
		script_path=SCRIPTS['rejoin'],
		script_path2=SCRIPTS['rejoin_sh'],
		exon_mapping_file=config['exon_rejoin_mapping'],
		num_samples=config['num_samples']
	shell:
		#arguments: <pasted sums> <output> <mapping file> <rejoin tool> exon <threads>
		"""
		/bin/bash -x {params.script_path2} {input} {output} {params.exon_mapping_file} {params.script_path} exon {threads}
		"""

def get_gene_annotations(wildcards):
	"""Return the compressed gene sums path of every annotation for one study.

	wildcards: object with a `study` attribute.
	The result follows the order of the module-level `annotations` list; each entry
	is <STUDYDIR>/<annotation>.gene.sums.<study>.tsv.gz.
	"""
	paths = []
	for annotation in annotations:
		paths.append('%s/%s.gene.sums.%s.tsv.gz' % (STUDYDIR, annotation, wildcards.study))
	return paths

#Output patterns of rejoin_genes_final: one uncompressed gene sums TSV per annotation,
#with {study} left as a Snakemake wildcard.
gene_annotations_uncompressed=['%s/%s.gene.sums.{study}.tsv' % (STUDYDIR, annotation) for annotation in annotations]
#Splits a study's rejoined gene counts into one gene sums file per annotation.
#Steps of the shell script:
# 1. save the sample columns (fields 7 onward) of the pasted exon sums header line to
#    all.exon_bw_count.pasted.gz.<study>.samples_header in the working directory;
#    pipefail is switched off just for this pipeline (presumably because `head -1` exits
#    early and zcat would otherwise make the pipeline fail) and switched back on after
# 2. pipe the gene counts (header line skipped) into rejoin_genes.py, run with pypy
# 3. move each <annotation>.gene.sums.<study>.tsv from the working directory into STUDYDIR
#NOTE(review): the samples_header file is left behind in the working directory.
rule rejoin_genes_final:
	input:
		os.path.join(STUDYDIR, 'all.gene_counts.rejoined.{study}.tsv.gz'),
		os.path.join(STUDYDIR, 'all.exon_bw_count.pasted.{study}.tsv.gz')
	output:
		gene_annotations_uncompressed
	params:
		#staging is not referenced by the shell command below
		staging=config['staging'],
		script_path=SCRIPTS['rejoin_genes'],
		#e.g. ../G029.G026.R109.F006.20190220.gtf.disjoint2exons2genes.rejoin_genes.bed
		gene_mapping_final_file=config['gene_mapping_final'],
		#e.g. G026,G029,R109,ERCC,SIRV,F006 (comma-separated, passed to the script as is)
		annotation_list=config['annotation_list'],
		#same list separated by spaces, for the shell `for` loop
		annotation_list_space_delimited=' '.join(config['annotation_list'].split(',')),
		id_mapping=config['sample_ids_file'],
		#first entry of the annotation list, e.g. G026
		main_annotation=main_annotation,
		study=lambda wildcards: wildcards.study,
		studydir=STUDYDIR
	shell:
		#${{t}} is escaped for Snakemake's formatting and reaches the shell as ${t}
		"""
		set +o pipefail
		zcat {input[1]} | head -1 | cut -f 7- > all.exon_bw_count.pasted.gz.{params.study}.samples_header
		set -o pipefail
		zcat {input[0]} | tail -n+2 | {params.script_path} {params.gene_mapping_final_file} gene all.exon_bw_count.pasted.gz.{params.study}.samples_header {params.annotation_list} {params.id_mapping} {params.main_annotation} {params.study}
		for t in {params.annotation_list_space_delimited}; do
			mv ${{t}}.gene.sums.{params.study}.tsv {params.studydir}/
		done
		"""

#gene_annotations_compressed=['%s/%s.gene.sums.{study}.tsv.gz' % (STUDYDIR, annotation) for annotation in annotations]

#Compresses one per-annotation gene sums TSV with pigz (--fast, 8 threads).
#The path handed to pigz is rebuilt from params and is the same file as the declared
#input; pigz is expected to write the ".gz" file next to it, which is the declared output.
rule compress_final_rejoined_genes:
	input:
		'%s/{annotation}.gene.sums.{study}.tsv' % (STUDYDIR)
	output:
		'%s/{annotation}.gene.sums.{study}.tsv.gz' % (STUDYDIR)
	threads: 8
	params:
		#wildcard values re-exposed as params for use in the shell command
		study=lambda wildcards: wildcards.study,
		studydir=STUDYDIR,
		annotation=lambda wildcards: wildcards.annotation
	shell:
		"""
		pigz --fast -p {threads} {params.studydir}/{params.annotation}.gene.sums.{params.study}.tsv
		"""