Checking in Profile production code for 0.0.6

ccgd-profile · Jan 14, 2016 · 3ac1dca · 3ac1dca
1 parent 085c2bf
commit 3ac1dca
Show file tree

Hide file tree

Showing 8 changed files with 4,444 additions and 58 deletions.
diff --git a/breakmer.py b/breakmer.py
@@ -1,72 +1,97 @@
 #! /usr/bin/local/python
-# -*- coding: utf-8 -*-
 
-import argparse
-import breakmer.params as params
-import breakmer.processor.analysis as breakmer_analysis
+# Fix get_svs_result()
+# Fix filter_by_feature
 
-__author__ = "Ryan Abo"
-__copyright__ = "Copyright 2015, Ryan Abo"
-__email__ = "[email protected]"
-__license__ = "MIT"
+import sys
+import os
+from utils import *
+from sv_processor import *
+import logging
+from optparse import OptionParser
 
 '''
+Author: Ryan Abo
+12/23/2013
 
-Main script that initiates the BreaKmer analysis or auxiliary functions to setup BreaKmer for analysis.
+Requires a configuration file containing key=value style formatting.
 
-There are three functions provided:
-1. run                    = perform analysis to detect structural variation.
-2. start_blat_server      = start the blat server in the background for analysis.
-3. prepare_reference_data = prepare the reference data for the target regions that are specified in the input files.
-
-The blat server provides a challenge in workflow. The best method is to:
-1. prepare reference data using 'prepare_reference_data' function
-2. start the blat server using 'start_blat_server' function
-    breakmer.py start_blat_server -p <port_number> --hostname <hostname> -c <config file>
-3. run the analysis and keep the blat server alive in the background for use in other analyses.
-    breakmer.py run -k -p <port_number> --hostname <hostname> -c <config file> -n <nprocessors> -g <gene_list>
+This is the main script that initiates the analysis. 
 
+Submodules:
+utils.py - utility functions.
+ classes:
+ - fq_read
+ - FastqFile
+ - anno
+ - params
+sv_processor.py - classes to manage analysis, process, analyze, write.
+ classes:
+ - runner - analysis-wide information and analysis (loop through each)
+ - target - target-level information and analysis
+ - contig - contig-level information and analysis
+sv_assembly.py - classes to take sample kmers and produce contigs.
+ classes:
+ - kmers
+ - kmer 
+ - buffer
+ - contig
+sv_caller.py - call SVs from blat result.
+ classes:
+ - align_manager
+ - blat_manager
+ - blat_res
+ - sv_event
 '''
 
-PARSER = argparse.ArgumentParser(description='Program to identify structural variants within targeted locations.', usage='%(prog)s [options]', add_help=True)
-SUBPARSERS = PARSER.add_subparsers(help='Program mode (run, start_blat_server, prepare_reference_data).', dest='fncCmd')
-
-# Setup three separate parsers for the three different functions.
-RUN_PARSER = SUBPARSERS.add_parser('run', help='Run analysis to detect structural variants.')
-SERVER_PARSER = SUBPARSERS.add_parser('start_blat_server', help='Start the blat server prior to performing the analysis.')
-REF_PARSER = SUBPARSERS.add_parser('prepare_reference_data', help='Prepare the reference sequence data for target regions prior to analysis.')
+#-----------------------------------------------------------
+# TODO: Set required options, exit if these are not set.
+#-----------------------------------------------------------
+def parse_config_f(config_fn, opts) :
+  param_opts = {}
+  config_f = open(config_fn, 'rU')
+  flines = config_f.readlines()
+  for line in flines :
+    line = line.strip()
+    linesplit = line.split("=")
+    if len(linesplit) == 1 : 
+      print 'Config line', line, ' not set correctly. Exiting.'
+      sys.exit()
+    else :
+      k,v = linesplit
+      param_opts[k] = v
 
-# Run parser
-RUN_PARSER.add_argument('--log_level', dest='log_level', default='DEBUG', help='Log level [default: DEBUG]')
-RUN_PARSER.add_argument('--indel_size', dest='indel_size', default=15, type=int, help='Indel size filter. [default: %(default)s]')
-RUN_PARSER.add_argument('--trl_sr_thresh', dest='trl_sr_thresh', default=2, type=int, help='Split read support threshold for translocations. [default: %(default)s]')
-RUN_PARSER.add_argument('--indel_sr_thresh', dest='indel_sr_thresh', default=5, type=int, help='Split read support threshold for indels. [default: %(default)s]')
-RUN_PARSER.add_argument('--rearr_sr_thresh', dest='rearr_sr_thresh', default=2, type=int, help='Split read support threshold for rearrangements. [default: %(default)s]')
-RUN_PARSER.add_argument('--rearr_min_seg_len', dest='rearr_minseg_len', default=30, type=int, help='Threshold for minimum segment to be rearranged. [default: %(default)s]')
-RUN_PARSER.add_argument('--trl_min_seg_len', dest='trl_minseg_len', default=25, type=int, help='Threshold for minimum length of translocation segment. [default: %(default)s]')
-RUN_PARSER.add_argument('--align_thresh', dest='align_thresh', default=.90, type=int, help='Threshold for minimum read alignment for assembly. [default: %(default)s]')
-RUN_PARSER.add_argument('--no_output_header', dest='no_output_header', default=False, action='store_true', help='Suppress output headers. [default: %(default)s]')
-RUN_PARSER.add_argument('--discread_only_thresh', dest='discread_only_thresh', default=2, type=int, help='The number of discordant read pairs in a cluster to output without evidence from a split read event. [default: %(default)s]')
-RUN_PARSER.add_argument('--generate_image', dest='generate_image', default=False, action='store_true', help='Generate pileup image for events. [default: %(default)s]')
-RUN_PARSER.add_argument('--hostname', dest='blat_hostname', default='localhost', help='The hostname for the blat server. Localhost will be used if not specified. [default: %(default)s]')
-RUN_PARSER.add_argument('-g', '--gene_list', dest='gene_list', default=None, help='Gene list to consider for analysis. [default: %(default)s]')
-RUN_PARSER.add_argument('-f', '--filter_list', dest='filterList', default=None, help='Input a set of events to filter out. [default: %(default)s]')
-RUN_PARSER.add_argument('-n', '--nprocessors', dest='nprocs', default=1, type=int, help='The number of processors to use for analysis. [default: %(default)s]')
-RUN_PARSER.add_argument('-s', '--start_blat_server', dest='start_blat_server', default=False, action='store_true', help='Start the blat server. Random port number and localhost will be used if neither specified. [default: %(default)s]')
-RUN_PARSER.add_argument('-k', '--keep_blat_server', dest='keep_blat_server', default=False, action='store_true', help='Keep the blat server alive. [default: %(default)s]')
-RUN_PARSER.add_argument('-p', '--port_number', dest='blat_port', default=None, type=int, help='The port number for the blat server. A random port number (8000-9500) will be used if not specified. [default: %(default)s]')
-RUN_PARSER.add_argument('-c', '--config', dest='config_fn', default=None, required=True, help='The configuration filename that contains additional parameters. [default: %(default)s]')
+  for opt in vars(opts) : 
+    param_opts[opt] = vars(opts)[opt]
+  return param_opts
+#-----------------------------------------------------------
 
-# Server parser
-SERVER_PARSER.add_argument('-p', '--port_number', dest='blat_port', default=None, type=int, help='The port number for the blat server. A random port number (8000-9500) will be used if not specified. [default: %(default)s]')
-SERVER_PARSER.add_argument('--hostname', dest='blat_hostname', default='localhost', help='The hostname for the blat server. Localhost will be used if not specified. [default: %(default)s]')
-SERVER_PARSER.add_argument('-c', '--config', dest='config_fn', default=None, required=True, help='The configuration filename that contains additional parameters. [default: %(default)s]')
+#````````````````````````````````````````````````````````````
+usage = '%prog [options] <config file name>'
+desc = """Script to identify structural variants within targeted locations."""
+parser = OptionParser(usage=usage,description=desc)
+parser.add_option('-l', '--log_level', dest='log_level', default='DEBUG', type='string', help='Log level [default: %default]')
+parser.add_option('-a', '--keep_repeat_regions', dest='keep_repeat_regions', default=False, action='store_true', help='Keep indels in repeat regions. Requires a repeat mask bed file. [default: %default]')
+parser.add_option('-p', '--preset_ref_data', dest='preset_ref_data', default=False, action="store_true", help='Preset all the reference data for all the targets before running analysis. [default: %default]')
+parser.add_option('-s', '--indel_size', dest='indel_size', default=15, type='int', help='Indel size filter [default: %default]')
+parser.add_option('-c', '--trl_sr_thresh', dest='trl_sr_thresh', default=2, type='int', help='Split read support threshold for translocations [default: %default]')
+parser.add_option('-d', '--indel_sr_thresh', dest='indel_sr_thresh', default=5, type='int', help='Split read support threshold for indels [default: %default]')
+parser.add_option('-r', '--rearr_sr_thresh', dest='rearr_sr_thresh', default=3, type='int', help='Split read support threshold for rearrangements [default: %default]')
+parser.add_option('-g', '--gene_list', dest='gene_list', default=None, type='string', help='Gene list to consider for analysis [default: %default]')
+parser.add_option('-k', '--keep_intron_vars', dest='keep_intron_vars', default=False, action='store_true', help='Keep intronic indels or rearrangements [default: %default]')
+parser.add_option('-v', '--var_filter', dest='var_filter', default='all', type='string', help='Variant types to report (all, indel, trl, rearrangment) [default: %default]')
+parser.add_option('-m', '--rearr_min_seg_len', dest='rearr_minseg_len', default=30, type='int', help='Threshold for minimum segment to be rearranged [default: %default]')
+parser.add_option('-n', '--trl_min_seg_len', dest='trl_minseg_len', default=25, type='int', help='Threshold for minimum length of translocation segment [default: %default]')
+parser.add_option('-t', '--align_thresh', dest='align_thresh', default=.90, type='int', help='Threshold for minimum read alignment for assembly [default: %default]')
+parser.add_option('-z', '--no_output_header', dest='no_output_header', default=False, action='store_true', help='Suppress output headers [default: %default]')
 
-# Setup reference parser
-REF_PARSER.add_argument('-g', '--gene_list', dest='gene_list', default=None, help='Gene list to consider for analysis. [default: %(default)s]')
-REF_PARSER.add_argument('-c', '--config', dest='config_fn', default=None, required=True, help='The configuration filename that contains additional parameters. [default: %(default)s]')
-REF_PARSER.add_argument('-n', '--nprocessors', dest='nprocs', default=1, type=int, help='The number of processors to use for analysis. [default: %(default)s]')
+if __name__ == '__main__' :
+  opts, args = parser.parse_args(sys.argv[1:])
+  config_fn = args[0]
 
-# Start analysis
-RUN_TRACKER = breakmer_analysis.RunTracker(params.ParamManager(PARSER.parse_args()))
-RUN_TRACKER.run()
+  tic = time.clock()
+  config_d = parse_config_f(config_fn,opts)
+  setup_logger(config_d,'root')
+  r = runner(config_d)
+  r.run(tic)
+#````````````````````````````````````````````````````````````
diff --git a/kmer_region.config b/kmer_region.config
@@ -0,0 +1,16 @@
+analysis_name=<sample_id>
+targets_bed_file=/ifs/rcgroups/ccgd/rpa4/analysis/sv/130808/kmer_region_branch/data/panels/<panel_bed>
+sample_bam_file=/ifs/rcgroups/ccgd/rpa4/analysis/sv/130808/kmer_region_branch/analysis/<project_id>/<sample_id>/data/sample.bam
+analysis_dir=/ifs/rcgroups/ccgd/rpa4/analysis/sv/130808/kmer_region_branch/analysis/<project_id>/<sample_id>
+reference_data_dir=/ifs/rcgroups/ccgd/rpa4/analysis/sv/130808/kmer_region_branch/data/ref
+other_regions_file=/ifs/rcgroups/ccgd/rpa4/analysis/sv/130808/kmer_region_branch/data/cluster_regions.bed
+cutadapt=/ifs/rcgroups/ccgd/rpa4/software/cutadapt-1.2.1/bin/cutadapt
+cutadapt_config_file=/ifs/rcgroups/ccgd/rpa4/scripts/sv/kmer_sv/cutadapt.conf
+jellyfish=/ifs/rcgroups/ccgd/rpa4/software/jellyfish-1.1.11/bin/jellyfish
+assembler=/ifs/rcgroups/ccgd/rpa4/software/newbler/install2.6/bin/runAssembly
+blat=/ifs/rcgroups/ccgd/rpa4//software/blat_bin/blat
+blast=/ifs/rcgroups/ccgd/rpa4//software/ncbi-blast-2.2.28+/bin/blastn
+reference_fasta=/ifs/rcgroups/ccgd/rpa4/analysis/sv/130808/kmer_region_branch/data/ref/ucsc_hg19_dna.primary-assembly.fa
+gene_annotation_file=/ifs/rcgroups/ccgd/rpa4/ref2/human/ucsc/current/annotation/ucsc_hg19_refgene.txt
+repeat_mask_file=/ifs/rcgroups/ccgd/rpa4/ref2/human/ucsc/current/annotation/ucsc_hg19_rmsk.bed
+kmer_size=15
diff --git a/olc.py b/olc.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python
+
+#from Bio import SeqIO
+
+#records = list(SeqIO.parse("rosalind_oap.txt", "fasta"))
+
+#Two sequences you want to align
+#seq11 = records[0].seq
+#seq22 = records[1].seq
+
+#seq22 = "AGGAACAACATTTTCAGCTACAGCACACATACAAATTNAAAAAAAAAAAAAACATAGGAATTTCCCAGCAAGACCATAATGAATAAG" 
+#seq11 = "TTTTTCCTGAGAATTTATCTTGTAGGTCAAAACAGATCTAGGAACATCATTTTCAGCTACAGCACACATACAAATTAAAAAAAAAAAAAAACATAGGAATT"
+#seq11='TCCTGAGAATTTATCTTGTAGGTCAAAACAGATCTAGGAACATCATTTTCAGCTACAGCACACATACAAATTAAAAAAAAAAAAAACATAGGAATTTCCCA'
+#seq22='CTAGGAACATCATTTTCAGCTAAAGCACACATACAAATTAAAAAAAAAAAAAACATAGGAATTTCCCAGCAAGACCATAATGAATAAGAAACAATC'
+#seq11='TCCTGAGAATTTATCTTGTAGGTCAAAACAGATCTAGGAACATCATTTTCAGCTACAGCACACATACAAATTAAAAAAAAAAAAAAACATAGGAA'
+#seq22='CCCGTATTTAAAGGACACACAATTTAGAGGACCAACTGA'
+
+match_award = 1
+mismatch_penalty = -2
+gap_penalty = -2
+
+#Creates empty matrix with all zeros
+def zeros(shape):
+    retval = []
+    for x in range(shape[0]):
+        retval.append([])
+        for y in range(shape[1]):
+            retval[-1].append(0)
+    return retval
+
+#No substituition matrix, just simple linear gap penalty model
+def match_score(alpha, beta):
+    if alpha == beta:
+        return match_award
+    elif alpha == '-' or beta == '-':
+        return gap_penalty
+    else:
+        return mismatch_penalty
+
+def nw(seq1, seq2):
+    global max_score
+
+    m = len(seq1)
+    n = len(seq2) # lengths of two sequences
+
+    # Generate DP (Dynamic Programmed) table and traceback path pointer matrix
+    score = zeros((n+1, m+1)) # the DP table
+    for i in range(n+1) :
+      score[i][0] = 0
+
+    for j in range(m+1) :
+      score[0][j] = 0
+
+    #Traceback matrix
+    pointer = zeros((n+1, m+1)) # to store the traceback path
+    for i in range(n+1) :
+      pointer[i][0] = 1
+    for j in range(m+1) :
+      pointer[0][j] = 2
+
+    # Calculate DP table and mark pointers
+    for i in range(1, n + 1):
+        for j in range(1, m + 1):
+            score_diagonal = score[i-1][j-1] + match_score(seq1[j-1], seq2[i-1])
+            score_up = score[i][j-1] + gap_penalty
+	    score_left = score[i-1][j] + gap_penalty
+	    score[i][j] = max(score_left, score_up, score_diagonal)
+
+	    if score[i][j] == score_diagonal :
+                pointer[i][j] = 3 # 3 means trace diagonal
+	    elif score[i][j] == score_up :
+                pointer[i][j] = 2 # 2 means trace left
+	    elif score[i][j] == score_left :
+                pointer[i][j] = 1 # 1 means trace up
+
+    #Finding the right-most match which represents a longest overlap
+    #Note that .index() will find the index of the first item in the list that matches, 
+    #so if you had several identical "max" values, the index returned would be the one for the first.
+    max_i = -200
+    for ii in range(n+1) :
+      if score[ii][-1] >= max_i :
+	max_i = score[ii][-1]
+	i = ii
+#    print max_i
+
+    prei = i
+    prej = j
+    #Traceback, follow pointers in the traceback matrix
+    align1, align2 = '', '' # initial sequences
+    while 1 : 
+      if pointer[i][j] == 3 :
+	align1 = seq1[j-1] + align1
+	align2 = seq2[i-1] + align2
+	i -= 1
+	j -= 1
+      elif pointer[i][j] == 2 : #2 means trace left
+	align2 = '-' + align2
+	align1 = seq1[j-1] + align1
+	j -= 1
+      elif  pointer[i][j] == 1 : # 1 means trace up
+	align2 = seq2[i-1] + align2
+	align1 = '-' + align1
+	i -= 1
+#      print align1, align2, j, i
+      if (i == 0 or j == 0) : break
+
+    return (align1, align2, prej, j, prei, i, max_i)
+#    print align1 + '\n' + align2
+
+#print seq11
+#print seq22    
+#nw(seq11, seq22)
+#nw(seq22,seq11)