Skip to content

Commit

Permalink
Checking in Profile production code for 0.0.6
Browse files Browse the repository at this point in the history
  • Loading branch information
mducar authored and mducar committed Jan 14, 2016
1 parent 085c2bf commit 3ac1dca
Show file tree
Hide file tree
Showing 8 changed files with 4,444 additions and 58 deletions.
141 changes: 83 additions & 58 deletions breakmer.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,72 +1,97 @@
#! /usr/bin/local/python
# -*- coding: utf-8 -*-

import argparse
import breakmer.params as params
import breakmer.processor.analysis as breakmer_analysis
# Fix get_svs_result()
# Fix filter_by_feature

__author__ = "Ryan Abo"
__copyright__ = "Copyright 2015, Ryan Abo"
__email__ = "[email protected]"
__license__ = "MIT"
import sys
import os
from utils import *
from sv_processor import *
import logging
from optparse import OptionParser

'''
Author: Ryan Abo
12/23/2013
Main script that initiates the BreaKmer analysis or auxiliary functions to setup BreaKmer for analysis.
Requires a configuration file containing key=value style formatting.
There are three functions provided:
1. run = perform analysis to detect structural variation.
2. start_blat_server = start the blat server in the background for analysis.
3. prepare_reference_data = prepare the reference data for the target regions that are specified in the input files.
The blat server provides a challenge in workflow. The best method is to:
1. prepare reference data using 'prepare_reference_data' function
2. start the blat server using 'start_blat_server' function
breakmer.py start_blat_server -p <port_number> --hostname <hostname> -c <config file>
3. run the analysis and keep the blat server alive in the background for use in other analyses.
breakmer.py run -k -p <port_number> --hostname <hostname> -c <config file> -n <nprocessors> -g <gene_list>
This is the main script that initiates the analysis.
Submodules:
utils.py - utility functions.
classes:
- fq_read
- FastqFile
- anno
- params
sv_processor.py - classes to manage analysis, process, analyze, write.
classes:
- runner - analysis-wide information and analysis (loop through each)
- target - target-level information and analysis
- contig - contig-level information and analysis
sv_assembly.py - classes to take sample kmers and produce contigs.
classes:
- kmers
- kmer
- buffer
- contig
sv_caller.py - call SVs from blat result.
classes:
- align_manager
- blat_manager
- blat_res
- sv_event
'''

PARSER = argparse.ArgumentParser(description='Program to identify structural variants within targeted locations.', usage='%(prog)s [options]', add_help=True)
SUBPARSERS = PARSER.add_subparsers(help='Program mode (run, start_blat_server, prepare_reference_data).', dest='fncCmd')

# Setup three separate parsers for the three different functions.
RUN_PARSER = SUBPARSERS.add_parser('run', help='Run analysis to detect structural variants.')
SERVER_PARSER = SUBPARSERS.add_parser('start_blat_server', help='Start the blat server prior to performing the analysis.')
REF_PARSER = SUBPARSERS.add_parser('prepare_reference_data', help='Prepare the reference sequence data for target regions prior to analysis.')
#-----------------------------------------------------------
# TODO: Set required options, exit if these are not set.
#-----------------------------------------------------------
def parse_config_f(config_fn, opts) :
param_opts = {}
config_f = open(config_fn, 'rU')
flines = config_f.readlines()
for line in flines :
line = line.strip()
linesplit = line.split("=")
if len(linesplit) == 1 :
print 'Config line', line, ' not set correctly. Exiting.'
sys.exit()
else :
k,v = linesplit
param_opts[k] = v

# Run parser
RUN_PARSER.add_argument('--log_level', dest='log_level', default='DEBUG', help='Log level [default: DEBUG]')
RUN_PARSER.add_argument('--indel_size', dest='indel_size', default=15, type=int, help='Indel size filter. [default: %(default)s]')
RUN_PARSER.add_argument('--trl_sr_thresh', dest='trl_sr_thresh', default=2, type=int, help='Split read support threshold for translocations. [default: %(default)s]')
RUN_PARSER.add_argument('--indel_sr_thresh', dest='indel_sr_thresh', default=5, type=int, help='Split read support threshold for indels. [default: %(default)s]')
RUN_PARSER.add_argument('--rearr_sr_thresh', dest='rearr_sr_thresh', default=2, type=int, help='Split read support threshold for rearrangements. [default: %(default)s]')
RUN_PARSER.add_argument('--rearr_min_seg_len', dest='rearr_minseg_len', default=30, type=int, help='Threshold for minimum segment to be rearranged. [default: %(default)s]')
RUN_PARSER.add_argument('--trl_min_seg_len', dest='trl_minseg_len', default=25, type=int, help='Threshold for minimum length of translocation segment. [default: %(default)s]')
RUN_PARSER.add_argument('--align_thresh', dest='align_thresh', default=.90, type=int, help='Threshold for minimum read alignment for assembly. [default: %(default)s]')
RUN_PARSER.add_argument('--no_output_header', dest='no_output_header', default=False, action='store_true', help='Suppress output headers. [default: %(default)s]')
RUN_PARSER.add_argument('--discread_only_thresh', dest='discread_only_thresh', default=2, type=int, help='The number of discordant read pairs in a cluster to output without evidence from a split read event. [default: %(default)s]')
RUN_PARSER.add_argument('--generate_image', dest='generate_image', default=False, action='store_true', help='Generate pileup image for events. [default: %(default)s]')
RUN_PARSER.add_argument('--hostname', dest='blat_hostname', default='localhost', help='The hostname for the blat server. Localhost will be used if not specified. [default: %(default)s]')
RUN_PARSER.add_argument('-g', '--gene_list', dest='gene_list', default=None, help='Gene list to consider for analysis. [default: %(default)s]')
RUN_PARSER.add_argument('-f', '--filter_list', dest='filterList', default=None, help='Input a set of events to filter out. [default: %(default)s]')
RUN_PARSER.add_argument('-n', '--nprocessors', dest='nprocs', default=1, type=int, help='The number of processors to use for analysis. [default: %(default)s]')
RUN_PARSER.add_argument('-s', '--start_blat_server', dest='start_blat_server', default=False, action='store_true', help='Start the blat server. Random port number and localhost will be used if neither specified. [default: %(default)s]')
RUN_PARSER.add_argument('-k', '--keep_blat_server', dest='keep_blat_server', default=False, action='store_true', help='Keep the blat server alive. [default: %(default)s]')
RUN_PARSER.add_argument('-p', '--port_number', dest='blat_port', default=None, type=int, help='The port number for the blat server. A random port number (8000-9500) will be used if not specified. [default: %(default)s]')
RUN_PARSER.add_argument('-c', '--config', dest='config_fn', default=None, required=True, help='The configuration filename that contains additional parameters. [default: %(default)s]')
for opt in vars(opts) :
param_opts[opt] = vars(opts)[opt]
return param_opts
#-----------------------------------------------------------

# Server parser
SERVER_PARSER.add_argument('-p', '--port_number', dest='blat_port', default=None, type=int, help='The port number for the blat server. A random port number (8000-9500) will be used if not specified. [default: %(default)s]')
SERVER_PARSER.add_argument('--hostname', dest='blat_hostname', default='localhost', help='The hostname for the blat server. Localhost will be used if not specified. [default: %(default)s]')
SERVER_PARSER.add_argument('-c', '--config', dest='config_fn', default=None, required=True, help='The configuration filename that contains additional parameters. [default: %(default)s]')
#````````````````````````````````````````````````````````````
usage = '%prog [options] <config file name>'
desc = """Script to identify structural variants within targeted locations."""
parser = OptionParser(usage=usage,description=desc)
parser.add_option('-l', '--log_level', dest='log_level', default='DEBUG', type='string', help='Log level [default: %default]')
parser.add_option('-a', '--keep_repeat_regions', dest='keep_repeat_regions', default=False, action='store_true', help='Keep indels in repeat regions. Requires a repeat mask bed file. [default: %default]')
parser.add_option('-p', '--preset_ref_data', dest='preset_ref_data', default=False, action="store_true", help='Preset all the reference data for all the targets before running analysis. [default: %default]')
parser.add_option('-s', '--indel_size', dest='indel_size', default=15, type='int', help='Indel size filter [default: %default]')
parser.add_option('-c', '--trl_sr_thresh', dest='trl_sr_thresh', default=2, type='int', help='Split read support threshold for translocations [default: %default]')
parser.add_option('-d', '--indel_sr_thresh', dest='indel_sr_thresh', default=5, type='int', help='Split read support threshold for indels [default: %default]')
parser.add_option('-r', '--rearr_sr_thresh', dest='rearr_sr_thresh', default=3, type='int', help='Split read support threshold for rearrangements [default: %default]')
parser.add_option('-g', '--gene_list', dest='gene_list', default=None, type='string', help='Gene list to consider for analysis [default: %default]')
parser.add_option('-k', '--keep_intron_vars', dest='keep_intron_vars', default=False, action='store_true', help='Keep intronic indels or rearrangements [default: %default]')
parser.add_option('-v', '--var_filter', dest='var_filter', default='all', type='string', help='Variant types to report (all, indel, trl, rearrangment) [default: %default]')
parser.add_option('-m', '--rearr_min_seg_len', dest='rearr_minseg_len', default=30, type='int', help='Threshold for minimum segment to be rearranged [default: %default]')
parser.add_option('-n', '--trl_min_seg_len', dest='trl_minseg_len', default=25, type='int', help='Threshold for minimum length of translocation segment [default: %default]')
parser.add_option('-t', '--align_thresh', dest='align_thresh', default=.90, type='int', help='Threshold for minimum read alignment for assembly [default: %default]')
parser.add_option('-z', '--no_output_header', dest='no_output_header', default=False, action='store_true', help='Suppress output headers [default: %default]')

# Setup reference parser
REF_PARSER.add_argument('-g', '--gene_list', dest='gene_list', default=None, help='Gene list to consider for analysis. [default: %(default)s]')
REF_PARSER.add_argument('-c', '--config', dest='config_fn', default=None, required=True, help='The configuration filename that contains additional parameters. [default: %(default)s]')
REF_PARSER.add_argument('-n', '--nprocessors', dest='nprocs', default=1, type=int, help='The number of processors to use for analysis. [default: %(default)s]')
if __name__ == '__main__' :
opts, args = parser.parse_args(sys.argv[1:])
config_fn = args[0]

# Start analysis
RUN_TRACKER = breakmer_analysis.RunTracker(params.ParamManager(PARSER.parse_args()))
RUN_TRACKER.run()
tic = time.clock()
config_d = parse_config_f(config_fn,opts)
setup_logger(config_d,'root')
r = runner(config_d)
r.run(tic)
#````````````````````````````````````````````````````````````
16 changes: 16 additions & 0 deletions kmer_region.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
analysis_name=<sample_id>
targets_bed_file=/ifs/rcgroups/ccgd/rpa4/analysis/sv/130808/kmer_region_branch/data/panels/<panel_bed>
sample_bam_file=/ifs/rcgroups/ccgd/rpa4/analysis/sv/130808/kmer_region_branch/analysis/<project_id>/<sample_id>/data/sample.bam
analysis_dir=/ifs/rcgroups/ccgd/rpa4/analysis/sv/130808/kmer_region_branch/analysis/<project_id>/<sample_id>
reference_data_dir=/ifs/rcgroups/ccgd/rpa4/analysis/sv/130808/kmer_region_branch/data/ref
other_regions_file=/ifs/rcgroups/ccgd/rpa4/analysis/sv/130808/kmer_region_branch/data/cluster_regions.bed
cutadapt=/ifs/rcgroups/ccgd/rpa4/software/cutadapt-1.2.1/bin/cutadapt
cutadapt_config_file=/ifs/rcgroups/ccgd/rpa4/scripts/sv/kmer_sv/cutadapt.conf
jellyfish=/ifs/rcgroups/ccgd/rpa4/software/jellyfish-1.1.11/bin/jellyfish
assembler=/ifs/rcgroups/ccgd/rpa4/software/newbler/install2.6/bin/runAssembly
blat=/ifs/rcgroups/ccgd/rpa4//software/blat_bin/blat
blast=/ifs/rcgroups/ccgd/rpa4//software/ncbi-blast-2.2.28+/bin/blastn
reference_fasta=/ifs/rcgroups/ccgd/rpa4/analysis/sv/130808/kmer_region_branch/data/ref/ucsc_hg19_dna.primary-assembly.fa
gene_annotation_file=/ifs/rcgroups/ccgd/rpa4/ref2/human/ucsc/current/annotation/ucsc_hg19_refgene.txt
repeat_mask_file=/ifs/rcgroups/ccgd/rpa4/ref2/human/ucsc/current/annotation/ucsc_hg19_rmsk.bed
kmer_size=15
113 changes: 113 additions & 0 deletions olc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
#!/usr/bin/env python

#from Bio import SeqIO

#records = list(SeqIO.parse("rosalind_oap.txt", "fasta"))

#Two sequences you want to align
#seq11 = records[0].seq
#seq22 = records[1].seq

#seq22 = "AGGAACAACATTTTCAGCTACAGCACACATACAAATTNAAAAAAAAAAAAAACATAGGAATTTCCCAGCAAGACCATAATGAATAAG"
#seq11 = "TTTTTCCTGAGAATTTATCTTGTAGGTCAAAACAGATCTAGGAACATCATTTTCAGCTACAGCACACATACAAATTAAAAAAAAAAAAAAACATAGGAATT"
#seq11='TCCTGAGAATTTATCTTGTAGGTCAAAACAGATCTAGGAACATCATTTTCAGCTACAGCACACATACAAATTAAAAAAAAAAAAAACATAGGAATTTCCCA'
#seq22='CTAGGAACATCATTTTCAGCTAAAGCACACATACAAATTAAAAAAAAAAAAAACATAGGAATTTCCCAGCAAGACCATAATGAATAAGAAACAATC'
#seq11='TCCTGAGAATTTATCTTGTAGGTCAAAACAGATCTAGGAACATCATTTTCAGCTACAGCACACATACAAATTAAAAAAAAAAAAAAACATAGGAA'
#seq22='CCCGTATTTAAAGGACACACAATTTAGAGGACCAACTGA'

match_award = 1
mismatch_penalty = -2
gap_penalty = -2

#Creates empty matrix with all zeros
def zeros(shape):
retval = []
for x in range(shape[0]):
retval.append([])
for y in range(shape[1]):
retval[-1].append(0)
return retval

#No substituition matrix, just simple linear gap penalty model
def match_score(alpha, beta):
if alpha == beta:
return match_award
elif alpha == '-' or beta == '-':
return gap_penalty
else:
return mismatch_penalty

def nw(seq1, seq2):
global max_score

m = len(seq1)
n = len(seq2) # lengths of two sequences

# Generate DP (Dynamic Programmed) table and traceback path pointer matrix
score = zeros((n+1, m+1)) # the DP table
for i in range(n+1) :
score[i][0] = 0

for j in range(m+1) :
score[0][j] = 0

#Traceback matrix
pointer = zeros((n+1, m+1)) # to store the traceback path
for i in range(n+1) :
pointer[i][0] = 1
for j in range(m+1) :
pointer[0][j] = 2

# Calculate DP table and mark pointers
for i in range(1, n + 1):
for j in range(1, m + 1):
score_diagonal = score[i-1][j-1] + match_score(seq1[j-1], seq2[i-1])
score_up = score[i][j-1] + gap_penalty
score_left = score[i-1][j] + gap_penalty
score[i][j] = max(score_left, score_up, score_diagonal)

if score[i][j] == score_diagonal :
pointer[i][j] = 3 # 3 means trace diagonal
elif score[i][j] == score_up :
pointer[i][j] = 2 # 2 means trace left
elif score[i][j] == score_left :
pointer[i][j] = 1 # 1 means trace up

#Finding the right-most match which represents a longest overlap
#Note that .index() will find the index of the first item in the list that matches,
#so if you had several identical "max" values, the index returned would be the one for the first.
max_i = -200
for ii in range(n+1) :
if score[ii][-1] >= max_i :
max_i = score[ii][-1]
i = ii
# print max_i

prei = i
prej = j
#Traceback, follow pointers in the traceback matrix
align1, align2 = '', '' # initial sequences
while 1 :
if pointer[i][j] == 3 :
align1 = seq1[j-1] + align1
align2 = seq2[i-1] + align2
i -= 1
j -= 1
elif pointer[i][j] == 2 : #2 means trace left
align2 = '-' + align2
align1 = seq1[j-1] + align1
j -= 1
elif pointer[i][j] == 1 : # 1 means trace up
align2 = seq2[i-1] + align2
align1 = '-' + align1
i -= 1
# print align1, align2, j, i
if (i == 0 or j == 0) : break

return (align1, align2, prej, j, prei, i, max_i)
# print align1 + '\n' + align2

#print seq11
#print seq22
#nw(seq11, seq22)
#nw(seq22,seq11)
Loading

0 comments on commit 3ac1dca

Please sign in to comment.