-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Checking in Profile production code for 0.0.6
- Loading branch information
mducar
authored and
mducar
committed
Jan 14, 2016
1 parent
085c2bf
commit 3ac1dca
Showing
8 changed files
with
4,444 additions
and
58 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,72 +1,97 @@ | ||
#! /usr/bin/local/python | ||
# -*- coding: utf-8 -*- | ||
|
||
import argparse | ||
import breakmer.params as params | ||
import breakmer.processor.analysis as breakmer_analysis | ||
# Fix get_svs_result() | ||
# Fix filter_by_feature | ||
|
||
__author__ = "Ryan Abo" | ||
__copyright__ = "Copyright 2015, Ryan Abo" | ||
__email__ = "[email protected]" | ||
__license__ = "MIT" | ||
import sys | ||
import os | ||
from utils import * | ||
from sv_processor import * | ||
import logging | ||
from optparse import OptionParser | ||
|
||
''' | ||
Author: Ryan Abo | ||
12/23/2013 | ||
Main script that initiates the BreaKmer analysis or auxiliary functions to setup BreaKmer for analysis. | ||
Requires a configuration file containing key=value style formatting. | ||
There are three functions provided: | ||
1. run = perform analysis to detect structural variation. | ||
2. start_blat_server = start the blat server in the background for analysis. | ||
3. prepare_reference_data = prepare the reference data for the target regions that are specified in the input files. | ||
The blat server provides a challenge in workflow. The best method is to: | ||
1. prepare reference data using 'prepare_reference_data' function | ||
2. start the blat server using 'start_blat_server' function | ||
breakmer.py start_blat_server -p <port_number> --hostname <hostname> -c <config file> | ||
3. run the analysis and keep the blat server alive in the background for use in other analyses. | ||
breakmer.py run -k -p <port_number> --hostname <hostname> -c <config file> -n <nprocessors> -g <gene_list> | ||
This is the main script that initiates the analysis. | ||
Submodules: | ||
utils.py - utility functions. | ||
classes: | ||
- fq_read | ||
- FastqFile | ||
- anno | ||
- params | ||
sv_processor.py - classes to manage analysis, process, analyze, write. | ||
classes: | ||
- runner - analysis-wide information and analysis (loop through each) | ||
- target - target-level information and analysis | ||
- contig - contig-level information and analysis | ||
sv_assembly.py - classes to take sample kmers and produce contigs. | ||
classes: | ||
- kmers | ||
- kmer | ||
- buffer | ||
- contig | ||
sv_caller.py - call SVs from blat result. | ||
classes: | ||
- align_manager | ||
- blat_manager | ||
- blat_res | ||
- sv_event | ||
''' | ||
|
||
PARSER = argparse.ArgumentParser(description='Program to identify structural variants within targeted locations.', usage='%(prog)s [options]', add_help=True) | ||
SUBPARSERS = PARSER.add_subparsers(help='Program mode (run, start_blat_server, prepare_reference_data).', dest='fncCmd') | ||
|
||
# Setup three separate parsers for the three different functions. | ||
RUN_PARSER = SUBPARSERS.add_parser('run', help='Run analysis to detect structural variants.') | ||
SERVER_PARSER = SUBPARSERS.add_parser('start_blat_server', help='Start the blat server prior to performing the analysis.') | ||
REF_PARSER = SUBPARSERS.add_parser('prepare_reference_data', help='Prepare the reference sequence data for target regions prior to analysis.') | ||
#----------------------------------------------------------- | ||
# TODO: Set required options, exit if these are not set. | ||
#----------------------------------------------------------- | ||
def parse_config_f(config_fn, opts) : | ||
param_opts = {} | ||
config_f = open(config_fn, 'rU') | ||
flines = config_f.readlines() | ||
for line in flines : | ||
line = line.strip() | ||
linesplit = line.split("=") | ||
if len(linesplit) == 1 : | ||
print 'Config line', line, ' not set correctly. Exiting.' | ||
sys.exit() | ||
else : | ||
k,v = linesplit | ||
param_opts[k] = v | ||
|
||
# Run parser | ||
RUN_PARSER.add_argument('--log_level', dest='log_level', default='DEBUG', help='Log level [default: DEBUG]') | ||
RUN_PARSER.add_argument('--indel_size', dest='indel_size', default=15, type=int, help='Indel size filter. [default: %(default)s]') | ||
RUN_PARSER.add_argument('--trl_sr_thresh', dest='trl_sr_thresh', default=2, type=int, help='Split read support threshold for translocations. [default: %(default)s]') | ||
RUN_PARSER.add_argument('--indel_sr_thresh', dest='indel_sr_thresh', default=5, type=int, help='Split read support threshold for indels. [default: %(default)s]') | ||
RUN_PARSER.add_argument('--rearr_sr_thresh', dest='rearr_sr_thresh', default=2, type=int, help='Split read support threshold for rearrangements. [default: %(default)s]') | ||
RUN_PARSER.add_argument('--rearr_min_seg_len', dest='rearr_minseg_len', default=30, type=int, help='Threshold for minimum segment to be rearranged. [default: %(default)s]') | ||
RUN_PARSER.add_argument('--trl_min_seg_len', dest='trl_minseg_len', default=25, type=int, help='Threshold for minimum length of translocation segment. [default: %(default)s]') | ||
RUN_PARSER.add_argument('--align_thresh', dest='align_thresh', default=.90, type=int, help='Threshold for minimum read alignment for assembly. [default: %(default)s]') | ||
RUN_PARSER.add_argument('--no_output_header', dest='no_output_header', default=False, action='store_true', help='Suppress output headers. [default: %(default)s]') | ||
RUN_PARSER.add_argument('--discread_only_thresh', dest='discread_only_thresh', default=2, type=int, help='The number of discordant read pairs in a cluster to output without evidence from a split read event. [default: %(default)s]') | ||
RUN_PARSER.add_argument('--generate_image', dest='generate_image', default=False, action='store_true', help='Generate pileup image for events. [default: %(default)s]') | ||
RUN_PARSER.add_argument('--hostname', dest='blat_hostname', default='localhost', help='The hostname for the blat server. Localhost will be used if not specified. [default: %(default)s]') | ||
RUN_PARSER.add_argument('-g', '--gene_list', dest='gene_list', default=None, help='Gene list to consider for analysis. [default: %(default)s]') | ||
RUN_PARSER.add_argument('-f', '--filter_list', dest='filterList', default=None, help='Input a set of events to filter out. [default: %(default)s]') | ||
RUN_PARSER.add_argument('-n', '--nprocessors', dest='nprocs', default=1, type=int, help='The number of processors to use for analysis. [default: %(default)s]') | ||
RUN_PARSER.add_argument('-s', '--start_blat_server', dest='start_blat_server', default=False, action='store_true', help='Start the blat server. Random port number and localhost will be used if neither specified. [default: %(default)s]') | ||
RUN_PARSER.add_argument('-k', '--keep_blat_server', dest='keep_blat_server', default=False, action='store_true', help='Keep the blat server alive. [default: %(default)s]') | ||
RUN_PARSER.add_argument('-p', '--port_number', dest='blat_port', default=None, type=int, help='The port number for the blat server. A random port number (8000-9500) will be used if not specified. [default: %(default)s]') | ||
RUN_PARSER.add_argument('-c', '--config', dest='config_fn', default=None, required=True, help='The configuration filename that contains additional parameters. [default: %(default)s]') | ||
for opt in vars(opts) : | ||
param_opts[opt] = vars(opts)[opt] | ||
return param_opts | ||
#----------------------------------------------------------- | ||
|
||
# Server parser | ||
SERVER_PARSER.add_argument('-p', '--port_number', dest='blat_port', default=None, type=int, help='The port number for the blat server. A random port number (8000-9500) will be used if not specified. [default: %(default)s]') | ||
SERVER_PARSER.add_argument('--hostname', dest='blat_hostname', default='localhost', help='The hostname for the blat server. Localhost will be used if not specified. [default: %(default)s]') | ||
SERVER_PARSER.add_argument('-c', '--config', dest='config_fn', default=None, required=True, help='The configuration filename that contains additional parameters. [default: %(default)s]') | ||
#```````````````````````````````````````````````````````````` | ||
usage = '%prog [options] <config file name>' | ||
desc = """Script to identify structural variants within targeted locations.""" | ||
parser = OptionParser(usage=usage,description=desc) | ||
parser.add_option('-l', '--log_level', dest='log_level', default='DEBUG', type='string', help='Log level [default: %default]') | ||
parser.add_option('-a', '--keep_repeat_regions', dest='keep_repeat_regions', default=False, action='store_true', help='Keep indels in repeat regions. Requires a repeat mask bed file. [default: %default]') | ||
parser.add_option('-p', '--preset_ref_data', dest='preset_ref_data', default=False, action="store_true", help='Preset all the reference data for all the targets before running analysis. [default: %default]') | ||
parser.add_option('-s', '--indel_size', dest='indel_size', default=15, type='int', help='Indel size filter [default: %default]') | ||
parser.add_option('-c', '--trl_sr_thresh', dest='trl_sr_thresh', default=2, type='int', help='Split read support threshold for translocations [default: %default]') | ||
parser.add_option('-d', '--indel_sr_thresh', dest='indel_sr_thresh', default=5, type='int', help='Split read support threshold for indels [default: %default]') | ||
parser.add_option('-r', '--rearr_sr_thresh', dest='rearr_sr_thresh', default=3, type='int', help='Split read support threshold for rearrangements [default: %default]') | ||
parser.add_option('-g', '--gene_list', dest='gene_list', default=None, type='string', help='Gene list to consider for analysis [default: %default]') | ||
parser.add_option('-k', '--keep_intron_vars', dest='keep_intron_vars', default=False, action='store_true', help='Keep intronic indels or rearrangements [default: %default]') | ||
parser.add_option('-v', '--var_filter', dest='var_filter', default='all', type='string', help='Variant types to report (all, indel, trl, rearrangment) [default: %default]') | ||
parser.add_option('-m', '--rearr_min_seg_len', dest='rearr_minseg_len', default=30, type='int', help='Threshold for minimum segment to be rearranged [default: %default]') | ||
parser.add_option('-n', '--trl_min_seg_len', dest='trl_minseg_len', default=25, type='int', help='Threshold for minimum length of translocation segment [default: %default]') | ||
parser.add_option('-t', '--align_thresh', dest='align_thresh', default=.90, type='int', help='Threshold for minimum read alignment for assembly [default: %default]') | ||
parser.add_option('-z', '--no_output_header', dest='no_output_header', default=False, action='store_true', help='Suppress output headers [default: %default]') | ||
|
||
# Setup reference parser | ||
REF_PARSER.add_argument('-g', '--gene_list', dest='gene_list', default=None, help='Gene list to consider for analysis. [default: %(default)s]') | ||
REF_PARSER.add_argument('-c', '--config', dest='config_fn', default=None, required=True, help='The configuration filename that contains additional parameters. [default: %(default)s]') | ||
REF_PARSER.add_argument('-n', '--nprocessors', dest='nprocs', default=1, type=int, help='The number of processors to use for analysis. [default: %(default)s]') | ||
if __name__ == '__main__' : | ||
opts, args = parser.parse_args(sys.argv[1:]) | ||
config_fn = args[0] | ||
|
||
# Start analysis | ||
RUN_TRACKER = breakmer_analysis.RunTracker(params.ParamManager(PARSER.parse_args())) | ||
RUN_TRACKER.run() | ||
tic = time.clock() | ||
config_d = parse_config_f(config_fn,opts) | ||
setup_logger(config_d,'root') | ||
r = runner(config_d) | ||
r.run(tic) | ||
#```````````````````````````````````````````````````````````` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
analysis_name=<sample_id> | ||
targets_bed_file=/ifs/rcgroups/ccgd/rpa4/analysis/sv/130808/kmer_region_branch/data/panels/<panel_bed> | ||
sample_bam_file=/ifs/rcgroups/ccgd/rpa4/analysis/sv/130808/kmer_region_branch/analysis/<project_id>/<sample_id>/data/sample.bam | ||
analysis_dir=/ifs/rcgroups/ccgd/rpa4/analysis/sv/130808/kmer_region_branch/analysis/<project_id>/<sample_id> | ||
reference_data_dir=/ifs/rcgroups/ccgd/rpa4/analysis/sv/130808/kmer_region_branch/data/ref | ||
other_regions_file=/ifs/rcgroups/ccgd/rpa4/analysis/sv/130808/kmer_region_branch/data/cluster_regions.bed | ||
cutadapt=/ifs/rcgroups/ccgd/rpa4/software/cutadapt-1.2.1/bin/cutadapt | ||
cutadapt_config_file=/ifs/rcgroups/ccgd/rpa4/scripts/sv/kmer_sv/cutadapt.conf | ||
jellyfish=/ifs/rcgroups/ccgd/rpa4/software/jellyfish-1.1.11/bin/jellyfish | ||
assembler=/ifs/rcgroups/ccgd/rpa4/software/newbler/install2.6/bin/runAssembly | ||
blat=/ifs/rcgroups/ccgd/rpa4//software/blat_bin/blat | ||
blast=/ifs/rcgroups/ccgd/rpa4//software/ncbi-blast-2.2.28+/bin/blastn | ||
reference_fasta=/ifs/rcgroups/ccgd/rpa4/analysis/sv/130808/kmer_region_branch/data/ref/ucsc_hg19_dna.primary-assembly.fa | ||
gene_annotation_file=/ifs/rcgroups/ccgd/rpa4/ref2/human/ucsc/current/annotation/ucsc_hg19_refgene.txt | ||
repeat_mask_file=/ifs/rcgroups/ccgd/rpa4/ref2/human/ucsc/current/annotation/ucsc_hg19_rmsk.bed | ||
kmer_size=15 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
#!/usr/bin/env python | ||
|
||
#from Bio import SeqIO | ||
|
||
#records = list(SeqIO.parse("rosalind_oap.txt", "fasta")) | ||
|
||
#Two sequences you want to align | ||
#seq11 = records[0].seq | ||
#seq22 = records[1].seq | ||
|
||
#seq22 = "AGGAACAACATTTTCAGCTACAGCACACATACAAATTNAAAAAAAAAAAAAACATAGGAATTTCCCAGCAAGACCATAATGAATAAG" | ||
#seq11 = "TTTTTCCTGAGAATTTATCTTGTAGGTCAAAACAGATCTAGGAACATCATTTTCAGCTACAGCACACATACAAATTAAAAAAAAAAAAAAACATAGGAATT" | ||
#seq11='TCCTGAGAATTTATCTTGTAGGTCAAAACAGATCTAGGAACATCATTTTCAGCTACAGCACACATACAAATTAAAAAAAAAAAAAACATAGGAATTTCCCA' | ||
#seq22='CTAGGAACATCATTTTCAGCTAAAGCACACATACAAATTAAAAAAAAAAAAAACATAGGAATTTCCCAGCAAGACCATAATGAATAAGAAACAATC' | ||
#seq11='TCCTGAGAATTTATCTTGTAGGTCAAAACAGATCTAGGAACATCATTTTCAGCTACAGCACACATACAAATTAAAAAAAAAAAAAAACATAGGAA' | ||
#seq22='CCCGTATTTAAAGGACACACAATTTAGAGGACCAACTGA' | ||
|
||
match_award = 1 | ||
mismatch_penalty = -2 | ||
gap_penalty = -2 | ||
|
||
#Creates empty matrix with all zeros | ||
def zeros(shape): | ||
retval = [] | ||
for x in range(shape[0]): | ||
retval.append([]) | ||
for y in range(shape[1]): | ||
retval[-1].append(0) | ||
return retval | ||
|
||
#No substituition matrix, just simple linear gap penalty model | ||
def match_score(alpha, beta): | ||
if alpha == beta: | ||
return match_award | ||
elif alpha == '-' or beta == '-': | ||
return gap_penalty | ||
else: | ||
return mismatch_penalty | ||
|
||
def nw(seq1, seq2): | ||
global max_score | ||
|
||
m = len(seq1) | ||
n = len(seq2) # lengths of two sequences | ||
|
||
# Generate DP (Dynamic Programmed) table and traceback path pointer matrix | ||
score = zeros((n+1, m+1)) # the DP table | ||
for i in range(n+1) : | ||
score[i][0] = 0 | ||
|
||
for j in range(m+1) : | ||
score[0][j] = 0 | ||
|
||
#Traceback matrix | ||
pointer = zeros((n+1, m+1)) # to store the traceback path | ||
for i in range(n+1) : | ||
pointer[i][0] = 1 | ||
for j in range(m+1) : | ||
pointer[0][j] = 2 | ||
|
||
# Calculate DP table and mark pointers | ||
for i in range(1, n + 1): | ||
for j in range(1, m + 1): | ||
score_diagonal = score[i-1][j-1] + match_score(seq1[j-1], seq2[i-1]) | ||
score_up = score[i][j-1] + gap_penalty | ||
score_left = score[i-1][j] + gap_penalty | ||
score[i][j] = max(score_left, score_up, score_diagonal) | ||
|
||
if score[i][j] == score_diagonal : | ||
pointer[i][j] = 3 # 3 means trace diagonal | ||
elif score[i][j] == score_up : | ||
pointer[i][j] = 2 # 2 means trace left | ||
elif score[i][j] == score_left : | ||
pointer[i][j] = 1 # 1 means trace up | ||
|
||
#Finding the right-most match which represents a longest overlap | ||
#Note that .index() will find the index of the first item in the list that matches, | ||
#so if you had several identical "max" values, the index returned would be the one for the first. | ||
max_i = -200 | ||
for ii in range(n+1) : | ||
if score[ii][-1] >= max_i : | ||
max_i = score[ii][-1] | ||
i = ii | ||
# print max_i | ||
|
||
prei = i | ||
prej = j | ||
#Traceback, follow pointers in the traceback matrix | ||
align1, align2 = '', '' # initial sequences | ||
while 1 : | ||
if pointer[i][j] == 3 : | ||
align1 = seq1[j-1] + align1 | ||
align2 = seq2[i-1] + align2 | ||
i -= 1 | ||
j -= 1 | ||
elif pointer[i][j] == 2 : #2 means trace left | ||
align2 = '-' + align2 | ||
align1 = seq1[j-1] + align1 | ||
j -= 1 | ||
elif pointer[i][j] == 1 : # 1 means trace up | ||
align2 = seq2[i-1] + align2 | ||
align1 = '-' + align1 | ||
i -= 1 | ||
# print align1, align2, j, i | ||
if (i == 0 or j == 0) : break | ||
|
||
return (align1, align2, prej, j, prei, i, max_i) | ||
# print align1 + '\n' + align2 | ||
|
||
#print seq11 | ||
#print seq22 | ||
#nw(seq11, seq22) | ||
#nw(seq22,seq11) |
Oops, something went wrong.