Merge pull request #76 from gmcvicker/master

Fixes and improvements to mapping pipeline
bmvdgeijn · Aug 31, 2018 · 5cfa67e · 5cfa67e
2 parents 556055e + 3f6a327
commit 5cfa67e
Show file tree

Hide file tree

Showing 17 changed files with 1,932 additions and 339 deletions.
diff --git a/.gitignore b/.gitignore
@@ -39,6 +39,7 @@ htmlcov/
 .cache
 nosetests.xml
 coverage.xml
+mapping/test_data
 
 # Translations
 *.mo

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,15 @@
+Version 0.3.1 - August 31, 2018
+-----------
+This release makes several improvements/fixes to read filtering by the mapping pipeline. Several of these improvements were suggested by Alex Dobin for using WASP with the STAR mapper.
+
+Changes include:
+* find_intersecting_snps.py was modified to handle dependence of alleles when both reads from a read pair overlap the same variant(s). Previously, the reads that were generated for remapping were treated independently; however, if read1 has the alternate allele for a given SNP, so should read2 if it overlaps the same SNP.
+* snp2h5 was altered to add a new phase carray to haplotype.h5 containing phase information from the VCF. find_intersecting_snps.py will now use the phase information from haplotype.h5 to calculate new haplotypes with all possible allelic combinations at unphased sites, resulting in more reads being generated for remapping. If phase information is not provided in haplotype.h5, all sites will be assumed unphased.
+* Supplementary and secondary alignments are now filtered by find_intersecting_snps.py and filter_remapped_reads.py.
+* Reads whose CIGAR flags change after the remapping step are now discarded, even if they map to the same start position.
+* snp2h5 will now try to add/remove 'chr' from the chromosome name read from the VCF file if the original name does not match any chromosomes in the chromInfo file.
+
+
 Version 0.3.0 - July 3, 2018
 -----------
 This release moves the codebase from Python2 to Python3 and includes several additional improvements.

diff --git a/CHT/README.bam2h5.md b/CHT/README.bam2h5.md
@@ -60,13 +60,6 @@ BAM files must be sorted and indexed.
     counts are randomly assigned to ONE of overlapping SNPs (regardless of 
     their genotype).
 
-* --samples SAMPLES_TXT_FILE [optional]
-
-    Path to text file containing a list of individual identifiers. The
-    ordering of individuals must be consistent with the haplotype
-    file. The samples file is assumed to have one identifier per line
-    in the first column (other columns are ignored).
-
 * --individual INDIVIDUAL [optional]
 
     Identifier for individual, used to determine which
@@ -125,6 +118,6 @@ BAM files must be sorted and indexed.
 	      --alt_as_counts alt_as_counts.$INDIVIDUAL.h5 \
 	      --other_as_counts other_as_counts.$INDIVIDUAL.h5 \
 	      --read_counts read_counts.$INDIVIDUAL.h5 \
-	      --text_counts counts.$INDIVIDUAL.txt.gz \
+	      --txt_counts counts.$INDIVIDUAL.txt.gz \
 	      H3K27ac/$INDIVIDUAL.chr*.keep.rmdup.bam
 
diff --git a/CHT/README.md b/CHT/README.md
@@ -151,7 +151,6 @@ before bam2h5.py can be run.
 	      --snp_index example_data/snp_index.h5 \
 	      --snp_tab example_data/snp_tab.h5 \
 	      --haplotype example_data/haps.h5 \
-	      --samples $ALL_SAMPLES_FILE \
 	      --individual $INDIVIDUAL \
 	      --ref_as_counts example_data/H3K27ac/ref_as_counts.$INDIVIDUAL.h5 \
 	      --alt_as_counts example_data/H3K27ac/alt_as_counts.$INDIVIDUAL.h5 \

diff --git a/CHT/bam2h5.py b/CHT/bam2h5.py
@@ -66,17 +66,11 @@
        counts are randomly assigned to ONE of overlapping SNPs (regardless of
        their genotype).
 
-     --samples SAMPLES_TXT_FILE [optional]
-       Path to text file containing a list of individual identifiers. The
-       ordering of individuals must be consistent with the haplotype
-       file. The samples file is assumed to have one identifier per line
-       in the first column (other columns are ignored).
-
      --individual INDIVIDUAL [optional]
        Identifier for individual, used to determine which
        SNPs are heterozygous. Must be provided
        if --haplotype argument is provided and must match one of the
-       individuals in the file provided with --samples argument.
+       samples in the haplotype HDF5 file.
 
 Output Options:
      --data_type uint8|uint16
@@ -122,6 +116,9 @@
 import chromstat
 import util
 
+sys.path.insert(0, os.path.dirname(os.path.realpath(__file__))+"/../mapping/")
+import snptable
+
 # codes used by pysam for aligned read CIGAR strings
 BAM_CMATCH     = 0 # M
 BAM_CINS       = 1 # I
@@ -148,6 +145,8 @@
 MAX_UINT8_COUNT = 255
 MAX_UINT16_COUNT = 65535
 
+unimplemented_CIGAR = [0, set()]
+
 
 
 def create_carray(h5f, chrom, data_type):
@@ -283,9 +282,11 @@ def choose_overlap_snp(read, snp_tab, snp_index_array, hap_tab, ind_idx):
             # in read and not used in alignment
             pass
         else:
-            sys.stderr.write("skipping because contains CIGAR code %s "
-                             " which is not currently implemented\n" %
-                             BAM_CIGAR_DICT[op])
+            unimplemented_CIGAR[0] += 1
+            unimplemented_CIGAR[1].add(BAM_CIGAR_DICT[op])
+            # sys.stderr.write("skipping because contains CIGAR code %s "
+            #                  " which is not currently implemented\n" %
+            #                  BAM_CIGAR_DICT[op])
 
     # are any of the SNPs indels? If so, discard.
     for i in snp_idx:
@@ -466,22 +467,11 @@ def parse_args():
                         metavar="HAPLOTYPE_H5_FILE",
                         default=None)
 
-    parser.add_argument("--samples",
-                        help="Path to text file containing a list of "
-                        "individual identifiers. The ordering of individuals "
-                        "must be consistent with the haplotype file. The "
-                        "samples file is assumed to have one identifier per "
-                        "line in the first column (other columns are "
-                        "ignored).",
-                        metavar="SAMPLES_TXT_FILE",
-                        default=None)
-
     parser.add_argument("--individual",
                         help="Identifier for individual, used to determine "
-                        "which SNPs are heterozygous. Must be provided "
-                        "if --haplotype argument is provided and must "
-                        "match one of the individuals in the file provided "
-                        "with --samples argument.",
+                        "which SNPs are heterozygous. Must be provided if "
+                        "--haplotype argument is provided and must match one "
+                        "of the samples in the haplotype HDF5 file.",
                         metavar="INDIVIDUAL",
                         default=None)
 
@@ -538,8 +528,8 @@ def parse_args():
 
     args = parser.parse_args()
 
-    if args.haplotype and (args.individual is None or args.samples is None):
-            parser.error("--indidivual and --samples arguments "
+    if args.haplotype and (args.individual is None):
+            parser.error("--indidivual argument "
                          "must also be provided when --haplotype argument "
                          "is provided")
 
@@ -549,53 +539,6 @@ def parse_args():
 
 
 
-def lookup_individual_index(samples_file, ind_name, population=None):
-    """Gets the index of individual that is used
-    to lookup information in the genotype and haplotype tables"""
-    f = open(samples_file, "rt")
-
-    if population:
-        p = population.lower()
-    else:
-        p = None
-
-    idx = 0
-    for line in f:
-        if line.startswith("samples"):
-            # header line
-            continue
-
-        words = line.rstrip().split()
-        name = words[0].replace("NA", "")
-
-        if len(words) > 1:
-            pop = words[1].lower()
-        else:
-            pop = ""
-
-        if len(words) > 2:
-            group = words[2].lower()
-        else:
-            group = ""
-
-        # if specified, only consider a single population or group
-        if p and pop != p and group != p:
-            continue
-
-        if name == ind_name:
-            f.close()
-            return idx
-
-        idx += 1
-
-
-    raise ValueError("individual %s (with population=%s) "
-                     "is not in samples file %s" %
-                     (ind_name, population, samples_file))
-
-
-
-
 def main():
     args = parse_args()
 
@@ -612,10 +555,8 @@ def main():
 
     if args.haplotype:
         hap_h5 = tables.open_file(args.haplotype, "r")
-        ind_idx = lookup_individual_index(args.samples, args.individual)
     else:
         hap_h5 = None
-        ind_idx = None
 
     ref_count_h5 = tables.open_file(args.ref_as_counts, "w")
     alt_count_h5 = tables.open_file(args.alt_as_counts, "w")
@@ -645,10 +586,12 @@ def main():
     else:
         raise NotImplementedError("unsupported datatype %s" % args.data_type)
 
-    # create a list to hold the counts that will be later written
-    # to a txt file
-    if args.text_counts is not None:
-        txt_counts = list()
+    # open a txt file (gzip-compressed if its name ends in .gz) to also hold the counts
+    if args.txt_counts is not None:
+        if os.path.splitext(args.txt_counts)[1] == ".gz":
+            txt_counts = gzip.open(args.txt_counts, 'a+')
+        else:
+            txt_counts = open(args.txt_counts, 'a+')
 
     for chrom in chrom_list:
         sys.stderr.write("%s\n" % chrom.name)
@@ -666,8 +609,18 @@ def main():
         snp_index_array = snp_index_h5.get_node("/%s" % chrom.name)[:]
         if hap_h5:
             hap_tab = hap_h5.get_node("/%s" % chrom.name)
+            ind_idx = snptable.SNPTable().get_h5_sample_indices(
+                hap_h5, chrom, [args.individual]
+            )[1]
+            if len(ind_idx) != 0:
+                ind_idx = ind_idx[0]
+            else:
+                hap_tab = None
+                ind_idx = None
         else:
             hap_tab = None
+            ind_idx = None
+
 
         # initialize count arrays for this chromosome to 0
         ref_carray = get_carray(ref_count_h5, chrom)
@@ -709,7 +662,7 @@ def main():
             # file later
             # columns are:
             # chrom, pos, ref, alt, genotype, ref_count, alt_count, other_count
-            if args.text_counts is not None:
+            if args.txt_counts is not None:
                 chrom = np.tile(chrom.name, len(snp_tab))
                 pos = np.array([snp['pos'] for snp in snp_tab])
                 ref = np.array([snp['allele1'] for snp in snp_tab])
@@ -719,22 +672,26 @@ def main():
                                          for hap in hap_tab])
                 else:
                     genotype = np.empty((len(snp_tab), 0))
-                txt_counts.append(
+                # write an np array to a txt file
+                np.savetxt(
+                    txt_counts,
                     np.column_stack((chrom, pos, ref, alt, genotype,
-                                     ref_array[pos-1],
-                                     alt_array[pos-1],
-                                     other_array[pos-1]))
+                                    ref_array[pos-1],
+                                    alt_array[pos-1],
+                                    other_array[pos-1])),
+                    fmt="%1s",
+                    delimiter=" "
                 )
 
 
             samfile.close()
 
-    # write the txt_counts np arrays to a txt file
-    if args.text_counts is not None:
-        # we use vstack to combine np arrays row-wise into a multi-dimensional
-        # array
-        np.savetxt(args.text_counts, np.vstack(tuple(txt_counts)),
-                   fmt="%1s", delimiter=" ")
+    if args.txt_counts:
+        # close the open txt file handle
+        txt_counts.close()
+
+    # check if any of the reads contained an unimplemented CIGAR
+    sys.stderr.write("WARNING: Encountered "+str(unimplemented_CIGAR[0])+" instances of any of the following CIGAR codes: "+str(unimplemented_CIGAR[1])+". The regions of reads with these CIGAR codes were skipped because these CIGAR codes are currently unimplemented.\n")
 
     # set track statistics and close HDF5 files
 

diff --git a/examples/example_cht_workflow.sh b/examples/example_cht_workflow.sh
@@ -51,7 +51,6 @@ do
 	      --snp_index $DATA_DIR/snp_index.h5 \
 	      --snp_tab $DATA_DIR/snp_tab.h5 \
 	      --haplotype $DATA_DIR/haps.h5 \
-	      --samples $ALL_SAMPLES_FILE \
 	      --individual $INDIVIDUAL \
 	      --ref_as_counts $DATA_DIR/H3K27ac/ref_as_counts.$INDIVIDUAL.h5 \
 	      --alt_as_counts $DATA_DIR/H3K27ac/alt_as_counts.$INDIVIDUAL.h5 \

diff --git a/mapping/Snakefile b/mapping/Snakefile
@@ -216,20 +216,29 @@ rule rmdup_pe:
 rule get_as_counts:
     """get allele-specific read counts for SNPs"""
     input:
-         bam=config['output_dir'] + "/rmdup/{sample}.keep.merge.rmdup.sort.bam",
-         snp_index=config["snp_h5_dir"] + "/snp_index.h5",
-         snp_tab=config["snp_h5_dir"] + "/snp_tab.h5",
-         haplotype=config['snp_h5_dir'] + "/haplotype.h5",
+        bam=config['output_dir'] + "/rmdup/{sample}.keep.merge.rmdup.sort.bam",
+        snp_index=config["snp_h5_dir"] + "/snp_index.h5",
+        snp_tab=config["snp_h5_dir"] + "/snp_tab.h5",
+        haplotype=config['snp_h5_dir'] + "/haplotype.h5",
+        chrom=config['chrom_info']
     params:
-         samp1kg=lambda wildcards: SAMP_TO_1KG[wildcards.sample]
+        samp1kg=lambda wildcards: SAMP_TO_1KG[wildcards.sample]
     output:
-        config['output_dir'] + "/as_counts/{sample}.as_counts.txt.gz"
+        ref_as=config['output_dir'] + "/as_counts/{sample}.ref_as_counts.h5",
+        alt_as=config['output_dir'] + "/as_counts/{sample}.alt_as_counts.h5",
+        other_as=config['output_dir'] + "/as_counts/{sample}.other_as_counts.h5",
+        read_counts=config['output_dir'] + "/as_counts/{sample}.read_counts.h5",
+        txt_counts=config['output_dir'] + "/as_counts/{sample}.as_counts.txt.gz"
     shell:
-        "python {config[wasp_dir]}/mapping/get_as_counts.py "
-        "  --snp_tab {input.snp_tab} "
+        "python {config[wasp_dir]}/CHT/bam2h5.py "
+        "  --chrom {input.chrom} "
         "  --snp_index {input.snp_index} "
+        "  --snp_tab {input.snp_tab} "
         "  --haplotype {input.haplotype} "
-        "  --samples {config[sample_file]} "
-        "  --genotype_sample {params.samp1kg} "
-        "  {input.bam} | gzip > {output}"
-
+        "  --individual {params.samp1kg} "
+        "  --ref_as_counts {output.ref_as} "
+        "  --alt_as_counts {output.alt_as} "
+        "  --other_as_counts {output.other_as} "
+        "  --read_counts {output.read_counts} "
+        "  --txt_counts {output.txt_counts} "
+        "{input.bam}"