diff --git a/Deduper - Part 1.ipynb b/Deduper - Part 1.ipynb new file mode 100644 index 0000000..20648e5 --- /dev/null +++ b/Deduper - Part 1.ipynb @@ -0,0 +1,262 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Deduper \n", + "--- \n", + "## Part 1 \n", + "--- \n", + "### Define the Problem \n", + "--- \n", + "- _PCR duplicates arise during PCR amplification of original fragmented, adapter-ligated DNA. They become problematic when 2 copies of the same molecule get onto different primer lawns on an Illumina flowcell.
PCR-duplicate removal (or tagging) is necessary to mitigate PCR bias, which is introduced when certain molecules ligate adapters more favorably and are disproportionately amplified. The greater number of these preferentially amplified molecules makes them more likely to contaminate neighboring lawns on the flowcell. Failure to remove these duplicates could limit your ability to draw conclusions from your data, as biological information may be confounded by PCR bias._ \n", + "\n", + "### Write Examples \n", + "--- \n", + " - Input SAM file: \n", + " - **sample\\_input.sam** \n", + " - 6 conditions for forward read:\n", + " - Initial UMI (non-duplicate)\n", + " - Same UMI, different chromosome\n", + " - Same UMI, different POS \n", + " - Same UMI, different strand\n", + " - Same UMI, DUPLICATE w/ soft-clipping\n", + " - Same UMI, DUPLICATE w/o soft-clipping\n", + " - 6 conditions for reverse read:\n", + " - Same as above\n", + " - Output SAM file: \n", + " - **sample\\_output.sam** \n", + " - 8 reads: 4 from forward, 4 from reverse.\n", + "\n", + "### Develop Algorithm w/ Pseudocode \n", + "--- " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ + "#################\n", + "## PSEUDO-CODE ##\n", + "#################\n", + "# To run SAMtools on the command line from a Python script...\n", + "from subprocess import call \n", + "# Import regular expressions.\n", + "import re\n", + "\n", + "# Run SAMtools sort on the input file(s).\n", + "call(\"samtools sort -l 0 -m maxMem -o -T \", shell=True)\n", + "\n", + "# Open the file produced by 'samtools sort' (above) as fin, open the output file for THIS program as fout,\n", + "# and open the file of known barcodes as index.\n", + "with open(\"out.sam\") as fin, open(\"dedup_output.sam\",'a') as fout, open(\"STL96.txt\") as index:\n", + " # Create an empty set to hold real indexes.\n", + " valid_IDs = set()\n", + " # Add each index to valid_IDs.\n", + " for i in index:\n", + " i = i.strip()\n", + " valid_IDs.add(i)\n", + " # Create a counter for PCR-duplicates.\n", + " PCR_dup = 0\n", + " # Create a dictionary to keep track of UMI information.\n", + " UMI_dict = {}\n", + " # Keep track of total reads.\n", + " tot_rds = 0\n", + " # Iterate through each line of the input SAM file.\n", + " for line in fin:\n", + " # If the line is a header line...\n", + " if re.match(r'^@[A-Z]{2}\\s', line):\n", + " # Write the header line to fout (it already ends with a newline).\n", + " fout.write(line)\n", + " # Continue iterating thru 'fin'.\n", + " continue\n", + " else:\n", + " # Increment the number of total reads.\n", + " tot_rds += 1\n", + " # Get the relevant information to check for PCR duplicates.\n", + " parse_read = parseSAM(line)\n", + " # Store info in separate variables.\n", + " this_UMI = parse_read[0]\n", + " this_chr = parse_read[1]\n", + " this_pos = parse_read[2]\n", + " this_CIGAR = parse_read[3]\n", + " this_strand = parse_read[4]\n", + " # If current UMI is in valid_IDs...\n", + " if this_UMI in valid_IDs:\n", + " # If current read HASN'T undergone soft-clipping...\n", + " if int(this_pos) == truePOS(this_pos,this_CIGAR):\n", + " # And current read is not in the dictionary.\n", + " if this_UMI not in UMI_dict:\n", + " # Add it to the dictionary.\n", + " UMI_dict[this_UMI] = (this_chr,this_pos,this_strand)\n", + " # Write the line to fout.\n", + " fout.write(line)\n", + " # If the UMI is already in the dictionary, check for PCR duplicate...\n", + " else:\n", + " if this_chr == UMI_dict[this_UMI][0] and this_pos == UMI_dict[this_UMI][1] and this_strand == UMI_dict[this_UMI][2]:\n", + " # Same UMI, same chromosome, same position, and same strand... This is a PCR duplicate.\n", + " # Don't write the line to file. Continue through file.\n", + " # Increment PCR-duplicate counter.\n", + " PCR_dup += 1\n", + " continue\n", + " # If the UMI is in the dictionary, but not a duplicate...\n", + " else:\n", + " # Write the read to fout.\n", + " fout.write(line)\n", + " # The current read HAS undergone soft-clipping...\n", + " else:\n", + " # Correct the position (keep it a STR so it compares with the stored POS).\n", + " this_pos = str(truePOS(this_pos,this_CIGAR))\n", + " # If the UMI isn't in the dictionary yet, add it with the corrected position.\n", + " if this_UMI not in UMI_dict:\n", + " UMI_dict[this_UMI] = (this_chr,this_pos,this_strand)\n", + " fout.write(line)\n", + " # Otherwise, check for a PCR-duplicate.\n", + " elif this_chr == UMI_dict[this_UMI][0] and this_pos == UMI_dict[this_UMI][1] and this_strand == UMI_dict[this_UMI][2]:\n", + " # Same UMI, same chromosome, same position, and same strand... This is a PCR duplicate.\n", + " # Don't write the line to file. Continue through file.\n", + " # Increment PCR-duplicate counter.\n", + " PCR_dup += 1\n", + " continue\n", + " # The read is not a PCR duplicate, write to fout.\n", + " else:\n", + " fout.write(line)\n", + " # If the UMI isn't valid, DO NOT write the current read to the output file. \n", + " else:\n", + " # Error correction on UMIs would occur in this block.\n", + " continue\n", + " \n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ + "### Determine High-level functions \n", + "--- " ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ + "import re\n", + "\n", + "def truePOS(POS,CIGAR):\n", + " \"\"\"\n", + " description: truePOS checks if soft-clipping has occurred at the left-most mapped position on a read.\n", + " If soft-clipping has occurred, truePOS performs a calculation to determine the INT value for the original \n", + " left-most-mapped position and returns this corrected pos.
If soft-clipping has not occurred, truePOS returns\n", + " the original INT value of POS.\n", + " INPUT: POS = A STR representing the left-most-mapped position of the read.\n", + " CIGAR = A STR representing the CIGAR-string for the read.\n", + " OUTPUT: pos = A SC-corrected value for POS in reads where soft-clipping has occurred.\n", + " POS = The original integer value for POS\n", + " \"\"\"\n", + " # Make POS an INT for calculations.\n", + " POS = int(POS)\n", + " # Search for left-most soft-clipping.\n", + " sc = re.search(\"([0-9]{1,2})S[0-9]{1,2}\", CIGAR)\n", + " # If it's found...\n", + " if sc:\n", + " # Set the difference to the INT value before the first S, for calculations.\n", + " diff = int(sc.group(1))\n", + " # Create the new corrected position, to check for PCR duplicates.\n", + " pos = POS - diff\n", + " # Return the corrected position.\n", + " return pos\n", + " # If there's no soft-clipping...\n", + " else:\n", + " # Return the original position\n", + " return POS\n", + " \n", + "def parseSAM(lineSAM):\n", + " \"\"\"\n", + " description: parseSAM takes 1 line from a SAM file and returns the UMI, chromosome #, POS, CIGAR, and the strand.\n", + " INPUT: A SAM file read-line\n", + " OUTPUT: UMI,CHROM,POS,CIGAR,STRAND\n", + " \"\"\"\n", + " umi = re.search(\".*:([A,C,G,T]{8})\\t\", lineSAM)\n", + " barcode = umi.group(1)\n", + " line = lineSAM.split(\"\\t\")\n", + " chromo = line[2]\n", + " POS = line[3]\n", + " CIGAR = line[5]\n", + " flag = int(line[1])\n", + " if((flag & 16) != 16):\n", + " strand = \"+\"\n", + " else:\n", + " strand = \"-\"\n", + " return barcode,chromo,POS,CIGAR,strand" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "# Test truePOS()\n", + "assert(truePOS(\"500\",\"10S90M1S\")) == 490\n", + "assert(truePOS(\"500\",\"90M11S\")) == 500\n", + "assert(truePOS(\"500\",\"1S98M2S\")) == 499\n", + "assert(truePOS(\"1\",\"101M\")) == 1" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "# Test parseSAM()\n", + "import re\n", + "# Use a known test file on local computer to test parseSAM().\n", + "with open(\"testset3.sam\") as f:\n", + " header = \"\"\n", + " LN = 0\n", + " for line in f:\n", + " if re.match(r'^@[A-Z]{2}\\s', line):\n", + " continue\n", + " else:\n", + " LN += 1\n", + " if LN < 9:\n", + " stuff = parseSAM(line)\n", + " assert(len(stuff[0])) == 8\n", + " assert(stuff[1]) == \"1\"\n", + " assert(stuff[3]) == \"71M\"\n", + " assert(stuff[4]) == \"+\"" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Hills_deduper.py b/Hills_deduper.py new file mode 100644 index 0000000..c69a081 --- /dev/null +++ b/Hills_deduper.py @@ -0,0 +1,311 @@ +#!/usr/bin/env python3 +# Python3 must be loaded prior to implementing this program. 
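+# Overview: the script samtools-sorts the input SAM file, then keeps only the first read seen for each +# combination of UMI, chromosome, strand, and clip-corrected position (for a forward-strand read, a leading +# '10S' soft clip shifts a reported POS of 500 back to 490); later reads with the same combination are +# counted as PCR duplicates and dropped. A *_deduped.sam file and a *_STATS.txt summary are written next to +# the input, and the temporary sorted file is removed at the end.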
+####################### +## IMPORT STATEMENTS ## +####################### +import argparse +from argparse import RawTextHelpFormatter +import re +from subprocess import call +from subprocess import check_output +########################## +## FUNCTION DEFINITIONS ## +########################## +def parseSAM(lineSAM): + """ + description: parseSAM takes 1 line from a SAM file and returns + the UMI, chromosome #, POS, CIGAR, and the strand. + INPUT: A SAM file read-line + OUTPUT: UMI,CHROM,POS,CIGAR,STRAND + """ + line = lineSAM.split("\t") + header = line[0] + umi = re.findall(".*:([ACGTN]{8}$)", header) + barcode = umi[0] + chromo = line[2] + POS = line[3] + CIGAR = line[5] + flag = int(line[1]) + if((flag & 16) != 16): + strand = "+" + else: + strand = "-" + return barcode,chromo,POS,CIGAR,strand +
+def plusPOS(POS,CIGAR): + """ + description: plusPOS checks if soft-clipping has occurred at the left-most mapped position on a "+" read. + If soft-clipping has occurred, plusPOS performs a calculation to determine the original + left-most-mapped position and returns this corrected pos (as a STR). If soft-clipping has not occurred, plusPOS returns + the original value of POS (as a STR). + INPUT: POS = A STR representing the left-most-mapped position of the read. + CIGAR = A STR representing the CIGAR-string for the read. + OUTPUT: pos = STRING: A soft-clip-corrected value for POS in reads where soft-clipping has occurred. + POS = STRING: The original value for POS. + """ + # Make POS an INT for calculations. + POS = int(POS) + # Search for left-most soft-clipping at the start of the CIGAR string. + sc = re.search("^([0-9]+)S", CIGAR) + # If it's found... + if sc: + # Set the difference to the INT value before the first S, for calculations. + diff = int(sc.group(1)) + # Create the new corrected position, to check for PCR duplicates. + pos = POS - diff + # Return the corrected position. + return str(pos) + # If there's no soft-clipping... + else: + # Return the original position + return str(POS) +
+def negPOS(POS,CIGAR): + """ + description: negPOS finds the right-most reference position of a "-" strand read. + It adds the lengths of right-most soft-clipping (S), deletions (D), skipped regions (N), and mapped + regions (M) in the CIGAR string to the original POS. + INPUT: POS = A STR representing the left-most-mapped position of the read. + CIGAR = A STR representing the CIGAR-string for the read. + OUTPUT: pos = A STR: A right-most value for POS in "-" strand reads. + """ + # Make POS an INT for calculations. + pos = int(POS) + # Search for right-most soft-clipping. + sc = re.search("([0-9]+)S$", CIGAR) + # Search for mapped regions. + mapped = re.findall("([0-9]+)M", CIGAR) + # Search for deletions. + dels = re.findall("([0-9]+)D", CIGAR) + # Search for skipped regions. + Ns = re.findall("([0-9]+)N", CIGAR) + # If right-most soft-clipping is found, add the clipped length to the position. + if sc: + pos += int(sc.group(1)) + # Deletions, skipped regions, and mapped regions all consume the reference, so add each of them as well. + for n in dels: + pos += int(n) + for n in Ns: + pos += int(n) + for n in mapped: + pos += int(n) + return str(pos) +##################### +## ARGUMENT PARSER ## +##################### +# Create the argument parser.
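+# Example invocation (the file paths below are placeholders, not files shipped with this repo): +#   python3 Hills_deduper.py -f /path/to/Dataset1.sam -u STL96.txt +# which would produce /path/to/Dataset1_deduped.sam and /path/to/Dataset1_STATS.txt.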
+parser = argparse.ArgumentParser( + description= + """ + INPUT: A SAM file for deduplicating + A file containing a list of UMIs + OUTPUT: A deduplicated SAM file + A text file containing basic STATS on deduplication + """, formatter_class = RawTextHelpFormatter) +# Inform the parser of command line options. +# -h,--help is an argparse built-in function and will return the description w/ +# options and option help messages seen below. +parser.add_argument('-f', '--file', help='Absolute SAM file path', + type=str, required=True) +parser.add_argument('-p', '--paired', help='Designates file is paired-end', + action='store_true', required=False) +parser.add_argument('-u', '--umi', help='A file with a list of UMIs', + type=str, required=False) +# Call the parser. +args = parser.parse_args() +# If -p/--paired is given on the command line, issue a technical-error message to st.out +if args.paired == True: + msg = "\nTECHNICAL ERROR: This program does not currently handle paired-end reads.\n" + print(msg) + exit() +# If -u,--umi not specified, issue a technical-error message to st.out. +if args.umi == None: + msg = """\nTECHNICAL ERROR: This program currently requires a file containing a list of UMIs. + + To specify your UMI filename use: -u,--umi + \n""" + print(msg) + exit() +################## +## FILES PART 1 ## +################## +# Save the input file as fIN. +fIN = args.file +# Save the UMI list file as the variable umi. +umi = args.umi +# Split the SAMfile input at the .sam extention, store in a list. +filelist = fIN.split(".sam") +# Set up an input file, which is the output of the samtools sort command. +fin = filelist[0]+"_sorted.sam" +####################### +## UNIX COMMAND LINE ## +####################### +# Load requisite modules and run SAMtools sort on fIN. +check_output("module load samtools && module load python3/3.6.5 && \ + samtools sort -l 0 -m 4000M -o {} -T temp_sorted {}".format(fin,fIN), + shell=True, timeout=None) # ONLY ON TALAPAS +################## +## FILES PART 2 ## +################## +# Split the output file from samtools sort. +fOUTtemp = fin.split("_sorted.sam") +# Create an output file for this program. +fOUT = fOUTtemp[0]+"_deduped.sam" +# Create a statistical output text file. +stats = fOUTtemp[0]+"_STATS.txt" +################ +## OPEN FILES ## +################ +# Open all of the pertinent files. +with open(fin) as sam, open(umi) as u, open(fOUT,'w') as ded, open(stats, 'w') as pout: + # Contain the unique molecular identifiers in a set. + # Initiate an empty set for UMIs. + UMIs = set() + # Initiate an empty set for chromosomes "seen". + chromes = set() + # Populate the set from the list of UMIs. + for i in u: + # Strip the UMI of invisible characters. + i = i.strip() + # Add the UMI to the set UMIs. + UMIs.add(i) + # Initiate Counters for the STATS output. + # Count total duplicates. + pcr_dup = 0 + # Count total reads. + tot_rds = 0 + # Count unknown barcodes. + unk_ids = 0 + # Iterate through each line of the input SAM file. + for line in sam: + # If the line is a header line... + if re.match(r'^@[A-Z]{2}\s', line): + # Write the header to fout. + ded.write(line) + # Continue iterating thru sam. + continue + # The line is not a header. + else: + # Increment the number of total reads. + tot_rds += 1 + # Use parseSAM function to get the relevant SAM file fields to check for PCR duplicates. + read_tup = parseSAM(line) + # Store info in separate variables. 
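+ # read_tup is (UMI, chromosome, POS, CIGAR, strand); for the single read shown in sample_input.sam + # this would be ('CTGTTCAC', '2', '76814284', '71M', '+').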
+ this_UMI = read_tup[0] + this_chr = read_tup[1] + this_pos = read_tup[2] + this_CIGAR = read_tup[3] + this_strand = read_tup[4] + # If current UMI is NOT IN UMIs... + if this_UMI not in UMIs: + # Increment the unknown umi counter. + unk_ids += 1 + # Continue to the next line. + continue + # The UMI is in UMIs. + else: + # If this_chr is NOT in chromes... + if this_chr not in chromes: + # Add the chromosome to the set, chromes. + chromes.add(this_chr) + # Reset a dictionary for + strands: key=UMI, value=set(POS1,POS2,POS3,...) + plusUMIdict = {} + # Reset a dictionary for - strands: key=UMI, value=set(POS1,POS2,POS3,...) + negUMIdict = {} + # If it's the positive strand. + if this_strand == "+": + # Find the true left-most mapped position. + new_pos = plusPOS(this_pos,this_CIGAR) + # Add the UMI and its set of positions to the plusUMIdict. + plusUMIdict[this_UMI] = set([new_pos]) + # Write the line to ded. + ded.write(line) + # Continue to the next line. + continue + # It's the reverse strand... + else: + # Find the right-most read position. + new_pos = negPOS(this_pos,this_CIGAR) + # Add the UMI and its set of positions to the negUMIdict. + negUMIdict[this_UMI] = set([new_pos]) + # Write the line to ded. + ded.write(line) + # Continue to the next line. + continue + # We're still on the same chromosome. + else: + # If it's the positive strand. + if this_strand == "+": + # Find the true left-most mapped position. + new_pos = plusPOS(this_pos,this_CIGAR) + # Check if this_UMI is NOT in plusUMIdict... + if this_UMI not in plusUMIdict: + # Add the UMI as a KEY to the dictionary with the set of forward + # positions as the VALUE. + plusUMIdict[this_UMI] = set([new_pos]) + # Write the line to ded. + ded.write(line) + # Continue to the next line. + continue + # We've already seen this UMI, we'll need to check for duplicates. + else: + # If the corrected position is in the set of positions for this UMI... + if new_pos in plusUMIdict[this_UMI]: + # Increment the PCR duplicate counter. + pcr_dup += 1 + # Continue to the next line. + continue + # If the corrected position is not in the set of positions for this UMI... + else: + # Add the new position to this_UMI's set of positions, in + # plusUMIdict. + plusUMIdict[this_UMI].add(new_pos) + # Write the line to ded. + ded.write(line) + # Continue to the next line. + continue + + # It's the negative strand. + else: + # Find the right-most read position. + new_pos = negPOS(this_pos,this_CIGAR) + # Check if this_UMI is NOT in negUMIdict... + if this_UMI not in negUMIdict: + # Add the UMI as a KEY to the dictionary with the set of reverse + # positions as the VALUE. + negUMIdict[this_UMI] = set([new_pos]) + # Write the line to ded. + ded.write(line) + # Continue to the next line. + continue + # We've already seen this UMI, we'll need to check for duplicates. + else: + # If the corrected position is in this UMI's set of positions... + if new_pos in negUMIdict[this_UMI]: + # Increment the PCR duplicate counter. + pcr_dup += 1 + # Continue to the next line. + continue + # The corrected position is not in this UMI's set of positions... + else: + # Add the corrected position to this UMI's set of positions. + negUMIdict[this_UMI].add(new_pos) + # Write the line to ded. + ded.write(line) + # Continue to the next line. + continue + # Print output to the STATS file. 
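+ # The rates below are pcr_dup / tot_rds * 100 and unk_ids / tot_rds * 100, rounded to two decimals + # (e.g. 1,000 duplicates out of 10,000 total reads would be reported as 10.0%).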
+ print("Input SAM File:",fIN,file=pout) + print("List of UMIs Used:",umi,file=pout) + print("Deduplicated Output SAM File:",fOUT,file=pout) + print("Total Reads:",tot_rds,file=pout) + print("Total Duplicates:",pcr_dup,file=pout) + print("Rate of PCR Duplication:",str(round(pcr_dup/tot_rds*100,2))+"%",file=pout) + print("Total Unknown Barcodes:",unk_ids,file=pout) + print("Rate of Unknown Barcodes:",str(round(unk_ids/tot_rds*100,2))+"%",file=pout) +# Create a template to remove the temporary samtools-sorted file. +rmcommand = "rm {}".format(fin) +call(rmcommand, shell=True) diff --git a/sample_input.sam b/sample_input.sam new file mode 100644 index 0000000..76c2509 --- /dev/null +++ b/sample_input.sam @@ -0,0 +1,37 @@ +@HD VN:1.0 SO:unsorted +@PG ID:GSNAP PN:gsnap VN:2017-10-12 CL:gsnap.avx2 --gunzip -t 26 -A sam -m 5 -d mm10_chroms -D /projects/bgmp/coonrod/mmu/INTEL -s /projects/bgmp/coonrod/mmu/INTEL/mm10_chroms/mm10_chroms.maps/Mus_musculus.GRCm38.89.splicesites.iit --split-output=/projects/bgmp/coonrod/deduper/gsnap//Datset1 /projects/bgmp/coonrod/deduper//Dataset1.fastq_dups.gz +@SQ SN:1 LN:195471971 +@SQ SN:2 LN:182113224 +@SQ SN:3 LN:160039680 +@SQ SN:4 LN:156508116 +@SQ SN:5 LN:151834684 +@SQ SN:6 LN:149736546 +@SQ SN:7 LN:145441459 +@SQ SN:8 LN:129401213 +@SQ SN:9 LN:124595110 +@SQ SN:10 LN:130694993 +@SQ SN:11 LN:122082543 +@SQ SN:12 LN:120129022 +@SQ SN:13 LN:120421639 +@SQ SN:14 LN:124902244 +@SQ SN:15 LN:104043685 +@SQ SN:16 LN:98207768 +@SQ SN:17 LN:94987271 +@SQ SN:18 LN:90702639 +@SQ SN:19 LN:61431566 +@SQ SN:X LN:171031299 +@SQ SN:Y LN:91744698 +@SQ SN:MT LN:16299 +NS500451:154:HWKTMBGXX:1:11101:24260:1121:CTGTTCAC 0 2 76814284 36 71M * 0 0 TCCACCACAATCTTACCATCCTTCCTCCAGACCACATCGCGTTCTTTGTTCAACTCACAGCTCAAGTACAA 6AEEEEEEAEEAEEEEAAEEEEEEEEEAEEAEEAAEE