Merge pull request #131 from ncsa/130-neat-crashes-with-target_bed-file-500-lines

joshfactorial · web-flow · commit 9d8aaaf10807 · 2024-10-30T14:18:28.000-05:00
130 neat crashes with target bed file 500 lines
diff --git a/neat/read_simulator/runner.py b/neat/read_simulator/runner.py
@@ -177,18 +177,20 @@ def read_simulator_runner(config: str, output: str):
 
     # TODO check into SeqIO.index_db()
     reference_index = SeqIO.index(str(options.reference), "fasta")
+    reference_keys_with_lens = {key: len(value) for key, value in reference_index.items()}
     _LOG.debug("Reference file indexed.")
 
     if _LOG.getEffectiveLevel() < 20:
         count = 0
-        for contig in reference_index:
-            count += len(reference_index[contig])
+        for contig in reference_keys_with_lens:
+            count += reference_keys_with_lens[contig]
         _LOG.debug(f"Length of reference: {count / 1_000_000:.2f} Mb")
 
-    input_variants_dict = {x: ContigVariants() for x in reference_index}
+    input_variants_dict = {x: ContigVariants() for x in reference_keys_with_lens}
     if options.include_vcf:
         _LOG.info(f"Reading input VCF: {options.include_vcf}.")
         if options.cancer:
+            # TODO Check if we need full ref index or just keys and lens
             sample_names = parse_input_vcf(
                 input_variants_dict,
                 options.include_vcf,
@@ -202,6 +204,7 @@ def read_simulator_runner(config: str, output: str):
             tumor_ind = sample_names['tumor_sample']
             normal_ind = sample_names['normal_sample']
         else:
+            # TODO Check if we need full ref index or just keys and lens
             sample_names = parse_input_vcf(
                 input_variants_dict,
                 options.include_vcf,
@@ -224,7 +227,7 @@ def read_simulator_runner(config: str, output: str):
         target_regions_dict,
         discard_regions_dict,
         mutation_rate_dict
-    ) = parse_beds(options, reference_index, mut_model.avg_mut_rate)
+    ) = parse_beds(options, reference_keys_with_lens, mut_model.avg_mut_rate)
 
     if any(bed_files):
         _LOG.debug("Finished reading input beds.")
@@ -234,7 +237,7 @@ def read_simulator_runner(config: str, output: str):
     if options.produce_bam:
         # This is a dictionary that is the list of the contigs and the length of each.
         # This information will be needed later to create the bam header.
-        bam_header = {key: len(reference_index[key]) for key in reference_index}
+        bam_header = reference_keys_with_lens
 
     # Creates files and sets up objects for files that can be written to as needed.
     # Also creates headers for bam and vcf.
@@ -259,7 +262,7 @@ def read_simulator_runner(config: str, output: str):
     """
     _LOG.info("Beginning simulation.")
 
-    breaks = find_file_breaks(reference_index)
+    breaks = find_file_breaks(reference_keys_with_lens)
 
     _LOG.debug("Input reference partitioned for run")
 
@@ -345,20 +348,20 @@ def read_simulator_runner(config: str, output: str):
 
     if options.produce_bam:
         _LOG.info(f"Outputting golden bam file: {str(output_file_writer.bam_fn)}")
-        contig_list = list(reference_index)
+        contig_list = list(reference_keys_with_lens)
         contigs_by_index = {contig_list[n]: n for n in range(len(contig_list))}
         output_file_writer.output_bam_file(sam_reads_files, contigs_by_index, options.read_len)
 
 
-def find_file_breaks(reference_index: dict) -> dict:
+def find_file_breaks(reference_keys_with_lens: dict) -> dict:
     """
     Returns a dictionary with the chromosomes as keys, which is the start of building the chromosome map
 
-    :param reference_index: a dictionary with chromosome keys and sequence values
+    :param reference_keys_with_lens: a dictionary with chromosome keys and sequence values
     :return: a dictionary containing the chromosomes as keys and either "all" for values, or a list of indices
     """
     partitions = {}
-    for contig in reference_index.keys():
-        partitions[contig] = [(0, len(reference_index[contig]))]
+    for contig in reference_keys_with_lens:
+        partitions[contig] = [(0, reference_keys_with_lens[contig])]
 
     return partitions
diff --git a/neat/read_simulator/utils/bed_func.py b/neat/read_simulator/utils/bed_func.py
@@ -19,12 +19,12 @@
 _LOG = logging.getLogger(__name__)
 
 
-def parse_beds(options: Options, reference_dict: _IndexedSeqFileDict, average_mutation_rate: float) -> list:
+def parse_beds(options: Options, ref_keys_counts: dict, average_mutation_rate: float) -> list:
     """
     This single function parses the three possible bed file types for NEAT.
 
     :param options: The options object for this run
-    :param reference_dict: The reference dict generated by SeqIO for this run
+    :param ref_keys_counts: A dictionary containing the reference keys and the associated lengths.
     :param average_mutation_rate: The average mutation rate from the model or user input for this run
     :return: target_dict, discard_dict, mutation_rate_dict
     """
@@ -57,13 +57,13 @@ def parse_beds(options: Options, reference_dict: _IndexedSeqFileDict, average_mu
     for i in range(len(bed_files)):
         temp_bed_dict = parse_single_bed(
             bed_files[i],
-            reference_dict,
+            ref_keys_counts,
             processing_structure[i]
         )
         # If there was a bed this function will fill in any gaps the bed might have missed, so we have a complete map
         # of each chromosome. The uniform case in parse_single_bed handles this in cases of no input bed.
         if processing_structure[i][2]:
-            final_dict = fill_out_bed_dict(reference_dict, temp_bed_dict, processing_structure[i])
+            final_dict = fill_out_bed_dict(ref_keys_counts, temp_bed_dict, processing_structure[i])
         else:
             final_dict = temp_bed_dict
 
@@ -73,7 +73,7 @@ def parse_beds(options: Options, reference_dict: _IndexedSeqFileDict, average_mu
 
 
 def parse_single_bed(input_bed: str,
-                     reference_dictionary: _IndexedSeqFileDict,
+                     ref_keys_counts: dict,
                      process_key: tuple[str, float, bool],
                      ) -> dict:
     """
@@ -82,12 +82,13 @@ def parse_single_bed(input_bed: str,
     then we will also import the mutation data.
 
     :param input_bed: A bed file containing the regions of interest
-    :param reference_dictionary: A list of chromosomes to check, along with their reads and lengths
+    :param ref_keys_counts: A dict of chromosomes to check (key), along with their lengths (value)
     :param process_key: Indicates which bed we are processing and the factor we need to process it.
     :return: a dictionary of chromosomes: [(pos1, pos2, factor), etc...]
     """
-    ret_dict = {x: [] for x in reference_dictionary}
+    ret_dict = {x: [] for x in ref_keys_counts}
     in_bed_only = []
+    key_pool = list(ref_keys_counts.keys())
 
     # process_key[2] is True when there is an input bed, False otherwise.
     if process_key[2]:
@@ -107,7 +108,7 @@ def parse_single_bed(input_bed: str,
                         sys.exit(1)
                     # Bed file chromosome names must match the reference.
                     try:
-                        assert my_chr in reference_dictionary
+                        assert my_chr in key_pool
                     except AssertionError:
                         in_bed_only.append(my_chr)
                         _LOG.warning(
@@ -149,7 +150,7 @@ def parse_single_bed(input_bed: str,
                         ret_dict[my_chr].append((int(pos1), int(pos2), True))
 
         # some validation
-        in_ref_only = [k for k in reference_dictionary if k not in ret_dict]
+        in_ref_only = [k for k in ref_keys_counts if k not in ret_dict]
         if in_ref_only:
             _LOG.warning(f'Warning: Reference contains sequences not found in BED file {input_bed}. '
                          'These chromosomes will be omitted from the outputs.')
@@ -161,13 +162,13 @@ def parse_single_bed(input_bed: str,
             _LOG.debug(f'Regions ignored: {list(set(in_bed_only))}')
 
     else:
-        for my_chr in reference_dictionary.keys():
-            ret_dict[my_chr].append((0, len(reference_dictionary[my_chr]), process_key[1]))
+        for my_chr in ref_keys_counts:
+            ret_dict[my_chr].append((0, ref_keys_counts[my_chr], process_key[1]))
 
     return ret_dict
 
 
-def fill_out_bed_dict(ref_dict: _IndexedSeqFileDict,
+def fill_out_bed_dict(ref_keys_counts: dict,
                       region_dict: dict | None,
                       default_factor: tuple[str, bool, bool],
                       ) -> dict:
@@ -177,9 +178,9 @@ def fill_out_bed_dict(ref_dict: _IndexedSeqFileDict,
 
     The input to this function is the dict for a single chromosome.
 
-    :param ref_dict: The reference dictionary for the run. Needed to calulate max values. We shouldn't need to access
-        the data it refers to.
-    :param region_dict: A dict based on the input from the bed file, with keys being all the chronmosomes
+    :param ref_keys_counts: A dictionary with the keys as the contigs from the reference dictionary, and the values as
+        the length of the contig. Needed to calculate max values.
+    :param region_dict: A dict based on the input from the bed file, with keys being all the chromosomes
         present in the reference.
     :param default_factor: This is a tuple representing the type and any extra information. The extra info is for
         mutation rates. If beds were input, then these values come from the bed, else they are set to one value
@@ -191,7 +192,7 @@ def fill_out_bed_dict(ref_dict: _IndexedSeqFileDict,
 
     for contig in region_dict:
         regions = []
-        max_value = len(ref_dict[contig])
+        max_value = ref_keys_counts[contig]
         start = 0
 
         # The trivial case of no data yet for this contig the region gets filled out as 0 to len(contig_sequence)