Commit 0abdce3

Removed the output of unnecessary files, improved README
aljpetri committed Dec 13, 2023
1 parent 7723b12 commit 0abdce3
Showing 4 changed files with 41 additions and 29 deletions.
10 changes: 10 additions & 0 deletions README.md
@@ -57,14 +57,24 @@ To run the algorithm:<br />
python isONform_parallel.py --fastq_folder path/to/input/files --t <nr_cores> --outfolder /path/to/outfolder --split_wrt_batches
```

Note: Always provide absolute paths to input files and folders.

The isON pipeline (isONclust, isONcorrect, isONform) can be run via:

```
./full_pipeline.sh <raw_reads.fq> <outfolder> <num_cores> <isONform_folder> <iso_abundance> <mode>
```
(Note that this requires pychopper, isONclust, and isONcorrect to be installed.)

## Outputs <a name="Outputs"></a>
isONform outputs three main files: transcriptome.fasta, mapping.txt, and support.txt.
Each isoform that isONform reconstructs has an id of the form x_y_z:

- 'x' denotes the isONclust cluster that the isoform stems from.
- 'y' denotes the batch from which the isoform was reconstructed (as in isONcorrect, reads are processed in batches of 1000).
- 'z' is a unique identifier, ensuring that each reconstructed isoform has a unique id.

mapping.txt lists which original reads each isoform was reconstructed from.
support.txt gives the support of each isoform (i.e., how many original reads make it up).
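For downstream processing, here is a minimal sketch of parsing these ids (it assumes headers carry only the x_y_z id; adjust the split if extra fields are appended to the header):

```
# Minimal sketch: parse isONform isoform ids of the form x_y_z
# from transcriptome.fasta. Header layout beyond "x_y_z" is an
# assumption; adjust if your headers carry extra fields.

def parse_isoform_id(header):
    """Return (cluster, batch, unique_id) from a '>x_y_z' header."""
    x, y, z = header.lstrip(">@").strip().split("_")[:3]
    return x, y, z

with open("transcriptome.fasta") as fasta:
    for line in fasta:
        if line.startswith(">"):
            cluster, batch, uid = parse_isoform_id(line)
            print("cluster", cluster, "batch", batch, "isoform", uid)
```
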
## Contact <a name="Contact"></a>
If you encounter any problems, please raise an issue on the issues page. You can also contact the developer of this repository via:
alexander.petri[at]math.su.se
6 changes: 4 additions & 2 deletions isONform_parallel.py
@@ -193,6 +193,7 @@ def main(args):
#print("MERGE?", args.merge_sub_isoforms_3, args.merge_sub_isoforms_5)
globstart = time()
directory = args.fastq_folder # os.fsencode(args.fastq_folder)
write_low_abundance = False
#print(directory)
#print("ARGS",args)
isONform_location = os.path.dirname(os.path.realpath(__file__))
@@ -298,8 +299,8 @@ def main(args):
write_fastq = True
else:
write_fastq = False
batch_merging_parallel.join_back_via_batch_merging(args.outfolder, args.delta, args.delta_len, args.delta_iso_len_3, args.delta_iso_len_5, args.max_seqs_to_spoa,args.iso_abundance, write_fastq)
Parallelization_side_functions.generate_full_output(args.outfolder,write_fastq)
batch_merging_parallel.join_back_via_batch_merging(args.outfolder, args.delta, args.delta_len, args.delta_iso_len_3, args.delta_iso_len_5, args.max_seqs_to_spoa, args.iso_abundance, write_fastq, write_low_abundance)
Parallelization_side_functions.generate_full_output(args.outfolder, write_fastq, write_low_abundance)
Parallelization_side_functions.remove_folders(args.outfolder)
shutil.rmtree(split_directory)
print("Joined back batched files in:", time() - file_handling)
@@ -344,6 +345,7 @@ def main(args):
help='Cutoff parameter: maximum length difference at 5prime end, for which subisoforms are still merged into longer isoforms')
parser.add_argument('--tmpdir', type=str,default=None, help='OPTIONAL PARAMETER: Absolute path to custom folder in which to store temporary files. If tmpdir is not specified, isONform will attempt to write the temporary files into the tmp folder on your system. It is advised to only use this parameter if the symlinking does not work on your system.')
parser.add_argument('--write_fastq', action="store_true", help='Indicates that we want to output the final output (transcriptome) as a fastq file (new standard: fasta)')

args = parser.parse_args()
print(len(sys.argv))
if len(sys.argv) == 1:
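Note: in this commit `write_low_abundance` is hard-coded to `False` in `main` (see above). A hypothetical sketch of exposing it as a CLI flag, mirroring the existing `--write_fastq` option; the flag name is invented and not part of this commit:

```
# Hypothetical sketch (not part of this commit): expose the hard-coded
# write_low_abundance switch as a command-line flag, mirroring
# --write_fastq. The flag name '--write_low_abundance' is invented.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--write_fastq', action="store_true",
                    help='Output the final transcriptome as fastq (default: fasta)')
parser.add_argument('--write_low_abundance', action="store_true",
                    help='Also write the *_low_abundance output files')
args = parser.parse_args()
write_low_abundance = args.write_low_abundance  # replaces the hard-coded False
```
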
23 changes: 8 additions & 15 deletions modules/Parallelization_side_functions.py
@@ -106,28 +106,21 @@ def generate_low_abundance_output(outfolder,write_fastq):
g = open(fname, "r")
# read content from first file
for line in g:
if line.startswith('@'):
line=line+str(actual_folder)
if line.startswith('@') or line.startswith('>'):
line = line + str(actual_folder)
# append content to second file
f.write(line)
"""otherfname= os.path.join(outfolder,"cluster"+str(actual_folder)+"_merged_low_abundance.fq")
if os.path.isfile(otherfname):
other_g = open(otherfname, "r")
# read content from first file
for other_line in other_g:
# append content to second file
f.write(other_line)
#f.write(g.read())"""


def remove_folders(outfolder):
subfolders = [f.path for f in os.scandir(outfolder) if f.is_dir()]
for subfolder in subfolders:
shutil.rmtree(os.path.join(outfolder,subfolder))

def generate_full_output(outfolder,write_fastq):
generate_single_output(outfolder,write_fastq)
generate_low_abundance_output(outfolder, write_fastq)
def generate_full_output(outfolder,write_fastq, write_low_abundance):
generate_single_output(outfolder, write_fastq)
generate_single_mapping(outfolder)
generate_low_abundance_mapping(outfolder)
generate_single_support(outfolder)
generate_single_support(outfolder)
if write_low_abundance:
generate_low_abundance_output(outfolder, write_fastq)
generate_low_abundance_mapping(outfolder)
31 changes: 19 additions & 12 deletions modules/batch_merging_parallel.py
@@ -82,24 +82,30 @@ def read_batch_file(batch_id, all_infos_dict, all_reads_dict, cl_dir):
all_infos_dict[batch_id] = {}


def write_final_output(all_infos_dict, outfolder, iso_abundance, cl_dir, folder, write_fastq):
write_low_abundance = False
def write_final_output(all_infos_dict, outfolder, iso_abundance, cl_dir, folder, write_fastq, write_low_abundance):
#write_low_abundance = False
support_name = "support_" + str(folder) + ".txt"
other_support_name = "support_" + str(folder) + "low_abundance.txt"
other_mapping_name = "cluster" + str(folder) + "_mapping_low_abundance.txt"
if write_fastq:
consensus_name = "cluster" + str(folder) + "_merged.fq"
other_consensus_name = "cluster" + str(folder) + "_merged_low_abundance.fq"
if write_low_abundance:
other_consensus_name = "cluster" + str(folder) + "_merged_low_abundance.fq"
other_consensus = open(os.path.join(outfolder, other_consensus_name), 'w')
other_mapping = open(os.path.join(outfolder, other_mapping_name), 'w')
other_support_file = open(os.path.join(outfolder, other_support_name), "w")
else:
consensus_name = "cluster" + str(folder) + "_merged.fa"
other_consensus_name = "cluster" + str(folder) + "_merged_low_abundance.fa"
if write_low_abundance:
other_consensus_name = "cluster" + str(folder) + "_merged_low_abundance.fa"
other_consensus = open(os.path.join(outfolder, other_consensus_name), 'w')
other_mapping = open(os.path.join(outfolder, other_mapping_name), 'w')
other_support_file = open(os.path.join(outfolder, other_support_name), "w")

mapping_name = "cluster" + str(folder) + "_mapping.txt"
other_mapping_name = "cluster" + str(folder) + "_mapping_low_abundance.txt"
support_file = open(os.path.join(outfolder, support_name), "w")
other_support_file = open(os.path.join(outfolder, other_support_name), "w")
consensus_file = open(os.path.join(outfolder, consensus_name), "w")
other_consensus = open(os.path.join(outfolder, other_consensus_name), 'w')
mapping_file = open(os.path.join(outfolder, mapping_name), 'w')
other_mapping = open(os.path.join(outfolder, other_mapping_name), 'w')
skipped_reads = {}
for batchid, id_dict in all_infos_dict.items():
for id, infos in id_dict.items():
@@ -143,8 +149,9 @@ def write_final_output(all_infos_dict, outfolder, iso_abundance, cl_dir, folder,

consensus_file.close()
mapping_file.close()
other_consensus.close()
other_mapping.close()
if write_low_abundance:
other_consensus.close()
other_mapping.close()

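Opening and closing the low-abundance handles under the same `write_low_abundance` condition works, but the paired `if` blocks are easy to drift apart. An alternative sketch (not part of this commit) using `contextlib.ExitStack`, with file names taken from `write_final_output` above:

```
# Alternative sketch (not part of this commit): manage the optional
# low-abundance handles with contextlib.ExitStack so that opening and
# closing can never get out of sync with the write_low_abundance flag.
import os
from contextlib import ExitStack

def write_outputs(outfolder, folder, write_fastq, write_low_abundance):
    ext = "fq" if write_fastq else "fa"
    with ExitStack() as stack:
        opened = {}
        opened["consensus"] = stack.enter_context(
            open(os.path.join(outfolder, "cluster" + str(folder) + "_merged." + ext), "w"))
        opened["mapping"] = stack.enter_context(
            open(os.path.join(outfolder, "cluster" + str(folder) + "_mapping.txt"), "w"))
        opened["support"] = stack.enter_context(
            open(os.path.join(outfolder, "support_" + str(folder) + ".txt"), "w"))
        if write_low_abundance:
            opened["other_consensus"] = stack.enter_context(
                open(os.path.join(outfolder,
                                  "cluster" + str(folder) + "_merged_low_abundance." + ext), "w"))
            opened["other_mapping"] = stack.enter_context(
                open(os.path.join(outfolder,
                                  "cluster" + str(folder) + "_mapping_low_abundance.txt"), "w"))
        # ... write consensus/mapping/support records here ...
        # every handle in `opened` is closed automatically on exit
```
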

# TODO: add the rest of variables for this method and move filewriting to wrappermethod
@@ -206,7 +213,7 @@ def actual_merging_process(all_infos_dict, delta, delta_len,


def join_back_via_batch_merging(outdir, delta, delta_len, delta_iso_len_3,
delta_iso_len_5, max_seqs_to_spoa, iso_abundance,write_fastq):
delta_iso_len_5, max_seqs_to_spoa, iso_abundance,write_fastq,write_low_abundance):
print("Batch Merging")
unique_cl_ids = set()
subfolders = [f.path for f in os.scandir(outdir) if f.is_dir()]
@@ -266,7 +273,7 @@ def join_back_via_batch_merging(outdir, delta, delta_len, delta_iso_len_3,
for c_id, c_infos in b_infos.items():
if not c_infos.merged:
nr_reads += len(c_infos.reads)
write_final_output(all_infos_dict, outdir, iso_abundance, cl_dir, cl_id, write_fastq)
write_final_output(all_infos_dict, outdir, iso_abundance, cl_dir, cl_id, write_fastq, write_low_abundance)
#shutil.rmtree(os.path.join(outdir,cl_id))

DEBUG = False
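The TODO above ("move filewriting to wrappermethod") suggests a refactor along these lines; a hypothetical sketch, as if added inside modules/batch_merging_parallel.py. The full argument list of `actual_merging_process` is an assumption based on what is visible in this diff:

```
# Hypothetical sketch of the TODO's proposed split: merging stays purely
# computational, and one wrapper owns all file writing. Names invented;
# actual_merging_process's full signature is assumed from this diff.
def merge_and_write_cluster(all_infos_dict, outdir, iso_abundance, cl_dir,
                            cl_id, write_fastq, write_low_abundance,
                            delta, delta_len, delta_iso_len_3,
                            delta_iso_len_5, max_seqs_to_spoa):
    # computation only (mutates all_infos_dict; no file I/O)
    actual_merging_process(all_infos_dict, delta, delta_len,
                           delta_iso_len_3, delta_iso_len_5, max_seqs_to_spoa)
    # all file writing happens in exactly one place
    write_final_output(all_infos_dict, outdir, iso_abundance, cl_dir,
                       cl_id, write_fastq, write_low_abundance)
```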
