Commit 0abdce3

Removed the output of unnecessary files, improved README
aljpetri committed Dec 13, 2023
1 parent 7723b12 commit 0abdce3
Showing 4 changed files with 41 additions and 29 deletions.
10 changes: 10 additions & 0 deletions README.md
@@ -57,14 +57,24 @@ To run the algorithm:<br />
python isONform_parallel.py --fastq_folder path/to/input/files --t <nr_cores> --outfolder /path/to/outfolder --split_wrt_batches
```

Note: Always provide absolute paths to input files and folders.

The isON pipeline (isONclust, isONcorrect, isONform) can be run via:

```
./full_pipeline.sh <raw_reads.fq> <outfolder> <num_cores> <isONform_folder> <iso_abundance> <mode>
```
(Note that this requires pychopper, isONclust, and isONcorrect to be installed.)

## Outputs <a name="Outputs"></a>
isONform outputs three main files: transcriptome.fasta, mapping.txt, and support.txt.
Each isoform that isONform reconstructs has an id of the form x_y_z:

- 'x' denotes the isONclust cluster that the isoform stems from.
- 'y' denotes the batch from which the isoform was reconstructed (as in isONcorrect, reads are processed in batches of 1000).
- 'z' is a unique identifier, ensuring that each reconstructed isoform has a unique id.

mapping.txt lists which original reads each isoform was reconstructed from.
support.txt gives the support of each isoform (i.e., how many original reads make it up).
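For downstream processing, here is a minimal sketch of parsing these ids (it assumes headers carry only the x_y_z id; adjust the split if extra fields are appended to the header):

```
# Minimal sketch: parse isONform isoform ids of the form x_y_z
# from transcriptome.fasta. Header layout beyond "x_y_z" is an
# assumption; adjust if your headers carry extra fields.

def parse_isoform_id(header):
    """Return (cluster, batch, unique_id) from a '>x_y_z' header."""
    x, y, z = header.lstrip(">@").strip().split("_")[:3]
    return x, y, z

with open("transcriptome.fasta") as fasta:
    for line in fasta:
        if line.startswith(">"):
            cluster, batch, uid = parse_isoform_id(line)
            print("cluster", cluster, "batch", batch, "isoform", uid)
```
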
## Contact <a name="Contact"></a>
If you encounter any problems, please raise an issue on the issues page. You can also contact the developer of this repository via:
alexander.petri[at]math.su.se
6 changes: 4 additions & 2 deletions isONform_parallel.py
@@ -193,6 +193,7 @@ def main(args):
#print("MERGE?", args.merge_sub_isoforms_3, args.merge_sub_isoforms_5)
globstart = time()
directory = args.fastq_folder # os.fsencode(args.fastq_folder)
write_low_abundance = False
#print(directory)
#print("ARGS",args)
isONform_location = os.path.dirname(os.path.realpath(__file__))
@@ -298,8 +299,8 @@ def main(args):
write_fastq = True
else:
write_fastq = False
batch_merging_parallel.join_back_via_batch_merging(args.outfolder, args.delta, args.delta_len, args.delta_iso_len_3, args.delta_iso_len_5, args.max_seqs_to_spoa,args.iso_abundance, write_fastq)
Parallelization_side_functions.generate_full_output(args.outfolder,write_fastq)
batch_merging_parallel.join_back_via_batch_merging(args.outfolder, args.delta, args.delta_len, args.delta_iso_len_3, args.delta_iso_len_5, args.max_seqs_to_spoa, args.iso_abundance, write_fastq, write_low_abundance)
Parallelization_side_functions.generate_full_output(args.outfolder, write_fastq, write_low_abundance)
Parallelization_side_functions.remove_folders(args.outfolder)
shutil.rmtree(split_directory)
print("Joined back batched files in:", time() - file_handling)
@@ -344,6 +345,7 @@ def main(args):
help='Cutoff parameter: maximum length difference at 5prime end, for which subisoforms are still merged into longer isoforms')
parser.add_argument('--tmpdir', type=str,default=None, help='OPTIONAL PARAMETER: Absolute path to custom folder in which to store temporary files. If tmpdir is not specified, isONform will attempt to write the temporary files into the tmp folder on your system. It is advised to only use this parameter if the symlinking does not work on your system.')
parser.add_argument('--write_fastq', action="store_true", help='Indicates that we want to output the final output (transcriptome) as a fastq file (new standard: fasta)')

args = parser.parse_args()
print(len(sys.argv))
if len(sys.argv) == 1:
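Note: in this commit `write_low_abundance` is hard-coded to `False` in `main` (see above). A hypothetical sketch of exposing it as a CLI flag, mirroring the existing `--write_fastq` option; the flag name is invented and not part of this commit:

```
# Hypothetical sketch (not part of this commit): expose the hard-coded
# write_low_abundance switch as a command-line flag, mirroring
# --write_fastq. The flag name '--write_low_abundance' is invented.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--write_fastq', action="store_true",
                    help='Output the final transcriptome as fastq (default: fasta)')
parser.add_argument('--write_low_abundance', action="store_true",
                    help='Also write the *_low_abundance output files')
args = parser.parse_args()
write_low_abundance = args.write_low_abundance  # replaces the hard-coded False
```
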
23 changes: 8 additions & 15 deletions modules/Parallelization_side_functions.py
@@ -106,28 +106,21 @@ def generate_low_abundance_output(outfolder,write_fastq):
g = open(fname, "r")
# read content from first file
for line in g:
if line.startswith('@'):
line=line+str(actual_folder)
if line.startswith('@') or line.startswith('>'):
line = line + str(actual_folder)
# append content to second file
f.write(line)
"""otherfname= os.path.join(outfolder,"cluster"+str(actual_folder)+"_merged_low_abundance.fq")
if os.path.isfile(otherfname):
other_g = open(otherfname, "r")
# read content from first file
for other_line in other_g:
# append content to second file
f.write(other_line)
#f.write(g.read())"""


def remove_folders(outfolder):
subfolders = [f.path for f in os.scandir(outfolder) if f.is_dir()]
for subfolder in subfolders:
shutil.rmtree(os.path.join(outfolder,subfolder))

def generate_full_output(outfolder,write_fastq):
generate_single_output(outfolder,write_fastq)
generate_low_abundance_output(outfolder, write_fastq)
def generate_full_output(outfolder,write_fastq, write_low_abundance):
generate_single_output(outfolder, write_fastq)
generate_single_mapping(outfolder)
generate_low_abundance_mapping(outfolder)
generate_single_support(outfolder)
generate_single_support(outfolder)
if write_low_abundance:
generate_low_abundance_output(outfolder, write_fastq)
generate_low_abundance_mapping(outfolder)
31 changes: 19 additions & 12 deletions modules/batch_merging_parallel.py
@@ -82,24 +82,30 @@ def read_batch_file(batch_id, all_infos_dict, all_reads_dict, cl_dir):
all_infos_dict[batch_id] = {}


def write_final_output(all_infos_dict, outfolder, iso_abundance, cl_dir, folder, write_fastq):
write_low_abundance = False
def write_final_output(all_infos_dict, outfolder, iso_abundance, cl_dir, folder, write_fastq, write_low_abundance):
#write_low_abundance = False
support_name = "support_" + str(folder) + ".txt"
other_support_name = "support_" + str(folder) + "low_abundance.txt"
other_mapping_name = "cluster" + str(folder) + "_mapping_low_abundance.txt"
if write_fastq:
consensus_name = "cluster" + str(folder) + "_merged.fq"
other_consensus_name = "cluster" + str(folder) + "_merged_low_abundance.fq"
if write_low_abundance:
other_consensus_name = "cluster" + str(folder) + "_merged_low_abundance.fq"
other_consensus = open(os.path.join(outfolder, other_consensus_name), 'w')
other_mapping = open(os.path.join(outfolder, other_mapping_name), 'w')
other_support_file = open(os.path.join(outfolder, other_support_name), "w")
else:
consensus_name = "cluster" + str(folder) + "_merged.fa"
other_consensus_name = "cluster" + str(folder) + "_merged_low_abundance.fa"
if write_low_abundance:
other_consensus_name = "cluster" + str(folder) + "_merged_low_abundance.fa"
other_consensus = open(os.path.join(outfolder, other_consensus_name), 'w')
other_mapping = open(os.path.join(outfolder, other_mapping_name), 'w')
other_support_file = open(os.path.join(outfolder, other_support_name), "w")

mapping_name = "cluster" + str(folder) + "_mapping.txt"
other_mapping_name = "cluster" + str(folder) + "_mapping_low_abundance.txt"
support_file = open(os.path.join(outfolder, support_name), "w")
other_support_file = open(os.path.join(outfolder, other_support_name), "w")
consensus_file = open(os.path.join(outfolder, consensus_name), "w")
other_consensus = open(os.path.join(outfolder, other_consensus_name), 'w')
mapping_file = open(os.path.join(outfolder, mapping_name), 'w')
other_mapping = open(os.path.join(outfolder, other_mapping_name), 'w')
skipped_reads = {}
for batchid, id_dict in all_infos_dict.items():
for id, infos in id_dict.items():
@@ -143,8 +149,9 @@ def write_final_output(all_infos_dict, outfolder, iso_abundance, cl_dir, folder,

consensus_file.close()
mapping_file.close()
other_consensus.close()
other_mapping.close()
if write_low_abundance:
other_consensus.close()
other_mapping.close()

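Opening and closing the low-abundance handles under the same `write_low_abundance` condition works, but the paired `if` blocks are easy to drift apart. An alternative sketch (not part of this commit) using `contextlib.ExitStack`, with file names taken from `write_final_output` above:

```
# Alternative sketch (not part of this commit): manage the optional
# low-abundance handles with contextlib.ExitStack so that opening and
# closing can never get out of sync with the write_low_abundance flag.
import os
from contextlib import ExitStack

def write_outputs(outfolder, folder, write_fastq, write_low_abundance):
    ext = "fq" if write_fastq else "fa"
    with ExitStack() as stack:
        opened = {}
        opened["consensus"] = stack.enter_context(
            open(os.path.join(outfolder, "cluster" + str(folder) + "_merged." + ext), "w"))
        opened["mapping"] = stack.enter_context(
            open(os.path.join(outfolder, "cluster" + str(folder) + "_mapping.txt"), "w"))
        opened["support"] = stack.enter_context(
            open(os.path.join(outfolder, "support_" + str(folder) + ".txt"), "w"))
        if write_low_abundance:
            opened["other_consensus"] = stack.enter_context(
                open(os.path.join(outfolder,
                                  "cluster" + str(folder) + "_merged_low_abundance." + ext), "w"))
            opened["other_mapping"] = stack.enter_context(
                open(os.path.join(outfolder,
                                  "cluster" + str(folder) + "_mapping_low_abundance.txt"), "w"))
        # ... write consensus/mapping/support records here ...
        # every handle in `opened` is closed automatically on exit
```
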

# TODO: add the rest of variables for this method and move filewriting to wrappermethod
@@ -206,7 +213,7 @@ def actual_merging_process(all_infos_dict, delta, delta_len,


def join_back_via_batch_merging(outdir, delta, delta_len, delta_iso_len_3,
delta_iso_len_5, max_seqs_to_spoa, iso_abundance,write_fastq):
delta_iso_len_5, max_seqs_to_spoa, iso_abundance,write_fastq,write_low_abundance):
print("Batch Merging")
unique_cl_ids = set()
subfolders = [f.path for f in os.scandir(outdir) if f.is_dir()]
@@ -266,7 +273,7 @@ def join_back_via_batch_merging(outdir, delta, delta_len, delta_iso_len_3,
for c_id, c_infos in b_infos.items():
if not c_infos.merged:
nr_reads += len(c_infos.reads)
write_final_output(all_infos_dict, outdir, iso_abundance, cl_dir, cl_id, write_fastq)
write_final_output(all_infos_dict, outdir, iso_abundance, cl_dir, cl_id, write_fastq, write_low_abundance)
#shutil.rmtree(os.path.join(outdir,cl_id))

DEBUG = False
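The TODO above ("move filewriting to wrappermethod") suggests a refactor along these lines; a hypothetical sketch, as if added inside modules/batch_merging_parallel.py. The full argument list of `actual_merging_process` is an assumption based on what is visible in this diff:

```
# Hypothetical sketch of the TODO's proposed split: merging stays purely
# computational, and one wrapper owns all file writing. Names invented;
# actual_merging_process's full signature is assumed from this diff.
def merge_and_write_cluster(all_infos_dict, outdir, iso_abundance, cl_dir,
                            cl_id, write_fastq, write_low_abundance,
                            delta, delta_len, delta_iso_len_3,
                            delta_iso_len_5, max_seqs_to_spoa):
    # computation only (mutates all_infos_dict; no file I/O)
    actual_merging_process(all_infos_dict, delta, delta_len,
                           delta_iso_len_3, delta_iso_len_5, max_seqs_to_spoa)
    # all file writing happens in exactly one place
    write_final_output(all_infos_dict, outdir, iso_abundance, cl_dir,
                       cl_id, write_fastq, write_low_abundance)
```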
