@@ -47,42 +47,61 @@ def vcf_collect(
47
47
df_not_symbol = merged_df [merged_df ["Left_ensembl_gene_id" ].notna ()]
48
48
49
49
df_not_symbol = hgnc_df .merge (
50
- df_not_symbol , how = "right" , left_on = "ensembl_gene_id" , right_on = "Left_ensembl_gene_id"
50
+ df_not_symbol ,
51
+ how = "right" ,
52
+ left_on = "ensembl_gene_id" ,
53
+ right_on = "Left_ensembl_gene_id" ,
54
+ )
55
+ df_symbol = hgnc_df .merge (
56
+ df_symbol , how = "right" , left_on = "symbol" , right_on = "GeneA"
51
57
)
52
- df_symbol = hgnc_df .merge (df_symbol , how = "right" , left_on = "symbol" , right_on = "GeneA" )
53
58
df = pd .concat ([df_not_symbol , df_symbol ])
54
59
df = df .rename (columns = {"hgnc_id" : "Left_hgnc_id" })
55
60
56
61
df_symbol = df [df ["Right_ensembl_gene_id" ].isna ()]
57
62
df_not_symbol = df [df ["Right_ensembl_gene_id" ].notna ()]
58
63
59
64
df_not_symbol = hgnc_df .merge (
60
- df_not_symbol , how = "right" , left_on = "ensembl_gene_id" , right_on = "Right_ensembl_gene_id"
65
+ df_not_symbol ,
66
+ how = "right" ,
67
+ left_on = "ensembl_gene_id" ,
68
+ right_on = "Right_ensembl_gene_id" ,
69
+ )
70
+ df_symbol = hgnc_df .merge (
71
+ df_symbol , how = "right" , left_on = "symbol" , right_on = "GeneB"
61
72
)
62
- df_symbol = hgnc_df .merge (df_symbol , how = "right" , left_on = "symbol" , right_on = "GeneB" )
63
73
df = pd .concat ([df_not_symbol , df_symbol ])
64
74
df = df .rename (columns = {"hgnc_id" : "Right_hgnc_id" })
65
75
66
76
gtf_df = build_gtf_dataframe (gtf )
67
- all_df = df .merge (gtf_df , how = "left" , left_on = "CDS_LEFT_ID" , right_on = "Transcript_id" )
68
- all_df [["PosA" , "orig_start" , "orig_end" ]] = all_df [["PosA" , "orig_start" , "orig_end" ]].fillna (0 ).astype (int )
77
+ all_df = df .merge (
78
+ gtf_df , how = "left" , left_on = "CDS_LEFT_ID" , right_on = "Transcript_id"
79
+ )
80
+ all_df [["PosA" , "orig_start" , "orig_end" ]] = (
81
+ all_df [["PosA" , "orig_start" , "orig_end" ]].fillna (0 ).astype (int )
82
+ )
69
83
70
84
all_df = all_df [
71
- ((all_df ["PosA" ] >= all_df ["orig_start" ]) & (all_df ["PosA" ] <= all_df ["orig_end" ]))
85
+ (
86
+ (all_df ["PosA" ] >= all_df ["orig_start" ])
87
+ & (all_df ["PosA" ] <= all_df ["orig_end" ])
88
+ )
72
89
| ((all_df ["orig_start" ] == 0 ) & (all_df ["orig_end" ] == 0 ))
73
90
]
74
91
75
92
all_df .replace ("" , np .nan , inplace = True )
76
93
all_df = all_df .drop_duplicates ()
77
94
78
- all_df [["exon_number" , "transcript_version" ]] = all_df [["exon_number" , "transcript_version" ]].replace (0 , np .nan )
95
+ all_df [["exon_number" , "transcript_version" ]] = all_df [
96
+ ["exon_number" , "transcript_version" ]
97
+ ].replace (0 , np .nan )
79
98
# Fill non-empty values within each group for 'exon_number' and 'transcript_version'
80
99
all_df ["exon_number" ] = all_df .groupby ("PosA" )["exon_number" ].transform (
81
100
lambda x : x .fillna (method = "ffill" ).fillna (method = "bfill" )
82
101
)
83
- all_df ["transcript_version" ] = all_df .groupby ("PosA" )["transcript_version" ]. transform (
84
- lambda x : x . fillna ( method = "ffill" ). fillna ( method = "bfill" )
85
- )
102
+ all_df ["transcript_version" ] = all_df .groupby ("PosA" )[
103
+ "transcript_version"
104
+ ]. transform ( lambda x : x . fillna ( method = "ffill" ). fillna ( method = "bfill" ) )
86
105
87
106
all_df = all_df .rename (columns = {"transcript_version" : "Left_transcript_version" })
88
107
all_df = all_df .rename (columns = {"exon_number" : "Left_exon_number" })
@@ -115,25 +134,36 @@ def vcf_collect(
115
134
]
116
135
].drop_duplicates ()
117
136
all_df ["CDS_RIGHT_ID" ] = all_df ["CDS_RIGHT_ID" ].astype ("str" )
118
- all_df = all_df .merge (gtf_df , how = "left" , left_on = "CDS_RIGHT_ID" , right_on = "Transcript_id" )
119
- all_df [["PosB" , "orig_start" , "orig_end" ]] = all_df [["PosB" , "orig_start" , "orig_end" ]].fillna (0 )
120
- all_df [["PosB" , "orig_start" , "orig_end" ]] = all_df [["PosB" , "orig_start" , "orig_end" ]].astype (int )
137
+ all_df = all_df .merge (
138
+ gtf_df , how = "left" , left_on = "CDS_RIGHT_ID" , right_on = "Transcript_id"
139
+ )
140
+ all_df [["PosB" , "orig_start" , "orig_end" ]] = all_df [
141
+ ["PosB" , "orig_start" , "orig_end" ]
142
+ ].fillna (0 )
143
+ all_df [["PosB" , "orig_start" , "orig_end" ]] = all_df [
144
+ ["PosB" , "orig_start" , "orig_end" ]
145
+ ].astype (int )
121
146
all_df = all_df [
122
- ((all_df ["PosB" ] >= all_df ["orig_start" ]) & (all_df ["PosB" ] <= all_df ["orig_end" ]))
147
+ (
148
+ (all_df ["PosB" ] >= all_df ["orig_start" ])
149
+ & (all_df ["PosB" ] <= all_df ["orig_end" ])
150
+ )
123
151
| ((all_df ["orig_start" ] == 0 ) & (all_df ["orig_end" ] == 0 ))
124
152
]
125
153
126
154
all_df [["PosA" , "PosB" ]] = all_df [["PosA" , "PosB" ]].replace (0 , np .nan )
127
155
all_df = all_df .replace ("" , np .nan )
128
156
129
- all_df [["exon_number" , "transcript_version" ]] = all_df [["exon_number" , "transcript_version" ]].replace (0 , np .nan )
157
+ all_df [["exon_number" , "transcript_version" ]] = all_df [
158
+ ["exon_number" , "transcript_version" ]
159
+ ].replace (0 , np .nan )
130
160
# Fill non-empty values within each group for 'exon_number' and 'transcript_version'
131
161
all_df ["exon_number" ] = all_df .groupby ("PosB" )["exon_number" ].transform (
132
162
lambda x : x .fillna (method = "ffill" ).fillna (method = "bfill" )
133
163
)
134
- all_df ["transcript_version" ] = all_df .groupby ("PosB" )["transcript_version" ]. transform (
135
- lambda x : x . fillna ( method = "ffill" ). fillna ( method = "bfill" )
136
- )
164
+ all_df ["transcript_version" ] = all_df .groupby ("PosB" )[
165
+ "transcript_version"
166
+ ]. transform ( lambda x : x . fillna ( method = "ffill" ). fillna ( method = "bfill" ) )
137
167
138
168
all_df = all_df .rename (columns = {"transcript_version" : "Right_transcript_version" })
139
169
all_df = all_df .rename (columns = {"exon_number" : "Right_exon_number" })
@@ -212,7 +242,9 @@ def parse_args(argv=None):
212
242
type = Path ,
213
243
help = "HGNC database." ,
214
244
)
215
- parser .add_argument ("--sample" , metavar = "SAMPLE" , type = Path , help = "Sample name." , default = "Sample" )
245
+ parser .add_argument (
246
+ "--sample" , metavar = "SAMPLE" , type = Path , help = "Sample name." , default = "Sample"
247
+ )
216
248
parser .add_argument (
217
249
"--out" ,
218
250
metavar = "OUT" ,
@@ -273,14 +305,28 @@ def build_fusioninspector_dataframe(file: str) -> pd.DataFrame:
273
305
df = pd .read_csv (file , sep = "\t " )
274
306
df = df .rename (columns = {"#FusionName" : "FUSION" })
275
307
if not (df .empty ):
276
- df [["ChromosomeA" , "PosA" , "Strand1" ]] = df ["LeftBreakpoint" ].str .split (":" , expand = True )
277
- df [["ChromosomeB" , "PosB" , "Strand2" ]] = df ["RightBreakpoint" ].str .split (":" , expand = True )
278
- df [["LeftGeneName" , "Left_ensembl_gene_id" ]] = df ["LeftGene" ].str .split ("^" , expand = True )
279
- df [["RightGeneName" , "Right_ensembl_gene_id" ]] = df ["RightGene" ].str .split ("^" , expand = True )
308
+ df [["ChromosomeA" , "PosA" , "Strand1" ]] = df ["LeftBreakpoint" ].str .split (
309
+ ":" , expand = True
310
+ )
311
+ df [["ChromosomeB" , "PosB" , "Strand2" ]] = df ["RightBreakpoint" ].str .split (
312
+ ":" , expand = True
313
+ )
314
+ df [["LeftGeneName" , "Left_ensembl_gene_id" ]] = df ["LeftGene" ].str .split (
315
+ "^" , expand = True
316
+ )
317
+ df [["RightGeneName" , "Right_ensembl_gene_id" ]] = df ["RightGene" ].str .split (
318
+ "^" , expand = True
319
+ )
280
320
df ["annots" ] = (
281
321
df ["annots" ]
282
322
.apply (convert_to_list )
283
- .apply (lambda x : "," .join (map (str , x )) if isinstance (x , list ) else str (x ) if pd .notna (x ) else "" )
323
+ .apply (
324
+ lambda x : (
325
+ "," .join (map (str , x ))
326
+ if isinstance (x , list )
327
+ else str (x ) if pd .notna (x ) else ""
328
+ )
329
+ )
284
330
)
285
331
else :
286
332
for i in [
@@ -304,7 +350,9 @@ def build_fusioninspector_dataframe(file: str) -> pd.DataFrame:
304
350
return df .set_index (["FUSION" ])
305
351
306
352
307
- def replace_value_with_column_name (row : pd .Series , value_to_replace : str , column_name : str ) -> str :
353
+ def replace_value_with_column_name (
354
+ row : pd .Series , value_to_replace : str , column_name : str
355
+ ) -> str :
308
356
"""
309
357
Replace a specific value in a row with the corresponding column name.
310
358
"""
@@ -334,9 +382,12 @@ def read_build_fusionreport(fusionreport_file: str) -> pd.DataFrame:
334
382
Make all column headers uppercase.
335
383
"""
336
384
with open (fusionreport_file ) as f :
337
- from_html = [line .split ('rows": [' )[1 ] for line in f if 'name="fusion_list' in line ]
338
- expression = ast .literal_eval (from_html [0 ].split ('], "tool' )[0 ])
339
- fusion_report = pd .DataFrame .from_dict ({k : [v ] for k , v in expression .items ()})
385
+ from_html = [
386
+ line .split ('rows": ' )[1 ] for line in f if 'name="fusion_list' in line
387
+ ]
388
+ tmp = str (from_html )[2 :]
389
+ tmp2 = tmp .split (', "tools": ' )[0 ]
390
+ fusion_report = pd .DataFrame (ast .literal_eval (tmp2 ))
340
391
if not "arriba" in fusion_report .columns :
341
392
fusion_report ["arriba" ] = ""
342
393
if not "fusioncatcher" in fusion_report .columns :
@@ -352,25 +403,31 @@ def read_build_fusionreport(fusionreport_file: str) -> pd.DataFrame:
352
403
fusion_report ["starfusion" ] = fusion_report [["starfusion" ]].apply (
353
404
replace_value_with_column_name , args = ("true" , "starfusion" ), axis = 1
354
405
)
355
- fusion_report ["FOUND_IN" ] = fusion_report [[ "arriba" , "starfusion" , "fusioncatcher" ]]. apply (
356
- concatenate_columns , axis = 1
357
- )
406
+ fusion_report ["FOUND_IN" ] = fusion_report [
407
+ [ "arriba" , "starfusion" , "fusioncatcher" ]
408
+ ]. apply ( concatenate_columns , axis = 1 )
358
409
fusion_report .columns = fusion_report .columns .str .upper ()
359
- fusion_report ["FOUND_DB" ] = fusion_report ["FOUND_DB" ].apply (lambda x : "," . join ( x ))
360
- fusion_report [[ "GeneA" , "GeneB" ]] = fusion_report [ "FUSION" ]. str . split ( "--" , expand = True )
361
-
362
- return fusion_report [["FUSION" , " GeneA" , "GeneB" , "TOOLS_HITS" , "SCORE" , "FOUND_DB" , "FOUND_IN" ]]. set_index (
363
- [ "FUSION" ]
410
+ fusion_report ["FOUND_DB" ] = fusion_report ["FOUND_DB" ].apply (
411
+ lambda x : "," . join ( x ) if len ( x ) > 0 else ""
412
+ )
413
+ fusion_report [["GeneA" , "GeneB" ]] = fusion_report [ "FUSION" ]. str . split (
414
+ "--" , expand = True
364
415
)
365
416
417
+ return fusion_report [
418
+ ["FUSION" , "GeneA" , "GeneB" , "TOOLS_HITS" , "SCORE" , "FOUND_DB" , "FOUND_IN" ]
419
+ ].set_index (["FUSION" ])
420
+
366
421
367
422
def read_fusionreport_csv (file : str ) -> pd .DataFrame :
368
423
df = pd .read_csv (file )
369
424
columns_to_iterate = ["starfusion" , "arriba" , "fusioncatcher" ]
370
425
for column in columns_to_iterate :
371
426
if column not in df .columns :
372
427
df [column ] = ""
373
- df [["starfusion" , "arriba" , "fusioncatcher" ]] = df [["starfusion" , "arriba" , "fusioncatcher" ]].astype ("str" )
428
+ df [["starfusion" , "arriba" , "fusioncatcher" ]] = df [
429
+ ["starfusion" , "arriba" , "fusioncatcher" ]
430
+ ].astype ("str" )
374
431
for index , row in df .iterrows ():
375
432
for column in columns_to_iterate :
376
433
cell_value = row [column ]
@@ -398,7 +455,18 @@ def read_fusionreport_csv(file: str) -> pd.DataFrame:
398
455
df [["GeneA" , "GeneB" ]] = df ["Fusion" ].str .split ("--" , expand = True )
399
456
df = df .set_index ("Fusion" )
400
457
df .to_csv ("tmp.csv" )
401
- return df [["GeneA" , "GeneB" , "ChromosomeA" , "PosA" , "StrandA" , "ChromosomeB" , "PosB" , "StrandB" ]]
458
+ return df [
459
+ [
460
+ "GeneA" ,
461
+ "GeneB" ,
462
+ "ChromosomeA" ,
463
+ "PosA" ,
464
+ "StrandA" ,
465
+ "ChromosomeB" ,
466
+ "PosB" ,
467
+ "StrandB" ,
468
+ ]
469
+ ]
402
470
403
471
404
472
def column_manipulation (df : pd .DataFrame ) -> pd .DataFrame :
@@ -424,8 +492,12 @@ def column_manipulation(df: pd.DataFrame) -> pd.DataFrame:
424
492
df ["Right_hgnc_id" ] = df ["Right_hgnc_id" ].fillna (0 ).astype (int ).astype (str )
425
493
df ["Left_exon_number" ] = df ["Left_exon_number" ].fillna (0 ).astype (int ).astype (str )
426
494
df ["Right_exon_number" ] = df ["Right_exon_number" ].fillna (0 ).astype (int ).astype (str )
427
- df ["Left_transcript_version" ] = df ["Left_transcript_version" ].fillna (0 ).astype (int ).astype (str )
428
- df ["Right_transcript_version" ] = df ["Right_transcript_version" ].fillna (0 ).astype (int ).astype (str )
495
+ df ["Left_transcript_version" ] = (
496
+ df ["Left_transcript_version" ].fillna (0 ).astype (int ).astype (str )
497
+ )
498
+ df ["Right_transcript_version" ] = (
499
+ df ["Right_transcript_version" ].fillna (0 ).astype (int ).astype (str )
500
+ )
429
501
df ["PosA" ] = df ["PosA" ].fillna (0 ).astype (int ).astype (str )
430
502
df ["PosB" ] = df ["PosB" ].fillna (0 ).astype (int ).astype (str )
431
503
df ["PROT_FUSION_TYPE" ] = df ["PROT_FUSION_TYPE" ].replace ("." , "nan" )
@@ -452,7 +524,9 @@ def column_manipulation(df: pd.DataFrame) -> pd.DataFrame:
452
524
f"EXON_NUMBER_A={ row ['Left_exon_number' ]} ;EXON_NUMBER_B={ row ['Right_exon_number' ]} ;"
453
525
f"ANNOTATIONS={ row ['annots' ]} "
454
526
)
455
- df .loc [index , "Sample" ] = f"./1:{ row ['JunctionReadCount' ]} :{ row ['SpanningFragCount' ]} :{ row ['FFPM' ]} "
527
+ df .loc [index , "Sample" ] = (
528
+ f"./1:{ row ['JunctionReadCount' ]} :{ row ['SpanningFragCount' ]} :{ row ['FFPM' ]} "
529
+ )
456
530
457
531
return df
458
532
@@ -474,7 +548,9 @@ def write_vcf(df_to_print: pd.DataFrame, header: str, out_file: str) -> None:
474
548
"FORMAT" ,
475
549
"Sample" ,
476
550
]
477
- ].to_csv (path_or_buf = out_file , sep = "\t " , header = None , index = False , quoting = csv .QUOTE_NONE )
551
+ ].to_csv (
552
+ path_or_buf = out_file , sep = "\t " , header = None , index = False , quoting = csv .QUOTE_NONE
553
+ )
478
554
479
555
with open (out_file , "r+" ) as f :
480
556
content = f .read ()
@@ -496,9 +572,15 @@ def build_gtf_dataframe(file: str) -> pd.DataFrame:
496
572
Build a DataFrame from GTF file converted in TSV, extracting relevant columns.
497
573
"""
498
574
df = pd .read_csv (file , sep = "\t " )
499
- df [["fusion_dump" , "Transcript_id" ]] = df ["transcript_id" ].str .split ("^" , expand = True )
500
- df [["orig_chromosome" , "orig_start" , "orig_end" , "orig_dir" ]] = df ["orig_coord_info" ].str .split ("," , expand = True )
501
- return df [["Transcript_id" , "transcript_version" , "exon_number" , "orig_start" , "orig_end" ]]
575
+ df [["fusion_dump" , "Transcript_id" ]] = df ["transcript_id" ].str .split (
576
+ "^" , expand = True
577
+ )
578
+ df [["orig_chromosome" , "orig_start" , "orig_end" , "orig_dir" ]] = df [
579
+ "orig_coord_info"
580
+ ].str .split ("," , expand = True )
581
+ return df [
582
+ ["Transcript_id" , "transcript_version" , "exon_number" , "orig_start" , "orig_end" ]
583
+ ]
502
584
503
585
504
586
def main (argv = None ):
@@ -511,7 +593,9 @@ def main(argv=None):
511
593
or not args .fusionreport_csv
512
594
or not args .hgnc
513
595
):
514
- logger .error (f"The given input file { args .fusioninspector } or { args .fusionreport } was not found!" )
596
+ logger .error (
597
+ f"The given input file { args .fusioninspector } or { args .fusionreport } was not found!"
598
+ )
515
599
sys .exit (2 )
516
600
vcf_collect (
517
601
args .fusioninspector ,
0 commit comments