Skip to content

Commit fe3de40

Browse files
authored
Merge pull request #481 from nf-core/vcf_fix2
Vcf fix2
2 parents 5858c3c + 2b6ff48 commit fe3de40

File tree

2 files changed

+132
-47
lines changed

2 files changed

+132
-47
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1616
- fix bug when using parameter "whitelist" [#466](https://github.com/nf-core/rnafusion/pull/466)
1717
- fix VCF_COLLECT handling when a tool is absent from FUSIONREPORT report [#458](https://github.com/nf-core/rnafusion/pull/458)
1818
- fix VCF_COLLECT when fusioninspector output is empty but fusionreport is not [#465](https://github.com/nf-core/rnafusion/pull/465)
19+
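- fix VCF_COLLECT parsing of the `fusion_list` rows in the FUSIONREPORT HTML report and handling of empty `FOUND_DB` entries [#481](https://github.com/nf-core/rnafusion/pull/481)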
1920

2021
### Removed
2122

bin/vcf_collect.py

+131-47
Original file line numberDiff line numberDiff line change
@@ -47,42 +47,61 @@ def vcf_collect(
4747
df_not_symbol = merged_df[merged_df["Left_ensembl_gene_id"].notna()]
4848

4949
df_not_symbol = hgnc_df.merge(
50-
df_not_symbol, how="right", left_on="ensembl_gene_id", right_on="Left_ensembl_gene_id"
50+
df_not_symbol,
51+
how="right",
52+
left_on="ensembl_gene_id",
53+
right_on="Left_ensembl_gene_id",
54+
)
55+
df_symbol = hgnc_df.merge(
56+
df_symbol, how="right", left_on="symbol", right_on="GeneA"
5157
)
52-
df_symbol = hgnc_df.merge(df_symbol, how="right", left_on="symbol", right_on="GeneA")
5358
df = pd.concat([df_not_symbol, df_symbol])
5459
df = df.rename(columns={"hgnc_id": "Left_hgnc_id"})
5560

5661
df_symbol = df[df["Right_ensembl_gene_id"].isna()]
5762
df_not_symbol = df[df["Right_ensembl_gene_id"].notna()]
5863

5964
df_not_symbol = hgnc_df.merge(
60-
df_not_symbol, how="right", left_on="ensembl_gene_id", right_on="Right_ensembl_gene_id"
65+
df_not_symbol,
66+
how="right",
67+
left_on="ensembl_gene_id",
68+
right_on="Right_ensembl_gene_id",
69+
)
70+
df_symbol = hgnc_df.merge(
71+
df_symbol, how="right", left_on="symbol", right_on="GeneB"
6172
)
62-
df_symbol = hgnc_df.merge(df_symbol, how="right", left_on="symbol", right_on="GeneB")
6373
df = pd.concat([df_not_symbol, df_symbol])
6474
df = df.rename(columns={"hgnc_id": "Right_hgnc_id"})
6575

6676
gtf_df = build_gtf_dataframe(gtf)
67-
all_df = df.merge(gtf_df, how="left", left_on="CDS_LEFT_ID", right_on="Transcript_id")
68-
all_df[["PosA", "orig_start", "orig_end"]] = all_df[["PosA", "orig_start", "orig_end"]].fillna(0).astype(int)
77+
all_df = df.merge(
78+
gtf_df, how="left", left_on="CDS_LEFT_ID", right_on="Transcript_id"
79+
)
80+
all_df[["PosA", "orig_start", "orig_end"]] = (
81+
all_df[["PosA", "orig_start", "orig_end"]].fillna(0).astype(int)
82+
)
6983

7084
all_df = all_df[
71-
((all_df["PosA"] >= all_df["orig_start"]) & (all_df["PosA"] <= all_df["orig_end"]))
85+
(
86+
(all_df["PosA"] >= all_df["orig_start"])
87+
& (all_df["PosA"] <= all_df["orig_end"])
88+
)
7289
| ((all_df["orig_start"] == 0) & (all_df["orig_end"] == 0))
7390
]
7491

7592
all_df.replace("", np.nan, inplace=True)
7693
all_df = all_df.drop_duplicates()
7794

78-
all_df[["exon_number", "transcript_version"]] = all_df[["exon_number", "transcript_version"]].replace(0, np.nan)
95+
all_df[["exon_number", "transcript_version"]] = all_df[
96+
["exon_number", "transcript_version"]
97+
].replace(0, np.nan)
7998
# Fill missing 'exon_number' and 'transcript_version' values within each PosA group from neighbouring non-empty values (forward fill, then backward fill)
8099
all_df["exon_number"] = all_df.groupby("PosA")["exon_number"].transform(
81100
lambda x: x.fillna(method="ffill").fillna(method="bfill")
82101
)
83-
all_df["transcript_version"] = all_df.groupby("PosA")["transcript_version"].transform(
84-
lambda x: x.fillna(method="ffill").fillna(method="bfill")
85-
)
102+
all_df["transcript_version"] = all_df.groupby("PosA")[
103+
"transcript_version"
104+
].transform(lambda x: x.fillna(method="ffill").fillna(method="bfill"))
86105

87106
all_df = all_df.rename(columns={"transcript_version": "Left_transcript_version"})
88107
all_df = all_df.rename(columns={"exon_number": "Left_exon_number"})
@@ -115,25 +134,36 @@ def vcf_collect(
115134
]
116135
].drop_duplicates()
117136
all_df["CDS_RIGHT_ID"] = all_df["CDS_RIGHT_ID"].astype("str")
118-
all_df = all_df.merge(gtf_df, how="left", left_on="CDS_RIGHT_ID", right_on="Transcript_id")
119-
all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].fillna(0)
120-
all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].astype(int)
137+
all_df = all_df.merge(
138+
gtf_df, how="left", left_on="CDS_RIGHT_ID", right_on="Transcript_id"
139+
)
140+
all_df[["PosB", "orig_start", "orig_end"]] = all_df[
141+
["PosB", "orig_start", "orig_end"]
142+
].fillna(0)
143+
all_df[["PosB", "orig_start", "orig_end"]] = all_df[
144+
["PosB", "orig_start", "orig_end"]
145+
].astype(int)
121146
all_df = all_df[
122-
((all_df["PosB"] >= all_df["orig_start"]) & (all_df["PosB"] <= all_df["orig_end"]))
147+
(
148+
(all_df["PosB"] >= all_df["orig_start"])
149+
& (all_df["PosB"] <= all_df["orig_end"])
150+
)
123151
| ((all_df["orig_start"] == 0) & (all_df["orig_end"] == 0))
124152
]
125153

126154
all_df[["PosA", "PosB"]] = all_df[["PosA", "PosB"]].replace(0, np.nan)
127155
all_df = all_df.replace("", np.nan)
128156

129-
all_df[["exon_number", "transcript_version"]] = all_df[["exon_number", "transcript_version"]].replace(0, np.nan)
157+
all_df[["exon_number", "transcript_version"]] = all_df[
158+
["exon_number", "transcript_version"]
159+
].replace(0, np.nan)
130160
# Fill missing 'exon_number' and 'transcript_version' values within each PosB group from neighbouring non-empty values (forward fill, then backward fill)
131161
all_df["exon_number"] = all_df.groupby("PosB")["exon_number"].transform(
132162
lambda x: x.fillna(method="ffill").fillna(method="bfill")
133163
)
134-
all_df["transcript_version"] = all_df.groupby("PosB")["transcript_version"].transform(
135-
lambda x: x.fillna(method="ffill").fillna(method="bfill")
136-
)
164+
all_df["transcript_version"] = all_df.groupby("PosB")[
165+
"transcript_version"
166+
].transform(lambda x: x.fillna(method="ffill").fillna(method="bfill"))
137167

138168
all_df = all_df.rename(columns={"transcript_version": "Right_transcript_version"})
139169
all_df = all_df.rename(columns={"exon_number": "Right_exon_number"})
@@ -212,7 +242,9 @@ def parse_args(argv=None):
212242
type=Path,
213243
help="HGNC database.",
214244
)
215-
parser.add_argument("--sample", metavar="SAMPLE", type=Path, help="Sample name.", default="Sample")
245+
parser.add_argument(
246+
"--sample", metavar="SAMPLE", type=Path, help="Sample name.", default="Sample"
247+
)
216248
parser.add_argument(
217249
"--out",
218250
metavar="OUT",
@@ -273,14 +305,28 @@ def build_fusioninspector_dataframe(file: str) -> pd.DataFrame:
273305
df = pd.read_csv(file, sep="\t")
274306
df = df.rename(columns={"#FusionName": "FUSION"})
275307
if not (df.empty):
276-
df[["ChromosomeA", "PosA", "Strand1"]] = df["LeftBreakpoint"].str.split(":", expand=True)
277-
df[["ChromosomeB", "PosB", "Strand2"]] = df["RightBreakpoint"].str.split(":", expand=True)
278-
df[["LeftGeneName", "Left_ensembl_gene_id"]] = df["LeftGene"].str.split("^", expand=True)
279-
df[["RightGeneName", "Right_ensembl_gene_id"]] = df["RightGene"].str.split("^", expand=True)
308+
df[["ChromosomeA", "PosA", "Strand1"]] = df["LeftBreakpoint"].str.split(
309+
":", expand=True
310+
)
311+
df[["ChromosomeB", "PosB", "Strand2"]] = df["RightBreakpoint"].str.split(
312+
":", expand=True
313+
)
314+
df[["LeftGeneName", "Left_ensembl_gene_id"]] = df["LeftGene"].str.split(
315+
"^", expand=True
316+
)
317+
df[["RightGeneName", "Right_ensembl_gene_id"]] = df["RightGene"].str.split(
318+
"^", expand=True
319+
)
280320
df["annots"] = (
281321
df["annots"]
282322
.apply(convert_to_list)
283-
.apply(lambda x: ",".join(map(str, x)) if isinstance(x, list) else str(x) if pd.notna(x) else "")
323+
.apply(
324+
lambda x: (
325+
",".join(map(str, x))
326+
if isinstance(x, list)
327+
else str(x) if pd.notna(x) else ""
328+
)
329+
)
284330
)
285331
else:
286332
for i in [
@@ -304,7 +350,9 @@ def build_fusioninspector_dataframe(file: str) -> pd.DataFrame:
304350
return df.set_index(["FUSION"])
305351

306352

307-
def replace_value_with_column_name(row: pd.Series, value_to_replace: str, column_name: str) -> str:
353+
def replace_value_with_column_name(
354+
row: pd.Series, value_to_replace: str, column_name: str
355+
) -> str:
308356
"""
309357
Replace a specific value in a row with the corresponding column name.
310358
"""
@@ -334,9 +382,12 @@ def read_build_fusionreport(fusionreport_file: str) -> pd.DataFrame:
334382
Make all column headers uppercase.
335383
"""
336384
with open(fusionreport_file) as f:
337-
from_html = [line.split('rows": [')[1] for line in f if 'name="fusion_list' in line]
338-
expression = ast.literal_eval(from_html[0].split('], "tool')[0])
339-
fusion_report = pd.DataFrame.from_dict({k: [v] for k, v in expression.items()})
385+
from_html = [
386+
line.split('rows": ')[1] for line in f if 'name="fusion_list' in line
387+
]
388+
tmp = str(from_html)[2:]
389+
tmp2 = tmp.split(', "tools": ')[0]
390+
fusion_report = pd.DataFrame(ast.literal_eval(tmp2))
340391
if not "arriba" in fusion_report.columns:
341392
fusion_report["arriba"] = ""
342393
if not "fusioncatcher" in fusion_report.columns:
@@ -352,25 +403,31 @@ def read_build_fusionreport(fusionreport_file: str) -> pd.DataFrame:
352403
fusion_report["starfusion"] = fusion_report[["starfusion"]].apply(
353404
replace_value_with_column_name, args=("true", "starfusion"), axis=1
354405
)
355-
fusion_report["FOUND_IN"] = fusion_report[["arriba", "starfusion", "fusioncatcher"]].apply(
356-
concatenate_columns, axis=1
357-
)
406+
fusion_report["FOUND_IN"] = fusion_report[
407+
["arriba", "starfusion", "fusioncatcher"]
408+
].apply(concatenate_columns, axis=1)
358409
fusion_report.columns = fusion_report.columns.str.upper()
359-
fusion_report["FOUND_DB"] = fusion_report["FOUND_DB"].apply(lambda x: ",".join(x))
360-
fusion_report[["GeneA", "GeneB"]] = fusion_report["FUSION"].str.split("--", expand=True)
361-
362-
return fusion_report[["FUSION", "GeneA", "GeneB", "TOOLS_HITS", "SCORE", "FOUND_DB", "FOUND_IN"]].set_index(
363-
["FUSION"]
410+
fusion_report["FOUND_DB"] = fusion_report["FOUND_DB"].apply(
411+
lambda x: ",".join(x) if len(x) > 0 else ""
412+
)
413+
fusion_report[["GeneA", "GeneB"]] = fusion_report["FUSION"].str.split(
414+
"--", expand=True
364415
)
365416

417+
return fusion_report[
418+
["FUSION", "GeneA", "GeneB", "TOOLS_HITS", "SCORE", "FOUND_DB", "FOUND_IN"]
419+
].set_index(["FUSION"])
420+
366421

367422
def read_fusionreport_csv(file: str) -> pd.DataFrame:
368423
df = pd.read_csv(file)
369424
columns_to_iterate = ["starfusion", "arriba", "fusioncatcher"]
370425
for column in columns_to_iterate:
371426
if column not in df.columns:
372427
df[column] = ""
373-
df[["starfusion", "arriba", "fusioncatcher"]] = df[["starfusion", "arriba", "fusioncatcher"]].astype("str")
428+
df[["starfusion", "arriba", "fusioncatcher"]] = df[
429+
["starfusion", "arriba", "fusioncatcher"]
430+
].astype("str")
374431
for index, row in df.iterrows():
375432
for column in columns_to_iterate:
376433
cell_value = row[column]
@@ -398,7 +455,18 @@ def read_fusionreport_csv(file: str) -> pd.DataFrame:
398455
df[["GeneA", "GeneB"]] = df["Fusion"].str.split("--", expand=True)
399456
df = df.set_index("Fusion")
400457
df.to_csv("tmp.csv")
401-
return df[["GeneA", "GeneB", "ChromosomeA", "PosA", "StrandA", "ChromosomeB", "PosB", "StrandB"]]
458+
return df[
459+
[
460+
"GeneA",
461+
"GeneB",
462+
"ChromosomeA",
463+
"PosA",
464+
"StrandA",
465+
"ChromosomeB",
466+
"PosB",
467+
"StrandB",
468+
]
469+
]
402470

403471

404472
def column_manipulation(df: pd.DataFrame) -> pd.DataFrame:
@@ -424,8 +492,12 @@ def column_manipulation(df: pd.DataFrame) -> pd.DataFrame:
424492
df["Right_hgnc_id"] = df["Right_hgnc_id"].fillna(0).astype(int).astype(str)
425493
df["Left_exon_number"] = df["Left_exon_number"].fillna(0).astype(int).astype(str)
426494
df["Right_exon_number"] = df["Right_exon_number"].fillna(0).astype(int).astype(str)
427-
df["Left_transcript_version"] = df["Left_transcript_version"].fillna(0).astype(int).astype(str)
428-
df["Right_transcript_version"] = df["Right_transcript_version"].fillna(0).astype(int).astype(str)
495+
df["Left_transcript_version"] = (
496+
df["Left_transcript_version"].fillna(0).astype(int).astype(str)
497+
)
498+
df["Right_transcript_version"] = (
499+
df["Right_transcript_version"].fillna(0).astype(int).astype(str)
500+
)
429501
df["PosA"] = df["PosA"].fillna(0).astype(int).astype(str)
430502
df["PosB"] = df["PosB"].fillna(0).astype(int).astype(str)
431503
df["PROT_FUSION_TYPE"] = df["PROT_FUSION_TYPE"].replace(".", "nan")
@@ -452,7 +524,9 @@ def column_manipulation(df: pd.DataFrame) -> pd.DataFrame:
452524
f"EXON_NUMBER_A={row['Left_exon_number']};EXON_NUMBER_B={row['Right_exon_number']};"
453525
f"ANNOTATIONS={row['annots']}"
454526
)
455-
df.loc[index, "Sample"] = f"./1:{row['JunctionReadCount']}:{row['SpanningFragCount']}:{row['FFPM']}"
527+
df.loc[index, "Sample"] = (
528+
f"./1:{row['JunctionReadCount']}:{row['SpanningFragCount']}:{row['FFPM']}"
529+
)
456530

457531
return df
458532

@@ -474,7 +548,9 @@ def write_vcf(df_to_print: pd.DataFrame, header: str, out_file: str) -> None:
474548
"FORMAT",
475549
"Sample",
476550
]
477-
].to_csv(path_or_buf=out_file, sep="\t", header=None, index=False, quoting=csv.QUOTE_NONE)
551+
].to_csv(
552+
path_or_buf=out_file, sep="\t", header=None, index=False, quoting=csv.QUOTE_NONE
553+
)
478554

479555
with open(out_file, "r+") as f:
480556
content = f.read()
@@ -496,9 +572,15 @@ def build_gtf_dataframe(file: str) -> pd.DataFrame:
496572
Build a DataFrame from GTF file converted in TSV, extracting relevant columns.
497573
"""
498574
df = pd.read_csv(file, sep="\t")
499-
df[["fusion_dump", "Transcript_id"]] = df["transcript_id"].str.split("^", expand=True)
500-
df[["orig_chromosome", "orig_start", "orig_end", "orig_dir"]] = df["orig_coord_info"].str.split(",", expand=True)
501-
return df[["Transcript_id", "transcript_version", "exon_number", "orig_start", "orig_end"]]
575+
df[["fusion_dump", "Transcript_id"]] = df["transcript_id"].str.split(
576+
"^", expand=True
577+
)
578+
df[["orig_chromosome", "orig_start", "orig_end", "orig_dir"]] = df[
579+
"orig_coord_info"
580+
].str.split(",", expand=True)
581+
return df[
582+
["Transcript_id", "transcript_version", "exon_number", "orig_start", "orig_end"]
583+
]
502584

503585

504586
def main(argv=None):
@@ -511,7 +593,9 @@ def main(argv=None):
511593
or not args.fusionreport_csv
512594
or not args.hgnc
513595
):
514-
logger.error(f"The given input file {args.fusioninspector} or {args.fusionreport} was not found!")
596+
logger.error(
597+
f"The given input file {args.fusioninspector} or {args.fusionreport} was not found!"
598+
)
515599
sys.exit(2)
516600
vcf_collect(
517601
args.fusioninspector,

0 commit comments

Comments
 (0)