Skip to content

Commit 52ac390

Browse files
committed
documentation of MaxEntScan support
1 parent 5d32a69 commit 52ac390

File tree

8 files changed

+47
-6
lines changed

8 files changed

+47
-6
lines changed

pcgr/annoutils.py

+9
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,7 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
236236
csq_record['CDS_RELATIVE_POSITION'] = '.'
237237
csq_record['LOSS_OF_FUNCTION'] = False
238238
csq_record['LOF_FILTER'] = '.'
239+
csq_record['MAXENTSCAN'] = '.'
239240

240241
splice_variant = False
241242
#print(csq_record.keys())
@@ -262,25 +263,33 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
262263
if re.search(pcgr_vars.CSQ_NULL_PATTERN, str(csq_record['Consequence'])) is not None:
263264
csq_record['NULL_VARIANT'] = True
264265

266+
if not csq_record['MaxEntScan_diff'] is None and not csq_record['MaxEntScan_ref'] is None and not csq_record['MaxEntScan_alt'] is None:
267+
csq_record['MAXENTSCAN'] = 'MES|' + str(csq_record['MaxEntScan_diff']) + '|' + \
268+
str(csq_record['MaxEntScan_ref']) + '|' + str(csq_record['MaxEntScan_alt'])
269+
265270
if re.search(pcgr_vars.CSQ_SPLICE_DONOR_PATTERN, str(csq_record['Consequence'])) is not None:
266271
if re.search(r'(\+3(A|G)>|\+4A>|\+5G>)', str(csq_record['HGVSc'])) is not None:
267272
csq_record['SPLICE_DONOR_RELEVANT'] = True
268273
if not csq_record['MaxEntScan_diff'] is None and re.search('splice_donor_(5th|variant)',str(csq_record['Consequence'])) is not None:
269274
if abs(csq_record['MaxEntScan_diff']) >= pcgr_vars.DONOR_DISRUPTION_MES_CUTOFF:
270275
csq_record['LOSS_OF_FUNCTION'] = True
276+
csq_record['MAXENTSCAN'] = str(csq_record['MAXENTSCAN']) + '|DONOR_DISRUPTING'
271277
else:
272278
if csq_record['LOSS_OF_FUNCTION'] is True:
273279
csq_record['LOSS_OF_FUNCTION'] = False
274280
csq_record['LOF_FILTER'] = "NON_DONOR_DISRUPTING"
281+
csq_record['MAXENTSCAN'] = str(csq_record['MAXENTSCAN']) + '|NON_DONOR_DISRUPTING'
275282

276283
if re.search(pcgr_vars.CSQ_SPLICE_ACCEPTOR_PATTERN, str(csq_record['Consequence'])) is not None:
277284
if not csq_record['MaxEntScan_diff'] is None and re.search('splice_acceptor', str(csq_record['Consequence'])) is not None:
278285
if abs(csq_record['MaxEntScan_diff']) >= pcgr_vars.ACCEPTOR_DISRUPTION_MES_CUTOFF:
279286
csq_record['LOSS_OF_FUNCTION'] = True
287+
csq_record['MAXENTSCAN'] = str(csq_record['MAXENTSCAN']) + '|ACCEPTOR_DISRUPTING'
280288
else:
281289
if csq_record['LOSS_OF_FUNCTION'] is True:
282290
csq_record['LOSS_OF_FUNCTION'] = False
283291
csq_record['LOF_FILTER'] = "NON_ACCEPTOR_DISRUPTING"
292+
csq_record['MAXENTSCAN'] = str(csq_record['MAXENTSCAN']) + '|NON_ACCEPTOR_DISRUPTING'
284293

285294
if re.search(pcgr_vars.CSQ_SPLICE_REGION_PATTERN, str(csq_record['Consequence'])) is not None:
286295
match = re.search(

pcgrr/R/input_data.R

+15
Original file line numberDiff line numberDiff line change
@@ -640,6 +640,21 @@ load_dna_variants <- function(
640640
)
641641
}
642642

643+
if ("MAXENTSCAN" %in% colnames(results[['variant']]) &
644+
"SPLICE_EFFECT" %in% colnames(results[['variant']])) {
645+
results[['variant']] <-
646+
results[['variant']] |>
647+
dplyr::mutate(
648+
SPLICE_EFFECT = paste(
649+
SPLICE_EFFECT, MAXENTSCAN, sep = ", "
650+
)) |>
651+
dplyr::mutate(
652+
SPLICE_EFFECT = stringr::str_replace_all(
653+
.data$SPLICE_EFFECT, "^(NA, )|^(NA, NA)$", ""
654+
)
655+
)
656+
}
657+
643658
## Rename annotations for more clarity
644659
if ("TSG_SUPPORT" %in% colnames(results[['variant']])) {
645660
results[['variant']] <-

pcgrr/data-raw/data-raw.R

+5
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,8 @@ data_coltype_defs[['snv_indel_somatic_raw']] <- readr::cols_only(
206206
IMPACT = readr::col_character(),
207207
LOSS_OF_FUNCTION = readr::col_logical(),
208208
LOF_FILTER = readr::col_character(),
209+
MAXENTSCAN = readr::col_character(),
210+
MaxEntScan_diff = readr::col_number(),
209211
SPLICE_DONOR_RELEVANT = readr::col_logical(),
210212
SPLICE_EFFECT_MUTSPLICEDB = readr::col_character(),
211213
NULL_VARIANT = readr::col_logical(),
@@ -313,6 +315,8 @@ data_coltype_defs[['snv_indel_germline_raw']] <- readr::cols_only(
313315
IMPACT = readr::col_character(),
314316
LOSS_OF_FUNCTION = readr::col_logical(),
315317
LOF_FILTER = readr::col_character(),
318+
MAXENTSCAN = readr::col_character(),
319+
MaxEntScan_diff = readr::col_number(),
316320
SPLICE_DONOR_RELEVANT = readr::col_logical(),
317321
SPLICE_EFFECT_MUTSPLICEDB = readr::col_character(),
318322
NULL_VARIANT = readr::col_logical(),
@@ -787,6 +791,7 @@ dt_display[['snv_indel_gene_actionable']] <-
787791
'HGVSc',
788792
'HGVSc_RefSeq',
789793
'PREDICTED_EFFECT',
794+
'SPLICE_EFFECT',
790795
'LOSS_OF_FUNCTION',
791796
'LOF_FILTER',
792797
'ONCOGENICITY',

pcgrr/data/data_coltype_defs.rda

44 Bytes
Binary file not shown.

pcgrr/inst/templates/pcgr_quarto_report/documentation.qmd

+4-2
Original file line numberDiff line numberDiff line change
@@ -107,15 +107,17 @@ For SNVs and InDels, PCGR considers the following consequence types as candidate
107107
* Frameshift variants - [SO:0001589](http://www.sequenceontology.org/miso/current_svn/term/SO:0001589)
108108
* Splice-site disruptions (2bp donor/acceptor site) - [SO:0001574](http://www.sequenceontology.org/miso/current_svn/term/SO:0001574),
109109
[SO:0001575](http://www.sequenceontology.org/miso/current_svn/term/SO:0001575)
110-
* Start losses - [SO:0001574](http://www.sequenceontology.org/miso/current_svn/term/SO:0002012)
110+
* Splice donor 5th base disruptions - [SO:0001787](http://www.sequenceontology.org/miso/current_svn/term/SO:0001787)
111+
* Start losses - [SO:0002012](http://www.sequenceontology.org/miso/current_svn/term/SO:0002012)
111112

112-
If variants of other consequence types (e.g. synonymous variants, or splice site variants beyond the canonical 2bp site) are found to affect splicing, specifically through records in [MutSpliceDB](https://brb.nci.nih.gov/splicing/), these are also marked as loss-of-function candidates.
113+
If variants of other consequence types (e.g. synonymous variants, or splice site variants beyond the canonical 2bp site/5th donor base) are found to affect splicing, specifically through records in [MutSpliceDB](https://brb.nci.nih.gov/splicing/), these are also marked as loss-of-function candidates.
113114

114115
A collection of filters is then applied, which can potentially remove the loss-of-function property for candidates identified above:
115116

116117
* Frameshifts/stop gains within the last 5% of the CDS
117118
* Splice site variants that are not predicted to affect a donor site (GC -> GT)
118119
* Variants where [MaxEntScan](https://pubmed.ncbi.nlm.nih.gov/15285897/) does not predict an effect on splicing
120+
- A MaxEntScan score difference (between reference and alternative allele) of at least 6 is considered disruptive for donor sites, while 7 is required for acceptor sites (thresholds adopted from [LOFTEE](https://github.com/konradjk/loftee))
119121
- Annotations in MutSpliceDB will have precedence if any conflicting evidence with MaxEntScan output is found
120122

121123
If a variant is filtered as non-LoF through any of these criteria, this will be evident from the `LOF_FILTER` variable (found in the interactive tables of the HTML report as well as the TSV/Excel output).

pcgrr/inst/templates/pcgr_quarto_report/snv_indel/oncogenicity.qmd

+10-2
Original file line numberDiff line numberDiff line change
@@ -239,11 +239,19 @@ noncoding_variant_set <-
239239
"PROTEIN_DOMAIN",
240240
"MUTATION_HOTSPOT",
241241
"MUTATION_HOTSPOT_CANCERTYPE",
242-
"HGVSc_RefSeq",
243-
"PREDICTED_EFFECT",
244242
"TARGETED_INHIBITORS",
245243
"TARGETED_INHIBITORS_ALL",
246244
"VEP_ALL_CSQ"))) |>
245+
dplyr::select(
246+
c("SYMBOL",
247+
"ALTERATION",
248+
"GENENAME",
249+
"CONSEQUENCE",
250+
"ONCOGENICITY",
251+
"COSMIC_ID",
252+
"LOSS_OF_FUNCTION"),
253+
dplyr::everything()
254+
) |>
247255
dplyr::arrange(
248256
dplyr::desc(.data$ONCOGENICITY_SCORE),
249257
dplyr::desc(.data$TISSUE_ASSOC_RANK),

pcgrr/vignettes/CHANGELOG.Rmd

+1
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ for oncogenicity classification
6767
- Added oncogenicity documentation in variant tables of HTML report (indicates which criteria were matched for a given variant)
6868
- Removed `VEP_ALL_CSQ` from variant tables in HTML report to reduce file size - still available in TSV output
6969
- Added MutSpliceDB splice site effects (used e.g. for loss-of-function annotation)
70+
- Added MaxEntScan plugin in VEP for splice site disruption prediction (used e.g. for loss-of-function annotation)
7071
- Added more prediction algorithms from dbNSFP
7172
- Fixed bug that caused crash for missing values in DP/AF values of input VCF
7273
- Fixed erroneous re-formatting of MAF integer columns to floats

pcgrr/vignettes/output.Rmd

+3-2
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,8 @@ A VCF file containing annotated, somatic calls (single nucleotide variants and i
189189
| `PFAM_DOMAIN` | Pfam domain identifier (from VEP) |
190190
| `INTOGEN_DRIVER_MUT` | Indicates if existing variant is predicted as driver mutation from IntoGen Catalog of Driver Mutations |
191191
| `EFFECT_PREDICTIONS` | In silico predictions of variant effect on protein function and pre-mRNA splicing from [database of non-synonymous functional predictions - dbNSFP v5.0](https://www.dbnsfp.org/). Predicted effects are provided by different sources/algorithms (separated by `&`), `T` = Tolerated, `N` = Neutral, `D` = Damaging |
192-
| `SPLICE_EFFECT` | Effect of splicing, from MutSpliceDB. Format: <ENTREZGENE>|<SYMBOL>|<REFSEQ_TRANSCRIPT_ID>|<HGVSc>|<EFFECT><AFFECTED_EXONS>|<SOURCE> |
192+
| `SPLICE_EFFECT` | Effect of splicing, from MutSpliceDB and/or MaxEntScan. Format: <ENTREZGENE>|<SYMBOL>|<REFSEQ_TRANSCRIPT_ID>|<HGVSc>|<EFFECT><AFFECTED_EXONS>|<SOURCE> (MutSpliceDB),
193+
MES|<SCORE_DIFF>|<SCORE_REF>|<SCORE_ALT>|<EFFECT> (MaxEntScan). |
193194
| `DBNSFP_BAYESDEL_ADDAF` | predicted effect from BayesDel (dbNSFP) |
194195
| `DBNSFP_LIST_S2` | predicted effect from LIST-S2 (dbNSFP) |
195196
| `DBNSFP_SIFT` | predicted effect from SIFT (dbNSFP) |
@@ -322,7 +323,7 @@ The following variables are included in the TSV file (VCF tags issued by the use
322323
| 48. `TUMOR_SUPPRESSOR_SUPPORT` | Tumor suppressor annotation support (CGC/CancerMine/NCG) |
323324
| 49. `TARGETED_INHIBITORS2` | Targeted inhibitors |
324325
| 50. `EFFECT_PREDICTIONS` | Variant effect predictions - from dbNSFP |
325-
| 51. `SPLICE_EFFECT` | Splice effect annotation from MutSpliceDB |
326+
| 51. `SPLICE_EFFECT` | Splice effect annotations from MutSpliceDB and MaxEntScan (see details above) |
326327
| 52. `REGULATORY_ANNOTATION` | Regulatory annotation |
327328
| 53. `VEP_ALL_CSQ` | VEP consequence - all transcripts |
328329
| 54. `gnomADe_AF` | gnomAD exomes allele frequency - globally |

0 commit comments

Comments
 (0)