diff --git a/.gitattributes b/.gitattributes index aed97b2a..57191e29 100644 --- a/.gitattributes +++ b/.gitattributes @@ -2,3 +2,8 @@ tests/db/compile/** filter=lfs diff=lfs merge=lfs -text tests/db/hpo/** filter=lfs diff=lfs merge=lfs -text tests/seqvars/ingest/db/grch37/** filter=lfs diff=lfs merge=lfs -text tests/seqvars/ingest/db/grch38/** filter=lfs diff=lfs merge=lfs -text +tests/strucvars/query/*.vcf filter=lfs diff=lfs merge=lfs -text +tests/strucvars/query/grch37/** filter=lfs diff=lfs merge=lfs -text +tests/strucvars/query/noref/** filter=lfs diff=lfs merge=lfs -text +tests/strucvars/query/db/** filter=lfs diff=lfs merge=lfs -text +src/strucvars/query/snapshots/*smoke_test*.snap filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md index 52f43719..568f3a1d 100644 --- a/README.md +++ b/README.md @@ -283,7 +283,49 @@ Convert output of [varfish-db-downloader](https://github.com/bihealth/varfish-db $ varfish-server-worker strucvars txt-to-bin \ --input-type {ClinvarSv,StrucvarInhouse,...} \ --path-input IN.txt \ - --path-output-bin DST.bin + --path-output DST.bin +``` + +## The `strucvars query` Command + +Run a query on a VCF file with structural variants as created by `strucvars ingest` using a varfish worker database. + +``` +$ varfish-server-worker strucvars query \ + --genome-release grch37 \ + --path-db path/to/worker-db \ + --path-input IN.vcf.gz \ + --path-output OUT.jsonl +``` + +The worker database has the following structure: + +``` +$ROOT/ + noref/ + genes/ + acmg.tsv -- ACMG SF list genes + mim2gene.tsv -- OMIM to NCBI mapping from clingen + xlink.bin -- gene crosslinks + {genome_release}/ -- one per genome release + mehari/ + txs.bin.zstd -- mehari transcripts + features/ -- features important for annotation + masked_repeat.bin -- masked repeats + masked_seqdup.bin -- masked segmental duplications + strucvars/ -- structural variant specific + bgdbs/ -- background databases + dbvar.bin -- dbVar + dgv.bin -- DGV + dgv-gs.bin -- DGV gold standard + exac.bin -- ExAC CNVs + g1k.bin -- 1000 genomes CNVs + gnomad.bin -- gnomAD-SVs + clinvar.bin -- ClinVar SVs + inhouse.bin -- inhouse SV database + patho-mms.bed -- well-known pathogenic DELs/DUPs + tads/ + hesc.bed -- hESC TAD definitions ``` # Developer Information diff --git a/build.rs b/build.rs index d1f75b46..2b3415e4 100644 --- a/build.rs +++ b/build.rs @@ -1,6 +1,8 @@ // The custom build script, needed as we use protocolbuffers. fn main() { + println!("cargo:rerun-if-changed=src/proto/varfish/v1/clinvar.proto"); + println!("cargo:rerun-if-changed=src/proto/varfish/v1/sv.proto"); prost_build::Config::new() .protoc_arg("-Isrc/proto") // Add serde serialization and deserialization to the generated code. @@ -9,7 +11,10 @@ fn main() { .type_attribute(".", "#[serde_with::skip_serializing_none]") // Define the protobuf files to compile. .compile_protos( - &["varfish/v1/clinvar.proto", "varfish/v1/sv.proto"], + &[ + "src/proto/varfish/v1/clinvar.proto", + "src/proto/varfish/v1/sv.proto", + ], &["src/"], ) .unwrap(); diff --git a/misc/ok-query-trio.json b/misc/ok-query-trio.json index ed1f64e0..df9007bb 100644 --- a/misc/ok-query-trio.json +++ b/misc/ok-query-trio.json @@ -51,14 +51,13 @@ "CNV" ], "tx_effects": [ - "transcript_ablation", - "transcript_translocation", + "transcript_variant", "exon_variant", "splice_region_variant", "intron_variant", "upstream_variant", "downstream_variant", - "intergenic_variant", + "intergenic_variant" ], "gene_allowlist": null, "genomic_region": null, diff --git a/src/common.rs b/src/common.rs index 9669e613..44175e9e 100644 --- a/src/common.rs +++ b/src/common.rs @@ -149,10 +149,10 @@ where )] pub enum GenomeRelease { // GRCh37 / hg19 - #[strum(serialize = "GRCh37")] + #[strum(serialize = "grch37")] Grch37, /// GRCh38 / hg38 - #[strum(serialize = "GRCh38")] + #[strum(serialize = "grch38")] Grch38, } @@ -207,6 +207,8 @@ pub enum Genotype { Het, /// hom. alt. HomAlt, + /// other, includes no-call + WithNoCall, } impl std::str::FromStr for Genotype { @@ -217,6 +219,7 @@ impl std::str::FromStr for Genotype { "0/0" | "0|0" | "0" => Genotype::HomRef, "0/1" | "1/0" | "0|1" | "1|0" => Genotype::Het, "1/1" | "1|1" | "1" => Genotype::HomAlt, + "./." | "./0" | "./1" | "0/." | "1/." => Genotype::WithNoCall, _ => anyhow::bail!("invalid genotype value: {:?}", s), }) } diff --git a/src/seqvars/aggregate/mod.rs b/src/seqvars/aggregate/mod.rs index e4646635..a7f5afdc 100644 --- a/src/seqvars/aggregate/mod.rs +++ b/src/seqvars/aggregate/mod.rs @@ -76,6 +76,7 @@ fn handle_record( }; let carrier_genotype = match (chrom, individual.sex, genotype) { + (_, _, Genotype::WithNoCall) => continue, // on the autosomes, male/female count the same (Chrom::Auto, _, Genotype::HomRef) => { res_counts.count_an += 2; diff --git a/src/seqvars/prefilter/mod.rs b/src/seqvars/prefilter/mod.rs index d1f2c824..374b4c7a 100644 --- a/src/seqvars/prefilter/mod.rs +++ b/src/seqvars/prefilter/mod.rs @@ -282,7 +282,6 @@ mod test { ); let params_file = tmpdir.to_path_buf().join("params.json"); - eprintln!("params_json = {}", ¶ms_json); std::fs::write(¶ms_file, ¶ms_json)?; let args = super::Args { diff --git a/src/strucvars/aggregate/output.rs b/src/strucvars/aggregate/output.rs index 379dd8b3..f5b8f85d 100644 --- a/src/strucvars/aggregate/output.rs +++ b/src/strucvars/aggregate/output.rs @@ -154,6 +154,7 @@ impl Record { }; match (chrom, individual.sex, genotype) { + (_, _, Genotype::WithNoCall) => continue, // on the autosomes, male/female count the same (Chrom::Auto, _, Genotype::HomRef) => (), (Chrom::Auto, _, Genotype::Het) => { diff --git a/src/strucvars/ingest/mod.rs b/src/strucvars/ingest/mod.rs index a17cacaa..535e9180 100644 --- a/src/strucvars/ingest/mod.rs +++ b/src/strucvars/ingest/mod.rs @@ -254,7 +254,6 @@ impl mehari::annotate::seqvars::AnnotatedVcfWriter for WriterWrapper { } let key_callers: vcf::record::info::field::Key = "callers".parse()?; - eprintln!("callers = {:?}", &input_record.info().get(&key_callers)); if let Some(Some(callers)) = input_record.info().get(&key_callers) { if let vcf::record::info::field::Value::Array( vcf::record::info::field::value::Array::String(callers), diff --git a/src/strucvars/query/bgdbs.rs b/src/strucvars/query/bgdbs.rs index ffc15cff..50311ec3 100644 --- a/src/strucvars/query/bgdbs.rs +++ b/src/strucvars/query/bgdbs.rs @@ -15,7 +15,7 @@ use crate::{ }; use super::{ - records::ChromRange, + schema::ChromRange, schema::{CaseQuery, StructuralVariant, SvType}, }; diff --git a/src/strucvars/query/clinvar.rs b/src/strucvars/query/clinvar.rs index 8db02b1d..df92e245 100644 --- a/src/strucvars/query/clinvar.rs +++ b/src/strucvars/query/clinvar.rs @@ -9,7 +9,7 @@ use tracing::{info, warn}; use crate::common::{reciprocal_overlap, GenomeRelease, CHROMS}; use super::{ - records::ChromRange, + schema::ChromRange, schema::{Pathogenicity, StructuralVariant, SvType}, }; diff --git a/src/strucvars/query/genes.rs b/src/strucvars/query/genes.rs index 4a926d85..12ffcf48 100644 --- a/src/strucvars/query/genes.rs +++ b/src/strucvars/query/genes.rs @@ -259,7 +259,7 @@ impl OmimDb { } #[tracing::instrument] -fn load_omim_db(path: &Path) -> Result { +fn load_mim2gene_db(path: &Path) -> Result { tracing::debug!("loading OMIM TSV records from {:?}...", path); let before_loading = Instant::now(); @@ -291,7 +291,7 @@ pub struct GeneDb { pub ensembl: GeneRegionDb, pub xlink: XlinkDb, pub acmg: AcmgDb, - pub omim: OmimDb, + pub mim2gene: OmimDb, } impl GeneDb { @@ -327,7 +327,11 @@ pub fn load_gene_db(path_db: &str, genome_release: GenomeRelease) -> Result, - /// Radius around BND sites used when building the database. #[arg(long, default_value_t = 50)] pub slack_bnd: i32, @@ -92,6 +92,9 @@ pub struct Args { /// Maximal distance to TAD to consider. #[arg(long, default_value_t = 10_000)] pub max_tad_distance: i32, + /// Optional seed for RNG. + #[arg(long)] + pub rng_seed: Option, } /// Gene information. @@ -156,10 +159,10 @@ struct ResultPayload { #[derive(Debug, Default, Serialize)] struct ResultRecord { sodar_uuid: Uuid, - svqueryresultset: u64, release: String, chromosome: String, chromosome_no: i32, + // TODO: remove bin bin: u32, chromosome2: String, chromosome_no2: i32, @@ -188,7 +191,7 @@ fn resolve_gene_id(database: Database, gene_db: &GeneDb, gene_id: u32) -> Vec Vec vec![Gene { ensembl_id: Some(format!("ENSG{gene_id:011}")), @@ -230,44 +233,50 @@ fn run_query( mehari_tx_db: &TxSeqDatabase, mehari_tx_idx: &TxIntervalTrees, chrom_to_acc: &HashMap, + rng: &mut rand::rngs::StdRng, ) -> Result { + let chrom_to_chrom_no = &CHROM_TO_CHROM_NO; let chrom_map = build_chrom_map(); let mut stats = QueryStats::default(); - // Construct reader and writer for CSV records - let mut csv_reader = csv::ReaderBuilder::new() - .has_headers(true) - .delimiter(b'\t') - .from_reader(open_read_maybe_gz(&args.path_input_svs)?); + let mut input_reader = open_read_maybe_gz(&args.path_input).map_err(|e| { + anyhow::anyhow!("could not open file {} for reading: {}", args.path_input, e) + })?; + let mut input_reader = vcf::Reader::new(&mut input_reader); + let input_header = input_reader.read_header()?; + let mut csv_writer = csv::WriterBuilder::new() .has_headers(true) .delimiter(b'\t') .quote_style(QuoteStyle::Never) - .from_path(&args.path_output_svs)?; + .from_path(&args.path_output)?; // Read through input records using the query interpreter as a filter - for record in csv_reader.deserialize() { + for input_record in input_reader.records(&input_header) { stats.count_total += 1; - let record_sv: RecordSv = record?; - let schema_sv: SchemaSv = record_sv.clone().into(); + let record_sv: StructuralVariant = + StructuralVariant::from_vcf(&input_record?, &input_header) + .map_err(|e| anyhow::anyhow!("could not parse VCF record: {}", e))?; + + tracing::debug!("processing record {:?}", record_sv); let mut result_payload = ResultPayload { - call_info: schema_sv.call_info.clone(), + call_info: record_sv.call_info.clone(), callers: record_sv.callers.clone(), ..ResultPayload::default() }; let mut ovl_gene_ids = Vec::new(); - let sv_query = if matches!(schema_sv.sv_type, SvType::Ins | SvType::Bnd) { - schema_sv.pos.saturating_sub(1)..schema_sv.pos + let sv_query = if matches!(record_sv.sv_type, SvType::Ins | SvType::Bnd) { + record_sv.pos.saturating_sub(1)..record_sv.pos } else { - schema_sv.pos.saturating_sub(1)..schema_sv.end + record_sv.pos.saturating_sub(1)..record_sv.end }; let passes = interpreter.passes( - &schema_sv, - &mut |sv: &SchemaSv| { + &record_sv, + &mut |sv: &StructuralVariant| { result_payload.overlap_counts = dbs.bg_dbs.count_overlaps( sv, &interpreter.query, @@ -277,12 +286,12 @@ fn run_query( ); result_payload.overlap_counts.clone() }, - &mut |sv: &SchemaSv| { + &mut |sv: &StructuralVariant| { result_payload.masked_breakpoints = dbs.masked.masked_breakpoint_count(sv, &chrom_map); result_payload.masked_breakpoints.clone() }, - &mut |sv: &SchemaSv| { + &mut |sv: &StructuralVariant| { ovl_gene_ids = dbs.genes.overlapping_gene_ids( interpreter.query.database, *chrom_map @@ -293,7 +302,7 @@ fn run_query( ovl_gene_ids.sort(); ovl_gene_ids.clone() }, - &mut |sv: &SchemaSv| { + &mut |sv: &StructuralVariant| { result_payload.tx_effects = compute_tx_effects(sv, mehari_tx_db, mehari_tx_idx, &dbs.genes, chrom_to_acc); let mut res = Vec::new(); @@ -307,8 +316,8 @@ fn run_query( )?; if passes.pass_all { - if schema_sv.sv_type != SvType::Ins && schema_sv.sv_type != SvType::Bnd { - result_payload.sv_length = Some((schema_sv.end - schema_sv.pos + 1) as u32); + if record_sv.sv_type != SvType::Ins && record_sv.sv_type != SvType::Bnd { + result_payload.sv_length = Some((record_sv.end - record_sv.pos + 1) as u32); } // Copy effective and compatible genotypes to output. @@ -323,15 +332,15 @@ fn run_query( // Count passing record in statistics stats.count_passed += 1; - *stats.by_sv_type.entry(schema_sv.sv_type).or_default() += 1; + *stats.by_sv_type.entry(record_sv.sv_type).or_default() += 1; // Get overlaps with known pathogenic SVs and ClinVar SVs result_payload.known_pathogenic = - dbs.patho_dbs.overlapping_records(&schema_sv, &chrom_map); + dbs.patho_dbs.overlapping_records(&record_sv, &chrom_map); result_payload.clinvar_ovl_rcvs = dbs .clinvar_sv .overlapping_rcvs( - &schema_sv, + &record_sv, &chrom_map, interpreter.query.clinvar_sv_min_pathogenicity, interpreter.query.clinvar_sv_min_overlap, @@ -345,7 +354,7 @@ fn run_query( let gene_ids: HashSet<_> = HashSet::from_iter(ovl_gene_ids.iter()); let tads = dbs.tad_sets - .overlapping_tads(TadSetChoice::Hesc, &schema_sv, &chrom_map); + .overlapping_tads(TadSetChoice::Hesc, &record_sv, &chrom_map); let mut tad_gene_ids = Vec::new(); tads.iter() .map(|tad| { @@ -364,7 +373,7 @@ fn run_query( }; result_payload.tad_boundary_distance = dbs.tad_sets - .boundary_dist(TadSetChoice::Hesc, &schema_sv, &chrom_map); + .boundary_dist(TadSetChoice::Hesc, &record_sv, &chrom_map); // Convert the genes into more verbose records and put them into the result ovl_gene_ids.iter().for_each(|gene_id| { @@ -399,23 +408,61 @@ fn run_query( } } + let (bin, bin2) = if record_sv.sv_type == SvType::Bnd { + ( + mehari::annotate::seqvars::binning::bin_from_range( + record_sv.pos as i32 - 2, + record_sv.pos as i32 - 1, + )? as u32, + mehari::annotate::seqvars::binning::bin_from_range( + record_sv.end as i32 - 1, + record_sv.end as i32, + )? as u32, + ) + } else if record_sv.sv_type == SvType::Ins { + ( + mehari::annotate::seqvars::binning::bin_from_range( + record_sv.pos as i32 - 2, + record_sv.pos as i32 - 1, + )? as u32, + 0, + ) + } else { + ( + mehari::annotate::seqvars::binning::bin_from_range( + record_sv.pos as i32 - 1, + record_sv.end as i32, + )? as u32, + 0, + ) + }; + // Finally, write out the record. + let mut uuid_buf = [0u8; 16]; + rng.fill_bytes(&mut uuid_buf); csv_writer.serialize(&ResultRecord { - sodar_uuid: uuid::Uuid::new_v4(), - svqueryresultset: args.result_set_id, - release: record_sv.release.clone(), - chromosome: record_sv.chromosome.clone(), - chromosome_no: record_sv.chromosome_no, - start: record_sv.start, - bin: record_sv.bin, + sodar_uuid: Uuid::from_bytes(uuid_buf), + release: match args.genome_release { + GenomeRelease::Grch37 => "GRCh37".into(), + GenomeRelease::Grch38 => "GRCh38".into(), + }, + chromosome: record_sv.chrom.clone(), + chromosome_no: *chrom_to_chrom_no + .get(&record_sv.chrom) + .expect("invalid chromosome") as i32, + start: record_sv.pos, + bin, chromosome2: record_sv - .chromosome2 - .unwrap_or(record_sv.chromosome) + .chrom2 + .as_ref() + .unwrap_or(&record_sv.chrom) .clone(), - chromosome_no2: record_sv.chromosome_no2, - bin2: record_sv.bin2, + chromosome_no2: *chrom_to_chrom_no + .get(&record_sv.chrom) + .expect("invalid chromosome") as i32, + bin2, end: record_sv.end, - pe_orientation: record_sv.pe_orientation, + pe_orientation: record_sv.strand_orientation, sv_type: record_sv.sv_type, sv_sub_type: record_sv.sv_sub_type, payload: serde_json::to_string(&result_payload)?, @@ -440,7 +487,7 @@ fn construct_gene(entrez_id: u32, gene_db: &GeneDb) -> Gene { ensembl_id: Some(format!("ENSG{:011}", record.ensembl_gene_id)), hgnc_id: Some(record.hgnc_id.clone()), is_acmg: gene_db.acmg.contains(record.entrez_id), - is_disease_gene: gene_db.omim.contains(record.entrez_id), + is_disease_gene: gene_db.mim2gene.contains(record.entrez_id), } } @@ -617,7 +664,7 @@ fn gene_tx_effect_for_range(tx: &Transcript, pos: i32, end: i32) -> Vec Result<(), anyhow: tracing::info!("args_common = {:?}", &args_common); tracing::info!("args = {:?}", &args); + // Initialize the random number generator from command line seed if given or local entropy + // source. + let mut rng = if let Some(rng_seed) = args.rng_seed { + rand::rngs::StdRng::seed_from_u64(rng_seed) + } else { + rand::rngs::StdRng::from_entropy() + }; + tracing::info!("Loading query..."); let query: CaseQuery = serde_json::from_reader(File::open(&args.path_query_json)?)?; tracing::info!( @@ -967,6 +1022,7 @@ pub fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), anyhow: &mehari_tx_db, &mehari_tx_idx, &chrom_to_acc, + &mut rng, )?; tracing::info!("... done running query in {:?}", before_query.elapsed()); tracing::info!( @@ -987,3 +1043,33 @@ pub fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), anyhow: ); Ok(()) } + +#[cfg(test)] +mod test { + #[test] + #[tracing_test::traced_test] + fn smoke_test() -> Result<(), anyhow::Error> { + let tmpdir = temp_testdir::TempDir::default(); + let path_output = format!("{}/out.tsv", tmpdir.to_string_lossy()); + + let args_common = Default::default(); + let args = super::Args { + genome_release: crate::common::GenomeRelease::Grch37, + path_db: "tests/strucvars/query/db".into(), + path_query_json: "tests/strucvars/query/Case_3.query.json".into(), + path_input: "tests/strucvars/query/Case_3.ingested.vcf".into(), + path_output, + max_results: None, + slack_bnd: 50, + slack_ins: 50, + min_overlap: 0.8, + max_tad_distance: 10_000, + rng_seed: Some(42), + }; + super::run(&args_common, &args)?; + + insta::assert_snapshot!(std::fs::read_to_string(args.path_output.as_str())?); + + Ok(()) + } +} diff --git a/src/strucvars/query/pathogenic.rs b/src/strucvars/query/pathogenic.rs index fec84d1b..b5dc8b4e 100644 --- a/src/strucvars/query/pathogenic.rs +++ b/src/strucvars/query/pathogenic.rs @@ -14,7 +14,7 @@ use crate::{ }; use super::{ - records::ChromRange, + schema::ChromRange, schema::{StructuralVariant, SvType}, }; @@ -178,7 +178,7 @@ pub fn load_patho_dbs( let result = PathoDbBundle { mms: load_patho_db_records( Path::new(path_db) - .join(format!("{}/strucvars/patho-mms.bin", genome_release)) + .join(format!("{}/strucvars/patho-mms.bed", genome_release)) .as_path(), )?, }; diff --git a/src/strucvars/query/records.rs b/src/strucvars/query/records.rs deleted file mode 100644 index 9344e5e3..00000000 --- a/src/strucvars/query/records.rs +++ /dev/null @@ -1,180 +0,0 @@ -//! Records for I/O in `sv query`. - -use indexmap::IndexMap; -use mehari::annotate::strucvars::csq::interface::StrandOrientation; -use serde::{ - de::{self, IntoDeserializer}, - Deserialize, Deserializer, Serialize, -}; - -use super::schema::{ - CallInfo as SchemaCallInfo, StructuralVariant as SchemaStructuralVariant, SvSubType, SvType, -}; - -/// Chromosomal interval. -#[derive(Serialize, Deserialize, PartialEq, Debug, Default, Clone)] -pub struct ChromRange { - /// Chromosome name. - pub chromosome: String, - /// 0-based begin position. - pub begin: i32, - /// 0-based end position. - pub end: i32, -} - -/// Structural variant as written out by VarFish Annotator. -#[derive(Serialize, Deserialize, PartialEq, Debug, Default, Clone)] -pub struct StructuralVariant { - /// Genome release - pub release: String, - /// Chromosome name - pub chromosome: String, - /// Chromosome number - pub chromosome_no: i32, - /// UCSC bin on chromosome - pub bin: u32, - /// 1-based start position of the variant (or position on first chromosome - /// for break-ends) - pub start: i32, - /// The names of the calling tools. - #[serde(alias = "caller", deserialize_with = "deserialize_callers")] - pub callers: Vec, - /// Type of the structural variant - #[serde(deserialize_with = "deserialize_sv_type")] - pub sv_type: SvType, - /// Sub type of the structural variant - #[serde(deserialize_with = "deserialize_sv_sub_type")] - pub sv_sub_type: SvSubType, - /// Potentially the second involved chromosome - pub chromosome2: Option, - /// Chromosome number of second chromosome - pub chromosome_no2: i32, - /// UCSC bin on chromosome2 - pub bin2: u32, - /// End position (position on second chromosome for break-ends) - pub end: i32, - /// The strand orientation of the structural variant, if applicable. - #[serde(deserialize_with = "deserialize_pe_orientation")] - pub pe_orientation: StrandOrientation, - /// Mapping of sample to genotype information for the SV. - #[serde(deserialize_with = "deserialize_genotype")] - pub genotype: IndexMap, -} - -impl From for ChromRange { - fn from(val: StructuralVariant) -> Self { - ChromRange { - chromosome: val.chromosome, - begin: val.start.saturating_sub(1), - end: val.end, - } - } -} - -impl From for SchemaStructuralVariant { - fn from(val: StructuralVariant) -> Self { - fn to_u32(val: Option) -> Option { - val.and_then(|val| if val >= 0 { Some(val as u32) } else { None }) - } - - SchemaStructuralVariant { - chrom: val.chromosome, - pos: val.start, - sv_type: val.sv_type, - sv_sub_type: val.sv_sub_type, - chrom2: val.chromosome2, - end: val.end, - strand_orientation: Some(val.pe_orientation), - call_info: IndexMap::from_iter(val.genotype.into_iter().map(|(k, v)| { - ( - k, - SchemaCallInfo { - genotype: v.gt, - quality: v.gq, - paired_end_cov: to_u32(v.pec), - paired_end_var: to_u32(v.pev), - split_read_cov: to_u32(v.src), - split_read_var: to_u32(v.srv), - copy_number: to_u32(v.cn), - average_normalized_cov: v.anc, - point_count: to_u32(v.pc), - average_mapping_quality: v.amq, - effective_genotype: None, // only set later on - matched_gt_criteria: None, // only set later on - }, - ) - })), - } - } -} - -fn deserialize_callers<'de, D>(deserializer: D) -> Result, D::Error> -where - D: Deserializer<'de>, -{ - let s: &str = Deserialize::deserialize(deserializer)?; - Ok(s.split(';').map(|s| s.to_string()).collect()) -} - -fn deserialize_sv_type<'de, D>(deserializer: D) -> Result -where - D: Deserializer<'de>, -{ - let s: &str = Deserialize::deserialize(deserializer)?; - let end = s.find('_').unwrap_or(s.len()); - SvType::deserialize(s[..end].into_deserializer()) -} - -fn deserialize_sv_sub_type<'de, D>(deserializer: D) -> Result -where - D: Deserializer<'de>, -{ - let s: &str = Deserialize::deserialize(deserializer)?; - SvSubType::deserialize(s.replace('_', ":").into_deserializer()) -} - -fn deserialize_pe_orientation<'de, D>(deserializer: D) -> Result -where - D: Deserializer<'de>, -{ - let s: &str = Deserialize::deserialize(deserializer)?; - if s.eq(".") { - Ok(StrandOrientation::NotApplicable) - } else { - StrandOrientation::deserialize(s.into_deserializer()) - } -} - -fn deserialize_genotype<'de, D>(deserializer: D) -> Result, D::Error> -where - D: Deserializer<'de>, -{ - let buf = String::deserialize(deserializer)?; - let json = buf.replace("\"\"\"", "\""); - serde_json::from_str(&json).map_err(de::Error::custom) -} - -/// Information on the call as combined by the annotator. -#[derive(Serialize, Deserialize, PartialEq, Debug, Default, Clone)] -pub struct Genotype { - /// The genotype, if applicable, e.g., "0/1", "./1", "." - pub gt: Option, - /// Genotype quality score, if applicable - pub gq: Option, - /// Paired-end coverage, if applicable - pub pec: Option, - /// Paired-end variant support, if applicable - pub pev: Option, - /// Split-read coverage, if applicable - pub src: Option, - /// Split-read variant support, if applicable - pub srv: Option, - /// Integer copy number estimate, if applicable - pub cn: Option, - /// Average normalized coverage, if applicable - pub anc: Option, - /// Number of buckets/targets supporting the CNV call, if applicable - pub pc: Option, - /// Average mapping quality, if applicable - pub amq: Option, -} diff --git a/src/strucvars/query/schema.rs b/src/strucvars/query/schema.rs index f7585625..7b25e3b9 100644 --- a/src/strucvars/query/schema.rs +++ b/src/strucvars/query/schema.rs @@ -2,7 +2,10 @@ use crate::common::{Database, TadSet}; use indexmap::IndexMap; -use mehari::annotate::strucvars::csq::interface::StrandOrientation; +use mehari::annotate::strucvars::{ + bnd::Breakend, csq::interface::StrandOrientation, PeOrientation, +}; +use noodles_vcf as vcf; use regex::Regex; use serde::{Deserialize, Deserializer, Serialize}; use strum_macros::{Display, EnumIter, EnumString}; @@ -22,6 +25,17 @@ impl Range { } } +/// Chromosomal interval. +#[derive(Serialize, Deserialize, PartialEq, Debug, Default, Clone)] +pub struct ChromRange { + /// Chromosome name. + pub chromosome: String, + /// 0-based begin position. + pub begin: i32, + /// 0-based end position. + pub end: i32, +} + /// Genomic region #[derive(Serialize, Deserialize, PartialEq, Debug, Clone)] pub struct GenomicRegion { @@ -1301,9 +1315,11 @@ pub struct StructuralVariant { pub chrom2: Option, /// End position (position on second chromosome for break-ends) pub end: i32, - /// The strand orientation of the structural variant, if applicable. - pub strand_orientation: Option, + /// The strand orientation of the structural variant. + pub strand_orientation: StrandOrientation, + /// The callers of the variant. + pub callers: Vec, /// Mapping of sample to genotype information for the SV. pub call_info: IndexMap, } @@ -1324,6 +1340,213 @@ impl StructuralVariant { Some((self.end - self.pos + 1) as u32) } } + + /// Convert from VCF record. + pub fn from_vcf(record: &vcf::Record, header: &vcf::Header) -> Result { + let chrom = record.chromosome().to_string(); + let pos: usize = record.position().into(); + let pos = pos as i32; + let sv_type: SvType = if let Some(Some(vcf::record::info::field::Value::String(sv_type))) = + record.info().get(&vcf::record::info::field::key::SV_TYPE) + { + sv_type + .as_str() + .parse() + .map_err(|e| anyhow::anyhow!("could not parse SVTYPE {}: {}", &sv_type, e))? + } else { + anyhow::bail!("no INFO/SVTYPE in VCF record") + }; + let sv_sub_type = match sv_type { + SvType::Del => SvSubType::Del, + SvType::Dup => SvSubType::Dup, + SvType::Inv => SvSubType::Inv, + SvType::Ins => SvSubType::Ins, + SvType::Bnd => SvSubType::Bnd, + SvType::Cnv => SvSubType::Cnv, + }; + let end = if let Some(Some(vcf::record::info::field::Value::Integer(end))) = record + .info() + .get(&vcf::record::info::field::key::END_POSITION) + { + *end + } else { + anyhow::bail!("no INFO/END in VCF record") + }; + let chrom2 = if let Some(Some(vcf::record::info::field::Value::String(chrom2))) = + record.info().get( + &"chr2" + .parse::() + .expect("chr2 invalid key?"), + ) { + Some(chrom2.clone()) + } else { + None + }; + + let strand_orientation = match sv_type { + SvType::Del => StrandOrientation::ThreeToFive, + SvType::Dup => StrandOrientation::FiveToThree, + SvType::Inv => StrandOrientation::FiveToFive, + SvType::Bnd => { + let pe_orientation = Breakend::from_ref_alt_str( + &record.reference_bases().to_string(), + &record + .alternate_bases() + .first() + .ok_or_else(|| anyhow::anyhow!("no alternate allele?"))? + .to_string(), + )? + .pe_orientation; + match pe_orientation { + PeOrientation::ThreeToThree => StrandOrientation::ThreeToThree, + PeOrientation::FiveToFive => StrandOrientation::FiveToFive, + PeOrientation::ThreeToFive => StrandOrientation::ThreeToFive, + PeOrientation::FiveToThree => StrandOrientation::FiveToThree, + PeOrientation::Other => StrandOrientation::NotApplicable, + } + } + SvType::Ins | SvType::Cnv => StrandOrientation::NotApplicable, + }; + + let key_callers: vcf::record::info::field::Key = + "callers".parse().expect("callers invalid key?"); + let callers = if let Some(Some(vcf::record::info::field::Value::Array( + vcf::record::info::field::value::Array::String(callers), + ))) = record.info().get(&key_callers) + { + callers.iter().flatten().cloned().collect() + } else { + anyhow::bail!("no INFO/callers in VCF record") + }; + + let call_info = Self::build_call_info(record, header)?; + + Ok(Self { + chrom, + pos, + sv_type, + sv_sub_type, + chrom2, + end, + strand_orientation, + callers, + call_info, + }) + } + + /// Build call information. + pub fn build_call_info( + record: &vcf::Record, + header: &vcf::Header, + ) -> Result, anyhow::Error> { + let mut result = IndexMap::new(); + + for (name, sample) in header + .sample_names() + .iter() + .zip(record.genotypes().values()) + { + let genotype = if let Some(Some(vcf::record::genotypes::sample::Value::String(gt))) = + sample.get(&vcf::record::genotypes::keys::key::GENOTYPE) + { + Some(gt.clone()) + } else { + None + }; + let quality = if let Some(Some(vcf::record::genotypes::sample::Value::Float(quality))) = + sample.get(&vcf::record::genotypes::keys::key::CONDITIONAL_GENOTYPE_QUALITY) + { + Some(*quality) + } else { + None + }; + let paired_end_cov = + if let Some(Some(vcf::record::genotypes::sample::Value::Integer(paired_end_cov))) = + sample.get(&"pec".parse::()?) + { + Some(*paired_end_cov as u32) + } else { + None + }; + let paired_end_var = + if let Some(Some(vcf::record::genotypes::sample::Value::Integer(paired_end_var))) = + sample.get(&"pev".parse::()?) + { + Some(*paired_end_var as u32) + } else { + None + }; + let split_read_cov = + if let Some(Some(vcf::record::genotypes::sample::Value::Integer(split_read_cov))) = + sample.get(&"src".parse::()?) + { + Some(*split_read_cov as u32) + } else { + None + }; + let split_read_var = + if let Some(Some(vcf::record::genotypes::sample::Value::Integer(split_read_var))) = + sample.get(&"srv".parse::()?) + { + Some(*split_read_var as u32) + } else { + None + }; + let copy_number = + if let Some(Some(vcf::record::genotypes::sample::Value::Integer(copy_number))) = + sample.get(&"cn".parse::()?) + { + Some(*copy_number as u32) + } else { + None + }; + let average_normalized_cov = if let Some(Some( + vcf::record::genotypes::sample::Value::Float(average_normalized_cov), + )) = + sample.get(&"anc".parse::()?) + { + Some(*average_normalized_cov) + } else { + None + }; + let point_count = + if let Some(Some(vcf::record::genotypes::sample::Value::Integer(point_count))) = + sample.get(&"pc".parse::()?) + { + Some(*point_count as u32) + } else { + None + }; + let average_mapping_quality = if let Some(Some( + vcf::record::genotypes::sample::Value::Float(average_mapping_quality), + )) = + sample.get(&"amq".parse::()?) + { + Some(*average_mapping_quality) + } else { + None + }; + + result.insert( + name.clone(), + CallInfo { + genotype, + quality, + paired_end_cov, + paired_end_var, + split_read_cov, + split_read_var, + copy_number, + average_normalized_cov, + point_count, + average_mapping_quality, + ..Default::default() + }, + ); + } + + Ok(result) + } } impl From for clinvar::pbs::VariationType { @@ -1805,7 +2028,8 @@ mod tests { sv_sub_type: SvSubType::DelMeL1, chrom2: None, end: 200, - strand_orientation: Some(StrandOrientation::ThreeToFive), + strand_orientation: StrandOrientation::ThreeToFive, + callers: Vec::new(), call_info: IndexMap::new(), }; assert_eq!(sv.size().unwrap(), 101); @@ -1820,7 +2044,8 @@ mod tests { sv_sub_type: SvSubType::Ins, chrom2: None, end: 100, - strand_orientation: Some(StrandOrientation::ThreeToFive), + strand_orientation: StrandOrientation::ThreeToFive, + callers: Vec::new(), call_info: IndexMap::new(), }; assert!(sv.size().is_none()); @@ -1835,7 +2060,8 @@ mod tests { sv_sub_type: SvSubType::Bnd, chrom2: Some("chr2".to_owned()), end: 200, - strand_orientation: Some(StrandOrientation::ThreeToFive), + strand_orientation: StrandOrientation::ThreeToFive, + callers: Vec::new(), call_info: IndexMap::new(), }; assert!(sv.size().is_none()); @@ -1850,7 +2076,8 @@ mod tests { sv_sub_type: SvSubType::DelMeL1, chrom2: None, end: 245, - strand_orientation: Some(StrandOrientation::ThreeToFive), + strand_orientation: StrandOrientation::ThreeToFive, + callers: Vec::new(), call_info: IndexMap::new(), }; insta::assert_snapshot!(serde_json::to_string_pretty(&sv).unwrap()); diff --git a/src/strucvars/query/snapshots/varfish_server_worker__strucvars__query__schema__tests__structural_variant_serde_smoke.snap b/src/strucvars/query/snapshots/varfish_server_worker__strucvars__query__schema__tests__structural_variant_serde_smoke.snap index c6015179..378f6108 100644 --- a/src/strucvars/query/snapshots/varfish_server_worker__strucvars__query__schema__tests__structural_variant_serde_smoke.snap +++ b/src/strucvars/query/snapshots/varfish_server_worker__strucvars__query__schema__tests__structural_variant_serde_smoke.snap @@ -10,5 +10,6 @@ expression: "serde_json::to_string_pretty(&sv).unwrap()" "chrom2": null, "end": 245, "strand_orientation": "3to5", + "callers": [], "call_info": {} } diff --git a/src/strucvars/query/snapshots/varfish_server_worker__strucvars__query__test__smoke_test.snap b/src/strucvars/query/snapshots/varfish_server_worker__strucvars__query__test__smoke_test.snap new file mode 100644 index 00000000..f2f81fec --- /dev/null +++ b/src/strucvars/query/snapshots/varfish_server_worker__strucvars__query__test__smoke_test.snap @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbb4c02455aa29e6c74dbc8bf8d229e29ddc8bd89b77782974922c07aa4e2ded +size 709528 diff --git a/src/strucvars/query/tads.rs b/src/strucvars/query/tads.rs index b5a0f422..d45f2428 100644 --- a/src/strucvars/query/tads.rs +++ b/src/strucvars/query/tads.rs @@ -14,7 +14,7 @@ use crate::{ use super::{ interpreter::{BND_SLACK, INS_SLACK}, - records::ChromRange, + schema::ChromRange, schema::{StructuralVariant, SvType}, }; @@ -88,7 +88,7 @@ impl TadSet { SvType::Bnd => { let chrom_idx2 = *chrom_map .get(sv.chrom2.as_ref().expect("no chrom2?")) - .expect("invalid chromosome"); + .unwrap_or_else(|| panic!("invalid chromosome: {:?}", &sv.chrom2)); vec![ ( chrom_idx, diff --git a/src/strucvars/txt_to_bin/cli.rs b/src/strucvars/txt_to_bin/cli.rs index 422b888d..57098930 100644 --- a/src/strucvars/txt_to_bin/cli.rs +++ b/src/strucvars/txt_to_bin/cli.rs @@ -97,7 +97,7 @@ pub struct Args { pub path_input: String, /// Path to output BIN file. #[arg(long)] - pub path_output_bin: PathBuf, + pub path_output: PathBuf, } /// Main entry point for the `sv bg-db-to-bin` command. @@ -115,42 +115,34 @@ pub fn run(common_args: &crate::common::Args, args: &Args) -> Result<(), anyhow: .assembly .expect("assembly required for ClinvarSv conversion"); let assembly: crate::strucvars::txt_to_bin::clinvar::input::Assembly = assembly.into(); - clinvar::convert_to_bin(&args.path_input, &args.path_output_bin, assembly)? + clinvar::convert_to_bin(&args.path_input, &args.path_output, assembly)? } InputType::StrucvarInhouse => vardbs::convert_to_bin( &args.path_input, - &args.path_output_bin, + &args.path_output, InputFileType::InhouseDb, )?, - InputType::StrucvarDbVar => vardbs::convert_to_bin( - &args.path_input, - &args.path_output_bin, - InputFileType::Dbvar, - )?, + InputType::StrucvarDbVar => { + vardbs::convert_to_bin(&args.path_input, &args.path_output, InputFileType::Dbvar)? + } InputType::StrucvarDgv => { - vardbs::convert_to_bin(&args.path_input, &args.path_output_bin, InputFileType::Dgv)? + vardbs::convert_to_bin(&args.path_input, &args.path_output, InputFileType::Dgv)? + } + InputType::StrucvarDgvGs => { + vardbs::convert_to_bin(&args.path_input, &args.path_output, InputFileType::DgvGs)? } - InputType::StrucvarDgvGs => vardbs::convert_to_bin( - &args.path_input, - &args.path_output_bin, - InputFileType::DgvGs, - )?, InputType::StrucvarExacCnv => { - vardbs::convert_to_bin(&args.path_input, &args.path_output_bin, InputFileType::Exac)? + vardbs::convert_to_bin(&args.path_input, &args.path_output, InputFileType::Exac)? } InputType::StrucvarG1k => { - vardbs::convert_to_bin(&args.path_input, &args.path_output_bin, InputFileType::G1k)? + vardbs::convert_to_bin(&args.path_input, &args.path_output, InputFileType::G1k)? } - InputType::StrucvarGnomadSv => vardbs::convert_to_bin( - &args.path_input, - &args.path_output_bin, - InputFileType::Gnomad, - )?, - InputType::GeneRegion => { - gene_region::convert_to_bin(&args.path_input, &args.path_output_bin)? + InputType::StrucvarGnomadSv => { + vardbs::convert_to_bin(&args.path_input, &args.path_output, InputFileType::Gnomad)? } - InputType::MaskedRegion => masked::convert_to_bin(&args.path_input, &args.path_output_bin)?, - InputType::Xlink => xlink::convert_to_bin(&args.path_input, &args.path_output_bin)?, + InputType::GeneRegion => gene_region::convert_to_bin(&args.path_input, &args.path_output)?, + InputType::MaskedRegion => masked::convert_to_bin(&args.path_input, &args.path_output)?, + InputType::Xlink => xlink::convert_to_bin(&args.path_input, &args.path_output)?, } tracing::info!("... done with conversion"); @@ -183,7 +175,7 @@ mod test { path_input: String::from( "tests/db/to-bin/varfish-db-downloader/vardbs/clinvar/clinvar-svs.jsonl.gz", ), - path_output_bin: tmp_dir.join("clinvar.bin"), + path_output: tmp_dir.join("clinvar.bin"), }; super::run(&common_args, &args)?; @@ -203,7 +195,7 @@ mod test { path_input: String::from( "tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/inhouse.tsv", ), - path_output_bin: tmp_dir.join("strucvar_inhouse.bin"), + path_output: tmp_dir.join("strucvar_inhouse.bin"), }; super::run(&common_args, &args)?; @@ -223,7 +215,7 @@ mod test { path_input: String::from( "tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/dbvar.bed.gz", ), - path_output_bin: tmp_dir.join("strucvar_dbvar.bin"), + path_output: tmp_dir.join("strucvar_dbvar.bin"), }; super::run(&common_args, &args)?; @@ -243,7 +235,7 @@ mod test { path_input: String::from( "tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/dgv.bed.gz", ), - path_output_bin: tmp_dir.join("strucvar_dgv.bin"), + path_output: tmp_dir.join("strucvar_dgv.bin"), }; super::run(&common_args, &args)?; @@ -263,7 +255,7 @@ mod test { path_input: String::from( "tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/dgv_gs.bed.gz", ), - path_output_bin: tmp_dir.join("strucvar_dgv_gs.bin"), + path_output: tmp_dir.join("strucvar_dgv_gs.bin"), }; super::run(&common_args, &args)?; @@ -283,7 +275,7 @@ mod test { path_input: String::from( "tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/exac.bed.gz", ), - path_output_bin: tmp_dir.join("exac.bin"), + path_output: tmp_dir.join("exac.bin"), }; super::run(&common_args, &args)?; @@ -303,7 +295,7 @@ mod test { path_input: String::from( "tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/g1k.bed.gz", ), - path_output_bin: tmp_dir.join("g1k.bin"), + path_output: tmp_dir.join("g1k.bin"), }; super::run(&common_args, &args)?; @@ -323,7 +315,7 @@ mod test { path_input: String::from( "tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/gnomad_sv.bed.gz", ), - path_output_bin: tmp_dir.join("gnomad.bin"), + path_output: tmp_dir.join("gnomad.bin"), }; super::run(&common_args, &args)?; @@ -343,7 +335,7 @@ mod test { path_input: String::from( "tests/db/to-bin/varfish-db-downloader/features/grch37/gene_regions/refseq.bed.gz", ), - path_output_bin: tmp_dir.join("refseq.bin"), + path_output: tmp_dir.join("refseq.bin"), }; super::run(&common_args, &args)?; @@ -363,7 +355,7 @@ mod test { path_input: String::from( "tests/db/to-bin/varfish-db-downloader/features/grch37/masked/repeat.bed.gz", ), - path_output_bin: tmp_dir.join("masked.bin"), + path_output: tmp_dir.join("masked.bin"), }; super::run(&common_args, &args)?; @@ -381,7 +373,7 @@ mod test { assembly: None, input_type: InputType::Xlink, path_input: String::from("tests/db/to-bin/varfish-db-downloader/genes/xlink/hgnc.tsv"), - path_output_bin: tmp_dir.join("xlink.bin"), + path_output: tmp_dir.join("xlink.bin"), }; super::run(&common_args, &args)?; diff --git a/src/strucvars/txt_to_bin/clinvar/mod.rs b/src/strucvars/txt_to_bin/clinvar/mod.rs index d74df9bb..72e0d08b 100644 --- a/src/strucvars/txt_to_bin/clinvar/mod.rs +++ b/src/strucvars/txt_to_bin/clinvar/mod.rs @@ -147,7 +147,7 @@ fn convert_jsonl_to_protobuf( /// Perform conversion to protocolbuffers `.bin` file. pub fn convert_to_bin( path_input_jsonl: P, - path_output_bin: Q, + path_output: Q, assembly: input::Assembly, ) -> Result<(), anyhow::Error> where @@ -169,7 +169,7 @@ where trace_rss_now(); let before_writing = Instant::now(); - let mut output_file = File::create(&path_output_bin)?; + let mut output_file = File::create(&path_output)?; output_file.write_all(&clinvar_db.encode_to_vec())?; output_file.flush()?; tracing::debug!( diff --git a/src/strucvars/txt_to_bin/gene_region.rs b/src/strucvars/txt_to_bin/gene_region.rs index af8fcabe..00398c78 100644 --- a/src/strucvars/txt_to_bin/gene_region.rs +++ b/src/strucvars/txt_to_bin/gene_region.rs @@ -30,7 +30,7 @@ mod input { } /// Perform conversion to protocolbuffers `.bin` file. -pub fn convert_to_bin(path_input_tsv: P, path_output_bin: Q) -> Result<(), anyhow::Error> +pub fn convert_to_bin(path_input_tsv: P, path_output: Q) -> Result<(), anyhow::Error> where P: AsRef, Q: AsRef, @@ -38,7 +38,7 @@ where tracing::debug!( "Converting gene regions from BED {:?} to binary {:?}", path_input_tsv.as_ref(), - path_output_bin.as_ref() + path_output.as_ref() ); let chrom_map = build_chrom_map(); @@ -74,7 +74,7 @@ where trace_rss_now(); let before_writing = Instant::now(); - let mut output_file = File::create(&path_output_bin)?; + let mut output_file = File::create(&path_output)?; output_file.write_all(&gene_region_db.encode_to_vec())?; output_file.flush()?; tracing::debug!( diff --git a/src/strucvars/txt_to_bin/masked.rs b/src/strucvars/txt_to_bin/masked.rs index fd542dc4..b90345b2 100644 --- a/src/strucvars/txt_to_bin/masked.rs +++ b/src/strucvars/txt_to_bin/masked.rs @@ -30,7 +30,7 @@ mod input { } /// Perform conversion to protocolbuffers `.bin` file. -pub fn convert_to_bin(path_input_tsv: P, path_output_bin: Q) -> Result<(), anyhow::Error> +pub fn convert_to_bin(path_input_tsv: P, path_output: Q) -> Result<(), anyhow::Error> where P: AsRef, Q: AsRef, @@ -38,7 +38,7 @@ where tracing::debug!( "Converting masked region from BED {:?} to binary {:?}", path_input_tsv.as_ref(), - path_output_bin.as_ref() + path_output.as_ref() ); let chrom_map = build_chrom_map(); @@ -73,7 +73,7 @@ where trace_rss_now(); let before_writing = Instant::now(); - let mut output_file = File::create(&path_output_bin)?; + let mut output_file = File::create(&path_output)?; output_file.write_all(&masked_region_db.encode_to_vec())?; output_file.flush()?; tracing::debug!( diff --git a/src/strucvars/txt_to_bin/vardbs/mod.rs b/src/strucvars/txt_to_bin/vardbs/mod.rs index ed5b6c02..770030c8 100644 --- a/src/strucvars/txt_to_bin/vardbs/mod.rs +++ b/src/strucvars/txt_to_bin/vardbs/mod.rs @@ -36,7 +36,7 @@ fn deserialize_loop( reader: &mut csv::Reader>, ) -> Result, anyhow::Error> where - Rec: TryInto> + for<'de> serde::Deserialize<'de>, + Rec: core::fmt::Debug + TryInto> + for<'de> serde::Deserialize<'de>, >>::Error: core::fmt::Debug, >>::Error: Send, >>::Error: std::marker::Sync, @@ -80,7 +80,7 @@ where /// Perform conversion to protobuf `.bin` file. pub fn convert_to_bin( path_input_tsv: P, - path_output_bin: Q, + path_output: Q, input_type: InputFileType, ) -> Result<(), anyhow::Error> where @@ -115,7 +115,7 @@ where trace_rss_now(); let before_writing = Instant::now(); - let mut output_file = File::create(&path_output_bin)?; + let mut output_file = File::create(&path_output)?; output_file.write_all(&bg_db.encode_to_vec())?; output_file.flush()?; tracing::debug!( diff --git a/src/strucvars/txt_to_bin/xlink.rs b/src/strucvars/txt_to_bin/xlink.rs index adb21c58..ade4575c 100644 --- a/src/strucvars/txt_to_bin/xlink.rs +++ b/src/strucvars/txt_to_bin/xlink.rs @@ -25,7 +25,7 @@ pub mod input { } /// Perform conversion to protocolbuffers `.bin` file. -pub fn convert_to_bin(path_input_tsv: P, path_output_bin: Q) -> Result<(), anyhow::Error> +pub fn convert_to_bin(path_input_tsv: P, path_output: Q) -> Result<(), anyhow::Error> where P: AsRef, Q: AsRef, @@ -65,7 +65,7 @@ where trace_rss_now(); let before_writing = Instant::now(); - let mut output_file = File::create(&path_output_bin)?; + let mut output_file = File::create(&path_output)?; output_file.write_all(&xlink_db.encode_to_vec())?; output_file.flush()?; tracing::debug!( diff --git a/tests/strucvars/query/Case_3.ingested.vcf b/tests/strucvars/query/Case_3.ingested.vcf new file mode 100644 index 00000000..0b9f2292 --- /dev/null +++ b/tests/strucvars/query/Case_3.ingested.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c526a6ebd03471dcf1e7b0b86e9c0911d27d02dc6f06a845b531c10005004f0 +size 10557593 diff --git a/tests/strucvars/query/Case_3.query.json b/tests/strucvars/query/Case_3.query.json new file mode 100644 index 00000000..15a42466 --- /dev/null +++ b/tests/strucvars/query/Case_3.query.json @@ -0,0 +1,328 @@ +{ + "database": "refseq", + "svdb_dgv_enabled": false, + "svdb_dgv_min_overlap": null, + "svdb_dgv_max_count": null, + "svdb_dgv_gs_enabled": false, + "svdb_dgv_gs_min_overlap": null, + "svdb_dgv_gs_max_count": null, + "svdb_gnomad_enabled": true, + "svdb_gnomad_min_overlap": 0.8, + "svdb_gnomad_max_count": 10, + "svdb_exac_enabled": true, + "svdb_exac_min_overlap": 0.8, + "svdb_exac_max_count": 10, + "svdb_dbvar_enabled": true, + "svdb_dbvar_min_overlap": 0.8, + "svdb_dbvar_max_count": 20, + "svdb_g1k_enabled": true, + "svdb_g1k_min_overlap": 0.8, + "svdb_g1k_max_count": 10, + "svdb_inhouse_enabled": true, + "svdb_inhouse_min_overlap": 0.8, + "svdb_inhouse_max_count": 10, + "clinvar_sv_min_overlap": 0.8, + "clinvar_sv_min_pathogenicity": "likely-pathogenic", + "sv_size_min": 500, + "sv_size_max": null, + "sv_types": [ + "DEL", + "DUP", + "INV", + "INS", + "BND", + "CNV" + ], + "sv_sub_types": [ + "DEL", + "DEL:ME", + "DEL:ME:SVA", + "DEL:ME:L1", + "DEL:ME:ALU", + "DUP", + "DUP:TANDEM", + "INV", + "INS", + "INS:ME", + "INS:ME:SVA", + "INS:ME:L1", + "INS:ME:ALU", + "BND", + "CNV" + ], + "tx_effects": [ + "transcript_variant", + "exon_variant", + "splice_region_variant", + "intron_variant", + "upstream_variant", + "downstream_variant", + "intergenic_variant" + ], + "gene_allowlist": null, + "genomic_region": null, + "regulatory_overlap": 100, + "regulatory_ensembl_features": null, + "regulatory_vista_validation": null, + "regulatory_custom_configs": [], + "tad_set": null, + "genotype": { + "Case_3_index-N1-DNA1-WGS1": "variant", + "Case_3_father-N1-DNA1-WGS1": "variant", + "Case_3_mother-N1-DNA1-WGS1": "variant" + }, + "genotype_criteria": [ + { + "genotype": "variant", + "select_sv_sub_type": [ + "DEL", + "DEL:ME", + "DEL:ME:SVA", + "DEL:ME:L1", + "DEL:ME:ALU", + "DUP", + "DUP:TANDEM", + "CNV" + ], + "select_sv_min_size": 500, + "select_sv_max_size": null, + "gt_one_of": null, + "min_gq": null, + "min_pr_cov": null, + "max_pr_cov": null, + "min_pr_ref": null, + "max_pr_ref": null, + "min_pr_var": null, + "max_pr_var": null, + "min_sr_cov": null, + "max_sr_cov": null, + "min_sr_ref": null, + "max_sr_ref": null, + "min_sr_var": null, + "max_sr_var": null, + "min_srpr_cov": null, + "max_srpr_cov": null, + "min_srpr_ref": null, + "max_srpr_ref": null, + "min_srpr_var": 5, + "max_srpr_var": null, + "min_sr_ab": null, + "max_sr_ab": null, + "min_pr_ab": null, + "max_pr_ab": null, + "min_srpr_ab": null, + "max_srpr_ab": null, + "min_rd_dev": null, + "max_rd_dev": null, + "min_amq": null, + "max_amq": null, + "comment": null + }, + { + "genotype": "variant", + "select_sv_sub_type": [ + "INV" + ], + "select_sv_min_size": 500, + "select_sv_max_size": null, + "gt_one_of": null, + "min_gq": null, + "min_pr_cov": null, + "max_pr_cov": null, + "min_pr_ref": null, + "max_pr_ref": null, + "min_pr_var": 2, + "max_pr_var": null, + "min_sr_cov": null, + "max_sr_cov": null, + "min_sr_ref": null, + "max_sr_ref": null, + "min_sr_var": 2, + "max_sr_var": null, + "min_srpr_cov": null, + "max_srpr_cov": null, + "min_srpr_ref": null, + "max_srpr_ref": null, + "min_srpr_var": 5, + "max_srpr_var": null, + "min_sr_ab": null, + "max_sr_ab": null, + "min_pr_ab": null, + "max_pr_ab": null, + "min_srpr_ab": null, + "max_srpr_ab": null, + "min_rd_dev": null, + "max_rd_dev": null, + "min_amq": null, + "max_amq": null, + "comment": null + }, + { + "genotype": "variant", + "select_sv_sub_type": [ + "BND", + "INS" + ], + "select_sv_min_size": null, + "select_sv_max_size": null, + "gt_one_of": null, + "min_gq": null, + "min_pr_cov": null, + "max_pr_cov": null, + "min_pr_ref": null, + "max_pr_ref": null, + "min_pr_var": 10, + "max_pr_var": null, + "min_sr_cov": null, + "max_sr_cov": null, + "min_sr_ref": null, + "max_sr_ref": null, + "min_sr_var": 10, + "max_sr_var": null, + "min_srpr_cov": null, + "max_srpr_cov": null, + "min_srpr_ref": null, + "max_srpr_ref": null, + "min_srpr_var": null, + "max_srpr_var": null, + "min_sr_ab": null, + "max_sr_ab": null, + "min_pr_ab": null, + "max_pr_ab": null, + "min_srpr_ab": null, + "max_srpr_ab": null, + "min_rd_dev": null, + "max_rd_dev": null, + "min_amq": null, + "max_amq": null, + "comment": null + }, + { + "genotype": "non-variant", + "select_sv_sub_type": [ + "DEL", + "DEL:ME", + "DEL:ME:SVA", + "DEL:ME:L1", + "DEL:ME:ALU", + "DUP", + "DUP:TANDEM", + "CNV" + ], + "select_sv_min_size": 500, + "select_sv_max_size": null, + "gt_one_of": null, + "min_gq": null, + "min_pr_cov": null, + "max_pr_cov": null, + "min_pr_ref": null, + "max_pr_ref": null, + "min_pr_var": null, + "max_pr_var": null, + "min_sr_cov": null, + "max_sr_cov": null, + "min_sr_ref": null, + "max_sr_ref": null, + "min_sr_var": null, + "max_sr_var": null, + "min_srpr_cov": null, + "max_srpr_cov": null, + "min_srpr_ref": null, + "max_srpr_ref": null, + "min_srpr_var": null, + "max_srpr_var": 4, + "min_sr_ab": null, + "max_sr_ab": null, + "min_pr_ab": null, + "max_pr_ab": null, + "min_srpr_ab": null, + "max_srpr_ab": null, + "min_rd_dev": null, + "max_rd_dev": null, + "min_amq": null, + "max_amq": null, + "comment": null + }, + { + "genotype": "non-variant", + "select_sv_sub_type": [ + "INV" + ], + "select_sv_min_size": 500, + "select_sv_max_size": null, + "gt_one_of": null, + "min_gq": null, + "min_pr_cov": null, + "max_pr_cov": null, + "min_pr_ref": null, + "max_pr_ref": null, + "min_pr_var": null, + "max_pr_var": 1, + "min_sr_cov": null, + "max_sr_cov": null, + "min_sr_ref": null, + "max_sr_ref": null, + "min_sr_var": null, + "max_sr_var": 1, + "min_srpr_cov": null, + "max_srpr_cov": null, + "min_srpr_ref": null, + "max_srpr_ref": null, + "min_srpr_var": null, + "max_srpr_var": 4, + "min_sr_ab": null, + "max_sr_ab": null, + "min_pr_ab": null, + "max_pr_ab": null, + "min_srpr_ab": null, + "max_srpr_ab": null, + "min_rd_dev": null, + "max_rd_dev": null, + "min_amq": null, + "max_amq": null, + "comment": null + }, + { + "genotype": "non-variant", + "select_sv_sub_type": [ + "BND", + "INS" + ], + "select_sv_min_size": null, + "select_sv_max_size": null, + "gt_one_of": null, + "min_gq": null, + "min_pr_cov": null, + "max_pr_cov": null, + "min_pr_ref": null, + "max_pr_ref": null, + "min_pr_var": null, + "max_pr_var": 9, + "min_sr_cov": null, + "max_sr_cov": null, + "min_sr_ref": null, + "max_sr_ref": null, + "min_sr_var": null, + "max_sr_var": 9, + "min_srpr_cov": null, + "max_srpr_cov": null, + "min_srpr_ref": null, + "max_srpr_ref": null, + "min_srpr_var": null, + "max_srpr_var": null, + "min_sr_ab": null, + "max_sr_ab": null, + "min_pr_ab": null, + "max_pr_ab": null, + "min_srpr_ab": null, + "max_srpr_ab": null, + "min_rd_dev": null, + "max_rd_dev": null, + "min_amq": null, + "max_amq": null, + "comment": null + } + ], + "recessive_mode": null, + "recessive_index": null + } diff --git a/tests/strucvars/query/bootstrap.sh b/tests/strucvars/query/bootstrap.sh new file mode 100644 index 00000000..549236b9 --- /dev/null +++ b/tests/strucvars/query/bootstrap.sh @@ -0,0 +1,148 @@ +#!/usr/bin/bash + +# Setup database excerpts required for running the SV queries. +# +# Define `SRC` with the path to the "output/full" directory of the +# `varfish-db-downloader`. +# +# This will pull all data from chromosome 16. + +set -x +set -euo pipefail + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +STRUCVARS_AGGREGATE="cargo run --release -- strucvars aggregate" +TXT_TO_BIN="cargo run --release -- strucvars txt-to-bin --assembly=grch37" + +mkdir -p $SCRIPT_DIR/db/worker/grch37/{features,genes,strucvars/bgdbs,tads} +mkdir -p $SCRIPT_DIR/db/worker/noref/genes +mkdir -p $SCRIPT_DIR/db/mehari/grch37 + +# Filter BED files +filter-bed() +{ + zcat --force "$@" \ + | awk -F $'\t' '(($1 ~ /^#/) || ($1 ~ /16$/))' +} + +# Write to a file (helpful in interpreting `set -x` output.) +write-to() +{ + tee $1 >/dev/null +} + +# mehari/grch37/ + +mkdir -p /tmp/mehari-data-grch37 + +pushd $SCRIPT_DIR/mehari-tx-scripts ; \ +DATA_DIR=/tmp/mehari-data-grch37 \ +ACTIVATE_CONDA="conda activate" \ +USE_CONDA_ENV=varfish-server-worker-query-bootstrap \ +GENOME_RELEASE=grch37 \ +bash run.sh ; \ +popd + +cp /tmp/mehari-data-grch37/pass-2/txs.bin.zst* \ +$SCRIPT_DIR/db/mehari/grch37 + +# worker/noref/genes/ + +cp $SRC/mehari/genes-xlink-20230913/genes-xlink.tsv \ +$SCRIPT_DIR/db/worker/noref/genes/xlink.tsv +cp $SRC/worker/acmg-sf-3.1+0.10.1/acmg_sf.tsv \ +$SCRIPT_DIR/db/worker/noref/genes/acmg.tsv +cp $SRC/worker/mim2gene-20230913+0.10.1/mim2gene.tsv \ +$SCRIPT_DIR/db/worker/noref/genes/mim2gene.tsv + +$TXT_TO_BIN \ + --input-type=xlink \ + --path-input=$SCRIPT_DIR/db/worker/noref/genes/xlink.tsv \ + --path-output=$SCRIPT_DIR/db/worker/noref/genes/xlink.bin + +filter-bed $SRC/tracks/track-features-ucsc-genomicsuperdups-grch37-20111025+0/genomicSuperDups.bed.gz \ +| write-to $SCRIPT_DIR/db/worker/grch37/features/masked_seqdup.bed +$TXT_TO_BIN \ + --input-type=masked-region \ + --path-input=$SCRIPT_DIR/db/worker/grch37/features/masked_seqdup.bed \ + --path-output=$SCRIPT_DIR/db/worker/grch37/features/masked_seqdup.bin + +# grch37/features/ + +filter-bed $SRC/tracks/track-features-ucsc-rmsk-grch37-20200322+0/rmsk.bed.gz \ +| write-to $SCRIPT_DIR/db/worker/grch37/features/masked_repeat.bed +$TXT_TO_BIN \ + --input-type=masked-region \ + --path-input=$SCRIPT_DIR/db/worker/grch37/features/masked_repeat.bed \ + --path-output=$SCRIPT_DIR/db/worker/grch37/features/masked_repeat.bin + +filter-bed $SRC/tracks/track-features-ucsc-genomicsuperdups-grch37-20111025+0/genomicSuperDups.bed.gz \ +| write-to $SCRIPT_DIR/db/worker/grch37/features/masked_seqdup.bed +$TXT_TO_BIN \ + --input-type=masked-region \ + --path-input=$SCRIPT_DIR/db/worker/grch37/features/masked_seqdup.bed \ + --path-output=$SCRIPT_DIR/db/worker/grch37/features/masked_seqdup.bin + +# grch37/strucvars/ + +filter-bed tests/strucvars/query/Case_3_ingested.vcf \ +| $STRUCVARS_AGGREGATE --path-output $SCRIPT_DIR/db/worker/grch37/strucvars/inhouse.bed /dev/stdin +$TXT_TO_BIN \ + --input-type=strucvar-inhouse \ + --path-input=$SCRIPT_DIR/db/worker/grch37/strucvars/inhouse.bed \ + --path-output=$SCRIPT_DIR/db/worker/grch37/strucvars/inhouse.bin + +$TXT_TO_BIN \ + --input-type=clinvar-sv \ + --path-input=$SCRIPT_DIR/db/worker/grch37/clinvar-full-release.head.jsonl.gz \ + --path-output=$SCRIPT_DIR/db/worker/grch37/strucvars/clinvar.bin + +filter-bed $SRC/tracks/track-strucvars-dbvar-grch37-20230516+0/dbvar.bed.gz \ +| write-to $SCRIPT_DIR/db/worker/grch37/strucvars/bgdbs/dbvar.bed +$TXT_TO_BIN \ + --input-type=strucvar-db-var \ + --path-input=$SCRIPT_DIR/db/worker/grch37/strucvars/bgdbs/dbvar.bed \ + --path-output=$SCRIPT_DIR/db/worker/grch37/strucvars/bgdbs/dbvar.bin + +filter-bed $SRC/tracks/track-strucvars-dgv-grch37-20200225+0/dgv.bed.gz \ +| write-to $SCRIPT_DIR/db/worker/grch37/strucvars/bgdbs/dgv.bed +$TXT_TO_BIN \ + --input-type=strucvar-dgv \ + --path-input=$SCRIPT_DIR/db/worker/grch37/strucvars/bgdbs/dgv.bed \ + --path-output=$SCRIPT_DIR/db/worker/grch37/strucvars/bgdbs/dgv.bin + +filter-bed $SRC/tracks/track-strucvars-dgv-gs-grch37-20200225+0/dgv-gs.bed.gz \ +| write-to $SCRIPT_DIR/db/worker/grch37/strucvars/bgdbs/dgv-gs.bed +$TXT_TO_BIN \ + --input-type=strucvar-dgv-gs \ + --path-input=$SCRIPT_DIR/db/worker/grch37/strucvars/bgdbs/dgv-gs.bed \ + --path-output=$SCRIPT_DIR/db/worker/grch37/strucvars/bgdbs/dgv-gs.bin + +filter-bed $SRC/tracks/track-strucvars-exac-grch37-0.3.1+0/exac.bed.gz \ +| write-to $SCRIPT_DIR/db/worker/grch37/strucvars/bgdbs/exac.bed +$TXT_TO_BIN \ + --input-type=strucvar-exac-cnv \ + --path-input=$SCRIPT_DIR/db/worker/grch37/strucvars/bgdbs/exac.bed \ + --path-output=$SCRIPT_DIR/db/worker/grch37/strucvars/bgdbs/exac.bin + +filter-bed $SRC/tracks/track-strucvars-g1k-grch37-phase3v2+0/g1k.bed.gz \ +| write-to $SCRIPT_DIR/db/worker/grch37/strucvars/bgdbs/g1k.bed +$TXT_TO_BIN \ + --input-type=strucvar-g1k \ + --path-input=$SCRIPT_DIR/db/worker/grch37/strucvars/bgdbs/g1k.bed \ + --path-output=$SCRIPT_DIR/db/worker/grch37/strucvars/bgdbs/g1k.bin + +filter-bed $SRC/tracks/track-strucvars-gnomad-grch37-2.1.1+0/gnomad.bed.gz \ +| write-to $SCRIPT_DIR/db/worker/grch37/strucvars/bgdbs/gnomad.bed +$TXT_TO_BIN \ + --input-type=strucvar-gnomad-sv \ + --path-input=$SCRIPT_DIR/db/worker/grch37/strucvars/bgdbs/gnomad.bed \ + --path-output=$SCRIPT_DIR/db/worker/grch37/strucvars/bgdbs/gnomad.bin + +# grch37/tads/ + +filter-bed $SRC/worker/tads-grch37-dixon2015/hesc.bed \ +| write-to $SCRIPT_DIR/db/worker/grch37/tads/hesc.bed + +filter-bed $SRC/worker/patho-mms-grch37-20220730+0.10.1/patho-mms.bed \ +| write-to $SCRIPT_DIR/db/worker/grch37/strucvars/patho-mms.bed diff --git a/tests/strucvars/query/db/mehari/grch37/txs.bin.zst b/tests/strucvars/query/db/mehari/grch37/txs.bin.zst new file mode 100644 index 00000000..62032208 --- /dev/null +++ b/tests/strucvars/query/db/mehari/grch37/txs.bin.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b215c083bc26990eefcc5b8dc9c3fbe1d3613ad15e3022a420537419ca282d5b +size 109803 diff --git a/tests/strucvars/query/db/mehari/grch37/txs.bin.zst.report b/tests/strucvars/query/db/mehari/grch37/txs.bin.zst.report new file mode 100644 index 00000000..ee1ffb48 --- /dev/null +++ b/tests/strucvars/query/db/mehari/grch37/txs.bin.zst.report @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32559d580326744bc9ddc1732e4bde5da5ad75459c659bf2b672bba1516519eb +size 2316 diff --git a/tests/strucvars/query/db/mehari/grch37/txs.bin.zst.report.sha256 b/tests/strucvars/query/db/mehari/grch37/txs.bin.zst.report.sha256 new file mode 100644 index 00000000..6db4f47e --- /dev/null +++ b/tests/strucvars/query/db/mehari/grch37/txs.bin.zst.report.sha256 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a415f7105628d204e1a60e16de6e83f9298c41773ebc140cd7462a7f71be6af8 +size 85 diff --git a/tests/strucvars/query/db/mehari/grch37/txs.bin.zst.sha256 b/tests/strucvars/query/db/mehari/grch37/txs.bin.zst.sha256 new file mode 100644 index 00000000..e28fda2a --- /dev/null +++ b/tests/strucvars/query/db/mehari/grch37/txs.bin.zst.sha256 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccdc4caa3b7f48cb7e7bcfd93acddb67b8f5c25585d0392834272231bb4d948f +size 78 diff --git a/tests/strucvars/query/db/worker/grch37/clinvar-full-release.head.jsonl.gz b/tests/strucvars/query/db/worker/grch37/clinvar-full-release.head.jsonl.gz new file mode 100644 index 00000000..31558869 --- /dev/null +++ b/tests/strucvars/query/db/worker/grch37/clinvar-full-release.head.jsonl.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e86a854f6dd64390c2cdf4027a36427d7d817d2516cc3c376f1675429f833748 +size 2076475 diff --git a/tests/strucvars/query/db/worker/grch37/features/masked_repeat.bed b/tests/strucvars/query/db/worker/grch37/features/masked_repeat.bed new file mode 100644 index 00000000..8263ed30 --- /dev/null +++ b/tests/strucvars/query/db/worker/grch37/features/masked_repeat.bed @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12748a59058135e793cd3260e84d0448046e635e2f28fe45a69c8c8a01b6ece7 +size 6330455 diff --git a/tests/strucvars/query/db/worker/grch37/features/masked_repeat.bin b/tests/strucvars/query/db/worker/grch37/features/masked_repeat.bin new file mode 100644 index 00000000..25f67be8 --- /dev/null +++ b/tests/strucvars/query/db/worker/grch37/features/masked_repeat.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc74516c4f3cb04e7ff3110e93e71a489ba5eb69a7d753169b1e5633a0650eae +size 2374264 diff --git a/tests/strucvars/query/db/worker/grch37/features/masked_seqdup.bed b/tests/strucvars/query/db/worker/grch37/features/masked_seqdup.bed new file mode 100644 index 00000000..fea095c5 --- /dev/null +++ b/tests/strucvars/query/db/worker/grch37/features/masked_seqdup.bed @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dcbc488c9604d995ee989f2d312957d5d3822a087ba13e653f9c68a56d6db18 +size 114013 diff --git a/tests/strucvars/query/db/worker/grch37/features/masked_seqdup.bin b/tests/strucvars/query/db/worker/grch37/features/masked_seqdup.bin new file mode 100644 index 00000000..53f8fe0b --- /dev/null +++ b/tests/strucvars/query/db/worker/grch37/features/masked_seqdup.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75617e153ba0e38676c3ea0a2794fc9df87d73adf28bc2e11ac4a0548b7df6d8 +size 44696 diff --git a/tests/strucvars/query/db/worker/grch37/genes/ensembl_regions.bin b/tests/strucvars/query/db/worker/grch37/genes/ensembl_regions.bin new file mode 100644 index 00000000..e3564b8a --- /dev/null +++ b/tests/strucvars/query/db/worker/grch37/genes/ensembl_regions.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ae4dce528abe91b2430443221912a532122962d020a9f667cdc4ee65cf76abb +size 1026161 diff --git a/tests/strucvars/query/db/worker/grch37/genes/refseq_regions.bin b/tests/strucvars/query/db/worker/grch37/genes/refseq_regions.bin new file mode 100644 index 00000000..a18852c8 --- /dev/null +++ b/tests/strucvars/query/db/worker/grch37/genes/refseq_regions.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af07bad558f76a7dd52c26a8f9b36c7d8d52884cfe300fe15c9ce17385ab3727 +size 784004 diff --git a/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/dbvar.bed b/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/dbvar.bed new file mode 100644 index 00000000..51b6d724 --- /dev/null +++ b/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/dbvar.bed @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d4b94751b5966831ed66656e240628bc809b6ad41285ca6b5638ae00b6d8ceb +size 6557478 diff --git a/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/dbvar.bin b/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/dbvar.bin new file mode 100644 index 00000000..1c31d013 --- /dev/null +++ b/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/dbvar.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0445ca63c27cdb6cafbb3b388df12ece7782e69e0b301a970d127b6972efd92f +size 3591420 diff --git a/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/dgv-gs.bed b/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/dgv-gs.bed new file mode 100644 index 00000000..091c982b --- /dev/null +++ b/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/dgv-gs.bed @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fbdb755375330ceb1ae87a7607053d5ff756b37966315ac61c5c1a919f241b3 +size 42797 diff --git a/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/dgv-gs.bin b/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/dgv-gs.bin new file mode 100644 index 00000000..8ed2cb66 --- /dev/null +++ b/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/dgv-gs.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dec58806888cb2cc9f40de9ef6a677e04cc228c68c78ac2ab50415c5f500d1cd +size 27871 diff --git a/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/dgv.bed b/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/dgv.bed new file mode 100644 index 00000000..34781240 --- /dev/null +++ b/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/dgv.bed @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32595d36bbc253066d2be272751e42609d8abda6deac64b332a081dbb8117852 +size 942294 diff --git a/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/dgv.bin b/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/dgv.bin new file mode 100644 index 00000000..87566318 --- /dev/null +++ b/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/dgv.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26abe9ef271fa43c26dcb55f4be88555b542bde20923a4e4e966aa63f602b132 +size 519656 diff --git a/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/exac.bed b/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/exac.bed new file mode 100644 index 00000000..43543dfd --- /dev/null +++ b/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/exac.bed @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0430ecee465ed546325e7f036893405c49fc136db84c3f3c550abc248e437c27 +size 184635 diff --git a/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/exac.bin b/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/exac.bin new file mode 100644 index 00000000..9aec1f3a --- /dev/null +++ b/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/exac.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4e1a549a094b2bb7b36864848dcdf24ac7b5db3eb103df869e2720f136e3e5d +size 143904 diff --git a/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/g1k.bed b/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/g1k.bed new file mode 100644 index 00000000..5a01a6c7 --- /dev/null +++ b/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/g1k.bed @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b36b6704783fc24c564346999f1fe4a0dbb273c24a43cf17573e9946f40fb923 +size 62609 diff --git a/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/g1k.bin b/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/g1k.bin new file mode 100644 index 00000000..a0a0f87b --- /dev/null +++ b/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/g1k.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7ed52c1d08caa4e0e7af7db1e31854117e36f2e48bb548930cc45450b3396ff +size 38563 diff --git a/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/gnomad.bed b/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/gnomad.bed new file mode 100644 index 00000000..437956eb --- /dev/null +++ b/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/gnomad.bed @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99901704f2a6351592df2f66dec0e489704888e6c1073b185931d88acd41394d +size 340315 diff --git a/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/gnomad.bin b/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/gnomad.bin new file mode 100644 index 00000000..4f2ff1d5 --- /dev/null +++ b/tests/strucvars/query/db/worker/grch37/strucvars/bgdbs/gnomad.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:404c5988eedbc93208e796bfa2aa71104886df2c3914c9d11c139909ae483bde +size 220553 diff --git a/tests/strucvars/query/db/worker/grch37/strucvars/clinvar.bin b/tests/strucvars/query/db/worker/grch37/strucvars/clinvar.bin new file mode 100644 index 00000000..b26349d4 --- /dev/null +++ b/tests/strucvars/query/db/worker/grch37/strucvars/clinvar.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1cb39ed584c2c275837fb8ef56297da76de75d5f165a818c537edeaeb3d53c9 +size 36324 diff --git a/tests/strucvars/query/db/worker/grch37/strucvars/inhouse.bed b/tests/strucvars/query/db/worker/grch37/strucvars/inhouse.bed new file mode 100644 index 00000000..78f45a38 --- /dev/null +++ b/tests/strucvars/query/db/worker/grch37/strucvars/inhouse.bed @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5905a304ba297fa268c732bc2ae758f3b003f59ece6a1f5c3f9652331ad8003a +size 90145 diff --git a/tests/strucvars/query/db/worker/grch37/strucvars/inhouse.bin b/tests/strucvars/query/db/worker/grch37/strucvars/inhouse.bin new file mode 100644 index 00000000..a16bde69 --- /dev/null +++ b/tests/strucvars/query/db/worker/grch37/strucvars/inhouse.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf4244d68790c818d27aa6c42bc2b6b27195a95c8d2f6d7e704c9abed409f761 +size 42606 diff --git a/tests/strucvars/query/db/worker/grch37/strucvars/patho-mms.bed b/tests/strucvars/query/db/worker/grch37/strucvars/patho-mms.bed new file mode 100644 index 00000000..32177141 --- /dev/null +++ b/tests/strucvars/query/db/worker/grch37/strucvars/patho-mms.bed @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:978a29a96cdc37faf52d037b05de31c2c23a85db66610a5ac21f12e07a2aa561 +size 665 diff --git a/tests/strucvars/query/db/worker/grch37/tads/hesc.bed b/tests/strucvars/query/db/worker/grch37/tads/hesc.bed new file mode 100644 index 00000000..a9fdf358 --- /dev/null +++ b/tests/strucvars/query/db/worker/grch37/tads/hesc.bed @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b34d3768327321083f66e6f3681f760881af6d38c22723c7aaf9f615a9794c3 +size 1176 diff --git a/tests/strucvars/query/db/worker/noref/genes/acmg.tsv b/tests/strucvars/query/db/worker/noref/genes/acmg.tsv new file mode 100644 index 00000000..538077b2 --- /dev/null +++ b/tests/strucvars/query/db/worker/noref/genes/acmg.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58a4a4cfdb4f151f8769418fb912cbc98186de14e1cc2e0786387b1bf7ea38e1 +size 10530 diff --git a/tests/strucvars/query/db/worker/noref/genes/mim2gene.tsv b/tests/strucvars/query/db/worker/noref/genes/mim2gene.tsv new file mode 100644 index 00000000..f4d6e9d7 --- /dev/null +++ b/tests/strucvars/query/db/worker/noref/genes/mim2gene.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3422958a685427c366b01fd47431798cb1816f114a4b56c11f919b3c6de81ada +size 90299 diff --git a/tests/strucvars/query/db/worker/noref/genes/xlink.bin b/tests/strucvars/query/db/worker/noref/genes/xlink.bin new file mode 100644 index 00000000..6780e43e --- /dev/null +++ b/tests/strucvars/query/db/worker/noref/genes/xlink.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4470d95fc5b6eada062c8c7155bb4bf4fca38b12245ee789bb3799907cd466ee +size 1260467 diff --git a/tests/strucvars/query/db/worker/noref/genes/xlink.tsv b/tests/strucvars/query/db/worker/noref/genes/xlink.tsv new file mode 100644 index 00000000..e379cad4 --- /dev/null +++ b/tests/strucvars/query/db/worker/noref/genes/xlink.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf8cbb72b97ebd6ac0d7cd62a8d040c2318b3ea6702e7ead4e7ec88065ab9ee7 +size 1805640 diff --git a/tests/strucvars/query/mehari-tx-scripts/README.md b/tests/strucvars/query/mehari-tx-scripts/README.md new file mode 100644 index 00000000..769d190d --- /dev/null +++ b/tests/strucvars/query/mehari-tx-scripts/README.md @@ -0,0 +1,3 @@ +This is a copy of the scripts here: + +- https://github.com/bihealth/mehari-data-tx/tree/main/src diff --git a/tests/strucvars/query/mehari-tx-scripts/config.json b/tests/strucvars/query/mehari-tx-scripts/config.json new file mode 100644 index 00000000..94e46302 --- /dev/null +++ b/tests/strucvars/query/mehari-tx-scripts/config.json @@ -0,0 +1,14 @@ +{ + "grch37": { + "cdot_release": "0.2.14", + "cdot_filename": "cdot-0.2.14.GCF_000001405.25_GRCh37.p13_genomic.105.20201022.gff.json.gz", + "ensembl_release": "grch37/release-105", + "ensembl_token": "GRCh37" + }, + "grch38": { + "cdot_release": "0.2.14", + "cdot_filename": "cdot-0.2.14.GCF_000001405.39_GRCh38.p13_genomic.109.20211119.gff.json.gz", + "ensembl_release": "release-109", + "ensembl_token": "GRCh38" + } +} diff --git a/tests/strucvars/query/mehari-tx-scripts/download.sh b/tests/strucvars/query/mehari-tx-scripts/download.sh new file mode 100644 index 00000000..73a1fb45 --- /dev/null +++ b/tests/strucvars/query/mehari-tx-scripts/download.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +set -euo pipefail + +export DATA_DIR=${DATA_DIR-/tmp/mehari-data} +export SEQREPO_ROOT_DIR=$DATA_DIR/seqrepo + +set -x + +mkdir -p $DATA_DIR/tmp/$GENOME_RELEASE +cd $DATA_DIR/tmp/$GENOME_RELEASE + +wget -q \ + https://ftp.ensembl.org/pub/$ENSEMBL_RELEASE/fasta/homo_sapiens/cdna/Homo_sapiens.$ENSEMBL_TOKEN.cdna.all.fa.gz +wget -q \ + https://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/human.{1..12}.rna.fna.gz \ + https://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/human.files.installed +wget -q\ + https://github.com/SACGF/cdot/releases/download/v$CDOT_RELEASE/$CDOT_FILENAME diff --git a/tests/strucvars/query/mehari-tx-scripts/fix-ncbi.sh b/tests/strucvars/query/mehari-tx-scripts/fix-ncbi.sh new file mode 100644 index 00000000..48e0e9ae --- /dev/null +++ b/tests/strucvars/query/mehari-tx-scripts/fix-ncbi.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +# cf. http://redsymbol.net/articles/unofficial-bash-strict-mode/ +set -euo pipefail +IFS=$'\n\t' + +set -x + +mkdir -p $DATA_DIR/for-fix + +{ grep "because it has no sequence" $DATA_DIR/pass-1/txs.bin.zst.report || true; } \ +| cut -f 2 \ +> $DATA_DIR/for-fix/missing.txt + +wc -l $DATA_DIR/for-fix/missing.txt + +split -l 20 $DATA_DIR/for-fix/missing.txt $DATA_DIR/for-fix/missing.txt_ + +if [[ ! -e $(echo $DATA_DIR/for-fix/missing.txt_* | tr ' ' '\n' | head -n 1) ]]; then + # nothing to do + touch $DATA_DIR/for-fix/missing.fasta + exit 0 +fi + +for f in $DATA_DIR/for-fix/missing.txt_*; do + esearch -db nucleotide -query "$(cat $f | tr '\n' ' ')" \ + | efetch -format fasta \ + >> $DATA_DIR/for-fix/missing.fasta +done + +2>&1 seqrepo load --instance-name master --namespace RefSeq \ + $DATA_DIR/for-fix/missing.fasta \ +| tail diff --git a/tests/strucvars/query/mehari-tx-scripts/pass-1.sh b/tests/strucvars/query/mehari-tx-scripts/pass-1.sh new file mode 100644 index 00000000..5a7d74cd --- /dev/null +++ b/tests/strucvars/query/mehari-tx-scripts/pass-1.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +# Build mehari transcript protobuf file (pass 1). +# +# Because the NCBI transcripts are not stable, fetching the transcript sequences +# will fail for certain transcripts. This will be apparent from the report file +# written out by this pass. The `fix-seqrepo.sh` script will attempt to work +# around this issue by fetching the sequences directly from NCBI and then `pass-2.sh` +# will work without such failures. + +set -euo pipefail + +export USE_CONDA_ENV=${USE_CONDA_ENV-mehari-data} +export DATA_DIR=${DATA_DIR-/tmp/mehari-data} +export SEQREPO_ROOT_DIR=$DATA_DIR/seqrepo +export PATH=$PATH:/home/runner/micromamba-root/envs/mehari-data/bin + +set -x + +mkdir -p $DATA_DIR/pass-1 + +symbols="$(echo ACSF3 ACSM2B ACSM3 ADHD1 ARL6IP1 ARMC5 BMIQ5 C16orf58 C16orf71 C16orf82 C16orf84 C16orf95 C16orf96 CARHSP1 CASP16P CCDC113 Ccdc78 CDIPT CFDP1 CHDS1 CIAPIN1 CKLF CLUAP1 CMTM2 CCDC135 COTL1 CPNE7 CTRL DCTPP1 DHX38 EMP2 ENKD1 ERAF FAHD1 FAM57B FBRS FOXC2-AS1 GLG1 HBAP1 HIRIP3 HN1L IBD8 IHPS2 ITFG3 KDM8 KIAA0895L LOC124220 LOC81691 LUC7L LYPLA3 MC1R MCOPCT1 METRN MKL2 MPHOSPH6 MT1G MT1X NIP30 NOB1 NOMO1 NPW NUBP2 NUPR1 OGFOD1 PDF PDPR PKDTS PMFBP1 POLR3K PRMT7 PRR35 RPS15A RSL1D1 SHCBP1 SLZ1 SNAI3-AS1 SNORD71 SPSB3 SRCAP TANGO6 TAO2 TBC1D24 TEDC2 TELO2 TMEM112 TMEM8A TNRC6A TSR3 UNKL VAT1L VPS35L WFDC1 ZG16 ZNF23 ZNF200 ZNF263 ZNF629 ZNF843 | tr ' ' '\n')" + +mehari db create txs \ + --path-out $DATA_DIR/pass-1/txs.bin.zst \ + --path-seqrepo-instance $DATA_DIR/seqrepo/master \ + --path-cdot-json $DATA_DIR/tmp/$GENOME_RELEASE/$CDOT_FILENAME \ + --genome-release $GENOME_RELEASE \ + $(for symbol in $symbols; do echo --gene-symbols=$symbol; done) + +# Ensure that the output can be decompressed. +zstd -c -d $DATA_DIR/pass-1/txs.bin.zst > /dev/null diff --git a/tests/strucvars/query/mehari-tx-scripts/pass-2.sh b/tests/strucvars/query/mehari-tx-scripts/pass-2.sh new file mode 100644 index 00000000..7bae24fb --- /dev/null +++ b/tests/strucvars/query/mehari-tx-scripts/pass-2.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +# Also see comment at top of pass-1.sh. + +# cf. http://redsymbol.net/articles/unofficial-bash-strict-mode/ +set -euo pipefail +IFS=$'\n\t' + +set -x + +mkdir -p $DATA_DIR/pass-2 + +symbols="$(echo ACSF3 ACSM2B ACSM3 ADHD1 ARL6IP1 ARMC5 BMIQ5 C16orf58 C16orf71 C16orf82 C16orf84 C16orf95 C16orf96 CARHSP1 CASP16P CCDC113 Ccdc78 CDIPT CFDP1 CHDS1 CIAPIN1 CKLF CLUAP1 CMTM2 CCDC135 COTL1 CPNE7 CTRL DCTPP1 DHX38 EMP2 ENKD1 ERAF FAHD1 FAM57B FBRS FOXC2-AS1 GLG1 HBAP1 HIRIP3 HN1L IBD8 IHPS2 ITFG3 KDM8 KIAA0895L LOC124220 LOC81691 LUC7L LYPLA3 MC1R MCOPCT1 METRN MKL2 MPHOSPH6 MT1G MT1X NIP30 NOB1 NOMO1 NPW NUBP2 NUPR1 OGFOD1 PDF PDPR PKDTS PMFBP1 POLR3K PRMT7 PRR35 RPS15A RSL1D1 SHCBP1 SLZ1 SNAI3-AS1 SNORD71 SPSB3 SRCAP TANGO6 TAO2 TBC1D24 TEDC2 TELO2 TMEM112 TMEM8A TNRC6A TSR3 UNKL VAT1L VPS35L WFDC1 ZG16 ZNF23 ZNF200 ZNF263 ZNF629 ZNF843 | tr ' ' '\n')" + +mehari db create txs \ + --path-out $DATA_DIR/pass-2/txs.bin.zst \ + --path-seqrepo-instance $DATA_DIR/seqrepo/master \ + --path-cdot-json $DATA_DIR/tmp/$GENOME_RELEASE/$CDOT_FILENAME \ + --genome-release $GENOME_RELEASE \ + $(for symbol in $symbols; do echo --gene-symbols=$symbol; done) +cd $DATA_DIR/pass-2 + +# Ensure that the output can be decompressed. +zstd -c -d txs.bin.zst > /dev/null + +sha256sum txs.bin.zst > txs.bin.zst.sha256 +sha256sum txs.bin.zst.report > txs.bin.zst.report.sha256 diff --git a/tests/strucvars/query/mehari-tx-scripts/run.sh b/tests/strucvars/query/mehari-tx-scripts/run.sh new file mode 100644 index 00000000..c1cc7c4e --- /dev/null +++ b/tests/strucvars/query/mehari-tx-scripts/run.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash + +# This script needs the following miniconda environment +# +# mamba create -y \ +# -n varfish-server-worker-query-bootstrap \ +# python=3.8 \ +# entrez-direct=16.2 \ +# biocommons.seqrepo=0.6.5 \ +# mehari=0.11 \ +# htslib=1.17 + +set -x + +# Make the script directory available to all called scribes. +export SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + +# Obtain genome release from environment variables. +export GENOME_RELEASE=${GENOME_RELEASE-grch37} + +# Configure paths. +export DATA_DIR=${DATA_DIR-$HOME/mehari-data-tx} +export SEQREPO_ROOT_DIR=$DATA_DIR/seqrepo + +# Parse configuration into environment variables. +export CDOT_RELEASE=$(jq -r ".$GENOME_RELEASE.cdot_release" config.json) +export CDOT_FILENAME=$(jq -r ".$GENOME_RELEASE.cdot_filename" config.json) +export ENSEMBL_RELEASE=$(jq -r ".$GENOME_RELEASE.ensembl_release" config.json) +export ENSEMBL_TOKEN=$(jq -r ".$GENOME_RELEASE.ensembl_token" config.json) + +# cf. http://redsymbol.net/articles/unofficial-bash-strict-mode/ +set -euo pipefail +IFS=$'\n\t' + +# Cleanup $DATA_DIR if asked to do so. +if [ "${CLEANUP_DATA_DIR-false}" == true ]; then + rm -rf $DATA_DIR +fi + +# Run the individual steps. +# bash -x $SCRIPT_DIR/download.sh +# bash -x $SCRIPT_DIR/seqrepo.sh +bash -x $SCRIPT_DIR/pass-1.sh +bash -x $SCRIPT_DIR/fix-ncbi.sh +bash -x $SCRIPT_DIR/pass-2.sh diff --git a/tests/strucvars/query/mehari-tx-scripts/seqrepo.sh b/tests/strucvars/query/mehari-tx-scripts/seqrepo.sh new file mode 100644 index 00000000..1dcbf2e1 --- /dev/null +++ b/tests/strucvars/query/mehari-tx-scripts/seqrepo.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +# cf. http://redsymbol.net/articles/unofficial-bash-strict-mode/ +set -euo pipefail +IFS=$'\n\t' + +set -x + +mkdir -p $SEQREPO_ROOT_DIR +seqrepo init -i master + +# seqrepo load is too verbose and we cannot silence it +2>&1 seqrepo \ + load --instance-name master --namespace ENSEMBL \ + $DATA_DIR/tmp/$GENOME_RELEASE/Homo_sapiens.$ENSEMBL_TOKEN.cdna.all.fa.gz \ +| tail + +# seqrepo load is too verbose and we cannot silence it +2>&1 seqrepo \ + load --instance-name master --namespace RefSeq \ + $DATA_DIR/tmp/$GENOME_RELEASE/human.{1..12}.rna.fna.gz \ +| tail diff --git a/tests/strucvars/query/mehari-tx-scripts/sys b/tests/strucvars/query/mehari-tx-scripts/sys new file mode 100644 index 00000000..e69de29b