Skip to content

Commit

Permalink
feat: adding "seqvars aggregate" command (#211)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe committed Oct 9, 2023
1 parent 984dc8b commit 718aff5
Show file tree
Hide file tree
Showing 6 changed files with 134 additions and 0 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ rocksdb = { version = "0.21.0", features = ["multi-threaded-cf"] }
noodles-bgzf = "0.24.0"
rand = "0.8"
rand_core = "0.6"
rocksdb-utils-lookup = "0.3.0"

[build-dependencies]
prost-build = "0.12"
Expand Down
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ At the moment, the following sub commands exist:
- `seqvars` -- subcommands for processing sequence (aka small/SNV/indel) variants
- `seqvars ingest` -- convert single VCF file into internal format for use with `seqvars query`
- `seqvars prefilter` -- limit the result of `seqvars prefilter` by population frequency and/or distance to exon
- `seqvars aggregate` -- read through multiple VCF files written by `seqvars ingest` and computes a carrier counts table.
- `seqvars query` -- perform sequence variant filtration and on-the-fly annotation
- `strucvars` -- subcommands for processing structural (aka large variants, CNVs, etc.) variants
- `strucvars ingest` -- convert one or more structural variant files for use with `strucvars query`
Expand Down Expand Up @@ -161,6 +162,19 @@ $ varfish-server-worker strucvars prefilter \
--path-input INPUT.vcf \
--params @path/to/params.json \
[--params ...] \
## The `seqvars aggregate` Command
This command reads through multiple files written by `seqvars ingest` and computes a in-house carrier counts table.
You can specify the VCF files individually on the command line or pass in files that have paths to the VCF files line by line.
The resulting table is a folder to a RocksDB database.
```shell session
varfish-server-worker seqvars aggregate \
--genome-build {grch37,grch38} \
--path-out-rocksdb rocksdb/folder \
--path-in-vcf path/to/vcf.gz \
--path-in-vcf @path/to/file/list.txt
```

## The `strucvars ingest` Command
Expand Down
1 change: 1 addition & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ struct Seqvars {
/// Enum supporting the parsing of "sv *" sub commands.
#[derive(Debug, Subcommand)]
enum SeqvarsCommands {
Aggregte(seqvars::aggregate::Args),
Ingest(seqvars::ingest::Args),
Prefilter(seqvars::prefilter::Args),
}
Expand Down
116 changes: 116 additions & 0 deletions src/seqvars/aggregate/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
//! Implementation of `seqvars aggregate` subcommand.

use std::sync::Arc;

use crate::common;

/// Command line arguments for `seqvars aggregate` subcommand.
#[derive(Debug, clap::Parser)]
#[command(author, version, about = "ingest sequence variant VCF", long_about = None)]
pub struct Args {
/// The assumed genome build.
#[clap(long)]
pub genomebuild: crate::common::GenomeRelease,
/// Path to the output RocksDB.
#[clap(long)]
pub path_out_rocksdb: String,
/// Path to input VCF file(s).
#[clap(long)]
pub path_input: Vec<String>,

/// Column family name for the count data.
#[clap(long, default_value = "counts")]
pub cf_counts: String,
/// Column family name for the carrier UUID data.
#[clap(long, default_value = "carriers")]
pub cf_carriers: String,

/// Optional path to RocksDB WAL directory.
#[arg(long)]
pub path_wal_dir: Option<String>,
}

/// Perform import of VCF files.
fn vcf_import(
db: &rocksdb::DBWithThreadMode<rocksdb::MultiThreaded>,
path_input: &[&str],
cf_counts: &str,
cf_carriers: &str,
) -> Result<(), anyhow::Error> {
Ok(())
}

/// Main entry point for `seqvars aggregate` sub command.
pub fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Error> {
let before_anything = std::time::Instant::now();
tracing::info!("args_common = {:#?}", &args_common);
tracing::info!("args = {:#?}", &args);

common::trace_rss_now();

// Build path of all input files to read, read through files given by `@path`.
let path_input = args
.path_input
.iter()
.map(|path| {
if path.starts_with("@") {
std::fs::read_to_string(path.trim_start_matches('@'))
.expect("checked above")
.lines()
.map(|line| line.trim())
.filter(|line| !line.is_empty())
.map(|line| line.to_string())
.collect::<Vec<_>>()
} else {
vec![path.clone()]
}
})
.flatten()
.collect::<Vec<_>>();

tracing::info!("Opening RocksDB...");
let options = rocksdb_utils_lookup::tune_options(
rocksdb::Options::default(),
args.path_wal_dir.as_ref().map(|s| s.as_ref()),
);
let cf_names = &["meta", &args.cf_counts, &args.cf_carriers];
let db = Arc::new(rocksdb::DB::open_cf_with_opts(
&options,
&args.path_out_rocksdb,
cf_names
.iter()
.map(|name| (name.to_string(), options.clone()))
.collect::<Vec<_>>(),
)?);
tracing::info!(" writing meta information");
let cf_meta = db.cf_handle("meta").unwrap();
db.put_cf(&cf_meta, "varfish-worker-version", common::worker_version())?;
db.put_cf(&cf_meta, "db-name", "seqvars-aggregation")?;
tracing::info!("... done opening RocksDB");

tracing::info!("Importing VCF files ...");
let before_import = std::time::Instant::now();
let paths = path_input.iter().map(|s| s.as_ref()).collect::<Vec<_>>();
vcf_import(&db, &paths, &args.cf_counts, &args.cf_carriers)?;
tracing::info!(
"... done importing VCF files in {:?}",
before_import.elapsed()
);

tracing::info!("Running RocksDB compaction ...");
let before_compaction = std::time::Instant::now();
rocksdb_utils_lookup::force_compaction_cf(&db, cf_names, Some(" "), true)?;
tracing::info!(
"... done compacting RocksDB in {:?}",
before_compaction.elapsed()
);

tracing::info!(
"All of `seqvars aggregate` completed in {:?}",
before_anything.elapsed()
);
Ok(())
}

#[cfg(test)]
mod test {}
1 change: 1 addition & 0 deletions src/seqvars/mod.rs
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
pub mod aggregate;
pub mod ingest;
pub mod prefilter;

0 comments on commit 718aff5

Please sign in to comment.