Skip to content

Commit

Permalink
feat: adding "seqvars aggregate" command (#211) (#214)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Oct 10, 2023
1 parent 984dc8b commit d6db0b7
Show file tree
Hide file tree
Showing 37 changed files with 1,610 additions and 556 deletions.
3 changes: 3 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ rocksdb = { version = "0.21.0", features = ["multi-threaded-cf"] }
noodles-bgzf = "0.24.0"
rand = "0.8"
rand_core = "0.6"
rocksdb-utils-lookup = "0.3.0"
rayon = "1.8.0"
byteorder = { version = "1.5.0", features = ["i128"] }

[build-dependencies]
prost-build = "0.12"
Expand Down
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ At the moment, the following sub commands exist:
- `seqvars` -- subcommands for processing sequence (aka small/SNV/indel) variants
- `seqvars ingest` -- convert single VCF file into internal format for use with `seqvars query`
- `seqvars prefilter` -- limit the result of `seqvars prefilter` by population frequency and/or distance to exon
- `seqvars aggregate` -- read through multiple VCF files written by `seqvars ingest` and computes a carrier counts table.
- `seqvars query` -- perform sequence variant filtration and on-the-fly annotation
- `strucvars` -- subcommands for processing structural (aka large variants, CNVs, etc.) variants
- `strucvars ingest` -- convert one or more structural variant files for use with `strucvars query`
Expand Down Expand Up @@ -119,6 +120,7 @@ Overall, the command will emit the following header rows in addition to the `##c
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##x-varfish-case-uuid=d2bad2ec-a75d-44b9-bd0a-83a3f1331b7c
##x-varfish-version=<ID=varfish-server-worker,Version=x.y.z>
##x-varfish-version=<ID=orig-caller,Name=Dragen,Version=SW: 07.021.624.3.10.9, HW: 07.021.624>
##x-varfish-version=<ID=orig-caller,Name=GatkHaplotypeCaller,Version=4.4.0.0>
Expand Down Expand Up @@ -161,6 +163,19 @@ $ varfish-server-worker strucvars prefilter \
--path-input INPUT.vcf \
--params @path/to/params.json \
[--params ...] \
## The `seqvars aggregate` Command
This command reads through multiple files written by `seqvars ingest` and computes a in-house carrier counts table.
You can specify the VCF files individually on the command line or pass in files that have paths to the VCF files line by line.
The resulting table is a folder to a RocksDB database.
```shell session
varfish-server-worker seqvars aggregate \
--genome-build {grch37,grch38} \
--path-out-rocksdb rocksdb/folder \
--path-in-vcf path/to/vcf.gz \
--path-in-vcf @path/to/file/list.txt
```

## The `strucvars ingest` Command
Expand Down
4 changes: 4 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ struct Seqvars {
/// Enum supporting the parsing of "sv *" sub commands.
#[derive(Debug, Subcommand)]
enum SeqvarsCommands {
Aggregate(seqvars::aggregate::Args),
Ingest(seqvars::ingest::Args),
Prefilter(seqvars::prefilter::Args),
}
Expand Down Expand Up @@ -119,6 +120,9 @@ fn main() -> Result<(), anyhow::Error> {
}
},
Commands::Seqvars(seqvars) => match &seqvars.command {
SeqvarsCommands::Aggregate(args) => {
seqvars::aggregate::run(&cli.common, args)?;
}
SeqvarsCommands::Ingest(args) => {
seqvars::ingest::run(&cli.common, args)?;
}
Expand Down
190 changes: 190 additions & 0 deletions src/seqvars/aggregate/ds.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
//! Module with datastructures stored in RocksDB.

use byteorder::{ByteOrder, LittleEndian};

/// Genotype counts.
#[derive(Debug, Default, Clone)]
pub struct Counts {
/// Number of alleles in samples.
pub count_an: u32,
/// Number of het. carriers.
pub count_het: u32,
/// Number of hom. carriers.
pub count_hom: u32,
/// Number of hemi. carriers.
pub count_hemi: u32,
}

impl Counts {
/// Convert to a byte vector.
pub fn to_vec(&self) -> Vec<u8> {
let mut buf = Vec::with_capacity(16);
buf.extend_from_slice(&self.count_an.to_le_bytes());
buf.extend_from_slice(&self.count_het.to_le_bytes());
buf.extend_from_slice(&self.count_hom.to_le_bytes());
buf.extend_from_slice(&self.count_hemi.to_le_bytes());
buf
}

/// Convert from a byte vector.
pub fn from_vec(buf: &[u8]) -> Self {
Self {
count_an: LittleEndian::read_u32(&buf[0..4]),
count_het: LittleEndian::read_u32(&buf[4..8]),
count_hom: LittleEndian::read_u32(&buf[8..12]),
count_hemi: LittleEndian::read_u32(&buf[12..16]),
}
}

/// Aggregate other into self.
pub fn aggregate(&mut self, other: Self) {
self.count_an += other.count_an;
self.count_het += other.count_het;
self.count_hom += other.count_hom;
self.count_hemi += other.count_hemi;
}
}

/// Genotype in a carrier.
#[derive(Debug, Default, Clone, Copy, PartialOrd, Ord, PartialEq, Eq)]
pub enum Genotype {
#[default]
HomRef, // same as hemizygous reference
Het,
HomAlt,
HemiAlt,
}

impl Genotype {
/// Convert to a byte.
pub fn to_byte(self) -> u8 {
match self {
Genotype::HomRef => 0,
Genotype::Het => 1,
Genotype::HomAlt => 2,
Genotype::HemiAlt => 3,
}
}

/// Convert from a byte.
pub fn from_byte(byte: u8) -> Self {
match byte {
0 => Genotype::HomRef,
1 => Genotype::Het,
2 => Genotype::HomAlt,
3 => Genotype::HemiAlt,
_ => panic!("invalid genotype byte"),
}
}
}

/// Store one carrier by UUID and index in the pedigree.
#[derive(Debug, Default, Clone, PartialOrd, Ord, PartialEq, Eq)]
pub struct Carrier {
/// Case UUID.
pub uuid: uuid::Uuid,
/// Index of the carrier in the pedigree.
pub index: u8,
/// Genotype of the carrier.
pub genotype: Genotype,
}

/// Carrier UUIDs.
///
/// We store the UUIDs serialized as byte vectors and the index of the carrier in the pedigree.
#[derive(Debug, Default, Clone)]
pub struct CarrierList {
/// List of carrier UUIDs.
pub carriers: Vec<Carrier>,
}

impl CarrierList {
/// Convert to a byte vector.
pub fn to_vec(&self) -> Vec<u8> {
let mut buf = Vec::with_capacity(2 + 18 * self.carriers.len());
buf.extend_from_slice(&(self.carriers.len() as u16).to_le_bytes());
for carrier in &self.carriers {
buf.extend_from_slice(&carrier.uuid.as_u128().to_le_bytes());
buf.push(carrier.index);
buf.push(carrier.genotype.to_byte());
}
buf
}

/// Convert from a byte vector.
pub fn from_vec(buf: &[u8]) -> Self {
let mut carriers = Vec::with_capacity((buf.len() - 2) / 18);
let num_carriers = LittleEndian::read_u16(&buf[0..2]) as usize;
for i in 0..num_carriers {
let uuid =
uuid::Uuid::from_u128(LittleEndian::read_u128(&buf[2 + 18 * i..2 + 18 * i + 16]));
let index = buf[2 + 18 * i + 16];
let genotype = Genotype::from_byte(buf[2 + 18 * i + 17]);
carriers.push(Carrier {
uuid,
index,
genotype,
});
}
Self { carriers }
}

/// Order the carriers by UUID.
pub fn sort(&mut self) {
self.carriers.sort();
}

/// Aggregate other into self.
pub fn aggregate(&mut self, mut other: Self) {
self.carriers.append(&mut other.carriers);
self.carriers.sort();
self.carriers.dedup();
}
}

#[cfg(test)]
mod test {
use super::*;

#[test]
fn counts() {
let counts = Counts {
count_an: 1,
count_het: 2,
count_hom: 3,
count_hemi: 4,
};

let buf = counts.to_vec();
insta::assert_debug_snapshot!(&buf);
assert_eq!(buf.len(), 16);

let counts2 = Counts::from_vec(&buf);
insta::assert_debug_snapshot!(&counts2);
}

#[test]
fn carrier_list() {
let carrier_list = CarrierList {
carriers: vec![
Carrier {
uuid: uuid::Uuid::parse_str("00000000-0000-0000-0000-000000000000").unwrap(),
index: 0,
genotype: Genotype::HomRef,
},
Carrier {
uuid: uuid::Uuid::parse_str("00000000-0000-0000-0000-000000000001").unwrap(),
index: 1,
genotype: Genotype::HemiAlt,
},
],
};

let buf = carrier_list.to_vec();
insta::assert_debug_snapshot!(&buf);
assert_eq!(buf.len(), 38);

let carrier_list2 = CarrierList::from_vec(&buf);
insta::assert_debug_snapshot!(&carrier_list2);
}
}
Loading

0 comments on commit d6db0b7

Please sign in to comment.