
Apply filters and group-by logic to a metadata iterator #750

Merged
merged 12 commits into from
Aug 12, 2021
1,487 changes: 1,047 additions & 440 deletions augur/filter.py

Large diffs are not rendered by default.

49 changes: 46 additions & 3 deletions augur/index.py
@@ -9,14 +9,50 @@
import csv

from .io import open_file, read_sequences
from .utils import is_vcf, read_vcf


def register_arguments(parser):
parser.add_argument('--sequences', '-s', required=True, help="sequences in fasta format")
parser.add_argument('--sequences', '-s', required=True, help="sequences in FASTA or VCF formats. Augur will summarize the content of FASTA sequences and only report the names of strains found in a given VCF.")
parser.add_argument('--output', '-o', help="tab-delimited file containing the number of bases per sequence in the given file. Output columns include strain, length, and counts for A, C, G, T, N, other valid IUPAC characters, ambiguous characters ('?' and '-'), and other invalid characters.", required=True)
parser.add_argument('--verbose', '-v', action="store_true", help="print index statistics to stdout")


def index_vcf(vcf_path, index_path):
"""Create an index with a list of strain names from a given VCF. We do not
calculate any statistics for VCFs.

Parameters
----------
vcf_path : str or Path-like
path to a VCF file to index.
index_path : str or Path-like
path to a tab-delimited index file where the strain names from the given
VCF will be written.

Returns
-------
int :
number of strains indexed

"""
strains, _ = read_vcf(vcf_path)
num_of_seqs = 0

with open_file(index_path, 'wt') as out_file:
tsv_writer = csv.writer(out_file, delimiter='\t')

# Write the header to the output file.
header = ['strain']
tsv_writer.writerow(header)

for record in strains:
tsv_writer.writerow([record])
num_of_seqs += 1

return num_of_seqs
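
As an illustration only (not part of this diff), here is a minimal sketch of calling the new index_vcf helper on a tiny, hypothetical VCF; the file names, sample names, and expected count are assumptions and rely on the augur code from this PR.

from augur.index import index_vcf

# Write a tiny, hypothetical VCF whose #CHROM header line names two samples.
with open("example.vcf", "w") as handle:
    handle.write("##fileformat=VCFv4.2\n")
    handle.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE_A\tSAMPLE_B\n")

# index_vcf writes a one-column index ("strain") to the output path and
# returns the number of strains found in the VCF header.
num_strains = index_vcf("example.vcf", "example_index.tsv")
print(num_strains)  # expected: 2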


def index_sequence(sequence, values):
"""Count the number of nucleotides for a given sequence record.

@@ -165,11 +201,18 @@ def run(args):
the composition as a data frame to the given sequence index path.
'''
try:
num_of_seqs, tot_length = index_sequences(args.sequences, args.output)
if is_vcf(args.sequences):
num_of_seqs = index_vcf(args.sequences, args.output)
tot_length = None
else:
num_of_seqs, tot_length = index_sequences(args.sequences, args.output)
except ValueError as error:
print("ERROR: Problem reading in {}:".format(sequences_path), file=sys.stderr)
print(error, file=sys.stderr)
return 1

if args.verbose:
print("Analysed %i sequences with an average length of %i nucleotides." % (num_of_seqs, int(tot_length / num_of_seqs)))
if tot_length:
print("Analysed %i sequences with an average length of %i nucleotides." % (num_of_seqs, int(tot_length / num_of_seqs)))
else:
print("Analysed %i sequences" % num_of_seqs)
61 changes: 61 additions & 0 deletions augur/io.py
@@ -4,6 +4,7 @@
import Bio.SeqIO
import Bio.SeqRecord
from contextlib import contextmanager
import pandas as pd
from pathlib import Path
from xopen import xopen

@@ -35,6 +36,66 @@ def open_file(path_or_buffer, mode="r", **kwargs):
yield path_or_buffer


def read_metadata(metadata_file, valid_index_cols=("strain", "name"), **kwargs):
"""Read metadata from a given filename and into a pandas `DataFrame` or
`TextFileReader` object.

Parameters
----------
metadata_file : str
Path to a metadata file to load.
valid_index_cols : list[str]
List of possible index column names to check for, ordered by priority.
kwargs : dict
Keyword arguments to pass through to pandas.read_csv.

Returns
-------
pandas.DataFrame or pandas.TextFileReader

Raises
------
KeyError :
When the metadata file does not have any valid index columns.

"""
default_kwargs = {
"sep": None,
"engine": "python",
"skipinitialspace": True,
"dtype": {
"strain": "string",
"name": "string,"
}
}
default_kwargs.update(kwargs)

# Inspect the first chunk of the metadata to find any valid index columns.
chunk = pd.read_csv(
metadata_file,
iterator=True,
**default_kwargs,
).read(nrows=1)

index_cols = [
valid_index_col
for valid_index_col in valid_index_cols
if valid_index_col in chunk.columns
]

# If we couldn't find a valid index column in the metadata, alert the user.
if len(index_cols) == 0:
raise KeyError(f"Could not find any valid index columns from the list of possible columns ({valid_index_cols})")
else:
index_col = index_cols[0]

default_kwargs["index_col"] = index_col
return pd.read_csv(
metadata_file,
**default_kwargs
)
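
For illustration (not part of this diff), a small usage sketch of read_metadata, assuming the augur code from this PR and a made-up metadata file that contains a valid "strain" index column.

from augur.io import read_metadata

# Hypothetical tab-delimited metadata with a "strain" column.
with open("metadata.tsv", "w") as handle:
    handle.write("strain\tdate\tcountry\n")
    handle.write("sample_A\t2016-02-01\tColombia\n")

# read_metadata finds "strain" in the header and uses it as the index.
metadata = read_metadata("metadata.tsv")
print(metadata.index.name)                  # "strain"
print(metadata.loc["sample_A", "country"])  # "Colombia"

Because extra keyword arguments are passed through to pandas.read_csv, passing e.g. chunksize should yield a TextFileReader for chunked iteration instead of a single DataFrame.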


def read_sequences(*paths, format="fasta"):
"""Read sequences from one or more paths.

21 changes: 19 additions & 2 deletions augur/utils.py
@@ -27,17 +27,34 @@ class AugurException(Exception):
pass


def is_vcf(fname):
def is_vcf(filename):
"""Convenience method to check if a file is a vcf file.

>>> is_vcf(None)
False
>>> is_vcf("./foo")
False
>>> is_vcf("./foo.vcf")
True
>>> is_vcf("./foo.vcf.GZ")
True
"""
return fname.lower().endswith(".vcf") or fname.lower().endswith(".vcf.gz")
return bool(filename) and any(filename.lower().endswith(x) for x in ('.vcf', '.vcf.gz'))

def read_vcf(filename):
if filename.lower().endswith(".gz"):
import gzip
file = gzip.open(filename, mode="rt", encoding='utf-8')
else:
file = open(filename, encoding='utf-8')

chrom_line = next(line for line in file if line.startswith("#C"))
file.close()
headers = chrom_line.strip().split("\t")
sequences = headers[headers.index("FORMAT") + 1:]

# Return an independent copy as the second element, because callers need a
# separate list of 'sequences to remove' when working with VCFs.
return sequences, sequences.copy()
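
For reference (illustrative only, assuming the code above), read_vcf pulls the sample names from the #CHROM header line and returns them along with an independent copy; the file name and sample names below are made up.

from augur.utils import read_vcf

# Hypothetical VCF; only the #CHROM header line matters to read_vcf.
with open("example.vcf", "w") as handle:
    handle.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE_A\tSAMPLE_B\n")

sequences, sequences_to_remove = read_vcf("example.vcf")
print(sequences)                             # ['SAMPLE_A', 'SAMPLE_B']
print(sequences_to_remove is not sequences)  # True: a separate list object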

def myopen(fname, mode):
if fname.endswith('.gz'):
Expand Down
44 changes: 29 additions & 15 deletions tests/functional/filter.t
@@ -27,21 +27,21 @@ Filter with subsampling, requesting 1 sequence per group (for a group with 3 dis
\s*3 .* (re)
$ rm -f "$TMP/filtered_strains.txt"

Filter with subsampling, requesting no more than 10 sequences.
With 10 groups to subsample from, this should produce one sequence per group.
Filter with subsampling, requesting no more than 8 sequences.
With 8 groups to subsample from (after filtering), this should produce one sequence per group.

$ ${AUGUR} filter \
> --sequences filter/sequences.fasta \
> --sequence-index filter/sequence_index.tsv \
> --metadata filter/metadata.tsv \
> --min-date 2012 \
> --group-by country year month \
> --subsample-max-sequences 10 \
> --subsample-max-sequences 8 \
> --subsample-seed 314159 \
> --no-probabilistic-sampling \
> --output "$TMP/filtered.fasta" > /dev/null
$ grep ">" "$TMP/filtered.fasta" | wc -l
\s*10 (re)
\s*8 (re)
$ rm -f "$TMP/filtered.fasta"

Filter with subsampling where no more than 5 sequences are requested and no groups are specified.
@@ -242,6 +242,7 @@ Repeat with sequence and strain outputs. We should get the same results.
> --max-date 2020-01-30 \
> --output-strains "$TMP/filtered_strains.txt" \
> --output-sequences "$TMP/filtered.fasta" > /dev/null
Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`.
ERROR: All samples have been dropped! Check filter rules and metadata file format.
[1]
$ wc -l "$TMP/filtered_strains.txt"
@@ -257,8 +258,8 @@ Filter TB strains from VCF and save as a list of filtered strains.
> --sequences filter/tb.vcf.gz \
> --metadata filter/tb_metadata.tsv \
> --min-date 2012 \
> --min-length 10500 \
> --output-strains "$TMP/filtered_strains.txt" > /dev/null
Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`.
$ wc -l "$TMP/filtered_strains.txt"
\s*3 .* (re)
$ rm -f "$TMP/filtered_strains.txt"
@@ -269,22 +270,35 @@ The metadata are missing one strain that has a sequence.
The list of strains to include has one strain with no metadata/sequence and one strain with information that would have been filtered by country.
The query initially filters 3 strains from Colombia, one of which is added back by the include.

$ echo "NotReal" > "$TMP/include.txt"
$ echo "COL/FLR_00008/2015" >> "$TMP/include.txt"
$ ${AUGUR} filter \
> --sequence-index filter/sequence_index.tsv \
> --metadata filter/metadata.tsv \
> --query "country != 'Colombia'" \
> --include "$TMP/include.txt" \
> --output-strains "$TMP/filtered_strains.txt"
> --non-nucleotide \
> --exclude-ambiguous-dates-by year \
> --include filter/include.txt \
> --output-strains "$TMP/filtered_strains.txt" \
> --output-log "$TMP/filtered_log.tsv"
4 strains were dropped during filtering
\t1 had no sequence data (esc)
\t1 had no metadata (esc)
\t3 of these were filtered out by the query: (esc)
\t\t"country != 'Colombia'" (esc)
(esc)
\t1 strains were added back because they were requested by include files (esc)
\t1 strains from include files were not added because they lacked sequence or metadata (esc)
\t1 had no sequence data (esc)
\t3 of these were filtered out by the query: "country != 'Colombia'" (esc)
\t1 strains were added back because they were in filter/include.txt (esc)
8 strains passed all filters

$ diff -u <(sort -k 1,1 filter/filtered_log.tsv) <(sort -k 1,1 "$TMP/filtered_log.tsv")
$ rm -f "$TMP/filtered_strains.txt"

Subsample one strain per year with priorities.
There are two years (2015 and 2016) represented in the metadata.
The two highest priority strains are in these two years.

$ ${AUGUR} filter \
> --metadata filter/metadata.tsv \
> --group-by year \
> --priority filter/priorities.tsv \
> --sequences-per-group 1 \
> --output-strains "$TMP/filtered_strains.txt" > /dev/null

$ diff -u <(sort -k 2,2rn -k 1,1 filter/priorities.tsv | head -n 2 | cut -f 1) <(sort -k 1,1 "$TMP/filtered_strains.txt")
$ rm -f "$TMP/filtered_strains.txt"
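
To make the expected result of this test concrete, here is a rough pandas sketch of the sampling rule it exercises: one strain per year, keeping the highest-priority strain in each group. The year assignments and column names are assumptions for illustration, and this is not augur filter's internal implementation.

import pandas as pd

# Hypothetical metadata (years inferred from strain names) and a subset of the
# priorities from the new filter/priorities.tsv fixture in this PR.
metadata = pd.DataFrame({
    "strain": ["COL/FLR_00024/2015", "COL/FLR_00008/2015", "ZKC2/2016", "VEN/UF_1/2016"],
    "year": [2015, 2015, 2016, 2016],
})
priorities = {
    "COL/FLR_00024/2015": 100,
    "COL/FLR_00008/2015": 10,
    "ZKC2/2016": 100,
    "VEN/UF_1/2016": 50,
}

# For each year, keep the single strain with the highest priority.
metadata["priority"] = metadata["strain"].map(priorities)
selected = (
    metadata.sort_values("priority", ascending=False)
            .groupby("year")
            .head(1)
)
print(sorted(selected["strain"]))  # ['COL/FLR_00024/2015', 'ZKC2/2016']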
6 changes: 6 additions & 0 deletions tests/functional/filter/filtered_log.tsv
@@ -0,0 +1,6 @@
strain filter kwargs
HND/2016/HU_ME59 filter_by_sequence_index []
COL/FLR_00008/2015 filter_by_query "[[""query"", ""country != 'Colombia'""]]"
Colombia/2016/ZC204Se filter_by_query "[[""query"", ""country != 'Colombia'""]]"
COL/FLR_00024/2015 filter_by_query "[[""query"", ""country != 'Colombia'""]]"
COL/FLR_00008/2015 include "[[""include_file"", ""filter/include.txt""]]"
2 changes: 2 additions & 0 deletions tests/functional/filter/include.txt
@@ -0,0 +1,2 @@
NotReal
COL/FLR_00008/2015
6 changes: 6 additions & 0 deletions tests/functional/filter/priorities.tsv
@@ -0,0 +1,6 @@
COL/FLR_00024/2015 100
PRVABC59 50
COL/FLR_00008/2015 10
Colombia/2016/ZC204Se 10
ZKC2/2016 100
VEN/UF_1/2016 50