Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Apply filters and group-by logic to a metadata iterator #750

Merged
merged 12 commits into from
Aug 12, 2021
1,505 changes: 1,065 additions & 440 deletions augur/filter.py

Large diffs are not rendered by default.

49 changes: 46 additions & 3 deletions augur/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,50 @@
import csv

from .io import open_file, read_sequences
from .utils import is_vcf, read_vcf


def register_arguments(parser):
    """Add the `augur index` command-line arguments to the given argparse parser.

    Parameters
    ----------
    parser : argparse.ArgumentParser
        parser to register arguments with; modified in place.
    """
    # Note: the pasted diff contained both the old and new --sequences lines;
    # registering the same flag twice raises an argparse conflict, so only the
    # new (VCF-aware) registration is kept here.
    parser.add_argument('--sequences', '-s', required=True, help="sequences in FASTA or VCF formats. Augur will summarize the content of FASTA sequences and only report the names of strains found in a given VCF.")
    parser.add_argument('--output', '-o', help="tab-delimited file containing the number of bases per sequence in the given file. Output columns include strain, length, and counts for A, C, G, T, N, other valid IUPAC characters, ambiguous characters ('?' and '-'), and other invalid characters.", required=True)
    parser.add_argument('--verbose', '-v', action="store_true", help="print index statistics to stdout")

def index_vcf(vcf_path, index_path):
    """Create an index with a list of strain names from a given VCF. We do not
    calculate any statistics for VCFs.

    Parameters
    ----------
    vcf_path : str or Path-like
        path to a VCF file to index.
    index_path : str or Path-like
        path to a tab-delimited file containing the composition details for each
        sequence in the given input file.

    Returns
    -------
    int :
        number of strains indexed

    """
    # read_vcf returns the strain names twice; only the first list is needed.
    strains, _ = read_vcf(vcf_path)
    num_of_seqs = 0

    with open_file(index_path, 'wt') as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')

        # Write the header in the output file. A VCF index only has strain
        # names, since no per-base composition statistics are calculated.
        tsv_writer.writerow(['strain'])

        for record in strains:
            tsv_writer.writerow([record])
            num_of_seqs += 1

    return num_of_seqs


def index_sequence(sequence, values):
"""Count the number of nucleotides for a given sequence record.

def run(args):
    '''
    Index the given sequence file (FASTA or VCF) and write
    the composition as a data frame to the given sequence index path.
    '''
    try:
        # VCF inputs only get a list of strain names; FASTA inputs get full
        # per-base composition statistics.
        if is_vcf(args.sequences):
            num_of_seqs = index_vcf(args.sequences, args.output)
            tot_length = None
        else:
            num_of_seqs, tot_length = index_sequences(args.sequences, args.output)
    except ValueError as error:
        # Bug fix: the original referenced an undefined name `sequences_path`
        # here, which would raise a NameError while reporting the real error.
        print("ERROR: Problem reading in {}:".format(args.sequences), file=sys.stderr)
        print(error, file=sys.stderr)
        return 1

    if args.verbose:
        # tot_length is None for VCFs, so no average length can be reported.
        if tot_length:
            print("Analysed %i sequences with an average length of %i nucleotides." % (num_of_seqs, int(tot_length / num_of_seqs)))
        else:
            print("Analysed %i sequences" % num_of_seqs)
86 changes: 86 additions & 0 deletions augur/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import Bio.SeqIO
import Bio.SeqRecord
from contextlib import contextmanager
import pandas as pd
from pathlib import Path
from xopen import xopen

Expand Down Expand Up @@ -35,6 +36,91 @@ def open_file(path_or_buffer, mode="r", **kwargs):
yield path_or_buffer


def read_metadata(metadata_file, id_columns=("strain", "name"), chunk_size=None):
    """Read metadata from a given filename and into a pandas `DataFrame` or
    `TextFileReader` object.

    Parameters
    ----------
    metadata_file : str
        Path to a metadata file to load.
    id_columns : list[str]
        List of possible id column names to check for, ordered by priority.
    chunk_size : int
        Size of chunks to stream from disk with an iterator instead of loading the entire input file into memory.

    Returns
    -------
    pandas.DataFrame or pandas.TextFileReader

    Raises
    ------
    Exception :
        When the metadata file does not have any valid index columns.
        (Doc fix: the original docstring claimed ``KeyError``, but the code
        raises a plain ``Exception`` — see the doctest below.)

    For standard use, request a metadata file and get a pandas DataFrame.

    >>> read_metadata("tests/functional/filter/metadata.tsv").index.values[0]
    'COL/FLR_00024/2015'

    Requesting an index column that doesn't exist should produce an error.

    >>> read_metadata("tests/functional/filter/metadata.tsv", id_columns=("Virus name",))
    Traceback (most recent call last):
      ...
    Exception: None of the possible id columns (('Virus name',)) were found in the metadata's columns ('strain', 'virus', 'accession', 'date', 'region', 'country', 'division', 'city', 'db', 'segment', 'authors', 'url', 'title', 'journal', 'paper_url')

    We also allow iterating through metadata in fixed chunk sizes.

    >>> for chunk in read_metadata("tests/functional/filter/metadata.tsv", chunk_size=5):
    ...     print(chunk.shape)
    ...
    (5, 14)
    (5, 14)
    (1, 14)

    """
    kwargs = {
        # Sniff the delimiter (tab, comma, etc.) from the file; this requires
        # the slower Python parsing engine.
        "sep": None,
        "engine": "python",
        "skipinitialspace": True,
    }

    if chunk_size:
        kwargs["chunksize"] = chunk_size

    # Inspect the first chunk of the metadata, to find any valid index columns.
    chunk = pd.read_csv(
        metadata_file,
        iterator=True,
        **kwargs,
    ).read(nrows=1)

    id_columns_present = [
        id_column
        for id_column in id_columns
        if id_column in chunk.columns
    ]

    # If we couldn't find a valid index column in the metadata, alert the user.
    if not id_columns_present:
        raise Exception(f"None of the possible id columns ({id_columns!r}) were found in the metadata's columns {tuple(chunk.columns)!r}")

    index_col = id_columns_present[0]

    # If we found a valid column to index the DataFrame, specify that column and
    # also tell pandas that the column should be treated like a string instead
    # of having its type inferred. This latter argument allows users to provide
    # numerical ids that don't get converted to numbers by pandas.
    kwargs["index_col"] = index_col
    kwargs["dtype"] = {index_col: "string"}

    return pd.read_csv(
        metadata_file,
        **kwargs
    )


def read_sequences(*paths, format="fasta"):
"""Read sequences from one or more paths.

Expand Down
21 changes: 19 additions & 2 deletions augur/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,34 @@ class AugurException(Exception):
pass


def is_vcf(filename):
    """Convenience method to check if a file is a vcf file.

    >>> is_vcf(None)
    False
    >>> is_vcf("./foo")
    False
    >>> is_vcf("./foo.vcf")
    True
    >>> is_vcf("./foo.vcf.GZ")
    True
    """
    # `str.endswith` accepts a tuple of suffixes, so one call covers both plain
    # and gzip-compressed VCFs; `bool(filename)` guards against None inputs.
    # (The pasted diff contained both the old and new def/return lines; only
    # the new, None-safe implementation is kept.)
    return bool(filename) and filename.lower().endswith(('.vcf', '.vcf.gz'))

def read_vcf(filename):
    """Read strain (sample) names from the header of a VCF file.

    Parameters
    ----------
    filename : str
        path to a VCF file, optionally gzip-compressed (".gz" suffix,
        case-insensitive).

    Returns
    -------
    tuple of (list, list)
        the sample names that follow the FORMAT column on the "#CHROM" header
        line, plus an independent copy of that list.
    """
    import gzip

    opener = gzip.open if filename.lower().endswith(".gz") else open

    # Resource-leak fix: the original opened the file without a context
    # manager, so the handle was never closed when no "#C" header line existed
    # (the generator raised StopIteration before reaching file.close()).
    with opener(filename, mode="rt", encoding="utf-8") as file:
        chrom_line = next(line for line in file if line.startswith("#C"))

    headers = chrom_line.strip().split("\t")
    # Sample columns come after the FORMAT column on the #CHROM header line.
    sequences = headers[headers.index("FORMAT") + 1:]

    # because we need 'seqs to remove' for VCF
    return sequences, sequences.copy()

def myopen(fname, mode):
if fname.endswith('.gz'):
Expand Down
78 changes: 63 additions & 15 deletions tests/functional/filter.t
Original file line number Diff line number Diff line change
Expand Up @@ -27,21 +27,21 @@ Filter with subsampling, requesting 1 sequence per group (for a group with 3 dis
\s*3 .* (re)
$ rm -f "$TMP/filtered_strains.txt"

Filter with subsampling, requesting no more than 10 sequences.
With 10 groups to subsample from, this should produce one sequence per group.
Filter with subsampling, requesting no more than 8 sequences.
With 8 groups to subsample from (after filtering), this should produce one sequence per group.

$ ${AUGUR} filter \
> --sequences filter/sequences.fasta \
> --sequence-index filter/sequence_index.tsv \
> --metadata filter/metadata.tsv \
> --min-date 2012 \
> --group-by country year month \
> --subsample-max-sequences 10 \
> --subsample-max-sequences 8 \
> --subsample-seed 314159 \
> --no-probabilistic-sampling \
> --output "$TMP/filtered.fasta" > /dev/null
$ grep ">" "$TMP/filtered.fasta" | wc -l
\s*10 (re)
\s*8 (re)
$ rm -f "$TMP/filtered.fasta"

Filter with subsampling where no more than 5 sequences are requested and no groups are specified.
Expand Down Expand Up @@ -205,6 +205,26 @@ Finally, exclude all sequences except those from the two sets of strains (there
\s*4 (re)
$ rm -f "$TMP/filtered.fasta"

Repeat this filter without a sequence index.
We should get the same outputs without building a sequence index on the fly, because the exclude-all flag tells us we only want to force-include strains and skip all other filters.

$ ${AUGUR} filter \
> --sequences filter/sequences.fasta \
> --metadata filter/metadata.tsv \
> --exclude-all \
> --include "$TMP/filtered_strains.brazil.txt" "$TMP/filtered_strains.colombia.txt" \
> --output "$TMP/filtered.fasta" \
> --output-metadata "$TMP/filtered.tsv" > /dev/null
$ grep "^>" "$TMP/filtered.fasta" | wc -l
\s*4 (re)
$ rm -f "$TMP/filtered.fasta"

Metadata should have the same number of records as the sequences plus a header.

$ wc -l "$TMP/filtered.tsv"
\s*5 .* (re)
$ rm -f "$TMP/filtered.tsv"

Alternately, exclude only the sequences from Brazil and Colombia (12 - 4 strains).

$ ${AUGUR} filter \
Expand Down Expand Up @@ -242,6 +262,7 @@ Repeat with sequence and strain outputs. We should get the same results.
> --max-date 2020-01-30 \
> --output-strains "$TMP/filtered_strains.txt" \
> --output-sequences "$TMP/filtered.fasta" > /dev/null
Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`.
ERROR: All samples have been dropped! Check filter rules and metadata file format.
[1]
$ wc -l "$TMP/filtered_strains.txt"
Expand All @@ -251,14 +272,28 @@ Repeat with sequence and strain outputs. We should get the same results.
$ rm -f "$TMP/filtered_strains.txt"
$ rm -f "$TMP/filtered.fasta"

Repeat without any sequence-based filters.
Since we expect metadata to be filtered by presence of strains in input sequences, this should produce no results because the intersection of metadata and sequences is empty.

$ ${AUGUR} filter \
> --sequences "$TMP/dummy.fasta" \
> --metadata filter/metadata.tsv \
> --output-strains "$TMP/filtered_strains.txt" > /dev/null
Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`.
ERROR: All samples have been dropped! Check filter rules and metadata file format.
[1]
$ wc -l "$TMP/filtered_strains.txt"
\s*0 .* (re)
$ rm -f "$TMP/filtered_strains.txt"

Filter TB strains from VCF and save as a list of filtered strains.

$ ${AUGUR} filter \
> --sequences filter/tb.vcf.gz \
> --metadata filter/tb_metadata.tsv \
> --min-date 2012 \
> --min-length 10500 \
> --output-strains "$TMP/filtered_strains.txt" > /dev/null
Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`.
$ wc -l "$TMP/filtered_strains.txt"
\s*3 .* (re)
$ rm -f "$TMP/filtered_strains.txt"
Expand All @@ -269,22 +304,35 @@ The metadata are missing one strain that has a sequence.
The list of strains to include has one strain with no metadata/sequence and one strain with information that would have been filtered by country.
The query initially filters 3 strains from Colombia, one of which is added back by the include.

$ echo "NotReal" > "$TMP/include.txt"
$ echo "COL/FLR_00008/2015" >> "$TMP/include.txt"
$ ${AUGUR} filter \
> --sequence-index filter/sequence_index.tsv \
> --metadata filter/metadata.tsv \
> --query "country != 'Colombia'" \
> --include "$TMP/include.txt" \
> --output-strains "$TMP/filtered_strains.txt"
> --non-nucleotide \
> --exclude-ambiguous-dates-by year \
> --include filter/include.txt \
> --output-strains "$TMP/filtered_strains.txt" \
> --output-log "$TMP/filtered_log.tsv"
4 strains were dropped during filtering
\t1 had no sequence data (esc)
\t1 had no metadata (esc)
\t3 of these were filtered out by the query: (esc)
\t\t"country != 'Colombia'" (esc)
(esc)
\t1 strains were added back because they were requested by include files (esc)
\t1 strains from include files were not added because they lacked sequence or metadata (esc)
\t1 had no sequence data (esc)
\t3 of these were filtered out by the query: "country != 'Colombia'" (esc)
\t1 strains were added back because they were in filter/include.txt (esc)
8 strains passed all filters

$ diff -u <(sort -k 1,1 filter/filtered_log.tsv) <(sort -k 1,1 "$TMP/filtered_log.tsv")
$ rm -f "$TMP/filtered_strains.txt"

Subsample one strain per year with priorities.
There are two years (2015 and 2016) represented in the metadata.
The two highest priority strains are in these two years.

$ ${AUGUR} filter \
> --metadata filter/metadata.tsv \
> --group-by year \
> --priority filter/priorities.tsv \
> --sequences-per-group 1 \
> --output-strains "$TMP/filtered_strains.txt" > /dev/null

$ diff -u <(sort -k 2,2rn -k 1,1 filter/priorities.tsv | head -n 2 | cut -f 1) <(sort -k 1,1 "$TMP/filtered_strains.txt")
$ rm -f "$TMP/filtered_strains.txt"
6 changes: 6 additions & 0 deletions tests/functional/filter/filtered_log.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
strain filter kwargs
HND/2016/HU_ME59 filter_by_sequence_index []
COL/FLR_00008/2015 filter_by_query "[[""query"", ""country != 'Colombia'""]]"
Colombia/2016/ZC204Se filter_by_query "[[""query"", ""country != 'Colombia'""]]"
COL/FLR_00024/2015 filter_by_query "[[""query"", ""country != 'Colombia'""]]"
COL/FLR_00008/2015 include "[[""include_file"", ""filter/include.txt""]]"
2 changes: 2 additions & 0 deletions tests/functional/filter/include.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
NotReal
COL/FLR_00008/2015
6 changes: 6 additions & 0 deletions tests/functional/filter/priorities.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
COL/FLR_00024/2015 100
PRVABC59 50
COL/FLR_00008/2015 10
Colombia/2016/ZC204Se 10
ZKC2/2016 100
VEN/UF_1/2016 50