Apply filters and group-by logic to a metadata iterator #750

Merged: 12 commits, Aug 12, 2021
89 changes: 25 additions & 64 deletions augur/filter.py
@@ -19,9 +19,9 @@
import treetime.utils
from typing import Collection

from .index import index_sequences
from .index import index_sequences, index_vcf
from .io import open_file, read_sequences, write_sequences
from .utils import read_strains, get_numerical_dates, run_shell_command, shquote, is_date_ambiguous
from .utils import is_vcf as filename_is_vcf, read_vcf, read_strains, get_numerical_dates, run_shell_command, shquote, is_date_ambiguous

comment_char = '#'
MAX_NUMBER_OF_PROBABILISTIC_SAMPLING_ATTEMPTS = 10
@@ -104,46 +104,6 @@ def read_metadata(metadata_file, **kwargs):
)


def filename_is_vcf(filename):
"""Returns whether the given file likely represents a VCF.

Parameters
----------
filename : str
Filename to test.

Returns
-------
bool :
Whether filename represents a VCF or not.

>>> filename_is_vcf("my_data.vcf")
True
>>> filename_is_vcf("my_data.vcf.gz")
True
>>> filename_is_vcf("my_data.csv")
False

"""
return filename and any(filename.lower().endswith(x) for x in ('.vcf', '.vcf.gz'))


def read_vcf(filename):
if filename.lower().endswith(".gz"):
import gzip
file = gzip.open(filename, mode="rt", encoding='utf-8')
else:
file = open(filename, encoding='utf-8')

chrom_line = next(line for line in file if line.startswith("#C"))
file.close()
headers = chrom_line.strip().split("\t")
sequences = headers[headers.index("FORMAT") + 1:]

# because we need 'seqs to remove' for VCF
return sequences, sequences.copy()


def write_vcf(input_filename, output_filename, dropped_samps):
if _filename_gz(input_filename):
input_arg = "--gzvcf"
@@ -657,6 +617,15 @@ def construct_filters(args, sequence_index):
# filtering logic.
return exclude_by, include_by

# Filter by sequence index.
if sequence_index is not None:
exclude_by.append((
filter_by_sequence_index,
{
"sequence_index": sequence_index,
},
))

# Remove strains explicitly excluded by name.
if args.exclude:
for exclude_file in args.exclude:
@@ -703,15 +672,6 @@ def construct_filters(args, sequence_index):
}
))

# Filter by sequence index.
if sequence_index is not None:
exclude_by.append((
filter_by_sequence_index,
{
"sequence_index": sequence_index,
},
))

# Filter by sequence length.
if args.min_length:
# Skip VCF files and warn the user that the min length filter does not
@@ -1246,24 +1206,19 @@ def run(args):
if not validate_arguments(args):
return 1

# Load sequence names from VCF or a sequence index. Sequence information is
# optional. Determine whether the sequence index exists or whether should be
# generated. We need to generate an index if sequence output has been
# requested (so we can filter strains by sequences that are present) or if
# any other sequence-based filters have been requested.
# Determine whether the sequence index exists or whether it should be
# generated. We need to generate an index if the input sequences are in a
# VCF, if sequence output has been requested (so we can filter strains by
# sequences that are present), or if any other sequence-based filters have
# been requested.
sequence_strains = None
sequence_index_path = args.sequence_index
build_sequence_index = False

# If VCF, open and get sequence names
is_vcf = filename_is_vcf(args.sequences)

if is_vcf:
vcf_sequences, _ = read_vcf(args.sequences)
sequence_strains = set(vcf_sequences)
elif sequence_index_path is None and args.sequences:
if sequence_index_path is None and args.sequences:
sequence_filters_requested = any(getattr(args, arg) for arg in SEQUENCE_ONLY_FILTERS)
if args.output or sequence_filters_requested:
if is_vcf or args.output or sequence_filters_requested:
build_sequence_index = True

if build_sequence_index:
@@ -1279,8 +1234,14 @@
"You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`.",
file=sys.stderr
)
index_sequences(args.sequences, sequence_index_path)

if is_vcf:
index_vcf(args.sequences, sequence_index_path)
else:
index_sequences(args.sequences, sequence_index_path)

# Load the sequence index, if a path exists.
sequence_index = None
if sequence_index_path:
sequence_index = pd.read_csv(
sequence_index_path,
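For context on the exclude_by entries built in construct_filters above: each entry pairs a filter function with its keyword arguments. The sketch below shows one way such pairs could be applied to a chunk of metadata; the apply_exclude_filters helper and the assumption that each filter function returns the strains that pass are illustrative, not necessarily this PR's actual implementation.

def apply_exclude_filters(metadata, exclude_by):
    # Hypothetical helper: start from all strains in this metadata chunk and
    # keep only those that pass every (filter_function, kwargs) pair.
    strains = set(metadata.index.values)
    for filter_function, kwargs in exclude_by:
        passed = filter_function(metadata, **kwargs)  # assumed to return passing strains
        strains &= set(passed)
    return strains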
49 changes: 46 additions & 3 deletions augur/index.py
@@ -9,14 +9,50 @@
import csv

from .io import open_file, read_sequences
from .utils import is_vcf, read_vcf


def register_arguments(parser):
parser.add_argument('--sequences', '-s', required=True, help="sequences in fasta format")
parser.add_argument('--sequences', '-s', required=True, help="sequences in FASTA or VCF formats. Augur will summarize the content of FASTA sequences and only report the names of strains found in a given VCF.")
parser.add_argument('--output', '-o', help="tab-delimited file containing the number of bases per sequence in the given file. Output columns include strain, length, and counts for A, C, G, T, N, other valid IUPAC characters, ambiguous characters ('?' and '-'), and other invalid characters.", required=True)
parser.add_argument('--verbose', '-v', action="store_true", help="print index statistics to stdout")


def index_vcf(vcf_path, index_path):
"""Create an index with a list of strain names from a given VCF. We do not
calculate any statistics for VCFs.
Parameters
----------
vcf_path : str or Path-like
path to a VCF file to index.
index_path : str or Path-like
path to a tab-delimited file containing the composition details for each
sequence in the given input file.
Returns
-------
int :
number of strains indexed
"""
strains, _ = read_vcf(vcf_path)
num_of_seqs = 0

with open_file(index_path, 'wt') as out_file:
tsv_writer = csv.writer(out_file, delimiter = '\t')

# write header in output file
header = ['strain']
tsv_writer.writerow(header)

for record in strains:
tsv_writer.writerow([record])
num_of_seqs += 1

return num_of_seqs


def index_sequence(sequence, values):
"""Count the number of nucleotides for a given sequence record.
@@ -165,11 +201,18 @@ def run(args):
the composition as a data frame to the given sequence index path.
'''
try:
num_of_seqs, tot_length = index_sequences(args.sequences, args.output)
if is_vcf(args.sequences):
num_of_seqs = index_vcf(args.sequences, args.output)
tot_length = None
else:
num_of_seqs, tot_length = index_sequences(args.sequences, args.output)
except ValueError as error:
print("ERROR: Problem reading in {}:".format(sequences_path), file=sys.stderr)
print(error, file=sys.stderr)
return 1

if args.verbose:
print("Analysed %i sequences with an average length of %i nucleotides." % (num_of_seqs, int(tot_length / num_of_seqs)))
if tot_length:
print("Analysed %i sequences with an average length of %i nucleotides." % (num_of_seqs, int(tot_length / num_of_seqs)))
else:
print("Analysed %i sequences" % num_of_seqs)
21 changes: 19 additions & 2 deletions augur/utils.py
@@ -27,17 +27,34 @@ class AugurException(Exception):
pass


def is_vcf(fname):
def is_vcf(filename):
"""Convenience method to check if a file is a vcf file.
>>> is_vcf(None)
False
>>> is_vcf("./foo")
False
>>> is_vcf("./foo.vcf")
True
>>> is_vcf("./foo.vcf.GZ")
True
"""
return fname.lower().endswith(".vcf") or fname.lower().endswith(".vcf.gz")
return bool(filename) and any(filename.lower().endswith(x) for x in ('.vcf', '.vcf.gz'))

def read_vcf(filename):
if filename.lower().endswith(".gz"):
import gzip
file = gzip.open(filename, mode="rt", encoding='utf-8')
else:
file = open(filename, encoding='utf-8')

chrom_line = next(line for line in file if line.startswith("#C"))
file.close()
headers = chrom_line.strip().split("\t")
sequences = headers[headers.index("FORMAT") + 1:]

# return a second, independent copy because callers also need a list of 'seqs to remove' for VCF output
return sequences, sequences.copy()

def myopen(fname, mode):
if fname.endswith('.gz'):
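A brief usage sketch of the helpers now exported from augur.utils; the VCF path is a placeholder.

from augur.utils import is_vcf, read_vcf

path = "example.vcf.gz"  # placeholder path

if is_vcf(path):
    # read_vcf parses sample names from the "#CHROM" header line and also returns
    # an independent copy to use as the initial list of sequences to remove.
    sequences, sequences_to_remove = read_vcf(path)
    print("%i samples found in the VCF header" % len(sequences))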
2 changes: 1 addition & 1 deletion tests/functional/filter.t
@@ -242,6 +242,7 @@ Repeat with sequence and strain outputs. We should get the same results.
> --max-date 2020-01-30 \
> --output-strains "$TMP/filtered_strains.txt" \
> --output-sequences "$TMP/filtered.fasta" > /dev/null
Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`.
ERROR: All samples have been dropped! Check filter rules and metadata file format.
[1]
$ wc -l "$TMP/filtered_strains.txt"
@@ -257,7 +258,6 @@ Filter TB strains from VCF and save as a list of filtered strains.
> --sequences filter/tb.vcf.gz \
> --metadata filter/tb_metadata.tsv \
> --min-date 2012 \
> --min-length 10500 \
> --output-strains "$TMP/filtered_strains.txt" > /dev/null
Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`.
$ wc -l "$TMP/filtered_strains.txt"