Skip to content

Commit

Permalink
Merge pull request #890: Move filter date parsing logic to utils,…
Browse files Browse the repository at this point in the history
… use in `frequencies`
  • Loading branch information
victorlin authored Apr 15, 2022
2 parents b464692 + 542bca5 commit 8c54817
Show file tree
Hide file tree
Showing 9 changed files with 409 additions and 81 deletions.
65 changes: 65 additions & 0 deletions augur/dates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import argparse
import datetime
from textwrap import dedent
import isodate
import treetime.utils

SUPPORTED_DATE_HELP_TEXT = dedent("""\
1. an Augur-style numeric date with the year as the integer part (e.g. 2020.42) or
2. a date in ISO 8601 date format (i.e. YYYY-MM-DD) (e.g. '2020-06-04') or
3. a backwards-looking relative date in ISO 8601 duration format with optional P prefix (e.g. '1W', 'P1W')
""")

def numeric_date(date):
"""
Converts the given *date* string to a :py:class:`float`.
*date* may be given as:
1. A string or float (number) with year as the integer part
2. A string in the YYYY-MM-DD (ISO 8601) syntax
3. A string representing a relative date (duration before datetime.date.today())
>>> numeric_date("2020.42")
2020.42
>>> numeric_date("2020-06-04")
2020.42486...
>>> import datetime, isodate, treetime
>>> numeric_date("1W") == treetime.utils.numeric_date(datetime.date.today() - isodate.parse_duration("P1W"))
True
"""
# date is numeric
try:
return float(date)
except ValueError:
pass

# date is in YYYY-MM-DD form
try:
return treetime.utils.numeric_date(datetime.date(*map(int, date.split("-", 2))))
except ValueError:
pass

# date is a duration treated as a backwards-looking relative date
try:
# make a copy of date for this block
duration_str = str(date)
if duration_str.startswith('P'):
duration_str = duration_str
else:
duration_str = 'P'+duration_str
return treetime.utils.numeric_date(datetime.date.today() - isodate.parse_duration(duration_str))
except (ValueError, isodate.ISO8601Error):
pass

raise ValueError(f"""Unable to determine date from '{date}'. Ensure it is in one of the supported formats:\n{SUPPORTED_DATE_HELP_TEXT}""")

def numeric_date_type(date):
"""Wraps numeric_date() for argparse usage.
This raises an ArgumentTypeError, otherwise the custom exception message won't be shown in console output due to:
https://github.com/python/cpython/blob/5c4d1f6e0e192653560ae2941a6677fbf4fbd1f2/Lib/argparse.py#L2503-L2513
"""
try:
return numeric_date(date)
except ValueError as e:
raise argparse.ArgumentTypeError(str(e)) from e
62 changes: 3 additions & 59 deletions augur/filter.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
"""
Filter and subsample a sequence set.
"""
import argparse
from Bio import SeqIO
from collections import defaultdict
import csv
import datetime
import heapq
import isodate
import itertools
import json
import numpy as np
Expand All @@ -18,10 +15,9 @@
import re
import sys
from tempfile import NamedTemporaryFile
import treetime.utils
from textwrap import dedent
from typing import Collection

from .dates import numeric_date, numeric_date_type, SUPPORTED_DATE_HELP_TEXT
from .index import index_sequences, index_vcf
from .io import open_file, read_metadata, read_sequences, write_sequences
from .utils import is_vcf as filename_is_vcf, read_vcf, read_strains, get_numerical_dates, run_shell_command, shquote, is_date_ambiguous
Expand All @@ -33,12 +29,6 @@
"non_nucleotide",
)

SUPPORTED_DATE_HELP_TEXT = dedent("""\
1. an Augur-style numeric date with the year as the integer part (e.g. 2020.42) or
2. a date in ISO 8601 date format (i.e. YYYY-MM-DD) (e.g. '2020-06-04') or
3. a backwards-looking relative date in ISO 8601 duration format with optional P prefix (e.g. '1W', 'P1W')
""")


class FilterException(Exception):
"""Representation of an error that occurred during filtering.
Expand Down Expand Up @@ -1118,10 +1108,8 @@ def register_arguments(parser):
Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax.
(e.g., --query "country == 'Colombia'" or --query "(country == 'USA' & (division == 'Washington'))")"""
)
metadata_filter_group.add_argument('--min-date', type=numeric_date,
help=f"""minimal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}""")
metadata_filter_group.add_argument('--max-date', type=numeric_date,
help=f"""maximal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}""")
metadata_filter_group.add_argument('--min-date', type=numeric_date_type, help=f"minimal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}")
metadata_filter_group.add_argument('--max-date', type=numeric_date_type, help=f"maximal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}")
metadata_filter_group.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'],
help='Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g., 2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous year makes the corresponding month and day ambiguous, too, even if those fields have unambiguous values (e.g., "201X-10-01"). Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01").')
metadata_filter_group.add_argument('--exclude', type=str, nargs="+", help="file(s) with list of strains to exclude")
Expand Down Expand Up @@ -1696,50 +1684,6 @@ def _filename_gz(filename):
return filename.lower().endswith(".gz")


def numeric_date(date):
"""
Converts the given *date* string to a :py:class:`float`.
*date* may be given as:
1. A string or float (number) with year as the integer part
2. A string in the YYYY-MM-DD (ISO 8601) syntax
3. A string representing a relative date (duration before datetime.date.today())
>>> numeric_date("2020.42")
2020.42
>>> numeric_date("2020-06-04")
2020.42486...
>>> import datetime, isodate, treetime
>>> numeric_date("1W") == treetime.utils.numeric_date(datetime.date.today() - isodate.parse_duration("P1W"))
True
"""
# date is numeric
try:
return float(date)
except ValueError:
pass

# date is in YYYY-MM-DD form
try:
return treetime.utils.numeric_date(datetime.date(*map(int, date.split("-", 2))))
except ValueError:
pass

# date is a duration treated as a backwards-looking relative date
try:
# make a copy of date for this block
duration_str = str(date)
if duration_str.startswith('P'):
duration_str = duration_str
else:
duration_str = 'P'+duration_str
return treetime.utils.numeric_date(datetime.date.today() - isodate.parse_duration(duration_str))
except (ValueError, isodate.ISO8601Error):
pass

raise argparse.ArgumentTypeError(f"""Unable to determine date from '{date}'. Ensure it is in one of the supported formats:\n{SUPPORTED_DATE_HELP_TEXT}""")


def calculate_sequences_per_group(target_max_value, counts_per_group, allow_probabilistic=True):
"""Calculate the number of sequences per group for a given maximum number of
sequences to be returned and the number of sequences in each requested
Expand Down
27 changes: 5 additions & 22 deletions augur/frequencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,14 @@
infer frequencies of mutations or clades
"""
import json, os, sys
import datetime
import numpy as np
from collections import defaultdict
from Bio import Phylo, AlignIO
from Bio.Align import MultipleSeqAlignment
import treetime.utils

from .frequency_estimators import get_pivots, alignment_frequencies, tree_frequencies
from .frequency_estimators import AlignmentKdeFrequencies, TreeKdeFrequencies, TreeKdeFrequenciesError
from .dates import numeric_date_type, SUPPORTED_DATE_HELP_TEXT
from .utils import read_metadata, read_node_data, write_json, get_numerical_dates


Expand All @@ -26,10 +25,10 @@ def register_arguments(parser):
help="number of units between pivots")
parser.add_argument("--pivot-interval-units", type=str, default="months", choices=['months', 'weeks'],
help="space pivots by months (default) or by weeks")
parser.add_argument('--min-date', type=numeric_date,
help="date to begin frequencies calculations; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
parser.add_argument('--max-date', type=numeric_date,
help="date to end frequencies calculations; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
parser.add_argument('--min-date', type=numeric_date_type,
help=f"date to begin frequencies calculations; may be specified as: {SUPPORTED_DATE_HELP_TEXT}")
parser.add_argument('--max-date', type=numeric_date_type,
help=f"date to end frequencies calculations; may be specified as: {SUPPORTED_DATE_HELP_TEXT}")

# Tree-specific arguments
parser.add_argument('--tree', '-t', type=str,
Expand Down Expand Up @@ -230,19 +229,3 @@ def run(args):

write_json(frequencies, args.output)
print("mutation frequencies written to", args.output, file=sys.stdout)


def numeric_date(date):
"""
Converts the given *date* string to a :py:class:`float`.
*date* may be given as a number (a float) with year as the integer part, or
in the YYYY-MM-DD (ISO 8601) syntax.
>>> numeric_date("2020.42")
2020.42
>>> numeric_date("2020-06-04")
2020.42486...
"""
try:
return float(date)
except ValueError:
return treetime.utils.numeric_date(datetime.date(*map(int, date.split("-", 2))))
57 changes: 57 additions & 0 deletions tests/functional/frequencies.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
Integration tests for augur frequencies.

$ pushd "$TESTDIR" > /dev/null
$ export AUGUR="../../bin/augur"

Calculate KDE-based tip frequencies from a refined tree.
Timepoints used to estimate frequencies (i.e., "pivots") get calculated from the range of dates in the given metadata.

$ ${AUGUR} frequencies \
> --method kde \
> --tree "frequencies/tree.nwk" \
> --metadata "frequencies/metadata.tsv" \
> --pivot-interval 3 \
> --output "$TMP/tip-frequencies.json" > /dev/null

$ diff -u --ignore-matching-lines version "frequencies/zika_tip-frequencies.json" "$TMP/tip-frequencies.json"
$ rm -f "$TMP/tip-frequencies.json"

Calculate KDE-based tip frequencies for a time period with fixed dates.
Pivots get calculated from the requested date range.

$ ${AUGUR} frequencies \
> --method kde \
> --tree "frequencies/tree.nwk" \
> --metadata "frequencies/metadata.tsv" \
> --pivot-interval 3 \
> --min-date 2015-01-01 \
> --max-date 2016-01-01 \
> --output "$TMP/tip-frequencies.json" > /dev/null

$ diff -u --ignore-matching-lines version "frequencies/zika_tip-frequencies_with_fixed_dates.json" "$TMP/tip-frequencies.json"
$ rm -f "$TMP/tip-frequencies.json"

Calculate KDE-based tip frequencies for a time period with relative dates.
Testing relative dates deterministically from the shell is tricky.
To keep these tests simple and avoid freezing the system date to specific values, this test checks the logical consistency of the requested relative dates and pivot interval.
With a minimum date of 1 year (12 months) ago, a maximum date of 6 months ago, and a pivot interval of 3 months, we expect only 2 pivots in the final output.
Since the test data are much older than the time period requested, all strains will always have frequencies of 0 for the 2 requested pivots.
As long as we always calculate 2 pivots with frequencies of 0 for all strains, we can ignore the actual pivot values calculated for the relative dates in the diff below.

$ ${AUGUR} frequencies \
> --method kde \
> --tree "frequencies/tree.nwk" \
> --metadata "frequencies/metadata.tsv" \
> --pivot-interval 3 \
> --min-date 1Y \
> --max-date 6M \
> --output "$TMP/tip-frequencies.json" > /dev/null

$ python3 "$TESTDIR/../../scripts/diff_jsons.py" \
> --exclude-paths "root['generated_by']['version']" "root['pivots']" -- \
> "frequencies/zika_tip-frequencies_with_relative_dates.json" \
> "$TMP/tip-frequencies.json"
{}
$ rm -f "$TMP/tip-frequencies.json"

$ popd > /dev/null
13 changes: 13 additions & 0 deletions tests/functional/frequencies/metadata.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
strain virus accession date region country division city db segment authors url title journal paper_url
PAN/CDC_259359_V1_V3/2015 zika KX156774 2015-12-18 North America Panama Panama Panama genbank genome Shabman et al https://www.ncbi.nlm.nih.gov/nuccore/KX156774 Direct Submission Submitted (29-APR-2016) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA https://www.ncbi.nlm.nih.gov/pubmed/
COL/FLR_00024/2015 zika MF574569 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574569 Direct Submission Submitted (28-JUL-2017) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA https://www.ncbi.nlm.nih.gov/pubmed/
PRVABC59 zika KU501215 2015-12-XX North America Puerto Rico Puerto Rico Puerto Rico genbank genome Lanciotti et al https://www.ncbi.nlm.nih.gov/nuccore/KU501215 Phylogeny of Zika Virus in Western Hemisphere, 2015 Emerging Infect. Dis. 22 (5), 933-935 (2016) https://www.ncbi.nlm.nih.gov/pubmed/27088323
COL/FLR_00008/2015 zika MF574562 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574562 Direct Submission Submitted (28-JUL-2017) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA https://www.ncbi.nlm.nih.gov/pubmed/
Colombia/2016/ZC204Se zika KY317939 2016-01-06 South America Colombia Colombia Colombia genbank genome Quick et al https://www.ncbi.nlm.nih.gov/nuccore/KY317939 Multiplex PCR method for MinION and Illumina sequencing of Zika and other virus genomes directly from clinical samples Nat Protoc 12 (6), 1261-1276 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538739
ZKC2/2016 zika KX253996 2016-02-16 Oceania American Samoa American Samoa American Samoa genbank genome Wu et al https://www.ncbi.nlm.nih.gov/nuccore/KX253996 Direct Submission Submitted (18-MAY-2016) Center for Diseases Control and Prevention of Guangdong Province; National Institute of Viral Disease Control and Prevention, China https://www.ncbi.nlm.nih.gov/pubmed/
VEN/UF_1/2016 zika KX702400 2016-03-25 South America Venezuela Venezuela Venezuela genbank genome Blohm et al https://www.ncbi.nlm.nih.gov/nuccore/KX702400 Complete Genome Sequences of Identical Zika virus Isolates in a Nursing Mother and Her Infant Genome Announc 5 (17), e00231-17 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28450510
DOM/2016/BB_0059 zika KY785425 2016-04-04 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785425 Zika virus evolution and spread in the Americas Nature 546 (7658), 411-415 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538734
BRA/2016/FC_6706 zika KY785433 2016-04-08 South America Brazil Brazil Brazil genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785433 Zika virus evolution and spread in the Americas Nature 546 (7658), 411-415 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538734
DOM/2016/BB_0183 zika KY785420 2016-04-18 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785420 Zika virus evolution and spread in the Americas Nature 546 (7658), 411-415 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538734
EcEs062_16 zika KX879603 2016-04-XX South America Ecuador Ecuador Ecuador genbank genome Marquez et al https://www.ncbi.nlm.nih.gov/nuccore/KX879603 First Complete Genome Sequences of Zika Virus Isolated from Febrile Patient Sera in Ecuador Genome Announc 5 (8), e01673-16 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28232448
HND/2016/HU_ME59 zika KY785418 2016-05-13 North America Honduras Honduras Honduras genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785418 Zika virus evolution and spread in the Americas Nature 546 (7658), 411-415 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538734
1 change: 1 addition & 0 deletions tests/functional/frequencies/tree.nwk
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
((Colombia/2016/ZC204Se:0.00105368,(PAN/CDC_259359_V1_V3/2015:0.00076051,(COL/FLR_00008/2015:0.00044440,VEN/UF_1/2016:0.00089377)NODE_0000008:0.00038502)NODE_0000007:0.00019253)NODE_0000001:0.00080159,(BRA/2016/FC_6706:0.00214920,(ZKC2/2016:0.00173693,(HND/2016/HU_ME59:0.00206150,PRVABC59:0.00135309)NODE_0000004:0.00013537,(EcEs062_16:0.00175918,DOM/2016/BB_0183:0.00184905)NODE_0000002:0.00021565)NODE_0000003:0.00013737)NODE_0000005:0.00019772)NODE_0000006:0.00100000;
92 changes: 92 additions & 0 deletions tests/functional/frequencies/zika_tip-frequencies.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
{
"BRA/2016/FC_6706": {
"frequencies": [
0.022579,
0.011312,
0.206983,
0.102282
]
},
"COL/FLR_00008/2015": {
"frequencies": [
0.243453,
0.208443,
0.008645,
0.010659
]
},
"Colombia/2016/ZC204Se": {
"frequencies": [
0.126056,
0.231597,
0.014167,
0.017099
]
},
"DOM/2016/BB_0183": {
"frequencies": [
0.017888,
0.009342,
0.183671,
0.148759
]
},
"EcEs062_16": {
"frequencies": [
0.018981,
0.009763,
0.190994,
0.134398
]
},
"HND/2016/HU_ME59": {
"frequencies": [
0.009484,
0.006254,
0.090552,
0.457824
]
},
"PAN/CDC_259359_V1_V3/2015": {
"frequencies": [
0.224832,
0.214575,
0.008963,
0.011176
]
},
"PRVABC59": {
"frequencies": [
0.243453,
0.208443,
0.008645,
0.010659
]
},
"VEN/UF_1/2016": {
"frequencies": [
0.030662,
0.016483,
0.206983,
0.070196
]
},
"ZKC2/2016": {
"frequencies": [
0.062612,
0.083787,
0.080395,
0.036947
]
},
"generated_by": {
"program": "augur",
"version": "7.0.2"
},
"pivots": [
2015.75,
2016.0,
2016.25,
2016.5
]
}
Loading

0 comments on commit 8c54817

Please sign in to comment.