Skip to content

Commit

Permalink
Merge pull request #758 from nextstrain/fix-filter-date-parsing
Browse files: browse the repository at this point in the history
Fix handling of missing data in metadata
  • Loading branch information
huddlej authored Aug 13, 2021
2 parents eb89c09 + beb93cd commit ae80716
Show file tree
Hide file tree
Showing 5 changed files with 15 additions and 10 deletions.
9 changes: 6 additions & 3 deletions augur/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -863,6 +863,7 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):

group_by_strain = {}
for strain in strains:
skip_strain = False
group = []
m = metadata.loc[strain].to_dict()
# collect group specifiers
Expand All @@ -875,8 +876,9 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
try:
year = int(m["date"].split('-')[0])
except:
- print("WARNING: no valid year, skipping",strain, m["date"])
- continue
+ print(f"WARNING: no valid year, skipping strain '{strain}' with date value of '{m['date']}'.", file=sys.stderr)
+ skip_strain = True
+ break
if c=='month':
try:
month = int(m["date"].split('-')[1])
Expand All @@ -888,7 +890,8 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
else:
group.append('unknown')

- group_by_strain[strain] = tuple(group)
+ if not skip_strain:
+ group_by_strain[strain] = tuple(group)

# If we could not find any requested categories, we cannot complete subsampling.
distinct_groups = set(group_by_strain.values())
Expand Down
1 change: 1 addition & 0 deletions augur/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ def read_metadata(metadata_file, id_columns=("strain", "name"), chunk_size=None)
"sep": None,
"engine": "python",
"skipinitialspace": True,
"na_filter": False,
}

if chunk_size:
Expand Down
9 changes: 5 additions & 4 deletions tests/functional/filter.t
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,10 @@ Filter using only metadata without sequence input or output and save results as
> --min-length 10500 \
> --output-metadata "$TMP/filtered_metadata.tsv" > /dev/null

- Output should include the 8 sequences matching the filters and a header line.
+ Output should include the 7 sequences matching the filters and a header line.

$ wc -l "$TMP/filtered_metadata.tsv"
- \s*9 .* (re)
+ \s*8 .* (re)
$ rm -f "$TMP/filtered_metadata.tsv"

Filter using only metadata and save results as a list of filtered strains.
Expand All @@ -128,10 +128,10 @@ Filter using only metadata and save results as a list of filtered strains.
> --min-length 10500 \
> --output-strains "$TMP/filtered_strains.txt" > /dev/null

- Output should include only the 8 sequences matching the filters (without a header line).
+ Output should include only the 7 sequences matching the filters (without a header line).

$ wc -l "$TMP/filtered_strains.txt"
- \s*8 .* (re)
+ \s*7 .* (re)
$ rm -f "$TMP/filtered_strains.txt"

Filter using only metadata without a sequence index.
Expand Down Expand Up @@ -333,6 +333,7 @@ The two highest priority strains are in these two years.
> --priority filter/priorities.tsv \
> --sequences-per-group 1 \
> --output-strains "$TMP/filtered_strains.txt" > /dev/null
WARNING: no valid year, skipping strain 'COL/FLR_00024/2015' with date value of ''.

$ diff -u <(sort -k 2,2rn -k 1,1 filter/priorities.tsv | head -n 2 | cut -f 1) <(sort -k 1,1 "$TMP/filtered_strains.txt")
$ rm -f "$TMP/filtered_strains.txt"
2 changes: 1 addition & 1 deletion tests/functional/filter/metadata.tsv
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
strain virus accession date region country division city db segment authors url title journal paper_url
COL/FLR_00024/2015 zika MF574569 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574569 Direct Submission Submitted (28-JUL-2017) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA https://www.ncbi.nlm.nih.gov/pubmed/
COL/FLR_00024/2015 zika MF574569 South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574569 Direct Submission Submitted (28-JUL-2017) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA https://www.ncbi.nlm.nih.gov/pubmed/
PRVABC59 zika KU501215 2015-12-XX North America Puerto Rico Puerto Rico Puerto Rico genbank genome Lanciotti et al https://www.ncbi.nlm.nih.gov/nuccore/KU501215 Phylogeny of Zika Virus in Western Hemisphere, 2015 Emerging Infect. Dis. 22 (5), 933-935 (2016) https://www.ncbi.nlm.nih.gov/pubmed/27088323
COL/FLR_00008/2015 zika MF574562 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574562 Direct Submission Submitted (28-JUL-2017) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA https://www.ncbi.nlm.nih.gov/pubmed/
Colombia/2016/ZC204Se zika KY317939 2016-01-06 South America Colombia Colombia Colombia genbank genome Quick et al https://www.ncbi.nlm.nih.gov/nuccore/KY317939 Multiplex PCR method for MinION and Illumina sequencing of Zika and other virus genomes directly from clinical samples Nat Protoc 12 (6), 1261-1276 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538739
Expand Down
4 changes: 2 additions & 2 deletions tests/functional/filter/priorities.tsv
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
COL/FLR_00024/2015 100
PRVABC59 50
COL/FLR_00024/2015 50
PRVABC59 100
COL/FLR_00008/2015 10
Colombia/2016/ZC204Se 10
ZKC2/2016 100
Expand Down

0 comments on commit ae80716

Please sign in to comment.