Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

filter: Rewrite using SQLite3 #1242

Draft
wants to merge 15 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 10 additions & 40 deletions augur/dates/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,38 +76,7 @@ def numeric_date_type(date):
except InvalidDate as error:
raise argparse.ArgumentTypeError(str(error)) from error

def is_date_ambiguous(date, ambiguous_by):
    """
    Report whether a date string in the format of YYYY-MM-DD is ambiguous at
    the requested resolution (e.g., day, month, year, or any part).

    Parameters
    ----------
    date : str
        Date string in the format of YYYY-MM-DD; trailing components may be
        omitted, in which case they are treated as fully ambiguous ("XX").
    ambiguous_by : str
        Field of the date string to test for ambiguity ("day", "month", "year", "any")

    Returns
    -------
    bool
        True when the year contains an "X", or when a component at or above
        the requested resolution contains an "X"; False otherwise.
    """
    parts = date.split('-', 2)

    if len(parts) == 1:
        # Only a lone, empty year is replaced by an ambiguous placeholder.
        parts = [parts[0] or 'X']

    # Pad whichever trailing components are missing with ambiguous markers.
    year, month, day = parts + ["XX"] * (3 - len(parts))

    # Ambiguity is hierarchical: an ambiguous year always counts, and an
    # ambiguous month also implies an ambiguous day even if a day is given.
    if "X" in year:
        return True
    if "X" in month and ambiguous_by in ("any", "month", "day"):
        return True
    return "X" in day and ambiguous_by in ("any", "day")

def get_numerical_date_from_value(value, fmt=None, min_max_year=None):
def get_numerical_date_from_value(value, fmt=None, min_max_year=None, ambiguity_resolver='both'):
value = str(value)
if re.match(r'^-*\d+\.\d+$', value):
# numeric date which can be negative
Expand All @@ -120,7 +89,15 @@ def get_numerical_date_from_value(value, fmt=None, min_max_year=None):
ambig_date = AmbiguousDate(value, fmt=fmt).range(min_max_year=min_max_year)
except InvalidDate as error:
raise AugurError(str(error)) from error
return [treetime.utils.numeric_date(d) for d in ambig_date]
ambig_date_numeric = [treetime.utils.numeric_date(d) for d in ambig_date]
if ambiguity_resolver == 'both':
return ambig_date_numeric
elif ambiguity_resolver == 'min':
return ambig_date_numeric[0]
elif ambiguity_resolver == 'max':
return ambig_date_numeric[1]
else:
raise Exception(f"Invalid value for ambiguity_resolver: {ambiguity_resolver!r}.")
try:
return treetime.utils.numeric_date(datetime.datetime.strptime(value, fmt))
except:
Expand All @@ -142,10 +119,3 @@ def get_numerical_dates(metadata:pd.DataFrame, name_col = None, date_col='date',
strains = metadata.index.values
dates = metadata[date_col].astype(float)
return dict(zip(strains, dates))

def get_year_month(year, month):
    """Return ``year`` and ``month`` joined by a hyphen, left-padding the month with zeros to two characters."""
    padded_month = str(month).zfill(2)
    return f"{year}-{padded_month}"

def get_year_week(year, month, day):
    """
    Return the ISO calendar year and week of the given date as ``YYYY-WW``.

    The year in the result is the ISO year reported by ``isocalendar()``,
    which can differ from the ``year`` argument for dates near a year boundary.
    """
    iso_calendar = datetime.date(year, month, day).isocalendar()
    iso_year, iso_week = iso_calendar[0], iso_calendar[1]
    padded_week = str(iso_week).zfill(2)
    return f"{iso_year}-{padded_week}"
13 changes: 11 additions & 2 deletions augur/filter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,25 @@ def register_arguments(parser):
input_group = parser.add_argument_group("inputs", "metadata and sequences to be filtered")
input_group.add_argument('--metadata', required=True, metavar="FILE", help="sequence metadata")
input_group.add_argument('--sequences', '-s', help="sequences in FASTA or VCF format")
input_group.add_argument('--metadata-index', metavar="FILE", help="SQLite3 database file with metadata preloaded")
input_group.add_argument('--sequence-index', help="sequence composition report generated by augur index. If not provided, an index will be created on the fly.")
input_group.add_argument('--metadata-chunk-size', type=int, default=100000, help="maximum number of metadata records to read into memory at a time. Increasing this number can speed up filtering at the cost of more memory used.")
input_group.add_argument('--metadata-chunk-size', type=int, default=100000, help="maximum number of metadata records to read into memory at a time. Increasing this number can speed up filtering at the cost of more memory used. NOTE: this only applies to --query/--query-pandas.")
input_group.add_argument('--metadata-id-columns', default=DEFAULT_ID_COLUMNS, nargs="+", action=ExtendOverwriteDefault, help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
input_group.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", action=ExtendOverwriteDefault, help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.")

metadata_filter_group = parser.add_argument_group("metadata filters", "filters to apply to metadata")
metadata_filter_group.add_argument(
'--query',
'--query-pandas', '--query',
help="""Filter samples by attribute.
Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax.
(e.g., --query "country == 'Colombia'" or --query "(country == 'USA' & (division == 'Washington'))")"""
)
metadata_filter_group.add_argument(
'--query-sqlite',
help="""Filter samples by attribute.
Uses SQL WHERE clause querying, see https://www.sqlite.org/lang_expr.html for syntax.
(e.g., --query-sqlite "country = 'Colombia'" or --query-sqlite "(country = 'USA' AND division = 'Washington')")"""
)
metadata_filter_group.add_argument('--query-columns', type=column_type_pair, nargs="+", action="extend", help=f"""
Use alongside --query to specify columns and data types in the format 'column:type', where type is one of ({','.join(ACCEPTED_TYPES)}).
Automatic type inference will be attempted on all unspecified columns used in the query.
Expand Down Expand Up @@ -104,9 +111,11 @@ def register_arguments(parser):
output_group = parser.add_argument_group("outputs", "options related to outputs, at least one of the possible representations of filtered data (--output, --output-metadata, --output-strains) is required")
output_group.add_argument('--output', '--output-sequences', '-o', help="filtered sequences in FASTA format")
output_group.add_argument('--output-metadata', help="metadata for strains that passed filters")
output_group.add_argument('--output-metadata-index', help="SQLite3 database file with metadata preloaded")
output_group.add_argument('--output-strains', help="list of strains that passed filters (no header)")
output_group.add_argument('--output-log', help="tab-delimited file with one row for each filtered strain and the reason it was filtered. Keyword arguments used for a given filter are reported in JSON format in a `kwargs` column.")
output_group.add_argument('--output-group-by-sizes', help="tab-delimited file one row per group with target size.")
output_group.add_argument('--debug', action='store_true', help="Run in debug mode.")
output_group.add_argument(
'--empty-output-reporting',
type=EmptyOutputReportingMethod.argtype,
Expand Down
Loading
Loading