nextstrain · victorlin · Jan 31, 2024 · Jan 31, 2024 · Jan 31, 2024 · Dec 31, 2022
diff --git a/augur/dates/__init__.py b/augur/dates/__init__.py
@@ -76,38 +76,7 @@ def numeric_date_type(date):
     except InvalidDate as error:
         raise argparse.ArgumentTypeError(str(error)) from error
 
-def is_date_ambiguous(date, ambiguous_by):
-    """
-    Returns whether a given date string in the format of YYYY-MM-DD is ambiguous by a given part of the date (e.g., day, month, year, or any parts).
-
-    Parameters
-    ----------
-    date : str
-        Date string in the format of YYYY-MM-DD
-    ambiguous_by : str
-        Field of the date string to test for ambiguity ("day", "month", "year", "any")
-    """
-    date_components = date.split('-', 2)
-
-    if len(date_components) == 3:
-        year, month, day = date_components
-    elif len(date_components) == 2:
-        year, month = date_components
-        day = "XX"
-    else:
-        year = date_components[0] if date_components[0] else 'X'
-        month = "XX"
-        day = "XX"
-
-    # Determine ambiguity hierarchically such that, for example, an ambiguous
-    # month implicates an ambiguous day even when day information is available.
-    return any((
-        "X" in year,
-        "X" in month and ambiguous_by in ("any", "month", "day"),
-        "X" in day and ambiguous_by in ("any", "day")
-    ))
-
-def get_numerical_date_from_value(value, fmt=None, min_max_year=None):
+def get_numerical_date_from_value(value, fmt=None, min_max_year=None, ambiguity_resolver='both'):
     value = str(value)
     if re.match(r'^-*\d+\.\d+$', value):
         # numeric date which can be negative
@@ -120,7 +89,15 @@ def get_numerical_date_from_value(value, fmt=None, min_max_year=None):
             ambig_date = AmbiguousDate(value, fmt=fmt).range(min_max_year=min_max_year)
         except InvalidDate as error:
             raise AugurError(str(error)) from error
-        return [treetime.utils.numeric_date(d) for d in ambig_date]
+        ambig_date_numeric = [treetime.utils.numeric_date(d) for d in ambig_date]
+        if ambiguity_resolver == 'both':
+            return ambig_date_numeric
+        elif ambiguity_resolver == 'min':
+            return ambig_date_numeric[0]
+        elif ambiguity_resolver == 'max':
+            return ambig_date_numeric[1]
+        else:
+            raise Exception(f"Invalid value for ambiguity_resolver: {ambiguity_resolver!r}.")
     try:
         return treetime.utils.numeric_date(datetime.datetime.strptime(value, fmt))
     except:
@@ -142,10 +119,3 @@ def get_numerical_dates(metadata:pd.DataFrame, name_col = None, date_col='date',
         strains = metadata.index.values
         dates = metadata[date_col].astype(float)
     return dict(zip(strains, dates))
-
-def get_year_month(year, month):
-    return f"{year}-{str(month).zfill(2)}"
-
-def get_year_week(year, month, day):
-    year, week = datetime.date(year, month, day).isocalendar()[:2]
-    return f"{year}-{str(week).zfill(2)}"
diff --git a/augur/filter/__init__.py b/augur/filter/__init__.py
@@ -18,18 +18,25 @@ def register_arguments(parser):
     input_group = parser.add_argument_group("inputs", "metadata and sequences to be filtered")
     input_group.add_argument('--metadata', required=True, metavar="FILE", help="sequence metadata")
     input_group.add_argument('--sequences', '-s', help="sequences in FASTA or VCF format")
+    input_group.add_argument('--metadata-index', metavar="FILE", help="SQLite3 database file with metadata preloaded")
     input_group.add_argument('--sequence-index', help="sequence composition report generated by augur index. If not provided, an index will be created on the fly.")
-    input_group.add_argument('--metadata-chunk-size', type=int, default=100000, help="maximum number of metadata records to read into memory at a time. Increasing this number can speed up filtering at the cost of more memory used.")
+    input_group.add_argument('--metadata-chunk-size', type=int, default=100000, help="maximum number of metadata records to read into memory at a time. Increasing this number can speed up filtering at the cost of more memory used. NOTE: this only applies to --query/--query-pandas.")
     input_group.add_argument('--metadata-id-columns', default=DEFAULT_ID_COLUMNS, nargs="+", action=ExtendOverwriteDefault, help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
     input_group.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", action=ExtendOverwriteDefault, help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.")
 
     metadata_filter_group = parser.add_argument_group("metadata filters", "filters to apply to metadata")
     metadata_filter_group.add_argument(
-        '--query',
+        '--query-pandas', '--query',
         help="""Filter samples by attribute.
         Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax.
         (e.g., --query "country == 'Colombia'" or --query "(country == 'USA' & (division == 'Washington'))")"""
     )
+    metadata_filter_group.add_argument(
+        '--query-sqlite',
+        help="""Filter samples by attribute.
+        Uses SQL WHERE clause querying, see https://www.sqlite.org/lang_expr.html for syntax.
+        (e.g., --query-sqlite "country = 'Colombia'" or --query-sqlite "(country = 'USA' AND division = 'Washington')")"""
+    )
     metadata_filter_group.add_argument('--query-columns', type=column_type_pair, nargs="+", action="extend", help=f"""
         Use alongside --query to specify columns and data types in the format 'column:type', where type is one of ({','.join(ACCEPTED_TYPES)}).
         Automatic type inference will be attempted on all unspecified columns used in the query.
@@ -104,9 +111,11 @@ def register_arguments(parser):
     output_group = parser.add_argument_group("outputs", "options related to outputs, at least one of the possible representations of filtered data (--output, --output-metadata, --output-strains) is required")
     output_group.add_argument('--output', '--output-sequences', '-o', help="filtered sequences in FASTA format")
     output_group.add_argument('--output-metadata', help="metadata for strains that passed filters")
+    output_group.add_argument('--output-metadata-index', help="SQLite3 database file with metadata preloaded")
     output_group.add_argument('--output-strains', help="list of strains that passed filters (no header)")
     output_group.add_argument('--output-log', help="tab-delimited file with one row for each filtered strain and the reason it was filtered. Keyword arguments used for a given filter are reported in JSON format in a `kwargs` column.")
     output_group.add_argument('--output-group-by-sizes', help="tab-delimited file one row per group with target size.")
+    output_group.add_argument('--debug', action='store_true', help="Run in debug mode.")
     output_group.add_argument(
         '--empty-output-reporting',
         type=EmptyOutputReportingMethod.argtype,