Skip to content

Commit

Permalink
🚧 support db file as metadata index
Browse files: browse the repository at this point in the history
  • Loading branch information
victorlin committed Sep 6, 2024
1 parent a0a8e81 commit 99c5a0d
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 13 deletions.
2 changes: 2 additions & 0 deletions augur/filter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def register_arguments(parser):
input_group = parser.add_argument_group("inputs", "metadata and sequences to be filtered")
input_group.add_argument('--metadata', required=True, metavar="FILE", help="sequence metadata")
input_group.add_argument('--sequences', '-s', help="sequences in FASTA or VCF format")
input_group.add_argument('--metadata-index', metavar="FILE", help="SQLite3 database file with metadata preloaded")
input_group.add_argument('--sequence-index', help="sequence composition report generated by augur index. If not provided, an index will be created on the fly.")
input_group.add_argument('--metadata-chunk-size', type=int, default=100000, help="maximum number of metadata records to read into memory at a time. Increasing this number can speed up filtering at the cost of more memory used. NOTE: this only applies to --query/--query-pandas.")
input_group.add_argument('--metadata-id-columns', default=DEFAULT_ID_COLUMNS, nargs="+", action=ExtendOverwriteDefault, help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
Expand Down Expand Up @@ -110,6 +111,7 @@ def register_arguments(parser):
output_group = parser.add_argument_group("outputs", "options related to outputs, at least one of the possible representations of filtered data (--output, --output-metadata, --output-strains) is required")
output_group.add_argument('--output', '--output-sequences', '-o', help="filtered sequences in FASTA format")
output_group.add_argument('--output-metadata', help="metadata for strains that passed filters")
output_group.add_argument('--output-metadata-index', help="SQLite3 database file with metadata preloaded")
output_group.add_argument('--output-strains', help="list of strains that passed filters (no header)")
output_group.add_argument('--output-log', help="tab-delimited file with one row for each filtered strain and the reason it was filtered. Keyword arguments used for a given filter are reported in JSON format in a `kwargs` column.")
output_group.add_argument('--output-group-by-sizes', help="tab-delimited file one row per group with target size.")
Expand Down
34 changes: 21 additions & 13 deletions augur/filter/_run.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from argparse import Namespace
import shutil
from tempfile import NamedTemporaryFile
from augur.errors import AugurError

Expand Down Expand Up @@ -26,22 +27,29 @@ def run(args: Namespace):

print_debug(f"Temporary database file: {constants.RUNTIME_DB_FILE!r}")

initialize_input_source_table()
if args.metadata_index:
shutil.copyfile(args.metadata_index, file.name)
else:
initialize_input_source_table()

try:
metadata = Metadata(args.metadata, id_columns=args.metadata_id_columns, delimiters=args.metadata_delimiters)
except InvalidDelimiter:
raise AugurError(
f"Could not determine the delimiter of {args.metadata!r}. "
f"Valid delimiters are: {args.metadata_delimiters!r}. "
"This can be changed with --metadata-delimiters."
)
columns = get_useful_metadata_columns(args, metadata.id_column, metadata.columns)
import_metadata(metadata, columns)
try:
metadata = Metadata(args.metadata, id_columns=args.metadata_id_columns, delimiters=args.metadata_delimiters)
except InvalidDelimiter:
raise AugurError(
f"Could not determine the delimiter of {args.metadata!r}. "
f"Valid delimiters are: {args.metadata_delimiters!r}. "
"This can be changed with --metadata-delimiters."
)
columns = get_useful_metadata_columns(args, metadata.id_column, metadata.columns)
import_metadata(metadata, columns)

import_sequence_index(args)
import_sequence_index(args)

parse_dates()
parse_dates()

if args.output_metadata_index:
print(f"Saving database file to {args.output_metadata_index!r}")
shutil.copyfile(file.name, args.output_metadata_index)

exclude_by, include_by = construct_filters(args)
apply_filters(exclude_by, include_by)
Expand Down

0 comments on commit 99c5a0d

Please sign in to comment.