Clarify behavior for options that use strain ID

victorlin · victorlin · commit 348613b80623 · 2024-03-05T15:48:38.000-08:00
Add a reference to --metadata-id-columns, which affects the behavior of
these options.
diff --git a/augur/filter/__init__.py b/augur/filter/__init__.py
@@ -66,7 +66,8 @@ def register_arguments(parser):
              "201X-10-01"). Similarly, an ambiguous month makes the
              corresponding day ambiguous (e.g., "2010-XX-01").""")
     metadata_filter_group.add_argument('--exclude', type=str, nargs="+", metavar="FILE", default=argparse.SUPPRESS,
-        help="File(s) with list of strains to exclude.")
+        help="""File(s) with list of strain IDs to exclude. The ID column is
+             determined by --metadata-id-columns.""")
     metadata_filter_group.add_argument('--exclude-where', nargs='+', metavar="CONDITION", default=argparse.SUPPRESS,
         help="""Exclude strains matching these conditions. Ex: \"host=rat\" or
              \"host!=rat\". Multiple values are processed as OR (matching any of
@@ -75,8 +76,9 @@ def register_arguments(parser):
         help="""Exclude all strains by default. Use this with the include
              arguments to select a specific subset of strains.""")
     metadata_filter_group.add_argument('--include', type=str, nargs="+", metavar="FILE", default=argparse.SUPPRESS,
-        help="""File(s) with list of strains to include regardless of
-             priorities, subsampling, or absence of an entry in --sequences.""")
+        help="""File(s) with list of strain IDs to include regardless of
+             priorities, subsampling, or absence of an entry in --sequences. The
+             ID column is determined by --metadata-id-columns.""")
     metadata_filter_group.add_argument('--include-where', nargs='+', metavar="CONDITION", default=argparse.SUPPRESS,
         help="""Include strains with these values. ex: host=rat. Multiple values
              are processed as OR (having any of those specified will be
@@ -128,13 +130,14 @@ def register_arguments(parser):
 
     subsample_group.add_argument('--priority', type=str, metavar="FILE", default=argparse.SUPPRESS,
         help="""Tab-delimited file with list of priority scores for strains
-             (e.g., "<strain>\\t<priority>") and no header. When scores are
+             (e.g., "<strain ID>\\t<priority>") and no header. When scores are
              provided, Augur converts scores to floating point values, sorts
              strains within each subsampling group from highest to lowest
              priority, and selects the top N strains per group where N is the
              calculated or requested number of strains per group. Higher numbers
              indicate higher priority. Since priorities represent relative
-             values between strains, these values can be arbitrary.""")
+             values between strains, these values can be arbitrary. The ID
+             column is determined by --metadata-id-columns.""")
     subsample_group.add_argument('--subsample-seed', type=int, metavar="N", default=argparse.SUPPRESS,
         help="""Random number generator seed to allow reproducible subsampling
              (with same input data).""")
@@ -149,7 +152,8 @@ def register_arguments(parser):
     output_group.add_argument('--output-metadata', metavar="FILE", default=argparse.SUPPRESS,
         help="Metadata for strains that passed filters.")
     output_group.add_argument('--output-strains', metavar="FILE", default=argparse.SUPPRESS,
-        help="List of strains that passed filters (no header).")
+        help="""List of strain IDs that passed filters (no header). The ID
+             column is determined by --metadata-id-columns.""")
     output_group.add_argument('--output-log', metavar="FILE", default=argparse.SUPPRESS,
         help="""Tab-delimited file with one row for each filtered strain and the
              reason it was filtered. Keyword arguments used for a given filter
diff --git a/tests/functional/filter/cram/filter-help.t b/tests/functional/filter/cram/filter-help.t
@@ -92,7 +92,8 @@ Show help text
                           "201X-10-01"). Similarly, an ambiguous month makes the
                           corresponding day ambiguous (e.g., "2010-XX-01").
     --exclude FILE [FILE ...]
-                          File(s) with list of strains to exclude.
+                          File(s) with list of strain IDs to exclude. The ID
+                          column is determined by --metadata-id-columns.
     --exclude-where CONDITION [CONDITION ...]
                           Exclude strains matching these conditions. Ex:
                           "host=rat" or "host!=rat". Multiple values are
@@ -102,9 +103,10 @@ Show help text
                           include arguments to select a specific subset of
                           strains.
     --include FILE [FILE ...]
-                          File(s) with list of strains to include regardless of
-                          priorities, subsampling, or absence of an entry in
-                          --sequences.
+                          File(s) with list of strain IDs to include regardless
+                          of priorities, subsampling, or absence of an entry in
+                          --sequences. The ID column is determined by
+                          --metadata-id-columns.
     --include-where CONDITION [CONDITION ...]
                           Include strains with these values. ex: host=rat.
                           Multiple values are processed as OR (having any of
@@ -151,15 +153,16 @@ Show help text
                           max-sequences` is provided. (default: True)
     --no-probabilistic-sampling
     --priority FILE       Tab-delimited file with list of priority scores for
-                          strains (e.g., "<strain>\t<priority>") and no header.
-                          When scores are provided, Augur converts scores to
-                          floating point values, sorts strains within each
-                          subsampling group from highest to lowest priority, and
-                          selects the top N strains per group where N is the
-                          calculated or requested number of strains per group.
-                          Higher numbers indicate higher priority. Since
-                          priorities represent relative values between strains,
-                          these values can be arbitrary.
+                          strains (e.g., "<strain ID>\t<priority>") and no
+                          header. When scores are provided, Augur converts
+                          scores to floating point values, sorts strains within
+                          each subsampling group from highest to lowest
+                          priority, and selects the top N strains per group
+                          where N is the calculated or requested number of
+                          strains per group. Higher numbers indicate higher
+                          priority. Since priorities represent relative values
+                          between strains, these values can be arbitrary. The ID
+                          column is determined by --metadata-id-columns.
     --subsample-seed N    Random number generator seed to allow reproducible
                           subsampling (with same input data).
   
@@ -173,7 +176,8 @@ Show help text
     --output-metadata FILE
                           Metadata for strains that passed filters.
     --output-strains FILE
-                          List of strains that passed filters (no header).
+                          List of strain IDs that passed filters (no header).
+                          The ID column is determined by --metadata-id-columns.
     --output-log FILE     Tab-delimited file with one row for each filtered
                           strain and the reason it was filtered. Keyword
                           arguments used for a given filter are reported in JSON