Merge remote-tracking branch 'origin/master' into feat/probabilistic_…

…sampling
nextstrain · Dec 18, 2020 · 7e4eb13 · 7e4eb13
2 parents e4d520c + 98c0ea3
commit 7e4eb13
Show file tree

Hide file tree

Showing 32 changed files with 1,282 additions and 106 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -3,6 +3,25 @@
 ## __NEXT__
 
 
+## 10.1.1 (16 November 2020)
+
+### Bug Fixes
+
+* dependencies: Require the most recent minor versions of TreeTime (0.8.X) to fix numpy matrix errors [#633][]
+
+[#633]: https://github.com/nextstrain/augur/pull/633
+
+## 10.1.0 (13 November 2020)
+
+### Features
+
+* docs: Migrate non-reference documentation to docs.nextstrain.org [#620][]
+* filter: Add `--exclude-ambiguous-dates-by` flag to enable exclusion of samples with ambiguous dates [#623][] and [#631][]
+
+[#620]: https://github.com/nextstrain/augur/pull/620
+[#623]: https://github.com/nextstrain/augur/pull/623
+[#631]: https://github.com/nextstrain/augur/pull/631
+
 ## 10.0.4 (6 November 2020)
 
 ### Bug Fixes

diff --git a/README.md b/README.md
@@ -28,7 +28,7 @@ The output of augur is a series of JSONs that can be used to visualize your resu
 * [Technical documentation for Augur](https://nextstrain-augur.readthedocs.io/en/stable/installation/installation.html)
 * [Contributor guide](https://github.com/nextstrain/.github/blob/master/CONTRIBUTING.md)
 * [Project board with available issues](https://github.com/orgs/nextstrain/projects/6)
-* [Developer docs for Augur](./DEV_DOCS.md)
+* [Developer docs for Augur](./docs/contribute/DEV_DOCS.md)
 
 ## Quickstart
 

diff --git a/augur/__version__.py b/augur/__version__.py
@@ -1,4 +1,4 @@
-__version__ = '10.0.4'
+__version__ = '10.1.1'
 
 
 def is_augur_version_compatible(version):

diff --git a/augur/data/lat_longs.tsv b/augur/data/lat_longs.tsv
@@ -53,6 +53,8 @@ country	burkina faso	12.5	-1.66667
 country	cambodia	13.0	105.0
 country	cameroon	6.0	12.5
 country	canada	61.0666922	-107.991707
+country	cape_verde	14.998654	-23.530702
+country	cape verde	14.998654	-23.530702
 country	cayman_islands	19.323979	-81.138315
 country	cayman islands	19.323979	-81.138315
 country	central_african_republic	7.0	21.0

diff --git a/augur/filter.py b/augur/filter.py
@@ -10,7 +10,7 @@
 import sys
 import datetime
 import treetime.utils
-from .utils import read_metadata, get_numerical_dates, run_shell_command, shquote
+from .utils import read_metadata, get_numerical_dates, run_shell_command, shquote, is_date_ambiguous
 
 comment_char = '#'
 
@@ -106,6 +106,8 @@ def register_arguments(parser):
                                 help="Exclude samples matching these conditions. Ex: \"host=rat\" or \"host!=rat\". Multiple values are processed as OR (matching any of those specified will be excluded), not AND")
     parser.add_argument('--include-where', nargs='+',
                                 help="Include samples with these values. ex: host=rat. Multiple values are processed as OR (having any of those specified will be included), not AND. This rule is applied last and ensures any sequences matching these rules will be included.")
+    parser.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'],
+                                help='Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g., 2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous year makes the corresponding month and day ambiguous, too, even if those fields have unambiguous values (e.g., "201X-10-01"). Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01").')
     parser.add_argument('--query', help="Filter samples by attribute. Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax.")
     parser.add_argument('--output', '-o', help="output file", required=True)
 
@@ -232,6 +234,17 @@ def run(args):
             num_excluded_by_length = len(seq_keep) - len(seq_keep_by_length)
             seq_keep = seq_keep_by_length
 
+    # filter by ambiguous dates
+    num_excluded_by_ambiguous_date = 0
+    if args.exclude_ambiguous_dates_by and 'date' in meta_columns:
+        seq_keep_by_date = []
+        for seq_name in seq_keep:
+            if not is_date_ambiguous(meta_dict[seq_name]['date'], args.exclude_ambiguous_dates_by):
+                seq_keep_by_date.append(seq_name)
+
+        num_excluded_by_ambiguous_date = len(seq_keep) - len(seq_keep_by_date)
+        seq_keep = seq_keep_by_date
+
     # filter by date
     num_excluded_by_date = 0
     if (args.min_date or args.max_date) and 'date' in meta_columns:
@@ -445,6 +458,8 @@ def run(args):
         print("\t%i of these were filtered out by the query:\n\t\t\"%s\"" % (num_excluded_by_query, args.query))
     if args.min_length:
         print("\t%i of these were dropped because they were shorter than minimum length of %sbp" % (num_excluded_by_length, args.min_length))
+    if args.exclude_ambiguous_dates_by and num_excluded_by_ambiguous_date:
+        print("\t%i of these were dropped because of their ambiguous date in %s" % (num_excluded_by_ambiguous_date, args.exclude_ambiguous_dates_by))
     if (args.min_date or args.max_date) and 'date' in meta_columns:
         print("\t%i of these were dropped because of their date (or lack of date)" % (num_excluded_by_date))
     if args.non_nucleotide:

diff --git a/augur/utils.py b/augur/utils.py
@@ -73,6 +73,37 @@ def ambiguous_date_to_date_range(uncertain_date, fmt, min_max_year=None):
 def read_metadata(fname, query=None):
     return MetadataFile(fname, query).read()
 
+def is_date_ambiguous(date, ambiguous_by="any"):
+    """
+    Returns whether a given date string in the format of YYYY-MM-DD is ambiguous by a given part of the date (e.g., day, month, year, or any parts).
+
+    Parameters
+    ----------
+    date : str
+        Date string in the format of YYYY-MM-DD
+    ambiguous_by : str
+        Field of the date string to test for ambiguity ("day", "month", "year", "any")
+    """
+    date_components = date.split('-', 2)
+
+    if len(date_components) == 3:
+        year, month, day = date_components
+    elif len(date_components) == 2:
+        year, month = date_components
+        day = "XX"
+    else:
+        year = date_components[0]
+        month = "XX"
+        day = "XX"
+
+    # Determine ambiguity hierarchically such that, for example, an ambiguous
+    # month implicates an ambiguous day even when day information is available.
+    return any((
+        "X" in year,
+        "X" in month and ambiguous_by in ("any", "month", "day"),
+        "X" in day and ambiguous_by in ("any", "day")
+    ))
+
 def get_numerical_dates(meta_dict, name_col = None, date_col='date', fmt=None, min_max_year=None):
     if fmt:
         from datetime import datetime

diff --git a/docs/conf.py b/docs/conf.py
@@ -53,15 +53,27 @@ def prose_list(items):
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
-extensions = ['recommonmark', 'sphinx.ext.autodoc', 'sphinxarg.ext', 'sphinx.ext.napoleon', 'sphinx_markdown_tables']
+extensions = ['recommonmark', 'sphinx.ext.autodoc', 'sphinxarg.ext', 'sphinx.ext.napoleon', 'sphinx_markdown_tables', 'sphinx.ext.intersphinx']
 
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store',
+    'contribute/DEV_DOCS.md',
+    'faq/colors.md',
+    'faq/fasta_input.md',
+    'faq/import-beast.md',
+    'faq/lat_longs.md',
+    'faq/seq_traits.md',
+    'faq/translate_ref.md',
+    'faq/vcf_input.md',
+    'tutorials/tb_tutorial.md',
+    'tutorials/zika_tutorial.md',
+    'usage/augur_snakemake.md',
+]
 
 # A string of reStructuredText that will be included at the end of every source
 # file that is read. This is a possible place to add substitutions that should
@@ -78,6 +90,12 @@ def prose_list(items):
 #
 html_theme = 'nextstrain-sphinx-theme'
 
+html_theme_options = {
+    'logo_only': False, # if True, don't display project name at top of the sidebar
+    'collapse_navigation': False, # if True, no [+] icons in sidebar
+    'titles_only': True, # if True, page subheadings not included in nav
+}
+
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
@@ -88,3 +106,9 @@ def prose_list(items):
 html_css_files = [
     'css/custom.css',
 ]
+
+# -- Cross-project references ------------------------------------------------
+
+intersphinx_mapping = {
+    'docs.nextstrain.org': ('https://docs.nextstrain.org/en/latest/', None),
+}
diff --git a/DEV_DOCS.md → docs/contribute/DEV_DOCS.md b/DEV_DOCS.md → docs/contribute/DEV_DOCS.md
@@ -1,6 +1,6 @@
-# Development Docs for Contributors
+# Augur Development Docs for Contributors
 
-Thank you for helping us to improve Nextstrain! This document describes:
+Thank you for helping us to improve Augur! This document describes:
 
 - Getting Started
 - Contributing code

diff --git a/docs/faq/augur_snakemake.md b/docs/faq/augur_snakemake.md
diff --git a/docs/faq/community_hosting.md b/docs/faq/community_hosting.md
diff --git a/docs/faq/faq.rst b/docs/faq/faq.rst
@@ -10,15 +10,7 @@ common questions and problems users run into.
    :maxdepth: 1
    :glob:
 
+   what-is-a-build
    metadata
-   translate_ref
    clades
-   community_hosting
-   import-beast
-   colors
-   lat_longs
    Specifying `refine` rates <refine>
-   Using Augur and Snakemake <augur_snakemake>
-   vcf_input
-   fasta_input
-   seq_traits
diff --git a/docs/faq/what-is-a-build.md b/docs/faq/what-is-a-build.md
@@ -0,0 +1,32 @@
+# The concept of a 'build'
+
+Nextstrain's focus on providing a _real-time_ snapshot of evolving pathogen populations necessitates a reproducible analysis that can be rerun when new sequences are available.
+Together, the individual steps necessary to repeat an analysis comprise a "build".
+
+
+Because no two datasets or pathogens are the same, we build Augur to be flexible and suitable for different analyses.
+The individual Augur commands are composable, and can be mixed and matched with other scripts as needed.
+These steps, taken together, are what we refer to as a "build".
+
+
+### Example build
+
+The [Zika virus tutorial](https://docs.nextstrain.org/en/latest/tutorials/zika.html#build-steps) describes a build which contains the following steps:
+
+1. Prepare pathogen sequences and metadata
+2. Align sequences
+3. Construct a phylogeny from aligned sequences
+4. Annotate the phylogeny with inferred ancestral pathogen dates, sequences, and traits
+5. Export the annotated phylogeny and corresponding metadata into auspice-readable format
+
+and each of these can be run via a separate `augur` command.
+
+If you look at the [other tutorials](https://docs.nextstrain.org/en/latest/tutorials/index.html), each one uses a slightly different combination of `augur` commands depending on the pathogen.
+
+### Snakemake
+
+While it is possible to run a build by running each of the individual steps, we typically group these together into a make-type file.
+[Snakemake](https://snakemake.readthedocs.io/en/stable/index.html) is "a tool to create reproducible and scalable data analyses... via a human-readable, Python-based language."
+
+> Snakemake is installed as part of the [conda environment](https://docs.nextstrain.org/en/latest/guides/install/local-installation.html#install-augur-auspice-with-conda) or the [docker container](https://docs.nextstrain.org/en/latest/guides/install/cli-install.html).
+If you ever see a build which has a "Snakefile" then you can run this by typing `snakemake --cores 1` or `nextstrain build --cpus 1 .`, respectively.
diff --git a/docs/index.rst b/docs/index.rst
@@ -5,9 +5,19 @@ Augur: A bioinformatics toolkit for phylogenetic analysis
     *One held to foretell events by omens.*
     (`Merriam-Webster <https://www.merriam-webster.com/dictionary/augur>`__)
 
+.. note::
+   The documentation you are viewing is Augur's reference guide, which means it is information-oriented and targeted at users who just need info about how Augur works.
+
+   * If you have a question about how to achieve a specific goal with Augur, check out our :doc:`Augur-focused How-to Guides section <docs.nextstrain.org:guides/bioinformatics/index>` in the main Nextstrain documentation.
+   * If you want to learn the basics of how to use Augur from scratch, check out our :doc:`Zika tutorial <docs.nextstrain.org:tutorials/zika>` in the main Nextstrain documentation.
+   * If you want to understand how Augur fits together with Auspice to visualize results, check out our :doc:`Data Formats section <docs.nextstrain.org:reference/formats/data-formats>` in the main Nextstrain documentation.
+
+
+
 Augur is a bioinformatics toolkit to track evolution from sequence and serological data.
 It provides a collection of commands which are designed to be composable into larger processing pipelines.
 Augur originated as part of `Nextstrain <https://nextstrain.org>`__, an open-source project to harness the scientific and public health potential of pathogen genome data.
+All source code is available on `GitHub <https://github.com/nextstrain/augur>`__.
 
 .. note:: We have just released version 6 of augur -- `check our upgrading guide <releases/migrating-v5-v6.html>`__
 
@@ -26,7 +36,6 @@ The ``refine`` step is necessary to ensure that cross-referencing between tree n
 The different augur modules can be strung together by workflow managers like snakemake and nextflow.
 The nextstrain team uses `snakemake <https://snakemake.readthedocs.io/en/stable/>`__ to run and manage the different analyses that you see on `nextstrain.org <https://nextstrain.org>`__.
 
-
 .. toctree::
    :maxdepth: 2
    :caption: Table of contents
@@ -37,7 +46,6 @@ The nextstrain team uses `snakemake <https://snakemake.readthedocs.io/en/stable/
    usage/usage
    releases/releases
    faq/faq
-   tutorials/tutorials
    examples/examples
    api/api
    authors/authors
diff --git a/docs/releases/migrating-v5-v6.md b/docs/releases/migrating-v5-v6.md
@@ -119,7 +119,8 @@ These may have been inferred for internal nodes by Augur functions like `augur t
 Certain traits have a geographic interpretation, e.g. "country".
 Auspice will attempt to display these traits on a map (and provide a drop-down to switch between them if there are more than one).
 
-> _Make sure that these have corresponding entry in the lat-longs TSV file supplied to `export`. See how to do this [here](/faq/lat_longs)._
+> _Make sure that these have a corresponding entry in the lat-longs TSV file supplied to `export`. See how to do this [here](https://docs.nextstrain.org/en/latest/guides/bioinformatics/lat_longs.html)._
+
 
 
 ---
@@ -528,5 +529,6 @@ In Auspice v2, all values are now displayed exactly as they arrive, allowing use
 
 Don't forget to also change them in any custom lat-long and/or coloring files you are using. We've also become stricter about the format of the files that pass in color and lat-long information. Previously, it didn't matter if columns were separated by spaces or tabs - now, they must be separated by tabs.
 
-You can find out more about how to add [custom coloring](/faq/colors) and [lat-long](/faq/lat_longs) values.
+You can find out more about how to add [custom coloring](https://docs.nextstrain.org/en/latest/guides/bioinformatics/colors.html) and [lat-long](https://docs.nextstrain.org/en/latest/guides/bioinformatics/lat_longs.html) values.
+
 If you use the command `parse` to generate a metadata table from fields in a fasta header, you can use the flag `--prettify-fields` to apply some prettifying operations to specific metadata entries, see the documentation [`parse`](/usage/cli/parse).
diff --git a/docs/releases/v6.md b/docs/releases/v6.md
@@ -46,7 +46,7 @@ Users can ask for this output and specify a file name using `--output-sequences`
 <span style='color: orange'>Deprecation warning:</span> The argument `--output` is now deprecated. Please use `--output-node-data` instead.
 
 ## Import BEAST MCC trees
-We now have instructions and functionality to import BEAST trees, see [here](/faq/import-beast).
+We now have instructions and functionality to import BEAST trees, see [here](https://docs.nextstrain.org/en/latest/guides/bioinformatics/import-beast.html).
 
 ## Prettifying of strings
 Previous auspice version "prettified" metadata strings (like changing 'north_america' to 'North America').
@@ -110,4 +110,4 @@ We've tried to use redirects to ensure that all the old links continue to work.
 * Errors in formatting of input files (e.g. metadata files, Auspice config files) weren't handled nicely, often resulting in hard-to-interpret stack traces.
 We now try to catch these and print an error indicating the offending file.
 
-* Tests using Python version 2 have now been removed.
+* Tests using Python version 2 have now been removed.
diff --git a/docs/tutorials/tb_tutorial.md b/docs/tutorials/tb_tutorial.md
@@ -7,7 +7,7 @@ As in the Zika fasta-input [tutorial](zika_tutorial), we'll build up a Snakefile
 
 ## Setup
 
-To run this tutorial you'll need to [install augur](../installation/installation) and [install Snakemake](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html).
+To run this tutorial you'll need to [install augur](../guides/install/augur_install.md) and [install Snakemake](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html).
 
 ## Build steps
 Nextstrain builds typically require the following steps:

diff --git a/docs/tutorials/tutorials.rst b/docs/tutorials/tutorials.rst
diff --git a/docs/tutorials/zika_tutorial.md b/docs/tutorials/zika_tutorial.md
@@ -7,7 +7,7 @@ We will work off the tutorial for Zika virus on the [nextstrain web site](https:
 
 ## Setup
 
-To run this tutorial you'll need to [install augur](../installation/installation) and [install Snakemake](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html).
+To run this tutorial you'll need to [install augur](../guides/install/augur_install.md) and [install Snakemake](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html).
 
 ## Augur commands