Merged
5 changes: 4 additions & 1 deletion docs/Makefile
@@ -6,7 +6,7 @@ SPHINXOPTS =
SPHINXBUILD = sphinx-build
SOURCEDIR = source
BUILDDIR = build

ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(SPHINXOPTS) $(SOURCEDIR)
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
@@ -17,3 +17,6 @@ help:
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

livehtml:
sphinx-autobuild -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
42 changes: 42 additions & 0 deletions docs/source/_static/notebooks/etl/lift-over.html

Large diffs are not rendered by default.

42 changes: 42 additions & 0 deletions docs/source/_static/notebooks/etl/vcf2delta.html

Large diffs are not rendered by default.

42 changes: 42 additions & 0 deletions docs/source/_static/notebooks/tertiary/gwas.html

Large diffs are not rendered by default.

42 changes: 42 additions & 0 deletions docs/source/_static/notebooks/tertiary/hail-overview.html

Large diffs are not rendered by default.


42 changes: 42 additions & 0 deletions docs/source/_static/notebooks/tertiary/pandas-lmm.html

Large diffs are not rendered by default.

13 changes: 8 additions & 5 deletions docs/source/conf.py
@@ -35,9 +35,9 @@

# -- Project information -----------------------------------------------------

project = 'glow'
copyright = '2019, Glow Project'
author = 'Glow Project'
project = 'Glow'
copyright = '2019, Glow Authors'
author = 'Glow Authors'

# The short X.Y version
version = ''
@@ -106,7 +106,10 @@
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}
html_theme_options = {
'page_width': '85%',
'sidebar_width': '15%'
}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
@@ -211,4 +214,4 @@
# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = True

#
#
18 changes: 18 additions & 0 deletions docs/source/etl/index.rst
@@ -0,0 +1,18 @@
.. meta::
:description: Learn how to perform genomic variant data ETL, manipulation, and quality control with Glow.

=========================
Variant Data Manipulation
=========================

Glow offers functionality to perform genomic variant data ETL, manipulation, and quality control.

.. toctree::
:maxdepth: 2

variant-data.rst
vcf2delta.rst
variant-qc.rst
sample-qc.rst
lift-over.rst
utility-functions.rst
111 changes: 111 additions & 0 deletions docs/source/etl/lift-over.rst
@@ -0,0 +1,111 @@
=========
Liftover
=========

Liftover tools convert genomic data between reference assemblies. The `UCSC liftOver tool`_ uses a `chain file`_ to
perform simple coordinate conversion, for example on `BED files`_. The `Picard LiftOverVcf tool`_ also uses the new
`reference assembly file`_ to transform variant information (e.g., alleles and INFO fields).
Glow can be used to run `coordinate liftover`_ and `variant liftover`_.

.. _`UCSC liftOver tool`: https://genome.ucsc.edu/cgi-bin/hgLiftOver
.. _`chain file`: https://genome.ucsc.edu/goldenPath/help/chain.html
.. _`reference assembly file`: https://software.broadinstitute.org/gatk/documentation/article?id=11013
.. _`BED files`: https://genome.ucsc.edu/FAQ/FAQformat.html#format1
.. _`Picard LiftOverVcf tool`: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/picard_vcf_LiftoverVcf.php

Create a liftover cluster
==========================

For both coordinate and variant liftover, you need a chain file on every node of the cluster. The following example downloads a chain file for liftover
from the b37 to the hg38 reference assembly.

.. code-block:: bash

#!/usr/bin/env bash
set -ex
set -o pipefail
mkdir /opt/liftover
curl https://raw.githubusercontent.com/broadinstitute/gatk/master/scripts/funcotator/data_sources/gnomAD/b37ToHg38.over.chain --output /opt/liftover/b37ToHg38.over.chain

Coordinate liftover
====================

To perform liftover for genomic coordinates, use the function ``lift_over_coordinates``. ``lift_over_coordinates`` has
the following parameters.

- chromosome: ``string``
- start: ``long``
- end: ``long``
- chain file: ``string`` (constant value, such as one created with ``lit()``)
- minimum fraction of bases that must remap: ``double`` (optional, defaults to ``.95``)

The returned ``struct`` has the following fields if liftover succeeded. If not, the UDF returns ``null``.

- ``contigName``: ``string``
- ``start``: ``long``
- ``end``: ``long``

.. code-block:: py

from pyspark.sql.functions import expr
liftover_expr = "lift_over_coordinates(contigName, start, end, '/opt/liftover/b37ToHg38.over.chain', .99)"
input_with_lifted_df = input_df.withColumn('lifted', expr(liftover_expr))
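
Because ``lift_over_coordinates`` returns ``null`` when liftover fails, you can separate lifted and failed rows by
filtering on the output column. The snippet below is a minimal sketch that assumes the ``input_with_lifted_df``
DataFrame created above.

.. code-block:: py

from pyspark.sql.functions import col

# Rows whose coordinates were lifted over successfully
lifted_df = input_with_lifted_df.where(col('lifted').isNotNull())

# Rows that could not be lifted over with this chain file and minimum match ratio
failed_df = input_with_lifted_df.where(col('lifted').isNull())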


Variant liftover
=================

For genetic variant data, use the ``lift_over_variants`` transformer. In addition to performing liftover for genetic
coordinates, variant liftover performs the following transformations:

- Reverse-complement and left-align the variant if needed
- Adjust the SNP, and correct AF-like INFO fields and the relevant genotypes if the reference and alternate alleles have
been swapped in the new genome build

Pull a target assembly :ref:`reference file <reference-genomes>` down to every node in the Spark cluster in addition to
a chain file before performing variant liftover.

The ``lift_over_variants`` transformer operates on a DataFrame containing genetic variants and supports the following
:ref:`options <transformer-options>`.

.. list-table::
:header-rows: 1

* - Parameter
- Default
- Description
* - chain_file
- n/a
- The path of the chain file.
* - reference_file
- n/a
- The path of the target reference file.
* - min_match_ratio
- .95
- Minimum fraction of bases that must remap.

The output DataFrame's schema consists of the input DataFrame's schema with the following fields appended:

- ``INFO_SwappedAlleles``: ``boolean`` (null if liftover failed, true if the reference and alternate alleles were
swapped, false otherwise)
- ``INFO_ReverseComplementedAlleles``: ``boolean`` (null if liftover failed, true if the reference and alternate
alleles were reverse complemented, false otherwise)
- ``liftOverStatus``: ``struct``

* ``success``: ``boolean`` (true if liftover succeeded, false otherwise)
* ``errorMessage``: ``string`` (null if liftover succeeded, message describing reason for liftover failure otherwise)

If liftover succeeds, the output row contains the liftover result and ``liftOverStatus.success`` is true.
If liftover fails, the output row contains the original input row, the additional ``INFO`` fields are null,
``liftOverStatus.success`` is false, and ``liftOverStatus.errorMessage`` contains the reason liftover failed.

.. code-block:: py

import glow
chain_file = '/opt/liftover/b37ToHg38.over.chain'
reference_file = '/mnt/dbnucleus/dbgenomics/grch38/data/GRCh38_full_analysis_set_plus_decoy_hla.fa'
output_df = glow.transform('lift_over_variants', input_df, chain_file=chain_file, reference_file=reference_file)
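
The ``liftOverStatus`` fields described above make it easy to split the output into lifted and failed variants.
This is a minimal sketch that assumes the ``output_df`` DataFrame produced by the transformer call above.

.. code-block:: py

from pyspark.sql.functions import col

# Variants successfully lifted over to the target assembly
lifted_df = output_df.where(col('liftOverStatus.success'))

# Failed variants, grouped by the reason liftover failed
failed_df = output_df.where(~col('liftOverStatus.success'))
failed_df.groupBy('liftOverStatus.errorMessage').count().show()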


.. notebook:: ../_static/notebooks/etl/lift-over.html
:title: Liftover notebook
@@ -40,4 +40,4 @@ Each of these functions returns a map from sample ID to a struct containing metr
- ``genotypes`` array with a ``conditionalQuality`` field
- A struct with ``min``, ``max``, ``mean``, and ``stddev``

.. notebook:: ../_static/notebooks/sample-qc-demo.html
.. notebook:: ../_static/notebooks/etl/sample-qc-demo.html
@@ -2,7 +2,7 @@
Variant I/O with Spark SQL
==========================

Glow includes Spark SQL support for reading and writing variant data in parallel directly from S3.
Glow makes it possible to read and write variant data at scale using Spark SQL.

.. tip::

@@ -19,7 +19,7 @@ the DataFrame API using Python, R, Scala, or SQL.

.. code-block:: py

df = spark.read.format("com.databricks.vcf").load(path)
df = spark.read.format("vcf").load(path)

The returned DataFrame has a schema that mirrors a single row of a VCF. Information that applies to an entire
variant (SNV or indel), such as the contig name, start and end positions, and INFO attributes,
@@ -37,8 +37,6 @@ You can control the behavior of the VCF reader with a few parameters. All parame
+----------------------+---------+---------+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Parameter | Type | Default | Description |
+======================+=========+=========+=========================================================================================================================================================+
| asADAMVariantContext | boolean | false | If true, rows are emitted in the VariantContext schema from the `ADAM <https://github.com/bigdatagenomics/adam>`_ project. |
+----------------------+---------+---------+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| includeSampleIds | boolean | true | If true, each genotype includes the sample ID it belongs to. Including sample IDs increases the size of each row, both in memory and on storage. |
+----------------------+---------+---------+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| splitToBiallelic | boolean | false | If true, multiallelic variants are split into two or more biallelic variants. |
@@ -53,7 +51,7 @@ You can save a DataFrame as a VCF file, which you can then read with other tools

.. code-block:: py

df.write.format("com.databricks.bigvcf").save(<path-to-file>)
df.write.format("bigvcf").save(<path-to-file>)

The file extension of the output path determines which, if any, compression codec should be used.
For instance, writing to a path such as ``/genomics/my_vcf.vcf.bgz`` will cause the output file to be
@@ -63,7 +61,7 @@ If you'd rather save a sharded VCF where each partition saves to a separate file

.. code-block:: py

df.write.format("com.databricks.vcf").save(path)
df.write.format("vcf").save(path)

To control the behavior of the sharded VCF writer, you can provide the following option:

@@ -80,7 +78,7 @@ For both the single and sharded VCF writer, you can use the following option to
| Parameter | Type | Default | Description |
+=============+========+=========+====================================================================================================================+
| vcfHeader | string | infer | If ``infer``, infers the header from the DataFrame schema. This value can be a complete header |
| | | | starting with ``##`` or a Hadoop filesystem path (for example, ``dbfs://...``) to a VCF file. The header from |
| | | | starting with ``##`` or a Hadoop filesystem path to a VCF file. The header from |
| | | | this file is used as the VCF header for each partition. |
+-------------+--------+---------+--------------------------------------------------------------------------------------------------------------------+
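
For example, to reuse the header from an existing VCF when writing a single file, you might pass the option as
follows. This is a sketch; the header file path is hypothetical, and the output path comes from the example above.

.. code-block:: py

# '/path/to/template.vcf' is a hypothetical VCF whose header is reused for the output
(df.write.format("bigvcf")
  .option("vcfHeader", "/path/to/template.vcf")
  .save("/genomics/my_vcf.vcf.bgz"))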

@@ -92,7 +90,7 @@ Glow also provides the ability to read BGEN files, including those distributed b

.. code-block:: py

df = spark.read.format("com.databricks.bgen").load(path)
df = spark.read.format("bgen").load(path)

As with the VCF reader, the provided path can be a file, directory, or glob pattern. If ``.bgi``
index files are located in the same directory as the data files, the reader uses the indexes to
@@ -113,7 +111,7 @@ You can use the ``DataFrameWriter`` API to save a single BGEN file, which you ca

.. code-block:: py

df.write.format("com.databricks.bigbgen").save(path)
df.write.format("bigbgen").save(path)

If the genotype arrays are missing ploidy and/or phasing information, the BGEN writer infers the values using the
provided values for ploidy, phasing, or ``posteriorProbabilities`` in the genotype arrays. You can provide the value for ploidy
@@ -134,4 +132,4 @@ To control the behavior of the BGEN writer, you can provide the following option
| defaultInferredPhasing | boolean | false | The inferred phasing if phasing is missing and cannot be inferred from ``posteriorProbabilities``. |
+------------------------+---------+---------+------------------------------------------------------------------------------------------------------------------------------------+
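
For example, to write phased output when phasing is missing and cannot be inferred from ``posteriorProbabilities``,
you might set the option above explicitly. This is a minimal sketch using the ``bigbgen`` writer shown earlier;
``path`` is a placeholder output location.

.. code-block:: py

(df.write.format("bigbgen")
  .option("defaultInferredPhasing", True)
  .save(path))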

.. notebook:: ../_static/notebooks/variant-data.html
.. notebook:: ../_static/notebooks/etl/variant-data.html
@@ -39,4 +39,4 @@ You can calculate quality control statistics on your variant data using Spark SQ
- The ``genotypes`` array
- A struct containing the min, max, mean, and sample standard deviation for genotype quality (GQ in VCF v4.2 specification) across all samples

.. notebook:: ../_static/notebooks/variant-qc-demo.html
.. notebook:: ../_static/notebooks/etl/variant-qc-demo.html
17 changes: 17 additions & 0 deletions docs/source/etl/vcf2delta.rst
@@ -0,0 +1,17 @@
============================
Create a Genomics Delta Lake
============================

Genomics data is usually stored in specialized flat-file formats such as VCF or BGEN.

The example below shows how to ingest a VCF into a genomics `Delta Lake table <https://delta.io>`_ using Glow in Python
(R, Scala, and SQL are also supported).

You can use Delta tables for queries with second-scale latency, performant range joins (similar to the single-node
bioinformatics tool ``bedtools intersect``), aggregate analyses such as calculating summary statistics, and
machine learning or deep learning.
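
The core of that ingestion is only a few lines. The sketch below, which assumes hypothetical input and output
paths, reads a VCF with the Glow data source and writes it out in Delta format; the notebook linked below walks
through a complete workflow.

.. code-block:: py

# Hypothetical input and output locations
vcf_path = '/data/genomics/my_cohort.vcf.bgz'
delta_path = '/data/genomics/delta/my_cohort'

# Read the VCF with the Glow data source and write it out as a Delta table
df = spark.read.format('vcf').load(vcf_path)
df.write.format('delta').save(delta_path)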

.. tip:: We recommend ingesting VCF files into Delta tables once volumes reach >1,000 samples, >10 billion genotypes, or >1 terabyte.

.. notebook:: ../_static/notebooks/etl/vcf2delta.html
:title: VCF to Delta Lake table notebook
11 changes: 4 additions & 7 deletions docs/source/index.rst
@@ -1,14 +1,11 @@
glow
Glow
====

Glow is an open-source genomic data analysis tool using `Apache Spark <https://spark.apache.org>`__.

.. toctree::
:maxdepth: 2

api/variant-data
api/variant-qc
api/sample-qc
api/pipe-transformer
api/glue-functions
modules
etl/index
tertiary/index
.. modules
2 changes: 1 addition & 1 deletion docs/source/modules.rst
@@ -1,7 +1,7 @@
Python API
==========

.. automodule:: db_genomics.dbg
.. automodule:: glow.glow
:members:
:undoc-members:
:show-inheritance:
16 changes: 16 additions & 0 deletions docs/source/tertiary/index.rst
@@ -0,0 +1,16 @@
.. meta::
:description: Learn how to perform population-scale statistical analyses of genetic variants.

=================
Tertiary Analysis
=================

Perform population-scale statistical analyses of genetic variants.

.. toctree::
:maxdepth: 2

pipe-transformer
variant-normalization
pandas-udf
regression-tests
9 changes: 9 additions & 0 deletions docs/source/tertiary/pandas-udf.rst
@@ -0,0 +1,9 @@
=================================
Using Python Statistics Libraries
=================================

This notebook demonstrates how to use :ref:`pandas UDFs <pandas-udf>` to run native Python code with
PySpark when working with genomic data.
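
For a flavor of the pattern, the sketch below defines a scalar pandas UDF that applies a vectorized NumPy
transformation to a numeric column. The ``phenotype_df`` DataFrame and its ``phenotype`` column are hypothetical;
the notebook below shows a complete example on genomic data.

.. code-block:: py

import numpy as np
from pyspark.sql.functions import pandas_udf

# A scalar pandas UDF receives and returns pandas Series, so vectorized
# NumPy / pandas code runs on each batch of rows
@pandas_udf('double')
def log1p_udf(values):
    return np.log1p(values)

transformed_df = phenotype_df.withColumn('log_phenotype', log1p_udf('phenotype'))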

.. notebook:: ../_static/notebooks/tertiary/pandas-lmm.html
:title: pandas example notebook