diff --git a/docs/Makefile b/docs/Makefile index ba501f6f5..cb530b74a 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -6,7 +6,7 @@ SPHINXOPTS = SPHINXBUILD = sphinx-build SOURCEDIR = source BUILDDIR = build - +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(SPHINXOPTS) $(SOURCEDIR) # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) @@ -17,3 +17,6 @@ help: # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +livehtml: + sphinx-autobuild -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html diff --git a/docs/source/_static/notebooks/etl/lift-over.html b/docs/source/_static/notebooks/etl/lift-over.html new file mode 100644 index 000000000..a5e6f8951 --- /dev/null +++ b/docs/source/_static/notebooks/etl/lift-over.html @@ -0,0 +1,42 @@ + + + + +Lift Over - Databricks + + + + + + + + + + + + + + + + + + + diff --git a/docs/source/_static/notebooks/sample-qc-demo.html b/docs/source/_static/notebooks/etl/sample-qc-demo.html similarity index 100% rename from docs/source/_static/notebooks/sample-qc-demo.html rename to docs/source/_static/notebooks/etl/sample-qc-demo.html diff --git a/docs/source/_static/notebooks/variant-data.html b/docs/source/_static/notebooks/etl/variant-data.html similarity index 100% rename from docs/source/_static/notebooks/variant-data.html rename to docs/source/_static/notebooks/etl/variant-data.html diff --git a/docs/source/_static/notebooks/variant-qc-demo.html b/docs/source/_static/notebooks/etl/variant-qc-demo.html similarity index 100% rename from docs/source/_static/notebooks/variant-qc-demo.html rename to docs/source/_static/notebooks/etl/variant-qc-demo.html diff --git a/docs/source/_static/notebooks/etl/vcf2delta.html b/docs/source/_static/notebooks/etl/vcf2delta.html new file mode 100644 index 000000000..82ab20264 --- /dev/null +++ b/docs/source/_static/notebooks/etl/vcf2delta.html @@ -0,0 +1,42 @@ + + + + +genomics_data_warehouse - Databricks + + + + + + + + + + + + + + + + + + + diff --git a/docs/source/_static/notebooks/tertiary/gwas.html b/docs/source/_static/notebooks/tertiary/gwas.html new file mode 100644 index 000000000..17acc209c --- /dev/null +++ b/docs/source/_static/notebooks/tertiary/gwas.html @@ -0,0 +1,42 @@ + + + + +engineering_population_scale_gwas - Databricks + + + + + + + + + + + + + + + + + + + diff --git a/docs/source/_static/notebooks/tertiary/hail-overview.html b/docs/source/_static/notebooks/tertiary/hail-overview.html new file mode 100644 index 000000000..761305d0d --- /dev/null +++ b/docs/source/_static/notebooks/tertiary/hail-overview.html @@ -0,0 +1,42 @@ + + + + +hail-overview - Databricks + + + + + + + + + + + + + + + + + + + diff --git a/docs/source/_static/notebooks/tertiary/normalizevariants-transformer.html b/docs/source/_static/notebooks/tertiary/normalizevariants-transformer.html new file mode 100644 index 000000000..6f14fa199 --- /dev/null +++ b/docs/source/_static/notebooks/tertiary/normalizevariants-transformer.html @@ -0,0 +1,42 @@ + + + + +normalizevariants-python - Databricks + + + + + + + + + + + + + + + + + + + diff --git a/docs/source/_static/notebooks/tertiary/pandas-lmm.html b/docs/source/_static/notebooks/tertiary/pandas-lmm.html new file mode 100644 index 000000000..3462dd939 --- /dev/null +++ b/docs/source/_static/notebooks/tertiary/pandas-lmm.html @@ -0,0 +1,42 @@ + + + + +pandas-lmm - Databricks + + + + + + + + + + + + + + + + + + + 
diff --git a/docs/source/_static/notebooks/pipe-transformer.html b/docs/source/_static/notebooks/tertiary/pipe-transformer.html similarity index 100% rename from docs/source/_static/notebooks/pipe-transformer.html rename to docs/source/_static/notebooks/tertiary/pipe-transformer.html diff --git a/docs/source/conf.py b/docs/source/conf.py index 8b055c626..e4bdf4524 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -35,9 +35,9 @@ # -- Project information ----------------------------------------------------- -project = 'glow' -copyright = '2019, Glow Project' -author = 'Glow Project' +project = 'Glow' +copyright = '2019, Glow Authors' +author = 'Glow Authors' # The short X.Y version version = '' @@ -106,7 +106,10 @@ # further. For a list of options available for each theme, see the # documentation. # -# html_theme_options = {} +html_theme_options = { + 'page_width': '85%', + 'sidebar_width': '15%' +} # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, @@ -211,4 +214,4 @@ # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = True -# \ No newline at end of file +# diff --git a/docs/source/etl/index.rst b/docs/source/etl/index.rst new file mode 100644 index 000000000..a3a265ab3 --- /dev/null +++ b/docs/source/etl/index.rst @@ -0,0 +1,18 @@ +.. meta:: + :description: Learn how to use Glow to perform ETL, manipulation, and quality control on genomic variant data. + +========================= +Variant Data Manipulation +========================= + +Glow provides functions to perform genomic variant data ETL, manipulation, and quality control. + +.. toctree:: + :maxdepth: 2 + + variant-data.rst + vcf2delta.rst + variant-qc.rst + sample-qc.rst + lift-over.rst + utility-functions.rst diff --git a/docs/source/etl/lift-over.rst b/docs/source/etl/lift-over.rst new file mode 100644 index 000000000..2d16bf866 --- /dev/null +++ b/docs/source/etl/lift-over.rst @@ -0,0 +1,111 @@ +========= +Liftover +========= + +Liftover tools convert genomic data between reference assemblies. The `UCSC liftOver tool`_ uses a `chain file`_ to +perform simple coordinate conversion, for example on `BED files`_. The `Picard LiftOverVcf tool`_ also uses the new +`reference assembly file`_ to transform variant information (e.g., alleles and INFO fields). +Glow can be used to run `coordinate liftover`_ and `variant liftover`_. + +.. _`UCSC liftOver tool`: https://genome.ucsc.edu/cgi-bin/hgLiftOver +.. _`chain file`: https://genome.ucsc.edu/goldenPath/help/chain.html +.. _`reference assembly file`: https://software.broadinstitute.org/gatk/documentation/article?id=11013 +.. _`BED files`: https://genome.ucsc.edu/FAQ/FAQformat.html#format1 +.. _`Picard LiftOverVcf tool`: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/picard_vcf_LiftoverVcf.php + +Create a liftover cluster +========================== + +For both coordinate and variant liftover, you need a chain file on every node of the cluster. The following example downloads a chain file for liftover +from the b37 to the hg38 reference assembly. + +.. 
code-block:: bash + + #!/usr/bin/env bash + set -ex + set -o pipefail + mkdir /opt/liftover + curl https://raw.githubusercontent.com/broadinstitute/gatk/master/scripts/funcotator/data_sources/gnomAD/b37ToHg38.over.chain --output /opt/liftover/b37ToHg38.over.chain + +Coordinate liftover +==================== + +To perform liftover for genomic coordinates, use the function ``lift_over_coordinates``. ``lift_over_coordinates`` has +the following parameters. + +- chromosome: ``string`` +- start: ``long`` +- end: ``long`` +- chain file: ``string`` (constant value, such as one created with ``lit()``) +- minimum fraction of bases that must remap: ``double`` (optional, defaults to ``.95``) + +The returned ``struct`` has the following values if liftover succeeded. If not, the UDF returns ``null``. + +- ``contigName``: ``string`` +- ``start``: ``long`` +- ``end``: ``long`` + +.. code-block:: py + + from pyspark.sql.functions import expr + liftover_expr = "lift_over_coordinates(contigName, start, end, '/opt/liftover/b37ToHg38.over.chain', .99)" + input_with_lifted_df = input_df.withColumn('lifted', expr(liftover_expr)) + + +Variant liftover +================= + +For genetic variant data, use the ``lift_over_variants`` transformer. In addition to performing liftover for genetic +coordinates, variant liftover performs the following transformations: + +- Reverse-complement and left-align the variant if needed +- Adjust the SNP, and correct AF-like INFO fields and the relevant genotypes if the reference and alternate alleles have + been swapped in the new genome build + +Pull a target assembly :ref:`reference file ` down to every node in the Spark cluster in addition to +a chain file before performing variant liftover. + +The ``lift_over_variants`` transformer operates on a DataFrame containing genetic variants and supports the following +:ref:`options `. + +.. list-table:: + :header-rows: 1 + + * - Parameter + - Default + - Description + * - chain_file + - n/a + - The path of the chain file. + * - reference_file + - n/a + - The path of the target reference file. + * - min_match_ratio + - .95 + - Minimum fraction of bases that must remap. + +The output DataFrame's schema consists of the input DataFrame's schema with the following fields appended: + +- ``INFO_SwappedAlleles``: ``boolean`` (null if liftover failed, true if the reference and alternate alleles were + swapped, false otherwise) +- ``INFO_ReverseComplementedAlleles``: ``boolean`` (null if liftover failed, true if the reference and alternate + alleles were reverse complemented, false otherwise) +- ``liftOverStatus``: ``struct`` + + * ``success``: ``boolean`` (true if liftover succeeded, false otherwise) + * ``errorMessage``: ``string`` (null if liftover succeeded, message describing reason for liftover failure otherwise) + +If liftover succeeds, the output row contains the liftover result and ``liftOverStatus.success`` is true. +If liftover fails, the output row contains the original input row, the additional ``INFO`` fields are null, +``liftOverStatus.success`` is false, and ``liftOverStatus.errorMessage`` contains the reason liftover failed. + +.. code-block:: py + + import glow + chain_file = '/opt/liftover/b37ToHg38.over.chain' + reference_file = '/mnt/dbnucleus/dbgenomics/grch38/data/GRCh38_full_analysis_set_plus_decoy_hla.fa' + output_df = glow.transform('lift_over_variants', input_df, chain_file=chain_file, reference_file=reference_file) + + +.. 
notebook:: ../_static/notebooks/etl/lift-over.html + :title: Liftover notebook diff --git a/docs/source/api/sample-qc.rst b/docs/source/etl/sample-qc.rst similarity index 97% rename from docs/source/api/sample-qc.rst rename to docs/source/etl/sample-qc.rst index 70d8f7d7e..65a3c9e77 100644 --- a/docs/source/api/sample-qc.rst +++ b/docs/source/etl/sample-qc.rst @@ -40,4 +40,4 @@ Each of these functions returns a map from sample ID to a struct containing metr - ``genotypes`` array with a ``conditionalQuality`` field - A struct with ``min``, ``max``, ``mean``, and ``stddev`` -.. notebook:: ../_static/notebooks/sample-qc-demo.html +.. notebook:: ../_static/notebooks/etl/sample-qc-demo.html diff --git a/docs/source/api/glue-functions.rst b/docs/source/etl/utility-functions.rst similarity index 100% rename from docs/source/api/glue-functions.rst rename to docs/source/etl/utility-functions.rst diff --git a/docs/source/api/variant-data.rst b/docs/source/etl/variant-data.rst similarity index 91% rename from docs/source/api/variant-data.rst rename to docs/source/etl/variant-data.rst index 9729158c0..506c2ecaa 100644 --- a/docs/source/api/variant-data.rst +++ b/docs/source/etl/variant-data.rst @@ -2,7 +2,7 @@ Variant I/O with Spark SQL ========================== -Glow includes Spark SQL support for reading and writing variant data in parallel directly from S3. +Glow makes it possible to read and write variant data at scale using Spark SQL. .. tip:: @@ -19,7 +19,7 @@ the DataFrame API using Python, R, Scala, or SQL. .. code-block:: py - df = spark.read.format("com.databricks.vcf").load(path) + df = spark.read.format("vcf").load(path) The returned DataFrame has a schema that mirrors a single row of a VCF. Information that applies to an entire variant (SNV or indel), such as the contig name, start and end positions, and INFO attributes, @@ -37,8 +37,6 @@ You can control the behavior of the VCF reader with a few parameters. All parame +----------------------+---------+---------+---------------------------------------------------------------------------------------------------------------------------------------------------------+ | Parameter | Type | Default | Description | +======================+=========+=========+=========================================================================================================================================================+ -| asADAMVariantContext | boolean | false | If true, rows are emitted in the VariantContext schema from the `ADAM `_ project. | -+----------------------+---------+---------+---------------------------------------------------------------------------------------------------------------------------------------------------------+ | includeSampleIds | boolean | true | If true, each genotype includes the name of the sample ID it belongs to. Sample names increases the size of each row, both in memory and on storage. | +----------------------+---------+---------+---------------------------------------------------------------------------------------------------------------------------------------------------------+ | splitToBiallelic | boolean | false | If true, multiallelic variants are split into two or more biallelic variants. | @@ -53,7 +51,7 @@ You can save a DataFrame as a VCF file, which you can then read with other tools .. code-block:: py - df.write.format("com.databricks.bigvcf").save() + df.write.format("bigvcf").save() The file extension of the output path determines which, if any, compression codec should be used. 
For instance, writing to a path such as ``/genomics/my_vcf.vcf.bgz`` will cause the output file to be @@ -63,7 +61,7 @@ If you'd rather save a sharded VCF where each partition saves to a separate file .. code-block:: py - df.write.format("com.databricks.vcf").save(path) + df.write.format("vcf").save(path) To control the behavior of the sharded VCF writer, you can provide the following option: @@ -80,7 +78,7 @@ For both the single and sharded VCF writer, you can use the following option to | Parameter | Type | Default | Description | +=============+========+=========+====================================================================================================================+ | vcfHeader | string | infer | If ``infer``, infers the header from the DataFrame schema. This value can be a complete header | -| | | | starting with ``##`` or a Hadoop filesystem path (for example, ``dbfs://...``) to a VCF file. The header from | +| | | | starting with ``##`` or a Hadoop filesystem path to a VCF file. The header from | | | | | this file is used as the VCF header for each partition. | +-------------+--------+---------+--------------------------------------------------------------------------------------------------------------------+ @@ -92,7 +90,7 @@ Glow also provides the ability to read BGEN files, including those distributed b .. code-block:: py - df = spark.read.format("com.databricks.bgen").load(path) + df = spark.read.format("bgen").load(path) As with the VCF reader, the provided path can be a file, directory, or glob pattern. If ``.bgi`` index files are located in the same directory as the data files, the reader uses the indexes to @@ -113,7 +111,7 @@ You can use the ``DataFrameWriter`` API to save a single BGEN file, which you ca .. code-block:: py - df.write.format("com.databricks.bigbgen").save(path) + df.write.format("bigbgen").save(path) If the genotype arrays are missing ploidy and/or phasing information, the BGEN writer infers the values using the provided values for ploidy, phasing, or ``posteriorProbabilities`` in the genotype arrays. You can provide the value for ploidy @@ -134,4 +132,4 @@ To control the behavior of the BGEN writer, you can provide the following option | defaultInferredPhasing | boolean | false | The inferred phasing if phasing is missing and cannot be inferred from ``posteriorProbabilities``. | +------------------------+---------+---------+------------------------------------------------------------------------------------------------------------------------------------+ -.. notebook:: ../_static/notebooks/variant-data.html +.. notebook:: ../_static/notebooks/etl/variant-data.html diff --git a/docs/source/api/variant-qc.rst b/docs/source/etl/variant-qc.rst similarity index 97% rename from docs/source/api/variant-qc.rst rename to docs/source/etl/variant-qc.rst index 236593325..77a60e38e 100644 --- a/docs/source/api/variant-qc.rst +++ b/docs/source/etl/variant-qc.rst @@ -39,4 +39,4 @@ You can calculate quality control statistics on your variant data using Spark SQ - The ``genotypes`` array - A struct containing the min, max, mean, and sample standard deviation for genotype quality (GQ in VCF v4.2 specification) across all samples -.. notebook:: ../_static/notebooks/variant-qc-demo.html +.. 
notebook:: ../_static/notebooks/etl/variant-qc-demo.html diff --git a/docs/source/etl/vcf2delta.rst b/docs/source/etl/vcf2delta.rst new file mode 100644 index 000000000..5821e8ee0 --- /dev/null +++ b/docs/source/etl/vcf2delta.rst @@ -0,0 +1,17 @@ +============================ +Create a Genomics Delta Lake +============================ + +Genomics data is usually stored in specialized flat-file formats such as VCF or BGEN. + +The example below shows how to ingest a VCF into a genomics `Delta Lake table `_ using Glow in Python +(R, Scala, and SQL are also supported). + +You can use Delta tables for queries with second-scale latency, performant range joins (similar to the single-node +bioinformatics tool ``bedtools intersect``), aggregate analyses such as calculating summary statistics, and +machine learning or deep learning. + +.. tip:: We recommend ingesting VCF files into Delta tables once volumes reach >1,000 samples, >10 billion genotypes, or >1 terabyte of data. + +.. notebook:: ../_static/notebooks/etl/vcf2delta.html + :title: VCF to Delta Lake table notebook diff --git a/docs/source/index.rst b/docs/source/index.rst index f7c750e84..0d18947a9 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,4 +1,4 @@ -glow +Glow ==== Glow is an open-source genomic data analysis tool using `Apache Spark `__. @@ -6,9 +6,6 @@ Glow is an open-source genomic data analysis tool using `Apache Spark ` to run native Python code with +PySpark when working with genomic data. + +.. notebook:: ../_static/notebooks/tertiary/pandas-lmm.html + :title: pandas example notebook diff --git a/docs/source/api/pipe-transformer.rst b/docs/source/tertiary/pipe-transformer.rst similarity index 84% rename from docs/source/api/pipe-transformer.rst rename to docs/source/tertiary/pipe-transformer.rst index 9c7eb0269..0cbafcbaf 100644 --- a/docs/source/api/pipe-transformer.rst +++ b/docs/source/tertiary/pipe-transformer.rst @@ -2,7 +2,7 @@ Parallelizing Command-Line Tools With the Pipe Transformer ========================================================== -To help you use single-node tools on massive data sets, Glow includes a +To use single-node tools on massive data sets, Glow includes a utility called the Pipe Transformer to process Spark DataFrames with command-line programs. Python usage ============ @@ -17,11 +17,11 @@ Transformer to reverse each of the strings in the input DataFrame using the ``re .. code-block:: py - import db_genomics as dbg + import glow # Create a text-only DataFrame df = spark.createDataFrame([['foo'], ['bar'], ['baz']], ['value']) - display(dbg.transform('pipe', df, cmd='["rev"]', inputformatter='text', outputformatter='text')) + display(glow.transform('pipe', df, cmd='["rev"]', inputformatter='text', outputformatter='text')) The options in this example demonstrate how to control the basic behavior of the transformer: @@ -39,14 +39,20 @@ partition of the input DataFrame as VCF by choosing ``vcf`` as the input and out .. code-block:: py - import db_genomics as dbg + import glow - df = spark.read.format("com.databricks.vcf")\ + df = spark.read.format("vcf")\ .load("/databricks-datasets/genomics/1kg-vcfs")\ .limit(1000) - dbg.transform('pipe', df, cmd='["grep", "-v", "#INFO"]', - inputformatter='vcf', in_vcfheader='infer', outputformatter='vcf') + glow.transform( + 'pipe', + df, + cmd='["grep", "-v", "#INFO"]', + inputformatter='vcf', + in_vcfheader='infer', + outputformatter='vcf' + ) When you use the VCF input formatter, you must specify a method to determine the VCF header. 
The simplest option is ``infer``, which instructs the Pipe Transformer to derive a VCF header from the @@ -60,13 +66,14 @@ String]``. .. code-block:: scala - import com.databricks.hls.DBGenomics + import io.projectglow.Glow - DBGenomics.transform("pipe", df, Map( + Glow.transform("pipe", df, Map( "cmd" -> "[\"grep\", \"-v\", \"#INFO\"]" "inputFormatter" -> "vcf", "outputFormatter" -> "vcf", - "in_vcfHeader" -> "infer")) + "in_vcfHeader" -> "infer") + ) Options ======= @@ -109,7 +116,7 @@ VCF input formatter: * ``infer``: Derive a VCF header from the DataFrame schema. * The complete contents of a VCF header starting with ``##`` - * A Hadoop filesystem path (for example, ``dbfs://...``) to a VCF file. The header from this file is used as the VCF header for each partition. + * A Hadoop filesystem path to a VCF file. The header from this file is used as the VCF header for each partition. The CSV input and output formatters accept most of the same options as the CSV data source. You must prefix options to the input formatter with ``in_``, and options to the output formatter with ``out_``. For example, ``in_quote`` sets the quote character when writing the input DataFrame to the piped program. @@ -119,4 +126,4 @@ The following options are not supported: - ``path`` options are ignored - The ``parserLib`` option is ignored. ``univocity`` is always used as the CSV parsing library. -.. notebook:: ../_static/notebooks/pipe-transformer.html +.. notebook:: ../_static/notebooks/tertiary/pipe-transformer.html diff --git a/docs/source/tertiary/regression-tests.rst b/docs/source/tertiary/regression-tests.rst new file mode 100644 index 000000000..d7fb18874 --- /dev/null +++ b/docs/source/tertiary/regression-tests.rst @@ -0,0 +1,109 @@ +============================================== +Genome-wide Association Study Regression Tests +============================================== + +Glow contains functions for performing simple regression analyses used in +genome-wide association studies (GWAS). + +Linear regression +================= + +``linear_regression_gwas`` performs a linear regression association test optimized for performance +in a GWAS setting. + +Example +------- + +.. code-block:: py + + from pyspark.ml.linalg import DenseMatrix + import pyspark.sql.functions as fx + import numpy as np + + # Read in VCF file + df = spark.read.format('vcf')\ + .option("splitToBiallelic", True)\ + .load('/databricks-datasets/genomics/1kg-vcfs')\ + .limit(10).cache() + + # Generate random phenotypes and an intercept-only covariate matrix + n_samples = df.select(fx.size('genotypes')).first()[0] + covariates = DenseMatrix(n_samples, 1, np.ones(n_samples)) + phenotypes = np.random.random(n_samples).tolist() + covariates_and_phenotypes = spark.createDataFrame([[covariates, phenotypes]], + ['covariates', 'phenotypes']) + + # Run linear regression test + display(df.crossJoin(covariates_and_phenotypes).selectExpr( + 'contigName', + 'start', + 'names', + # genotype_states returns the number of alt alleles for each sample + 'expand_struct(linear_regression_gwas(genotype_states(genotypes), phenotypes, covariates))')) + + +Parameters +---------- + +.. list-table:: + :header-rows: 1 + + * - Name + - Type + - Details + * - ``genotypes`` + - ``array`` (or numeric type that can be cast to ``double``) + - A numeric representation of the genotype for each sample at a given site, for example the + result of the ``genotype_states`` function. This parameter can vary for each row in the dataset. 
+ * - ``covariates`` + - ``spark.ml`` ``Matrix`` + - A matrix containing the covariates to use in the linear regression model. Each row in the + matrix represents observations for a sample. The indexing must match that of the ``genotypes`` + array; that is, the 0th row in the covariate matrix should correspond to the same sample as the + 0th element in the ``genotypes`` array. This matrix must be constant for each row in the + dataset. If you want an intercept term, you must explicitly include an intercept covariate in this matrix. + * - ``phenotypes`` + - ``array`` (or numeric type that can be cast to ``double``) + - A numeric representation of the phenotype for each sample. This parameter may vary for each + row in the dataset. The indexing of this array must match the ``genotypes`` and + ``covariates`` parameters. + +Return +------ + +The function returns a struct with the following fields. The computation of each value matches the +`lm R package `_. + +.. list-table:: + :header-rows: 1 + + * - Name + - Type + - Details + * - ``beta`` + - ``double`` + - The fit effect coefficient of the ``genotypes`` parameter. + * - ``standardError`` + - ``double`` + - The standard error of ``beta``. + * - ``pValue`` + - ``double`` + - The P-value of the t-statistic for ``beta``. + +Implementation details +---------------------- + +The linear regression model is fit using the QR decomposition. For performance, the QR decomposition +of the covariate matrix is computed once and reused for each (``genotypes``, ``phenotypes``) pair. + + +Example notebook and blog post +------------------------------ + + +A detailed example and explanation of a GWAS workflow is available `here `_. + + + +.. notebook:: ../_static/notebooks/tertiary/gwas.html + :title: GWAS notebook diff --git a/docs/source/tertiary/saige.rst b/docs/source/tertiary/saige.rst new file mode 100644 index 000000000..dba24288e --- /dev/null +++ b/docs/source/tertiary/saige.rst @@ -0,0 +1,205 @@ +==================== +SAIGE and SAIGE-GENE +==================== + +`SAIGE`_ is a command-line tool written in R for performing genetic association tests on biobank-scale data. +Glow makes it possible to run the `single-variant association test method SAIGE`_ and the `region-based association test method SAIGE-GENE`_ using its :ref:`Pipe Transformer `. + + +.. _`SAIGE`: https://github.com/weizhouUMICH/SAIGE +.. _`single-variant association test method SAIGE`: https://www.nature.com/articles/s41588-018-0184-y +.. _`region-based association test method SAIGE-GENE`: https://www.biorxiv.org/content/10.1101/583278v1 + + +.. _saige-init-script: + +Create a SAIGE cluster +====================== + +Install `HTSLib`_ as well as the `SAIGE package`_, its dependencies, and associated input files on every node in the +cluster using an :ref:`init script `. +We recommend creating a mount point to store and download these files from cloud storage. +Replace the values in the script with your mount point and local directory. + +.. 
code-block:: sh + + #!/usr/bin/env bash + + set -ex + set -o pipefail + + # Install SKAT for SAIGE-GENE + sudo apt-get -y install libboost-all-dev autoconf + Rscript -e 'install.packages("SKAT", repos="https://cran.us.r-project.org")' + + # Install SAIGE + cd /opt + git clone https://github.com/weizhouUMICH/SAIGE + cd SAIGE + Rscript extdata/install_packages.R + R CMD INSTALL *.tar.gz + + # Install HTSLIB + cd /opt + git clone https://github.com/samtools/htslib + cd htslib + autoheader + autoconf + ./configure + make + make install + + # Sync SAIGE input files + aws=/databricks/python2/bin/aws + source /databricks/init/setup_mount_point_creds.sh + $aws s3 sync "$MOUNT_ROOT/" "" + +.. _`HTSlib`: https://github.com/samtools/htslib +.. _`SAIGE package`: https://github.com/weizhouUMICH/SAIGE + +Run association tests +===================== + +To parallelize the association test, use Glow's :ref:`Pipe Transformer `. + +- Command: A JSON array containing a :ref:`SAIGE ` or :ref:`SAIGE-GENE ` script +- Input formatter: VCF +- Output formatter: CSV + + - Header flag: true + - Delimiter: space + +.. code-block:: py + + import glow + import json + + cmd = json.dumps(["bash", "-c", saige_script]) + output_df = glow.transform( + "pipe", input_df, + cmd=cmd, + input_formatter='vcf', + in_vcf_header=input_vcf, + output_formatter='csv', + out_header='true', + out_delimiter=' ' + ) + +You can read the input DataFrame directly from a :ref:`VCF ` or from a Delta table :ref:`with VCF rows `. + +Single-variant association test (SAIGE) +--------------------------------------- + +The following cell shows an example SAIGE script to be piped. +The options are explained in the `single-variant association test docs`_. + +.. _saige-cmd: + +.. code-block:: sh + + #!/bin/sh + set -e + + export tmpdir=$(mktemp -d -t vcf.XXXXXX) + cat - > ${tmpdir}/input.vcf + bgzip -c ${tmpdir}/input.vcf > ${tmpdir}/input.vcf.gz + tabix -p vcf ${tmpdir}/input.vcf.gz + + Rscript /opt/SAIGE/extdata/step2_SPAtests.R \ + --vcfFile=${tmpdir}/input.vcf.gz \ + --vcfFileIndex=${tmpdir}/input.vcf.gz.tbi \ + --SAIGEOutputFile=${tmpdir}/output.txt \ + --sampleFile=/input/sampleIds.txt \ + --GMMATmodelFile=/step-1/output.rda \ + --varianceRatioFile=/step-1/output.varianceRatio.txt \ + --vcfField=GT \ + --chrom=22 \ + --minMAF=0.0001 \ + --minMAC=1 \ + --numLinesOutput=2 \ + >&2 + + cat ${tmpdir}/output.txt + rm -rf ${tmpdir} + +.. _`single-variant association test docs`: https://github.com/weizhouUMICH/SAIGE/wiki/Genetic-association-tests-using-SAIGE#step-2-performing-the-single-variant-association-tests + +Region-based association test (SAIGE-GENE) +------------------------------------------ + +Before piping, partition the input DataFrame by region or gene, with each partition containing the sorted, distinct, and +relevant set of VCF rows. The regions and corresponding variants should match those in the group file referenced in the +SAIGE-GENE script. Regions can be derived from any source, such as gene annotations from the +:ref:`SnpEff pipeline `. + +.. code-block:: py + + input_df = genotype_df.join(region_df, ["contigName", "start", "end"]) \ + .repartition("geneName") \ + .select("contigName", "start", "end", "referenceAllele", "alternateAlleles", "genotypes") \ + .sortWithinPartitions("contigName", "start") + +The following cell shows an example SAIGE-GENE script to be piped. +The options are explained in the `region-based association test docs`_. + +.. _saige-gene-cmd: + +.. 
code-block:: sh + + #!/bin/sh + set -e + + export tmpdir=$(mktemp -d -t vcf.XXXXXX) + uniq -w 100 - > ${tmpdir}/input.vcf + bgzip -c ${tmpdir}/input.vcf > ${tmpdir}/input.vcf.gz + tabix -p vcf ${tmpdir}/input.vcf.gz + + Rscript /opt/SAIGE/extdata/step2_SPAtests.R \ + --vcfFile=${tmpdir}/input.vcf.gz \ + --vcfFileIndex=${tmpdir}/input.vcf.gz.tbi \ + --SAIGEOutputFile=${tmpdir}/output.txt \ + --groupFile=${tmpdir}/groupFile.txt \ + --sampleFile=/input/sampleIds.txt \ + --GMMATmodelFile=/step-1/output.rda \ + --varianceRatioFile=/step-1/output.varianceRatio.txt \ + --sparseSigmaFile=/step-1/output.varianceRatio.txt_relatednessCutoff_0.125_2000_randomMarkersUsed.sparseSigma.mtx \ + --vcfField=GT \ + --chrom=22 \ + --minMAF=0 \ + --minMAC=0.5 \ + --maxMAFforGroupTest=0.5 \ + --numLinesOutput=2 \ + >&2 + + cat ${tmpdir}/output.txt + rm -rf ${tmpdir} + +After piping, disambiguate the association results for each gene in the output DataFrame. + +.. code-block:: python + + from pyspark.sql import Window + from pyspark.sql.functions import max, size, split + + window_spec = Window.partitionBy("Gene") + assoc_df = output_df.withColumn("numMarkerIDs", size(split("markerIDs", ";"))) \ + .withColumn("maxNumMarkerIDs", max("numMarkerIDs").over(window_spec)) \ + .filter("numMarkerIDs = maxNumMarkerIDs") \ + .drop("numMarkerIDs", "maxNumMarkerIDs") \ + .drop_duplicates(["Gene"]) + +.. _`region-based association test docs`: https://github.com/weizhouUMICH/SAIGE/wiki/Genetic-association-tests-using-SAIGE#step-2-performing-the-region--or-gene-based-association-tests + +Example notebooks +================= + +.. notebook:: genomics/null-fit.html + :title: Null logistic mixed model fit notebook + +.. notebook:: ../_static/notebooks/tertiary/saige.html + :title: SAIGE notebook + +.. notebook:: ../_static/notebooks/tertiary/saige-gene.html + :title: SAIGE-GENE notebook + +.. include:: /shared/products.rst diff --git a/docs/source/tertiary/variant-normalization.rst b/docs/source/tertiary/variant-normalization.rst new file mode 100644 index 000000000..4a3ea968a --- /dev/null +++ b/docs/source/tertiary/variant-normalization.rst @@ -0,0 +1,59 @@ +=============================================== +Variant Normalization (with Optional Splitting) +=============================================== +Different genomic analysis tools often represent the same genomic variant in different ways, making it non-trivial to integrate and compare variants across call sets. Therefore, **variant normalization** is an essential step before further downstream analysis to make sure the same variant is represented identically in different call sets. Normalization is achieved by making sure the variant is parsimonious and left aligned (see `Variant Normalization `_ for more details). + +Glow provides the ``normalize_variants`` transformer, which can be applied to a variant DataFrame to normalize its variants at scale. When applied to an input DataFrame of variants (e.g., generated by loading VCF or BGEN files), this transformer generates a DataFrame containing normalized variants. + +**Splitting multiallelic variants into biallelic variants** is another transformation sometimes required before further downstream analysis. Using the ``mode`` option, the ``normalize_variants`` transformer can also be optionally configured to split multiallelic variants into biallelic variants before normalizing them, or to just split the multiallelic variants without normalizing them. + +.. 
note:: + + * The variant normalization algorithm used by the ``normalize_variants`` transformer follows the same logic as the one used in well-known normalization tools such as `bcftools norm `_ or `vt normalize `_. This normalization logic is different from the one used by GATK's `LeftAlignAndTrimVariants `_, which sometimes yields incorrect normalization (see `Variant Normalization `_ for more details). + * The splitting logic is the same as the one used in GATK's `LeftAlignAndTrimVariants `_ tool with the ``--splitMultiallelics`` option. In splitting a multiallelic variant, this transformer recalculates the GT blocks for the resulting biallelic variants if possible, and drops all VCF INFO fields, except for AC, AN, and AF, which are recalculated based on the newly calculated GT blocks, if any, and otherwise dropped. + +Python usage +============ +Assuming ``df_original`` is a variable of type DataFrame that contains the genomic variant records, and ``ref_genome_path`` is a variable of type String containing the path to the reference genome file, a minimal example of using this transformer for normalization in Python is: + +.. code-block:: py + + import glow + df_normalized = glow.transform("normalize_variants", df_original, reference_genome_path=ref_genome_path) + +Scala usage +=========== +The same can be done in Scala as follows: + +.. code-block:: scala + + import io.projectglow.Glow + val df_normalized = Glow.transform("normalize_variants", df_original, Map("reference_genome_path" -> ref_genome_path)) + +Options +======= +The ``normalize_variants`` transformer has the following options: + +.. list-table:: + :header-rows: 1 + + * - Option + - Type + - Possible values and description + * - ``mode`` + - String + - | ``normalize``: Only normalize the variants (the default if the option is not specified) + | ``split_and_normalize``: Split multiallelic variants into biallelic variants and normalize them + | ``split``: Only split the multiallelic variants into biallelic variants without normalizing them + * - ``referenceGenomePath`` + - String + - Path to the reference genome ``.fasta`` or ``.fa`` file (required for normalization) + + **Notes**: + + * ``.fai`` and ``.dict`` files with the same name must be present in the same folder. + * This option is not required for the ``split`` mode as the reference genome is only used for normalization. + + +.. notebook:: ../_static/notebooks/tertiary/normalizevariants-transformer.html + :title: Variant normalization notebook
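+
+As an illustration of the ``mode`` option, the following minimal sketch (reusing the ``df_original`` and
+``ref_genome_path`` variables from the Python usage example above) splits multiallelic variants into biallelic
+variants and normalizes them in a single pass, and also shows the ``split`` mode, which does not require a
+reference genome:
+
+.. code-block:: py
+
+    import glow
+
+    # Split multiallelic variants into biallelic variants, then normalize them
+    df_split_normalized = glow.transform(
+        "normalize_variants",
+        df_original,
+        mode="split_and_normalize",
+        reference_genome_path=ref_genome_path
+    )
+
+    # Split only; no reference genome is needed because no normalization is performed
+    df_split_only = glow.transform("normalize_variants", df_original, mode="split")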