diff --git a/docs/source/_static/images/wgr_runtime.png b/docs/source/_static/images/wgr_runtime.png new file mode 100644 index 000000000..21b4cfee4 Binary files /dev/null and b/docs/source/_static/images/wgr_runtime.png differ diff --git a/docs/source/_static/notebooks/tertiary/glowgr.html b/docs/source/_static/notebooks/tertiary/glowgr.html new file mode 100644 index 000000000..2f75b89dd --- /dev/null +++ b/docs/source/_static/notebooks/tertiary/glowgr.html @@ -0,0 +1,42 @@ + + + + +GloWGR - Databricks + + + + + + + + + + + + + + + + + + + diff --git a/docs/source/etl/vcf2delta.rst b/docs/source/etl/vcf2delta.rst index 3db8d592f..7e69e5d63 100644 --- a/docs/source/etl/vcf2delta.rst +++ b/docs/source/etl/vcf2delta.rst @@ -1,3 +1,5 @@ +.. _vcf2delta: + ============================ Create a Genomics Delta Lake ============================ diff --git a/docs/source/tertiary/index.rst b/docs/source/tertiary/index.rst index 04be9657f..adecbdc12 100644 --- a/docs/source/tertiary/index.rst +++ b/docs/source/tertiary/index.rst @@ -13,3 +13,4 @@ Perform population-scale statistical analyses of genetic variants. 
pipe-transformer pandas-udf regression-tests + whole-genome-regression diff --git a/docs/source/tertiary/regression-tests.rst b/docs/source/tertiary/regression-tests.rst index c6bcfda79..edba86802 100644 --- a/docs/source/tertiary/regression-tests.rst +++ b/docs/source/tertiary/regression-tests.rst @@ -81,7 +81,7 @@ Example standardError=0.1783963733160434, pValue=0.44349953631952943 ) - assert_rows_equal(lin_reg_df.head(), expected_lin_reg_row) + assert_rows_equal(lin_reg_df.filter('contigName = 22 and start = 16050114').head(), expected_lin_reg_row) Parameters ---------- @@ -191,7 +191,7 @@ Example waldConfidenceInterval=[0.7813704896767115, 3.247273366082802], pValue=0.19572327843236637 ) - assert_rows_equal(lrt_log_reg_df.head(), expected_lrt_log_reg_row) + assert_rows_equal(lrt_log_reg_df.filter('contigName = 22 and start = 16050114').head(), expected_lrt_log_reg_row) expected_firth_log_reg_row = Row( contigName='22', @@ -202,7 +202,7 @@ Example waldConfidenceInterval=[0.7719062301156017, 3.2026291934794795], pValue=0.20086839802280376 ) - assert_rows_equal(firth_log_reg_df.head(), expected_firth_log_reg_row) + assert_rows_equal(firth_log_reg_df.filter('contigName = 22 and start = 16050114').head(), expected_firth_log_reg_row) Parameters ---------- diff --git a/docs/source/tertiary/whole-genome-regression.rst b/docs/source/tertiary/whole-genome-regression.rst new file mode 100644 index 000000000..5970af5ff --- /dev/null +++ b/docs/source/tertiary/whole-genome-regression.rst @@ -0,0 +1,388 @@ +=============================== +GloWGR: Whole-Genome Regression +=============================== + +.. 
invisible-code-block: python + + import glow + glow.register(spark) + + genotypes_vcf = 'test-data/gwas/genotypes.vcf.gz' + covariates_csv = 'test-data/gwas/covariates.csv.gz' + continuous_phenotypes_csv = 'test-data/gwas/continuous-phenotypes.csv.gz' + +Glow supports Whole Genome Regression (WGR) as GloWGR, a parallelized version of the +`regenie `_ method (see the +`preprint `_). + +.. image:: ../_static/images/wgr_runtime.png + :scale: 50 % + +GloWGR consists of the following stages. + +- Block the genotype matrix across samples and variants. +- Perform dimensionality reduction with ridge regression. +- Estimate phenotypic values with ridge regression. + +.. note:: + + GloWGR currently supports only quantitative phenotypes. + +---------------- +Data preparation +---------------- + +GloWGR accepts three input datasources. + +Genotype data +============= + +The genotype data may be read from any variant datasource supported by Glow, such as one read from +:ref:`VCF, BGEN or PLINK `. For scalability, we recommend ingesting flat genotype files into +:ref:`Delta tables `. + +The DataFrame must also include a column ``values`` containing a numeric representation of each genotype. The genotypic +values must not be missing, or equal for every sample in a variant (e.g. all samples are homozygous reference). + +Example +------- + +When loading in the variants, perform the following transformations: + +- Split multiallelic variants with the ``split_multiallelics`` transformer. +- Calculate the number of alternate alleles for biallelic variants with ``glow.genotype_states``. +- Replace any missing values with the mean of the non-missing values using ``glow.mean_substitute``. +- Filter out variants whose values are equal across all samples. + +..
code-block:: python + + from pyspark.sql.functions import col, lit + + variants = spark.read.format('vcf').load(genotypes_vcf) + genotypes = glow.transform('split_multiallelics', variants) \ + .withColumn('values', glow.mean_substitute(glow.genotype_states(col('genotypes')))) \ + .filter('size(array_distinct(values)) > 1') \ + +Phenotype data +============== + +The phenotype data is represented as a Pandas DataFrame indexed by the sample ID. Each column represents a single +phenotype. It is assumed that there are no missing phenotype values, and that the phenotypes are mean centered at 0. + +Example +------- + +.. code-block:: python + + import pandas as pd + + label_df = pd.read_csv(continuous_phenotypes_csv, index_col='sample_id') \ + .apply(lambda x: x-x.mean())[['Continuous_Trait_1', 'Continuous_Trait_2']] + +Covariate data +============== + +The covariate data is represented as a Pandas DataFrame indexed by the sample ID. + +Example +------- + +.. code-block:: python + + covariates = pd.read_csv(covariates_csv, index_col='sample_id') + covariates['intercept'] = 1. + +------------------------ +Genotype matrix blocking +------------------------ + +``glow.wgr.functions.block_variants_and_samples`` creates two objects: a block genotype matrix and a sample block +mapping. + +Parameters +========== + +- ``genotypes``: Genotype DataFrame created by reading from any variant datasource supported by Glow, such as VCF. Must + also include a column ``values`` containing a numeric representation of each genotype. +- ``sample_ids``: List of sample IDs. Can be created by applying ``glow.wgr.functions.get_sample_ids`` to a genotype + DataFrame. +- ``variants_per_block``: Number of variants to include per block. We recommend 1000. +- ``sample_block_count``: Number of sample blocks to create. We recommend 10. + +Return +====== + +The function returns a block genotype matrix and a sample block mapping. 
+ +Block genotype matrix +--------------------- + +If we imagine the block genotype matrix conceptually, we think of an *NxM* matrix *X* where each row *n* represents an +individual sample, each column *m* represents a variant, and each cell *(n, m)* contains a genotype value for sample *n* +at variant *m*. We then imagine laying a coarse grid on top of this matrix such that matrix cells within the same +coarse grid cell are all assigned to the same block *x*. Each block *x* is indexed by a sample block ID (corresponding +to a list of rows belonging to the block) and a header block ID (corresponding to a list of columns belonging to the +block). The sample block IDs are generally just integers 0 through the number of sample blocks. The header block IDs +are strings of the form 'chr_C_block_B', which refers to the Bth block on chromosome C. The Spark DataFrame +representing this block matrix can be thought of as the transpose of each block *xT* all stacked one atop another. Each +row represents the values from a particular column from *X*, for the samples corresponding to a particular sample block. +The fields in the DataFrame are: + +- ``header``: A column name in the conceptual matrix *X*. +- ``size``: The number of individuals in the sample block for the row. +- ``values``: Genotype values for this header in this sample block. If the matrix is sparse, contains only non-zero values. +- ``header_block``: An ID assigned to the block *x* containing this header. +- ``sample_block``: An ID assigned to the block *x* containing the group of samples represented on this row. +- ``position``: An integer assigned to this header that specifies the correct sort order for the headers in this block. +- ``mu``: The mean of the genotype calls for this header. +- ``sig``: The standard deviation of the genotype calls for this header. 
 + +Sample block mapping +-------------------- + +The sample block mapping consists of key-value pairs, where each key is a sample block ID and each value is a list of +sample IDs contained in that sample block. + +The order of these IDs matches the order of the ``values`` arrays in the block genotype DataFrame. + +Example +======= + +.. code-block:: python + + from glow.wgr.linear_model import RidgeReducer, RidgeRegression + from glow.wgr.functions import block_variants_and_samples, get_sample_ids + from pyspark.sql.functions import col, lit + + variants_per_block = 1000 + sample_block_count = 10 + sample_ids = get_sample_ids(genotypes) + block_df, sample_blocks = block_variants_and_samples( + genotypes, sample_ids, variants_per_block, sample_block_count) + +------------------------ +Dimensionality reduction +------------------------ + +The first step in the fitting procedure is to apply a dimensionality reduction to the block matrix *X* using the +``RidgeReducer``. + +This is accomplished by fitting multiple ridge models within each block *x* and producing a new block matrix where each +column represents the prediction of one ridge model applied within one block. This approach to model building is +generally referred to as **stacking**. We will call the block genotype matrix we started with the **level 0** matrix in +the stack *X0*, and the output of the ridge reduction step the **level 1** matrix *X1*. The ``RidgeReducer`` class is +used for this step, which is initialized with a list of ridge regularization values (referred to here as alpha). Since +ridge models are indexed by these alpha values, the ``RidgeReducer`` will generate one ridge model per value of alpha +provided, which in turn will produce one column per block in *X0*, so the final dimensions of matrix *X1* will be +*Nx(LxK)*, where *L* is the number of header blocks in *X0* and *K* is the number of alpha values provided to the +``RidgeReducer``.
In practice, we can estimate a span of alpha values in a reasonable order of magnitude based on +guesses at the heritability of the phenotype we are fitting. + +Initialization +============== + +When the ``RidgeReducer`` is initialized, it will assign names to the provided alphas and store them in a dictionary +accessible as ``RidgeReducer.alphas``. + +Example +------- + +If alpha values are not provided, they will be generated during ``RidgeReducer.fit`` based on the unique number of +headers *h* in the blocked genotype matrix *X0*, and a set of heritability values. These are only sensible if the +phenotypes are on the scale of one. + +.. math:: + + \vec{\alpha} = h / [0.01, 0.25, 0.50, 0.75, 0.99] + +.. code-block:: python + + reducer = RidgeReducer() + +Model fitting +============= + +In explicit terms, the reduction of a block *x0* from *X0* to the corresponding block *x1* from *X1* is accomplished by +the matrix multiplication *x0 * B = x1*, where *B* is a coefficient matrix of size *mxK*, where *m* is the number of +columns in block *x0* and *K* is the number of alpha values used in the reduction. As an added wrinkle, if the ridge +reduction is being performed against multiple phenotypes at once, each phenotype will have its own *B*, and for +convenience we panel these next to each other in the output into a single matrix, so *B* in that case has dimensions +*mx(K*P)* where *P* is the number of phenotypes. Each matrix *B* is specific to a particular block in *X0*, so the +Spark DataFrame produced by the ``RidgeReducer`` can be thought of as all of the matrices *B* from all of the blocks +stacked one atop another. + +Parameters +---------- + +- ``block_df``: Spark DataFrame representing the beginning block matrix. +- ``label_df``: Pandas DataFrame containing the target labels used in fitting the ridge models. +- ``sample_blocks``: Dictionary containing a mapping of sample block IDs to a list of corresponding sample IDs.
 + +- ``covariates``: Pandas DataFrame containing covariates to be included in every model in the stacking + ensemble (optional). + +Return +------ + +The fields in the model DataFrame are: + +- ``header_block``: An ID assigned to the header block *x0* corresponding to the coefficients in this row. +- ``sample_block``: An ID assigned to the sample block *x0* corresponding to the coefficients in this row. +- ``header``: The name of a column from the conceptual matrix *X0* that corresponds with a particular row from the + coefficient matrix *B*. +- ``alphas``: List of alpha names corresponding to the columns of *B*. +- ``labels``: List of labels (i.e., phenotypes) corresponding to the columns of *B*. +- ``coefficients``: List of the actual values from a row in *B*. + +Model transformation +==================== + +After fitting, the ``RidgeReducer.transform`` method can be used to generate *X1* from *X0*. + +Parameters +---------- + +- ``block_df``: Spark DataFrame representing the beginning block matrix. +- ``label_df``: Pandas DataFrame containing the target labels used in fitting the ridge models. +- ``sample_blocks``: Dictionary containing a mapping of sample block IDs to a list of corresponding sample IDs. +- ``model_df``: Spark DataFrame produced by the RidgeReducer fit method, representing the reducer model. +- ``covariates``: Pandas DataFrame containing covariates to be included in every model in the stacking + ensemble (optional). + +Return +------ + +The output of the transformation is closely analogous to the block matrix DataFrame we started with. The main +difference is that, rather than representing a single block matrix, it really represents multiple block matrices, with +one such matrix per label (phenotype).
Comparing the schema of this block matrix DataFrame (``reduced_block_df``) with +the DataFrame we started with (``block_df``), the new columns are: + +- ``alpha``: This is the name of the alpha value used in fitting the model that produced the values in this row. +- ``label``: This is the label corresponding to the values in this row. Since the genotype block matrix *X0* is + phenotype-agnostic, the rows in ``block_df`` were not restricted to any label/phenotype, but the level 1 block + matrix *X1* represents ridge model predictions for the labels the reducer was fit with, so each row is associated with + a specific label. + +The headers in the *X1* block matrix are derived from a combination of the source block in *X0*, the alpha value used in +fitting the ridge model, and the label they were fit with. These headers are assigned to header blocks that correspond +to the chromosome of the source block in *X0*. + +Example +======= + +Use the ``fit_transform`` function if the block genotype matrix, phenotype DataFrame, sample block mapping, and +covariates are constant for both the model fitting and transformation. + +.. code-block:: python + + reduced_block_df = reducer.fit_transform(block_df, label_df, sample_blocks, covariates) + +-------------------------- +Estimate phenotypic values +-------------------------- + +The block matrix *X1* can be used to fit a final predictive model that can generate phenotype predictions *y_hat* using +the ``RidgeRegression`` class. + +Initialization +============== + +As with the ``RidgeReducer`` class, this class is initialized with a list of alpha values. + +Example +------- + +If alpha values are not provided, they will be generated during ``RidgeRegression.fit`` based on the unique number of +headers *h* in the blocked genotype matrix *X1*, and a set of heritability values. These are only sensible if the +phenotypes are on the scale of one. + +.. math:: + + \vec{\alpha} = h / [0.01, 0.25, 0.50, 0.75, 0.99] + +.. 
code-block:: python + + regression = RidgeRegression() + +Model fitting +============= + +This works much in the same way as the ridge reducer fitting, except that it returns two DataFrames. + +Parameters +---------- + +- ``block_df``: Spark DataFrame representing the beginning block matrix. +- ``label_df``: Pandas DataFrame containing the target labels used in fitting the ridge models. +- ``sample_blocks``: Dictionary containing a mapping of sample block IDs to a list of corresponding sample IDs. +- ``covariates``: Pandas DataFrame containing covariates to be included in every model in the stacking + ensemble (optional). + +Return +------ + +The first output is a model DataFrame analogous to the model DataFrame provided by the ``RidgeReducer``. An important +difference is that the header block ID for all rows will be 'all', indicating that all headers from all blocks have been +used in a single fit, rather than fitting within blocks. + +The second output is a cross validation report DataFrame, which reports the results of the hyperparameter (i.e., alpha) +value optimization routine. + +- ``label``: This is the label corresponding to the cross validation results on the row. +- ``alpha``: The name of the optimal alpha value. +- ``r2_mean``: The mean out-of-fold r2 score for the optimal alpha value. + +Model transformation +==================== + +After fitting the ``RidgeRegression`` model, the model DataFrame and cross validation DataFrame are used to apply the +model to the block matrix DataFrame to produce predictions (*y_hat*) for each label and sample using the +``RidgeRegression.transform`` method. + +Parameters +---------- + +- ``block_df``: Spark DataFrame representing the beginning block matrix. +- ``label_df``: Pandas DataFrame containing the target labels used in fitting the ridge models. +- ``sample_blocks``: Dictionary containing a mapping of sample block IDs to a list of corresponding sample IDs.
 + +- ``model_df``: Spark DataFrame produced by the ``RidgeRegression.fit`` method, representing the regression model. +- ``cv_df``: Spark DataFrame produced by the ``RidgeRegression.fit`` method, containing the results of the cross + validation routine. +- ``covariates``: Pandas DataFrame containing covariates to be included in every model in the stacking + ensemble (optional). + +Return +------ + +The resulting *y_hat* Pandas DataFrame is shaped like ``label_df``, indexed by the sample ID with each column +representing a single phenotype. + +Example +======= + +We can produce the leave one chromosome out (LOCO) version of the *y_hat* values by filtering out rows that correspond +to the chromosome we wish to drop before applying the transformation. + +.. code-block:: python + + model_df, cv_df = regression.fit(reduced_block_df, label_df, sample_blocks, covariates) + all_contigs = [r.header_block for r in reduced_block_df.select('header_block').distinct().collect()] + all_y_hat_df = pd.DataFrame() + + for contig in all_contigs: + loco_reduced_block_df = reduced_block_df.filter(col('header_block') != lit(contig)) + loco_model_df = model_df.filter(~col('header').startswith(contig)) + loco_y_hat_df = regression.transform(loco_reduced_block_df, label_df, sample_blocks, loco_model_df, cv_df, covariates) + loco_y_hat_df['contigName'] = contig.split('_')[1] + all_y_hat_df = all_y_hat_df.append(loco_y_hat_df) + y_hat_df = all_y_hat_df.reset_index().set_index(['contigName', 'sample_id']) + +.. invisible-code-block: python + + import math + assert math.isclose(y_hat_df.at[('22', 'HG00096'),'Continuous_Trait_1'], -0.5203890988445584) + +Example notebook +---------------- + +.. notebook:: ..
tertiary/glowgr.html + :title: GloWGR notebook diff --git a/test-data/gwas/README b/test-data/gwas/README index 1f8e70202..6cb56dc18 100644 --- a/test-data/gwas/README +++ b/test-data/gwas/README @@ -1,4 +1,4 @@ -The genotypes are sampled from the Thousand Genomes Project Phase 3 release chr22 VCF +The genotypes are sampled from the Thousand Genomes Project Phase 3 release chr21 and chr22 VCFs (ALL.chr22.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf). The covariates and continuous phenotypes are simulated with PhenotypeSimulator diff --git a/test-data/gwas/binary-phenotypes.csv.gz b/test-data/gwas/binary-phenotypes.csv.gz index e97808c71..7507ebe8a 100644 Binary files a/test-data/gwas/binary-phenotypes.csv.gz and b/test-data/gwas/binary-phenotypes.csv.gz differ diff --git a/test-data/gwas/continuous-phenotypes.csv.gz b/test-data/gwas/continuous-phenotypes.csv.gz index 68957d602..4af40cebc 100644 Binary files a/test-data/gwas/continuous-phenotypes.csv.gz and b/test-data/gwas/continuous-phenotypes.csv.gz differ diff --git a/test-data/gwas/covariates.csv.gz b/test-data/gwas/covariates.csv.gz index 35ddca653..f8bdcc798 100644 Binary files a/test-data/gwas/covariates.csv.gz and b/test-data/gwas/covariates.csv.gz differ diff --git a/test-data/gwas/genotypes.vcf.gz b/test-data/gwas/genotypes.vcf.gz index 3e19489ad..1926a360d 100644 Binary files a/test-data/gwas/genotypes.vcf.gz and b/test-data/gwas/genotypes.vcf.gz differ