From 18cde9da5ec42baba81358419f8f4c473324e832 Mon Sep 17 00:00:00 2001 From: Alex Taylor-Barreto Date: Thu, 30 Jun 2022 15:50:18 -0500 Subject: [PATCH 01/27] Temporarily stop Hail tests Increase test timeout for docs tests Signed-off-by: Alex Taylor-Barreto --- .circleci/config.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index d58335493..196e11696 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2,7 +2,7 @@ setup_xl_base: &setup_xl_base working_directory: ~/glow docker: - image: cimg/openjdk:8.0.322 - resource_class: xlarge + resource_class: 2xlarge setup_m_base: &setup_m_base working_directory: ~/glow @@ -116,7 +116,7 @@ jobs: sbt core/test exit - run: name: Run docs tests - no_output_timeout: 30m + no_output_timeout: 120m environment: command: | export PATH=$HOME/conda/envs/glow/bin:$PATH @@ -133,7 +133,7 @@ jobs: export HAIL_VERSION="0.2.89" sudo apt-get update sudo apt-get -y install rsync - sbt installHail hail/test uninstallHail exit + sbt installHail uninstallHail exit - run: name: Run Python tests no_output_timeout: 90m From b875a633e6e85c2481054a1ec5e01c1428e0a8e7 Mon Sep 17 00:00:00 2001 From: Alex Taylor-Barreto Date: Thu, 30 Jun 2022 16:05:39 -0500 Subject: [PATCH 02/27] Fix yet another broken link in the docs (Repos sync) Signed-off-by: Alex Taylor-Barreto --- docs/source/getting-started.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/getting-started.rst b/docs/source/getting-started.rst index 992e933a0..74bae9c1e 100644 --- a/docs/source/getting-started.rst +++ b/docs/source/getting-started.rst @@ -98,7 +98,7 @@ Here is how to set it up on the Databricks web application, 3. Sync the Glow notebooks via Repos #. Fork the `Glow github repo `_. - #. Clone your fork to your Databricks workspace using Repos (step-by-step `guide `_). + #. Clone your fork to your Databricks workspace using Repos (step-by-step `guide `_). #. The notebooks are located under ``docs/source/_static``. .. 
image:: _static/images/glow-repo-notebooks.png From 9a095b32e7efc747ccf3e81345c37be0edcc44da Mon Sep 17 00:00:00 2001 From: Alex Taylor-Barreto Date: Thu, 30 Jun 2022 16:07:00 -0500 Subject: [PATCH 03/27] Add Docker images build for DBR 9.1 LTS Signed-off-by: Alex Taylor-Barreto --- docker/databricks/build.sh | 30 +++++ .../databricks/dbr/dbr9.1/dbfsfuse/Dockerfile | 11 ++ .../dbr/dbr9.1/genomics-with-glow/Dockerfile | 57 +++++++++ .../dbr9.1/genomics-with-hail/00-hail.conf | 4 + .../dbr/dbr9.1/genomics-with-hail/Dockerfile | 29 +++++ .../databricks/dbr/dbr9.1/genomics/Dockerfile | 115 ++++++++++++++++++ .../databricks/dbr/dbr9.1/minimal/Dockerfile | 13 ++ .../databricks/dbr/dbr9.1/python/Dockerfile | 35 ++++++ docker/databricks/dbr/dbr9.1/r/Dockerfile | 51 ++++++++ docker/databricks/dbr/dbr9.1/r/Rprofile.site | 12 ++ .../databricks/dbr/dbr9.1/standard/Dockerfile | 10 ++ 11 files changed, 367 insertions(+) create mode 100644 docker/databricks/dbr/dbr9.1/dbfsfuse/Dockerfile create mode 100644 docker/databricks/dbr/dbr9.1/genomics-with-glow/Dockerfile create mode 100644 docker/databricks/dbr/dbr9.1/genomics-with-hail/00-hail.conf create mode 100644 docker/databricks/dbr/dbr9.1/genomics-with-hail/Dockerfile create mode 100644 docker/databricks/dbr/dbr9.1/genomics/Dockerfile create mode 100644 docker/databricks/dbr/dbr9.1/minimal/Dockerfile create mode 100644 docker/databricks/dbr/dbr9.1/python/Dockerfile create mode 100644 docker/databricks/dbr/dbr9.1/r/Dockerfile create mode 100644 docker/databricks/dbr/dbr9.1/r/Rprofile.site create mode 100644 docker/databricks/dbr/dbr9.1/standard/Dockerfile diff --git a/docker/databricks/build.sh b/docker/databricks/build.sh index f6fc554f3..ac2af72da 100755 --- a/docker/databricks/build.sh +++ b/docker/databricks/build.sh @@ -3,6 +3,32 @@ # # Usage: ./build.sh +DATABRICKS_RUNTIME_VERSION="9.1" +GLOW_VERSION="1.1.2" +HAIL_VERSION="0.2.85" + +# build 9.1 LTS / Spark 3.1.2 images + +# Add commands to build images below +pushd dbr/dbr$DATABRICKS_RUNTIME_VERSION/ +docker build -t "${DOCKER_HUB}/minimal:${DATABRICKS_RUNTIME_VERSION}" minimal/ +docker build -t "${DOCKER_HUB}/python:${DATABRICKS_RUNTIME_VERSION}" python/ +docker build -t "${DOCKER_HUB}/dbfsfuse:${DATABRICKS_RUNTIME_VERSION}" dbfsfuse/ +docker build -t "${DOCKER_HUB}/standard:${DATABRICKS_RUNTIME_VERSION}" standard/ +docker build -t "${DOCKER_HUB}/with-r:${DATABRICKS_RUNTIME_VERSION}" r/ +docker build -t "${DOCKER_HUB}/genomics:${DATABRICKS_RUNTIME_VERSION}" genomics/ +docker build -t "${DOCKER_HUB}/databricks-hail:${HAIL_VERSION}" genomics-with-hail/ +docker build -t "${DOCKER_HUB}/databricks-glow-minus-ganglia:${GLOW_VERSION}" genomics-with-glow/ +docker build -t "${DOCKER_HUB}/databricks-glow:${GLOW_VERSION}" ganglia/ +docker build -t "${DOCKER_HUB}/databricks-glow-minus-ganglia:${DATABRICKS_RUNTIME_VERSION}" genomics-with-glow/ +docker build -t "${DOCKER_HUB}/databricks-glow:${DATABRICKS_RUNTIME_VERSION}" ganglia/ +popd + +docker push "${DOCKER_HUB}/databricks-hail:${HAIL_VERSION}" +docker push "${DOCKER_HUB}/databricks-glow:${GLOW_VERSION}" + +# build 10.4 LTS / Spark 3.2.1 images + DOCKER_HUB="projectglow" DATABRICKS_RUNTIME_VERSION="10.4" GLOW_VERSION="1.2.1" @@ -26,3 +52,7 @@ popd docker push "${DOCKER_HUB}/databricks-hail:${HAIL_VERSION}" docker push "${DOCKER_HUB}/databricks-glow:${GLOW_VERSION}" docker push "${DOCKER_HUB}/databricks-glow:${DATABRICKS_RUNTIME_VERSION}" + + + + diff --git a/docker/databricks/dbr/dbr9.1/dbfsfuse/Dockerfile b/docker/databricks/dbr/dbr9.1/dbfsfuse/Dockerfile 
new file mode 100644 index 000000000..04cf9ded4 --- /dev/null +++ b/docker/databricks/dbr/dbr9.1/dbfsfuse/Dockerfile @@ -0,0 +1,11 @@ +FROM projectglow/python:9.1 + +RUN apt-get update \ + && apt-get install -y fuse \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# Make sure the USER env variable is set. The files exposed +# by dbfs-fuse will be owned by this user. +# Within the container, the USER is always root. +ENV USER root diff --git a/docker/databricks/dbr/dbr9.1/genomics-with-glow/Dockerfile b/docker/databricks/dbr/dbr9.1/genomics-with-glow/Dockerfile new file mode 100644 index 000000000..4d30928e6 --- /dev/null +++ b/docker/databricks/dbr/dbr9.1/genomics-with-glow/Dockerfile @@ -0,0 +1,57 @@ +# ===== For the runtime environment for this image we need the databricks azure setup ============== + +FROM projectglow/genomics:9.1 AS builder + +ENV GLOW_VERSION=1.1.1 + +# ===== Install python dependencies for Glow ======================================================= +# once available, we want specify that the earliest version is 1.1.0 + +RUN /databricks/python3/bin/pip install glow.py==$GLOW_VERSION + +ENV BIOINFOKIT_VERSION=0.8.5 +RUN /databricks/python3/bin/pip install bioinfokit==$BIOINFOKIT_VERSION + +# ===== Set up scala dependencies for Glow ========================================================= + +ENV HADOOP_BAM_VERSION=7.9.1 +ENV HTSJDK_VERSION=2.21.2 +ENV PICARD_VERSION=2.23.3 +ENV JDBI_VERSION=2.78 + +RUN mkdir /databricks/jars +RUN cd /databricks/jars && curl -O \ +https://repo1.maven.org/maven2/io/projectglow/glow-spark3_2.12/${GLOW_VERSION}/glow-spark3_2.12-${GLOW_VERSION}.jar +RUN cd /databricks/jars && curl -O \ +https://repo1.maven.org/maven2/org/seqdoop/hadoop-bam/${HADOOP_BAM_VERSION}/hadoop-bam-${HADOOP_BAM_VERSION}.jar +RUN cd /databricks/jars && curl -O \ +https://repo1.maven.org/maven2/com/github/samtools/htsjdk/${HTSJDK_VERSION}/htsjdk-${HTSJDK_VERSION}.jar +RUN cd /databricks/jars && curl -O \ +https://repo1.maven.org/maven2/com/github/broadinstitute/picard/${PICARD_VERSION}/picard-${PICARD_VERSION}.jar +RUN cd /databricks/jars && curl -O \ +https://repo1.maven.org/maven2/org/jdbi/jdbi/${JDBI_VERSION}/jdbi-${JDBI_VERSION}.jar + +# ===== Set up needed Spark config for scala jars ================================================== + +ENV JAVA_OPTS="-Dspark.executor.extraClassPath=/databricks/jars/glow-spark3_2.12-${GLOW_VERSION}.jar,/databricks/jars/hadoop-bam-${HADOOP_BAM_VERSION}.jar,/databricks/jars/htsjdk-${HTSJDK_VERSION}.jar,/databricks/jars/picard-${PICARD_VERSION}.jar,/databricks/jars/jdbi-${JDBI_VERSION}.jar \ + -Dspark.driver.extraClassPath=/databricks/jars/glow-spark3_2.12-${GLOW_VERSION}.jar,/databricks/jars/hadoop-bam-${HADOOP_BAM_VERSION}.jar,/databricks/jars/htsjdk-${HTSJDK_VERSION}.jar,/databricks/jars/picard-${PICARD_VERSION}.jar,/databricks/jars/jdbi-${JDBI_VERSION}.jar \ + -Dspark.serializer=org.apache.spark.serializer.KryoSerializer \ + -Dspark.hadoop.io.compression.codecs=io.projectglow.sql.util.BGZFCodec,org.seqdoop.hadoop_bam.util.BGZFEnhancedGzipCodec" + +# ===== Set up liftOver (used by standard Glow examples) =========================================== + +RUN mkdir /opt/liftover +RUN curl https://raw.githubusercontent.com/broadinstitute/gatk/master/scripts/funcotator/data_sources/gnomAD/b37ToHg38.over.chain --output /opt/liftover/b37ToHg38.over.chain + +# ===== Set up bedtools as desired by many Glow users ============================================== + +ENV BEDTOOLS_VERSION=2.30.0 +ENV 
PATH=/databricks/conda/envs/dcs-minimal/bin/:$PATH +RUN cd /opt && git clone --depth 1 --branch v${BEDTOOLS_VERSION} https://github.com/arq5x/bedtools2.git bedtools-${BEDTOOLS_VERSION} +RUN cd /opt/bedtools-${BEDTOOLS_VERSION} && make + +# Add bedtools path to the enviroment + +ENV PATH=/opt/bedtools-${BEDTOOLS_VERSION}:$PATH + +WORKDIR /root/ diff --git a/docker/databricks/dbr/dbr9.1/genomics-with-hail/00-hail.conf b/docker/databricks/dbr/dbr9.1/genomics-with-hail/00-hail.conf new file mode 100644 index 000000000..252904b7c --- /dev/null +++ b/docker/databricks/dbr/dbr9.1/genomics-with-hail/00-hail.conf @@ -0,0 +1,4 @@ +[driver] { + "spark.kryo.registrator" = "is.hail.kryo.HailKryoRegistrator" + "spark.serializer" = "org.apache.spark.serializer.KryoSerializer" +} diff --git a/docker/databricks/dbr/dbr9.1/genomics-with-hail/Dockerfile b/docker/databricks/dbr/dbr9.1/genomics-with-hail/Dockerfile new file mode 100644 index 000000000..9408c2c7f --- /dev/null +++ b/docker/databricks/dbr/dbr9.1/genomics-with-hail/Dockerfile @@ -0,0 +1,29 @@ +# ===== For the runtime environment for this image we need the databricks azure setup ============== + +FROM projectglow/genomics:9.1 AS builder + +# ===== Set up Hail ================================================================================ + +# earliest Hail version supported by Spark3 is 0.2.67 +ENV HAIL_VERSION=0.2.78 + +RUN apt-get update && apt-get install -y \ + openjdk-8-jre-headless \ + g++ \ + libopenblas-base liblapack3 \ + liblz4-1 liblz4-dev liblz4-tool \ + rsync python-setuptools + +RUN /databricks/python3/bin/pip install hail +RUN HAIL_JAR_PATH=$(find /databricks/python3 -name 'hail-all-spark.jar') && \ + mkdir -p /databricks/jars && \ + cp $HAIL_JAR_PATH /databricks/jars/ + +RUN mkdir -p /databricks/driver/conf/ +COPY 00-hail.conf /databricks/driver/conf/ + +# ===== Set up Selenium for Bokeh (Bokeh itself is included in Hail) =============================== + +ENV SELENIUM_VERSION=3.141.0 + +RUN /databricks/python3/bin/pip install selenium==$SELENIUM_VERSION diff --git a/docker/databricks/dbr/dbr9.1/genomics/Dockerfile b/docker/databricks/dbr/dbr9.1/genomics/Dockerfile new file mode 100644 index 000000000..7239b95f0 --- /dev/null +++ b/docker/databricks/dbr/dbr9.1/genomics/Dockerfile @@ -0,0 +1,115 @@ +FROM projectglow/with-r:9.1 AS r + +# ===== Build off Databricks Runtime =============================================================== + +#The runtime base is Ubuntu 18.04, or 20.04 after 9.x +#See more here https://github.com/databricks/containers + +# ===== Set up python environment ================================================================== + +RUN /databricks/python3/bin/pip install awscli databricks-cli --no-cache-dir + +# ===== Set up Azure CLI ===== + +RUN apt-get install -y \ + curl \ + lsb-release \ + gnupg + +RUN curl -sL https://aka.ms/InstallAzureCLIDeb | bash + +# ===== Set up base required libraries ============================================================= + +RUN apt-get update && apt-get install -y \ + apt-utils \ + build-essential \ + git \ + apt-transport-https \ + ca-certificates \ + cpanminus \ + libpng-dev \ + zlib1g-dev \ + libbz2-dev \ + liblzma-dev \ + perl \ + perl-base \ + unzip \ + curl \ + gnupg2 \ + software-properties-common \ + jq \ + libjemalloc2 \ + libjemalloc-dev \ + libdbi-perl \ + libdbd-mysql-perl \ + libdbd-sqlite3-perl \ + zlib1g \ + zlib1g-dev \ + libxml2 \ + libxml2-dev + +# ===== Set up R genomics packages ================================================================= + +RUN R -e 
"install.packages('sim1000G',dependencies=TRUE,repos='https://cran.rstudio.com')"\ + && R -e "install.packages('ukbtools',dependencies=TRUE,repos='https://cran.rstudio.com')"\ + && R -e "install.packages('qqman',dependencies=TRUE,repos='http://cran.us.r-project.org')"\ + && R -e "install.packages('bigsnpr',dependencies=TRUE,repos='http://cran.us.r-project.org')" + +# ===== Set up VEP environment ===================================================================== + +ENV OPT_SRC /opt/vep/src +ENV PERL5LIB $PERL5LIB:$OPT_SRC/ensembl-vep:$OPT_SRC/ensembl-vep/modules +RUN cpanm DBI && \ + cpanm Set::IntervalTree && \ + cpanm JSON && \ + cpanm Text::CSV && \ + cpanm Module::Build && \ + cpanm PerlIO::gzip && \ + cpanm IO::Uncompress::Gunzip + +RUN mkdir -p $OPT_SRC +WORKDIR $OPT_SRC +RUN git clone https://github.com/Ensembl/ensembl-vep.git +WORKDIR ensembl-vep + +# The commit is the most recent one on release branch 100 as of July 29, 2020 + +RUN git checkout 10932fab1e9c113e8e5d317e1f668413390344ac && \ + perl INSTALL.pl --NO_UPDATE -AUTO a && \ + perl INSTALL.pl -n -a p --PLUGINS AncestralAllele && \ + chmod +x vep + +# ===== Set up samtools ============================================================================ + +ENV SAMTOOLS_VERSION=1.9 + +WORKDIR /opt +RUN wget https://github.com/samtools/samtools/releases/download/${SAMTOOLS_VERSION}/samtools-${SAMTOOLS_VERSION}.tar.bz2 && \ + tar -xjf samtools-1.9.tar.bz2 +WORKDIR samtools-1.9 +RUN ./configure && \ + make && \ + make install + +ENV PATH=${DEST_DIR}/samtools-{$SAMTOOLS_VERSION}:$PATH + + +# ===== Set up htslib ============================================================================== + +WORKDIR /opt +RUN wget https://github.com/samtools/htslib/releases/download/${SAMTOOLS_VERSION}/htslib-${SAMTOOLS_VERSION}.tar.bz2 && \ + tar -xjvf htslib-1.9.tar.bz2 +WORKDIR htslib-1.9 +RUN ./configure && \ + make && \ + make install + +# ===== Set up MLR dependencies ==================================================================== + +ENV QQMAN_VERSION=1.0.6 +RUN /databricks/python3/bin/pip install qqman==$QQMAN_VERSION + +# ===== Reset current directory ==================================================================== + +WORKDIR /root + diff --git a/docker/databricks/dbr/dbr9.1/minimal/Dockerfile b/docker/databricks/dbr/dbr9.1/minimal/Dockerfile new file mode 100644 index 000000000..ec563e706 --- /dev/null +++ b/docker/databricks/dbr/dbr9.1/minimal/Dockerfile @@ -0,0 +1,13 @@ +FROM ubuntu:20.04 + +RUN apt-get update \ + && apt-get install --yes \ + openjdk-8-jdk \ + iproute2 \ + bash \ + sudo \ + coreutils \ + procps \ + && /var/lib/dpkg/info/ca-certificates-java.postinst configure \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* diff --git a/docker/databricks/dbr/dbr9.1/python/Dockerfile b/docker/databricks/dbr/dbr9.1/python/Dockerfile new file mode 100644 index 000000000..62d9db36b --- /dev/null +++ b/docker/databricks/dbr/dbr9.1/python/Dockerfile @@ -0,0 +1,35 @@ +FROM projectglow/minimal:9.1 + +# Suppress interactive configuration prompts +ENV DEBIAN_FRONTEND=noninteractive + +# Installs python 3.8 and virtualenv for Spark and Notebooks +RUN apt-get update \ + && apt-get install -y \ + python3.8 \ + virtualenv \ + git-all \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# Initialize the default environment that Spark and notebooks will use +RUN virtualenv -p python3.8 --system-site-packages /databricks/python3 + +# These python libraries are used by Databricks notebooks and the 
Python REPL +# You do not need to install pyspark - it is injected when the cluster is launched +# Versions are intended to reflect DBR 9.1 +RUN /databricks/python3/bin/pip install \ + six==1.15.0 \ + # downgrade ipython to maintain backwards compatibility with 7.x and 8.x runtimes + ipython==7.22.0 \ + numpy==1.19.2 \ + pandas==1.2.4 \ + pyarrow==4.0.0 \ + matplotlib==3.4.2 \ + jinja2==2.11.3 \ + mlflow==1.19.0 + +ENV MLFLOW_TRACKING_URI=databricks + +# Specifies where Spark will look for the python process +ENV PYSPARK_PYTHON=/databricks/python3/bin/python3 diff --git a/docker/databricks/dbr/dbr9.1/r/Dockerfile b/docker/databricks/dbr/dbr9.1/r/Dockerfile new file mode 100644 index 000000000..6eebeb11e --- /dev/null +++ b/docker/databricks/dbr/dbr9.1/r/Dockerfile @@ -0,0 +1,51 @@ +# Images using this R layer also need Databricks' Python environment +FROM projectglow/standard:9.1 + +# Suppress interactive configuration prompts +ENV DEBIAN_FRONTEND=noninteractive + +# Set Databricks Run Time Version, This variable used to perform a runtime version check to see whether we can use notebook-scoped libraries in R. +ENV DATABRICKS_RUNTIME_VERSION=9.1 + +# update indices +# add the signing key (by Michael Rutter) for these repos +# To verify key, run gpg --show-keys /etc/apt/trusted.gpg.d/cran_ubuntu_key.asc +# Fingerprint: 298A3A825C0D65DFD57CBB651716619E084DAB9 +# https://cran.rstudio.com/bin/linux/ubuntu/#secure-apt +RUN apt update -qq \ + && apt-get install --yes \ + software-properties-common \ + dirmngr \ + libssl-dev \ + r-base \ + r-base-dev \ + && wget -qO- https://cloud.r-project.org/bin/linux/ubuntu/marutter_pubkey.asc | sudo tee -a /etc/apt/trusted.gpg.d/cran_ubuntu_key.asc \ + && add-apt-repository "deb https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/" \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + + +# hwriterPlus is used by Databricks to display output in notebook cells +# Rserve allows Spark to communicate with a local R process to run R code +RUN R -e "install.packages(c('hwriterPlus'), repos='https://mran.revolutionanalytics.com/snapshot/2017-02-26')" \ + && R -e "install.packages(c('htmltools'), repos='https://cran.microsoft.com/')" \ + && R -e "install.packages('Rserve', repos='http://rforge.net/')" + +# Additional instructions to setup rstudio. If you dont need rstudio, you can +# omit the below commands in your docker file. Even after this you need to use +# an init script to start the RStudio daemon (See README.md for details.) + +# Databricks configuration for RStudio sessions. +COPY Rprofile.site /usr/lib/R/etc/Rprofile.site + +# Rstudio installation. +RUN apt-get update \ + # Installation of rstudio in databricks needs /usr/bin/python. + && apt-get install -y python \ + # Install gdebi-core. + && apt-get install -y gdebi-core \ + # Download rstudio 1.2 package for ubuntu 18.04 and install it. + && apt-get install -y wget \ + && wget https://download2.rstudio.org/server/bionic/amd64/rstudio-server-1.2.5042-amd64.deb -O rstudio-server.deb \ + && gdebi -n rstudio-server.deb \ + && rm rstudio-server.deb diff --git a/docker/databricks/dbr/dbr9.1/r/Rprofile.site b/docker/databricks/dbr/dbr9.1/r/Rprofile.site new file mode 100644 index 000000000..577e3afde --- /dev/null +++ b/docker/databricks/dbr/dbr9.1/r/Rprofile.site @@ -0,0 +1,12 @@ +# Databricks configuration for RStudio sessions. +# Please do not remove this file or modify following lines. 
+if (grepl("rstudio", system(paste0("cat /proc/", Sys.getpid(), "/cmdline"), intern = T))) { + Sys.setenv("SPARK_HOME" = "/databricks/spark") + .libPaths(c("/databricks/spark/R/lib", .libPaths())) + assign("DATABRICKS_GUID", system('wget -qO - \'http://localhost:6061/?type="com.databricks.backend.common.rpc.DriverMessages$GetRStudioBackendGUID"\' --post-data=\'{"@class":"com.databricks.backend.common.rpc.DriverMessages$GetRStudioBackendGUID"}\' --no-check-certificate | tr -d \\" ', intern = TRUE), envir = .GlobalEnv) + Sys.setenv("EXISTING_SPARKR_BACKEND_PORT" = system(paste0('wget -qO - \'http://localhost:6061/?type="com.databricks.backend.common.rpc.DriverMessages$StartRStudioSparkRBackend"\' --post-data=\'{"@class":"com.databricks.backend.common.rpc.DriverMessages$StartRStudioSparkRBackend", "guid": "', DATABRICKS_GUID, '"}\' --no-check-certificate'), intern = TRUE)) + Sys.setenv("SPARKR_BACKEND_AUTH_SECRET" = system(paste0('wget -qO - \'http://localhost:6061/?type="com.databricks.backend.common.rpc.DriverMessages$GetRStudioRAuthSecret"\' --post-data=\'{"@class":"com.databricks.backend.common.rpc.DriverMessages$GetRStudioRAuthSecret", "port": "', Sys.getenv("EXISTING_SPARKR_BACKEND_PORT"), '"}\' --no-check-certificate | tr -d \\" '), intern = TRUE)) + .Last <- function() { + system(paste0('wget -qO - \'http://localhost:6061/?type="com.databricks.backend.common.rpc.DriverMessages$StopRStudioSparkRBackend"\' --post-data=\'{"@class":"com.databricks.backend.common.rpc.DriverMessages$StopRStudioSparkRBackend", "port": "', Sys.getenv("EXISTING_SPARKR_BACKEND_PORT") , '"}\' --no-check-certificate'), intern = TRUE) + } +} diff --git a/docker/databricks/dbr/dbr9.1/standard/Dockerfile b/docker/databricks/dbr/dbr9.1/standard/Dockerfile new file mode 100644 index 000000000..fdd7b815d --- /dev/null +++ b/docker/databricks/dbr/dbr9.1/standard/Dockerfile @@ -0,0 +1,10 @@ +FROM projectglow/dbfsfuse:9.1 + +RUN apt-get update \ + && apt-get install -y openssh-server \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# Warning: the created user has root permissions inside the container +# Warning: you still need to start the ssh process with `sudo service ssh start` +RUN useradd --create-home --shell /bin/bash --groups sudo ubuntu From 7b5141773d8a6cc7212528d5227ba38349644a1d Mon Sep 17 00:00:00 2001 From: Alex Taylor-Barreto Date: Thu, 28 Jul 2022 12:23:41 -0500 Subject: [PATCH 04/27] Initial tutorial on contributing to the github repo --- GIT-PROCESS.md | 158 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 158 insertions(+) create mode 100644 GIT-PROCESS.md diff --git a/GIT-PROCESS.md b/GIT-PROCESS.md new file mode 100644 index 000000000..8ff70c53f --- /dev/null +++ b/GIT-PROCESS.md @@ -0,0 +1,158 @@ +# Github Contribution Process + +Whether you're trying to contribute to the open source community or collaborating on your own projects, knowing how to properly fork and generate pull requests is essential. Unfortunately, it's quite easy to make mistakes or not know what you should do when you're initially learning the process. we know that many certainly had considerable initial trouble with it, and we've found a lot of the information on GitHub and around the internet to be rather piecemeal and incomplete - part of the process described here, another there, common hangups in a different place, and so on. 
+ +In an attempt to coallate this information for myself and others, this short tutorial is what we've found to be fairly standard procedure for creating a fork, doing your work, issuing a pull request, and merging that pull request back into the original project. + +## Creating a Fork + +Just head over to the GitHub page and click the "Fork" button. It's just that simple. Once you've done that, you can use your favorite git client to clone your repo or just head straight to the command line: + +```shell +# Clone your fork to your local machine +git clone git@github.com:USERNAME/FORKED-PROJECT.git +``` + +## Keeping Your Fork Up to Date + +While this isn't an absolutely necessary step, if you plan on doing anything more than just a tiny quick fix, you'll want to make sure you keep your fork up to date by tracking the original "upstream" repo that you forked. To do this, you'll need to add a remote: + +```shell +# Add 'upstream' repo to list of remotes +git remote add upstream https://github.com/UPSTREAM-USER/ORIGINAL-PROJECT.git + +# Verify the new remote named 'upstream' +git remote -v +``` + +Whenever you want to update your fork with the latest upstream changes, you'll need to first fetch the upstream repo's branches and latest commits to bring them into your repository: + +```shell +# Fetch from upstream remote +git fetch upstream + +# View all branches, including those from upstream +git branch -va +``` + +Now, checkout your own master branch and merge the upstream repo's master branch: + +```shell +# Checkout your master branch and merge upstream +git checkout master +git merge upstream/master +``` + +If there are no unique commits on the local master branch, git will simply perform a fast-forward. However, if you have been making changes on master (in the vast majority of cases you probably shouldn't be - [see the next section](#doing-your-work), you may have to deal with conflicts. When doing so, be careful to respect the changes made upstream. + +Now, your local master branch is up-to-date with everything modified upstream. + +## Doing Your Work + +### Create a Branch +Whenever you begin work on a new feature or bugfix, it's important that you create a new branch. Not only is it proper git workflow, but it also keeps your changes organized and separated from the master branch so that you can easily submit and manage multiple pull requests for every task you complete. + +To create a new branch and start working on it: + +```shell +# Checkout the master branch - you want your new branch to come from master +git checkout master + +# Create a new branch named newfeature (give your branch its own simple informative name) +git branch newfeature + +# Switch to your new branch +git checkout newfeature +``` + +Now, go to town hacking away and making whatever changes you want to. + +## Submitting a Pull Request + +### Cleaning Up Your Work + +Prior to submitting your pull request, you might want to do a few things to clean up your branch and make it as simple as possible for the original repo's maintainer to test, accept, and merge your work. + +If any commits have been made to the upstream master branch, you should rebase your development branch so that merging it will be a simple fast-forward that won't require any conflict resolution work. 
+ +```shell +# Fetch upstream master and merge with your repo's master branch +git fetch upstream +git checkout master +git merge upstream/master + +# If there were any new commits, rebase your development branch +git checkout newfeature +git rebase master +``` + +Now, it may be desirable to squash some of your smaller commits down into a small number of larger more cohesive commits. You can do this with an interactive rebase: + +```shell +# Rebase all commits on your development branch +git checkout +git rebase -i master +``` + +This will open up a text editor where you can specify which commits to squash. + +### Submitting + +Once you've committed and pushed all of your changes to GitHub, go to the page for your fork on GitHub, select your development branch, and click the pull request button. If you need to make any adjustments to your pull request, just push the updates to GitHub. Your pull request will automatically track the changes on your development branch and update. + +## Accepting and Merging a Pull Request + +Take note that unlike the previous sections which were written from the perspective of someone that created a fork and generated a pull request, this section is written from the perspective of the original repository owner who is handling an incoming pull request. Thus, where the "forker" was referring to the original repository as `upstream`, we're now looking at it as the owner of that original repository and the standard `origin` remote. + +### Checking Out and Testing Pull Requests +Open up the `.git/config` file and add a new line under `[remote "origin"]`: + +``` +fetch = +refs/pull/*/head:refs/pull/origin/* +``` + +Now you can fetch and checkout any pull request so that you can test them: + +```shell +# Fetch all pull request branches +git fetch origin + +# Checkout out a given pull request branch based on its number +git checkout -b 999 pull/origin/999 +``` + +Keep in mind that these branches will be read only and you won't be able to push any changes. + +### Automatically Merging a Pull Request +In cases where the merge would be a simple fast-forward, you can automatically do the merge by just clicking the button on the pull request page on GitHub. + +### Manually Merging a Pull Request +To do the merge manually, you'll need to checkout the target branch in the source repo, pull directly from the fork, and then merge and push. + +```shell +# Checkout the branch you're merging to in the target repo +git checkout master + +# Pull the development branch from the fork repo where the pull request development was done. +git pull https://github.com/forkuser/forkedrepo.git newfeature + +# Merge the development branch +git merge newfeature + +# Push master with the new feature merged into it +git push origin master +``` + +Now that you're done with the development branch, you're free to delete it. + +```shell +git branch -d newfeature +``` + +**Additional Reading** +* [Atlassian - Merging vs. 
Rebasing](https://www.atlassian.com/git/tutorials/merging-vs-rebasing) + +**Sources** +* [GitHub - Fork a Repo](https://help.github.com/articles/fork-a-repo) +* [GitHub - Syncing a Fork](https://help.github.com/articles/syncing-a-fork) +* [GitHub - Checking Out a Pull Request](https://help.github.com/articles/checking-out-pull-requests-locally) \ No newline at end of file From d56716d95fe7f25f3e47af657e7093430ba73e61 Mon Sep 17 00:00:00 2001 From: Alex T Date: Fri, 29 Jul 2022 14:10:43 -0500 Subject: [PATCH 05/27] Updated config.yml --- .circleci/config.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 196e11696..645b31b26 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -4,6 +4,12 @@ setup_xl_base: &setup_xl_base - image: cimg/openjdk:8.0.322 resource_class: 2xlarge +setup_l_base: &setup_m_base + working_directory: ~/glow + docker: + - image: cimg/openjdk:8.0.322 + resource_class: xlarge + setup_m_base: &setup_m_base working_directory: ~/glow docker: @@ -151,7 +157,7 @@ jobs: path: ~/glow/core/target/scala-2.12/test-reports all-notebook-tests: - <<: *setup_m_base + <<: *setup_l_base steps: - checkout - restore_cache: From 426514d29175038bf8c7450ffe9528397e15b709 Mon Sep 17 00:00:00 2001 From: Alex T Date: Fri, 29 Jul 2022 14:12:20 -0500 Subject: [PATCH 06/27] Updated config.yml --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 645b31b26..fa2561abe 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -4,7 +4,7 @@ setup_xl_base: &setup_xl_base - image: cimg/openjdk:8.0.322 resource_class: 2xlarge -setup_l_base: &setup_m_base +setup_l_base: &setup_l_base working_directory: ~/glow docker: - image: cimg/openjdk:8.0.322 From ffab2e19f6af567f2d5b2a9d61eedc296f2242db Mon Sep 17 00:00:00 2001 From: Alex T Date: Fri, 29 Jul 2022 14:13:32 -0500 Subject: [PATCH 07/27] Updated config.yml --- .circleci/config.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index fa2561abe..3e9de1f65 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -48,7 +48,6 @@ create_docs_env: &create_docs_env check_clean_repo: &check_clean_repo run: name: Verify that repo is clean - environment: command: | if [[ -n $(git status --short) ]]; then echo "Working directory was not clean!" 
From c085668e6d8904ea311e5ab37cb85e44adbd9486 Mon Sep 17 00:00:00 2001 From: Alex T Date: Fri, 29 Jul 2022 14:13:58 -0500 Subject: [PATCH 08/27] Updated config.yml --- .circleci/config.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 3e9de1f65..3c646e2c1 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -80,7 +80,6 @@ jobs: key: conda-deps-v1-{{ checksum "python/environment.yml" }}-{{ checksum "docs/source/environment.yml" }} - run: name: Check docs links - environment: command: | export PATH=$HOME/conda/envs/glow-docs/bin:$PATH cd docs From 194800f999f2840d5c7f512a0daf7d3fb9e4a292 Mon Sep 17 00:00:00 2001 From: Alex T Date: Fri, 29 Jul 2022 14:15:03 -0500 Subject: [PATCH 09/27] Updated config.yml --- .circleci/config.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 3c646e2c1..8d22fe086 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -112,7 +112,6 @@ jobs: key: conda-deps-v1-{{ checksum "python/environment.yml" }} - run: name: Run Scala tests - environment: command: | export PATH=$HOME/conda/envs/glow/bin:$PATH export SPARK_VERSION="3.2.1" From f14f5a749b382a99aa6eba9dceaaded0da4f257a Mon Sep 17 00:00:00 2001 From: Alex T Date: Fri, 29 Jul 2022 14:16:25 -0500 Subject: [PATCH 10/27] Updated config.yml --- .circleci/config.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 8d22fe086..ab75fae6a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -120,7 +120,6 @@ jobs: - run: name: Run docs tests no_output_timeout: 120m - environment: command: | export PATH=$HOME/conda/envs/glow/bin:$PATH export SPARK_VERSION="3.2.1" @@ -128,7 +127,6 @@ jobs: sbt docs/test exit - run: name: Run Hail on Spark3 tests - environment: command: | export PATH=$HOME/conda/envs/glow/bin:$HOME/conda/bin:$PATH export SPARK_VERSION="3.2.1" @@ -140,7 +138,6 @@ jobs: - run: name: Run Python tests no_output_timeout: 90m - environment: command: | export PATH=$HOME/conda/envs/glow/bin:$PATH export SPARK_VERSION="3.2.1" From af7c6168d5e9638e9ab49e31308b7d302525c2c2 Mon Sep 17 00:00:00 2001 From: Alex T Date: Fri, 29 Jul 2022 14:20:17 -0500 Subject: [PATCH 11/27] Updated config.yml --- .circleci/config.yml | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index ab75fae6a..443805490 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,15 +1,21 @@ -setup_xl_base: &setup_xl_base +setup_2xl_base: &setup_2xl_base working_directory: ~/glow docker: - image: cimg/openjdk:8.0.322 resource_class: 2xlarge -setup_l_base: &setup_l_base +setup_xl_base: &setup_xl_base working_directory: ~/glow docker: - image: cimg/openjdk:8.0.322 resource_class: xlarge +setup_l_base: &setup_l_base + working_directory: ~/glow + docker: + - image: cimg/openjdk:8.0.322 + resource_class: large + setup_m_base: &setup_m_base working_directory: ~/glow docker: @@ -98,7 +104,7 @@ jobs: - *check_clean_repo spark-3-tests: - <<: *setup_xl_base + <<: *setup_2xl_base steps: - checkout - restore_cache: @@ -151,7 +157,7 @@ jobs: path: ~/glow/core/target/scala-2.12/test-reports all-notebook-tests: - <<: *setup_l_base + <<: *setup_xl_base steps: - checkout - restore_cache: From 83f642841955d4038eaa865ceb991c2238f98db3 Mon Sep 17 00:00:00 2001 From: Alex T Date: Fri, 29 Jul 2022 14:25:21 -0500 Subject: [PATCH 12/27] Updated config.yml --- .circleci/config.yml | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 443805490..67c600498 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -157,7 +157,7 @@ jobs: path: ~/glow/core/target/scala-2.12/test-reports all-notebook-tests: - <<: *setup_xl_base + <<: *setup_2xl_base steps: - checkout - restore_cache: From c43744064f38b5e24c923747534f0bfbad2fcd1a Mon Sep 17 00:00:00 2001 From: William Smith <104857768+willsmithDB@users.noreply.github.com> Date: Wed, 7 Sep 2022 14:46:26 -0700 Subject: [PATCH 13/27] Update Broken Link to Databricks File System Docs Fixing a broken link: https://docs.databricks.com/data/databricks-file-system.html#local-file-apis to https://docs.databricks.com/dbfs/index.html#local-file-apis as the URL has changed. --- docs/source/tertiary/spark-workflow-orchestrator.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/tertiary/spark-workflow-orchestrator.rst b/docs/source/tertiary/spark-workflow-orchestrator.rst index 1bd46e257..752ad8909 100644 --- a/docs/source/tertiary/spark-workflow-orchestrator.rst +++ b/docs/source/tertiary/spark-workflow-orchestrator.rst @@ -32,7 +32,7 @@ Furthermore, the pipe transformer is not designed for parallel processing of dis - data must be accessible locally on each Spark worker using one of these approaches, - data is downloaded to each node of the cluster at start-up via an initialization script - cloud storage is mounted on the local filesystem using an open source tool such as `goofys `_ - - Databricks' `local file APIs `_ automatically mounts cloud object storage to the local filesystem + - Databricks' `local file APIs `_ automatically mounts cloud object storage to the local filesystem - bioinformatics tools must be installed on each node of the cluster - via an initialization script or a `Glow Docker Container `_ From c8e131ed80712b9ec076156db227d27d274b67ff Mon Sep 17 00:00:00 2001 From: William Smith <104857768+willsmithDB@users.noreply.github.com> Date: Wed, 7 Sep 2022 14:57:22 -0700 Subject: [PATCH 14/27] Updated Cloud Object Storage Docs Updated link as the anchor no longer exists. 
--- docs/source/tertiary/spark-workflow-orchestrator.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/tertiary/spark-workflow-orchestrator.rst b/docs/source/tertiary/spark-workflow-orchestrator.rst index 752ad8909..78e5875f8 100644 --- a/docs/source/tertiary/spark-workflow-orchestrator.rst +++ b/docs/source/tertiary/spark-workflow-orchestrator.rst @@ -32,7 +32,7 @@ Furthermore, the pipe transformer is not designed for parallel processing of dis - data must be accessible locally on each Spark worker using one of these approaches, - data is downloaded to each node of the cluster at start-up via an initialization script - cloud storage is mounted on the local filesystem using an open source tool such as `goofys `_ - - Databricks' `local file APIs `_ automatically mounts cloud object storage to the local filesystem + - Databricks' `local file APIs `_ automatically mounts cloud object storage to the local filesystem - bioinformatics tools must be installed on each node of the cluster - via an initialization script or a `Glow Docker Container `_ From 0b371976a677c14220b8f482486ab7d2afba8a81 Mon Sep 17 00:00:00 2001 From: Alex Taylor-Barreto Date: Fri, 16 Sep 2022 09:27:53 -0500 Subject: [PATCH 15/27] Fixed broken 'local DBFS API' anchor reference to be: https://docs.databricks.com/dev-tools/api/latest/dbfs.html --- docs/source/tertiary/spark-workflow-orchestrator.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/source/tertiary/spark-workflow-orchestrator.rst b/docs/source/tertiary/spark-workflow-orchestrator.rst index 78e5875f8..f8dd9efe2 100644 --- a/docs/source/tertiary/spark-workflow-orchestrator.rst +++ b/docs/source/tertiary/spark-workflow-orchestrator.rst @@ -32,7 +32,11 @@ Furthermore, the pipe transformer is not designed for parallel processing of dis - data must be accessible locally on each Spark worker using one of these approaches, - data is downloaded to each node of the cluster at start-up via an initialization script - cloud storage is mounted on the local filesystem using an open source tool such as `goofys `_ +<<<<<<< HEAD - Databricks' `local file APIs `_ automatically mounts cloud object storage to the local filesystem +======= + - Databricks' `local file APIs `_ automatically mounts cloud object storage to the local filesystem +>>>>>>> a6acc19 (Fixed broken 'local DBFS API' anchor reference to be: https://docs.databricks.com/dev-tools/api/latest/dbfs.html) - bioinformatics tools must be installed on each node of the cluster - via an initialization script or a `Glow Docker Container `_ From 95425513281593c0d20056db3af53842dbda6255 Mon Sep 17 00:00:00 2001 From: Alex Taylor-Barreto Date: Fri, 16 Sep 2022 09:27:01 -0500 Subject: [PATCH 16/27] Halt Hail tests --- .circleci/config.yml | 4 ++++ GIT-PROCESS.md | 2 +- .../_static/zzz_GENERATED_NOTEBOOK_SOURCE/etl/vcf2delta.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 67c600498..e96beef27 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -116,6 +116,10 @@ jobs: paths: - /home/circleci/conda key: conda-deps-v1-{{ checksum "python/environment.yml" }} + - run: + name: Prepare CircleCI environment for dependency cache - issue #301 + environment: + command: sudo apt-get install -y ca-certificates - run: name: Run Scala tests command: | diff --git a/GIT-PROCESS.md b/GIT-PROCESS.md index 8ff70c53f..226a21070 100644 --- a/GIT-PROCESS.md +++ b/GIT-PROCESS.md @@ -1,4 +1,4 @@ -# Github Contribution Process +# 
Git Contribution Process Whether you're trying to contribute to the open source community or collaborating on your own projects, knowing how to properly fork and generate pull requests is essential. Unfortunately, it's quite easy to make mistakes or not know what you should do when you're initially learning the process. we know that many certainly had considerable initial trouble with it, and we've found a lot of the information on GitHub and around the internet to be rather piecemeal and incomplete - part of the process described here, another there, common hangups in a different place, and so on. diff --git a/docs/source/_static/zzz_GENERATED_NOTEBOOK_SOURCE/etl/vcf2delta.py b/docs/source/_static/zzz_GENERATED_NOTEBOOK_SOURCE/etl/vcf2delta.py index b1e85be6d..731b907c5 100644 --- a/docs/source/_static/zzz_GENERATED_NOTEBOOK_SOURCE/etl/vcf2delta.py +++ b/docs/source/_static/zzz_GENERATED_NOTEBOOK_SOURCE/etl/vcf2delta.py @@ -238,7 +238,7 @@ # COMMAND ---------- # MAGIC %md -# MAGIC #### Using the [local file API](https://docs.databricks.com/data/databricks-file-system.html#local-file-apis), we can read VCF directly from cloud storage via the shell +# MAGIC #### Using the [local file API](https://docs.databricks.com/dev-tools/api/latest/dbfs.html), we can read VCF directly from cloud storage via the shell # COMMAND ---------- From f41f506361f9be1af5e14c3fcfd79ff9f59bfd53 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 7 Oct 2022 09:51:58 -0500 Subject: [PATCH 17/27] Remove .metals --- .metals/metals.h2.db | Bin 5378048 -> 0 bytes .metals/metals.lock.db | 6 ---- .metals/metals.log | 72 ----------------------------------------- 3 files changed, 78 deletions(-) delete mode 100644 .metals/metals.h2.db delete mode 100644 .metals/metals.lock.db delete mode 100644 .metals/metals.log diff --git a/.metals/metals.h2.db b/.metals/metals.h2.db deleted file mode 100644 index 9b39596ed2c43f17ab3f03574f4b667ca358b776..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5378048 zcmeF)2Y6NW;s5af7f>L?04k#3P1uMSHni3@Bo~N6f=N*6N*S@DxBv%TwzYQG)vdO6 z*=d({@4bhu-P2C(*6!{9lbZkmE~7vDJpYIC;_>}}IrpCXJ+JdUC-K!5-N0t5&UAV7cs0RjXbh(PVu`+A<3DwsgUwuhk^;r|EA2PXm!j$SkQ%Br0eZ*%^81a?|GD6QKK!5-N0t5&U zAVA>I6e!H7D@>YvM%&V*GaF{koVj)LCEYX6Ub|^acWF*xS{6;Iu&8Cu%-J&=3RCI| z#i!%gWEJt(#RCTVY{o@kw>F zif8uCf(`wrLBp)#timxBo^xh-hs~TjbKcDPGZ)NUICIg=#V6Fw*vndG6lWC1Rai?e z*7Uk5g<~r$T{drK!_+K0uF|r+{*>aB!uSfyR_++vBXu-)u;hA z_YAB#YtYDF6h@vhxcZJE)nkW_c==(&?>T(<#$m(m96sz(BMyI4_2G3RhhADU^oS#d zoL@U+@TkGt4plhYO@IJ_|EoY@RIyln)qtkcXmipoujqOd# z8{3PmZJou|)hkxa?0w{t)h#QUyE+@&mzDPNz+ux1wZ*#Xb$vr=$qSyQXs^u9T(ASA$@%p?!|WTv&J9?%ZI`F-F@~Lw?Ae&kmX)QB z&c>Ciip`~^jjLC57MoVLxAzthUe4;*z$u6{W`3 z9S*~$pIEG~e%kiKbah82Zp+g0;?h|y9i1J;`i{o>Vn?QQacaZtb@LW2UeqvW-ukr* z8WyabT3p`PvAj5S-MWSI7tLNjcf-0x>lZ9;=w7sT-JCfK=jERb-5VAzn!lm@?1r;v z&z`^d?C!-48x}8~-QBQZ-QvY(Z&+~loVjz?FPfTzU}a0|%p5$o?t1XFv@UIHU)k8% z($?D5ku%83#+)=(v@PjsZtTpkb~|=<8+gSh?>yAH7VmZ>Em_glblT1XsN&Q69~_J8 zPsqV^afO5FKnAkwn#zY+#aG<1wbSbR1~?-J*R|XC*e)m519{z@>+3szmbbdNq-9x! 
diff --git a/.metals/metals.lock.db b/.metals/metals.lock.db
deleted file mode 100644
index 86786beb4..000000000
--- a/.metals/metals.lock.db
+++ /dev/null
@@ -1,6 +0,0 @@
-#FileLock
-#Wed Feb 16 10:42:58 CST 2022
-hostName=localhost
-id=17f036a65f0743c88f36861cc22037fb2ffa607619f
-method=file
-server=localhost\:49654
diff --git a/.metals/metals.log b/.metals/metals.log
deleted file mode 100644
index 7416b0032..000000000
--- a/.metals/metals.log
+++ /dev/null
@@ -1,72 +0,0 @@
-2022.02.16 10:42:57 INFO tracing is disabled for protocol LSP, to enable tracing of incoming and outgoing JSON messages create an empty file at /Users/axbo/Development/fastdata/glow/.metals/lsp.trace.json or /Users/axbo/Library/Caches/org.scalameta.metals/lsp.trace.json
-2022.02.16 10:42:57 INFO logging to file /Users/axbo/Development/fastdata/glow/.metals/metals.log
-2022.02.16 10:42:57 INFO Started: Metals version 0.11.1 in workspace '/Users/axbo/Development/fastdata/glow' for client Visual Studio Code 1.63.2.
-2022.02.16 10:42:57 INFO Parse release: 1.8.0_311
-2022.02.16 10:42:57 WARN Can't instantiate JavaInteractiveSemanticdb (version: Some(JdkVersion(1,8)), jdkHome: /Library/Internet Plug-Ins/JavaAppletPlugin.plugin/Contents/Home, javac exists: false)
-2022.02.16 10:42:58 INFO time: initialize in 0.4s
-2022.02.16 10:42:58 WARN Build server is not auto-connectable.
-Feb 16, 2022 11:00:32 AM org.eclipse.lsp4j.jsonrpc.RemoteEndpoint handleNotification -WARNING: Notification threw an exception: { - "jsonrpc": "2.0", - "method": "workspace/didChangeWatchedFiles", - "params": { - "changes": [ - { - "uri": "file:///Users/axbo/Development/fastdata/glow/build.sbt", - "type": 3 - }, - { - "uri": "file:///Users/axbo/Development/fastdata/glow/version.sbt", - "type": 3 - }, - { - "uri": "file:///Users/axbo/Development/fastdata/glow/sonatype.sbt", - "type": 3 - } - ] - } -} -java.lang.RuntimeException: java.lang.reflect.InvocationTargetException - at org.eclipse.lsp4j.jsonrpc.services.GenericEndpoint.lambda$null$0(GenericEndpoint.java:67) - at org.eclipse.lsp4j.jsonrpc.services.GenericEndpoint.notify(GenericEndpoint.java:152) - at org.eclipse.lsp4j.jsonrpc.RemoteEndpoint.handleNotification(RemoteEndpoint.java:220) - at org.eclipse.lsp4j.jsonrpc.RemoteEndpoint.consume(RemoteEndpoint.java:187) - at org.eclipse.lsp4j.jsonrpc.json.StreamMessageProducer.handleMessage(StreamMessageProducer.java:194) - at org.eclipse.lsp4j.jsonrpc.json.StreamMessageProducer.listen(StreamMessageProducer.java:94) - at org.eclipse.lsp4j.jsonrpc.json.ConcurrentMessageProcessor.run(ConcurrentMessageProcessor.java:113) - at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) - at java.util.concurrent.FutureTask.run(FutureTask.java:266) - at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) - at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) - at java.lang.Thread.run(Thread.java:748) -Caused by: java.lang.reflect.InvocationTargetException - at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) - at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) - at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) - at java.lang.reflect.Method.invoke(Method.java:498) - at org.eclipse.lsp4j.jsonrpc.services.GenericEndpoint.lambda$null$0(GenericEndpoint.java:65) - ... 11 more -Caused by: java.nio.file.NoSuchFileException: /Users/axbo/Development/fastdata/glow/build.sbt - at sun.nio.fs.UnixException.translateToIOException(UnixException.java:86) - at sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:102) - at sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:107) - at sun.nio.fs.UnixFileSystemProvider.newByteChannel(UnixFileSystemProvider.java:214) - at java.nio.file.Files.newByteChannel(Files.java:361) - at java.nio.file.Files.newByteChannel(Files.java:407) - at java.nio.file.Files.readAllBytes(Files.java:3152) - at scala.meta.internal.io.PlatformFileIO$.slurp(PlatformFileIO.scala:45) - at scala.meta.internal.io.FileIO$.slurp(FileIO.scala:24) - at scala.meta.internal.metals.MetalsLanguageServer.$anonfun$onChange$1(MetalsLanguageServer.scala:1366) - at scala.meta.internal.metals.MetalsLanguageServer.$anonfun$onChange$1$adapted(MetalsLanguageServer.scala:1365) - at scala.collection.immutable.Stream.foreach(Stream.scala:533) - at scala.meta.internal.metals.MetalsLanguageServer.onChange(MetalsLanguageServer.scala:1365) - at scala.meta.internal.metals.MetalsLanguageServer.didChangeWatchedFiles(MetalsLanguageServer.scala:1309) - ... 
16 more - -2022.02.16 11:00:54 WARN no build target for: /Users/axbo/Development/fastdata/glow/build.sbt -2022.02.16 11:00:54 WARN no build target for: /Users/axbo/Development/fastdata/glow/sonatype.sbt -2022.02.16 11:00:54 WARN no build target for: /Users/axbo/Development/fastdata/glow/version.sbt -2022.02.16 11:00:54 WARN no build target for: /Users/axbo/Development/fastdata/glow/project/Dependencies.scala -2022.02.16 11:00:54 WARN no build target for: /Users/axbo/Development/fastdata/glow/project/StableVersionPlugin.scala -2022.02.16 11:00:54 WARN no build target for: /Users/axbo/Development/fastdata/glow/project/plugins.sbt -2022.02.16 11:00:57 INFO no build target found for /Users/axbo/Development/fastdata/glow/sonatype.sbt. Using presentation compiler with project's scala-library version: 3.1.0 From 5fd4ee856d50ed1d405102a82742cad8b22cc9aa Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 8 Oct 2022 09:06:48 -0500 Subject: [PATCH 18/27] Bump acceptance tests From 5fea959dd79e0cb99704e11152f5d7f3ec343e9b Mon Sep 17 00:00:00 2001 From: willsmithDB Date: Mon, 17 Oct 2022 20:22:56 -0400 Subject: [PATCH 19/27] Added to .gitignore due to sbt generate files --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index b57e3a7f2..a1d1dc9cf 100644 --- a/.gitignore +++ b/.gitignore @@ -40,3 +40,6 @@ hail-*.log # Test artifacts Miniconda3*.sh hail/ + +# Metadata from sbt locally +.bsp \ No newline at end of file From dabff95cd043e4de1b8ce4e9773701ac21b0a9b1 Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 20 Oct 2022 07:55:18 -0500 Subject: [PATCH 20/27] Remove 2xlarge resource reference --- .circleci/config.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index e96beef27..5b8065a5b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2,7 +2,7 @@ setup_2xl_base: &setup_2xl_base working_directory: ~/glow docker: - image: cimg/openjdk:8.0.322 - resource_class: 2xlarge + resource_class: xlarge setup_xl_base: &setup_xl_base working_directory: ~/glow @@ -116,9 +116,9 @@ jobs: paths: - /home/circleci/conda key: conda-deps-v1-{{ checksum "python/environment.yml" }} - - run: + - run: name: Prepare CircleCI environment for dependency cache - issue #301 - environment: + environment: command: sudo apt-get install -y ca-certificates - run: name: Run Scala tests From 0ba4f97b1ffa35923ee6f6f94416f9d4e0153079 Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 2 Nov 2022 07:30:57 -0500 Subject: [PATCH 21/27] Re-trigger stalled tests From 794f2184cf0bcbd1144e8149c552c27393b4cfaf Mon Sep 17 00:00:00 2001 From: a0x8o Date: Wed, 8 Mar 2023 20:08:40 +0000 Subject: [PATCH 22/27] Levels ridge regression tutorial --- levels_ridge_regression_tutorial.ipynb | 975 ++++++++++++++----------- 1 file changed, 539 insertions(+), 436 deletions(-) diff --git a/levels_ridge_regression_tutorial.ipynb b/levels_ridge_regression_tutorial.ipynb index 35e84897c..c4232a3b8 100644 --- a/levels_ridge_regression_tutorial.ipynb +++ b/levels_ridge_regression_tutorial.ipynb @@ -2,8 +2,16 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "922c085d-0cda-44ff-be65-268ca3b3d8b8", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "import pandas as pd\n", @@ -17,8 +25,16 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": {}, + 
"execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "0951d9a7-4c87-48f6-a256-8e6161657994", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "spark = SparkSession.builder.appName('levels').getOrCreate()\n", @@ -27,8 +43,16 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "12c91c53-ef6b-411c-8200-98be4e8b9adf", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "test_data_root = '/Users/leland.barnard/glow/glow-wgr/test-data/levels/ridge-regression' #path to glow levels test data" @@ -36,7 +60,15 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "287406ae-65ed-4a3c-ab3d-9236525f7429", + "showTitle": false, + "title": "" + } + }, "source": [ "We need three objects to get started:\n", "* A Spark DataFrame representing the block genotype matrix\n", @@ -46,8 +78,16 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "abd84cb6-e98c-498e-88af-ded9ee95ebdd", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "blockdf_lvl0 = spark.read.parquet(f'{test_data_root}/blockedGT.snappy.parquet') #block genotype matrix\n", @@ -57,7 +97,15 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "098333d9-a84b-4886-9a93-400a1866e8bd", + "showTitle": false, + "title": "" + } + }, "source": [ "#### The block genotype matrix as a DataFrame\n", "If we imagine the block genotype matrix conceptually, we think of an *NxM* matrix *X* where each row *n* represents an individual sample, each column *m* represents a variant, and each cell *(n, m)* contains a genotype value for sample *n* at variant *m*. We then imagine laying a coarse grid on top of this matrix such that matrix cells within the same coarse grid cell are all assigned to the same block *x*. Each block *x* is indexed by a sample block ID (corresponding to a list of rows belonging to the block) and a header block ID (corresponding to a list of columns belonging to the block). The sample block IDs are generally just integers 0 through the number of sample blocks. The header block IDs are strings of the form 'chr_C_block_B', which refers to the Bth block on chromosome C. The Spark DataFrame representing this block matrix can be thought of as the transpose of each block *xT* all stacked one atop another. Each row represents the values from a particular column from *X*, for the samples corresponding to a particular sample block. 
The fields in the DataFrame are:\n", @@ -74,33 +122,32 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+---------------+----+--------------------+--------------------+-------------+------------+---------+------------------+------------------+\n", - "| header|size| indices| values| header_block|sample_block| position| mu| sig|\n", - "+---------------+----+--------------------+--------------------+-------------+------------+---------+------------------+------------------+\n", - "|2:231414300:T:C| 9| [0, 1, 6, 7, 8]|[1.0, 1.0, 1.0, 1...|chr_2_block_6| 7|231414300|0.8686868686868686|0.6730002176294544|\n", - "|2:231414300:T:C| 10|[0, 1, 2, 3, 4, 5...|[2.0, 1.0, 1.0, 1...|chr_2_block_6| 1|231414300|0.8686868686868686|0.6730002176294544|\n", - "|2:231414300:T:C| 12|[1, 3, 4, 5, 7, 8...|[2.0, 1.0, 1.0, 1...|chr_2_block_6| 8|231414300|0.8686868686868686|0.6730002176294544|\n", - "|2:231414300:T:C| 13|[0, 1, 2, 3, 4, 5...|[2.0, 1.0, 2.0, 1...|chr_2_block_6| 9|231414300|0.8686868686868686|0.6730002176294544|\n", - "+---------------+----+--------------------+--------------------+-------------+------------+---------+------------------+------------------+\n", - "only showing top 4 rows\n", - "\n" - ] + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "19d318c5-6513-4c21-8ba2-0f49311e4493", + "showTitle": false, + "title": "" } - ], + }, + "outputs": [], "source": [ "blockdf_lvl0.show(4)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "a2f3933a-0d72-45f3-b594-0eba2ffb0e29", + "showTitle": false, + "title": "" + } + }, "source": [ "#### The sample block mapping\n", "This is a comparitively simple key-value store where each key is a sample block ID and each value is a list of sample IDs contained in that sample block. As a Spark DataFrame, this is represented as a two column DataFrame with the following fields:\n", @@ -110,33 +157,32 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+------------+--------------------+\n", - "|sample_block| sample_ids|\n", - "+------------+--------------------+\n", - "| 3|[1008962444, 1035...|\n", - "| 9|[1083737921, 1041...|\n", - "| 7|[1048623585, 1030...|\n", - "| 1|[1073111137, 1082...|\n", - "+------------+--------------------+\n", - "only showing top 4 rows\n", - "\n" - ] + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "0d110459-be02-4e04-b570-afbfcbf8213c", + "showTitle": false, + "title": "" } - ], + }, + "outputs": [], "source": [ "indexdf.show(4)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "125bff86-49dc-41e2-a058-e03565311355", + "showTitle": false, + "title": "" + } + }, "source": [ "#### The phenotype data\n", "The phenotype data is represented as a Pandas DataFrame indexed by the sample ID. Each column represents a single phenotype, and it is assumed that there are no missing phenotype values, and that the phenotypes mean centered at 0." 
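As an aside, the tutorial assumes the label DataFrame already meets these requirements. A minimal pandas sketch of preparing such a table is shown below; the file name and the dropna/centering steps are illustrative assumptions, not part of the patched notebook:

    import pandas as pd

    # Hypothetical raw phenotype file indexed by sample ID
    raw_labels = pd.read_csv('phenotypes.csv', index_col='sample_id')

    # Drop samples with missing phenotype values, then mean-center each column at 0
    labeldf_prepared = raw_labels.dropna()
    labeldf_prepared = labeldf_prepared - labeldf_prepared.mean()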
@@ -144,105 +190,32 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - " sim100 sim92 sim58 sim16\n", - "sample_id \n", - "1042204109 -0.905058 -1.171217 -1.437376 -1.703535\n", - "1035505158 -0.616539 -0.411283 -0.206027 -0.000770\n", - "1008166305 -0.946014 -0.482639 -0.019263 0.444112\n", - "1068805020 -1.155375 -0.660005 -0.164634 0.330736\n", - "1095012035 -1.024889 -0.492179 0.040530 0.573240" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "66f73276-8f93-43fb-8464-d1fe48c4f688", + "showTitle": false, + "title": "" + } + }, + "outputs": [], "source": [ "labeldf.head()" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "bc02343a-0353-487a-b68b-8943e102ae40", + "showTitle": false, + "title": "" + } + }, "source": [ "#### Reducer model fitting\n", "The first step in the fitting procedure is to apply a dimensionality reduction to the block matrix *X* using the `RidgeReducer`. This is accomplished by fitting multiple ridge models within each block *x* and producing a new block matrix where each column represents the prediction of one ridge model applied within one block. This approach to model building is generally referred to as **stacking**. We will call the block genotype matrix we started with the **level 0** matrix in the stack *X0*, and the output of the ridge reduction step the **level 1** matrix *X1*. The `RidgeReducer` class is used for this step, which is initiallized with a list of ridge regularization values (referred to here as alpha). Since ridge models are indexed by these alpha values, the `RidgeReducer` will generate one ridge model per value of alpha provided, which in turn will produce one column per block in *X0*, so the final dimensions of matrix *X1* will be *Nx(LxK)*, where *L* is the number of header blocks in *X0* and *K* is the number of alpha values provided to the `RidgeReducer`. In practice, we can estimate a span of alpha values in a reasonable order of magnitude based on guesses at the heritability of the phenotype we are fitting, but here we will just pick some values." @@ -250,8 +223,16 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "05aca85c-23b3-421b-8080-a829dcc66271", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "alphas_lvl0 = np.logspace(2, 5, 10)\n", @@ -260,51 +241,63 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "c37cf140-97f9-4e10-9106-b34301c19d54", + "showTitle": false, + "title": "" + } + }, "source": [ "When the `RidgeReducer` is initialized, it will assign names to the provided alphas and store them in a dict accessible as `RidgeReducer.alphas`. This is mostly just to give an easily readable and sortable name to the models produced for each ridge value." 
] }, { "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'alpha_0': 100.0,\n", - " 'alpha_1': 215.44346900318845,\n", - " 'alpha_2': 464.15888336127773,\n", - " 'alpha_3': 1000.0,\n", - " 'alpha_4': 2154.4346900318824,\n", - " 'alpha_5': 4641.588833612777,\n", - " 'alpha_6': 10000.0,\n", - " 'alpha_7': 21544.346900318822,\n", - " 'alpha_8': 46415.888336127726,\n", - " 'alpha_9': 100000.0}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "dbc31936-dadc-4996-bf08-035a8fb59dc8", + "showTitle": false, + "title": "" + } + }, + "outputs": [], "source": [ "stack_lvl0.alphas" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "8b242244-0b6a-4b4b-a7c0-f6c4f131ac2c", + "showTitle": false, + "title": "" + } + }, "source": [ "The `RidgeReducer.fit(blockdf, labeldf, indexdf)` method generates a Spark DataFrame representing the model that we can use to reduce *X0* to *X1*." ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "6b1d6749-19b5-4ad7-ae4e-493fa42faacd", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "modeldf_lvl0 = stack_lvl0.fit(blockdf_lvl0, labeldf, indexdf)" @@ -312,7 +305,15 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "7ea1248f-bfe1-4ec3-92f7-272dd9e4e194", + "showTitle": false, + "title": "" + } + }, "source": [ "In explicit terms, the reduction of a block *x0* from *X0* to the corresponding block *x1* from *X1* is accomplished by the matrix multiplication *x0 * B = x1*, where *B* is a coefficient matrix of size *mxK*, where *m* is the number of columns in block *x0* and *K* is the number of alpha values used in the reduction. As an added wrinkle, if the ridge reduction is being performed against multiple phenotypes at once, each phenotype will have its own *B*, and for convenience we panel these next to each other in the output into a single matrix, so *B* in that case has dimensions *mx(K*P)* where *P* is the number of phenotypes. Each matrix *B* is specific to a particular block in *X0*, so the Spark DataFrame produced by the `RidgeReducer` can be thought of all of as the matrices *B* from all of the blocks stacked one atop another. 
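To make the shapes concrete, here is a toy NumPy illustration of the per-block reduction described above (random data and hand-picked dimensions, not Glow's implementation):

    import numpy as np

    n, m, K, P = 50, 1000, 10, 4        # samples in the block, variants, alphas, phenotypes
    x0 = np.random.rand(n, m)           # one genotype block from X0
    B = np.random.rand(m, K * P)        # fitted ridge coefficients for this block
    x1 = x0 @ B                         # reduced block: one column per (alpha, phenotype) pair
    assert x1.shape == (n, K * P)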
The fields in the model DataFrame are:\n", "* header_block: An ID assigned to the block *x0* corresponding to the coefficients in this row.\n", @@ -325,33 +326,32 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+-------------+------------+---------------+---------+--------------------+--------------------+--------------------+\n", - "| header_block|sample_block| header| position| alphas| labels| coefficients|\n", - "+-------------+------------+---------------+---------+--------------------+--------------------+--------------------+\n", - "|chr_3_block_8| 0|3:160741710:G:A|160741710|[alpha_0, alpha_1...|[sim100, sim100, ...|[0.07462677364336...|\n", - "|chr_3_block_8| 0|3:175345110:C:T|175345110|[alpha_0, alpha_1...|[sim100, sim100, ...|[0.07834053929928...|\n", - "|chr_3_block_8| 0|3:183469890:A:G|183469890|[alpha_0, alpha_1...|[sim100, sim100, ...|[0.02152237814164...|\n", - "|chr_3_block_8| 0|3:195047160:C:T|195047160|[alpha_0, alpha_1...|[sim100, sim100, ...|[0.01153728383795...|\n", - "+-------------+------------+---------------+---------+--------------------+--------------------+--------------------+\n", - "only showing top 4 rows\n", - "\n" - ] + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "1e70a537-0417-4a24-9dc8-97b9cd72db76", + "showTitle": false, + "title": "" } - ], + }, + "outputs": [], "source": [ "modeldf_lvl0.show(4)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "dd5e8481-773c-4d9c-99af-64cb9ffe43f3", + "showTitle": false, + "title": "" + } + }, "source": [ "#### Reducer transformation\n", "After fitting, the `RidgeReducer.transform(blockdf, labeldf, modeldf)` method can be used to generate `X1` from `X0`." @@ -359,8 +359,16 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "02bd487b-4d0d-4e77-b6fd-151bacc59eba", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "blockdf_lvl1 = stack_lvl0.transform(blockdf_lvl0, labeldf, modeldf_lvl0)" @@ -368,7 +376,15 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "94b0e0f4-b0e8-45d0-9d50-abed3fd252ca", + "showTitle": false, + "title": "" + } + }, "source": [ "The output of the transformation is closely analogous to the block matrix DataFrame we started with. The main difference is that, rather than representing a single block matrix, it really represents multiple block matrices, with one such matrix per label (phenotype). 
Comparing the schema of this block matrix DataFrame (`blockdf_lvl1`) with the DataFrame we started with (`blockdf_lvl0`), the new columns are:\n", "* alpha: This is the name of the alpha value used in fitting the model that produced the values in this row.\n", @@ -377,66 +393,64 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+--------------------+----+--------------------+------------+------------+--------+--------------------+-------------------+-------+------+\n", - "| header|size| values|header_block|sample_block|position| mu| sig| alpha| label|\n", - "+--------------------+----+--------------------+------------+------------+--------+--------------------+-------------------+-------+------+\n", - "|chr_3_block_8_alp...| 13|[0.08337895454032...| chr_3| 0| 80| 0.04148112816674154|0.19099426058493266|alpha_0|sim100|\n", - "|chr_3_block_8_alp...| 13|[0.04796003873174...| chr_3| 0| 81| 0.02402075708176127|0.11316256614620662|alpha_1|sim100|\n", - "|chr_3_block_8_alp...| 13|[0.02504256254617...| chr_3| 0| 82|0.012596289114544081|0.06030642726717367|alpha_2|sim100|\n", - "|chr_3_block_8_alp...| 13|[0.01234023662311...| chr_3| 0| 83|0.006221371128544...|0.03006776034645892|alpha_3|sim100|\n", - "+--------------------+----+--------------------+------------+------------+--------+--------------------+-------------------+-------+------+\n", - "only showing top 4 rows\n", - "\n" - ] + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "90e303b1-09fe-4546-b4f1-b61b726d9521", + "showTitle": false, + "title": "" } - ], + }, + "outputs": [], "source": [ "blockdf_lvl1.show(4)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "2fffc641-a2cd-4785-a5f0-b04b4e46c9b2", + "showTitle": false, + "title": "" + } + }, "source": [ "The headers in the *X1* block matrix are derived from a combination of the source block in *X0*, the alpha value used in fitting the ridge model, and the label they were fit with. These headers are assigned to header blocks that correspond to the chromosome of the source block in *X0*." 
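For readers following along, one convenient way to inspect a single phenotype/alpha slice of *X1* is the same Spark filter idiom the notebook uses later for LOCO; this is an optional sketch, not a step in the tutorial (the column names match the schema described above):

    # Peek at the level 1 headers for one label and one alpha value
    blockdf_lvl1.filter("label = 'sim100' AND alpha = 'alpha_0'") \
                .select('header', 'header_block', 'sample_block') \
                .show(4, False)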
] }, { "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+----------------------------------+------------+\n", - "|header |header_block|\n", - "+----------------------------------+------------+\n", - "|chr_3_block_8_alpha_0_label_sim100|chr_3 |\n", - "|chr_3_block_8_alpha_1_label_sim100|chr_3 |\n", - "|chr_3_block_8_alpha_2_label_sim100|chr_3 |\n", - "|chr_3_block_8_alpha_3_label_sim100|chr_3 |\n", - "+----------------------------------+------------+\n", - "only showing top 4 rows\n", - "\n" - ] + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "5571df67-1419-40a8-b700-d8e63b8c3486", + "showTitle": false, + "title": "" } - ], + }, + "outputs": [], "source": [ "blockdf_lvl1.select('header', 'header_block').show(4, False)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "47174c5e-3dfd-453a-89d4-96276974a4fb", + "showTitle": false, + "title": "" + } + }, "source": [ "#### Regression fitting\n", "The block matrix *X1* can be used to fit a final predictive model that can generate phenotype predictions *y_hat* using the `RidgeRegression` class. As with the `RidgeReducer` class, this class is initialized with a list of alpha values." @@ -444,8 +458,16 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "5e0bcef0-4e89-4390-8a05-b5c3495d1846", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "alphas_lvl1 = np.logspace(1, 4, 10)\n", @@ -454,8 +476,16 @@ }, { "cell_type": "code", - "execution_count": 16, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "e32c5923-dfcd-44e3-bebd-5920cddf3c53", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "modeldf_lvl1_est, cvdf_lvl1 = estimator_lvl1.fit(blockdf_lvl1, labeldf, indexdf)" @@ -463,47 +493,62 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "6e5936c1-2770-495f-b50c-dfd3f535a96d", + "showTitle": false, + "title": "" + } + }, "source": [ "The `RidgeRegression.fit(blockdf, labeldf, indexdf)` works in much the same way as the `RidgeReducer.fit(blockdf, labeldf, indexdf)` method, except that it returns two DataFrames:" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f6c42ccd-843d-4011-a1c9-6105dc996900", + "showTitle": false, + "title": "" + } + }, "source": [ "A model DataFrame analogous to the model DataFrame provided by the `RidgeReducer`. An important difference is that the header block ID for all rows will be 'all', indicating that all headers from all blocks have been used in a single fit, rather than fitting within blocks." 
] }, { "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+------------+------------+--------------------+--------+--------------------+--------------------+--------------------+\n", - "|header_block|sample_block| header|position| alphas| labels| coefficients|\n", - "+------------+------------+--------------------+--------+--------------------+--------------------+--------------------+\n", - "| all| 1|chr_1_block_0_alp...| 0|[alpha_0, alpha_1...|[sim16, sim16, si...|[0.02787784249249...|\n", - "| all| 1|chr_2_block_0_alp...| 0|[alpha_0, alpha_1...|[sim16, sim16, si...|[-0.0164002560049...|\n", - "| all| 1|chr_3_block_0_alp...| 0|[alpha_0, alpha_1...|[sim16, sim16, si...|[-0.0234168451974...|\n", - "| all| 1|chr_1_block_0_alp...| 1|[alpha_0, alpha_1...|[sim16, sim16, si...|[0.00381390574280...|\n", - "+------------+------------+--------------------+--------+--------------------+--------------------+--------------------+\n", - "only showing top 4 rows\n", - "\n" - ] + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f9f9a80c-b2ed-4c46-81e5-17143eb3c017", + "showTitle": false, + "title": "" } - ], + }, + "outputs": [], "source": [ "modeldf_lvl1_est.show(4)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "9c90c408-8dbc-4c8b-a3b7-881cba82f5a9", + "showTitle": false, + "title": "" + } + }, "source": [ "A cross validation (cv) report DataFrame, which reports the results of the hyperparameter (i.e., alpha) value optimization routine.\n", "* label: This is the label corresponding to the cross cv results on the row.\n", @@ -513,32 +558,32 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+------+-------+--------------------+\n", - "| label| alpha| r2_mean|\n", - "+------+-------+--------------------+\n", - "| sim92|alpha_5| 0.18389799898047948|\n", - "| sim16|alpha_8|-0.22499071350515992|\n", - "| sim58|alpha_6|-0.02504464471643515|\n", - "|sim100|alpha_5| 0.2566748993770534|\n", - "+------+-------+--------------------+\n", - "\n" - ] + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "8a93813f-00e2-4012-9687-d54f2ad84db8", + "showTitle": false, + "title": "" } - ], + }, + "outputs": [], "source": [ "cvdf_lvl1.show(4)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "8b84d440-8d54-4ad9-9ac7-0520cc9d13a8", + "showTitle": false, + "title": "" + } + }, "source": [ "#### Producing phenotype predictions *y_hat*\n", "After fitting the `RidgeRegression` model, the model DataFrame and cv DataFrame are used to apply the model to the block matrix DataFrame to produce predictions (*y_hat*) for each label in each sample block using the `RidgeRegression.transform(blockdf, labeldf, modeldf, cvdf)` method" @@ -546,8 +591,16 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "2fe19547-76ea-4302-af02-d9cadc0d87ae", + "showTitle": false, + "title": "" + } + }, "outputs": [], 
"source": [ "y_hat_lvl1 = estimator_lvl1.transform(blockdf_lvl1, labeldf, modeldf_lvl1_est, cvdf_lvl1)" @@ -555,7 +608,15 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f163d829-d752-4258-882b-dd0b1d7f9349", + "showTitle": false, + "title": "" + } + }, "source": [ "The resulting *y_hat* DataFrame has the following fields:\n", "* sample_block: The sample block ID for the samples corresponding to the *y_hat* values on this row.\n", @@ -566,33 +627,32 @@ }, { "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+------------+-----+-------+--------------------+\n", - "|sample_block|label| alpha| values|\n", - "+------------+-----+-------+--------------------+\n", - "| 1|sim16|alpha_8|[0.08461773658136...|\n", - "| 4|sim16|alpha_8|[0.08343907935865...|\n", - "| 7|sim16|alpha_8|[-0.0976335915514...|\n", - "| 8|sim16|alpha_8|[-0.0461222342349...|\n", - "+------------+-----+-------+--------------------+\n", - "only showing top 4 rows\n", - "\n" - ] + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "3bc9c2bc-ca22-4791-972f-0c0ae6d6b7e7", + "showTitle": false, + "title": "" } - ], + }, + "outputs": [], "source": [ "y_hat_lvl1.show(4)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "52f548da-497e-45d3-97b4-aa7b41763cde", + "showTitle": false, + "title": "" + } + }, "source": [ "#### Fitting a second round of ridge reduction instead of ridge regression\n", "After fitting the first ridge reduction step and producing *X1* from *X0*, we can go directly into fitting the final ridge regression model, as we have just seen. Alternatively, we can fit a second round of ridge reduction to squeeze *X1* into an even smaller feature matrix, which we will call the **level 2** matrix *X2*. This has some advantages when it comes to generating the leave-one-chromosome-out versions of the *y_hat*s and does not come at much additional cost. 
The procedure for fitting the second round of ridge reduction is identical to the first (we will reuse the same alphas we chose for the ridge regression fit above):" @@ -600,8 +660,16 @@ }, { "cell_type": "code", - "execution_count": 21, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "0f2efd36-54b7-4699-98a9-fdebdaee18ac", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "stack_lvl1 = RidgeReducer(alphas_lvl1)" @@ -609,8 +677,16 @@ }, { "cell_type": "code", - "execution_count": 22, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "9b15c1b0-c22a-4909-858e-c550d0bebef0", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "modeldf_lvl1 = stack_lvl1.fit(blockdf_lvl1, labeldf, indexdf)\n", @@ -619,81 +695,95 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "619e9d61-0458-40a2-92bf-0fe7bc86c088", + "showTitle": false, + "title": "" + } + }, "source": [ "The **level 2** block matrix DataFrame produced here has an identical schema to the **level 1** block matrix. A key difference is that the header block ID for all headers is now \"all\" for all headers, indicating that there are now no more blocks to collapse." ] }, { "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+--------------------+----+--------------------+------------+------------+--------+--------------------+-------------------+-------+-----+\n", - "| header|size| values|header_block|sample_block|position| mu| sig| alpha|label|\n", - "+--------------------+----+--------------------+------------+------------+--------+--------------------+-------------------+-------+-----+\n", - "|all_block_1_alpha...| 13|[-0.0796642628265...| all| 9| 10|-1.49453099468771...| 0.4138556129030118|alpha_0|sim16|\n", - "|all_block_1_alpha...| 13|[-0.0957216101643...| all| 9| 11| 0.0|0.40982580962691184|alpha_1|sim16|\n", - "|all_block_1_alpha...| 13|[-0.1070572367031...| all| 9| 12|5.124106267500723...|0.40028561461234197|alpha_2|sim16|\n", - "|all_block_1_alpha...| 13|[-0.1163419313886...| all| 9| 13|1.708035422500241...| 0.3798792747100316|alpha_3|sim16|\n", - "+--------------------+----+--------------------+------------+------------+--------+--------------------+-------------------+-------+-----+\n", - "only showing top 4 rows\n", - "\n" - ] + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "91e7f3d8-c503-4ecf-a8de-97eef8129edc", + "showTitle": false, + "title": "" } - ], + }, + "outputs": [], "source": [ "blockdf_lvl2.show(4)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "a4016386-8851-4c64-9b4c-13523759f652", + "showTitle": false, + "title": "" + } + }, "source": [ "The headers for each column now follow the name convention 'all_block_B_alpha_A_label_L', which refer to the ridge model prediction using alpha A and for label L fit using the features from header block B from block matrix *X1*. Since the blocks in *X1* refer to chromosomes, the block number B here can be interpreted as a chromosome. 
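A small, hypothetical helper (not part of the notebook) makes that interpretation explicit by pulling the source block number out of a level 2 header such as 'all_block_1_alpha_0_label_sim16':

    import re

    def source_block(header):
        # Return the block (chromosome) number encoded in an X2 header, or None
        match = re.match(r'all_block_(\d+)_alpha_\d+_label_.+', header)
        return match.group(1) if match else None

    assert source_block('all_block_1_alpha_0_label_sim16') == '1'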
The 'all' token reflects the fact that we are not assigning the columns in *X2* to any new blocks (i.e, *X2* only has sample blocks, but there is only one header block which encompasses the entire matrix)." ] }, { "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+-------------------------------+\n", - "|header |\n", - "+-------------------------------+\n", - "|all_block_1_alpha_0_label_sim16|\n", - "|all_block_1_alpha_1_label_sim16|\n", - "|all_block_1_alpha_2_label_sim16|\n", - "|all_block_1_alpha_3_label_sim16|\n", - "+-------------------------------+\n", - "only showing top 4 rows\n", - "\n" - ] + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "507eb550-5afe-4f9f-ad41-0935c08fba48", + "showTitle": false, + "title": "" } - ], + }, + "outputs": [], "source": [ "blockdf_lvl2.select('header').show(4, False)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "8c6ab06b-8dc4-4231-985b-9c1101196b92", + "showTitle": false, + "title": "" + } + }, "source": [ "We can now fit a ridge regression model as we did above, except that we will use the matrix *X2* instead of *X1*" ] }, { "cell_type": "code", - "execution_count": 25, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "c6dbbd53-a209-419f-9cbd-5bc568b3f987", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "alphas_lvl2 = np.logspace(0, 3, 10)\n", @@ -702,8 +792,16 @@ }, { "cell_type": "code", - "execution_count": 26, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f41eea9c-4fa9-4266-b860-27d5d551701b", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "modeldf_lvl2_est, cvdf_lvl2 = estimator_lvl2.fit(blockdf_lvl2, labeldf, indexdf)" @@ -711,59 +809,50 @@ }, { "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+------------+------------+--------------------+--------+--------------------+--------------------+--------------------+\n", - "|header_block|sample_block| header|position| alphas| labels| coefficients|\n", - "+------------+------------+--------------------+--------+--------------------+--------------------+--------------------+\n", - "| all| 1|all_block_1_alpha...| 10|[alpha_0, alpha_1...|[sim16, sim16, si...|[-0.0939792871878...|\n", - "| all| 1|all_block_1_alpha...| 11|[alpha_0, alpha_1...|[sim16, sim16, si...|[-0.0788683759104...|\n", - "| all| 1|all_block_1_alpha...| 12|[alpha_0, alpha_1...|[sim16, sim16, si...|[-0.0693010949556...|\n", - "| all| 1|all_block_1_alpha...| 13|[alpha_0, alpha_1...|[sim16, sim16, si...|[-0.0446945065691...|\n", - "+------------+------------+--------------------+--------+--------------------+--------------------+--------------------+\n", - "only showing top 4 rows\n", - "\n" - ] + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "ea0c3e52-bd80-4c3b-9c4e-8fd33443a5ae", + "showTitle": false, + "title": "" } - ], + }, + "outputs": [], "source": [ "modeldf_lvl2_est.show(4)" ] }, { "cell_type": 
"code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+------+-------+--------------------+\n", - "| label| alpha| r2_mean|\n", - "+------+-------+--------------------+\n", - "| sim92|alpha_7| 0.199251090828654|\n", - "| sim16|alpha_9|-0.22903758326079596|\n", - "| sim58|alpha_7|0.005461670993813417|\n", - "|sim100|alpha_8| 0.2314559298409073|\n", - "+------+-------+--------------------+\n", - "\n" - ] + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "6d746257-9d90-4aea-857f-d3a6d02d1887", + "showTitle": false, + "title": "" } - ], + }, + "outputs": [], "source": [ "cvdf_lvl2.show(4)" ] }, { "cell_type": "code", - "execution_count": 29, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "93746bb0-6a6c-42c1-8b6c-b40c7934008d", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "y_hat_lvl2 = estimator_lvl2.transform(blockdf_lvl2, labeldf, modeldf_lvl2_est, cvdf_lvl2)" @@ -771,41 +860,48 @@ }, { "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+------------+-----+-------+--------------------+\n", - "|sample_block|label| alpha| values|\n", - "+------------+-----+-------+--------------------+\n", - "| 9|sim58|alpha_7|[-0.2126330471314...|\n", - "| 6|sim58|alpha_7|[0.18042213283121...|\n", - "| 5|sim58|alpha_7|[-0.0126226427178...|\n", - "| 2|sim58|alpha_7|[0.00871975701462...|\n", - "+------------+-----+-------+--------------------+\n", - "only showing top 4 rows\n", - "\n" - ] + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "29fac9be-5092-4e08-a5ab-307536fbbf72", + "showTitle": false, + "title": "" } - ], + }, + "outputs": [], "source": [ "y_hat_lvl2.show(4)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "a0304841-6dfc-416d-902b-ea9c41a9ebc8", + "showTitle": false, + "title": "" + } + }, "source": [ "For producing the LOCO versions of the *y_hat* vectors, it is only necessary to filter out rows from `blockdf_lvl2` corresponding to the chromosome we wish to drop before applying the transformation. 
For example, if we wanted to produce *y_hat* with chromosome 1 left out (recall that the chromosomes constitute the source blocks for the headers in `blockdf_lvl2`, so headers from chromosome 1 will have headers like %block_1%):" ] }, { "cell_type": "code", - "execution_count": 31, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "4e5d9970-6d50-4f85-9ae3-dc0016bbca5d", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [ "y_hat_lvl2_loco1 = estimator_lvl2.transform(blockdf_lvl2.filter(f'header NOT LIKE \"%block_1%\"'), labeldf, modeldf_lvl2_est, cvdf_lvl2)" @@ -813,39 +909,46 @@ }, { "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+------------+-----+-------+--------------------+\n", - "|sample_block|label| alpha| values|\n", - "+------------+-----+-------+--------------------+\n", - "| 9|sim58|alpha_7|[-0.1347024836295...|\n", - "| 6|sim58|alpha_7|[0.20213653390706...|\n", - "| 5|sim58|alpha_7|[-0.1602333580401...|\n", - "| 2|sim58|alpha_7|[-0.1511874717623...|\n", - "+------------+-----+-------+--------------------+\n", - "only showing top 4 rows\n", - "\n" - ] + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "4bf01e2b-fad0-494c-9dd9-a199bf04ebe3", + "showTitle": false, + "title": "" } - ], + }, + "outputs": [], "source": [ "y_hat_lvl2_loco1.show(4)" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "43c76cb0-28d8-40b0-86f2-5c1a3da9c210", + "showTitle": false, + "title": "" + } + }, "outputs": [], "source": [] } ], "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": {}, + "notebookName": "levels_ridge_regression_tutorial", + "notebookOrigID": 2963708295114367, + "widgets": {} + }, "kernelspec": { "display_name": "glow", "language": "python", @@ -865,5 +968,5 @@ } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 0 } From b9eaeeed18859373e68c3b005b8e5dc5b8aa196c Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 26 Jun 2024 10:28:38 -0500 Subject: [PATCH 23/27] Update tests.yml --- .github/workflows/tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index e5eb28e56..474bf4cc2 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -54,6 +54,7 @@ jobs: - name: Check docs links run: (cd docs && make linkcheck) + continue-on-error: true spark-tests: From 65984c9f94a49b5a3574baa3c89b353bfa0bf6e2 Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 26 Jun 2024 10:57:17 -0500 Subject: [PATCH 24/27] Scala 2.12.15 -> 2.12.19 Push 2.12.15 to 2.12.19 - because most the libs NO LONGER SUPPORT 2.12.15! 
--- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 474bf4cc2..4e0ab35c1 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -66,7 +66,7 @@ jobs: matrix: spark_version: [3.4.1, 3.5.1] - scala_version: [2.12.15] + scala_version: [2.12.19] env: SPARK_VERSION: ${{ matrix.spark_version }} SCALA_VERSION: ${{ matrix.scala_version }} From 2c8dfb04185ac02091776e815e002da87de6ea60 Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 26 Jun 2024 10:58:38 -0500 Subject: [PATCH 25/27] 2.12.15 -> 2.12.19 Push scala 2.12.15 -> 2.12.19 --- .github/workflows/staging-release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/staging-release.yml b/.github/workflows/staging-release.yml index 2f7fac17f..996cf405d 100644 --- a/.github/workflows/staging-release.yml +++ b/.github/workflows/staging-release.yml @@ -10,7 +10,7 @@ on: default: "3.5.1" scala-version: description: "Scala version to use when building Glow" - default: "2.12.15" + default: "2.12.19" java-version: description: "Java version to use when building Glow" default: "8" From e2b3a1039a185b3400dd3f8cb0c363d5624d2db9 Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 26 Jun 2024 11:01:05 -0500 Subject: [PATCH 26/27] Update dependabot.yml to include a0x8o as an assignee --- .github/dependabot.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index b02493f32..08a182457 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -10,4 +10,5 @@ updates: patterns: - "*" assignees: - - "kermany" \ No newline at end of file + - "kermany" + - "a0x8o" From 945a4b9046d0882e910e544539fbbae21a0d19ca Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 26 Jun 2024 11:13:26 -0500 Subject: [PATCH 27/27] Rename noteook-tests.yml to notebook-tests.yml --- .github/workflows/{noteook-tests.yml => notebook-tests.yml} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename .github/workflows/{noteook-tests.yml => notebook-tests.yml} (96%) diff --git a/.github/workflows/noteook-tests.yml b/.github/workflows/notebook-tests.yml similarity index 96% rename from .github/workflows/noteook-tests.yml rename to .github/workflows/notebook-tests.yml index b1f62dceb..5dd26efa5 100644 --- a/.github/workflows/noteook-tests.yml +++ b/.github/workflows/notebook-tests.yml @@ -71,4 +71,4 @@ jobs: - name: Run all notebook tests run: | - python docs/dev/run-nb-test.py --cli-profile docs-ci \ No newline at end of file + python docs/dev/run-nb-test.py --cli-profile docs-ci