projectglow · henrydavidge · Jun 22, 2020 · May 15, 2020 · May 19, 2020 · May 19, 2020
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -91,6 +91,7 @@ jobs:
             sbt coverage core/test coverageReport exit
       - run:
           name: Run Python tests
+          no_output_timeout: 30m
           environment:
           command: |
             export PATH=$HOME/conda/envs/glow/bin:$PATH
@@ -133,6 +134,7 @@ jobs:
             sbt core/test exit
       - run:
           name: Run Python tests
+          no_output_timeout: 30m
           environment:
           command: |
             export PATH=$HOME/conda/envs/glow/bin:$PATH
@@ -175,6 +177,7 @@ jobs:
             sbt core/test exit
       - run:
           name: Run Python tests
+          no_output_timeout: 30m
           environment:
           command: |
             export PATH=$HOME/conda/envs/glow/bin:$PATH

diff --git a/.gitignore b/.gitignore
@@ -18,6 +18,9 @@ maven-repo/
 **/__pycache__
 *.pyc
 
+# Jupyter notebook checkpoints
+.ipynb_checkpoints/
+
 # Sphinx documentation
 docs/build
 

diff --git a/bin/spark-submit b/bin/spark-submit
@@ -2,5 +2,5 @@
 
 # A simple wrapper around the SparkSubmit main class that allows us to run
 # PySpark unit tests with the same classpath as our Java tests.
-HEAPSIZE=${SPARK_MEMORY:-2g}
+HEAPSIZE=${SPARK_MEMORY:-1024m}
 java -Xmx"$HEAPSIZE" -cp "$SPARK_CLASSPATH" org.apache.spark.deploy.SparkSubmit "$@"
diff --git a/build.sbt b/build.sbt
@@ -205,7 +205,7 @@ lazy val pythonSettings = Seq(
     val env = if (majorMinorVersion(sparkVersion) >= "3.0") {
       baseEnv :+ "PYSPARK_ROW_FIELD_SORTING_ENABLED" -> "true"
     } else {
-      baseEnv
+      baseEnv :+ "ARROW_PRE_0_15_IPC_FORMAT" -> "1"
     }
     val ret = Process(
       Seq("pytest") ++ args,

diff --git a/conftest.py b/conftest.py
@@ -1,11 +1,28 @@
+# Copyright 2019 The Glow Authors
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
 from pyspark.sql import SparkSession
 import pytest
 
 # Set up a new Spark session for each test suite
 @pytest.fixture(scope="module")
 def spark():
+    """Return a new Spark session for the requesting test module.
+
+    The base session is built with master ``local[2]``, registers
+    ``io.projectglow.sql.util.BGZFCodec`` under ``spark.hadoop.io.compression.codecs`` and
+    sets ``spark.ui.enabled`` to ``false``. It is obtained with ``getOrCreate()``, and the
+    fixture hands back ``newSession()`` of it rather than the base session itself.
+    """
+    # Marker written to stdout each time the fixture body runs.
+    print("set up new spark session")
     sess = SparkSession.builder \
         .master("local[2]") \
         .config("spark.hadoop.io.compression.codecs", "io.projectglow.sql.util.BGZFCodec") \
+        .config("spark.ui.enabled", "false") \
         .getOrCreate()
     return sess.newSession()
diff --git a/core/src/main/resources/META-INF/services/io.projectglow.DataFrameTransformer b/core/src/main/resources/META-INF/services/io.projectglow.DataFrameTransformer
@@ -1,4 +1,5 @@
 io.projectglow.transformers.LiftOverVariantsTransformer
+io.projectglow.transformers.blockvariantsandsamples.BlockVariantsAndSamplesTransformer
 io.projectglow.transformers.normalizevariants.NormalizeVariantsTransformer
 io.projectglow.transformers.splitmultiallelics.SplitMultiallelicsTransformer
 io.projectglow.transformers.pipe.PipeTransformer

diff --git a/core/src/main/scala/io/projectglow/common/schemas.scala b/core/src/main/scala/io/projectglow/common/schemas.scala
@@ -140,6 +140,16 @@ object VariantSchemas {
   def plinkSchema(hasSampleIds: Boolean): StructType = {
     StructType(plinkBaseSchema :+ plinkGenotypeSchema(hasSampleIds))
   }
+
+  // BlockedGT Fields
+  // These are the columns that VariantSampleBlockMaker.makeVariantAndSampleBlocks selects as
+  // its output, in this order.
+
+  // String label of a row; VariantSampleBlockMaker builds it by joining the contig name, start,
+  // reference allele and alternate alleles columns with ":".
+  val headerField = StructField("header", StringType)
+  // Length of the values array after it has been sliced down to one sample block.
+  val sizeField = StructField("size", IntegerType)
+  // Array of doubles; VariantSampleBlockMaker expects it on the input and slices it per
+  // sample block.
+  val valuesField = StructField("values", ArrayType(DoubleType))
+  // Id of the variant block, built as "chr_<contig name>_block_<block index>".
+  val headerBlockIdField = StructField("header_block", StringType)
+  // Id of the sample block: the integers 1..sample_block_count, cast to strings.
+  val sampleBlockIdField = StructField("sample_block", StringType)
+  // NOTE(review): declared LongType here, but VariantSampleBlockMaker fills this column with
+  // the start column cast to IntegerType -- confirm which type is intended.
+  val sortKeyField = StructField("sort_key", LongType)
+  // Mean of the values array, taken from array_summary_stats before sample blocking.
+  val meanField = StructField("mu", DoubleType)
+  // Standard deviation of the values array, taken from the same array_summary_stats result.
+  val stdDevField = StructField("sig", DoubleType)
 }
 
 object FeatureSchemas {

diff --git a/...projectglow/transformers/blockvariantsandsamples/BlockVariantsAndSamplesTransformer.scala b/...projectglow/transformers/blockvariantsandsamples/BlockVariantsAndSamplesTransformer.scala
@@ -0,0 +1,58 @@
+/*
+ * Copyright 2019 The Glow Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.projectglow.transformers.blockvariantsandsamples
+
+import io.projectglow.DataFrameTransformer
+import io.projectglow.common.logging.HlsUsageLogging
+
+import org.apache.spark.sql.DataFrame
+
+/**
+ * [[DataFrameTransformer]] named `block_variants_and_samples` that transforms the input
+ * DataFrame of variants into the Blocked GT DataFrame for WGR use. The work is delegated to
+ * `VariantSampleBlockMaker.makeVariantAndSampleBlocks`.
+ *
+ * Both options are required and must be positive integers:
+ *  - `variants_per_block`
+ *  - `sample_block_count`
+ */
+class BlockVariantsAndSamplesTransformer extends DataFrameTransformer with HlsUsageLogging {
+
+  import BlockVariantsAndSamplesTransformer._
+
+  override def name: String = TRANSFORMER_NAME
+
+  /**
+   * Blocks `df` by variant and by sample.
+   *
+   * @param df input DataFrame of variants
+   * @param options must contain `variants_per_block` and `sample_block_count`
+   * @return the DataFrame produced by `VariantSampleBlockMaker.makeVariantAndSampleBlocks`
+   * @throws IllegalArgumentException if an option is missing, is not an integer, or is not
+   *                                  strictly positive
+   */
+  override def transform(df: DataFrame, options: Map[String, String]): DataFrame = {
+    val variantsPerBlock = positiveIntegerOption(options, VARIANTS_PER_BLOCK)
+    val sampleBlockCount = positiveIntegerOption(options, SAMPLE_BLOCK_COUNT)
+
+    VariantSampleBlockMaker.makeVariantAndSampleBlocks(df, variantsPerBlock, sampleBlockCount)
+  }
+
+  // Parses the option with the shared validator and then rejects zero and negative values:
+  // both options are used as divisors when the blocks are built, so a non-positive value
+  // cannot produce meaningful blocks.
+  private def positiveIntegerOption(options: Map[String, String], optionName: String): Int = {
+    val value = validateIntegerOption(options, optionName)
+    require(value > 0, s"$optionName must be a positive integer, but got $value")
+    value
+  }
+}
+
+/** Transformer name, option keys and option parsing for [[BlockVariantsAndSamplesTransformer]]. */
+object BlockVariantsAndSamplesTransformer {
+  val TRANSFORMER_NAME = "block_variants_and_samples"
+  val VARIANTS_PER_BLOCK = "variants_per_block"
+  val SAMPLE_BLOCK_COUNT = "sample_block_count"
+
+  /**
+   * Looks up `optionName` in `options` and parses its value as an `Int`.
+   *
+   * @param options the option map passed to the transformer
+   * @param optionName key of the option to read
+   * @return the parsed integer value
+   * @throws IllegalArgumentException if the option is absent or its value is not an integer;
+   *                                  for unparsable values the `NumberFormatException` is
+   *                                  kept as the cause
+   */
+  def validateIntegerOption(options: Map[String, String], optionName: String): Int = {
+    val message = s"$optionName is not provided or cannot be cast as an integer!"
+    options.get(optionName) match {
+      case None =>
+        throw new IllegalArgumentException(message)
+      case Some(rawValue) =>
+        // Only the parse failure is translated; anything else (for example a fatal JVM error)
+        // propagates unchanged instead of being reported as a bad option.
+        try {
+          rawValue.toInt
+        } catch {
+          case e: NumberFormatException => throw new IllegalArgumentException(message, e)
+        }
+    }
+  }
+}
diff --git a/...n/scala/io/projectglow/transformers/blockvariantsandsamples/VariantSampleBlockMaker.scala b/...n/scala/io/projectglow/transformers/blockvariantsandsamples/VariantSampleBlockMaker.scala
@@ -0,0 +1,124 @@
+/*
+ * Copyright 2019 The Glow Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.projectglow.transformers.blockvariantsandsamples
+
+import io.projectglow.common.GlowLogging
+import io.projectglow.common.VariantSchemas._
+import io.projectglow.functions._
+
+import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.expressions.Window
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType}
+
+/**
+ * Helpers that turn a DataFrame of variants into the blocked DataFrame returned by
+ * `BlockVariantsAndSamplesTransformer`: every input row is labelled with a variant ("header")
+ * block and split into one row per sample block.
+ */
+private[projectglow] object VariantSampleBlockMaker extends GlowLogging {
+
+  /**
+   * Splits every row of `df` into `sampleBlockCount` rows, one per sample block.
+   *
+   * Each output row carries a sample block id (the integers 1 to `sampleBlockCount`, cast to
+   * strings) and the slice of the values array assigned to that block. Slice boundaries are
+   * rounded multiples of `size(values) / sampleBlockCount`, so the blocks need not all have
+   * the same length.
+   *
+   * The helper column `fractionalSampleBlockSize` is added along the way and is still present
+   * in the returned DataFrame.
+   *
+   * @param df DataFrame containing the values array column
+   * @param sampleBlockCount number of sample blocks created per input row
+   * @return `df` exploded by sample block, with the values column replaced by the block's slice
+   */
+  def makeSampleBlocks(df: DataFrame, sampleBlockCount: Int): DataFrame = {
+    val valuesName = valuesField.name
+    val blockIdName = sampleBlockIdField.name
+
+    // Number of values per sample block; not necessarily a whole number.
+    val fractionalBlockSize = size(col(valuesName)) / sampleBlockCount
+
+    // Exploding the id sequence yields one output row per (input row, sample block id) pair.
+    val blockIds = explode(sequence(lit(1), lit(sampleBlockCount)).cast(ArrayType(StringType)))
+
+    // For block id i the slice is called with
+    //   start  = round((i - 1) * blockSize) + 1
+    //   length = round(i * blockSize) - round((i - 1) * blockSize)
+    val blockValues = expr(
+      s"""slice(
+         |   ${valuesName},
+         |   round((${blockIdName} - 1) * fractionalSampleBlockSize) + 1,
+         |   round(${blockIdName} * fractionalSampleBlockSize) - round((${blockIdName} - 1) * fractionalSampleBlockSize)
+         |)""".stripMargin
+    )
+
+    df.withColumn("fractionalSampleBlockSize", fractionalBlockSize)
+      .withColumn(blockIdName, blockIds)
+      .withColumn(valuesName, blockValues)
+  }
+
+  /**
+   * Builds the blocked DataFrame from a DataFrame of variants.
+   *
+   * Steps, in order:
+   *  1. add the sort key (the start column cast to `IntegerType`) and the header string (contig
+   *     name, start, reference allele and alternate alleles joined with ":");
+   *  1. add the mean and standard deviation of the whole values array, read from
+   *     `array_summary_stats`;
+   *  1. split every row into sample blocks with [[makeSampleBlocks]];
+   *  1. record the length of each sliced values array;
+   *  1. assign the header block id `chr_<contig name>_block_<n>`, where `n` is
+   *     `(row number - 1) / variantsPerBlock` cast to `IntegerType` and row numbers are counted
+   *     per (contig name, sample block), ordered by start, reference allele and alternate alleles.
+   *
+   * NOTE(review): `sortKeyField` is declared as `LongType` in `VariantSchemas`, while the column
+   * is filled with an `IntegerType` cast here -- confirm which type is intended.
+   *
+   * @param variantDf DataFrame of variants that also contains the values array column
+   * @param variantsPerBlock divisor applied to the zero-based row number to get the block index
+   * @param sampleBlockCount number of sample blocks each row is split into
+   * @return DataFrame with exactly the columns header, size, values, header_block,
+   *         sample_block, sort_key, mu and sig
+   */
+  def makeVariantAndSampleBlocks(
+      variantDf: DataFrame,
+      variantsPerBlock: Int,
+      sampleBlockCount: Int): DataFrame = {
+    val contigCol = col(contigNameField.name)
+    val startCol = col(startField.name)
+
+    val headerLabel = concat_ws(
+      ":",
+      contigCol,
+      startCol,
+      col(refAlleleField.name),
+      col(alternateAllelesField.name)
+    )
+    val summaryStats =
+      subset_struct(array_summary_stats(col(valuesField.name)), "mean", "stdDev")
+
+    // Per-variant annotations; the statistics are computed before the values array is sliced.
+    val annotatedDf = variantDf
+      .withColumn(sortKeyField.name, startCol.cast(IntegerType))
+      .withColumn(headerField.name, headerLabel)
+      .withColumn("stats", summaryStats)
+      .withColumn(meanField.name, col("stats.mean"))
+      .withColumn(stdDevField.name, col("stats.stdDev"))
+
+    // Variants are numbered separately within each (contig, sample block) partition.
+    val variantOrder = Window
+      .partitionBy(contigNameField.name, sampleBlockIdField.name)
+      .orderBy(startField.name, refAlleleField.name, alternateAllelesField.name)
+    val zeroBasedRowNumber = row_number().over(variantOrder) - 1
+    val headerBlockIndex = (zeroBasedRowNumber / variantsPerBlock).cast(IntegerType)
+    val headerBlockId = concat_ws("_", lit("chr"), contigCol, lit("block"), headerBlockIndex)
+
+    val outputColumns = Seq(
+      headerField,
+      sizeField,
+      valuesField,
+      headerBlockIdField,
+      sampleBlockIdField,
+      sortKeyField,
+      meanField,
+      stdDevField
+    ).map(field => col(field.name))
+
+    makeSampleBlocks(annotatedDf, sampleBlockCount)
+      .withColumn(sizeField.name, size(col(valuesField.name)))
+      .withColumn(headerBlockIdField.name, headerBlockId)
+      .select(outputColumns: _*)
+  }
+}