Merged

Changes from all commits (31 commits)
1968d3a - do the big rename (henrydavidge, Oct 8, 2019)
b13014f - Make tests pass (henrydavidge, Oct 8, 2019)
ee08118 - imports (henrydavidge, Oct 8, 2019)
8c8a181 - sg -> glow (henrydavidge, Oct 8, 2019)
2bf6a0a - Trigger CircleCI tests (karenfeng, Oct 9, 2019)
8eb72ba - Trigger CircleCI again (karenfeng, Oct 9, 2019)
ef980d1 - Fix CircleCI config (karenfeng, Oct 9, 2019)
47381b8 - Fix Python dir (karenfeng, Oct 9, 2019)
2e29b02 - Rename datasources (karenfeng, Oct 9, 2019)
5850ad2 - CircleCI wip (karenfeng, Oct 9, 2019)
7eea7c1 - More CircleCI wip (karenfeng, Oct 9, 2019)
7e819fd - Continue Circleci wip (karenfeng, Oct 9, 2019)
0b4ccd1 - WIP (karenfeng, Oct 9, 2019)
0193b06 - Revert last change (karenfeng, Oct 9, 2019)
cb118c8 - Un package-private (karenfeng, Oct 9, 2019)
1b0144b - More un package-private (karenfeng, Oct 9, 2019)
e15370f - Continue un package-private (karenfeng, Oct 9, 2019)
9221069 - More un-package-private (karenfeng, Oct 9, 2019)
21d3289 - Try again (karenfeng, Oct 9, 2019)
a6e7922 - no core (henrydavidge, Oct 9, 2019)
26823c0 - Merge branch 'rename' of github.com:henrydavidge/glow into rename (henrydavidge, Oct 9, 2019)
7cbf6d6 - rename (henrydavidge, Oct 9, 2019)
4506775 - compile (henrydavidge, Oct 9, 2019)
a8a1f72 - fix test (henrydavidge, Oct 9, 2019)
b6b70e9 - fix tests (henrydavidge, Oct 9, 2019)
f5d53e4 - test file (henrydavidge, Oct 9, 2019)
217179d - no tabs (henrydavidge, Oct 9, 2019)
db1923a - less logging (henrydavidge, Oct 10, 2019)
339eefc - update (henrydavidge, Oct 10, 2019)
db728ad - io (henrydavidge, Oct 10, 2019)
88d0e68 - ignore unit tests (henrydavidge, Oct 10, 2019)
11 changes: 7 additions & 4 deletions .circleci/config.yml
@@ -1,17 +1,16 @@
 version: 2.1
 jobs:
   test:
-    working_directory: ~/spark-genomics
+    working_directory: ~/glow
     docker:
       - image: circleci/openjdk:8
     steps:
+      - checkout

       - restore_cache:
           keys:
             - conda-deps-v1-{{ checksum "python/environment.yml" }}

-      - checkout
-
       - run:
           name: install dependencies
           command: |
@@ -28,9 +27,13 @@ jobs:
           name: run tests
           environment:
           command: |
-            export PATH=$HOME/conda/envs/spark-genomics/bin:$PATH
+            export PATH=$HOME/conda/envs/glow/bin:$PATH
             sbt test exit

+      - store_artifacts:
+          path: ~/glow/unit-tests.log
+          destination: unit-tests.log
+
       - save_cache:
           paths:
             - /home/circleci/conda
1 change: 1 addition & 0 deletions .gitignore
@@ -9,3 +9,4 @@ adam.log
 *.pyc
 .DS_Store
 docs/build
+unit-tests.log
2 changes: 1 addition & 1 deletion .scalafmt.conf
@@ -16,4 +16,4 @@ includeCurlyBraceInSelectChains = false
 includeNoParensInSelectChains = true
 importSelectors = singleLine

-rewrite.rules = [PreferCurlyFors, SortImports]
\ No newline at end of file
+rewrite.rules = [PreferCurlyFors, SortImports]
9 changes: 5 additions & 4 deletions build.sbt
@@ -1,12 +1,13 @@
-import Tests._
 import scala.sys.process._

+import sbt.Tests._
+
 val sparkVersion = "2.4.3"
 val scalaMajorMinor = "2.11"

 ThisBuild / scalaVersion := s"$scalaMajorMinor.12"
 ThisBuild / version := "0.1.0-SNAPSHOT"
-ThisBuild / organization := "com.databricks"
+ThisBuild / organization := "org.projectglow"
 ThisBuild / organizationName := "DB / RGC"
 ThisBuild / scalastyleConfig := baseDirectory.value / "scalastyle-config.xml"

@@ -60,7 +61,7 @@ lazy val commonSettings = Seq(
 lazy val core = (project in file("core"))
   .settings(
     commonSettings,
-    name := "spark-genomics",
+    name := "glow",
     libraryDependencies ++= Seq(
       "org.apache.spark" %% "spark-catalyst" % sparkVersion % "provided",
       "org.apache.spark" %% "spark-core" % sparkVersion % "provided",
@@ -105,7 +106,7 @@ lazy val python =
     .dependsOn(core % "test->test")
     .settings(
       unmanagedSourceDirectories in Compile := {
-        Seq(baseDirectory.value / "spark_genomics")
+        Seq(baseDirectory.value / "glow")
      },
       test in Test := {
         // Pass the test classpath to pyspark so that we run the same bits as the Scala tests
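Note on the python project settings above: the test in Test override (truncated here) hands the Scala test classpath to pyspark so both suites exercise the same jars. A minimal sketch of what such wiring typically looks like; the pytest command and the EXT_CLASSPATH variable name are illustrative assumptions, not taken from this PR:

    // Hypothetical sbt task: expose the JVM test classpath to pyspark tests.
    // "pytest" and EXT_CLASSPATH are guesses for illustration only.
    test in Test := {
      val classpath = (fullClasspath in Test).value.files.mkString(":")
      val ret = scala.sys.process.Process(
        Seq("pytest", "python"),
        None,
        "EXT_CLASSPATH" -> classpath).!
      require(ret == 0, s"Python tests failed with exit code $ret")
    }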
@@ -1,9 +1,9 @@
-package com.databricks.hls.sql;
+package io.projectglow.sql;

 import org.apache.spark.sql.catalyst.util.GenericArrayData;
 import org.apache.spark.unsafe.types.UTF8String;

-public class HLSFunctions {
+public class Functions {
   public static GenericArrayData asciiCharSplit(UTF8String str, UTF8String split) {
     java.util.List<UTF8String> output = new java.util.ArrayList<>();
     int start = 0;
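The diff cuts asciiCharSplit off after the loop setup. For orientation only, a sketch of the behavior the signature implies, written in Scala; the semantics (split on a single ASCII character) are inferred from the name rather than confirmed by the visible code:

    import org.apache.spark.sql.catalyst.util.GenericArrayData
    import org.apache.spark.unsafe.types.UTF8String

    object AsciiSplitSketch {
      // Inferred behavior: split `str` on each occurrence of the single
      // character in `split`, returning the pieces as a Catalyst array.
      def asciiCharSplit(str: UTF8String, split: UTF8String): GenericArrayData = {
        val sep = split.toString.charAt(0)
        val pieces = str.toString.split(sep).map(UTF8String.fromString)
        new GenericArrayData(pieces.toArray[Any])
      }
    }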

This file was deleted.

This file was deleted.

This file was deleted.

@@ -0,0 +1,4 @@
+io.projectglow.transformers.LiftOverVariantsTransformer
+io.projectglow.transformers.normalizevariants.NormalizeVariantsTransformer
+io.projectglow.transformers.pipe.PipeTransformer
+io.projectglow.transformers.pipe.CleanupPipeTransformer
@@ -0,0 +1,3 @@
+io.projectglow.transformers.pipe.CSVInputFormatterFactory
+io.projectglow.transformers.pipe.UTF8TextInputFormatterFactory
+io.projectglow.vcf.VCFInputFormatterFactory
@@ -0,0 +1,3 @@
+io.projectglow.transformers.pipe.CSVOutputFormatterFactory
+io.projectglow.transformers.pipe.UTF8TextOutputFormatterFactory
+io.projectglow.vcf.VCFOutputFormatterFactory
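The three files above are java.util.ServiceLoader provider registrations; Glow.scala below imports ServiceLoader and a Named trait to look implementations up by name. A sketch of that discovery pattern, with the Named shape assumed from the import rather than shown in this PR:

    import java.util.ServiceLoader
    import scala.collection.JavaConverters._

    // Assumed provider interface; only the io.projectglow.common.Named
    // import is visible in this diff, not its definition.
    trait Named { def name: String }

    object ProviderRegistry {
      // ServiceLoader reads META-INF/services/<interface> files like the
      // ones added above and instantiates each listed class reflectively.
      def find[A <: Named](iface: Class[A], name: String): Option[A] =
        ServiceLoader.load(iface).iterator().asScala.find(_.name == name)
    }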
@@ -1,11 +1,11 @@
 # Standard file formats
-com.databricks.bgen.BgenFileFormat
-com.databricks.bgen.BigBgenDatasource
-com.databricks.vcf.BigVCFDatasource
-com.databricks.vcf.VCFFileFormat
+io.projectglow.bgen.BgenFileFormat
+io.projectglow.bgen.BigBgenDatasource
+io.projectglow.vcf.BigVCFDatasource
+io.projectglow.vcf.VCFFileFormat

 # Legacy file formats
-com.databricks.bgen.ComDatabricksBgenFileFormat
-com.databricks.bgen.ComDatabricksBigBgenDatasource
-com.databricks.vcf.ComDatabricksBigVCFDatasource
-com.databricks.vcf.ComDatabricksVCFFileFormat
+io.projectglow.bgen.ComDatabricksBgenFileFormat
+io.projectglow.bgen.ComDatabricksBigBgenDatasource
+io.projectglow.vcf.ComDatabricksBigVCFDatasource
+io.projectglow.vcf.ComDatabricksVCFFileFormat
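This file is Spark's DataSourceRegister service registration, which lets callers address the formats by short name rather than fully qualified class. A hedged usage sketch: "bgen" matches the shortName override visible in BgenFileFormat below, while "vcf" is an assumption about VCFFileFormat:

    import org.apache.spark.sql.SparkSession

    object ReadSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().appName("glow-read-sketch").getOrCreate()
        // "bgen" is the shortName of BgenFileFormat below; "vcf" is assumed
        // to be registered analogously by VCFFileFormat.
        val bgen = spark.read.format("bgen").load("/tmp/example.bgen")
        val vcf = spark.read.format("vcf").load("/tmp/example.vcf")
        bgen.printSchema()
        vcf.printSchema()
      }
    }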

This file was deleted.

This file was deleted.

@@ -1,13 +1,13 @@
-package com.databricks.hls
+package io.projectglow

 import java.util.ServiceLoader

 import scala.collection.JavaConverters._

 import org.apache.spark.sql.DataFrame

-import com.databricks.hls.common.Named
-import com.databricks.hls.transformers.util.{SnakeCaseMap, StringUtils}
+import io.projectglow.common.Named
+import io.projectglow.transformers.util.{SnakeCaseMap, StringUtils}

 /**
  * The entry point for all language specific functionality, meaning methods that cannot be expressed
@@ -16,7 +16,7 @@ import com.databricks.hls.transformers.util.{SnakeCaseMap, StringUtils}
  * We should expose as little functionality as is necessary through this object and should prefer
  * generic methods with stringly-typed arguments to reduce language-specific maintenance burden.
  */
-object DBGenomics {
+object Glow {

   /**
    * Apply a named transformation to a DataFrame of genomic data. All parameters apart from the
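Given the stringly-typed entry point described in the doc comment, a caller would look roughly like the sketch below. The transform signature, the transformer name, and the option key are assumptions based on the transformers registered earlier in this PR; the SnakeCaseMap import suggests option keys are normalized, so snake_case and camelCase spellings would likely resolve to the same parameter:

    import org.apache.spark.sql.DataFrame
    import io.projectglow.Glow

    object TransformSketch {
      // Hypothetical invocation of a registered transformer by name; the
      // exact signature and option names are not shown in this diff.
      def normalize(df: DataFrame): DataFrame =
        Glow.transform(
          "normalize_variants",
          df,
          Map("reference_genome_path" -> "/tmp/reference.fa"))
    }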
@@ -1,11 +1,11 @@
-package com.databricks.bgen
+package io.projectglow.bgen

 import java.util.{HashMap => JHashMap}

 import org.apache.commons.math3.util.CombinatoricsUtils

 // Tools for calculating ploidy or number of genotypes for unphased posterior probabilities
-private[databricks] object BgenConverterUtils {
+private[projectglow] object BgenConverterUtils {
   var ploidyMap = new JHashMap[(Int, Int), Int] // (numGenotypes, numAlleles) to ploidy
   var genotypesMap = new JHashMap[(Int, Int), Int] // (ploidy, numAlleles) to numGenotypes

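Context for the two caches above: for unphased data, the number of genotypes at a site with ploidy n and k alleles is the multiset count C(n + k - 1, k - 1), which is presumably why CombinatoricsUtils is imported; the maps then memoize that relation and its inverse. A sketch of the forward computation (the cached inverse lookups are this file's code and are not reproduced here):

    import org.apache.commons.math3.util.CombinatoricsUtils

    object GenotypeCountSketch {
      // Unphased genotype count: multisets of size `ploidy` drawn from
      // `numAlleles` alleles, i.e. C(ploidy + numAlleles - 1, numAlleles - 1).
      def numGenotypes(ploidy: Int, numAlleles: Int): Long =
        CombinatoricsUtils.binomialCoefficient(ploidy + numAlleles - 1, numAlleles - 1)

      // Example: a diploid, biallelic site has C(3, 1) = 3 genotypes
      // (0/0, 0/1, 1/1).
    }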
@@ -1,10 +1,10 @@
-package com.databricks.bgen
-
-import scala.collection.JavaConverters._
+package io.projectglow.bgen

 import java.io.{BufferedReader, File, InputStreamReader}
 import java.nio.file.Paths

+import scala.collection.JavaConverters._
+
 import com.google.common.io.LittleEndianDataInputStream
 import com.google.common.util.concurrent.Striped
 import org.apache.hadoop.conf.Configuration
@@ -19,12 +19,11 @@ import org.apache.spark.sql.types.StructType
 import org.skife.jdbi.v2.DBI
 import org.skife.jdbi.v2.util.LongMapper

-import com.databricks.hls.common.logging._
-import com.databricks.hls.common.{HLSLogging, WithUtils}
-import com.databricks.hls.sql.util.SerializableConfiguration
-import com.databricks.sql.ComDatabricksDataSource
+import io.projectglow.common.logging.{HlsMetricDefinitions, HlsTagDefinitions, HlsTagValues, HlsUsageLogging}
+import io.projectglow.common.{GlowLogging, WithUtils}
+import io.projectglow.sql.util.{ComDatabricksDataSource, SerializableConfiguration}

-class BgenFileFormat extends FileFormat with DataSourceRegister with Serializable with HLSLogging {
+class BgenFileFormat extends FileFormat with DataSourceRegister with Serializable with GlowLogging {

   override def shortName(): String = "bgen"

@@ -1,19 +1,17 @@
-package com.databricks.bgen
+package io.projectglow.bgen

 import java.io.{ByteArrayInputStream, DataInput, DataInputStream}
 import java.nio.charset.StandardCharsets
 import java.util.zip.Inflater

-import com.google.common.io
 import com.google.common.io.LittleEndianDataInputStream
 import org.apache.commons.math3.util.CombinatoricsUtils
 import org.apache.hadoop.fs.FSDataInputStream

-import com.databricks.hls.common.HLSLogging
-import com.databricks.vcf.{BgenGenotype, BgenRow, VCFRow}
+import io.projectglow.common.{BgenGenotype, BgenRow, GlowLogging}

 /**
- * Parses variant records of a BGEN file into the [[VCFRow]] schema. The iterator assumes that the
+ * Parses variant records of a BGEN file into the [[io.projectglow.common.VCFRow]] schema. The iterator assumes that the
  * input streams are currently at the beginning of a variant block.
  *
  * The `init` method should be called before reading variants to skip to an appropriate starting
@@ -35,14 +33,14 @@ import com.databricks.vcf.{BgenGenotype, BgenRow, VCFRow}
  * @param maxPos The maximum stream position from which variant blocks can be read. `hasNext` will
  *               return `false` once we've reached this position.
  */
-private[databricks] class BgenFileIterator(
+private[projectglow] class BgenFileIterator(
     metadata: BgenMetadata,
     stream: LittleEndianDataInputStream,
     underlyingStream: FSDataInputStream,
     minPos: Long,
     maxPos: Long)
   extends Iterator[BgenRow]
-  with HLSLogging {
+  with GlowLogging {

   import BgenFileIterator._

@@ -80,7 +78,7 @@ private[databricks] class BgenFileIterator(
     inflater.inflate(uncompressedBytes)

     val rawGenotypeStream = new DataInputStream(new ByteArrayInputStream(uncompressedBytes))
-    val genotypeStream = new io.LittleEndianDataInputStream(rawGenotypeStream)
+    val genotypeStream = new LittleEndianDataInputStream(rawGenotypeStream)
     val genotypes = readGenotypes(nAlleles, genotypeStream, metadata.sampleIds)

     BgenRow(
@@ -287,7 +285,7 @@ private[databricks] class BgenFileIterator(
   }
 }

-private[databricks] object BgenFileIterator {
+private[projectglow] object BgenFileIterator {

   /**
    * Utility function to read a UTF8 string from a data stream. Included in the companion object
@@ -309,7 +307,8 @@ private[databricks] object BgenFileIterator {
  * Read a BGEN header from a data stream. Performs basic validation on the header parameters
  * according to what the reader currently supports.
  */
-private[databricks] class BgenHeaderReader(stream: LittleEndianDataInputStream) extends HLSLogging {
+private[projectglow] class BgenHeaderReader(stream: LittleEndianDataInputStream)
+  extends GlowLogging {

   def readHeader(sampleIdsOpt: Option[Seq[String]] = None): BgenMetadata = {
     val variantOffset = Integer.toUnsignedLong(stream.readInt()) + 4
@@ -381,7 +380,7 @@ private[databricks] class BgenHeaderReader(stream: LittleEndianDataInputStream)
   }
 }

-private[databricks] case class BgenMetadata(
+private[projectglow] case class BgenMetadata(
     firstVariantOffset: Long,
     nSamples: Long,
     nVariantBlocks: Long,
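The class doc comment defines the iterator protocol: construct with a little-endian view over the raw stream, call init to seek to a valid starting point, then iterate between minPos and maxPos. A sketch of a caller, assuming a no-argument init() and ignoring the private[projectglow] scoping for illustration:

    import com.google.common.io.LittleEndianDataInputStream
    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.fs.Path

    object BgenScanSketch {
      def countVariants(path: Path, conf: Configuration): Long = {
        val fs = path.getFileSystem(conf)
        val underlying = fs.open(path) // FSDataInputStream over the file
        val stream = new LittleEndianDataInputStream(underlying)
        val metadata = new BgenHeaderReader(stream).readHeader()
        val it = new BgenFileIterator(
          metadata,
          stream,
          underlying,
          metadata.firstVariantOffset, // minPos: first variant block
          fs.getFileStatus(path).getLen) // maxPos: end of file
        it.init() // assumed no-arg; "should be called before reading variants"
        it.size.toLong
      }
    }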