diff --git a/.circleci/config.yml b/.circleci/config.yml index 1d4458de6..e02464e63 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,17 +1,16 @@ version: 2.1 jobs: test: - working_directory: ~/spark-genomics + working_directory: ~/glow docker: - image: circleci/openjdk:8 steps: + - checkout - restore_cache: keys: - conda-deps-v1-{{ checksum "python/environment.yml" }} - - checkout - - run: name: install dependencies command: | @@ -28,9 +27,13 @@ jobs: name: run tests environment: command: | - export PATH=$HOME/conda/envs/spark-genomics/bin:$PATH + export PATH=$HOME/conda/envs/glow/bin:$PATH sbt test exit + - store_artifacts: + path: ~/glow/unit-tests.log + destination: unit-tests.log + - save_cache: paths: - /home/circleci/conda diff --git a/.gitignore b/.gitignore index d5f68b73d..46d77cff2 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ adam.log *.pyc .DS_Store docs/build +unit-tests.log diff --git a/.scalafmt.conf b/.scalafmt.conf index 444ba506c..08375cf4a 100644 --- a/.scalafmt.conf +++ b/.scalafmt.conf @@ -16,4 +16,4 @@ includeCurlyBraceInSelectChains = false includeNoParensInSelectChains = true importSelectors = singleLine -rewrite.rules = [PreferCurlyFors, SortImports] \ No newline at end of file +rewrite.rules = [PreferCurlyFors, SortImports] diff --git a/build.sbt b/build.sbt index 82a138974..d3fa9b076 100644 --- a/build.sbt +++ b/build.sbt @@ -1,12 +1,13 @@ -import Tests._ import scala.sys.process._ +import sbt.Tests._ + val sparkVersion = "2.4.3" val scalaMajorMinor = "2.11" ThisBuild / scalaVersion := s"$scalaMajorMinor.12" ThisBuild / version := "0.1.0-SNAPSHOT" -ThisBuild / organization := "com.databricks" +ThisBuild / organization := "io.projectglow" ThisBuild / organizationName := "DB / RGC" ThisBuild / scalastyleConfig := baseDirectory.value / "scalastyle-config.xml" @@ -60,7 +61,7 @@ lazy val commonSettings = Seq( lazy val core = (project in file("core")) .settings( commonSettings, - name := "spark-genomics", + name := "glow", libraryDependencies ++= Seq( "org.apache.spark" %% "spark-catalyst" % sparkVersion % "provided", "org.apache.spark" %% "spark-core" % sparkVersion % "provided", @@ -105,7 +106,7 @@ lazy val python = .dependsOn(core % "test->test") .settings( unmanagedSourceDirectories in Compile := { - Seq(baseDirectory.value / "spark_genomics") + Seq(baseDirectory.value / "glow") }, test in Test := { // Pass the test classpath to pyspark so that we run the same bits as the Scala tests diff --git a/core/src/main/java/com/databricks/hls/sql/HLSFunctions.java b/core/src/main/java/io/projectglow/sql/Functions.java similarity index 92% rename from core/src/main/java/com/databricks/hls/sql/HLSFunctions.java rename to core/src/main/java/io/projectglow/sql/Functions.java index aebdf29d9..c84092b80 100644 --- a/core/src/main/java/com/databricks/hls/sql/HLSFunctions.java +++ b/core/src/main/java/io/projectglow/sql/Functions.java @@ -1,9 +1,9 @@ -package com.databricks.hls.sql; +package io.projectglow.sql; import org.apache.spark.sql.catalyst.util.GenericArrayData; import org.apache.spark.unsafe.types.UTF8String; -public class HLSFunctions { +public class Functions { public static GenericArrayData asciiCharSplit(UTF8String str, UTF8String split) { java.util.List output = new java.util.ArrayList<>(); int start = 0; diff --git a/core/src/main/resources/META-INF/services/com.databricks.hls.DataFrameTransformer b/core/src/main/resources/META-INF/services/com.databricks.hls.DataFrameTransformer deleted file mode 100644 index d9ad59a39..000000000 --- 
a/core/src/main/resources/META-INF/services/com.databricks.hls.DataFrameTransformer +++ /dev/null @@ -1,4 +0,0 @@ -com.databricks.hls.transformers.LiftOverVariantsTransformer -com.databricks.hls.transformers.normalizevariants.NormalizeVariantsTransformer -com.databricks.hls.transformers.pipe.PipeTransformer -com.databricks.hls.transformers.pipe.CleanupPipeTransformer diff --git a/core/src/main/resources/META-INF/services/com.databricks.hls.transformers.pipe.InputFormatterFactory b/core/src/main/resources/META-INF/services/com.databricks.hls.transformers.pipe.InputFormatterFactory deleted file mode 100644 index 9c120dd8e..000000000 --- a/core/src/main/resources/META-INF/services/com.databricks.hls.transformers.pipe.InputFormatterFactory +++ /dev/null @@ -1,3 +0,0 @@ -com.databricks.hls.transformers.pipe.CSVInputFormatterFactory -com.databricks.hls.transformers.pipe.UTF8TextInputFormatterFactory -com.databricks.vcf.VCFInputFormatterFactory diff --git a/core/src/main/resources/META-INF/services/com.databricks.hls.transformers.pipe.OutputFormatterFactory b/core/src/main/resources/META-INF/services/com.databricks.hls.transformers.pipe.OutputFormatterFactory deleted file mode 100644 index ee7cb3cbb..000000000 --- a/core/src/main/resources/META-INF/services/com.databricks.hls.transformers.pipe.OutputFormatterFactory +++ /dev/null @@ -1,3 +0,0 @@ -com.databricks.hls.transformers.pipe.CSVOutputFormatterFactory -com.databricks.hls.transformers.pipe.UTF8TextOutputFormatterFactory -com.databricks.vcf.VCFOutputFormatterFactory diff --git a/core/src/main/resources/META-INF/services/io.projectglow.DataFrameTransformer b/core/src/main/resources/META-INF/services/io.projectglow.DataFrameTransformer new file mode 100644 index 000000000..735b5efbc --- /dev/null +++ b/core/src/main/resources/META-INF/services/io.projectglow.DataFrameTransformer @@ -0,0 +1,4 @@ +io.projectglow.transformers.LiftOverVariantsTransformer +io.projectglow.transformers.normalizevariants.NormalizeVariantsTransformer +io.projectglow.transformers.pipe.PipeTransformer +io.projectglow.transformers.pipe.CleanupPipeTransformer diff --git a/core/src/main/resources/META-INF/services/io.projectglow.transformers.pipe.InputFormatterFactory b/core/src/main/resources/META-INF/services/io.projectglow.transformers.pipe.InputFormatterFactory new file mode 100644 index 000000000..eafd30fd9 --- /dev/null +++ b/core/src/main/resources/META-INF/services/io.projectglow.transformers.pipe.InputFormatterFactory @@ -0,0 +1,3 @@ +io.projectglow.transformers.pipe.CSVInputFormatterFactory +io.projectglow.transformers.pipe.UTF8TextInputFormatterFactory +io.projectglow.vcf.VCFInputFormatterFactory diff --git a/core/src/main/resources/META-INF/services/io.projectglow.transformers.pipe.OutputFormatterFactory b/core/src/main/resources/META-INF/services/io.projectglow.transformers.pipe.OutputFormatterFactory new file mode 100644 index 000000000..f71bf051a --- /dev/null +++ b/core/src/main/resources/META-INF/services/io.projectglow.transformers.pipe.OutputFormatterFactory @@ -0,0 +1,3 @@ +io.projectglow.transformers.pipe.CSVOutputFormatterFactory +io.projectglow.transformers.pipe.UTF8TextOutputFormatterFactory +io.projectglow.vcf.VCFOutputFormatterFactory diff --git a/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister b/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister index f3d8a7048..21efaeedd 100644 --- 
a/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister +++ b/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister @@ -1,11 +1,11 @@ # Standard file formats -com.databricks.bgen.BgenFileFormat -com.databricks.bgen.BigBgenDatasource -com.databricks.vcf.BigVCFDatasource -com.databricks.vcf.VCFFileFormat +io.projectglow.bgen.BgenFileFormat +io.projectglow.bgen.BigBgenDatasource +io.projectglow.vcf.BigVCFDatasource +io.projectglow.vcf.VCFFileFormat # Legacy file formats -com.databricks.bgen.ComDatabricksBgenFileFormat -com.databricks.bgen.ComDatabricksBigBgenDatasource -com.databricks.vcf.ComDatabricksBigVCFDatasource -com.databricks.vcf.ComDatabricksVCFFileFormat +io.projectglow.bgen.ComDatabricksBgenFileFormat +io.projectglow.bgen.ComDatabricksBigBgenDatasource +io.projectglow.vcf.ComDatabricksBigVCFDatasource +io.projectglow.vcf.ComDatabricksVCFFileFormat diff --git a/core/src/main/scala/com/databricks/hls/common/HLSLogging.scala b/core/src/main/scala/com/databricks/hls/common/HLSLogging.scala deleted file mode 100644 index 493eaa1df..000000000 --- a/core/src/main/scala/com/databricks/hls/common/HLSLogging.scala +++ /dev/null @@ -1,5 +0,0 @@ -package com.databricks.hls.common - -import com.typesafe.scalalogging.slf4j.LazyLogging - -trait HLSLogging extends LazyLogging diff --git a/core/src/main/scala/com/databricks/hls/tertiary/privateExpressionUtils.scala b/core/src/main/scala/com/databricks/hls/tertiary/privateExpressionUtils.scala deleted file mode 100644 index 00228cc35..000000000 --- a/core/src/main/scala/com/databricks/hls/tertiary/privateExpressionUtils.scala +++ /dev/null @@ -1,96 +0,0 @@ -package org.apache.spark.sql.databricks.hls.tertiary // Visibility hack - -import org.apache.spark.ml.linalg.MatrixUDT -import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst.analysis.Star -import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback -import org.apache.spark.sql.catalyst.expressions.{Alias, ExpectsInputTypes, Expression, Generator, GenericInternalRow, GetStructField, NamedExpression, Unevaluable} -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.catalyst.util.GenericArrayData -import org.apache.spark.sql.catalyst.{analysis, InternalRow} -import org.apache.spark.sql.types._ - -/** - * Expands all the fields of a potentially unnamed struct. - */ -case class ExpandStruct(struct: Expression) extends Star with Unevaluable { - override def expand(input: LogicalPlan, resolver: analysis.Resolver): Seq[NamedExpression] = { - if (!struct.dataType.isInstanceOf[StructType]) { - throw new AnalysisException("Only structs can be expanded.") - } - - struct.dataType.asInstanceOf[StructType].zipWithIndex.map { - case (f, i) => - Alias(GetStructField(struct, i), f.name)() - } - } -} - -/** - * Expression that adds fields to an existing struct. - * - * At optimization time, this expression is rewritten as the creation of new struct with all the - * fields of the existing struct as well as the new fields. See [[HLSReplaceExpressionsRule]] - * for more details. 
- */ -case class AddStructFields(struct: Expression, newFields: Seq[Expression]) - extends Expression - with Unevaluable { - - override def nullable: Boolean = true - override def children: Seq[Expression] = struct +: newFields - override def dataType: DataType = { - var base = struct.dataType.asInstanceOf[StructType] - newFields.grouped(2).foreach { - case Seq(name, value) => - val nameStr = name.eval().toString - base = base.add(nameStr, value.dataType, value.nullable) - } - base - } -} - -/** - * Explodes a matrix by row. Each row of the input matrix will be output as an array of doubles. - * - * If the input expression is null or has 0 rows, the output will be empty. - * @param matrixExpr The matrix to explode. May be dense or sparse. - */ -case class ExplodeMatrix(matrixExpr: Expression) - extends Generator - with CodegenFallback - with ExpectsInputTypes { - - val matrixUdt = new MatrixUDT() - - override def children: Seq[Expression] = Seq(matrixExpr) - - override def elementSchema: StructType = { - new StructType() - .add("row", ArrayType(DoubleType, containsNull = false), nullable = false) - } - - override def inputTypes: Seq[AbstractDataType] = Seq(matrixUdt) - - override def eval(input: InternalRow): TraversableOnce[InternalRow] = { - val matrixStruct = matrixExpr.eval(input) - if (matrixStruct == null) { - return Iterator.empty - } - val matrix = matrixUdt.deserialize(matrixStruct).toDenseRowMajor - var rowIdx = 0 - new Iterator[InternalRow] { - override def hasNext: Boolean = rowIdx < matrix.numRows - override def next(): InternalRow = { - var colIdx = 0 - val arr = new Array[Any](matrix.numCols) - while (colIdx < matrix.numCols) { - arr(colIdx) = matrix.values(rowIdx * matrix.numCols + colIdx) - colIdx += 1 - } - rowIdx += 1 - new GenericInternalRow(Array[Any](new GenericArrayData(arr))) - } - } - } -} diff --git a/core/src/main/scala/com/databricks/hls/DBGenomics.scala b/core/src/main/scala/io/projectglow/Glow.scala similarity index 92% rename from core/src/main/scala/com/databricks/hls/DBGenomics.scala rename to core/src/main/scala/io/projectglow/Glow.scala index 09336cf1c..f98645562 100644 --- a/core/src/main/scala/com/databricks/hls/DBGenomics.scala +++ b/core/src/main/scala/io/projectglow/Glow.scala @@ -1,4 +1,4 @@ -package com.databricks.hls +package io.projectglow import java.util.ServiceLoader @@ -6,8 +6,8 @@ import scala.collection.JavaConverters._ import org.apache.spark.sql.DataFrame -import com.databricks.hls.common.Named -import com.databricks.hls.transformers.util.{SnakeCaseMap, StringUtils} +import io.projectglow.common.Named +import io.projectglow.transformers.util.{SnakeCaseMap, StringUtils} /** * The entry point for all language specific functionality, meaning methods that cannot be expressed @@ -16,7 +16,7 @@ import com.databricks.hls.transformers.util.{SnakeCaseMap, StringUtils} * We should expose as little functionality as is necessary through this object and should prefer * generic methods with stringly-typed arguments to reduce language-specific maintenance burden. */ -object DBGenomics { +object Glow { /** * Apply a named transformation to a DataFrame of genomic data. 
All parameters apart from the diff --git a/core/src/main/scala/com/databricks/bgen/BgenConverterUtils.scala b/core/src/main/scala/io/projectglow/bgen/BgenConverterUtils.scala similarity index 95% rename from core/src/main/scala/com/databricks/bgen/BgenConverterUtils.scala rename to core/src/main/scala/io/projectglow/bgen/BgenConverterUtils.scala index c4d6e2baa..c8e1f08ec 100644 --- a/core/src/main/scala/com/databricks/bgen/BgenConverterUtils.scala +++ b/core/src/main/scala/io/projectglow/bgen/BgenConverterUtils.scala @@ -1,11 +1,11 @@ -package com.databricks.bgen +package io.projectglow.bgen import java.util.{HashMap => JHashMap} import org.apache.commons.math3.util.CombinatoricsUtils // Tools for calculating ploidy or number of genotypes for unphased posterior probabilities -private[databricks] object BgenConverterUtils { +private[projectglow] object BgenConverterUtils { var ploidyMap = new JHashMap[(Int, Int), Int] // (numGenotypes, numAlleles) to ploidy var genotypesMap = new JHashMap[(Int, Int), Int] // (ploidy, numAlleles) to numGenotypes diff --git a/core/src/main/scala/com/databricks/bgen/BgenFileFormat.scala b/core/src/main/scala/io/projectglow/bgen/BgenFileFormat.scala similarity index 96% rename from core/src/main/scala/com/databricks/bgen/BgenFileFormat.scala rename to core/src/main/scala/io/projectglow/bgen/BgenFileFormat.scala index fb1834f94..f4ef69bda 100644 --- a/core/src/main/scala/com/databricks/bgen/BgenFileFormat.scala +++ b/core/src/main/scala/io/projectglow/bgen/BgenFileFormat.scala @@ -1,10 +1,10 @@ -package com.databricks.bgen - -import scala.collection.JavaConverters._ +package io.projectglow.bgen import java.io.{BufferedReader, File, InputStreamReader} import java.nio.file.Paths +import scala.collection.JavaConverters._ + import com.google.common.io.LittleEndianDataInputStream import com.google.common.util.concurrent.Striped import org.apache.hadoop.conf.Configuration @@ -19,12 +19,11 @@ import org.apache.spark.sql.types.StructType import org.skife.jdbi.v2.DBI import org.skife.jdbi.v2.util.LongMapper -import com.databricks.hls.common.logging._ -import com.databricks.hls.common.{HLSLogging, WithUtils} -import com.databricks.hls.sql.util.SerializableConfiguration -import com.databricks.sql.ComDatabricksDataSource +import io.projectglow.common.logging.{HlsMetricDefinitions, HlsTagDefinitions, HlsTagValues, HlsUsageLogging} +import io.projectglow.common.{GlowLogging, WithUtils} +import io.projectglow.sql.util.{ComDatabricksDataSource, SerializableConfiguration} -class BgenFileFormat extends FileFormat with DataSourceRegister with Serializable with HLSLogging { +class BgenFileFormat extends FileFormat with DataSourceRegister with Serializable with GlowLogging { override def shortName(): String = "bgen" diff --git a/core/src/main/scala/com/databricks/bgen/BgenFileIterator.scala b/core/src/main/scala/io/projectglow/bgen/BgenFileIterator.scala similarity index 95% rename from core/src/main/scala/com/databricks/bgen/BgenFileIterator.scala rename to core/src/main/scala/io/projectglow/bgen/BgenFileIterator.scala index 557a31f5d..ba72d7a27 100644 --- a/core/src/main/scala/com/databricks/bgen/BgenFileIterator.scala +++ b/core/src/main/scala/io/projectglow/bgen/BgenFileIterator.scala @@ -1,19 +1,17 @@ -package com.databricks.bgen +package io.projectglow.bgen import java.io.{ByteArrayInputStream, DataInput, DataInputStream} import java.nio.charset.StandardCharsets import java.util.zip.Inflater -import com.google.common.io import com.google.common.io.LittleEndianDataInputStream 
import org.apache.commons.math3.util.CombinatoricsUtils import org.apache.hadoop.fs.FSDataInputStream -import com.databricks.hls.common.HLSLogging -import com.databricks.vcf.{BgenGenotype, BgenRow, VCFRow} +import io.projectglow.common.{BgenGenotype, BgenRow, GlowLogging} /** - * Parses variant records of a BGEN file into the [[VCFRow]] schema. The iterator assumes that the + * Parses variant records of a BGEN file into the [[io.projectglow.common.VCFRow]] schema. The iterator assumes that the * input streams are currently at the beginning of a variant block. * * The `init` method should be called before reading variants to skip to an appropriate starting @@ -35,14 +33,14 @@ import com.databricks.vcf.{BgenGenotype, BgenRow, VCFRow} * @param maxPos The maximum stream position from which variant blocks can be read. `hasNext` will * return `false` once we've reached this position. */ -private[databricks] class BgenFileIterator( +private[projectglow] class BgenFileIterator( metadata: BgenMetadata, stream: LittleEndianDataInputStream, underlyingStream: FSDataInputStream, minPos: Long, maxPos: Long) extends Iterator[BgenRow] - with HLSLogging { + with GlowLogging { import BgenFileIterator._ @@ -80,7 +78,7 @@ private[databricks] class BgenFileIterator( inflater.inflate(uncompressedBytes) val rawGenotypeStream = new DataInputStream(new ByteArrayInputStream(uncompressedBytes)) - val genotypeStream = new io.LittleEndianDataInputStream(rawGenotypeStream) + val genotypeStream = new LittleEndianDataInputStream(rawGenotypeStream) val genotypes = readGenotypes(nAlleles, genotypeStream, metadata.sampleIds) BgenRow( @@ -287,7 +285,7 @@ private[databricks] class BgenFileIterator( } } -private[databricks] object BgenFileIterator { +private[projectglow] object BgenFileIterator { /** * Utility function to read a UTF8 string from a data stream. Included in the companion object @@ -309,7 +307,8 @@ private[databricks] object BgenFileIterator { * Read a BGEN header from a data stream. Performs basic validation on the header parameters * according to what the reader currently supports. 
*/ -private[databricks] class BgenHeaderReader(stream: LittleEndianDataInputStream) extends HLSLogging { +private[projectglow] class BgenHeaderReader(stream: LittleEndianDataInputStream) + extends GlowLogging { def readHeader(sampleIdsOpt: Option[Seq[String]] = None): BgenMetadata = { val variantOffset = Integer.toUnsignedLong(stream.readInt()) + 4 @@ -381,7 +380,7 @@ private[databricks] class BgenHeaderReader(stream: LittleEndianDataInputStream) } } -private[databricks] case class BgenMetadata( +private[projectglow] case class BgenMetadata( firstVariantOffset: Long, nSamples: Long, nVariantBlocks: Long, diff --git a/core/src/main/scala/com/databricks/bgen/BgenHeaderWriter.scala b/core/src/main/scala/io/projectglow/bgen/BgenHeaderWriter.scala similarity index 98% rename from core/src/main/scala/com/databricks/bgen/BgenHeaderWriter.scala rename to core/src/main/scala/io/projectglow/bgen/BgenHeaderWriter.scala index d319b8dc5..6348608bd 100644 --- a/core/src/main/scala/com/databricks/bgen/BgenHeaderWriter.scala +++ b/core/src/main/scala/io/projectglow/bgen/BgenHeaderWriter.scala @@ -1,4 +1,4 @@ -package com.databricks.bgen +package io.projectglow.bgen import com.google.common.io.LittleEndianDataOutputStream @@ -7,7 +7,7 @@ private[bgen] class BgenHeaderWriter( numVariants: Long, sampleIds: Seq[Option[String]]) { - import com.databricks.bgen.BgenRecordWriter._ + import BgenRecordWriter._ private val HEADER_BLOCK_LENGTH = 20 private val COMPRESSION_TYPE = 1 // zlib diff --git a/core/src/main/scala/com/databricks/bgen/BgenRecordWriter.scala b/core/src/main/scala/io/projectglow/bgen/BgenRecordWriter.scala similarity index 99% rename from core/src/main/scala/com/databricks/bgen/BgenRecordWriter.scala rename to core/src/main/scala/io/projectglow/bgen/BgenRecordWriter.scala index 21a1b3a18..91d92d9e3 100644 --- a/core/src/main/scala/com/databricks/bgen/BgenRecordWriter.scala +++ b/core/src/main/scala/io/projectglow/bgen/BgenRecordWriter.scala @@ -1,17 +1,16 @@ -package com.databricks.bgen +package io.projectglow.bgen import java.io.{BufferedOutputStream, ByteArrayOutputStream, DataOutput, OutputStream} import java.nio.charset.StandardCharsets -import java.util.{Comparator, Arrays => JArrays, HashMap => JHashMap} import java.util.zip.{Deflater, DeflaterOutputStream} +import java.util.{Comparator, Arrays => JArrays, HashMap => JHashMap} import com.google.common.io.{CountingOutputStream, LittleEndianDataOutputStream} import org.apache.commons.math3.util.CombinatoricsUtils import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types.StructType -import com.databricks.hls.common.HLSLogging -import com.databricks.vcf.BgenRow +import io.projectglow.common.{BgenRow, GlowLogging} /** * Writes BGEN records. 
@@ -36,7 +35,7 @@ class BgenRecordWriter( maxPloidy: Int, defaultPloidy: Int, defaultPhasing: Boolean) - extends HLSLogging { + extends GlowLogging { import BgenRecordWriter._ diff --git a/core/src/main/scala/com/databricks/bgen/BgenRowConverter.scala b/core/src/main/scala/io/projectglow/bgen/BgenRowConverter.scala similarity index 92% rename from core/src/main/scala/com/databricks/bgen/BgenRowConverter.scala rename to core/src/main/scala/io/projectglow/bgen/BgenRowConverter.scala index 64ad7da5d..28972dbc2 100644 --- a/core/src/main/scala/com/databricks/bgen/BgenRowConverter.scala +++ b/core/src/main/scala/io/projectglow/bgen/BgenRowConverter.scala @@ -1,4 +1,4 @@ -package com.databricks.bgen +package io.projectglow.bgen import org.apache.spark.sql.SQLUtils.structFieldsEqualExceptNullability import org.apache.spark.sql.catalyst.InternalRow @@ -6,17 +6,16 @@ import org.apache.spark.sql.catalyst.util.GenericArrayData import org.apache.spark.sql.types.{ArrayType, StructType} import org.apache.spark.unsafe.types.UTF8String -import com.databricks.hls.common.HLSLogging -import com.databricks.hls.sql.util.RowConverter -import com.databricks.vcf.{BgenGenotype, BgenRow, VariantSchemas} +import io.projectglow.common.{BgenGenotype, BgenRow, GlowLogging, VariantSchemas} +import io.projectglow.sql.util.RowConverter /** * Converts [[BgenRow]]s into [[InternalRow]] with a given required schema. During construction, * this class will throw an [[IllegalArgumentException]] if any of the fields in the required * schema cannot be derived from a BGEN record. */ -class BgenRowConverter(schema: StructType) extends HLSLogging { - import VariantSchemas._ +class BgenRowConverter(schema: StructType) extends GlowLogging { + import io.projectglow.common.VariantSchemas._ private val converter = { val fns = schema.map { field => val fn: RowConverter.Updater[BgenRow] = field match { diff --git a/core/src/main/scala/com/databricks/bgen/BgenRowToInternalRowConverter.scala b/core/src/main/scala/io/projectglow/bgen/BgenRowToInternalRowConverter.scala similarity index 92% rename from core/src/main/scala/com/databricks/bgen/BgenRowToInternalRowConverter.scala rename to core/src/main/scala/io/projectglow/bgen/BgenRowToInternalRowConverter.scala index 237fee0e1..95cb1dd1d 100644 --- a/core/src/main/scala/com/databricks/bgen/BgenRowToInternalRowConverter.scala +++ b/core/src/main/scala/io/projectglow/bgen/BgenRowToInternalRowConverter.scala @@ -1,23 +1,21 @@ -package com.databricks.bgen +package io.projectglow.bgen import org.apache.spark.sql.SQLUtils.structFieldsEqualExceptNullability import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.util.GenericArrayData import org.apache.spark.sql.types.{ArrayType, StructType} import org.apache.spark.unsafe.types.UTF8String -import com.databricks.hls.common.HLSLogging -import com.databricks.hls.sql.util.RowConverter -import com.databricks.vcf.{BgenGenotype, BgenRow, VariantSchemas} +import io.projectglow.common.{BgenGenotype, BgenRow, GlowLogging, VariantSchemas} +import io.projectglow.sql.util.RowConverter /** * Converts [[BgenRow]]s into [[InternalRow]] with a given required schema. During construction, * this class will throw an [[IllegalArgumentException]] if any of the fields in the required * schema cannot be derived from a BGEN record. 
*/ -class BgenRowToInternalRowConverter(schema: StructType) extends HLSLogging { - import VariantSchemas._ +class BgenRowToInternalRowConverter(schema: StructType) extends GlowLogging { + import io.projectglow.common.VariantSchemas._ private val converter = { val fns = schema.map { field => val fn: RowConverter.Updater[BgenRow] = field match { diff --git a/core/src/main/scala/com/databricks/bgen/BgenSchemaInferrer.scala b/core/src/main/scala/io/projectglow/bgen/BgenSchemaInferrer.scala similarity index 89% rename from core/src/main/scala/com/databricks/bgen/BgenSchemaInferrer.scala rename to core/src/main/scala/io/projectglow/bgen/BgenSchemaInferrer.scala index 936d11a52..8c6869ad9 100644 --- a/core/src/main/scala/com/databricks/bgen/BgenSchemaInferrer.scala +++ b/core/src/main/scala/io/projectglow/bgen/BgenSchemaInferrer.scala @@ -1,13 +1,12 @@ -package com.databricks.bgen +package io.projectglow.bgen import com.google.common.io.LittleEndianDataInputStream -import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.hadoop.fs.FileStatus import org.apache.spark.sql.SparkSession import org.apache.spark.sql.types.StructType -import com.databricks.hls.common.WithUtils -import com.databricks.hls.sql.util.SerializableConfiguration -import com.databricks.vcf.VariantSchemas +import io.projectglow.common.{VariantSchemas, WithUtils} +import io.projectglow.sql.util.SerializableConfiguration /** * Infers the schema of a set of BGEN files from the user-provided options and the header of each diff --git a/core/src/main/scala/com/databricks/bgen/BigBgenDatasource.scala b/core/src/main/scala/io/projectglow/bgen/BigBgenDatasource.scala similarity index 91% rename from core/src/main/scala/com/databricks/bgen/BigBgenDatasource.scala rename to core/src/main/scala/io/projectglow/bgen/BigBgenDatasource.scala index 11b3d7cc3..9a6986522 100644 --- a/core/src/main/scala/com/databricks/bgen/BigBgenDatasource.scala +++ b/core/src/main/scala/io/projectglow/bgen/BigBgenDatasource.scala @@ -1,4 +1,4 @@ -package com.databricks.bgen +package io.projectglow.bgen import java.io.ByteArrayOutputStream @@ -6,8 +6,9 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame import org.apache.spark.sql.sources.DataSourceRegister -import com.databricks.hls.common.logging._ -import com.databricks.sql.{BigFileDatasource, ComDatabricksDataSource} +import io.projectglow.common.logging.{HlsMetricDefinitions, HlsTagDefinitions, HlsTagValues, HlsUsageLogging} +import io.projectglow.sql.BigFileDatasource +import io.projectglow.sql.util.ComDatabricksDataSource class BigBgenDatasource extends BigFileDatasource with DataSourceRegister with HlsUsageLogging { diff --git a/core/src/main/scala/com/databricks/bgen/InternalRowToBgenRowConverter.scala b/core/src/main/scala/io/projectglow/bgen/InternalRowToBgenRowConverter.scala similarity index 97% rename from core/src/main/scala/com/databricks/bgen/InternalRowToBgenRowConverter.scala rename to core/src/main/scala/io/projectglow/bgen/InternalRowToBgenRowConverter.scala index bfbd0ee10..f7339d558 100644 --- a/core/src/main/scala/com/databricks/bgen/InternalRowToBgenRowConverter.scala +++ b/core/src/main/scala/io/projectglow/bgen/InternalRowToBgenRowConverter.scala @@ -1,12 +1,12 @@ -package com.databricks.bgen +package io.projectglow.bgen import org.apache.spark.sql.SQLUtils.structFieldsEqualExceptNullability import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.ArrayData import org.apache.spark.sql.types.{ArrayType, StructType} -import 
com.databricks.hls.common.HLSLogging -import com.databricks.vcf.{BgenGenotype, BgenRow, ConverterUtils, VariantSchemas} +import io.projectglow.common.ConverterUtils._ +import io.projectglow.common.{BgenGenotype, BgenRow, GlowLogging, VariantSchemas} /** * Converts internal rows to BGEN rows. Includes logic to infer phasing and ploidy if missing (eg. @@ -34,9 +34,8 @@ class InternalRowToBgenRowConverter( maxPloidy: Int, defaultPloidy: Int, defaultPhasing: Boolean) - extends HLSLogging { - import VariantSchemas._ - import ConverterUtils._ + extends GlowLogging { + import io.projectglow.common.VariantSchemas._ private val genotypeSchema = rowSchema .find(_.name == "genotypes") diff --git a/core/src/main/scala/com/databricks/vcf/ConverterUtils.scala b/core/src/main/scala/io/projectglow/common/ConverterUtils.scala similarity index 97% rename from core/src/main/scala/com/databricks/vcf/ConverterUtils.scala rename to core/src/main/scala/io/projectglow/common/ConverterUtils.scala index 411a918c8..db978115d 100644 --- a/core/src/main/scala/com/databricks/vcf/ConverterUtils.scala +++ b/core/src/main/scala/io/projectglow/common/ConverterUtils.scala @@ -1,4 +1,4 @@ -package com.databricks.vcf +package io.projectglow.common import htsjdk.variant.variantcontext.GenotypeLikelihoods import htsjdk.variant.vcf.VCFConstants @@ -6,7 +6,7 @@ import org.apache.spark.sql.catalyst.util.ArrayData import org.apache.spark.sql.types.StringType import org.apache.spark.unsafe.types.UTF8String -private[databricks] object ConverterUtils { +private[projectglow] object ConverterUtils { // Parses the attribute in a map as a comma-separated sequence. def getFieldAsSeq(map: scala.collection.Map[String, String], attr: String): Seq[String] = { diff --git a/core/src/main/scala/com/databricks/hls/common/DebugUtils.scala b/core/src/main/scala/io/projectglow/common/DebugUtils.scala similarity index 97% rename from core/src/main/scala/com/databricks/hls/common/DebugUtils.scala rename to core/src/main/scala/io/projectglow/common/DebugUtils.scala index 107602958..9a36c560e 100644 --- a/core/src/main/scala/com/databricks/hls/common/DebugUtils.scala +++ b/core/src/main/scala/io/projectglow/common/DebugUtils.scala @@ -6,7 +6,7 @@ * License, Version 2.0, a copy of which you may obtain at * http://www.apache.org/licenses/LICENSE-2.0 */ -package com.databricks.hls.common +package io.projectglow.common import java.lang.reflect.Modifier diff --git a/core/src/main/scala/io/projectglow/common/GlowLogging.scala b/core/src/main/scala/io/projectglow/common/GlowLogging.scala new file mode 100644 index 000000000..827e5cf02 --- /dev/null +++ b/core/src/main/scala/io/projectglow/common/GlowLogging.scala @@ -0,0 +1,5 @@ +package io.projectglow.common + +import com.typesafe.scalalogging.slf4j.LazyLogging + +trait GlowLogging extends LazyLogging diff --git a/core/src/main/scala/com/databricks/hls/common/HailUtils.scala b/core/src/main/scala/io/projectglow/common/HailUtils.scala similarity index 96% rename from core/src/main/scala/com/databricks/hls/common/HailUtils.scala rename to core/src/main/scala/io/projectglow/common/HailUtils.scala index 4d531344b..7829067f6 100644 --- a/core/src/main/scala/com/databricks/hls/common/HailUtils.scala +++ b/core/src/main/scala/io/projectglow/common/HailUtils.scala @@ -1,4 +1,4 @@ -package com.databricks.hls.common +package io.projectglow.common object HailUtils { val defaultTolerance = 1e-6 diff --git a/core/src/main/scala/com/databricks/hls/common/HasStringency.scala 
b/core/src/main/scala/io/projectglow/common/HasStringency.scala similarity index 82% rename from core/src/main/scala/com/databricks/hls/common/HasStringency.scala rename to core/src/main/scala/io/projectglow/common/HasStringency.scala index ec379f52d..ec72a17e4 100644 --- a/core/src/main/scala/com/databricks/hls/common/HasStringency.scala +++ b/core/src/main/scala/io/projectglow/common/HasStringency.scala @@ -1,8 +1,8 @@ -package com.databricks.hls.common +package io.projectglow.common import htsjdk.samtools.ValidationStringency -trait HasStringency extends HLSLogging { +trait HasStringency extends GlowLogging { def stringency: ValidationStringency protected def provideWarning(warning: String): Unit = { if (stringency == ValidationStringency.STRICT) { diff --git a/core/src/main/scala/com/databricks/hls/common/Named.scala b/core/src/main/scala/io/projectglow/common/Named.scala similarity index 51% rename from core/src/main/scala/com/databricks/hls/common/Named.scala rename to core/src/main/scala/io/projectglow/common/Named.scala index d3de252db..0855acfb7 100644 --- a/core/src/main/scala/com/databricks/hls/common/Named.scala +++ b/core/src/main/scala/io/projectglow/common/Named.scala @@ -1,4 +1,4 @@ -package com.databricks.hls.common +package io.projectglow.common trait Named { def name: String diff --git a/core/src/main/scala/com/databricks/hls/common/TimeUtils.scala b/core/src/main/scala/io/projectglow/common/TimeUtils.scala similarity index 95% rename from core/src/main/scala/com/databricks/hls/common/TimeUtils.scala rename to core/src/main/scala/io/projectglow/common/TimeUtils.scala index ea9d54af0..b7fe81480 100644 --- a/core/src/main/scala/com/databricks/hls/common/TimeUtils.scala +++ b/core/src/main/scala/io/projectglow/common/TimeUtils.scala @@ -6,7 +6,7 @@ * License, Version 2.0, a copy of which you may obtain at * http://www.apache.org/licenses/LICENSE-2.0 */ -package com.databricks.hls.common +package io.projectglow.common import scala.concurrent.duration._ diff --git a/core/src/main/scala/com/databricks/hls/common/WithUtils.scala b/core/src/main/scala/io/projectglow/common/WithUtils.scala similarity index 97% rename from core/src/main/scala/com/databricks/hls/common/WithUtils.scala rename to core/src/main/scala/io/projectglow/common/WithUtils.scala index 7fcb70462..5b48e0c35 100644 --- a/core/src/main/scala/com/databricks/hls/common/WithUtils.scala +++ b/core/src/main/scala/io/projectglow/common/WithUtils.scala @@ -1,4 +1,4 @@ -package com.databricks.hls.common +package io.projectglow.common import java.io.Closeable import java.util.concurrent.locks.Lock diff --git a/core/src/main/scala/com/databricks/hls/common/logging/HlsUsageLogging.scala b/core/src/main/scala/io/projectglow/common/logging/HlsUsageLogging.scala similarity index 91% rename from core/src/main/scala/com/databricks/hls/common/logging/HlsUsageLogging.scala rename to core/src/main/scala/io/projectglow/common/logging/HlsUsageLogging.scala index a5ccb97ac..92ca16082 100644 --- a/core/src/main/scala/com/databricks/hls/common/logging/HlsUsageLogging.scala +++ b/core/src/main/scala/io/projectglow/common/logging/HlsUsageLogging.scala @@ -1,13 +1,15 @@ -package com.databricks.hls.common.logging +package io.projectglow.common.logging -import com.databricks.hls.common.HLSLogging -import com.google.gson.Gson import scala.collection.JavaConverters._ +import com.google.gson.Gson + +import io.projectglow.common.GlowLogging + /** * These are trait/objects/case classes to log hls events. 
*/ -trait HlsUsageLogging extends HLSLogging { +trait HlsUsageLogging extends GlowLogging { protected def recordHlsUsage( metric: MetricDefinition, diff --git a/core/src/main/scala/com/databricks/vcf/schemas.scala b/core/src/main/scala/io/projectglow/common/schemas.scala similarity index 94% rename from core/src/main/scala/com/databricks/vcf/schemas.scala rename to core/src/main/scala/io/projectglow/common/schemas.scala index 64de50494..f469d8f5c 100644 --- a/core/src/main/scala/com/databricks/vcf/schemas.scala +++ b/core/src/main/scala/io/projectglow/common/schemas.scala @@ -1,6 +1,6 @@ -package com.databricks.vcf +package io.projectglow.common -import org.apache.spark.sql.{Encoders, SQLUtils} +import org.apache.spark.sql.Encoders import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.types._ @@ -83,7 +83,7 @@ object VariantSchemas { ) } -private[databricks] case class GenotypeFields( +case class GenotypeFields( sampleId: Option[String], phased: Option[Boolean], calls: Option[Seq[Int]], @@ -99,7 +99,7 @@ private[databricks] case class GenotypeFields( alleleDepths: Option[Seq[Int]], otherFields: scala.collection.Map[String, String]) -private[databricks] object GenotypeFields { +object GenotypeFields { val baseReverseAliases: Map[String, String] = Map( "depth" -> "DP", "filters" -> "FT", @@ -121,7 +121,7 @@ private[databricks] object GenotypeFields { ("GT" -> Seq("phased", "calls")) } -private[databricks] case class VCFRow( +case class VCFRow( contigName: String, start: Long, end: Long, @@ -144,13 +144,13 @@ object VCFRow { .asInstanceOf[ExpressionEncoder[VCFRow]] } -private[databricks] case class BgenGenotype( +private[projectglow] case class BgenGenotype( sampleId: Option[String], phased: Option[Boolean], ploidy: Option[Int], posteriorProbabilities: Seq[Double]) -private[databricks] case class BgenRow( +private[projectglow] case class BgenRow( contigName: String, start: Long, end: Long, diff --git a/core/src/main/scala/com/databricks/sql/BigFileDatasource.scala b/core/src/main/scala/io/projectglow/sql/BigFileDatasource.scala similarity index 96% rename from core/src/main/scala/com/databricks/sql/BigFileDatasource.scala rename to core/src/main/scala/io/projectglow/sql/BigFileDatasource.scala index aee39ce44..76711c312 100644 --- a/core/src/main/scala/com/databricks/sql/BigFileDatasource.scala +++ b/core/src/main/scala/io/projectglow/sql/BigFileDatasource.scala @@ -1,9 +1,10 @@ -package com.databricks.sql +package io.projectglow.sql import java.net.URI import java.util.ServiceLoader import scala.collection.JavaConverters._ + import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path @@ -12,7 +13,7 @@ import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} -import com.databricks.hls.common.HLSLogging +import io.projectglow.common.GlowLogging /** * Base class for big file datasources. 
Handles plumbing that's necessary for all such sources: @@ -71,7 +72,7 @@ trait BigFileUploader { def upload(bytes: RDD[Array[Byte]], path: String): Unit } -private[databricks] object SingleFileWriter extends HLSLogging { +private[projectglow] object SingleFileWriter extends GlowLogging { lazy val uploaders: Seq[BigFileUploader] = ServiceLoader .load(classOf[BigFileUploader]) diff --git a/core/src/main/scala/com/databricks/hls/sql/SqlExtensionProvider.scala b/core/src/main/scala/io/projectglow/sql/SqlExtensionProvider.scala similarity index 94% rename from core/src/main/scala/com/databricks/hls/sql/SqlExtensionProvider.scala rename to core/src/main/scala/io/projectglow/sql/SqlExtensionProvider.scala index 19f32ba8b..c44ff14a9 100644 --- a/core/src/main/scala/com/databricks/hls/sql/SqlExtensionProvider.scala +++ b/core/src/main/scala/io/projectglow/sql/SqlExtensionProvider.scala @@ -6,7 +6,7 @@ * License, Version 2.0, a copy of which you may obtain at * http://www.apache.org/licenses/LICENSE-2.0 */ -package com.databricks.hls.sql +package io.projectglow.sql import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.FunctionIdentifier @@ -14,12 +14,11 @@ import org.apache.spark.sql.catalyst.analysis.{FunctionRegistry, UnresolvedExtra import org.apache.spark.sql.catalyst.expressions.CreateNamedStruct import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.databricks.hls.tertiary.{AddStructFields, ExpandStruct, ExplodeMatrix} import org.apache.spark.sql.internal.SQLConf -import com.databricks.hls.sql.optimizer.HLSReplaceExpressionsRule -import com.databricks.hls.tertiary._ -import com.databricks.vcf.VariantSchemas +import io.projectglow.common.VariantSchemas +import io.projectglow.sql.expressions._ +import io.projectglow.sql.optimizer.HLSReplaceExpressionsRule object SqlExtensionProvider { diff --git a/core/src/main/scala/com/databricks/hls/tertiary/ComputeQR.scala b/core/src/main/scala/io/projectglow/sql/expressions/ComputeQR.scala similarity index 96% rename from core/src/main/scala/com/databricks/hls/tertiary/ComputeQR.scala rename to core/src/main/scala/io/projectglow/sql/expressions/ComputeQR.scala index bcb65f4ea..03c162005 100644 --- a/core/src/main/scala/com/databricks/hls/tertiary/ComputeQR.scala +++ b/core/src/main/scala/io/projectglow/sql/expressions/ComputeQR.scala @@ -1,10 +1,10 @@ -package com.databricks.hls.tertiary +package io.projectglow.sql.expressions import com.github.fommil.netlib.LAPACK import org.apache.spark.ml.linalg.DenseMatrix import org.netlib.util.intW -import com.databricks.hls.common.HLSLogging +import io.projectglow.common.GlowLogging /** * Context that can be computed once for all variant sites for a linear regression GWAS analysis. 
@@ -25,7 +25,7 @@ case class CovariateQRContext( def numCols: Int = numCovariateCols + 1 } -object ComputeQR extends HLSLogging { +object ComputeQR extends GlowLogging { def computeQR(covariateMatrix: DenseMatrix): CovariateQRContext = { require( covariateMatrix.numRows > covariateMatrix.numCols, diff --git a/core/src/main/scala/com/databricks/hls/tertiary/LiftOverCoordinatesExpr.scala b/core/src/main/scala/io/projectglow/sql/expressions/LiftOverCoordinatesExpr.scala similarity index 98% rename from core/src/main/scala/com/databricks/hls/tertiary/LiftOverCoordinatesExpr.scala rename to core/src/main/scala/io/projectglow/sql/expressions/LiftOverCoordinatesExpr.scala index b98cd4d89..76a7fdba0 100644 --- a/core/src/main/scala/com/databricks/hls/tertiary/LiftOverCoordinatesExpr.scala +++ b/core/src/main/scala/io/projectglow/sql/expressions/LiftOverCoordinatesExpr.scala @@ -1,4 +1,4 @@ -package com.databricks.hls.tertiary +package io.projectglow.sql.expressions import java.io.File @@ -41,9 +41,11 @@ case class LiftOverCoordinatesExpr( override def children: Seq[Expression] = Seq(contigName, start, end, chainFile) ++ minMatchRatioOpt + override def inputTypes = { // scalastyle:ignore Seq(StringType, LongType, LongType, StringType) ++ minMatchRatioOpt.map(_ => DecimalType) } + override def checkInputDataTypes(): TypeCheckResult = { super.checkInputDataTypes() if (!chainFile.foldable) { @@ -52,12 +54,14 @@ case class LiftOverCoordinatesExpr( TypeCheckResult.TypeCheckSuccess } } + override def dataType: DataType = StructType( Seq( StructField("contigName", StringType), StructField("start", LongType), StructField("end", LongType))) + override def nullable: Boolean = true /** diff --git a/core/src/main/scala/org/apache/spark/sql/LinearRegressionExpr.scala b/core/src/main/scala/io/projectglow/sql/expressions/LinearRegressionExpr.scala similarity index 88% rename from core/src/main/scala/org/apache/spark/sql/LinearRegressionExpr.scala rename to core/src/main/scala/io/projectglow/sql/expressions/LinearRegressionExpr.scala index ac4278d5b..e94aa137c 100644 --- a/core/src/main/scala/org/apache/spark/sql/LinearRegressionExpr.scala +++ b/core/src/main/scala/io/projectglow/sql/expressions/LinearRegressionExpr.scala @@ -1,14 +1,12 @@ -package org.apache.spark.sql +package io.projectglow.sql.expressions -import org.apache.spark.ml.linalg.MatrixUDT +import org.apache.spark.sql.SQLUtils import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.catalyst.expressions.{Expression, ImplicitCastInputTypes, TernaryExpression} import org.apache.spark.sql.catalyst.util.ArrayData import org.apache.spark.sql.types._ -import com.databricks.hls.tertiary.{ComputeQR, CovariateQRContext, LinearRegressionGwas} - case class LinearRegressionExpr( genotypes: Expression, phenotypes: Expression, @@ -17,7 +15,7 @@ case class LinearRegressionExpr( with CodegenFallback with ImplicitCastInputTypes { - private val matrixUDT = new MatrixUDT() + private val matrixUDT = SQLUtils.newMatrixUDT() override def dataType: DataType = StructType( diff --git a/core/src/main/scala/com/databricks/hls/tertiary/LinearRegressionGwas.scala b/core/src/main/scala/io/projectglow/sql/expressions/LinearRegressionGwas.scala similarity index 98% rename from core/src/main/scala/com/databricks/hls/tertiary/LinearRegressionGwas.scala rename to core/src/main/scala/io/projectglow/sql/expressions/LinearRegressionGwas.scala index 9d62fd66e..2882b0679 100644 --- 
a/core/src/main/scala/com/databricks/hls/tertiary/LinearRegressionGwas.scala +++ b/core/src/main/scala/io/projectglow/sql/expressions/LinearRegressionGwas.scala @@ -1,4 +1,4 @@ -package com.databricks.hls.tertiary +package io.projectglow.sql.expressions import com.github.fommil.netlib.{BLAS, LAPACK} import org.apache.commons.math3.distribution.TDistribution @@ -7,11 +7,11 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.ArrayData import org.netlib.util.{doubleW, intW} -import com.databricks.hls.common.HLSLogging +import io.projectglow.common.GlowLogging case class RegressionStats(beta: Double, standardError: Double, pValue: Double) -object LinearRegressionGwas extends HLSLogging { +object LinearRegressionGwas extends GlowLogging { /** * Fits a linear regression model to a single variant. diff --git a/core/src/main/scala/org/apache/spark/sql/LogisticRegressionExpr.scala b/core/src/main/scala/io/projectglow/sql/expressions/LogisticRegressionExpr.scala similarity index 87% rename from core/src/main/scala/org/apache/spark/sql/LogisticRegressionExpr.scala rename to core/src/main/scala/io/projectglow/sql/expressions/LogisticRegressionExpr.scala index 66adf0943..7a1171b9d 100644 --- a/core/src/main/scala/org/apache/spark/sql/LogisticRegressionExpr.scala +++ b/core/src/main/scala/io/projectglow/sql/expressions/LogisticRegressionExpr.scala @@ -1,16 +1,17 @@ -package org.apache.spark.sql +package io.projectglow.sql.expressions -import scala.collection.mutable.{Map => MMap} +import scala.collection.mutable import scala.util.hashing.MurmurHash3 -import org.apache.spark.ml.linalg.{DenseMatrix, MatrixUDT} + +import org.apache.spark.ml.linalg.DenseMatrix +import org.apache.spark.sql.SQLUtils import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.catalyst.expressions.{Expression, ImplicitCastInputTypes, QuaternaryExpression} import org.apache.spark.sql.catalyst.util.ArrayData -import org.apache.spark.sql.types._ +import org.apache.spark.sql.types.{ArrayType, DataType, DoubleType, StringType} import org.apache.spark.unsafe.types.UTF8String -import com.databricks.hls.tertiary.{LogisticRegressionGwas, NewtonResult} -import org.apache.spark.sql.catalyst.analysis.TypeCheckResult case class LogisticRegressionExpr( genotypes: Expression, @@ -21,7 +22,7 @@ case class LogisticRegressionExpr( with CodegenFallback with ImplicitCastInputTypes { - private val matrixUDT = new MatrixUDT() + private val matrixUDT = SQLUtils.newMatrixUDT() private val logitTest = LogisticRegressionGwas .logitTests @@ -45,7 +46,7 @@ case class LogisticRegressionExpr( } } - private val nullFitMap: MMap[Int, NewtonResult] = MMap.empty + private val nullFitMap: mutable.Map[Int, NewtonResult] = mutable.Map.empty // For each phenotype, save the null model fit of the covariate matrix since it's the same for every genotype private def fitNullModel(phenotypes: Array[Double], covariates: DenseMatrix): NewtonResult = { val phenoHash = MurmurHash3.arrayHash(phenotypes) diff --git a/core/src/main/scala/com/databricks/hls/tertiary/LogisticRegressionGwas.scala b/core/src/main/scala/io/projectglow/sql/expressions/LogisticRegressionGwas.scala similarity index 97% rename from core/src/main/scala/com/databricks/hls/tertiary/LogisticRegressionGwas.scala rename to core/src/main/scala/io/projectglow/sql/expressions/LogisticRegressionGwas.scala index 
604e6c455..bc00f662b 100644 --- a/core/src/main/scala/com/databricks/hls/tertiary/LogisticRegressionGwas.scala +++ b/core/src/main/scala/io/projectglow/sql/expressions/LogisticRegressionGwas.scala @@ -1,15 +1,16 @@ -package com.databricks.hls.tertiary +package io.projectglow.sql.expressions import breeze.linalg._ import breeze.numerics._ import com.google.common.annotations.VisibleForTesting import org.apache.commons.math3.distribution.{ChiSquaredDistribution, NormalDistribution} import org.apache.spark.ml.linalg.{DenseMatrix => SparkDenseMatrix} -import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.Encoders -import org.apache.spark.sql.types.StructType -import com.databricks.hls.common.HLSLogging +import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.ArrayData +import org.apache.spark.sql.types.StructType + +import io.projectglow.common.GlowLogging /** * Statistics returned upon performing a likelihood ratio test. @@ -30,12 +31,12 @@ case class LikelihoodRatioTestStats( * The Hail project can be found on Github: https://github.com/hail-is/hail. * The Hail project is under an MIT license: https://github.com/hail-is/hail/blob/master/LICENSE. */ -object LogisticRegressionGwas extends HLSLogging { +object LogisticRegressionGwas extends GlowLogging { val logitTests: Map[String, LogitTest] = Map("LRT" -> LikelihoodRatioTest) val zScore: Double = new NormalDistribution().inverseCumulativeProbability(.975) // Two-sided 95% confidence @VisibleForTesting - private[databricks] def newtonIterations( + private[projectglow] def newtonIterations( X: DenseMatrix[Double], y: DenseVector[Double], nullFitOpt: Option[NewtonArguments], diff --git a/core/src/main/scala/com/databricks/hls/tertiary/MomentAggState.scala b/core/src/main/scala/io/projectglow/sql/expressions/MomentAggState.scala similarity index 95% rename from core/src/main/scala/com/databricks/hls/tertiary/MomentAggState.scala rename to core/src/main/scala/io/projectglow/sql/expressions/MomentAggState.scala index 687cbd4fe..8e758d280 100644 --- a/core/src/main/scala/com/databricks/hls/tertiary/MomentAggState.scala +++ b/core/src/main/scala/io/projectglow/sql/expressions/MomentAggState.scala @@ -1,10 +1,10 @@ -package com.databricks.hls.tertiary +package io.projectglow.sql.expressions import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.types.{DoubleType, StructField, StructType} -import com.databricks.hls.common.HLSLogging +import io.projectglow.common.GlowLogging /** * The state necessary for maintaining moment based aggregations, currently only supported up to m2. 
@@ -56,7 +56,7 @@ case class MomentAggState( } } -object MomentAggState extends HLSLogging { +object MomentAggState extends GlowLogging { val schema = StructType( Seq( StructField("mean", DoubleType), diff --git a/core/src/main/scala/com/databricks/hls/tertiary/PerSampleSummaryStatistics.scala b/core/src/main/scala/io/projectglow/sql/expressions/PerSampleSummaryStatistics.scala similarity index 96% rename from core/src/main/scala/com/databricks/hls/tertiary/PerSampleSummaryStatistics.scala rename to core/src/main/scala/io/projectglow/sql/expressions/PerSampleSummaryStatistics.scala index 19907d6fb..dda4b9d02 100644 --- a/core/src/main/scala/com/databricks/hls/tertiary/PerSampleSummaryStatistics.scala +++ b/core/src/main/scala/io/projectglow/sql/expressions/PerSampleSummaryStatistics.scala @@ -1,4 +1,4 @@ -package com.databricks.hls.tertiary +package io.projectglow.sql.expressions import java.nio.ByteBuffer @@ -13,8 +13,8 @@ import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, ArrayData, Generic import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String -import com.databricks.hls.common.HLSLogging -import com.databricks.vcf.VariantSchemas +import io.projectglow.common.{GlowLogging, VariantSchemas} +import io.projectglow.sql.util.ExpectsGenotypeFields case class SampleSummaryStatsState(var sampleId: String, var momentAggState: MomentAggState) { def this() = this(null, null) // need 0-arg constructor for serialization @@ -34,7 +34,7 @@ case class PerSampleSummaryStatistics( inputAggBufferOffset: Int = 0) extends TypedImperativeAggregate[mutable.ArrayBuffer[SampleSummaryStatsState]] with ExpectsGenotypeFields - with HLSLogging { + with GlowLogging { override def children: Seq[Expression] = Seq(genotypes) override def nullable: Boolean = false diff --git a/core/src/main/scala/com/databricks/hls/tertiary/SampleCallSummaryStats.scala b/core/src/main/scala/io/projectglow/sql/expressions/SampleCallSummaryStats.scala similarity index 97% rename from core/src/main/scala/com/databricks/hls/tertiary/SampleCallSummaryStats.scala rename to core/src/main/scala/io/projectglow/sql/expressions/SampleCallSummaryStats.scala index 0f815f41e..1540e13f9 100644 --- a/core/src/main/scala/com/databricks/hls/tertiary/SampleCallSummaryStats.scala +++ b/core/src/main/scala/io/projectglow/sql/expressions/SampleCallSummaryStats.scala @@ -1,4 +1,4 @@ -package com.databricks.hls.tertiary +package io.projectglow.sql.expressions import java.nio.ByteBuffer @@ -14,8 +14,8 @@ import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, ArrayData, Generic import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String -import com.databricks.hls.common.HLSLogging -import com.databricks.vcf.VariantSchemas +import io.projectglow.common.{GlowLogging, VariantSchemas} +import io.projectglow.sql.util.ExpectsGenotypeFields /** * Computes summary statistics per-sample in a genomic cohort. 
These statistics include the call @@ -31,7 +31,7 @@ case class CallSummaryStats( inputAggBufferOffset: Int = 0) extends TypedImperativeAggregate[mutable.ArrayBuffer[SampleCallStats]] with ExpectsGenotypeFields - with HLSLogging { + with GlowLogging { override def genotypesExpr: Expression = genotypes @@ -234,7 +234,7 @@ case class SampleCallStats( ) } -object SampleCallStats extends HLSLogging { +object SampleCallStats extends GlowLogging { def merge(s1: SampleCallStats, s2: SampleCallStats): SampleCallStats = { require(s1.sampleId == s2.sampleId, s"${s1.sampleId}, ${s2.sampleId}") val out = new SampleCallStats(s1.sampleId) diff --git a/core/src/main/scala/com/databricks/hls/tertiary/VariantQcExprs.scala b/core/src/main/scala/io/projectglow/sql/expressions/VariantQcExprs.scala similarity index 94% rename from core/src/main/scala/com/databricks/hls/tertiary/VariantQcExprs.scala rename to core/src/main/scala/io/projectglow/sql/expressions/VariantQcExprs.scala index 462097992..70c6e4fc9 100644 --- a/core/src/main/scala/com/databricks/hls/tertiary/VariantQcExprs.scala +++ b/core/src/main/scala/io/projectglow/sql/expressions/VariantQcExprs.scala @@ -1,4 +1,4 @@ -package com.databricks.hls.tertiary +package io.projectglow.sql.expressions import org.apache.spark.sql.catalyst.analysis.UnresolvedExtractValue import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} @@ -7,8 +7,8 @@ import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData} import org.apache.spark.sql.catalyst.{InternalRow, ScalaReflection} import org.apache.spark.sql.types._ -import com.databricks.hls.common.HLSLogging -import com.databricks.vcf.VariantSchemas +import io.projectglow.common.{GlowLogging, VCFRow, VariantSchemas} +import io.projectglow.sql.util.{ExpectsGenotypeFields, LeveneHaldane} /** * Contains implementations of QC functions. These implementations are called during both @@ -16,13 +16,13 @@ import com.databricks.vcf.VariantSchemas * * The functions are exposed to the user as Catalyst expressions. */ -object VariantQcExprs extends HLSLogging { +object VariantQcExprs extends GlowLogging { /** * Performs a two-sided test of the Hardy-Weinberg equilibrium. Returns the expected het frequency * as well as the associated p value. * @param genotypes an array of structs with the schema required by [[CallStats]] - * @param genotypesIdx the position of the genotype struct (with calls and phasing info) within + * @param genotypeIdx the position of the genotype struct (with calls and phasing info) within * the element struct of the genotypes array * @return a row with the schema of [[HardyWeinbergStruct]] */ @@ -193,7 +193,7 @@ object VariantQcExprs extends HLSLogging { * the fields. * * We use this function for many of the variant QC functions so that each function can require - * a specific schema without requiring that the [[com.databricks.vcf.VCFRow]] schema remain + * a specific schema without requiring that the [[VCFRow]] schema remain * fixed for all time. 
* * @param schema the desired schema @@ -232,7 +232,7 @@ case class HardyWeinberg(genotypes: Expression) extends UnaryExpression with Exp override def child: Expression = genotypes override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val fn = "com.databricks.hls.tertiary.VariantQcExprs.hardyWeinberg" + val fn = "io.projectglow.sql.expressions.VariantQcExprs.hardyWeinberg" nullSafeCodeGen(ctx, ev, calls => { s""" |${ev.value} = $fn($calls, $genotypeStructSize, ${genotypeFieldIndices.head}); @@ -265,7 +265,7 @@ case class CallStats(genotypes: Expression) extends UnaryExpression with Expects override def child: Expression = genotypes override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val fn = "com.databricks.hls.tertiary.VariantQcExprs.callStats" + val fn = "io.projectglow.sql.expressions.VariantQcExprs.callStats" nullSafeCodeGen(ctx, ev, calls => { s""" |${ev.value} = $fn($calls, $genotypeStructSize, ${genotypeFieldIndices.head}); @@ -309,7 +309,7 @@ case class ArrayStatsSummary(array: Expression) extends UnaryExpression with Exp override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { nullSafeCodeGen(ctx, ev, c => { s""" - |${ev.value} = com.databricks.hls.tertiary.VariantQcExprs.arraySummaryStats($c); + |${ev.value} = io.projectglow.sql.expressions.VariantQcExprs.arraySummaryStats($c); """.stripMargin }) } diff --git a/core/src/main/scala/com/databricks/hls/tertiary/VariantUtilExprs.scala b/core/src/main/scala/io/projectglow/sql/expressions/VariantUtilExprs.scala similarity index 98% rename from core/src/main/scala/com/databricks/hls/tertiary/VariantUtilExprs.scala rename to core/src/main/scala/io/projectglow/sql/expressions/VariantUtilExprs.scala index 2b1e217f3..661ad5c7e 100644 --- a/core/src/main/scala/com/databricks/hls/tertiary/VariantUtilExprs.scala +++ b/core/src/main/scala/io/projectglow/sql/expressions/VariantUtilExprs.scala @@ -1,4 +1,4 @@ -package com.databricks.hls.tertiary +package io.projectglow.sql.expressions import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult @@ -8,7 +8,8 @@ import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String -import com.databricks.vcf.VariantSchemas +import io.projectglow.common.VariantSchemas +import io.projectglow.sql.util.ExpectsGenotypeFields /** * Implementations of utility functions for transforming variant representations. 
These @@ -160,7 +161,7 @@ case class GenotypeStates(genotypes: Expression) override def dataType: DataType = ArrayType(IntegerType) override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val fn = "com.databricks.hls.tertiary.VariantUtilExprs.genotypeStates" + val fn = "io.projectglow.sql.expressions.VariantUtilExprs.genotypeStates" nullSafeCodeGen(ctx, ev, calls => { s""" |${ev.value} = $fn($calls, $genotypeStructSize, ${genotypeFieldIndices.head}); diff --git a/core/src/main/scala/io/projectglow/sql/expressions/glueExpressions.scala b/core/src/main/scala/io/projectglow/sql/expressions/glueExpressions.scala new file mode 100644 index 000000000..e6008d0c0 --- /dev/null +++ b/core/src/main/scala/io/projectglow/sql/expressions/glueExpressions.scala @@ -0,0 +1,179 @@ +package io.projectglow.sql.expressions + +import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.sql.SQLUtils +import org.apache.spark.sql.catalyst.analysis.Star +import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodegenFallback, ExprCode} +import org.apache.spark.sql.catalyst.expressions.{Alias, ExpectsInputTypes, Expression, Generator, GenericInternalRow, GetStructField, ImplicitCastInputTypes, NamedExpression, UnaryExpression, Unevaluable} +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData} +import org.apache.spark.sql.catalyst.{analysis, InternalRow} +import org.apache.spark.sql.types._ + +/** + * Expands all the fields of a potentially unnamed struct. + */ +case class ExpandStruct(struct: Expression) extends Star with Unevaluable { + override def expand(input: LogicalPlan, resolver: analysis.Resolver): Seq[NamedExpression] = { + if (!struct.dataType.isInstanceOf[StructType]) { + throw SQLUtils.newAnalysisException("Only structs can be expanded.") + } + + struct.dataType.asInstanceOf[StructType].zipWithIndex.map { + case (f, i) => + Alias(GetStructField(struct, i), f.name)() + } + } +} + +/** + * Expression that adds fields to an existing struct. + * + * At optimization time, this expression is rewritten as the creation of new struct with all the + * fields of the existing struct as well as the new fields. See [[HLSReplaceExpressionsRule]] + * for more details. + */ +case class AddStructFields(struct: Expression, newFields: Seq[Expression]) + extends Expression + with Unevaluable { + + override def nullable: Boolean = true + override def children: Seq[Expression] = struct +: newFields + override def dataType: DataType = { + var base = struct.dataType.asInstanceOf[StructType] + newFields.grouped(2).foreach { + case Seq(name, value) => + val nameStr = name.eval().toString + base = base.add(nameStr, value.dataType, value.nullable) + } + base + } +} + +/** + * Explodes a matrix by row. Each row of the input matrix will be output as an array of doubles. + * + * If the input expression is null or has 0 rows, the output will be empty. + * @param matrixExpr The matrix to explode. May be dense or sparse. 
+ */ +case class ExplodeMatrix(matrixExpr: Expression) + extends Generator + with CodegenFallback + with ExpectsInputTypes { + + private val matrixUdt = SQLUtils.newMatrixUDT() + + override def children: Seq[Expression] = Seq(matrixExpr) + + override def elementSchema: StructType = { + new StructType() + .add("row", ArrayType(DoubleType, containsNull = false), nullable = false) + } + + override def inputTypes = Seq(matrixUdt) // scalastyle:ignore + + override def eval(input: InternalRow): TraversableOnce[InternalRow] = { + val matrixStruct = matrixExpr.eval(input) + if (matrixStruct == null) { + return Iterator.empty + } + val matrix = matrixUdt.deserialize(matrixStruct).toDenseRowMajor + var rowIdx = 0 + new Iterator[InternalRow] { + override def hasNext: Boolean = rowIdx < matrix.numRows + override def next(): InternalRow = { + var colIdx = 0 + val arr = new Array[Any](matrix.numCols) + while (colIdx < matrix.numCols) { + arr(colIdx) = matrix.values(rowIdx * matrix.numCols + colIdx) + colIdx += 1 + } + rowIdx += 1 + new GenericInternalRow(Array[Any](new GenericArrayData(arr))) + } + } + } +} + +case class ArrayToSparseVector(child: Expression) + extends UnaryExpression + with ImplicitCastInputTypes { + + override def inputTypes: Seq[SQLUtils.ADT] = Seq(ArrayType(DoubleType)) + override def dataType: DataType = ArrayToSparseVector.vectorType + override def nullSafeEval(input: Any): Any = ArrayToSparseVector.fromDoubleArray(input) + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + nullSafeCodeGen( + ctx, + ev, + c => { + s""" + |${ev.value} = + |io.projectglow.sql.expressions.ArrayToSparseVector.fromDoubleArray($c); + """.stripMargin + } + ) + } +} + +object ArrayToSparseVector { + lazy val vectorType = SQLUtils.newVectorUDT() + + def fromDoubleArray(input: Any): InternalRow = { + val vector = Vectors.dense(input.asInstanceOf[ArrayData].toDoubleArray()) + vectorType.serialize(vector.toSparse) + } +} + +case class ArrayToDenseVector(child: Expression) + extends UnaryExpression + with ImplicitCastInputTypes { + + override def inputTypes: Seq[SQLUtils.ADT] = Seq(ArrayType(DoubleType)) + override def dataType: DataType = ArrayToDenseVector.vectorType + override def nullSafeEval(input: Any): Any = ArrayToDenseVector.fromDoubleArray(input) + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + nullSafeCodeGen( + ctx, + ev, + c => { + s""" + |${ev.value} = + |io.projectglow.sql.expressions.ArrayToDenseVector.fromDoubleArray($c); + """.stripMargin + } + ) + } +} + +object ArrayToDenseVector { + private lazy val vectorType = SQLUtils.newVectorUDT() + + def fromDoubleArray(input: Any): InternalRow = { + val vector = Vectors.dense(input.asInstanceOf[ArrayData].toDoubleArray()) + vectorType.serialize(vector) + } +} + +case class VectorToArray(child: Expression) extends UnaryExpression with ImplicitCastInputTypes { + override def inputTypes: Seq[SQLUtils.ADT] = Seq(VectorToArray.vectorType) + override def dataType: DataType = ArrayType(DoubleType) + override def nullSafeEval(input: Any): Any = VectorToArray.toDoubleArray(input) + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + nullSafeCodeGen(ctx, ev, c => { + s""" + |${ev.value} = + |io.projectglow.sql.expressions.VectorToArray.toDoubleArray($c); + """.stripMargin + }) + } +} + +object VectorToArray { + lazy val vectorType = SQLUtils.newVectorUDT() + def toDoubleArray(input: Any): ArrayData = { + new GenericArrayData(vectorType.deserialize(input).toArray) + } +} diff --git 
a/core/src/main/scala/com/databricks/hls/sql/optimizer/hlsOptimizerRules.scala b/core/src/main/scala/io/projectglow/sql/optimizer/hlsOptimizerRules.scala similarity index 87% rename from core/src/main/scala/com/databricks/hls/sql/optimizer/hlsOptimizerRules.scala rename to core/src/main/scala/io/projectglow/sql/optimizer/hlsOptimizerRules.scala index 8de172074..2e4a3e844 100644 --- a/core/src/main/scala/com/databricks/hls/sql/optimizer/hlsOptimizerRules.scala +++ b/core/src/main/scala/io/projectglow/sql/optimizer/hlsOptimizerRules.scala @@ -1,11 +1,12 @@ -package com.databricks.hls.sql.optimizer +package io.projectglow.sql.optimizer import org.apache.spark.sql.catalyst.expressions.{CreateNamedStruct, GetStructField, Literal} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.databricks.hls.tertiary.AddStructFields import org.apache.spark.sql.types.StructType +import io.projectglow.sql.expressions.AddStructFields + /** * Simple optimization rule that handles expression rewrites */ diff --git a/core/src/main/scala/com/databricks/hls/sql/util/BGZFCodec.scala b/core/src/main/scala/io/projectglow/sql/util/BGZFCodec.scala similarity index 80% rename from core/src/main/scala/com/databricks/hls/sql/util/BGZFCodec.scala rename to core/src/main/scala/io/projectglow/sql/util/BGZFCodec.scala index 579764dd4..8add5f6e2 100644 --- a/core/src/main/scala/com/databricks/hls/sql/util/BGZFCodec.scala +++ b/core/src/main/scala/io/projectglow/sql/util/BGZFCodec.scala @@ -1,9 +1,9 @@ -package com.databricks.hls.sql.util +package io.projectglow.sql.util import java.io.OutputStream import org.apache.hadoop.io.compress.{CompressionOutputStream, Compressor} -import org.seqdoop.hadoop_bam.util.{BGZFCodec => HBBGZFCodec, DatabricksBGZFOutputStream} +import org.seqdoop.hadoop_bam.util.{DatabricksBGZFOutputStream, BGZFCodec => HBBGZFCodec} /** * A copy of Hadoop-BAM's BGZF codec that returns a Databricks BGZF output stream. 
diff --git a/core/src/main/scala/com/databricks/hls/sql/util/CodegenUtils.scala b/core/src/main/scala/io/projectglow/sql/util/CodegenUtils.scala similarity index 96% rename from core/src/main/scala/com/databricks/hls/sql/util/CodegenUtils.scala rename to core/src/main/scala/io/projectglow/sql/util/CodegenUtils.scala index 97c49a4ab..155997735 100644 --- a/core/src/main/scala/com/databricks/hls/sql/util/CodegenUtils.scala +++ b/core/src/main/scala/io/projectglow/sql/util/CodegenUtils.scala @@ -1,4 +1,4 @@ -package com.databricks.hls.sql.util +package io.projectglow.sql.util import java.util diff --git a/core/src/main/scala/com/databricks/sql/ComDatabricksDataSource.scala b/core/src/main/scala/io/projectglow/sql/util/ComDatabricksDataSource.scala similarity index 92% rename from core/src/main/scala/com/databricks/sql/ComDatabricksDataSource.scala rename to core/src/main/scala/io/projectglow/sql/util/ComDatabricksDataSource.scala index 8f8d28834..0ffb5853c 100644 --- a/core/src/main/scala/com/databricks/sql/ComDatabricksDataSource.scala +++ b/core/src/main/scala/io/projectglow/sql/util/ComDatabricksDataSource.scala @@ -1,4 +1,4 @@ -package com.databricks.sql +package io.projectglow.sql.util import org.apache.spark.sql.sources.DataSourceRegister diff --git a/core/src/main/scala/com/databricks/hls/sql/util/EncoderUtils.scala b/core/src/main/scala/io/projectglow/sql/util/EncoderUtils.scala similarity index 93% rename from core/src/main/scala/com/databricks/hls/sql/util/EncoderUtils.scala rename to core/src/main/scala/io/projectglow/sql/util/EncoderUtils.scala index 5677501bb..264847f83 100644 --- a/core/src/main/scala/com/databricks/hls/sql/util/EncoderUtils.scala +++ b/core/src/main/scala/io/projectglow/sql/util/EncoderUtils.scala @@ -1,11 +1,11 @@ -package com.databricks.hls.sql.util +package io.projectglow.sql.util import org.apache.spark.sql.Encoders import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions.NamedExpression import org.apache.spark.sql.types.StructType -import com.databricks.vcf.VCFRow +import io.projectglow.common.VCFRow object EncoderUtils { diff --git a/core/src/main/scala/com/databricks/hls/tertiary/ExpectsGenotypeFields.scala b/core/src/main/scala/io/projectglow/sql/util/ExpectsGenotypeFields.scala similarity index 98% rename from core/src/main/scala/com/databricks/hls/tertiary/ExpectsGenotypeFields.scala rename to core/src/main/scala/io/projectglow/sql/util/ExpectsGenotypeFields.scala index f317845fa..b8383f540 100644 --- a/core/src/main/scala/com/databricks/hls/tertiary/ExpectsGenotypeFields.scala +++ b/core/src/main/scala/io/projectglow/sql/util/ExpectsGenotypeFields.scala @@ -1,4 +1,4 @@ -package com.databricks.hls.tertiary +package io.projectglow.sql.util import org.apache.spark.sql.SQLUtils import org.apache.spark.sql.catalyst.analysis.TypeCheckResult diff --git a/core/src/main/scala/com/databricks/hls/sql/util/HadoopLineIterator.scala b/core/src/main/scala/io/projectglow/sql/util/HadoopLineIterator.scala similarity index 94% rename from core/src/main/scala/com/databricks/hls/sql/util/HadoopLineIterator.scala rename to core/src/main/scala/io/projectglow/sql/util/HadoopLineIterator.scala index 209ec6827..a6cc3b3e7 100644 --- a/core/src/main/scala/com/databricks/hls/sql/util/HadoopLineIterator.scala +++ b/core/src/main/scala/io/projectglow/sql/util/HadoopLineIterator.scala @@ -1,4 +1,4 @@ -package com.databricks.hls.sql.util +package io.projectglow.sql.util import java.io.Closeable import java.net.URI @@ -11,7 
+11,7 @@ import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader} import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl import org.apache.spark.sql.execution.datasources.RecordReaderIterator -import com.databricks.hls.common.HLSLogging +import io.projectglow.common.GlowLogging /** * Identical to [[org.apache.spark.sql.execution.datasources.HadoopFileLinesReader]], but @@ -27,7 +27,7 @@ class HadoopLineIterator( conf: Configuration) extends Iterator[Text] with Closeable - with HLSLogging { + with GlowLogging { private val iterator = { val fileSplit = new FileSplit( diff --git a/core/src/main/scala/com/databricks/hls/tertiary/util/KryoUtils.scala b/core/src/main/scala/io/projectglow/sql/util/KryoUtils.scala similarity index 89% rename from core/src/main/scala/com/databricks/hls/tertiary/util/KryoUtils.scala rename to core/src/main/scala/io/projectglow/sql/util/KryoUtils.scala index 8ca4f1ade..6ddb8003f 100644 --- a/core/src/main/scala/com/databricks/hls/tertiary/util/KryoUtils.scala +++ b/core/src/main/scala/io/projectglow/sql/util/KryoUtils.scala @@ -1,9 +1,7 @@ -package com.databricks.hls.tertiary.util +package io.projectglow.sql.util import java.io.{ByteArrayInputStream, ByteArrayOutputStream} -import scala.reflect.ClassTag - import com.esotericsoftware.kryo.Kryo import com.esotericsoftware.kryo.io.{Input, Output} diff --git a/core/src/main/scala/com/databricks/hls/tertiary/LeveneHaldane.scala b/core/src/main/scala/io/projectglow/sql/util/LeveneHaldane.scala similarity index 98% rename from core/src/main/scala/com/databricks/hls/tertiary/LeveneHaldane.scala rename to core/src/main/scala/io/projectglow/sql/util/LeveneHaldane.scala index a989fd792..9020d0a51 100644 --- a/core/src/main/scala/com/databricks/hls/tertiary/LeveneHaldane.scala +++ b/core/src/main/scala/io/projectglow/sql/util/LeveneHaldane.scala @@ -1,9 +1,9 @@ -package com.databricks.hls.tertiary +package io.projectglow.sql.util import org.apache.commons.math3.distribution.AbstractIntegerDistribution import org.apache.commons.math3.random.RandomGenerator -import com.databricks.hls.common.HailUtils._ +import io.projectglow.common.HailUtils._ /** Implementation pulled from Hail */ // scalastyle:off diff --git a/core/src/main/scala/com/databricks/sql/ManualRegionPartitioner.scala b/core/src/main/scala/io/projectglow/sql/util/ManualRegionPartitioner.scala similarity index 92% rename from core/src/main/scala/com/databricks/sql/ManualRegionPartitioner.scala rename to core/src/main/scala/io/projectglow/sql/util/ManualRegionPartitioner.scala index 5b39c3726..25f58ee20 100644 --- a/core/src/main/scala/com/databricks/sql/ManualRegionPartitioner.scala +++ b/core/src/main/scala/io/projectglow/sql/util/ManualRegionPartitioner.scala @@ -1,4 +1,4 @@ -package com.databricks.sql +package io.projectglow.sql.util import org.apache.spark.Partitioner diff --git a/core/src/main/scala/com/databricks/hls/sql/util/RowConverter.scala b/core/src/main/scala/io/projectglow/sql/util/RowConverter.scala similarity index 96% rename from core/src/main/scala/com/databricks/hls/sql/util/RowConverter.scala rename to core/src/main/scala/io/projectglow/sql/util/RowConverter.scala index 9669fda66..93327227f 100644 --- a/core/src/main/scala/com/databricks/hls/sql/util/RowConverter.scala +++ b/core/src/main/scala/io/projectglow/sql/util/RowConverter.scala @@ -1,4 +1,4 @@ -package com.databricks.hls.sql.util +package io.projectglow.sql.util import org.apache.spark.sql.catalyst.InternalRow import 
org.apache.spark.sql.catalyst.expressions.GenericInternalRow diff --git a/core/src/main/scala/com/databricks/hls/sql/util/SerializableConfiguration.scala b/core/src/main/scala/io/projectglow/sql/util/SerializableConfiguration.scala similarity index 92% rename from core/src/main/scala/com/databricks/hls/sql/util/SerializableConfiguration.scala rename to core/src/main/scala/io/projectglow/sql/util/SerializableConfiguration.scala index 473828958..538eacee0 100644 --- a/core/src/main/scala/com/databricks/hls/sql/util/SerializableConfiguration.scala +++ b/core/src/main/scala/io/projectglow/sql/util/SerializableConfiguration.scala @@ -1,4 +1,4 @@ -package com.databricks.hls.sql.util +package io.projectglow.sql.util import java.io.{ObjectInputStream, ObjectOutputStream} diff --git a/core/src/main/scala/com/databricks/hls/transformers/LiftOverVariantsTransformer.scala b/core/src/main/scala/io/projectglow/transformers/LiftOverVariantsTransformer.scala similarity index 96% rename from core/src/main/scala/com/databricks/hls/transformers/LiftOverVariantsTransformer.scala rename to core/src/main/scala/io/projectglow/transformers/LiftOverVariantsTransformer.scala index 94198e864..11f773fc8 100644 --- a/core/src/main/scala/com/databricks/hls/transformers/LiftOverVariantsTransformer.scala +++ b/core/src/main/scala/io/projectglow/transformers/LiftOverVariantsTransformer.scala @@ -1,8 +1,9 @@ -package com.databricks.hls.transformers +package io.projectglow.transformers + +import java.io.File import scala.collection.JavaConverters._ import scala.collection.mutable -import java.io.File import htsjdk.samtools.ValidationStringency import htsjdk.samtools.liftover.LiftOver @@ -10,18 +11,19 @@ import htsjdk.samtools.reference.{ReferenceSequence, ReferenceSequenceFileFactor import htsjdk.samtools.util.Interval import htsjdk.variant.variantcontext.VariantContext import htsjdk.variant.vcf._ -import org.apache.spark.sql.{DataFrame, SQLUtils} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection import org.apache.spark.sql.catalyst.expressions.{BoundReference, Literal, MutableProjection} import org.apache.spark.sql.types.{BooleanType, StringType, StructField, StructType} +import org.apache.spark.sql.{DataFrame, SQLUtils} import org.apache.spark.unsafe.types.UTF8String import picard.util.LiftoverUtils import picard.vcf.LiftoverVcf -import com.databricks.hls.DataFrameTransformer -import com.databricks.hls.common.HLSLogging -import com.databricks.hls.tertiary.LiftOverCoordinatesExpr -import com.databricks.vcf.{InternalRowToVariantContextConverter, VCFSchemaInferrer, VariantContextToInternalRowConverter, VariantSchemas} + +import io.projectglow.DataFrameTransformer +import io.projectglow.common.{GlowLogging, VariantSchemas} +import io.projectglow.sql.expressions.LiftOverCoordinatesExpr +import io.projectglow.vcf.{InternalRowToVariantContextConverter, VCFSchemaInferrer, VariantContextToInternalRowConverter} /** * Performs lift over from a variant on the reference sequence to a query sequence. 
Similar to @@ -111,7 +113,7 @@ class LiftOverVariantsTransformer extends DataFrameTransformer { } } -object LiftOverVariantsTransformer extends HLSLogging { +object LiftOverVariantsTransformer extends GlowLogging { val liftOverStatusColName = "liftOverStatus" val successColName = "success" val errorMessageColName = "errorMessage" diff --git a/core/src/main/scala/com/databricks/hls/transformers/normalizevariants/NormalizeVariantsTransformer.scala b/core/src/main/scala/io/projectglow/transformers/normalizevariants/NormalizeVariantsTransformer.scala similarity index 88% rename from core/src/main/scala/com/databricks/hls/transformers/normalizevariants/NormalizeVariantsTransformer.scala rename to core/src/main/scala/io/projectglow/transformers/normalizevariants/NormalizeVariantsTransformer.scala index 9ee494c12..dbb5663dc 100644 --- a/core/src/main/scala/com/databricks/hls/transformers/normalizevariants/NormalizeVariantsTransformer.scala +++ b/core/src/main/scala/io/projectglow/transformers/normalizevariants/NormalizeVariantsTransformer.scala @@ -1,12 +1,12 @@ -package com.databricks.hls.transformers.normalizevariants +package io.projectglow.transformers.normalizevariants -import com.databricks.hls.DataFrameTransformer -import com.databricks.hls.common.logging._ -import com.databricks.vcf._ import htsjdk.samtools.ValidationStringency import org.apache.spark.sql.DataFrame -import com.databricks.hls.transformers.util.StringUtils +import io.projectglow.DataFrameTransformer +import io.projectglow.common.logging.{HlsMetricDefinitions, HlsTagDefinitions, HlsTagValues, HlsUsageLogging} +import io.projectglow.transformers.util.StringUtils +import io.projectglow.vcf.VCFOptionParser /** * Implements DataFrameTransformer to transform the input DataFrame of varaints to an output @@ -78,7 +78,7 @@ class NormalizeVariantsTransformer extends DataFrameTransformer with HlsUsageLog } } -private[databricks] object NormalizeVariantsTransformer extends HlsUsageLogging { +private[projectglow] object NormalizeVariantsTransformer extends HlsUsageLogging { private val MODE_KEY = "mode" val MODE_NORMALIZE = "normalize" val MODE_SPLIT_NORMALIZE = "split_and_normalize" diff --git a/core/src/main/scala/com/databricks/hls/transformers/normalizevariants/VariantNormalizer.scala b/core/src/main/scala/io/projectglow/transformers/normalizevariants/VariantNormalizer.scala similarity index 97% rename from core/src/main/scala/com/databricks/hls/transformers/normalizevariants/VariantNormalizer.scala rename to core/src/main/scala/io/projectglow/transformers/normalizevariants/VariantNormalizer.scala index 89945c77c..0679645bb 100644 --- a/core/src/main/scala/com/databricks/hls/transformers/normalizevariants/VariantNormalizer.scala +++ b/core/src/main/scala/io/projectglow/transformers/normalizevariants/VariantNormalizer.scala @@ -1,25 +1,25 @@ -package com.databricks.hls.transformers.normalizevariants +package io.projectglow.transformers.normalizevariants import java.io.File import java.nio.file.Paths + import scala.collection.JavaConverters._ +import scala.math.min -import com.databricks.hls.common.HLSLogging -import com.databricks.vcf.{InternalRowToVariantContextConverter, VCFFileWriter, VariantContextToInternalRowConverter} import com.google.common.annotations.VisibleForTesting import htsjdk.samtools.ValidationStringency import htsjdk.variant.variantcontext._ import htsjdk.variant.vcf.VCFHeader import org.apache.spark.sql.{DataFrame, SQLUtils} -import org.apache.spark.sql.functions.lit import 
org.broadinstitute.hellbender.engine.{ReferenceContext, ReferenceDataSource} import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeAssignmentMethod import org.broadinstitute.hellbender.utils.SimpleInterval import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils -import scala.math.min +import io.projectglow.common.GlowLogging +import io.projectglow.vcf.{InternalRowToVariantContextConverter, VCFFileWriter, VariantContextToInternalRowConverter} -private[databricks] object VariantNormalizer extends HLSLogging { +private[projectglow] object VariantNormalizer extends GlowLogging { /** * Normalizes the input DataFrame of variants and outputs them as a Dataframe; Optionally diff --git a/core/src/main/scala/com/databricks/hls/transformers/pipe/CSVInputFormatter.scala b/core/src/main/scala/io/projectglow/transformers/pipe/CSVInputFormatter.scala similarity index 96% rename from core/src/main/scala/com/databricks/hls/transformers/pipe/CSVInputFormatter.scala rename to core/src/main/scala/io/projectglow/transformers/pipe/CSVInputFormatter.scala index 743febd9e..bc2a589cf 100644 --- a/core/src/main/scala/com/databricks/hls/transformers/pipe/CSVInputFormatter.scala +++ b/core/src/main/scala/io/projectglow/transformers/pipe/CSVInputFormatter.scala @@ -1,4 +1,4 @@ -package com.databricks.hls.transformers.pipe +package io.projectglow.transformers.pipe import java.io.{OutputStream, PrintWriter} diff --git a/core/src/main/scala/com/databricks/hls/transformers/pipe/CSVOutputFormatter.scala b/core/src/main/scala/io/projectglow/transformers/pipe/CSVOutputFormatter.scala similarity index 91% rename from core/src/main/scala/com/databricks/hls/transformers/pipe/CSVOutputFormatter.scala rename to core/src/main/scala/io/projectglow/transformers/pipe/CSVOutputFormatter.scala index f6e14ad39..201d629da 100644 --- a/core/src/main/scala/com/databricks/hls/transformers/pipe/CSVOutputFormatter.scala +++ b/core/src/main/scala/io/projectglow/transformers/pipe/CSVOutputFormatter.scala @@ -1,4 +1,4 @@ -package com.databricks.hls.transformers.pipe +package io.projectglow.transformers.pipe import java.io.InputStream @@ -6,7 +6,7 @@ import scala.collection.JavaConverters._ import com.univocity.parsers.csv.CsvParser import org.apache.commons.io.IOUtils -import org.apache.spark.sql.execution.datasources.csv.{CSVDataSourceUtils, CSVOptions, CSVUtils, UnivocityParser, UnivocityParserUtils} +import org.apache.spark.sql.execution.datasources.csv._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{StringType, StructField, StructType} diff --git a/core/src/main/scala/com/databricks/hls/transformers/pipe/CleanupPipeTransformer.scala b/core/src/main/scala/io/projectglow/transformers/pipe/CleanupPipeTransformer.scala similarity index 74% rename from core/src/main/scala/com/databricks/hls/transformers/pipe/CleanupPipeTransformer.scala rename to core/src/main/scala/io/projectglow/transformers/pipe/CleanupPipeTransformer.scala index dcd91674e..e3c1944d6 100644 --- a/core/src/main/scala/com/databricks/hls/transformers/pipe/CleanupPipeTransformer.scala +++ b/core/src/main/scala/io/projectglow/transformers/pipe/CleanupPipeTransformer.scala @@ -1,8 +1,8 @@ -package com.databricks.hls.transformers.pipe +package io.projectglow.transformers.pipe import org.apache.spark.sql.DataFrame -import com.databricks.hls.DataFrameTransformer +import io.projectglow.DataFrameTransformer class CleanupPipeTransformer extends DataFrameTransformer { override def name: String = "pipe_cleanup" diff --git 
a/core/src/main/scala/com/databricks/hls/transformers/pipe/PipeTransformer.scala b/core/src/main/scala/io/projectglow/transformers/pipe/PipeTransformer.scala similarity index 95% rename from core/src/main/scala/com/databricks/hls/transformers/pipe/PipeTransformer.scala rename to core/src/main/scala/io/projectglow/transformers/pipe/PipeTransformer.scala index c40412fda..8af9d2b01 100644 --- a/core/src/main/scala/com/databricks/hls/transformers/pipe/PipeTransformer.scala +++ b/core/src/main/scala/io/projectglow/transformers/pipe/PipeTransformer.scala @@ -1,4 +1,4 @@ -package com.databricks.hls.transformers.pipe +package io.projectglow.transformers.pipe import java.io.{Closeable, InputStream, OutputStream} import java.util.ServiceLoader @@ -10,10 +10,10 @@ import com.fasterxml.jackson.module.scala.DefaultScalaModule import org.apache.spark.sql.DataFrame import org.apache.spark.sql.catalyst.InternalRow -import com.databricks.hls.DataFrameTransformer -import com.databricks.hls.common.Named -import com.databricks.hls.common.logging._ -import com.databricks.hls.transformers.util.SnakeCaseMap +import io.projectglow.DataFrameTransformer +import io.projectglow.common.Named +import io.projectglow.common.logging._ +import io.projectglow.transformers.util.SnakeCaseMap class PipeTransformer extends DataFrameTransformer with HlsUsageLogging { override def name: String = "pipe" diff --git a/core/src/main/scala/com/databricks/hls/transformers/pipe/Piper.scala b/core/src/main/scala/io/projectglow/transformers/pipe/Piper.scala similarity index 96% rename from core/src/main/scala/com/databricks/hls/transformers/pipe/Piper.scala rename to core/src/main/scala/io/projectglow/transformers/pipe/Piper.scala index 6dd05867f..145b3c464 100644 --- a/core/src/main/scala/com/databricks/hls/transformers/pipe/Piper.scala +++ b/core/src/main/scala/io/projectglow/transformers/pipe/Piper.scala @@ -1,4 +1,4 @@ -package com.databricks.hls.transformers.pipe +package io.projectglow.transformers.pipe import java.io._ import java.util.concurrent.atomic.AtomicReference @@ -13,7 +13,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SQLUtils} -import com.databricks.hls.common.HLSLogging +import io.projectglow.common.GlowLogging /** * Based on Spark's PipedRDD with the following modifications: @@ -21,7 +21,7 @@ import com.databricks.hls.common.HLSLogging * - Use the input and output formatters to determine output schema * - Use the input and output formatters to return a DataFrame */ -private[databricks] object Piper extends HLSLogging { +private[projectglow] object Piper extends GlowLogging { private val cachedRdds = mutable.ListBuffer[RDD[_]]() def clearCache(): Unit = cachedRdds.synchronized { @@ -75,12 +75,12 @@ private[databricks] object Piper extends HLSLogging { } } -private[databricks] class ProcessHelper( +private[projectglow] class ProcessHelper( cmd: Seq[String], environment: Map[String, String], inputFn: OutputStream => Unit, context: TaskContext) - extends HLSLogging { + extends GlowLogging { private val childThreadException = new AtomicReference[Throwable](null) private var process: Process = _ diff --git a/core/src/main/scala/com/databricks/hls/transformers/pipe/UTF8TextInputFormatter.scala b/core/src/main/scala/io/projectglow/transformers/pipe/UTF8TextInputFormatter.scala similarity index 96% rename from core/src/main/scala/com/databricks/hls/transformers/pipe/UTF8TextInputFormatter.scala rename to 
core/src/main/scala/io/projectglow/transformers/pipe/UTF8TextInputFormatter.scala index eeaebc76c..8746dcadf 100644 --- a/core/src/main/scala/com/databricks/hls/transformers/pipe/UTF8TextInputFormatter.scala +++ b/core/src/main/scala/io/projectglow/transformers/pipe/UTF8TextInputFormatter.scala @@ -1,4 +1,4 @@ -package com.databricks.hls.transformers.pipe +package io.projectglow.transformers.pipe import java.io.{OutputStream, PrintWriter} diff --git a/core/src/main/scala/com/databricks/hls/transformers/pipe/UTF8TextOutputFormatter.scala b/core/src/main/scala/io/projectglow/transformers/pipe/UTF8TextOutputFormatter.scala similarity index 95% rename from core/src/main/scala/com/databricks/hls/transformers/pipe/UTF8TextOutputFormatter.scala rename to core/src/main/scala/io/projectglow/transformers/pipe/UTF8TextOutputFormatter.scala index 575c9ea4d..9461a59ff 100644 --- a/core/src/main/scala/com/databricks/hls/transformers/pipe/UTF8TextOutputFormatter.scala +++ b/core/src/main/scala/io/projectglow/transformers/pipe/UTF8TextOutputFormatter.scala @@ -1,9 +1,9 @@ -package com.databricks.hls.transformers.pipe - -import scala.collection.JavaConverters._ +package io.projectglow.transformers.pipe import java.io.InputStream +import scala.collection.JavaConverters._ + import org.apache.commons.io.IOUtils import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.types.{StringType, StructField, StructType} diff --git a/core/src/main/scala/com/databricks/hls/transformers/util/StringUtils.scala b/core/src/main/scala/io/projectglow/transformers/util/StringUtils.scala similarity index 95% rename from core/src/main/scala/com/databricks/hls/transformers/util/StringUtils.scala rename to core/src/main/scala/io/projectglow/transformers/util/StringUtils.scala index 503b44289..07768d915 100644 --- a/core/src/main/scala/com/databricks/hls/transformers/util/StringUtils.scala +++ b/core/src/main/scala/io/projectglow/transformers/util/StringUtils.scala @@ -1,4 +1,4 @@ -package com.databricks.hls.transformers.util +package io.projectglow.transformers.util object StringUtils { // Matches all capital letters except at beginning of string (to allow UpperCamelCase) diff --git a/core/src/main/scala/com/databricks/vcf/BigVCFDatasource.scala b/core/src/main/scala/io/projectglow/vcf/BigVCFDatasource.scala similarity index 89% rename from core/src/main/scala/com/databricks/vcf/BigVCFDatasource.scala rename to core/src/main/scala/io/projectglow/vcf/BigVCFDatasource.scala index b9169d950..3da149eaf 100644 --- a/core/src/main/scala/com/databricks/vcf/BigVCFDatasource.scala +++ b/core/src/main/scala/io/projectglow/vcf/BigVCFDatasource.scala @@ -1,4 +1,4 @@ -package com.databricks.vcf +package io.projectglow.vcf import java.io.ByteArrayOutputStream @@ -9,9 +9,9 @@ import org.apache.spark.sql.DataFrame import org.apache.spark.sql.sources.DataSourceRegister import org.seqdoop.hadoop_bam.util.DatabricksBGZFOutputStream -import com.databricks.hls.common.logging._ -import com.databricks.hls.sql.util.SerializableConfiguration -import com.databricks.sql.{BigFileDatasource, ComDatabricksDataSource} +import io.projectglow.common.logging.{HlsMetricDefinitions, HlsTagDefinitions, HlsTagValues, HlsUsageLogging} +import io.projectglow.sql.BigFileDatasource +import io.projectglow.sql.util.{ComDatabricksDataSource, SerializableConfiguration} class BigVCFDatasource extends BigFileDatasource with DataSourceRegister { diff --git a/core/src/main/scala/com/databricks/vcf/InternalRowToVariantContextConverter.scala 
b/core/src/main/scala/io/projectglow/vcf/InternalRowToVariantContextConverter.scala similarity index 98% rename from core/src/main/scala/com/databricks/vcf/InternalRowToVariantContextConverter.scala rename to core/src/main/scala/io/projectglow/vcf/InternalRowToVariantContextConverter.scala index cf4df8028..87ca19317 100644 --- a/core/src/main/scala/com/databricks/vcf/InternalRowToVariantContextConverter.scala +++ b/core/src/main/scala/io/projectglow/vcf/InternalRowToVariantContextConverter.scala @@ -1,4 +1,4 @@ -package com.databricks.vcf +package io.projectglow.vcf import java.util.{ArrayList => JArrayList} @@ -13,7 +13,8 @@ import org.apache.spark.sql.SQLUtils.structFieldsEqualExceptNullability import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.ArrayData import org.apache.spark.sql.types.{ArrayType, StructField, StructType} -import com.databricks.hls.common.{HLSLogging, HasStringency} + +import io.projectglow.common.{GenotypeFields, GlowLogging, HasStringency, VariantSchemas} /** * Converts internal rows with the provided schema into HTSJDK variant context. @@ -30,11 +31,11 @@ class InternalRowToVariantContextConverter( rowSchema: StructType, headerLineSet: Set[VCFHeaderLine], val stringency: ValidationStringency) - extends HLSLogging + extends GlowLogging with HasStringency with Serializable { - import VariantSchemas._ - import ConverterUtils._ + import io.projectglow.common.ConverterUtils._ + import io.projectglow.common.VariantSchemas._ private val alleles = scala.collection.mutable.ArrayBuffer[Allele]() private val genotypeSchema = rowSchema diff --git a/core/src/main/scala/com/databricks/vcf/LineIteratorImpl.scala b/core/src/main/scala/io/projectglow/vcf/LineIteratorImpl.scala similarity index 95% rename from core/src/main/scala/com/databricks/vcf/LineIteratorImpl.scala rename to core/src/main/scala/io/projectglow/vcf/LineIteratorImpl.scala index ec3407dd2..9abe563ea 100644 --- a/core/src/main/scala/com/databricks/vcf/LineIteratorImpl.scala +++ b/core/src/main/scala/io/projectglow/vcf/LineIteratorImpl.scala @@ -1,4 +1,4 @@ -package com.databricks.vcf +package io.projectglow.vcf import htsjdk.tribble.readers.LineIterator diff --git a/core/src/main/scala/com/databricks/vcf/TabixIndexHelper.scala b/core/src/main/scala/io/projectglow/vcf/TabixIndexHelper.scala similarity index 98% rename from core/src/main/scala/com/databricks/vcf/TabixIndexHelper.scala rename to core/src/main/scala/io/projectglow/vcf/TabixIndexHelper.scala index 25dd16d5e..749225bf0 100644 --- a/core/src/main/scala/com/databricks/vcf/TabixIndexHelper.scala +++ b/core/src/main/scala/io/projectglow/vcf/TabixIndexHelper.scala @@ -1,19 +1,19 @@ -package com.databricks.vcf +package io.projectglow.vcf import java.io.File import java.nio.file.Paths import scala.collection.JavaConverters._ + +import com.google.common.annotations.VisibleForTesting import htsjdk.tribble.index.tabix._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.sql.execution.datasources._ -import org.apache.spark.sql.sources.Filter -import org.apache.spark.sql.sources._ +import org.apache.spark.sql.sources.{Filter, _} import org.broadinstitute.hellbender.utils.SimpleInterval -import com.databricks.hls.common.HLSLogging -import com.databricks.hls.common.WithUtils -import com.google.common.annotations.VisibleForTesting + +import io.projectglow.common.{GlowLogging, WithUtils} /** An extended Contig class used by filter parser that keeps an 
Option(contigName) * updated under And and Or operations and provides other required functionalities @@ -130,7 +130,7 @@ case class ParsedFilterResult( endInterval: FilterInterval) /** Contains filter parsing tools and other tools used to apply tabix index */ -object TabixIndexHelper extends HLSLogging { +object TabixIndexHelper extends GlowLogging { /** * Parses filters provided by spark sql parser to generate the ParsedFilterResult=(contig, diff --git a/core/src/main/scala/com/databricks/vcf/VCFFileFormat.scala b/core/src/main/scala/io/projectglow/vcf/VCFFileFormat.scala similarity index 97% rename from core/src/main/scala/com/databricks/vcf/VCFFileFormat.scala rename to core/src/main/scala/io/projectglow/vcf/VCFFileFormat.scala index 971407232..2b2d6bb33 100644 --- a/core/src/main/scala/com/databricks/vcf/VCFFileFormat.scala +++ b/core/src/main/scala/io/projectglow/vcf/VCFFileFormat.scala @@ -1,9 +1,9 @@ -package com.databricks.vcf - -import scala.collection.JavaConverters._ +package io.projectglow.vcf import java.io.BufferedInputStream +import scala.collection.JavaConverters._ + import com.google.common.util.concurrent.Striped import htsjdk.samtools.ValidationStringency import htsjdk.samtools.util.{BlockCompressedInputStream, OverlapDetector} @@ -23,14 +23,13 @@ import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.sources.{DataSourceRegister, Filter} import org.apache.spark.sql.types._ import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeAssignmentMethod -import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils import org.broadinstitute.hellbender.utils.SimpleInterval +import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils import org.seqdoop.hadoop_bam.util.{BGZFEnhancedGzipCodec, DatabricksBGZFOutputStream} -import com.databricks.hls.common.{HLSLogging, WithUtils} -import com.databricks.hls.common.logging._ -import com.databricks.hls.sql.util.{HadoopLineIterator, SerializableConfiguration} -import com.databricks.sql.ComDatabricksDataSource +import io.projectglow.common.logging.{HlsMetricDefinitions, HlsTagDefinitions, HlsTagValues, HlsUsageLogging} +import io.projectglow.common.{GlowLogging, VCFRow, WithUtils} +import io.projectglow.sql.util.{ComDatabricksDataSource, HadoopLineIterator, SerializableConfiguration} class VCFFileFormat extends TextBasedFileFormat with DataSourceRegister with HlsUsageLogging { var codecFactory: CompressionCodecFactory = _ @@ -233,7 +232,7 @@ object VCFFileFormat { def hadoopConfWithBGZ(conf: Configuration): Configuration = { val toReturn = new Configuration(conf) val bgzCodecs = Seq( - "com.databricks.hls.sql.util.BGZFCodec", + "io.projectglow.sql.util.BGZFCodec", "org.seqdoop.hadoop_bam.util.BGZFEnhancedGzipCodec" ) val codecs = toReturn @@ -293,7 +292,7 @@ private[vcf] class VCFIterator( codec: VCFCodec, filteredSimpleInterval: SimpleInterval) extends AbstractVCFIterator(codec) - with HLSLogging { + with GlowLogging { // filteredSimpleInterval is the SimpleInterval containing the contig and interval generated by // the filter parser to be checked for overlap by overlap detector. 
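[Editor's aside — not part of the patch] One concrete piece of the VCFFileFormat changes above is hadoopConfWithBGZ, which registers the renamed BGZF codec under io.compression.codecs. A minimal sketch of that registration pattern follows; the helper name is illustrative and, because the hunk is truncated after "val codecs = toReturn", the exact handling of pre-existing codecs is an assumption. The codec class names are taken from the diff.

import org.apache.hadoop.conf.Configuration

object BgzfCodecConfSketch {
  // Append the BGZF-aware codecs to io.compression.codecs so Hadoop input streams
  // can decode block-gzipped VCFs; codecs already configured are kept (assumed behavior).
  def withBgzCodecs(conf: Configuration): Configuration = {
    val out = new Configuration(conf)
    val bgzCodecs = Seq(
      "io.projectglow.sql.util.BGZFCodec",
      "org.seqdoop.hadoop_bam.util.BGZFEnhancedGzipCodec")
    val existing = Option(out.get("io.compression.codecs")).toSeq.filter(_.nonEmpty)
    out.set("io.compression.codecs", (existing ++ bgzCodecs).mkString(","))
    out
  }
}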
@@ -360,7 +359,7 @@ private[vcf] class VCFIterator( private[vcf] class SplitVCFIterator(baseIterator: VCFIterator) extends AbstractVCFIterator(baseIterator.getCodec) - with HLSLogging { + with GlowLogging { private var isSplit: Boolean = false private var nextVC: VariantContext = _ // nextVC always holds the nextVC to be passed by the @@ -519,7 +518,7 @@ private[vcf] object SchemaDelegate { } -private[databricks] class VCFOutputWriterFactory(options: Map[String, String]) +private[projectglow] class VCFOutputWriterFactory(options: Map[String, String]) extends OutputWriterFactory { override def newInstance( @@ -541,14 +540,14 @@ private[databricks] class VCFOutputWriterFactory(options: Map[String, String]) } } -private[databricks] object VCFOptionParser { +private[projectglow] object VCFOptionParser { def getValidationStringency(options: Map[String, String]): ValidationStringency = { val stringency = options.getOrElse(VCFOption.VALIDATION_STRINGENCY, "SILENT").toUpperCase ValidationStringency.valueOf(stringency) } } -private[databricks] object VCFOption { +object VCFOption { // Reader-only options val FLATTEN_INFO_FIELDS = "flattenInfoFields" val INCLUDE_SAMPLE_IDS = "includeSampleIds" diff --git a/core/src/main/scala/com/databricks/vcf/VCFFileWriter.scala b/core/src/main/scala/io/projectglow/vcf/VCFFileWriter.scala similarity index 95% rename from core/src/main/scala/com/databricks/vcf/VCFFileWriter.scala rename to core/src/main/scala/io/projectglow/vcf/VCFFileWriter.scala index 8a78b2f99..1a6e0972e 100644 --- a/core/src/main/scala/com/databricks/vcf/VCFFileWriter.scala +++ b/core/src/main/scala/io/projectglow/vcf/VCFFileWriter.scala @@ -1,4 +1,4 @@ -package com.databricks.vcf +package io.projectglow.vcf import java.io.{OutputStream, StringReader} import java.net.{URI, URISyntaxException} @@ -6,18 +6,18 @@ import java.net.{URI, URISyntaxException} import scala.collection.JavaConverters._ import scala.util.control.NonFatal +import com.google.common.annotations.VisibleForTesting import htsjdk.variant.vcf._ import org.apache.commons.io.IOUtils +import org.apache.hadoop.conf.Configuration import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.OutputWriter -import com.google.common.annotations.VisibleForTesting -import org.apache.hadoop.conf.Configuration import org.apache.spark.sql.types.StructType import org.bdgenomics.adam.rdd.VCFMetadataLoader -import com.databricks.hls.common.HLSLogging +import io.projectglow.common.GlowLogging -object VCFFileWriter extends HLSLogging { +object VCFFileWriter extends GlowLogging { val VCF_HEADER_KEY = "vcfHeader" private val INFER_HEADER = "infer" @@ -46,7 +46,7 @@ object VCFFileWriter extends HLSLogging { * If reading a VCF header from a string or a file, the sample IDs are returned. 
*/ @VisibleForTesting - private[databricks] def parseHeaderLinesAndSamples( + private[projectglow] def parseHeaderLinesAndSamples( options: Map[String, String], defaultHeader: Option[String], schema: StructType, @@ -86,7 +86,7 @@ class VCFFileWriter( stream: OutputStream, writeHeader: Boolean) extends OutputWriter - with HLSLogging { + with GlowLogging { private val DEFAULT_VCF_WRITER_HEADER = "infer" private val (headerLineSet, providedSampleIds) = diff --git a/core/src/main/scala/com/databricks/vcf/VCFHeaderLoader.scala b/core/src/main/scala/io/projectglow/vcf/VCFHeaderLoader.scala similarity index 97% rename from core/src/main/scala/com/databricks/vcf/VCFHeaderLoader.scala rename to core/src/main/scala/io/projectglow/vcf/VCFHeaderLoader.scala index 7fa320ed7..643f39cdc 100644 --- a/core/src/main/scala/com/databricks/vcf/VCFHeaderLoader.scala +++ b/core/src/main/scala/io/projectglow/vcf/VCFHeaderLoader.scala @@ -1,4 +1,4 @@ -package com.databricks.vcf +package io.projectglow.vcf import htsjdk.variant.vcf.VCFHeader import org.apache.hadoop.conf.Configuration diff --git a/core/src/main/scala/com/databricks/vcf/VCFInputFormatter.scala b/core/src/main/scala/io/projectglow/vcf/VCFInputFormatter.scala similarity index 90% rename from core/src/main/scala/com/databricks/vcf/VCFInputFormatter.scala rename to core/src/main/scala/io/projectglow/vcf/VCFInputFormatter.scala index d47a69adf..4daa2b726 100644 --- a/core/src/main/scala/com/databricks/vcf/VCFInputFormatter.scala +++ b/core/src/main/scala/io/projectglow/vcf/VCFInputFormatter.scala @@ -1,4 +1,4 @@ -package com.databricks.vcf +package io.projectglow.vcf import java.io.OutputStream @@ -7,8 +7,8 @@ import scala.collection.JavaConverters._ import org.apache.spark.sql.DataFrame import org.apache.spark.sql.catalyst.InternalRow -import com.databricks.hls.common.HLSLogging -import com.databricks.hls.transformers.pipe.{InputFormatter, InputFormatterFactory} +import io.projectglow.common.GlowLogging +import io.projectglow.transformers.pipe.{InputFormatter, InputFormatterFactory} /** * An input formatter that writes rows as VCF records. 
@@ -17,7 +17,7 @@ class VCFInputFormatter( converter: InternalRowToVariantContextConverter, providedSampleIds: Option[Seq[String]]) extends InputFormatter - with HLSLogging { + with GlowLogging { private var writer: VCFStreamWriter = _ private var stream: OutputStream = _ diff --git a/core/src/main/scala/com/databricks/vcf/VCFOutputFormatter.scala b/core/src/main/scala/io/projectglow/vcf/VCFOutputFormatter.scala similarity index 86% rename from core/src/main/scala/com/databricks/vcf/VCFOutputFormatter.scala rename to core/src/main/scala/io/projectglow/vcf/VCFOutputFormatter.scala index 3e7fba79b..e0086f55c 100644 --- a/core/src/main/scala/com/databricks/vcf/VCFOutputFormatter.scala +++ b/core/src/main/scala/io/projectglow/vcf/VCFOutputFormatter.scala @@ -1,4 +1,4 @@ -package com.databricks.vcf +package io.projectglow.vcf import java.io.InputStream @@ -7,12 +7,11 @@ import htsjdk.tribble.readers.{AsciiLineReader, AsciiLineReaderIterator} import htsjdk.variant.vcf.{VCFCodec, VCFHeader} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.UnsafeProjection -import org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection -import com.databricks.hls.common.HLSLogging -import com.databricks.hls.transformers.pipe.{OutputFormatter, OutputFormatterFactory} +import io.projectglow.common.GlowLogging +import io.projectglow.transformers.pipe.{OutputFormatter, OutputFormatterFactory} -class VCFOutputFormatter extends OutputFormatter with HLSLogging { +class VCFOutputFormatter extends OutputFormatter with GlowLogging { override def makeIterator(stream: InputStream): Iterator[Any] = { val codec = new VCFCodec val lineIterator = new AsciiLineReaderIterator(AsciiLineReader.from(stream)) diff --git a/core/src/main/scala/com/databricks/vcf/VCFRowHeaderLines.scala b/core/src/main/scala/io/projectglow/vcf/VCFRowHeaderLines.scala similarity index 98% rename from core/src/main/scala/com/databricks/vcf/VCFRowHeaderLines.scala rename to core/src/main/scala/io/projectglow/vcf/VCFRowHeaderLines.scala index 7e7c6557c..9c15271a5 100644 --- a/core/src/main/scala/com/databricks/vcf/VCFRowHeaderLines.scala +++ b/core/src/main/scala/io/projectglow/vcf/VCFRowHeaderLines.scala @@ -1,4 +1,4 @@ -package com.databricks.vcf +package io.projectglow.vcf import htsjdk.variant.vcf._ diff --git a/core/src/main/scala/com/databricks/vcf/VCFRowToVariantContextConverter.scala b/core/src/main/scala/io/projectglow/vcf/VCFRowToVariantContextConverter.scala similarity index 93% rename from core/src/main/scala/com/databricks/vcf/VCFRowToVariantContextConverter.scala rename to core/src/main/scala/io/projectglow/vcf/VCFRowToVariantContextConverter.scala index c6e02b957..a88fad48f 100644 --- a/core/src/main/scala/com/databricks/vcf/VCFRowToVariantContextConverter.scala +++ b/core/src/main/scala/io/projectglow/vcf/VCFRowToVariantContextConverter.scala @@ -1,4 +1,4 @@ -package com.databricks.vcf +package io.projectglow.vcf import scala.collection.JavaConverters._ @@ -6,6 +6,8 @@ import htsjdk.samtools.ValidationStringency import htsjdk.variant.variantcontext.{VariantContext => HtsjdkVariantContext} import htsjdk.variant.vcf.VCFHeader +import io.projectglow.common.VCFRow + /** * VCFRow -> HTSJDK VariantContext * Under the hood, this class relies on a [[InternalRowToVariantContextConverter]]. 
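[Editor's aside — not part of the patch] The VCFOutputFormatter change above keeps the same read path: an AsciiLineReaderIterator over the raw pipe stream and a VCFCodec that first consumes the header and then decodes one record per line. A small self-contained sketch of that HTSJDK pattern (the object name and the toy sites-only record are illustrative, not Glow code):

import java.io.ByteArrayInputStream

import htsjdk.tribble.readers.{AsciiLineReader, AsciiLineReaderIterator}
import htsjdk.variant.vcf.VCFCodec

object VcfStreamReadSketch {
  def main(args: Array[String]): Unit = {
    val vcf =
      "##fileformat=VCFv4.2\n" +
        Seq("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO").mkString("\t") + "\n" +
        Seq("1", "100", ".", "A", "T", "50", "PASS", ".").mkString("\t") + "\n"
    val codec = new VCFCodec
    val lines = new AsciiLineReaderIterator(
      AsciiLineReader.from(new ByteArrayInputStream(vcf.getBytes)))
    codec.readActualHeader(lines)           // consumes the ## and #CHROM header block
    while (lines.hasNext) {
      val vc = codec.decode(lines.next())   // one htsjdk VariantContext per remaining data line
      println(s"${vc.getContig}:${vc.getStart} ${vc.getReference.getBaseString}")
    }
  }
}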
diff --git a/core/src/main/scala/com/databricks/vcf/VCFSchemaInferrer.scala b/core/src/main/scala/io/projectglow/vcf/VCFSchemaInferrer.scala similarity index 99% rename from core/src/main/scala/com/databricks/vcf/VCFSchemaInferrer.scala rename to core/src/main/scala/io/projectglow/vcf/VCFSchemaInferrer.scala index b2c4a75ec..acfc5e81c 100644 --- a/core/src/main/scala/com/databricks/vcf/VCFSchemaInferrer.scala +++ b/core/src/main/scala/io/projectglow/vcf/VCFSchemaInferrer.scala @@ -1,4 +1,4 @@ -package com.databricks.vcf +package io.projectglow.vcf import scala.collection.JavaConverters._ @@ -6,6 +6,8 @@ import htsjdk.variant.vcf._ import org.apache.commons.lang3.math.NumberUtils import org.apache.spark.sql.types._ +import io.projectglow.common.{GenotypeFields, VariantSchemas} + /** * Infers the schema of a VCF file from its headers. */ diff --git a/core/src/main/scala/com/databricks/vcf/VCFStreamWriter.scala b/core/src/main/scala/io/projectglow/vcf/VCFStreamWriter.scala similarity index 99% rename from core/src/main/scala/com/databricks/vcf/VCFStreamWriter.scala rename to core/src/main/scala/io/projectglow/vcf/VCFStreamWriter.scala index 6217c7707..7fc3bb252 100644 --- a/core/src/main/scala/com/databricks/vcf/VCFStreamWriter.scala +++ b/core/src/main/scala/io/projectglow/vcf/VCFStreamWriter.scala @@ -1,10 +1,11 @@ -package com.databricks.vcf +package io.projectglow.vcf -import scala.collection.JavaConverters._ import java.io.{Closeable, OutputStream} -import htsjdk.variant.variantcontext.{GenotypeBuilder, VariantContext, VariantContextBuilder} +import scala.collection.JavaConverters._ + import htsjdk.variant.variantcontext.writer.{Options, VariantContextWriter, VariantContextWriterBuilder} +import htsjdk.variant.variantcontext.{GenotypeBuilder, VariantContext, VariantContextBuilder} import htsjdk.variant.vcf.{VCFHeader, VCFHeaderLine} /** diff --git a/core/src/main/scala/com/databricks/vcf/VariantContextToInternalRowConverter.scala b/core/src/main/scala/io/projectglow/vcf/VariantContextToInternalRowConverter.scala similarity index 98% rename from core/src/main/scala/com/databricks/vcf/VariantContextToInternalRowConverter.scala rename to core/src/main/scala/io/projectglow/vcf/VariantContextToInternalRowConverter.scala index a9cc51411..051d0d1bb 100644 --- a/core/src/main/scala/com/databricks/vcf/VariantContextToInternalRowConverter.scala +++ b/core/src/main/scala/io/projectglow/vcf/VariantContextToInternalRowConverter.scala @@ -1,4 +1,4 @@ -package com.databricks.vcf +package io.projectglow.vcf import java.util.{HashMap => JHashMap, List => JList, Map => JMap} @@ -16,8 +16,8 @@ import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String -import com.databricks.hls.common.{HLSLogging, HasStringency} -import com.databricks.hls.sql.util.RowConverter +import io.projectglow.common.{GenotypeFields, GlowLogging, HasStringency, VariantSchemas} +import io.projectglow.sql.util.RowConverter /** * Converts an HTSJDK variant context into a SparkSQL row with the provided schema. 
@@ -33,11 +33,11 @@ class VariantContextToInternalRowConverter( schema: StructType, val stringency: ValidationStringency, writeSampleIds: Boolean = true) - extends HLSLogging + extends GlowLogging with HasStringency with Serializable { - import VariantSchemas._ + import io.projectglow.common.VariantSchemas._ private val infoKeysParsedWithoutHeader = mutable.HashSet.empty[String] private val formatKeysParsedWithoutHeader = mutable.HashSet.empty[String] diff --git a/core/src/main/scala/com/databricks/vcf/VariantContextToVCFRowConverter.scala b/core/src/main/scala/io/projectglow/vcf/VariantContextToVCFRowConverter.scala similarity index 81% rename from core/src/main/scala/com/databricks/vcf/VariantContextToVCFRowConverter.scala rename to core/src/main/scala/io/projectglow/vcf/VariantContextToVCFRowConverter.scala index d8ace0c19..63aa75d43 100644 --- a/core/src/main/scala/com/databricks/vcf/VariantContextToVCFRowConverter.scala +++ b/core/src/main/scala/io/projectglow/vcf/VariantContextToVCFRowConverter.scala @@ -1,21 +1,18 @@ -package com.databricks.vcf +package io.projectglow.vcf import java.lang.{Boolean => JBoolean, Iterable => JIterable} -import java.util.{HashMap => JHashMap, Map => JMap} -import scala.collection.JavaConverters._ import scala.collection.mutable import htsjdk.samtools.ValidationStringency -import htsjdk.variant.variantcontext.{Allele, Genotype => HtsjdkGenotype, VariantContext => HtsjdkVariantContext} +import htsjdk.variant.variantcontext.{VariantContext => HtsjdkVariantContext} import htsjdk.variant.vcf.{VCFConstants, VCFHeader} -import org.apache.spark.sql.catalyst.util.ArrayData -import com.databricks.hls.common.HLSLogging +import io.projectglow.common.{GlowLogging, VCFRow} // HTSJDK VariantContext -> VCFRow // Based on the HTSJDK classes VCFEncoder and CommonInfo -private[databricks] object VariantContextToVCFRowConverter { +object VariantContextToVCFRowConverter { def parseObjectAsString(obj: Object): String = { obj match { @@ -48,11 +45,11 @@ private[databricks] object VariantContextToVCFRowConverter { } // HTSJDK VariantContext -> VCFRow -private[databricks] class VariantContextToVCFRowConverter( +class VariantContextToVCFRowConverter( vcfHeader: VCFHeader, stringency: ValidationStringency = ValidationStringency.LENIENT, includeSampleIds: Boolean = true) - extends HLSLogging + extends GlowLogging with Serializable { private val converter = new VariantContextToInternalRowConverter( diff --git a/core/src/main/scala/org/apache/spark/sql/SQLUtils.scala b/core/src/main/scala/org/apache/spark/sql/SQLUtils.scala index 40131ca06..13da58bea 100644 --- a/core/src/main/scala/org/apache/spark/sql/SQLUtils.scala +++ b/core/src/main/scala/org/apache/spark/sql/SQLUtils.scala @@ -1,12 +1,9 @@ package org.apache.spark.sql import org.apache.spark.TaskContext -import org.apache.spark.ml.linalg.{VectorUDT, Vectors} +import org.apache.spark.ml.linalg.{MatrixUDT, VectorUDT} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} -import org.apache.spark.sql.catalyst.expressions.{Expression, ImplicitCastInputTypes, UnaryExpression} -import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData} import org.apache.spark.sql.types._ object SQLUtils { @@ -46,76 +43,18 @@ object SQLUtils { def setTaskContext(context: TaskContext): Unit = { TaskContext.setTaskContext(context) } -} - -case class ArrayToSparseVector(child: Expression) - extends UnaryExpression - with 
ImplicitCastInputTypes { - - override def inputTypes: Seq[AbstractDataType] = Seq(ArrayType(DoubleType)) - override def dataType: DataType = ArrayToSparseVector.vectorType - override def nullSafeEval(input: Any): Any = ArrayToSparseVector.fromDoubleArray(input) - override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - nullSafeCodeGen(ctx, ev, c => { - s""" - |${ev.value} = org.apache.spark.sql.ArrayToSparseVector.fromDoubleArray($c); - """.stripMargin - }) + def newMatrixUDT(): MatrixUDT = { + new MatrixUDT() } -} - -object ArrayToSparseVector { - lazy val vectorType: VectorUDT = new VectorUDT() - def fromDoubleArray(input: Any): InternalRow = { - val vector = Vectors.dense(input.asInstanceOf[ArrayData].toDoubleArray()) - vectorType.serialize(vector.toSparse) + def newVectorUDT(): VectorUDT = { + new VectorUDT() } -} - -case class ArrayToDenseVector(child: Expression) - extends UnaryExpression - with ImplicitCastInputTypes { - override def inputTypes: Seq[AbstractDataType] = Seq(ArrayType(DoubleType)) - override def dataType: DataType = ArrayToDenseVector.vectorType - override def nullSafeEval(input: Any): Any = ArrayToDenseVector.fromDoubleArray(input) - - override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - nullSafeCodeGen(ctx, ev, c => { - s""" - |${ev.value} = org.apache.spark.sql.ArrayToDenseVector.fromDoubleArray($c); - """.stripMargin - }) + def newAnalysisException(msg: String): AnalysisException = { + new AnalysisException(msg) } -} -object ArrayToDenseVector { - lazy val vectorType: VectorUDT = new VectorUDT() - - def fromDoubleArray(input: Any): InternalRow = { - val vector = Vectors.dense(input.asInstanceOf[ArrayData].toDoubleArray()) - vectorType.serialize(vector) - } -} - -case class VectorToArray(child: Expression) extends UnaryExpression with ImplicitCastInputTypes { - override def inputTypes: Seq[AbstractDataType] = Seq(VectorToArray.vectorType) - override def dataType: DataType = ArrayType(DoubleType) - override def nullSafeEval(input: Any): Any = VectorToArray.toDoubleArray(input) - override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - nullSafeCodeGen(ctx, ev, c => { - s""" - |${ev.value} = org.apache.spark.sql.VectorToArray.toDoubleArray($c); - """.stripMargin - }) - } -} - -object VectorToArray { - lazy val vectorType: VectorUDT = new VectorUDT() - def toDoubleArray(input: Any): ArrayData = { - new GenericArrayData(vectorType.deserialize(input).toArray) - } + type ADT = AbstractDataType } diff --git a/core/src/main/scala/org/apache/spark/sql/catalyst/expressions/QuaternaryExpression.scala b/core/src/main/scala/org/apache/spark/sql/catalyst/expressions/QuaternaryExpression.scala index e2b13bb7e..b31811704 100644 --- a/core/src/main/scala/org/apache/spark/sql/catalyst/expressions/QuaternaryExpression.scala +++ b/core/src/main/scala/org/apache/spark/sql/catalyst/expressions/QuaternaryExpression.scala @@ -1,8 +1,8 @@ package org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ +import org.apache.spark.sql.catalyst.expressions.codegen._ // Inlines QuaternaryExpression from Spark 3.0 diff --git a/core/src/test/resources/META-INF/services/com.databricks.hls.DataFrameTransformer b/core/src/test/resources/META-INF/services/com.databricks.hls.DataFrameTransformer deleted file mode 100644 index 687964a7e..000000000 --- 
a/core/src/test/resources/META-INF/services/com.databricks.hls.DataFrameTransformer +++ /dev/null @@ -1 +0,0 @@ -com.databricks.hls.DummyTransformer diff --git a/core/src/test/resources/META-INF/services/com.databricks.hls.transformers.pipe.InputFormatterFactory b/core/src/test/resources/META-INF/services/com.databricks.hls.transformers.pipe.InputFormatterFactory deleted file mode 100644 index 6900b69af..000000000 --- a/core/src/test/resources/META-INF/services/com.databricks.hls.transformers.pipe.InputFormatterFactory +++ /dev/null @@ -1 +0,0 @@ -com.databricks.hls.transformers.pipe.DummyInputFormatterFactory diff --git a/core/src/test/resources/META-INF/services/com.databricks.hls.transformers.pipe.OutputFormatterFactory b/core/src/test/resources/META-INF/services/com.databricks.hls.transformers.pipe.OutputFormatterFactory deleted file mode 100644 index a2dd1b7c4..000000000 --- a/core/src/test/resources/META-INF/services/com.databricks.hls.transformers.pipe.OutputFormatterFactory +++ /dev/null @@ -1 +0,0 @@ -com.databricks.hls.transformers.pipe.DummyOutputFormatterFactory diff --git a/core/src/test/resources/META-INF/services/com.databricks.sql.BigFileUploader b/core/src/test/resources/META-INF/services/com.databricks.sql.BigFileUploader deleted file mode 100644 index 79b5e7033..000000000 --- a/core/src/test/resources/META-INF/services/com.databricks.sql.BigFileUploader +++ /dev/null @@ -1 +0,0 @@ -com.databricks.sql.DummyFileUploader \ No newline at end of file diff --git a/core/src/test/resources/META-INF/services/io.projectglow.DataFrameTransformer b/core/src/test/resources/META-INF/services/io.projectglow.DataFrameTransformer new file mode 100644 index 000000000..30e8a6e89 --- /dev/null +++ b/core/src/test/resources/META-INF/services/io.projectglow.DataFrameTransformer @@ -0,0 +1 @@ +io.projectglow.DummyTransformer diff --git a/core/src/test/resources/META-INF/services/io.projectglow.sql.BigFileUploader b/core/src/test/resources/META-INF/services/io.projectglow.sql.BigFileUploader new file mode 100644 index 000000000..9a7037cff --- /dev/null +++ b/core/src/test/resources/META-INF/services/io.projectglow.sql.BigFileUploader @@ -0,0 +1 @@ +io.projectglow.sql.DummyFileUploader \ No newline at end of file diff --git a/core/src/test/resources/META-INF/services/io.projectglow.transformers.pipe.InputFormatterFactory b/core/src/test/resources/META-INF/services/io.projectglow.transformers.pipe.InputFormatterFactory new file mode 100644 index 000000000..3230322cc --- /dev/null +++ b/core/src/test/resources/META-INF/services/io.projectglow.transformers.pipe.InputFormatterFactory @@ -0,0 +1 @@ +io.projectglow.transformers.pipe.DummyInputFormatterFactory diff --git a/core/src/test/resources/META-INF/services/io.projectglow.transformers.pipe.OutputFormatterFactory b/core/src/test/resources/META-INF/services/io.projectglow.transformers.pipe.OutputFormatterFactory new file mode 100644 index 000000000..689b657f1 --- /dev/null +++ b/core/src/test/resources/META-INF/services/io.projectglow.transformers.pipe.OutputFormatterFactory @@ -0,0 +1 @@ +io.projectglow.transformers.pipe.DummyOutputFormatterFactory diff --git a/core/src/test/resources/log4j.properties b/core/src/test/resources/log4j.properties index 7b5a7da6d..ca48a2782 100644 --- a/core/src/test/resources/log4j.properties +++ b/core/src/test/resources/log4j.properties @@ -1,11 +1,12 @@ # Set everything to be logged to the console -log4j.rootCategory=INFO, console -log4j.appender.console=org.apache.log4j.ConsoleAppender 
-log4j.appender.console.layout=org.apache.log4j.PatternLayout -log4j.appender.console.layout.ConversionPattern=%d{yyyy/MM/dd HH:mm:ss} %p %c{1}%X{traceInfo}: %.16384m%n +log4j.rootCategory=INFO, file +log4j.appender.file=org.apache.log4j.FileAppender +log4j.appender.file.layout=org.apache.log4j.PatternLayout +log4j.appender.file.layout.ConversionPattern=%d{yyyy/MM/dd HH:mm:ss} %p %c{1}%X{traceInfo}: %.16384m%n +log4j.appender.file.File=unit-tests.log # Settings to quiet third party logs that are too verbose log4j.logger.akka=ERROR log4j.logger.Remoting=ERROR -log4j.logger.org.apache.spark=WARN +log4j.logger.org.apache.spark=INFO log4j.logger.org.eclipse.jetty=ERROR diff --git a/core/src/test/scala/com/databricks/hls/DBGenomicsSuite.scala b/core/src/test/scala/io/projectglow/GlowSuite.scala similarity index 70% rename from core/src/test/scala/com/databricks/hls/DBGenomicsSuite.scala rename to core/src/test/scala/io/projectglow/GlowSuite.scala index c08211f62..f9da0cbae 100644 --- a/core/src/test/scala/com/databricks/hls/DBGenomicsSuite.scala +++ b/core/src/test/scala/io/projectglow/GlowSuite.scala @@ -1,15 +1,15 @@ -package com.databricks.hls +package io.projectglow import org.apache.spark.sql.DataFrame -import com.databricks.hls.sql.HLSBaseTest +import io.projectglow.sql.GlowBaseTest -class DBGenomicsSuite extends HLSBaseTest { +class GlowSuite extends GlowBaseTest { test("uses service provider") { val sess = spark import sess.implicits._ val output = - DBGenomics.transform("dummy_transformer", spark.emptyDataFrame, Map.empty[String, String]) + Glow.transform("dummy_transformer", spark.emptyDataFrame, Map.empty[String, String]) assert(output.count() == 1) assert(output.as[String].head() == "monkey") } @@ -18,7 +18,7 @@ class DBGenomicsSuite extends HLSBaseTest { val sess = spark import sess.implicits._ val output = - DBGenomics.transform("dummyTransformer", spark.emptyDataFrame, Map.empty[String, String]) + Glow.transform("dummyTransformer", spark.emptyDataFrame, Map.empty[String, String]) assert(output.count() == 1) assert(output.as[String].head() == "monkey") } diff --git a/core/src/test/scala/com/databricks/bgen/BgenConverterBaseTest.scala b/core/src/test/scala/io/projectglow/bgen/BgenConverterBaseTest.scala similarity index 90% rename from core/src/test/scala/com/databricks/bgen/BgenConverterBaseTest.scala rename to core/src/test/scala/io/projectglow/bgen/BgenConverterBaseTest.scala index 68533bd92..438e29680 100644 --- a/core/src/test/scala/com/databricks/bgen/BgenConverterBaseTest.scala +++ b/core/src/test/scala/io/projectglow/bgen/BgenConverterBaseTest.scala @@ -1,10 +1,11 @@ -package com.databricks.bgen +package io.projectglow.bgen -import com.databricks.hls.common.TestUtils._ -import com.databricks.hls.sql.HLSBaseTest -import com.databricks.vcf.BgenRow +import io.projectglow.common.BgenRow +import io.projectglow.common.BgenRow +import io.projectglow.sql.GlowBaseTest +import io.projectglow.sql.GlowBaseTest -trait BgenConverterBaseTest extends HLSBaseTest { +trait BgenConverterBaseTest extends GlowBaseTest { val testRoot = s"$testDataHome/bgen" diff --git a/core/src/test/scala/com/databricks/bgen/BgenReaderSuite.scala b/core/src/test/scala/io/projectglow/bgen/BgenReaderSuite.scala similarity index 97% rename from core/src/test/scala/com/databricks/bgen/BgenReaderSuite.scala rename to core/src/test/scala/io/projectglow/bgen/BgenReaderSuite.scala index d5e1f74f8..0699a7178 100644 --- a/core/src/test/scala/com/databricks/bgen/BgenReaderSuite.scala +++ 
b/core/src/test/scala/io/projectglow/bgen/BgenReaderSuite.scala @@ -1,6 +1,6 @@ -package com.databricks.bgen +package io.projectglow.bgen -import java.io.{BufferedReader, FileInputStream, InputStreamReader} +import java.io.{BufferedReader, InputStreamReader} import scala.collection.JavaConverters._ @@ -10,10 +10,12 @@ import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.types.{ArrayType, StructType} -import com.databricks.hls.sql.HLSBaseTest -import com.databricks.vcf.{BgenRow, VCFRow, VariantSchemas} +import io.projectglow.common.{BgenRow, VCFRow, VariantSchemas} +import io.projectglow.common.{BgenRow, VCFRow, VariantSchemas} +import io.projectglow.sql.GlowBaseTest +import io.projectglow.sql.GlowBaseTest -class BgenReaderSuite extends HLSBaseTest { +class BgenReaderSuite extends GlowBaseTest { val sourceName = "bgen" private val testRoot = s"$testDataHome/bgen" diff --git a/core/src/test/scala/com/databricks/bgen/BgenRowConverterSuite.scala b/core/src/test/scala/io/projectglow/bgen/BgenRowConverterSuite.scala similarity index 97% rename from core/src/test/scala/com/databricks/bgen/BgenRowConverterSuite.scala rename to core/src/test/scala/io/projectglow/bgen/BgenRowConverterSuite.scala index dd5ea8537..ff32fc3b2 100644 --- a/core/src/test/scala/com/databricks/bgen/BgenRowConverterSuite.scala +++ b/core/src/test/scala/io/projectglow/bgen/BgenRowConverterSuite.scala @@ -1,8 +1,9 @@ -package com.databricks.bgen +package io.projectglow.bgen import org.apache.spark.sql.catalyst.encoders.RowEncoder -import com.databricks.vcf.{BgenGenotype, BgenRow} +import io.projectglow.common.{BgenGenotype, BgenRow} +import io.projectglow.common.{BgenGenotype, BgenRow} class BgenRowConverterSuite extends BgenConverterBaseTest { diff --git a/core/src/test/scala/com/databricks/bgen/BgenWriterSuite.scala b/core/src/test/scala/io/projectglow/bgen/BgenWriterSuite.scala similarity index 97% rename from core/src/test/scala/com/databricks/bgen/BgenWriterSuite.scala rename to core/src/test/scala/io/projectglow/bgen/BgenWriterSuite.scala index 1b1962f68..ca1fdefb9 100644 --- a/core/src/test/scala/com/databricks/bgen/BgenWriterSuite.scala +++ b/core/src/test/scala/io/projectglow/bgen/BgenWriterSuite.scala @@ -1,14 +1,13 @@ -package com.databricks.bgen +package io.projectglow.bgen import java.io.{File, FileInputStream} import java.nio.file.Files import com.google.common.io.LittleEndianDataInputStream -import org.apache.commons.io.FileUtils import org.apache.spark.SparkException -import com.databricks.hls.common.TestUtils._ -import com.databricks.vcf.{BgenRow, VariantSchemas} +import io.projectglow.common.BgenRow +import io.projectglow.common.BgenRow class BgenWriterSuite extends BgenConverterBaseTest { diff --git a/core/src/test/scala/com/databricks/hls/common/TestUtils.scala b/core/src/test/scala/io/projectglow/common/TestUtils.scala similarity index 98% rename from core/src/test/scala/com/databricks/hls/common/TestUtils.scala rename to core/src/test/scala/io/projectglow/common/TestUtils.scala index a97bfddfe..ad1aa18f8 100644 --- a/core/src/test/scala/com/databricks/hls/common/TestUtils.scala +++ b/core/src/test/scala/io/projectglow/common/TestUtils.scala @@ -1,8 +1,8 @@ -package com.databricks.hls.common +package io.projectglow.common import org.scalatest.exceptions.TestFailedException -object TestUtils { +trait TestUtils { val ABS_TOL_MSG = " using absolute tolerance" val REL_TOL_MSG = " using relative tolerance" diff --git 
a/core/src/test/scala/com/databricks/sql/ComDatabricksDataSourceSuite.scala b/core/src/test/scala/io/projectglow/sql/ComDatabricksDataSourceSuite.scala similarity index 93% rename from core/src/test/scala/com/databricks/sql/ComDatabricksDataSourceSuite.scala rename to core/src/test/scala/io/projectglow/sql/ComDatabricksDataSourceSuite.scala index 97b27c4ab..e09d6b3db 100644 --- a/core/src/test/scala/com/databricks/sql/ComDatabricksDataSourceSuite.scala +++ b/core/src/test/scala/io/projectglow/sql/ComDatabricksDataSourceSuite.scala @@ -1,11 +1,9 @@ -package com.databricks.sql +package io.projectglow.sql import java.nio.file.{Files, Path} -import com.databricks.hls.sql.HLSBaseTest - // Sanity check that legacy DataSource names starting with "com.databricks." still work -class ComDatabricksDataSourceSuite extends HLSBaseTest { +class ComDatabricksDataSourceSuite extends GlowBaseTest { lazy val vcf = s"$testDataHome/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.vcf" lazy val bgen = s"$testDataHome/bgen/example.16bits.bgen" diff --git a/core/src/test/scala/com/databricks/hls/sql/HLSBaseTest.scala b/core/src/test/scala/io/projectglow/sql/GlowBaseTest.scala similarity index 78% rename from core/src/test/scala/com/databricks/hls/sql/HLSBaseTest.scala rename to core/src/test/scala/io/projectglow/sql/GlowBaseTest.scala index 3983a3a70..dfb108df5 100644 --- a/core/src/test/scala/com/databricks/hls/sql/HLSBaseTest.scala +++ b/core/src/test/scala/io/projectglow/sql/GlowBaseTest.scala @@ -1,19 +1,21 @@ -package com.databricks.hls.sql +package io.projectglow.sql -import org.apache.spark.{DebugFilesystem, SparkConf} +import htsjdk.samtools.util.Log import org.apache.spark.sql.SparkSession import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.{DebugFilesystem, SparkConf} import org.scalatest.concurrent.{AbstractPatienceConfiguration, Eventually} import org.scalatest.time.{Milliseconds, Seconds, Span} -import org.scalatest.{FunSuite, Tag} +import org.scalatest.{Args, FunSuite, Status, Tag} -import com.databricks.hls.common.HLSLogging +import io.projectglow.common.{GlowLogging, TestUtils} -abstract class HLSBaseTest +abstract class GlowBaseTest extends FunSuite with SharedSparkSession - with HLSLogging - with HLSTestData + with GlowLogging + with GlowTestData + with TestUtils with JenkinsTestPatience { override protected def sparkConf: SparkConf = { @@ -35,6 +37,7 @@ abstract class HLSBaseTest val session = super.createSparkSession SqlExtensionProvider.register(session) SparkSession.setActiveSession(session) + Log.setGlobalLogLevel(Log.LogLevel.ERROR) session } @@ -49,6 +52,17 @@ abstract class HLSBaseTest DebugFilesystem.clearOpenStreams() super.afterEach() } + + override def runTest(testName: String, args: Args): Status = { + logger.info(s"Running test '$testName'") + val res = super.runTest(testName, args) + if (res.succeeds()) { + logger.info(s"Done running test '$testName'") + } else { + logger.info(s"Done running test '$testName' with a failure") + } + res + } } /** diff --git a/core/src/test/scala/com/databricks/hls/sql/HLSTestData.scala b/core/src/test/scala/io/projectglow/sql/GlowTestData.scala similarity index 75% rename from core/src/test/scala/com/databricks/hls/sql/HLSTestData.scala rename to core/src/test/scala/io/projectglow/sql/GlowTestData.scala index 467f89d68..49fcbf7a8 100644 --- a/core/src/test/scala/com/databricks/hls/sql/HLSTestData.scala +++ b/core/src/test/scala/io/projectglow/sql/GlowTestData.scala @@ -1,8 +1,8 @@ -package com.databricks.hls.sql +package 
io.projectglow.sql import java.nio.file.Paths -trait HLSTestData { +trait GlowTestData { final lazy val testDataHome = Paths .get( sys.props.getOrElse("test.dir", ""), diff --git a/core/src/test/scala/com/databricks/sql/SingleFileWriterSuite.scala b/core/src/test/scala/io/projectglow/sql/SingleFileWriterSuite.scala similarity index 87% rename from core/src/test/scala/com/databricks/sql/SingleFileWriterSuite.scala rename to core/src/test/scala/io/projectglow/sql/SingleFileWriterSuite.scala index ec4da31dd..fca9874f0 100644 --- a/core/src/test/scala/com/databricks/sql/SingleFileWriterSuite.scala +++ b/core/src/test/scala/io/projectglow/sql/SingleFileWriterSuite.scala @@ -1,13 +1,11 @@ -package com.databricks.sql +package io.projectglow.sql import java.nio.file.Files import org.apache.hadoop.conf.Configuration import org.apache.spark.rdd.RDD -import com.databricks.hls.sql.HLSBaseTest - -class SingleFileWriterSuite extends HLSBaseTest { +class SingleFileWriterSuite extends GlowBaseTest { test("uses service loader") { val outDir = Files.createTempDirectory("writer") assert(DummyFileUploader.counter == 0) diff --git a/core/src/test/scala/com/databricks/hls/tertiary/LiftOverCoordinatesExprSuite.scala b/core/src/test/scala/io/projectglow/tertiary/LiftOverCoordinatesExprSuite.scala similarity index 97% rename from core/src/test/scala/com/databricks/hls/tertiary/LiftOverCoordinatesExprSuite.scala rename to core/src/test/scala/io/projectglow/tertiary/LiftOverCoordinatesExprSuite.scala index e61f14b05..cd2aaa084 100644 --- a/core/src/test/scala/com/databricks/hls/tertiary/LiftOverCoordinatesExprSuite.scala +++ b/core/src/test/scala/io/projectglow/tertiary/LiftOverCoordinatesExprSuite.scala @@ -1,13 +1,13 @@ -package com.databricks.hls.tertiary +package io.projectglow.tertiary +import org.apache.spark.SparkException import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.expr import org.apache.spark.sql.types.{LongType, StringType, StructType} -import org.apache.spark.SparkException -import com.databricks.hls.sql.HLSBaseTest +import io.projectglow.sql.GlowBaseTest -class LiftOverCoordinatesExprSuite extends HLSBaseTest { +class LiftOverCoordinatesExprSuite extends GlowBaseTest { val requiredBaseSchema: StructType = new StructType() .add("contigName", StringType) .add("start", LongType) diff --git a/core/src/test/scala/com/databricks/hls/tertiary/LinearRegressionSuite.scala b/core/src/test/scala/io/projectglow/tertiary/LinearRegressionSuite.scala similarity index 96% rename from core/src/test/scala/com/databricks/hls/tertiary/LinearRegressionSuite.scala rename to core/src/test/scala/io/projectglow/tertiary/LinearRegressionSuite.scala index 7109de2fa..4824f326b 100644 --- a/core/src/test/scala/com/databricks/hls/tertiary/LinearRegressionSuite.scala +++ b/core/src/test/scala/io/projectglow/tertiary/LinearRegressionSuite.scala @@ -1,4 +1,4 @@ -package com.databricks.hls.tertiary +package io.projectglow.tertiary import scala.concurrent.duration._ import scala.util.Random @@ -9,11 +9,12 @@ import org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression import org.apache.spark.ml.linalg.DenseMatrix import org.apache.spark.sql.functions._ -import com.databricks.hls.common.TestUtils._ -import com.databricks.hls.sql.HLSBaseTest -import com.databricks.hls.tertiary.RegressionTestUtils._ +import io.projectglow.sql.expressions.{ComputeQR, LinearRegressionGwas, RegressionStats} +import RegressionTestUtils._ +import io.projectglow.sql.GlowBaseTest +import 
io.projectglow.sql.expressions.{ComputeQR, LinearRegressionGwas, RegressionStats} -class LinearRegressionSuite extends HLSBaseTest { +class LinearRegressionSuite extends GlowBaseTest { private lazy val sess = spark private lazy val random = { diff --git a/core/src/test/scala/com/databricks/hls/tertiary/LogisticRegressionSuite.scala b/core/src/test/scala/io/projectglow/tertiary/LogisticRegressionSuite.scala similarity index 96% rename from core/src/test/scala/com/databricks/hls/tertiary/LogisticRegressionSuite.scala rename to core/src/test/scala/io/projectglow/tertiary/LogisticRegressionSuite.scala index 650ed3b7c..abf40db8d 100644 --- a/core/src/test/scala/com/databricks/hls/tertiary/LogisticRegressionSuite.scala +++ b/core/src/test/scala/io/projectglow/tertiary/LogisticRegressionSuite.scala @@ -1,15 +1,17 @@ -package com.databricks.hls.tertiary +package io.projectglow.tertiary import breeze.linalg.DenseVector -import org.apache.spark.sql.functions.{expr, monotonically_increasing_id} import org.apache.spark.sql.Encoders import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder +import org.apache.spark.sql.functions.{expr, monotonically_increasing_id} -import com.databricks.hls.common.TestUtils._ -import com.databricks.hls.sql.HLSBaseTest -import com.databricks.hls.tertiary.RegressionTestUtils._ +import io.projectglow.sql.GlowBaseTest +import io.projectglow.sql.expressions.{LikelihoodRatioTestStats, LogisticRegressionGwas, NewtonResult} +import io.projectglow.tertiary.RegressionTestUtils._ +import io.projectglow.sql.GlowBaseTest +import io.projectglow.sql.expressions.{LikelihoodRatioTestStats, LogisticRegressionGwas, NewtonResult} -class LogisticRegressionSuite extends HLSBaseTest { +class LogisticRegressionSuite extends GlowBaseTest { private lazy val sess = spark diff --git a/core/src/test/scala/com/databricks/hls/tertiary/MomentAggStateSuite.scala b/core/src/test/scala/io/projectglow/tertiary/MomentAggStateSuite.scala similarity index 80% rename from core/src/test/scala/com/databricks/hls/tertiary/MomentAggStateSuite.scala rename to core/src/test/scala/io/projectglow/tertiary/MomentAggStateSuite.scala index a304eb1b0..6527da061 100644 --- a/core/src/test/scala/com/databricks/hls/tertiary/MomentAggStateSuite.scala +++ b/core/src/test/scala/io/projectglow/tertiary/MomentAggStateSuite.scala @@ -1,8 +1,11 @@ -package com.databricks.hls.tertiary +package io.projectglow.tertiary -import com.databricks.hls.sql.HLSBaseTest +import io.projectglow.sql.GlowBaseTest +import io.projectglow.sql.expressions.MomentAggState +import io.projectglow.sql.GlowBaseTest +import io.projectglow.sql.expressions.MomentAggState -class MomentAggStateSuite extends HLSBaseTest { +class MomentAggStateSuite extends GlowBaseTest { test("merge") { val s1 = MomentAggState(5, 0, 10, 2, 1) val s2 = MomentAggState(3, 1, 11, 1, 2) diff --git a/core/src/test/scala/com/databricks/hls/tertiary/RegressionTestUtils.scala b/core/src/test/scala/io/projectglow/tertiary/RegressionTestUtils.scala similarity index 95% rename from core/src/test/scala/com/databricks/hls/tertiary/RegressionTestUtils.scala rename to core/src/test/scala/io/projectglow/tertiary/RegressionTestUtils.scala index 61d3aa17e..59e39011a 100644 --- a/core/src/test/scala/com/databricks/hls/tertiary/RegressionTestUtils.scala +++ b/core/src/test/scala/io/projectglow/tertiary/RegressionTestUtils.scala @@ -1,4 +1,4 @@ -package com.databricks.hls.tertiary +package io.projectglow.tertiary import breeze.linalg.{DenseMatrix => BreezeDenseMatrix} import 
org.apache.spark.ml.linalg.{DenseMatrix => SparkDenseMatrix} diff --git a/core/src/test/scala/com/databricks/hls/tertiary/SampleQcExprsSuite.scala b/core/src/test/scala/io/projectglow/tertiary/SampleQcExprsSuite.scala similarity index 95% rename from core/src/test/scala/com/databricks/hls/tertiary/SampleQcExprsSuite.scala rename to core/src/test/scala/io/projectglow/tertiary/SampleQcExprsSuite.scala index e90a6edda..b1af62d83 100644 --- a/core/src/test/scala/com/databricks/hls/tertiary/SampleQcExprsSuite.scala +++ b/core/src/test/scala/io/projectglow/tertiary/SampleQcExprsSuite.scala @@ -1,12 +1,13 @@ -package com.databricks.hls.tertiary +package io.projectglow.tertiary import org.apache.spark.sql.{DataFrame, Row} -import com.databricks.hls.common.TestUtils._ -import com.databricks.hls.sql.HLSBaseTest -import com.databricks.vcf.VCFRow +import io.projectglow.common.VCFRow +import io.projectglow.common.VCFRow +import io.projectglow.sql.GlowBaseTest +import io.projectglow.sql.GlowBaseTest -class SampleQcExprsSuite extends HLSBaseTest { +class SampleQcExprsSuite extends GlowBaseTest { lazy val testVcf = s"$testDataHome/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf" lazy val na12878 = s"$testDataHome/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.vcf" lazy private val sess = { diff --git a/core/src/test/scala/com/databricks/hls/tertiary/VariantQcExprsSuite.scala b/core/src/test/scala/io/projectglow/tertiary/VariantQcExprsSuite.scala similarity index 96% rename from core/src/test/scala/com/databricks/hls/tertiary/VariantQcExprsSuite.scala rename to core/src/test/scala/io/projectglow/tertiary/VariantQcExprsSuite.scala index 302147957..5f4045d2a 100644 --- a/core/src/test/scala/com/databricks/hls/tertiary/VariantQcExprsSuite.scala +++ b/core/src/test/scala/io/projectglow/tertiary/VariantQcExprsSuite.scala @@ -1,15 +1,16 @@ -package com.databricks.hls.tertiary +package io.projectglow.tertiary import scala.util.Random import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.functions._ -import com.databricks.hls.common.TestUtils._ -import com.databricks.hls.sql.HLSBaseTest -import com.databricks.vcf.{GenotypeFields, VCFRow} +import io.projectglow.common.{GenotypeFields, VCFRow} +import io.projectglow.common.{GenotypeFields, VCFRow} +import io.projectglow.sql.GlowBaseTest +import io.projectglow.sql.GlowBaseTest -class VariantQcExprsSuite extends HLSBaseTest { +class VariantQcExprsSuite extends GlowBaseTest { lazy val testVcf = s"$testDataHome/1kg_sample.vcf" lazy private val sess = spark diff --git a/core/src/test/scala/com/databricks/hls/tertiary/VariantUtilExprsSuite.scala b/core/src/test/scala/io/projectglow/tertiary/VariantUtilExprsSuite.scala similarity index 96% rename from core/src/test/scala/com/databricks/hls/tertiary/VariantUtilExprsSuite.scala rename to core/src/test/scala/io/projectglow/tertiary/VariantUtilExprsSuite.scala index 435169d86..ea6efee6b 100644 --- a/core/src/test/scala/com/databricks/hls/tertiary/VariantUtilExprsSuite.scala +++ b/core/src/test/scala/io/projectglow/tertiary/VariantUtilExprsSuite.scala @@ -1,14 +1,17 @@ -package com.databricks.hls.tertiary +package io.projectglow.tertiary -import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector, SparseMatrix, SparseVector, Vector} +import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector, SparseVector, Vector} import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{IntegerType, StringType, StructType} import org.apache.spark.unsafe.types.UTF8String 
-import com.databricks.hls.sql.HLSBaseTest +import io.projectglow.sql.GlowBaseTest +import io.projectglow.sql.expressions.{VariantType, VariantUtilExprs} +import io.projectglow.sql.GlowBaseTest +import io.projectglow.sql.expressions.{VariantType, VariantUtilExprs} -class VariantUtilExprsSuite extends HLSBaseTest { +class VariantUtilExprsSuite extends GlowBaseTest { case class SimpleGenotypeFields(calls: Seq[Int]) case class SimpleVariant(genotypes: Seq[SimpleGenotypeFields]) diff --git a/core/src/test/scala/com/databricks/hls/transformers/LiftOverVariantsTransformerSuite.scala b/core/src/test/scala/io/projectglow/transformers/LiftOverVariantsTransformerSuite.scala similarity index 93% rename from core/src/test/scala/com/databricks/hls/transformers/LiftOverVariantsTransformerSuite.scala rename to core/src/test/scala/io/projectglow/transformers/LiftOverVariantsTransformerSuite.scala index 4d7658a29..dfc1280f8 100644 --- a/core/src/test/scala/com/databricks/hls/transformers/LiftOverVariantsTransformerSuite.scala +++ b/core/src/test/scala/io/projectglow/transformers/LiftOverVariantsTransformerSuite.scala @@ -1,4 +1,4 @@ -package com.databricks.hls.transformers +package io.projectglow.transformers import org.apache.spark.SparkException import org.apache.spark.sql.DataFrame @@ -6,11 +6,13 @@ import org.apache.spark.sql.functions.monotonically_increasing_id import org.apache.spark.sql.types.{LongType, StringType, StructType} import picard.vcf.LiftoverVcf -import com.databricks.hls.DBGenomics -import com.databricks.hls.sql.HLSBaseTest -import com.databricks.vcf.VCFConverterBaseTest +import io.projectglow.Glow +import io.projectglow.Glow +import io.projectglow.vcf.VCFConverterBaseTest +import io.projectglow.sql.GlowBaseTest +import io.projectglow.vcf.VCFConverterBaseTest -class LiftOverVariantsTransformerSuite extends HLSBaseTest with VCFConverterBaseTest { +class LiftOverVariantsTransformerSuite extends GlowBaseTest with VCFConverterBaseTest { val picardTestDataHome = s"$testDataHome/liftover/picard" val CHAIN_FILE = s"$testDataHome/liftover/hg38ToHg19.over.chain.gz" val REFERENCE_FILE = s"$testDataHome/liftover/hg19.chr20.fa.gz" @@ -34,7 +36,7 @@ class LiftOverVariantsTransformerSuite extends HLSBaseTest with VCFConverterBase } else { Map.empty } - val outputDf = DBGenomics + val outputDf = Glow .transform( "lift_over_variants", inputDf, @@ -136,7 +138,7 @@ class LiftOverVariantsTransformerSuite extends HLSBaseTest with VCFConverterBase val inputDf = readVcf(s"$testDataHome/liftover/unlifted.test.vcf") .withColumn("id", monotonically_increasing_id) - val outputDf = DBGenomics + val outputDf = Glow .transform( "lift_over_variants", inputDf, @@ -158,7 +160,7 @@ class LiftOverVariantsTransformerSuite extends HLSBaseTest with VCFConverterBase .take(idx) ++ requiredBaseSchema.fields.takeRight(requiredBaseSchema.size - idx - 1))) .load(s"$testDataHome/combined.chr20_18210071_18210093.g.vcf") assertThrows[IllegalArgumentException] { - DBGenomics + Glow .transform( "lift_over_variants", inputDf, @@ -211,7 +213,7 @@ class LiftOverVariantsTransformerSuite extends HLSBaseTest with VCFConverterBase val inputDf = readVcf(s"$testDataHome/combined.chr20_18210071_18210093.g.vcf") val ex = intercept[SparkException] { val outputDf = - DBGenomics.transform("lift_over_variants", inputDf, Map("referenceFile" -> REFERENCE_FILE)) + Glow.transform("lift_over_variants", inputDf, Map("referenceFile" -> REFERENCE_FILE)) outputDf.count } assert(ex.getMessage.contains("Must provide chain file")) @@ -221,7 +223,7 @@ class 
LiftOverVariantsTransformerSuite extends HLSBaseTest with VCFConverterBase val inputDf = readVcf(s"$testDataHome/combined.chr20_18210071_18210093.g.vcf") val ex = intercept[SparkException] { val outputDf = - DBGenomics.transform("lift_over_variants", inputDf, Map("chainFile" -> CHAIN_FILE)) + Glow.transform("lift_over_variants", inputDf, Map("chainFile" -> CHAIN_FILE)) outputDf.count } assert(ex.getMessage.contains("Must provide reference file")) @@ -233,7 +235,7 @@ class LiftOverVariantsTransformerSuite extends HLSBaseTest with VCFConverterBase // chr20 refseq for chr1 interval val inputDf = readVcf(s"$picardTestDataHome/testLiftoverBiallelicIndels.vcf") - val outputDf = DBGenomics.transform( + val outputDf = Glow.transform( "lift_over_variants", inputDf, Map("chainFile" -> s"$picardTestDataHome/test.over.chain", "referenceFile" -> REFERENCE_FILE)) diff --git a/core/src/test/scala/com/databricks/hls/transformers/normalizevariants/NormalizeVariantsTransformerSuite.scala b/core/src/test/scala/io/projectglow/transformers/normalizevariants/NormalizeVariantsTransformerSuite.scala similarity index 94% rename from core/src/test/scala/com/databricks/hls/transformers/normalizevariants/NormalizeVariantsTransformerSuite.scala rename to core/src/test/scala/io/projectglow/transformers/normalizevariants/NormalizeVariantsTransformerSuite.scala index d7911d052..c9fa9eb45 100644 --- a/core/src/test/scala/com/databricks/hls/transformers/normalizevariants/NormalizeVariantsTransformerSuite.scala +++ b/core/src/test/scala/io/projectglow/transformers/normalizevariants/NormalizeVariantsTransformerSuite.scala @@ -1,12 +1,15 @@ -package com.databricks.hls.transformers.normalizevariants +package io.projectglow.transformers.normalizevariants import org.apache.spark.SparkConf -import com.databricks.hls.DBGenomics -import com.databricks.hls.common.HLSLogging -import com.databricks.hls.sql.HLSBaseTest +import io.projectglow.Glow +import io.projectglow.common.GlowLogging +import io.projectglow.Glow +import io.projectglow.common.GlowLogging +import io.projectglow.sql.GlowBaseTest +import io.projectglow.sql.GlowBaseTest -class NormalizeVariantsTransformerSuite extends HLSBaseTest with HLSLogging { +class NormalizeVariantsTransformerSuite extends GlowBaseTest with GlowLogging { lazy val sourceName: String = "vcf" lazy val testFolder: String = s"$testDataHome/variantnormalizer-test" @@ -108,7 +111,7 @@ class NormalizeVariantsTransformerSuite extends HLSBaseTest with HLSLogging { .format(sourceName) .load(originalVCFFileName) - val dfNormalized = DBGenomics + val dfNormalized = Glow .transform( "normalize_variants", dfOriginal, diff --git a/core/src/test/scala/com/databricks/hls/transformers/normalizevariants/VariantNormalizerSuite.scala b/core/src/test/scala/io/projectglow/transformers/normalizevariants/VariantNormalizerSuite.scala similarity index 83% rename from core/src/test/scala/com/databricks/hls/transformers/normalizevariants/VariantNormalizerSuite.scala rename to core/src/test/scala/io/projectglow/transformers/normalizevariants/VariantNormalizerSuite.scala index 44437ff3e..47b84fda4 100644 --- a/core/src/test/scala/com/databricks/hls/transformers/normalizevariants/VariantNormalizerSuite.scala +++ b/core/src/test/scala/io/projectglow/transformers/normalizevariants/VariantNormalizerSuite.scala @@ -1,16 +1,19 @@ -package com.databricks.hls.transformers.normalizevariants +package io.projectglow.transformers.normalizevariants import java.nio.file.Paths -import com.databricks.hls.common.HLSLogging -import 
com.databricks.hls.sql.HLSBaseTest -import com.databricks.hls.transformers.normalizevariants.VariantNormalizer._ import htsjdk.variant.variantcontext.Allele import org.broadinstitute.hellbender.engine.ReferenceDataSource -class VariantNormalizerSuite extends HLSBaseTest with HLSLogging { +import io.projectglow.common.GlowLogging +import io.projectglow.common.GlowLogging +import io.projectglow.sql.GlowBaseTest +import io.projectglow.transformers.normalizevariants.VariantNormalizer._ +import io.projectglow.sql.GlowBaseTest - lazy val sourceName: String = "com.databricks.vcf" +class VariantNormalizerSuite extends GlowBaseTest with GlowLogging { + + lazy val sourceName: String = "vcf" lazy val testFolder: String = s"$testDataHome/variantnormalizer-test" lazy val vtTestReference = s"$testFolder/20_altered.fasta" diff --git a/core/src/test/scala/com/databricks/hls/transformers/pipe/CSVPiperSuite.scala b/core/src/test/scala/io/projectglow/transformers/pipe/CSVPiperSuite.scala similarity index 94% rename from core/src/test/scala/com/databricks/hls/transformers/pipe/CSVPiperSuite.scala rename to core/src/test/scala/io/projectglow/transformers/pipe/CSVPiperSuite.scala index e107c7d22..93b809e22 100644 --- a/core/src/test/scala/com/databricks/hls/transformers/pipe/CSVPiperSuite.scala +++ b/core/src/test/scala/io/projectglow/transformers/pipe/CSVPiperSuite.scala @@ -6,14 +6,17 @@ * License, Version 2.0, a copy of which you may obtain at * http://www.apache.org/licenses/LICENSE-2.0 */ -package com.databricks.hls.transformers.pipe +package io.projectglow.transformers.pipe -import com.databricks.hls.DBGenomics -import com.databricks.hls.sql.HLSBaseTest import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.{StringType, StructField} -class CSVPiperSuite extends HLSBaseTest { +import io.projectglow.Glow +import io.projectglow.Glow +import io.projectglow.sql.GlowBaseTest +import io.projectglow.sql.GlowBaseTest + +class CSVPiperSuite extends GlowBaseTest { private val saige = s"$testDataHome/saige_output.txt" private val csv = s"$testDataHome/no_header.csv" @@ -51,7 +54,7 @@ class CSVPiperSuite extends HLSBaseTest { Map.empty } - DBGenomics.transform( + Glow.transform( "pipe", inputDf, baseOptions ++ inputDelimiterOption ++ outputDelimiterOption ++ inputHeaderOption ++ outputHeaderOption) @@ -140,7 +143,7 @@ class CSVPiperSuite extends HLSBaseTest { "out_delimiter" -> " ", "cmd" -> s"""["$testDataHome/vcf/scripts/gwas.sh"]""" ) - val outputDf = DBGenomics.transform("pipe", input, options) + val outputDf = Glow.transform("pipe", input, options) assert( outputDf.schema.fields.toSeq == Seq( StructField("CHR", StringType, nullable = true), @@ -162,7 +165,7 @@ class CSVPiperSuite extends HLSBaseTest { "out_delimiter" -> " ", "cmd" -> s"""["python", "$testDataHome/vcf/scripts/gwas-region.py", "$testDataHome/vcf/scripts/group_file.txt"]""" ) - val outputDf = DBGenomics.transform("pipe", input, options) + val outputDf = Glow.transform("pipe", input, options) assert( outputDf.schema.fields.toSeq == Seq( StructField("Gene", StringType, nullable = true), @@ -173,7 +176,7 @@ class CSVPiperSuite extends HLSBaseTest { test("Big file") { val inputDf = spark.read.text(s"$testDataHome/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.vcf") - val outputDf = DBGenomics.transform( + val outputDf = Glow.transform( "pipe", inputDf, Map("inputFormatter" -> "text", "outputFormatter" -> "csv", "cmd" -> """["cat", "-"]""")) diff --git a/core/src/test/scala/com/databricks/hls/transformers/pipe/PipeTransformerSuite.scala 
b/core/src/test/scala/io/projectglow/transformers/pipe/PipeTransformerSuite.scala similarity index 89% rename from core/src/test/scala/com/databricks/hls/transformers/pipe/PipeTransformerSuite.scala rename to core/src/test/scala/io/projectglow/transformers/pipe/PipeTransformerSuite.scala index bd5a9c79b..9624857bb 100644 --- a/core/src/test/scala/com/databricks/hls/transformers/pipe/PipeTransformerSuite.scala +++ b/core/src/test/scala/io/projectglow/transformers/pipe/PipeTransformerSuite.scala @@ -1,31 +1,19 @@ -package com.databricks.hls.transformers.pipe +package io.projectglow.transformers.pipe import java.io.{InputStream, OutputStream} import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.types.{StringType, StructField, StructType} -import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow +import org.apache.spark.sql.types.{StringType, StructField, StructType} +import org.apache.spark.unsafe.types.UTF8String -import com.databricks.hls.DBGenomics -import com.databricks.hls.sql.HLSBaseTest - -class PipeTransformerSuite extends HLSBaseTest { - test("read input and output formatters from service loader") { - val sess = spark - import sess.implicits._ - - val df = Seq("dolphin").toDF.repartition(1) - val options = - Map("inputFormatter" -> "dummy_in", "outputFormatter" -> "dummy_out", "cmd" -> """["cat"]""") - val output = new PipeTransformer().transform(df, options) - assert(output.count() == 1) - assert(output.schema.length == 1) - assert(output.schema.exists(f => f.name == "animal" && f.dataType == StringType)) - assert(output.where("animal = 'monkey'").count() == 1) - } +import io.projectglow.Glow +import io.projectglow.Glow +import io.projectglow.sql.GlowBaseTest +import io.projectglow.sql.GlowBaseTest +class PipeTransformerSuite extends GlowBaseTest { test("cleanup") { sparkContext.getPersistentRDDs.values.foreach(_.unpersist(true)) val sess = spark @@ -36,11 +24,25 @@ class PipeTransformerSuite extends HLSBaseTest { Map("inputFormatter" -> "dummy_in", "outputFormatter" -> "dummy_out", "cmd" -> """["cat"]""") new PipeTransformer().transform(df, options) assert(sparkContext.getPersistentRDDs.size == 2) - DBGenomics.transform("pipe_cleanup", df, Map.empty[String, String]) + Glow.transform("pipe_cleanup", df, Map.empty[String, String]) eventually { assert(sparkContext.getPersistentRDDs.size == 1) // Should cleanup the RDD cached by piping } } + + test("read input and output formatters from service loader") { + val sess = spark + import sess.implicits._ + + val df = Seq("dolphin").toDF.repartition(1) + val options = + Map("inputFormatter" -> "dummy_in", "outputFormatter" -> "dummy_out", "cmd" -> """["cat"]""") + val output = new PipeTransformer().transform(df, options) + assert(output.count() == 1) + assert(output.schema.length == 1) + assert(output.schema.exists(f => f.name == "animal" && f.dataType == StringType)) + assert(output.where("animal = 'monkey'").count() == 1) + } } class DummyInputFormatterFactory() extends InputFormatterFactory { diff --git a/core/src/test/scala/com/databricks/hls/transformers/pipe/TextPiperSuite.scala b/core/src/test/scala/io/projectglow/transformers/pipe/TextPiperSuite.scala similarity index 94% rename from core/src/test/scala/com/databricks/hls/transformers/pipe/TextPiperSuite.scala rename to core/src/test/scala/io/projectglow/transformers/pipe/TextPiperSuite.scala index 2ecda008b..c2e346e66 100644 --- 
a/core/src/test/scala/com/databricks/hls/transformers/pipe/TextPiperSuite.scala +++ b/core/src/test/scala/io/projectglow/transformers/pipe/TextPiperSuite.scala @@ -1,4 +1,4 @@ -package com.databricks.hls.transformers.pipe +package io.projectglow.transformers.pipe import scala.collection.JavaConverters._ @@ -6,9 +6,9 @@ import org.apache.spark.SparkException import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.{StringType, StructField, StructType} -import com.databricks.hls.sql.HLSBaseTest +import io.projectglow.sql.GlowBaseTest -class TextPiperSuite extends HLSBaseTest { +class TextPiperSuite extends GlowBaseTest { def pipeText(df: DataFrame): DataFrame = { val options = Map("inputFormatter" -> "text", "outputFormatter" -> "text", "cmd" -> """["cat", "-"]""") diff --git a/core/src/test/scala/com/databricks/hls/transformers/util/StringUtilsSuite.scala b/core/src/test/scala/io/projectglow/transformers/util/StringUtilsSuite.scala similarity index 96% rename from core/src/test/scala/com/databricks/hls/transformers/util/StringUtilsSuite.scala rename to core/src/test/scala/io/projectglow/transformers/util/StringUtilsSuite.scala index 350168906..0e2af01bf 100644 --- a/core/src/test/scala/com/databricks/hls/transformers/util/StringUtilsSuite.scala +++ b/core/src/test/scala/io/projectglow/transformers/util/StringUtilsSuite.scala @@ -1,4 +1,4 @@ -package com.databricks.hls.transformers.util +package io.projectglow.transformers.util import org.scalatest.FunSuite diff --git a/core/src/test/scala/com/databricks/vcf/InternalRowToVariantContextConverterSuite.scala b/core/src/test/scala/io/projectglow/vcf/InternalRowToVariantContextConverterSuite.scala similarity index 88% rename from core/src/test/scala/com/databricks/vcf/InternalRowToVariantContextConverterSuite.scala rename to core/src/test/scala/io/projectglow/vcf/InternalRowToVariantContextConverterSuite.scala index 4e0f72ab1..fec7b04e4 100644 --- a/core/src/test/scala/com/databricks/vcf/InternalRowToVariantContextConverterSuite.scala +++ b/core/src/test/scala/io/projectglow/vcf/InternalRowToVariantContextConverterSuite.scala @@ -1,4 +1,4 @@ -package com.databricks.vcf +package io.projectglow.vcf import scala.collection.JavaConverters._ @@ -6,9 +6,9 @@ import htsjdk.samtools.ValidationStringency import org.apache.spark.sql.types.{ArrayType, DataType, MapType, StructType} import org.bdgenomics.adam.rdd.VCFMetadataLoader -import com.databricks.hls.sql.HLSBaseTest +import io.projectglow.sql.GlowBaseTest -class InternalRowToVariantContextConverterSuite extends HLSBaseTest { +class InternalRowToVariantContextConverterSuite extends GlowBaseTest { lazy val NA12878 = s"$testDataHome/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.vcf" lazy val header = VCFMetadataLoader.readVcfHeader(sparkContext.hadoopConfiguration, NA12878) lazy val headerLines = header.getMetaDataInInputOrder.asScala.toSet @@ -21,7 +21,7 @@ class InternalRowToVariantContextConverterSuite extends HLSBaseTest { ) gridTest("common schema options pass strict validation")(optionsSeq) { options => - val df = spark.read.format("com.databricks.vcf").options(options).load(NA12878) + val df = spark.read.format("vcf").options(options).load(NA12878) new InternalRowToVariantContextConverter( toggleNullability(df.schema, true), headerLines, diff --git a/core/src/test/scala/com/databricks/vcf/TabixHelperSuite.scala b/core/src/test/scala/io/projectglow/vcf/TabixHelperSuite.scala similarity index 98% rename from core/src/test/scala/com/databricks/vcf/TabixHelperSuite.scala rename to 
core/src/test/scala/io/projectglow/vcf/TabixHelperSuite.scala index c476b025d..a5e3f84fb 100644 --- a/core/src/test/scala/com/databricks/vcf/TabixHelperSuite.scala +++ b/core/src/test/scala/io/projectglow/vcf/TabixHelperSuite.scala @@ -1,14 +1,16 @@ -package com.databricks.vcf +package io.projectglow.vcf import org.apache.spark.SparkConf import org.apache.spark.sql.sources._ import org.broadinstitute.hellbender.utils.SimpleInterval -import com.databricks.hls.common.HLSLogging -import com.databricks.hls.sql.HLSBaseTest -import com.databricks.vcf.TabixIndexHelper._ +import io.projectglow.common.{GlowLogging, VCFRow} +import io.projectglow.common.{GlowLogging, VCFRow} +import io.projectglow.sql.GlowBaseTest +import io.projectglow.vcf.TabixIndexHelper._ +import io.projectglow.sql.GlowBaseTest -class TabixHelperSuite extends HLSBaseTest with HLSLogging { +class TabixHelperSuite extends GlowBaseTest with GlowLogging { lazy val sourceName: String = "vcf" lazy val tabixTestVcf: String = s"$testDataHome/tabix-test-vcf" diff --git a/core/src/test/scala/com/databricks/vcf/VCFConverterBaseTest.scala b/core/src/test/scala/io/projectglow/vcf/VCFConverterBaseTest.scala similarity index 95% rename from core/src/test/scala/com/databricks/vcf/VCFConverterBaseTest.scala rename to core/src/test/scala/io/projectglow/vcf/VCFConverterBaseTest.scala index d662978e4..9c19af1df 100644 --- a/core/src/test/scala/com/databricks/vcf/VCFConverterBaseTest.scala +++ b/core/src/test/scala/io/projectglow/vcf/VCFConverterBaseTest.scala @@ -1,12 +1,13 @@ -package com.databricks.vcf +package io.projectglow.vcf import scala.reflect.runtime.universe._ import org.bdgenomics.adam.util.PhredUtils -import com.databricks.hls.common.TestUtils._ +import io.projectglow.common.{GenotypeFields, TestUtils, VCFRow} +import io.projectglow.common.{GenotypeFields, TestUtils, VCFRow} -trait VCFConverterBaseTest { +trait VCFConverterBaseTest extends TestUtils { final lazy val defaultContigName = "" final lazy val defaultStart = 0L diff --git a/core/src/test/scala/com/databricks/vcf/VCFDatasourceSuite.scala b/core/src/test/scala/io/projectglow/vcf/VCFDatasourceSuite.scala similarity index 98% rename from core/src/test/scala/com/databricks/vcf/VCFDatasourceSuite.scala rename to core/src/test/scala/io/projectglow/vcf/VCFDatasourceSuite.scala index 503a1564c..590792f9d 100644 --- a/core/src/test/scala/com/databricks/vcf/VCFDatasourceSuite.scala +++ b/core/src/test/scala/io/projectglow/vcf/VCFDatasourceSuite.scala @@ -1,4 +1,4 @@ -package com.databricks.vcf +package io.projectglow.vcf import java.nio.file.Files @@ -13,10 +13,12 @@ import org.apache.spark.sql.types.{StringType, StructField, StructType} import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.{SparkConf, SparkException} -import com.databricks.hls.common.TestUtils._ -import com.databricks.hls.sql.HLSBaseTest +import io.projectglow.common.{GenotypeFields, VCFRow} +import io.projectglow.common.{GenotypeFields, VCFRow} +import io.projectglow.sql.GlowBaseTest +import io.projectglow.sql.GlowBaseTest -class VCFDatasourceSuite extends HLSBaseTest { +class VCFDatasourceSuite extends GlowBaseTest { val sourceName = "vcf" diff --git a/core/src/test/scala/com/databricks/vcf/VCFFileWriterSuite.scala b/core/src/test/scala/io/projectglow/vcf/VCFFileWriterSuite.scala similarity index 98% rename from core/src/test/scala/com/databricks/vcf/VCFFileWriterSuite.scala rename to core/src/test/scala/io/projectglow/vcf/VCFFileWriterSuite.scala index 3dabb309a..104553ef0 100644 --- 
a/core/src/test/scala/com/databricks/vcf/VCFFileWriterSuite.scala +++ b/core/src/test/scala/io/projectglow/vcf/VCFFileWriterSuite.scala @@ -1,25 +1,29 @@ -package com.databricks.vcf +package io.projectglow.vcf import java.io.{BufferedInputStream, File} import java.nio.file.{Files, Path, Paths} import java.util.stream.Collectors import scala.collection.JavaConverters._ + import com.google.common.io.ByteStreams import htsjdk.samtools.ValidationStringency import htsjdk.samtools.util.{BlockCompressedInputStream, BlockCompressedStreamConstants} import htsjdk.variant.variantcontext.writer.VCFHeaderWriter import htsjdk.variant.vcf.{VCFCompoundHeaderLine, VCFHeader, VCFHeaderLine} import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.types.StructType import org.apache.spark.{SparkConf, SparkException} import org.bdgenomics.adam.rdd.ADAMContext._ -import com.databricks.hls.common.WithUtils -import com.databricks.hls.sql.HLSBaseTest -import org.apache.spark.sql.types.StructType import org.bdgenomics.adam.rdd.VCFMetadataLoader +import io.projectglow.common.{VCFRow, WithUtils} +import io.projectglow.common.{VCFRow, WithUtils} +import io.projectglow.sql.GlowBaseTest +import io.projectglow.sql.GlowBaseTest + abstract class VCFFileWriterSuite(val sourceName: String) - extends HLSBaseTest + extends GlowBaseTest with VCFConverterBaseTest { lazy val NA12878 = s"$testDataHome/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.vcf" @@ -395,7 +399,7 @@ class SingleFileVCFWriterSuite extends VCFFileWriterSuite("bigvcf") { } } -class VCFWriterUtilsSuite extends HLSBaseTest { +class VCFWriterUtilsSuite extends GlowBaseTest { val vcf = s"$testDataHome/NA12878_21_10002403.vcf" lazy val schema = spark.read.format("vcf").load(vcf).schema diff --git a/core/src/test/scala/com/databricks/vcf/VCFPiperSuite.scala b/core/src/test/scala/io/projectglow/vcf/VCFPiperSuite.scala similarity index 89% rename from core/src/test/scala/com/databricks/vcf/VCFPiperSuite.scala rename to core/src/test/scala/io/projectglow/vcf/VCFPiperSuite.scala index b48fd8bd8..defe7475d 100644 --- a/core/src/test/scala/com/databricks/vcf/VCFPiperSuite.scala +++ b/core/src/test/scala/io/projectglow/vcf/VCFPiperSuite.scala @@ -6,19 +6,23 @@ * License, Version 2.0, a copy of which you may obtain at * http://www.apache.org/licenses/LICENSE-2.0 */ -package com.databricks.vcf +package io.projectglow.vcf import scala.collection.JavaConverters._ -import org.apache.spark.{SparkException, TaskContext} import org.apache.spark.sql.DataFrame +import org.apache.spark.{SparkException, TaskContext} -import com.databricks.hls.DBGenomics -import com.databricks.hls.common.TestUtils._ -import com.databricks.hls.sql.HLSBaseTest -import com.databricks.hls.transformers.pipe.ProcessHelper +import io.projectglow.Glow +import io.projectglow.common.VCFRow +import io.projectglow.Glow +import io.projectglow.common.VCFRow +import io.projectglow.sql.GlowBaseTest +import io.projectglow.transformers.pipe.ProcessHelper +import io.projectglow.sql.GlowBaseTest +import io.projectglow.transformers.pipe.ProcessHelper -class VCFPiperSuite extends HLSBaseTest { +class VCFPiperSuite extends GlowBaseTest { lazy val sess = spark private val na12878 = s"$testDataHome/NA12878_21_10002403.vcf" private val TGP = s"$testDataHome/1000genomes-phase3-1row.vcf" @@ -38,7 +42,7 @@ class VCFPiperSuite extends HLSBaseTest { "outputFormatter" -> "vcf", "in_vcfHeader" -> "infer", "cmd" -> s"""["$script"]""") - val outputDf = DBGenomics.transform("pipe", inputDf, options) + val outputDf = Glow.transform("pipe", 
inputDf, options) (inputDf, outputDf) } @@ -98,7 +102,7 @@ class VCFPiperSuite extends HLSBaseTest { "env_c" -> "D", "envE" -> "F") val df = readVcf(na12878) - val output = DBGenomics + val output = Glow .transform("pipe", df, options) .as[String] .collect() @@ -114,7 +118,7 @@ class VCFPiperSuite extends HLSBaseTest { assert(df.count == 4) val options = baseTextOptions ++ Map("cmd" -> """["wc", "-l"]""", "in_vcfHeader" -> na12878) - val output = DBGenomics.transform("pipe", df, options) + val output = Glow.transform("pipe", df, options) assert(output.count() == 8) } @@ -123,7 +127,7 @@ class VCFPiperSuite extends HLSBaseTest { assert(df.count == 4) val options = baseTextOptions ++ Map("cmd" -> """["wc", "-l"]""", "in_vcfHeader" -> "infer") - assertThrows[SparkException](DBGenomics.transform("pipe", df, options)) + assertThrows[SparkException](Glow.transform("pipe", df, options)) } test("stdin and stderr threads are cleaned up for successful commands") { @@ -174,7 +178,7 @@ class VCFPiperSuite extends HLSBaseTest { "outputFormatter" -> "text", "in_vcfHeader" -> na12878, "cmd" -> s"""["cat", "-"]""") - val output = DBGenomics.transform("pipe", df, options) + val output = Glow.transform("pipe", df, options) assert(output.count == 28) } @@ -199,7 +203,7 @@ class VCFPiperSuite extends HLSBaseTest { "outputFormatter" -> "vcf", "in_vcfHeader" -> na12878, "cmd" -> s"""["cat", "-"]""") - val output = DBGenomics.transform("pipe", df, options) + val output = Glow.transform("pipe", df, options) assert(output.count() == 4) } @@ -219,7 +223,7 @@ class VCFPiperSuite extends HLSBaseTest { "outputFormatter" -> "vcf", "in_vcfHeader" -> "infer", "cmd" -> s"""["cat", "-"]""") - val outputDf = DBGenomics.transform("pipe", inputDf.toDF, options) + val outputDf = Glow.transform("pipe", inputDf.toDF, options) inputDf.as[SimpleVcfRow].collect.zip(outputDf.as[SimpleVcfRow].collect).foreach { case (vc1, vc2) => diff --git a/core/src/test/scala/com/databricks/vcf/VCFRowToVariantContextConverterSuite.scala b/core/src/test/scala/io/projectglow/vcf/VCFRowToVariantContextConverterSuite.scala similarity index 96% rename from core/src/test/scala/com/databricks/vcf/VCFRowToVariantContextConverterSuite.scala rename to core/src/test/scala/io/projectglow/vcf/VCFRowToVariantContextConverterSuite.scala index 5e4398fa0..2b17daa27 100644 --- a/core/src/test/scala/com/databricks/vcf/VCFRowToVariantContextConverterSuite.scala +++ b/core/src/test/scala/io/projectglow/vcf/VCFRowToVariantContextConverterSuite.scala @@ -1,20 +1,22 @@ -package com.databricks.vcf - -import scala.collection.JavaConverters._ +package io.projectglow.vcf import java.io.File import java.nio.file.Files +import scala.collection.JavaConverters._ + import htsjdk.samtools.ValidationStringency import htsjdk.variant.variantcontext.GenotypeLikelihoods import htsjdk.variant.vcf.{VCFFileReader, VCFHeader} import org.apache.commons.io.FileUtils import org.bdgenomics.adam.rdd.VCFMetadataLoader -import com.databricks.hls.common.TestUtils._ -import com.databricks.hls.sql.HLSBaseTest +import io.projectglow.common.{GenotypeFields, VCFRow} +import io.projectglow.common.{GenotypeFields, VCFRow} +import io.projectglow.sql.GlowBaseTest +import io.projectglow.sql.GlowBaseTest -class VCFRowToVariantContextConverterSuite extends HLSBaseTest with VCFConverterBaseTest { +class VCFRowToVariantContextConverterSuite extends GlowBaseTest with VCFConverterBaseTest { lazy val NA12878 = s"$testDataHome/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.vcf" lazy val TGP = 
s"$testDataHome/1000genomes-phase3-1row.vcf" diff --git a/core/src/test/scala/com/databricks/vcf/VCFSchemaInferrerSuite.scala b/core/src/test/scala/io/projectglow/vcf/VCFSchemaInferrerSuite.scala similarity index 96% rename from core/src/test/scala/com/databricks/vcf/VCFSchemaInferrerSuite.scala rename to core/src/test/scala/io/projectglow/vcf/VCFSchemaInferrerSuite.scala index e817a2291..93c87d9ea 100644 --- a/core/src/test/scala/com/databricks/vcf/VCFSchemaInferrerSuite.scala +++ b/core/src/test/scala/io/projectglow/vcf/VCFSchemaInferrerSuite.scala @@ -1,11 +1,14 @@ -package com.databricks.vcf +package io.projectglow.vcf import htsjdk.variant.vcf._ import org.apache.spark.sql.types._ -import com.databricks.hls.sql.HLSBaseTest +import io.projectglow.common.VariantSchemas +import io.projectglow.common.VariantSchemas +import io.projectglow.sql.GlowBaseTest +import io.projectglow.sql.GlowBaseTest -class VCFSchemaInferrerSuite extends HLSBaseTest { +class VCFSchemaInferrerSuite extends GlowBaseTest { test("includes base fields") { val schema = VCFSchemaInferrer.inferSchema(false, false, Seq.empty, Seq.empty) VariantSchemas.vcfBaseSchema.foreach { field => diff --git a/core/src/test/scala/com/databricks/vcf/VCFStreamWriterSuite.scala b/core/src/test/scala/io/projectglow/vcf/VCFStreamWriterSuite.scala similarity index 98% rename from core/src/test/scala/com/databricks/vcf/VCFStreamWriterSuite.scala rename to core/src/test/scala/io/projectglow/vcf/VCFStreamWriterSuite.scala index 4acb26cf4..f5c297e1d 100644 --- a/core/src/test/scala/com/databricks/vcf/VCFStreamWriterSuite.scala +++ b/core/src/test/scala/io/projectglow/vcf/VCFStreamWriterSuite.scala @@ -1,18 +1,18 @@ -package com.databricks.vcf - -import scala.collection.JavaConverters._ +package io.projectglow.vcf import java.io.{ByteArrayOutputStream, StringReader} -import org.apache.commons.io.IOUtils -import org.bdgenomics.adam.rdd.VCFMetadataLoader +import scala.collection.JavaConverters._ + import htsjdk.tribble.TribbleException.InvalidHeader import htsjdk.variant.variantcontext.{Allele, GenotypeBuilder, VariantContextBuilder} import htsjdk.variant.vcf.{VCFCodec, VCFHeader, VCFHeaderLine} +import org.apache.commons.io.IOUtils +import org.bdgenomics.adam.rdd.VCFMetadataLoader -import com.databricks.hls.sql.HLSBaseTest +import io.projectglow.sql.GlowBaseTest -class VCFStreamWriterSuite extends HLSBaseTest { +class VCFStreamWriterSuite extends GlowBaseTest { val refA: Allele = Allele.create("A", true) val altT: Allele = Allele.create("T", false) diff --git a/core/src/test/scala/com/databricks/vcf/VariantContextToVCFRowConverterSuite.scala b/core/src/test/scala/io/projectglow/vcf/VariantContextToVCFRowConverterSuite.scala similarity index 96% rename from core/src/test/scala/com/databricks/vcf/VariantContextToVCFRowConverterSuite.scala rename to core/src/test/scala/io/projectglow/vcf/VariantContextToVCFRowConverterSuite.scala index 5b9a2bd30..1b2e8b3e1 100644 --- a/core/src/test/scala/com/databricks/vcf/VariantContextToVCFRowConverterSuite.scala +++ b/core/src/test/scala/io/projectglow/vcf/VariantContextToVCFRowConverterSuite.scala @@ -1,18 +1,22 @@ -package com.databricks.vcf +package io.projectglow.vcf import java.io.File import java.lang.{Double => JDouble, Integer => JInteger} import java.util.{ArrayList => JArrayList, HashSet => JHashSet} import scala.collection.JavaConverters._ + import htsjdk.samtools.ValidationStringency -import org.bdgenomics.adam.rdd.VCFMetadataLoader import htsjdk.variant.variantcontext.{Allele, GenotypeBuilder, 
 import htsjdk.variant.variantcontext.{Allele, GenotypeBuilder, VariantContextBuilder}
 import htsjdk.variant.vcf.{VCFFileReader, VCFHeader}
-import com.databricks.hls.common.TestUtils._
-import com.databricks.hls.sql.HLSBaseTest
+import org.bdgenomics.adam.rdd.VCFMetadataLoader
+
+import io.projectglow.common.{GenotypeFields, VCFRow}
+import io.projectglow.sql.GlowBaseTest
 
-class VariantContextToVCFRowConverterSuite extends HLSBaseTest with VCFConverterBaseTest {
+class VariantContextToVCFRowConverterSuite extends GlowBaseTest with VCFConverterBaseTest {
 
   lazy val NA12878 = s"$testDataHome/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.vcf"
   lazy val TGP = s"$testDataHome/1000genomes-phase3-1row.vcf"
diff --git a/python/db_genomics/__init__.py b/python/db_genomics/__init__.py
deleted file mode 100644
index eeee4dc11..000000000
--- a/python/db_genomics/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from db_genomics.dbg import *
diff --git a/python/environment.yml b/python/environment.yml
index 410c8ec18..4a59d50c2 100644
--- a/python/environment.yml
+++ b/python/environment.yml
@@ -1,4 +1,4 @@
-name: spark-genomics
+name: glow
 dependencies:
   - python=3.7
   - pytest
diff --git a/python/glow/__init__.py b/python/glow/__init__.py
new file mode 100644
index 000000000..081715ede
--- /dev/null
+++ b/python/glow/__init__.py
@@ -0,0 +1 @@
+from glow.glow import *
diff --git a/python/db_genomics/dbg.py b/python/glow/glow.py
similarity index 94%
rename from python/db_genomics/dbg.py
rename to python/glow/glow.py
index 882c43a09..7b3df18c7 100644
--- a/python/db_genomics/dbg.py
+++ b/python/glow/glow.py
@@ -25,7 +25,7 @@ def transform(operation: str, df: DataFrame, arg_map: Dict[str, str]=None,
     assert check_argument_types()
 
     sc = SparkContext.getOrCreate(0)
-    transform_fn = SparkContext._jvm.com.databricks.hls.DBGenomics.transform
+    transform_fn = SparkContext._jvm.io.projectglow.Glow.transform
     args = arg_map if arg_map is not None else kwargs
     output_jdf = transform_fn(operation, df._jdf, args)
     output_df = DataFrame(output_jdf, SQLContext.getOrCreate(sc))
diff --git a/python/db_genomics/tests/__init__.py b/python/glow/tests/__init__.py
similarity index 100%
rename from python/db_genomics/tests/__init__.py
rename to python/glow/tests/__init__.py
diff --git a/python/db_genomics/tests/conftest.py b/python/glow/tests/conftest.py
similarity index 100%
rename from python/db_genomics/tests/conftest.py
rename to python/glow/tests/conftest.py
diff --git a/python/db_genomics/tests/test_transform.py b/python/glow/tests/test_transform.py
similarity index 80%
rename from python/db_genomics/tests/test_transform.py
rename to python/glow/tests/test_transform.py
index 8f6f077da..2971e045d 100644
--- a/python/db_genomics/tests/test_transform.py
+++ b/python/glow/tests/test_transform.py
@@ -1,12 +1,12 @@
 import pytest
 from pyspark.sql.utils import IllegalArgumentException
 
-import db_genomics as sg
+import glow
 
 
 def test_transform(spark):
     df = spark.read.format("vcf")\
         .load("test-data/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.vcf")
-    converted = sg.transform("pipe", df, input_formatter="vcf", output_formatter="vcf",
+    converted = glow.transform("pipe", df, input_formatter="vcf", output_formatter="vcf",
                              cmd='["cat"]', in_vcf_header="infer")
     assert converted.count() == 1075
@@ -15,7 +15,7 @@ def test_no_transform(spark):
     df = spark.read.format("vcf") \
         .load("test-data/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.vcf")
     with pytest.raises(IllegalArgumentException):
-        sg.transform("dne", df)
+        glow.transform("dne", df)
 
 
 def test_arg_map(spark):
@@ -27,5 +27,5 @@ def test_arg_map(spark):
         "cmd": '["cat"]',
         "in_vcfHeader": "infer"
     }
-    converted = sg.transform("pipe", df, args)
+    converted = glow.transform("pipe", df, args)
     assert converted.count() == 1075