Merged

Changes from all commits (31 commits)
1968d3a - do the big rename (henrydavidge, Oct 8, 2019)
b13014f - Make tests pass (henrydavidge, Oct 8, 2019)
ee08118 - imports (henrydavidge, Oct 8, 2019)
8c8a181 - sg -> glow (henrydavidge, Oct 8, 2019)
2bf6a0a - Trigger CircleCI tests (karenfeng, Oct 9, 2019)
8eb72ba - Trigger CircleCI again (karenfeng, Oct 9, 2019)
ef980d1 - Fix CircleCI config (karenfeng, Oct 9, 2019)
47381b8 - Fix Python dir (karenfeng, Oct 9, 2019)
2e29b02 - Rename datasources (karenfeng, Oct 9, 2019)
5850ad2 - CircleCI wip (karenfeng, Oct 9, 2019)
7eea7c1 - More CircleCI wip (karenfeng, Oct 9, 2019)
7e819fd - Continue Circleci wip (karenfeng, Oct 9, 2019)
0b4ccd1 - WIP (karenfeng, Oct 9, 2019)
0193b06 - Revert last change (karenfeng, Oct 9, 2019)
cb118c8 - Un package-private (karenfeng, Oct 9, 2019)
1b0144b - More un package-private (karenfeng, Oct 9, 2019)
e15370f - Continue un package-private (karenfeng, Oct 9, 2019)
9221069 - More un-package-private (karenfeng, Oct 9, 2019)
21d3289 - Try again (karenfeng, Oct 9, 2019)
a6e7922 - no core (henrydavidge, Oct 9, 2019)
26823c0 - Merge branch 'rename' of github.com:henrydavidge/glow into rename (henrydavidge, Oct 9, 2019)
7cbf6d6 - rename (henrydavidge, Oct 9, 2019)
4506775 - compile (henrydavidge, Oct 9, 2019)
a8a1f72 - fix test (henrydavidge, Oct 9, 2019)
b6b70e9 - fix tests (henrydavidge, Oct 9, 2019)
f5d53e4 - test file (henrydavidge, Oct 9, 2019)
217179d - no tabs (henrydavidge, Oct 9, 2019)
db1923a - less logging (henrydavidge, Oct 10, 2019)
339eefc - update (henrydavidge, Oct 10, 2019)
db728ad - io (henrydavidge, Oct 10, 2019)
88d0e68 - ignore unit tests (henrydavidge, Oct 10, 2019)
11 changes: 7 additions & 4 deletions .circleci/config.yml
@@ -1,17 +1,16 @@
 version: 2.1
 jobs:
   test:
-    working_directory: ~/spark-genomics
+    working_directory: ~/glow
     docker:
       - image: circleci/openjdk:8
     steps:
+      - checkout

       - restore_cache:
           keys:
             - conda-deps-v1-{{ checksum "python/environment.yml" }}

-      - checkout
-
       - run:
           name: install dependencies
           command: |
@@ -28,9 +27,13 @@ jobs:
           name: run tests
           environment:
           command: |
-            export PATH=$HOME/conda/envs/spark-genomics/bin:$PATH
+            export PATH=$HOME/conda/envs/glow/bin:$PATH
             sbt test exit

+      - store_artifacts:
+          path: ~/glow/unit-tests.log
+          destination: unit-tests.log
+
       - save_cache:
           paths:
             - /home/circleci/conda
1 change: 1 addition & 0 deletions .gitignore
@@ -9,3 +9,4 @@ adam.log
 *.pyc
 .DS_Store
 docs/build
+unit-tests.log
2 changes: 1 addition & 1 deletion .scalafmt.conf
@@ -16,4 +16,4 @@ includeCurlyBraceInSelectChains = false
 includeNoParensInSelectChains = true
 importSelectors = singleLine

-rewrite.rules = [PreferCurlyFors, SortImports]
\ No newline at end of file
+rewrite.rules = [PreferCurlyFors, SortImports]
9 changes: 5 additions & 4 deletions build.sbt
@@ -1,12 +1,13 @@
-import Tests._
 import scala.sys.process._

+import sbt.Tests._
+
 val sparkVersion = "2.4.3"
 val scalaMajorMinor = "2.11"

 ThisBuild / scalaVersion := s"$scalaMajorMinor.12"
 ThisBuild / version := "0.1.0-SNAPSHOT"
-ThisBuild / organization := "com.databricks"
+ThisBuild / organization := "org.projectglow"
 ThisBuild / organizationName := "DB / RGC"
 ThisBuild / scalastyleConfig := baseDirectory.value / "scalastyle-config.xml"

@@ -60,7 +61,7 @@ lazy val commonSettings = Seq(
 lazy val core = (project in file("core"))
   .settings(
     commonSettings,
-    name := "spark-genomics",
+    name := "glow",
     libraryDependencies ++= Seq(
       "org.apache.spark" %% "spark-catalyst" % sparkVersion % "provided",
       "org.apache.spark" %% "spark-core" % sparkVersion % "provided",
@@ -105,7 +106,7 @@ lazy val python =
     .dependsOn(core % "test->test")
     .settings(
       unmanagedSourceDirectories in Compile := {
-        Seq(baseDirectory.value / "spark_genomics")
+        Seq(baseDirectory.value / "glow")
      },
       test in Test := {
         // Pass the test classpath to pyspark so that we run the same bits as the Scala tests
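Note on the python project settings above: the test in Test override (truncated here) hands the Scala test classpath to pyspark so both suites exercise the same jars. A minimal sketch of what such wiring typically looks like; the pytest command and the EXT_CLASSPATH variable name are illustrative assumptions, not taken from this PR:

    // Hypothetical sbt task: expose the JVM test classpath to pyspark tests.
    // "pytest" and EXT_CLASSPATH are guesses for illustration only.
    test in Test := {
      val classpath = (fullClasspath in Test).value.files.mkString(":")
      val ret = scala.sys.process.Process(
        Seq("pytest", "python"),
        None,
        "EXT_CLASSPATH" -> classpath).!
      require(ret == 0, s"Python tests failed with exit code $ret")
    }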
@@ -1,9 +1,9 @@
-package com.databricks.hls.sql;
+package io.projectglow.sql;

 import org.apache.spark.sql.catalyst.util.GenericArrayData;
 import org.apache.spark.unsafe.types.UTF8String;

-public class HLSFunctions {
+public class Functions {
   public static GenericArrayData asciiCharSplit(UTF8String str, UTF8String split) {
     java.util.List<UTF8String> output = new java.util.ArrayList<>();
     int start = 0;
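The diff cuts asciiCharSplit off after the loop setup. For orientation only, a sketch of the behavior the signature implies, written in Scala; the semantics (split on a single ASCII character) are inferred from the name rather than confirmed by the visible code:

    import org.apache.spark.sql.catalyst.util.GenericArrayData
    import org.apache.spark.unsafe.types.UTF8String

    object AsciiSplitSketch {
      // Inferred behavior: split `str` on each occurrence of the single
      // character in `split`, returning the pieces as a Catalyst array.
      def asciiCharSplit(str: UTF8String, split: UTF8String): GenericArrayData = {
        val sep = split.toString.charAt(0)
        val pieces = str.toString.split(sep).map(UTF8String.fromString)
        new GenericArrayData(pieces.toArray[Any])
      }
    }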

This file was deleted.

This file was deleted.

This file was deleted.

@@ -0,0 +1,4 @@
+io.projectglow.transformers.LiftOverVariantsTransformer
+io.projectglow.transformers.normalizevariants.NormalizeVariantsTransformer
+io.projectglow.transformers.pipe.PipeTransformer
+io.projectglow.transformers.pipe.CleanupPipeTransformer
@@ -0,0 +1,3 @@
+io.projectglow.transformers.pipe.CSVInputFormatterFactory
+io.projectglow.transformers.pipe.UTF8TextInputFormatterFactory
+io.projectglow.vcf.VCFInputFormatterFactory
@@ -0,0 +1,3 @@
+io.projectglow.transformers.pipe.CSVOutputFormatterFactory
+io.projectglow.transformers.pipe.UTF8TextOutputFormatterFactory
+io.projectglow.vcf.VCFOutputFormatterFactory
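The three files above are java.util.ServiceLoader provider registrations; Glow.scala below imports ServiceLoader and a Named trait to look implementations up by name. A sketch of that discovery pattern, with the Named shape assumed from the import rather than shown in this PR:

    import java.util.ServiceLoader
    import scala.collection.JavaConverters._

    // Assumed provider interface; only the io.projectglow.common.Named
    // import is visible in this diff, not its definition.
    trait Named { def name: String }

    object ProviderRegistry {
      // ServiceLoader reads META-INF/services/<interface> files like the
      // ones added above and instantiates each listed class reflectively.
      def find[A <: Named](iface: Class[A], name: String): Option[A] =
        ServiceLoader.load(iface).iterator().asScala.find(_.name == name)
    }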
@@ -1,11 +1,11 @@
 # Standard file formats
-com.databricks.bgen.BgenFileFormat
-com.databricks.bgen.BigBgenDatasource
-com.databricks.vcf.BigVCFDatasource
-com.databricks.vcf.VCFFileFormat
+io.projectglow.bgen.BgenFileFormat
+io.projectglow.bgen.BigBgenDatasource
+io.projectglow.vcf.BigVCFDatasource
+io.projectglow.vcf.VCFFileFormat

 # Legacy file formats
-com.databricks.bgen.ComDatabricksBgenFileFormat
-com.databricks.bgen.ComDatabricksBigBgenDatasource
-com.databricks.vcf.ComDatabricksBigVCFDatasource
-com.databricks.vcf.ComDatabricksVCFFileFormat
+io.projectglow.bgen.ComDatabricksBgenFileFormat
+io.projectglow.bgen.ComDatabricksBigBgenDatasource
+io.projectglow.vcf.ComDatabricksBigVCFDatasource
+io.projectglow.vcf.ComDatabricksVCFFileFormat
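This file is Spark's DataSourceRegister service registration, which lets callers address the formats by short name rather than fully qualified class. A hedged usage sketch: "bgen" matches the shortName override visible in BgenFileFormat below, while "vcf" is an assumption about VCFFileFormat:

    import org.apache.spark.sql.SparkSession

    object ReadSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().appName("glow-read-sketch").getOrCreate()
        // "bgen" is the shortName of BgenFileFormat below; "vcf" is assumed
        // to be registered analogously by VCFFileFormat.
        val bgen = spark.read.format("bgen").load("/tmp/example.bgen")
        val vcf = spark.read.format("vcf").load("/tmp/example.vcf")
        bgen.printSchema()
        vcf.printSchema()
      }
    }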

This file was deleted.

This file was deleted.

@@ -1,13 +1,13 @@
-package com.databricks.hls
+package io.projectglow

 import java.util.ServiceLoader

 import scala.collection.JavaConverters._

 import org.apache.spark.sql.DataFrame

-import com.databricks.hls.common.Named
-import com.databricks.hls.transformers.util.{SnakeCaseMap, StringUtils}
+import io.projectglow.common.Named
+import io.projectglow.transformers.util.{SnakeCaseMap, StringUtils}

 /**
  * The entry point for all language specific functionality, meaning methods that cannot be expressed
@@ -16,7 +16,7 @@ import com.databricks.hls.transformers.util.{SnakeCaseMap, StringUtils}
  * We should expose as little functionality as is necessary through this object and should prefer
  * generic methods with stringly-typed arguments to reduce language-specific maintenance burden.
  */
-object DBGenomics {
+object Glow {

   /**
    * Apply a named transformation to a DataFrame of genomic data. All parameters apart from the
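Given the stringly-typed entry point described in the doc comment, a caller would look roughly like the sketch below. The transform signature, the transformer name, and the option key are assumptions based on the transformers registered earlier in this PR; the SnakeCaseMap import suggests option keys are normalized, so snake_case and camelCase spellings would likely resolve to the same parameter:

    import org.apache.spark.sql.DataFrame
    import io.projectglow.Glow

    object TransformSketch {
      // Hypothetical invocation of a registered transformer by name; the
      // exact signature and option names are not shown in this diff.
      def normalize(df: DataFrame): DataFrame =
        Glow.transform(
          "normalize_variants",
          df,
          Map("reference_genome_path" -> "/tmp/reference.fa"))
    }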
@@ -1,11 +1,11 @@
-package com.databricks.bgen
+package io.projectglow.bgen

 import java.util.{HashMap => JHashMap}

 import org.apache.commons.math3.util.CombinatoricsUtils

 // Tools for calculating ploidy or number of genotypes for unphased posterior probabilities
-private[databricks] object BgenConverterUtils {
+private[projectglow] object BgenConverterUtils {
   var ploidyMap = new JHashMap[(Int, Int), Int] // (numGenotypes, numAlleles) to ploidy
   var genotypesMap = new JHashMap[(Int, Int), Int] // (ploidy, numAlleles) to numGenotypes

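Context for the two caches above: for unphased data, the number of genotypes at a site with ploidy n and k alleles is the multiset count C(n + k - 1, k - 1), which is presumably why CombinatoricsUtils is imported; the maps then memoize that relation and its inverse. A sketch of the forward computation (the cached inverse lookups are this file's code and are not reproduced here):

    import org.apache.commons.math3.util.CombinatoricsUtils

    object GenotypeCountSketch {
      // Unphased genotype count: multisets of size `ploidy` drawn from
      // `numAlleles` alleles, i.e. C(ploidy + numAlleles - 1, numAlleles - 1).
      def numGenotypes(ploidy: Int, numAlleles: Int): Long =
        CombinatoricsUtils.binomialCoefficient(ploidy + numAlleles - 1, numAlleles - 1)

      // Example: a diploid, biallelic site has C(3, 1) = 3 genotypes
      // (0/0, 0/1, 1/1).
    }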
@@ -1,10 +1,10 @@
-package com.databricks.bgen
-
-import scala.collection.JavaConverters._
+package io.projectglow.bgen

 import java.io.{BufferedReader, File, InputStreamReader}
 import java.nio.file.Paths

+import scala.collection.JavaConverters._
+
 import com.google.common.io.LittleEndianDataInputStream
 import com.google.common.util.concurrent.Striped
 import org.apache.hadoop.conf.Configuration
@@ -19,12 +19,11 @@ import org.apache.spark.sql.types.StructType
 import org.skife.jdbi.v2.DBI
 import org.skife.jdbi.v2.util.LongMapper

-import com.databricks.hls.common.logging._
-import com.databricks.hls.common.{HLSLogging, WithUtils}
-import com.databricks.hls.sql.util.SerializableConfiguration
-import com.databricks.sql.ComDatabricksDataSource
+import io.projectglow.common.logging.{HlsMetricDefinitions, HlsTagDefinitions, HlsTagValues, HlsUsageLogging}
+import io.projectglow.common.{GlowLogging, WithUtils}
+import io.projectglow.sql.util.{ComDatabricksDataSource, SerializableConfiguration}

-class BgenFileFormat extends FileFormat with DataSourceRegister with Serializable with HLSLogging {
+class BgenFileFormat extends FileFormat with DataSourceRegister with Serializable with GlowLogging {

   override def shortName(): String = "bgen"

@@ -1,19 +1,17 @@
-package com.databricks.bgen
+package io.projectglow.bgen

 import java.io.{ByteArrayInputStream, DataInput, DataInputStream}
 import java.nio.charset.StandardCharsets
 import java.util.zip.Inflater

-import com.google.common.io
 import com.google.common.io.LittleEndianDataInputStream
 import org.apache.commons.math3.util.CombinatoricsUtils
 import org.apache.hadoop.fs.FSDataInputStream

-import com.databricks.hls.common.HLSLogging
-import com.databricks.vcf.{BgenGenotype, BgenRow, VCFRow}
+import io.projectglow.common.{BgenGenotype, BgenRow, GlowLogging}

 /**
- * Parses variant records of a BGEN file into the [[VCFRow]] schema. The iterator assumes that the
+ * Parses variant records of a BGEN file into the [[io.projectglow.common.VCFRow]] schema. The iterator assumes that the
  * input streams are currently at the beginning of a variant block.
  *
  * The `init` method should be called before reading variants to skip to an appropriate starting
@@ -35,14 +33,14 @@ import com.databricks.vcf.{BgenGenotype, BgenRow, VCFRow}
  * @param maxPos The maximum stream position from which variant blocks can be read. `hasNext` will
  *               return `false` once we've reached this position.
  */
-private[databricks] class BgenFileIterator(
+private[projectglow] class BgenFileIterator(
     metadata: BgenMetadata,
     stream: LittleEndianDataInputStream,
     underlyingStream: FSDataInputStream,
     minPos: Long,
     maxPos: Long)
   extends Iterator[BgenRow]
-  with HLSLogging {
+  with GlowLogging {

   import BgenFileIterator._

@@ -80,7 +78,7 @@ private[databricks] class BgenFileIterator(
     inflater.inflate(uncompressedBytes)

     val rawGenotypeStream = new DataInputStream(new ByteArrayInputStream(uncompressedBytes))
-    val genotypeStream = new io.LittleEndianDataInputStream(rawGenotypeStream)
+    val genotypeStream = new LittleEndianDataInputStream(rawGenotypeStream)
     val genotypes = readGenotypes(nAlleles, genotypeStream, metadata.sampleIds)

     BgenRow(
@@ -287,7 +285,7 @@ private[databricks] class BgenFileIterator(
   }
 }

-private[databricks] object BgenFileIterator {
+private[projectglow] object BgenFileIterator {

   /**
    * Utility function to read a UTF8 string from a data stream. Included in the companion object
@@ -309,7 +307,8 @@ private[databricks] object BgenFileIterator {
  * Read a BGEN header from a data stream. Performs basic validation on the header parameters
  * according to what the reader currently supports.
  */
-private[databricks] class BgenHeaderReader(stream: LittleEndianDataInputStream) extends HLSLogging {
+private[projectglow] class BgenHeaderReader(stream: LittleEndianDataInputStream)
+  extends GlowLogging {

   def readHeader(sampleIdsOpt: Option[Seq[String]] = None): BgenMetadata = {
     val variantOffset = Integer.toUnsignedLong(stream.readInt()) + 4
@@ -381,7 +380,7 @@ private[databricks] class BgenHeaderReader(stream: LittleEndianDataInputStream)
   }
 }

-private[databricks] case class BgenMetadata(
+private[projectglow] case class BgenMetadata(
     firstVariantOffset: Long,
     nSamples: Long,
     nVariantBlocks: Long,
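The class doc comment defines the iterator protocol: construct with a little-endian view over the raw stream, call init to seek to a valid starting point, then iterate between minPos and maxPos. A sketch of a caller, assuming a no-argument init() and ignoring the private[projectglow] scoping for illustration:

    import com.google.common.io.LittleEndianDataInputStream
    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.fs.Path

    object BgenScanSketch {
      def countVariants(path: Path, conf: Configuration): Long = {
        val fs = path.getFileSystem(conf)
        val underlying = fs.open(path) // FSDataInputStream over the file
        val stream = new LittleEndianDataInputStream(underlying)
        val metadata = new BgenHeaderReader(stream).readHeader()
        val it = new BgenFileIterator(
          metadata,
          stream,
          underlying,
          metadata.firstVariantOffset, // minPos: first variant block
          fs.getFileStatus(path).getLen) // maxPos: end of file
        it.init() // assumed no-arg; "should be called before reading variants"
        it.size.toLong
      }
    }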