28 changes: 23 additions & 5 deletions .gitignore
@@ -1,12 +1,30 @@
# Java targets
target
core/target
project/target
.idea
*.swp
*.swo
adam.log

# Distribution / packaging
build/
dist/
maven-repo/
*.egg-info/

# Mac
.DS_Store

# Byte-compiled / optimized / DLL files
**/__pycache__
*.pyc
.DS_Store

# Sphinx documentation
docs/build

# Editor files
*.swp
*.swo
*.iml
.idea

# Logs
adam.log
unit-tests.log
36 changes: 0 additions & 36 deletions README.md

This file was deleted.

10 changes: 10 additions & 0 deletions README.rst
@@ -0,0 +1,10 @@
Contributor: Can you migrate over the rest of the readme contents?

Contributor: And delete the old one?

Collaborator (author): To clean everything up for publishing (PyPI uses README.rst), I moved everything under Building and Testing to the wiki.

==================================
Glow: Genomics on Apache Spark SQL
==================================

|circle-ci|

.. |circle-ci| image:: https://circleci.com/gh/projectglow/glow.svg?style=svg&circle-token=7511f70b2c810a18e88b5c537b0410e82db8617d
:target: https://circleci.com/gh/projectglow/glow

Glow is a library to ETL genomics data using Apache Spark SQL.
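
For orientation, a minimal sketch of how the library is meant to be used from Spark follows. This is an illustration only: the `Glow.register` entry point and the `vcf` data source name are assumptions based on the package layout in this PR (io.projectglow, VCFFileFormat), and the session settings and file path are made up.

```scala
import org.apache.spark.sql.SparkSession

object GlowSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("glow-sketch").master("local[*]").getOrCreate()

    // Register Glow's functions and data sources with this session
    // (assumes the Glow jar built by this project is on the classpath).
    io.projectglow.Glow.register(spark)

    // Read a VCF into a DataFrame; the path is illustrative.
    val variants = spark.read.format("vcf").load("/data/genotypes.vcf")
    variants.printSchema()

    spark.stop()
  }
}
```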
159 changes: 98 additions & 61 deletions build.sbt
@@ -6,10 +6,11 @@ val sparkVersion = "2.4.3"
val scalaMajorMinor = "2.11"

ThisBuild / scalaVersion := s"$scalaMajorMinor.12"
ThisBuild / version := "0.1.0-SNAPSHOT"
ThisBuild / version := "0.1.0"
ThisBuild / organization := "org.projectglow"
ThisBuild / organizationName := "DB / RGC"
ThisBuild / scalastyleConfig := baseDirectory.value / "scalastyle-config.xml"
ThisBuild / publish / skip := true

// Compile Java sources before Scala sources, so Scala code can depend on Java
// but not vice versa
@@ -30,7 +31,7 @@ def groupByHash(tests: Seq[TestDefinition]) = {
.map {
case (i, tests) =>
val options = ForkOptions()
.withRunJVMOptions(Vector("-Xmx1024m"))
.withRunJVMOptions(Vector("-Dspark.ui.enabled=false", "-Xmx1024m"))
new Group(i.toString, tests, SubProcess(options))
}
.toSeq
@@ -58,46 +59,76 @@ lazy val commonSettings = Seq(
scalacOptions += "-target:jvm-1.8"
)

lazy val dependencies = Seq(
"org.apache.spark" %% "spark-catalyst" % sparkVersion % "provided",
"org.apache.spark" %% "spark-core" % sparkVersion % "provided",
"org.apache.spark" %% "spark-mllib" % sparkVersion % "provided",
"org.apache.spark" %% "spark-sql" % sparkVersion % "provided",
"org.seqdoop" % "hadoop-bam" % "7.9.1",
"log4j" % "log4j" % "1.2.17",
"org.slf4j" % "slf4j-api" % "1.7.16",
"org.slf4j" % "slf4j-log4j12" % "1.7.16",
"org.jdbi" % "jdbi" % "2.63.1",
"com.typesafe.scala-logging" %% "scala-logging-slf4j" % "2.1.2",
// Exclude extraneous GATK dependencies
("org.broadinstitute" % "gatk" % "4.0.11.0")
.exclude("biz.k11i", "xgboost-predictor")
.exclude("com.esotericsoftware", "kryo")
.exclude("com.esotericsoftware", "reflectasm")
.exclude("com.github.jsr203hadoop", "jsr203hadoop")
.exclude("com.google.cloud", "google-cloud-nio")
.exclude("com.google.cloud.bigdataoss", "gcs-connector")
.exclude("com.intel", "genomicsdb")
.exclude("com.intel.gkl", "gkl")
.exclude("com.opencsv", "opencsv")
.exclude("commons-io", "commons-io")
.exclude("gov.nist.math.jama", "gov.nist.math.jama")
.exclude("it.unimi.dsi", "fastutil")
.exclude("org.aeonbits.owner", "owner")
.exclude("org.apache.commons", "commons-lang3")
.exclude("org.apache.commons", "commons-math3")
.exclude("org.apache.commons", "commons-collections4")
.exclude("org.apache.commons", "commons-vfs2")
.exclude("org.apache.hadoop", "hadoop-client")
.exclude("org.apache.spark", s"spark-mllib_$scalaMajorMinor")
.exclude("org.bdgenomics.adam", s"adam-core-spark2_$scalaMajorMinor")
.exclude("org.broadinstitute", "barclay")
.exclude("org.broadinstitute", "hdf5-java-bindings")
.exclude("org.broadinstitute", "gatk-native-bindings")
.exclude("org.broadinstitute", "gatk-bwamem-jni")
.exclude("org.broadinstitute", "gatk-fermilite-jni")
.exclude("org.jgrapht", "jgrapht-core")
.exclude("org.objenesis", "objenesis")
.exclude("org.ojalgo", "ojalgo")
.exclude("org.ojalgo", "ojalgo-commons-math3")
.exclude("org.reflections", "reflections")
.exclude("org.seqdoop", "hadoop-bam")
.exclude("org.xerial", "sqlite-jdbc"),
// Test dependencies
"org.scalatest" %% "scalatest" % "3.0.3" % "test",
"org.scalacheck" %% "scalacheck" % "1.12.5" % "test",
"org.mockito" % "mockito-all" % "1.9.5" % "test",
"org.apache.spark" %% "spark-core" % sparkVersion % "test" classifier "tests",
"org.apache.spark" %% "spark-catalyst" % sparkVersion % "test" classifier "tests",
"org.apache.spark" %% "spark-sql" % sparkVersion % "test" classifier "tests",
"org.bdgenomics.adam" %% "adam-apis-spark2" % "0.28.0" % "test",
"org.bdgenomics.bdg-formats" % "bdg-formats" % "0.11.3" % "test",
"org.xerial" % "sqlite-jdbc" % "3.20.1" % "test"
).map(_.exclude("com.google.code.findbugs", "jsr305"))

lazy val core = (project in file("core"))
.settings(
commonSettings,
name := "glow",
libraryDependencies ++= Seq(
"org.apache.spark" %% "spark-catalyst" % sparkVersion % "provided",
"org.apache.spark" %% "spark-core" % sparkVersion % "provided",
"org.apache.spark" %% "spark-mllib" % sparkVersion % "provided",
"org.apache.spark" %% "spark-sql" % sparkVersion % "provided",
"com.github.samtools" % "htsjdk" % "2.20.0",
"com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.9.9",
"org.seqdoop" % "hadoop-bam" % "7.9.1",
"log4j" % "log4j" % "1.2.17",
"org.slf4j" % "slf4j-api" % "1.7.16",
"org.slf4j" % "slf4j-log4j12" % "1.7.16",
"org.jdbi" % "jdbi" % "2.63.1",
"com.typesafe.scala-logging" %% "scala-logging-slf4j" % "2.1.2",
// Exclude extraneous GATK dependencies
("org.broadinstitute" % "gatk" % "4.0.11.0")
.exclude("biz.k11i", "xgboost-predictor")
.exclude("com.google.cloud.bigdataoss", "gcs-connector")
.exclude("com.intel", "genomicsdb")
.exclude("org.apache.spark", s"spark-mllib_$scalaMajorMinor")
.exclude("org.bdgenomics.adam", s"adam-core-spark2_$scalaMajorMinor")
.exclude("com.google.cloud", "google-cloud-nio"),
// Test dependencies
"org.scalatest" %% "scalatest" % "3.0.3" % "test",
"org.scalacheck" %% "scalacheck" % "1.12.5" % "test",
"org.mockito" % "mockito-all" % "1.9.5" % "test",
"org.apache.spark" %% "spark-core" % sparkVersion % "test" classifier "tests",
"org.apache.spark" %% "spark-catalyst" % sparkVersion % "test" classifier "tests",
"org.apache.spark" %% "spark-sql" % sparkVersion % "test" classifier "tests",
"org.bdgenomics.adam" %% "adam-apis-spark2" % "0.28.0" % "test",
"org.bdgenomics.bdg-formats" % "bdg-formats" % "0.11.3" % "test"
),
publish / skip := false,
bintrayRepository := "glow",
libraryDependencies ++= dependencies,
// Fix versions of libraries that are depended on multiple times
dependencyOverrides ++= Seq(
"org.apache.hadoop" % "hadoop-client" % "2.7.3",
"io.netty" % "netty" % "3.9.9.Final",
"io.netty" % "netty-all" % "4.1.17.Final"
"io.netty" % "netty-all" % "4.1.17.Final",
"com.github.samtools" % "htsjdk" % "2.20.1"
)
)

@@ -122,33 +153,39 @@ lazy val python =
"SPARK_HOME" -> (ThisBuild / baseDirectory).value.absolutePath
).!
require(ret == 0, "Python tests failed")
}
},
publish / skip := true
)
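
The hunk above shows only a fragment of the Python test task. As a rough, self-contained sketch of the underlying pattern (running an external command from Scala with an extra environment variable and failing the task on a nonzero exit code), something like the following; the command and the SPARK_HOME value are illustrative:

```scala
import scala.sys.process._

// Run an external test command with an extra environment variable and
// fail loudly on a nonzero exit code. Command and paths are illustrative.
val ret = Process(Seq("pytest", "python"), None, "SPARK_HOME" -> "/path/to/spark").!
require(ret == 0, "Python tests failed")
```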

// Uncomment the following for publishing to Sonatype.
// See https://www.scala-sbt.org/1.x/docs/Using-Sonatype.html for more detail.
// Publish to Bintray
Contributor: I get that Bintray is helpful for testing, but why would we publish to Bintray vs. Maven Central? I think it's easier for users to download from Maven Central because they don't need to add a repo to their POM (please correct me if that's wrong).

Collaborator (author): As discussed offline, this is what Delta does with their OSS release: stage in Bintray before pushing to Maven Central.
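
To illustrate the trade-off being discussed, here is a hypothetical consumer-side build.sbt fragment; the coordinates and Bintray URL are guesses at the conventional layout, not taken from this PR. A Maven Central artifact needs only the dependency line, while a Bintray-hosted one also requires adding the repository as a resolver.

```scala
// Hypothetical consumer build.sbt: Maven Central needs no extra resolver.
libraryDependencies += "org.projectglow" %% "glow" % "0.1.0"

// Bintray-hosted artifacts require the repository to be added explicitly.
resolvers += "projectglow-bintray" at "https://dl.bintray.com/projectglow/glow"
```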

ThisBuild / description := "Glow: Genomics on Apache Spark"
ThisBuild / licenses := List(
"Apache-2.0" -> new URL("http://www.apache.org/licenses/LICENSE-2.0.txt"))
ThisBuild / homepage := Some(url("https://github.com/projectglow/glow"))
ThisBuild / scmInfo := Some(
ScmInfo(
url("https://github.com/projectglow/glow"),
"scm:git@github.com:projectglow/glow.git"
)
)
ThisBuild / pomIncludeRepository := { _ =>
false
}
ThisBuild / publishMavenStyle := true

ThisBuild / bintrayOrganization := Some("projectglow")
ThisBuild / bintrayRepository := "glow"

// ThisBuild / description := "Some descripiton about your project."
// ThisBuild / licenses := List("Apache 2" -> new URL("http://www.apache.org/licenses/LICENSE-2.0.txt"))
// ThisBuild / homepage := Some(url("https://github.com/example/project"))
// ThisBuild / scmInfo := Some(
// ScmInfo(
// url("https://github.com/your-account/your-project"),
// "scm:git@github.com:your-account/your-project.git"
// )
// )
// ThisBuild / developers := List(
// Developer(
// id = "Your identifier",
// name = "Your Name",
// email = "your@email",
// url = url("http://your.url")
// )
// )
// ThisBuild / pomIncludeRepository := { _ => false }
// ThisBuild / publishTo := {
// val nexus = "https://oss.sonatype.org/"
// if (isSnapshot.value) Some("snapshots" at nexus + "content/repositories/snapshots")
// else Some("releases" at nexus + "service/local/staging/deploy/maven2")
// }
// ThisBuild / publishMavenStyle := true
import ReleaseTransformations._

releaseProcess := Seq[ReleaseStep](
checkSnapshotDependencies,
inquireVersions,
runTest,
setReleaseVersion,
commitReleaseVersion,
tagRelease,
publishArtifacts,
setNextVersion,
commitNextVersion
)
13 changes: 7 additions & 6 deletions core/src/main/scala/io/projectglow/bgen/BgenSchemaInferrer.scala
@@ -29,19 +29,20 @@ object BgenSchemaInferrer {

val serializableConf = new SerializableConfiguration(spark.sparkContext.hadoopConfiguration)
val ignoreExtension = options.get(BgenFileFormat.IGNORE_EXTENSION_KEY).exists(_.toBoolean)
val bgenFiles =
val bgenPaths =
files.filter { fs =>
fs.getLen > 0 && (fs
.getPath
.toString
.endsWith(BgenFileFormat.BGEN_SUFFIX) || ignoreExtension)
}
}.map(_.getPath.toString)
val hasSampleIds = spark
.sparkContext
.parallelize(bgenFiles)
.map { fs =>
val hadoopFs = fs.getPath.getFileSystem(serializableConf.value)
WithUtils.withCloseable(hadoopFs.open(fs.getPath)) { stream =>
.parallelize(bgenPaths)
.map { path =>
val hPath = new Path(path)
val hadoopFs = hPath.getFileSystem(serializableConf.value)
WithUtils.withCloseable(hadoopFs.open(hPath)) { stream =>
val littleEndianDataInputStream = new LittleEndianDataInputStream(stream)
new BgenHeaderReader(littleEndianDataInputStream)
.readHeader(None)
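
The change above ships plain path strings to the executors and rebuilds the Hadoop Path there, instead of shipping FileStatus objects, likely to avoid serializing Hadoop filesystem objects. A minimal standalone sketch of that pattern follows; the function name is illustrative, and a bare Configuration stands in for the broadcast SerializableConfiguration used in the real code.

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.SparkSession

// Sketch: parallelize plain strings (trivially serializable), then reconstruct
// Hadoop Path/FileSystem objects on each executor before opening the file.
def firstByteOfEach(spark: SparkSession, paths: Seq[String]): Array[Int] = {
  spark.sparkContext
    .parallelize(paths)
    .map { p =>
      val hPath = new Path(p)
      val fs = hPath.getFileSystem(new Configuration())
      val stream = fs.open(hPath)
      try stream.read() finally stream.close()
    }
    .collect()
}
```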
8 changes: 5 additions & 3 deletions core/src/main/scala/io/projectglow/vcf/VCFFileFormat.scala
@@ -441,13 +441,15 @@ private[vcf] object SchemaDelegate {
files: Seq[FileStatus]): (Seq[VCFInfoHeaderLine], Seq[VCFFormatHeaderLine]) = {
val serializableConf = new SerializableConfiguration(spark.sessionState.newHadoopConf())

val filePaths = files.map(_.getPath.toString)
spark
.sparkContext
.parallelize(files.map(_.getPath.toString))
.parallelize(filePaths)
.map { path =>
val (header, _) = VCFFileFormat.createVCFCodec(path, serializableConf.value)

(header.getInfoHeaderLines.asScala.toSeq, header.getFormatHeaderLines.asScala.toSeq)
val infoHeaderLines = header.getInfoHeaderLines.asScala.toSet
val formatHeaderLines = header.getFormatHeaderLines.asScala.toSet
(infoHeaderLines, formatHeaderLines)
}
.collect()
.foldLeft((Seq.empty[VCFInfoHeaderLine], Seq.empty[VCFFormatHeaderLine])) {
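
The switch from sequences to sets here deduplicates header lines per file before they are collected to the driver. A toy sketch of the effect, with strings standing in for the htsjdk header-line objects:

```scala
// Toy example: per-file Sets drop duplicates early, so the driver-side merge
// only sees distinct header lines. Strings stand in for VCFInfoHeaderLine.
val perFileHeaders = Seq(Seq("DP", "AF", "DP"), Seq("AF", "AC"))
val merged = perFileHeaders
  .map(_.toSet)                              // dedupe within each file
  .foldLeft(Set.empty[String])(_ union _)    // dedupe across files
// merged == Set("DP", "AF", "AC")
```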
1 change: 1 addition & 0 deletions core/src/test/resources/log4j.properties
@@ -10,3 +10,4 @@ log4j.logger.akka=ERROR
log4j.logger.Remoting=ERROR
log4j.logger.org.apache.spark=INFO
log4j.logger.org.eclipse.jetty=ERROR
log4j.logger.org.apache.hadoop=ERROR
1 change: 0 additions & 1 deletion core/src/test/scala/io/projectglow/sql/GlowBaseTest.scala
@@ -21,7 +21,6 @@ abstract class GlowBaseTest
override protected def sparkConf: SparkConf = {
super
.sparkConf
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.set("spark.driver.maxResultSize", "0")
.set("spark.kryo.registrator", "org.broadinstitute.hellbender.engine.spark.GATKRegistrator")
.set("spark.kryoserializer.buffer.max", "2047m")
2 changes: 2 additions & 0 deletions project/plugins.sbt
@@ -1,5 +1,7 @@
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.10")
addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.11")
addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.9.2")
addSbtPlugin("org.foundweekends" % "sbt-bintray" % "0.5.4")
addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.0.0")
addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0")
addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.1")
2 changes: 2 additions & 0 deletions python/environment.yml
@@ -5,4 +5,6 @@ dependencies:
- pip
- pip:
- pyspark==2.4.2
- setuptools==41.2.0 # Python packaging
- typeguard==2.5.0
- twine==2.0.0 # PyPI publishing
20 changes: 20 additions & 0 deletions python/setup.py
@@ -0,0 +1,20 @@
from setuptools import setup

setup(
name='glowpy',
Contributor: Can we just call it glow?

Collaborator (author): As discussed offline, both glow and pyglow have already been taken on PyPI.

version='1.0.3',
packages=['db_genomics'],
install_requires=[
'typeguard==2.5.0',
],
author='Glow Project',
description='Glow: Genomics on Apache Spark',
long_description=open('../README.rst').read(),
long_description_content_type='text/x-rst',
license='Apache License 2.0',
classifiers=[
'Intended Audience :: Developers',
'Programming Language :: Python :: 3.7',
],
url='https://github.com/projectglow/glow'
)
1 change: 1 addition & 0 deletions version.sbt
@@ -0,0 +1 @@
version in ThisBuild := "0.1.1-SNAPSHOT"