28 changes: 23 additions & 5 deletions .gitignore
@@ -1,12 +1,30 @@
# Java targets
target
core/target
project/target
.idea
*.swp
*.swo
adam.log

# Distribution / packaging
build/
dist/
maven-repo/
*.egg-info/

# Mac
.DS_Store

# Byte-compiled / optimized / DLL files
**/__pycache__
*.pyc
.DS_Store

# Sphinx documentation
docs/build

# Editor files
*.swp
*.swo
*.iml
.idea

# Logs
adam.log
unit-tests.log
156 changes: 95 additions & 61 deletions build.sbt
@@ -6,9 +6,9 @@ val sparkVersion = "2.4.3"
val scalaMajorMinor = "2.11"

ThisBuild / scalaVersion := s"$scalaMajorMinor.12"
ThisBuild / version := "0.1.0-SNAPSHOT"
ThisBuild / organization := "io.projectglow"
ThisBuild / scalastyleConfig := baseDirectory.value / "scalastyle-config.xml"
ThisBuild / publish / skip := true

ThisBuild / organizationName := "The Glow Authors"
ThisBuild / startYear := Some(2019)
@@ -61,48 +61,78 @@ lazy val commonSettings = Seq(
MergeStrategy.first
},
scalacOptions += "-target:jvm-1.8"
)
)

lazy val dependencies = Seq(
"org.apache.spark" %% "spark-catalyst" % sparkVersion % "provided",
"org.apache.spark" %% "spark-core" % sparkVersion % "provided",
"org.apache.spark" %% "spark-mllib" % sparkVersion % "provided",
"org.apache.spark" %% "spark-sql" % sparkVersion % "provided",
"org.seqdoop" % "hadoop-bam" % "7.9.1",
"log4j" % "log4j" % "1.2.17",
"org.slf4j" % "slf4j-api" % "1.7.16",
"org.slf4j" % "slf4j-log4j12" % "1.7.16",
"org.jdbi" % "jdbi" % "2.63.1",
"com.typesafe.scala-logging" %% "scala-logging-slf4j" % "2.1.2",
// Exclude extraneous GATK dependencies
("org.broadinstitute" % "gatk" % "4.0.11.0")
.exclude("biz.k11i", "xgboost-predictor")
.exclude("com.esotericsoftware", "kryo")
.exclude("com.esotericsoftware", "reflectasm")
.exclude("com.github.jsr203hadoop", "jsr203hadoop")
.exclude("com.google.cloud", "google-cloud-nio")
.exclude("com.google.cloud.bigdataoss", "gcs-connector")
.exclude("com.intel", "genomicsdb")
.exclude("com.intel.gkl", "gkl")
.exclude("com.opencsv", "opencsv")
.exclude("commons-io", "commons-io")
.exclude("gov.nist.math.jama", "gov.nist.math.jama")
.exclude("it.unimi.dsi", "fastutil")
.exclude("org.aeonbits.owner", "owner")
.exclude("org.apache.commons", "commons-lang3")
.exclude("org.apache.commons", "commons-math3")
.exclude("org.apache.commons", "commons-collections4")
.exclude("org.apache.commons", "commons-vfs2")
.exclude("org.apache.hadoop", "hadoop-client")
.exclude("org.apache.spark", s"spark-mllib_$scalaMajorMinor")
.exclude("org.bdgenomics.adam", s"adam-core-spark2_$scalaMajorMinor")
.exclude("org.broadinstitute", "barclay")
.exclude("org.broadinstitute", "hdf5-java-bindings")
.exclude("org.broadinstitute", "gatk-native-bindings")
.exclude("org.broadinstitute", "gatk-bwamem-jni")
.exclude("org.broadinstitute", "gatk-fermilite-jni")
.exclude("org.jgrapht", "jgrapht-core")
.exclude("org.objenesis", "objenesis")
.exclude("org.ojalgo", "ojalgo")
.exclude("org.ojalgo", "ojalgo-commons-math3")
.exclude("org.reflections", "reflections")
.exclude("org.seqdoop", "hadoop-bam")
.exclude("org.xerial", "sqlite-jdbc"),
// Test dependencies
"org.scalatest" %% "scalatest" % "3.0.3" % "test",
"org.scalacheck" %% "scalacheck" % "1.12.5" % "test",
"org.mockito" % "mockito-all" % "1.9.5" % "test",
"org.apache.spark" %% "spark-core" % sparkVersion % "test" classifier "tests",
"org.apache.spark" %% "spark-catalyst" % sparkVersion % "test" classifier "tests",
"org.apache.spark" %% "spark-sql" % sparkVersion % "test" classifier "tests",
"org.bdgenomics.adam" %% "adam-apis-spark2" % "0.28.0" % "test",
"org.bdgenomics.bdg-formats" % "bdg-formats" % "0.11.3" % "test",
"org.xerial" % "sqlite-jdbc" % "3.20.1" % "test"
).map(_.exclude("com.google.code.findbugs", "jsr305"))
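
A note on the trailing .map: it applies one blanket exclusion to every dependency in the list. Spelled out for a single entry, each element is equivalent to the following (illustrative only):

("org.scalatest" %% "scalatest" % "3.0.3" % "test")
  .exclude("com.google.code.findbugs", "jsr305")

This mirrors the per-module .exclude chain used on the GATK artifact above, just applied uniformly.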

lazy val core = (project in file("core"))
.settings(
commonSettings,
name := "glow",
libraryDependencies ++= Seq(
"org.apache.spark" %% "spark-catalyst" % sparkVersion % "provided",
"org.apache.spark" %% "spark-core" % sparkVersion % "provided",
"org.apache.spark" %% "spark-mllib" % sparkVersion % "provided",
"org.apache.spark" %% "spark-sql" % sparkVersion % "provided",
"com.github.samtools" % "htsjdk" % "2.20.0",
"com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.9.9",
"org.seqdoop" % "hadoop-bam" % "7.9.1",
"log4j" % "log4j" % "1.2.17",
"org.slf4j" % "slf4j-api" % "1.7.16",
"org.slf4j" % "slf4j-log4j12" % "1.7.16",
"org.jdbi" % "jdbi" % "2.63.1",
"com.typesafe.scala-logging" %% "scala-logging-slf4j" % "2.1.2",
// Exclude extraneous GATK dependencies
("org.broadinstitute" % "gatk" % "4.0.11.0")
.exclude("biz.k11i", "xgboost-predictor")
.exclude("com.google.cloud.bigdataoss", "gcs-connector")
.exclude("com.intel", "genomicsdb")
.exclude("org.apache.spark", s"spark-mllib_$scalaMajorMinor")
.exclude("org.bdgenomics.adam", s"adam-core-spark2_$scalaMajorMinor")
.exclude("com.google.cloud", "google-cloud-nio"),
// Test dependencies
"org.scalatest" %% "scalatest" % "3.0.3" % "test",
"org.scalacheck" %% "scalacheck" % "1.12.5" % "test",
"org.mockito" % "mockito-all" % "1.9.5" % "test",
"org.apache.spark" %% "spark-core" % sparkVersion % "test" classifier "tests",
"org.apache.spark" %% "spark-catalyst" % sparkVersion % "test" classifier "tests",
"org.apache.spark" %% "spark-sql" % sparkVersion % "test" classifier "tests",
"org.bdgenomics.adam" %% "adam-apis-spark2" % "0.28.0" % "test",
"org.bdgenomics.bdg-formats" % "bdg-formats" % "0.11.3" % "test"
),
publish / skip := false,
bintrayRepository := "glow",
libraryDependencies ++= dependencies,
// Fix versions of libraries that are depended on multiple times
dependencyOverrides ++= Seq(
"org.apache.hadoop" % "hadoop-client" % "2.7.3",
"io.netty" % "netty" % "3.9.9.Final",
"io.netty" % "netty-all" % "4.1.17.Final"
"io.netty" % "netty-all" % "4.1.17.Final",
"com.github.samtools" % "htsjdk" % "2.20.1"
)
)
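
A semantic note on dependencyOverrides (general sbt behavior, not specific to this change): an override pins a module's version only when something else already pulls it in transitively; unlike libraryDependencies, it never adds a new dependency to the classpath. A minimal sketch, plus how the sbt-dependency-graph plugin added in project/plugins.sbt below can confirm where a pinned module comes from:

// Takes effect only if an existing dependency drags netty-all in transitively
dependencyOverrides += "io.netty" % "netty-all" % "4.1.17.Final"
// Provenance checks from the sbt shell (sbt-dependency-graph 0.9.x syntax):
//   core/whatDependsOn io.netty netty-all
//   core/dependencyTree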

@@ -127,33 +157,37 @@ lazy val python =
"SPARK_HOME" -> (ThisBuild / baseDirectory).value.absolutePath
).!
require(ret == 0, "Python tests failed")
}
},
publish / skip := true
)

// Uncomment the following for publishing to Sonatype.
// See https://www.scala-sbt.org/1.x/docs/Using-Sonatype.html for more detail.
// Publish to Bintray
ThisBuild / description := "An open-source toolkit for large-scale genomic analysis"
ThisBuild / homepage := Some(url("https://projectglow.io"))
ThisBuild / scmInfo := Some(
ScmInfo(
url("https://github.com/projectglow/glow"),
"scm:[email protected]:projectglow/glow.git"
)
)
ThisBuild / pomIncludeRepository := { _ =>
false
}
ThisBuild / publishMavenStyle := true

ThisBuild / bintrayOrganization := Some("projectglow")
ThisBuild / bintrayRepository := "glow"
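
For reviewers unfamiliar with sbt-bintray: with these ThisBuild defaults (core restates bintrayRepository := "glow" above) plus core's publish / skip := false, a plain `sbt publish` should target the glow repository in the projectglow Bintray org. If memory of the sbt-bintray 0.5.x docs serves — treat this as an assumption — credentials are read from the BINTRAY_USER / BINTRAY_PASS environment variables or a ~/.bintray/.credentials file shaped like:

realm = Bintray API Realm
host = api.bintray.com
user = <bintray-user>
password = <api-key>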

// ThisBuild / description := "Some description about your project."
// ThisBuild / licenses := List("Apache 2" -> new URL("http://www.apache.org/licenses/LICENSE-2.0.txt"))
// ThisBuild / homepage := Some(url("https://github.com/example/project"))
// ThisBuild / scmInfo := Some(
// ScmInfo(
// url("https://github.com/your-account/your-project"),
// "scm:[email protected]:your-account/your-project.git"
// )
// )
// ThisBuild / developers := List(
// Developer(
// id = "Your identifier",
// name = "Your Name",
// email = "your@email",
// url = url("http://your.url")
// )
// )
// ThisBuild / pomIncludeRepository := { _ => false }
// ThisBuild / publishTo := {
// val nexus = "https://oss.sonatype.org/"
// if (isSnapshot.value) Some("snapshots" at nexus + "content/repositories/snapshots")
// else Some("releases" at nexus + "service/local/staging/deploy/maven2")
// }
// ThisBuild / publishMavenStyle := true
import ReleaseTransformations._

releaseProcess := Seq[ReleaseStep](
checkSnapshotDependencies,
inquireVersions,
runTest,
setReleaseVersion,
commitReleaseVersion,
tagRelease,
publishArtifacts,
setNextVersion,
commitNextVersion
)
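
For orientation: under sbt-release 1.0.x, `sbt release` runs the steps above in order, and the two version steps rewrite version.sbt, the one-line file added at the bottom of this diff. A sketch of the rewrite cycle, assuming the plugin's default bump strategy and hypothetical version values:

version in ThisBuild := "0.1.0"          // before the release, as added in this PR
version in ThisBuild := "0.1.0"          // after setReleaseVersion (strips -SNAPSHOT; a no-op here)
version in ThisBuild := "0.1.1-SNAPSHOT" // after setNextVersion (default bump)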
15 changes: 8 additions & 7 deletions core/src/main/scala/io/projectglow/bgen/BgenSchemaInferrer.scala
@@ -17,7 +17,7 @@
package io.projectglow.bgen

import com.google.common.io.LittleEndianDataInputStream
import org.apache.hadoop.fs.FileStatus
import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.StructType

@@ -45,19 +45,20 @@ object BgenSchemaInferrer {

val serializableConf = new SerializableConfiguration(spark.sparkContext.hadoopConfiguration)
val ignoreExtension = options.get(BgenFileFormat.IGNORE_EXTENSION_KEY).exists(_.toBoolean)
val bgenFiles =
val bgenPaths =
files.filter { fs =>
fs.getLen > 0 && (fs
.getPath
.toString
.endsWith(BgenFileFormat.BGEN_SUFFIX) || ignoreExtension)
}
}.map(_.getPath.toString)
val hasSampleIds = spark
.sparkContext
.parallelize(bgenFiles)
.map { fs =>
val hadoopFs = fs.getPath.getFileSystem(serializableConf.value)
WithUtils.withCloseable(hadoopFs.open(fs.getPath)) { stream =>
.parallelize(bgenPaths)
.map { path =>
val hPath = new Path(path)
val hadoopFs = hPath.getFileSystem(serializableConf.value)
WithUtils.withCloseable(hadoopFs.open(hPath)) { stream =>
val littleEndianDataInputStream = new LittleEndianDataInputStream(stream)
new BgenHeaderReader(littleEndianDataInputStream)
.readHeader(None)
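
The refactor above is a serialization fix rather than a behavior change: on the Hadoop 2.x line this build pins, FileStatus is not java.io.Serializable, so it cannot be captured in an RDD closure, while plain path strings can. A minimal standalone sketch of the pattern with hypothetical names (the real code ships the driver's Hadoop configuration through a serializable wrapper instead of constructing a fresh one per task):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.spark.sql.SparkSession

def fileLengths(spark: SparkSession, files: Seq[FileStatus]): Array[Long] = {
  // Driver side: reduce each FileStatus to a serializable String
  val paths: Seq[String] = files.map(_.getPath.toString)
  spark.sparkContext.parallelize(paths).map { p =>
    // Executor side: rebuild the Path and open a FileSystem handle per task
    val hPath = new Path(p)
    val fs = hPath.getFileSystem(new Configuration())
    fs.getFileStatus(hPath).getLen // stand-in for reading the BGEN header
  }.collect()
}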
8 changes: 5 additions & 3 deletions core/src/main/scala/io/projectglow/vcf/VCFFileFormat.scala
@@ -457,13 +457,15 @@ private[vcf] object SchemaDelegate {
files: Seq[FileStatus]): (Seq[VCFInfoHeaderLine], Seq[VCFFormatHeaderLine]) = {
val serializableConf = new SerializableConfiguration(spark.sessionState.newHadoopConf())

val filePaths = files.map(_.getPath.toString)
spark
.sparkContext
.parallelize(files.map(_.getPath.toString))
.parallelize(filePaths)
.map { path =>
val (header, _) = VCFFileFormat.createVCFCodec(path, serializableConf.value)

(header.getInfoHeaderLines.asScala.toSeq, header.getFormatHeaderLines.asScala.toSeq)
val infoHeaderLines = header.getInfoHeaderLines.asScala.toSet
val formatHeaderLines = header.getFormatHeaderLines.asScala.toSet
(infoHeaderLines, formatHeaderLines)
}
.collect()
.foldLeft((Seq.empty[VCFInfoHeaderLine], Seq.empty[VCFFormatHeaderLine])) {
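
A small aside on the toSet change: building a Set per file deduplicates identical INFO/FORMAT declarations before the collected results are folded together on the driver, which matters when many VCFs repeat the same header lines. Illustrated with plain strings standing in for the htsjdk header-line objects:

// Hypothetical header lines parsed from three VCFs:
val perFile = Seq(
  Set("INFO=<ID=DP>", "INFO=<ID=AF>"),
  Set("INFO=<ID=DP>"),
  Set("INFO=<ID=DP>", "INFO=<ID=AF>")
)
// The union keeps one copy of each declaration, however many files repeat it:
val merged = perFile.foldLeft(Set.empty[String])(_ ++ _)
// merged == Set("INFO=<ID=DP>", "INFO=<ID=AF>")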
1 change: 1 addition & 0 deletions core/src/test/resources/log4j.properties
@@ -9,3 +9,4 @@ log4j.appender.file.File=unit-tests.log
log4j.logger.akka=ERROR
log4j.logger.Remoting=ERROR
log4j.logger.org.eclipse.jetty=ERROR
log4j.logger.org.apache.hadoop=WARN
1 change: 0 additions & 1 deletion core/src/test/scala/io/projectglow/sql/GlowBaseTest.scala
@@ -37,7 +37,6 @@ abstract class GlowBaseTest
override protected def sparkConf: SparkConf = {
super
.sparkConf
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.set("spark.driver.maxResultSize", "0")
.set("spark.kryo.registrator", "org.broadinstitute.hellbender.engine.spark.GATKRegistrator")
.set("spark.kryoserializer.buffer.max", "2047m")
2 changes: 2 additions & 0 deletions project/plugins.sbt
@@ -1,5 +1,7 @@
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.10")
addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.11")
addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.9.2")
addSbtPlugin("org.foundweekends" % "sbt-bintray" % "0.5.4")
addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.0.0")
addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0")
addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.1")
16 changes: 16 additions & 0 deletions python/README.rst
@@ -0,0 +1,16 @@
=======================================================
An open-source toolkit for large-scale genomic analysis
=======================================================

|circle-ci|

.. |circle-ci| image:: https://circleci.com/gh/projectglow/glow.svg?style=svg&circle-token=7511f70b2c810a18e88b5c537b0410e82db8617d
:target: https://circleci.com/gh/projectglow/glow

Glow is an open-source toolkit for working with genomic data at biobank-scale and beyond. The toolkit is natively built
on Apache Spark, the leading unified engine for big data processing and machine learning, enabling the scale of the
cloud for genomics workflows.

`Read the docs`_ to start using Glow.

.. _Read the docs: https://glow.readthedocs.io
2 changes: 2 additions & 0 deletions python/environment.yml
@@ -5,4 +5,6 @@ dependencies:
- pip
- pip:
- pyspark==2.4.2
- setuptools==41.2.0 # Python packaging
- typeguard==2.5.0
- twine==2.0.0 # PyPI publishing
20 changes: 20 additions & 0 deletions python/setup.py
@@ -0,0 +1,20 @@
from setuptools import setup

setup(
name='glow.py',
version='0.1.0',
packages=['glow'],
install_requires=[
'typeguard==2.5.0',
],
author='The Glow Authors',
description='An open-source toolkit for large-scale genomic analysis',
long_description=open('README.rst').read(),
long_description_content_type='text/x-rst',
license='Apache License 2.0',
classifiers=[
'Intended Audience :: Developers',
'Programming Language :: Python :: 3.7',
],
url='https://projectglow.io'
)
1 change: 1 addition & 0 deletions version.sbt
@@ -0,0 +1 @@
version in ThisBuild := "0.1.0"