28 changes: 23 additions & 5 deletions .gitignore
@@ -1,12 +1,30 @@
# Java targets
target
core/target
project/target
.idea
*.swp
*.swo
adam.log

# Distribution / packaging
build/
dist/
maven-repo/
*.egg-info/

# Mac
.DS_Store

# Byte-compiled / optimized / DLL files
**/__pycache__
*.pyc
.DS_Store

# Sphinx documentation
docs/build

# Editor files
*.swp
*.swo
*.iml
.idea

# Logs
adam.log
unit-tests.log
36 changes: 0 additions & 36 deletions README.md

This file was deleted.

10 changes: 10 additions & 0 deletions README.rst
@@ -0,0 +1,10 @@
==================================
Glow: Genomics on Apache Spark SQL
==================================

|circle-ci|

.. |circle-ci| image:: https://circleci.com/gh/projectglow/glow.svg?style=svg&circle-token=7511f70b2c810a18e88b5c537b0410e82db8617d
:target: https://circleci.com/gh/projectglow/glow

Glow is a library for ETL of genomics data using Apache Spark SQL.
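
For orientation, a minimal sketch of what ETL with this library looks like from Spark — the "vcf" data source name and the file path are illustrative assumptions, not part of this diff:

import org.apache.spark.sql.SparkSession

// Assumes the Glow jar is on the classpath and registers a "vcf" data source.
val spark = SparkSession.builder().appName("glow-example").getOrCreate()
val variants = spark.read.format("vcf").load("/data/sample.vcf")
variants.printSchema()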
155 changes: 95 additions & 60 deletions build.sbt
@@ -9,6 +9,7 @@ ThisBuild / scalaVersion := s"$scalaMajorMinor.12"
ThisBuild / version := "0.1.0-SNAPSHOT"
ThisBuild / organization := "io.projectglow"
ThisBuild / scalastyleConfig := baseDirectory.value / "scalastyle-config.xml"
ThisBuild / publish / skip := true

ThisBuild / organizationName := "The Glow Authors"
ThisBuild / startYear := Some(2019)
@@ -61,48 +62,78 @@ lazy val commonSettings = Seq(
MergeStrategy.first
},
scalacOptions += "-target:jvm-1.8"
)
)

lazy val dependencies = Seq(
"org.apache.spark" %% "spark-catalyst" % sparkVersion % "provided",
"org.apache.spark" %% "spark-core" % sparkVersion % "provided",
"org.apache.spark" %% "spark-mllib" % sparkVersion % "provided",
"org.apache.spark" %% "spark-sql" % sparkVersion % "provided",
"org.seqdoop" % "hadoop-bam" % "7.9.1",
"log4j" % "log4j" % "1.2.17",
"org.slf4j" % "slf4j-api" % "1.7.16",
"org.slf4j" % "slf4j-log4j12" % "1.7.16",
"org.jdbi" % "jdbi" % "2.63.1",
"com.typesafe.scala-logging" %% "scala-logging-slf4j" % "2.1.2",
// Exclude extraneous GATK dependencies
("org.broadinstitute" % "gatk" % "4.0.11.0")
.exclude("biz.k11i", "xgboost-predictor")
.exclude("com.esotericsoftware", "kryo")
.exclude("com.esotericsoftware", "reflectasm")
.exclude("com.github.jsr203hadoop", "jsr203hadoop")
.exclude("com.google.cloud", "google-cloud-nio")
.exclude("com.google.cloud.bigdataoss", "gcs-connector")
.exclude("com.intel", "genomicsdb")
.exclude("com.intel.gkl", "gkl")
.exclude("com.opencsv", "opencsv")
.exclude("commons-io", "commons-io")
.exclude("gov.nist.math.jama", "gov.nist.math.jama")
.exclude("it.unimi.dsi", "fastutil")
.exclude("org.aeonbits.owner", "owner")
.exclude("org.apache.commons", "commons-lang3")
.exclude("org.apache.commons", "commons-math3")
.exclude("org.apache.commons", "commons-collections4")
.exclude("org.apache.commons", "commons-vfs2")
.exclude("org.apache.hadoop", "hadoop-client")
.exclude("org.apache.spark", s"spark-mllib_$scalaMajorMinor")
.exclude("org.bdgenomics.adam", s"adam-core-spark2_$scalaMajorMinor")
.exclude("org.broadinstitute", "barclay")
.exclude("org.broadinstitute", "hdf5-java-bindings")
.exclude("org.broadinstitute", "gatk-native-bindings")
.exclude("org.broadinstitute", "gatk-bwamem-jni")
.exclude("org.broadinstitute", "gatk-fermilite-jni")
.exclude("org.jgrapht", "jgrapht-core")
.exclude("org.objenesis", "objenesis")
.exclude("org.ojalgo", "ojalgo")
.exclude("org.ojalgo", "ojalgo-commons-math3")
.exclude("org.reflections", "reflections")
.exclude("org.seqdoop", "hadoop-bam")
.exclude("org.xerial", "sqlite-jdbc"),
// Test dependencies
"org.scalatest" %% "scalatest" % "3.0.3" % "test",
"org.scalacheck" %% "scalacheck" % "1.12.5" % "test",
"org.mockito" % "mockito-all" % "1.9.5" % "test",
"org.apache.spark" %% "spark-core" % sparkVersion % "test" classifier "tests",
"org.apache.spark" %% "spark-catalyst" % sparkVersion % "test" classifier "tests",
"org.apache.spark" %% "spark-sql" % sparkVersion % "test" classifier "tests",
"org.bdgenomics.adam" %% "adam-apis-spark2" % "0.28.0" % "test",
"org.bdgenomics.bdg-formats" % "bdg-formats" % "0.11.3" % "test",
"org.xerial" % "sqlite-jdbc" % "3.20.1" % "test"
).map(_.exclude("com.google.code.findbugs", "jsr305"))
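
For readers less familiar with sbt, the trailing .map(_.exclude(...)) above applies a single exclusion to every dependency in the list, while per-dependency .exclude calls drop individual transitive artifacts. A minimal sketch of the same pattern, with illustrative coordinates:

// build.sbt sketch — coordinates are illustrative, not part of this diff.
lazy val deps = Seq(
  "org.example" %% "lib-a" % "1.0.0",
  ("org.example" % "lib-b" % "2.0.0")
    .exclude("org.unwanted", "transitive-dep") // drop one transitive artifact
).map(_.exclude("com.google.code.findbugs", "jsr305")) // drop jsr305 everywhere

libraryDependencies ++= deps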

lazy val core = (project in file("core"))
.settings(
commonSettings,
name := "glow",
libraryDependencies ++= Seq(
"org.apache.spark" %% "spark-catalyst" % sparkVersion % "provided",
"org.apache.spark" %% "spark-core" % sparkVersion % "provided",
"org.apache.spark" %% "spark-mllib" % sparkVersion % "provided",
"org.apache.spark" %% "spark-sql" % sparkVersion % "provided",
"com.github.samtools" % "htsjdk" % "2.20.0",
"com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.9.9",
"org.seqdoop" % "hadoop-bam" % "7.9.1",
"log4j" % "log4j" % "1.2.17",
"org.slf4j" % "slf4j-api" % "1.7.16",
"org.slf4j" % "slf4j-log4j12" % "1.7.16",
"org.jdbi" % "jdbi" % "2.63.1",
"com.typesafe.scala-logging" %% "scala-logging-slf4j" % "2.1.2",
// Exclude extraneous GATK dependencies
("org.broadinstitute" % "gatk" % "4.0.11.0")
.exclude("biz.k11i", "xgboost-predictor")
.exclude("com.google.cloud.bigdataoss", "gcs-connector")
.exclude("com.intel", "genomicsdb")
.exclude("org.apache.spark", s"spark-mllib_$scalaMajorMinor")
.exclude("org.bdgenomics.adam", s"adam-core-spark2_$scalaMajorMinor")
.exclude("com.google.cloud", "google-cloud-nio"),
// Test dependencies
"org.scalatest" %% "scalatest" % "3.0.3" % "test",
"org.scalacheck" %% "scalacheck" % "1.12.5" % "test",
"org.mockito" % "mockito-all" % "1.9.5" % "test",
"org.apache.spark" %% "spark-core" % sparkVersion % "test" classifier "tests",
"org.apache.spark" %% "spark-catalyst" % sparkVersion % "test" classifier "tests",
"org.apache.spark" %% "spark-sql" % sparkVersion % "test" classifier "tests",
"org.bdgenomics.adam" %% "adam-apis-spark2" % "0.28.0" % "test",
"org.bdgenomics.bdg-formats" % "bdg-formats" % "0.11.3" % "test"
),
publish / skip := false,
bintrayRepository := "glow",
libraryDependencies ++= dependencies,
// Fix versions of libraries that are depended on multiple times
dependencyOverrides ++= Seq(
"org.apache.hadoop" % "hadoop-client" % "2.7.3",
"io.netty" % "netty" % "3.9.9.Final",
"io.netty" % "netty-all" % "4.1.17.Final"
"io.netty" % "netty-all" % "4.1.17.Final",
"com.github.samtools" % "htsjdk" % "2.20.1"
)
)
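
dependencyOverrides pins a version during conflict resolution without adding a direct dependency. A hedged sketch of the effect, with illustrative coordinates:

// Illustrative only, not part of this diff: two dependencies pull in
// different htsjdk versions; the override forces both to resolve to 2.20.1.
libraryDependencies ++= Seq(
  "org.example" % "uses-htsjdk-2-18" % "1.0.0",
  "org.example" % "uses-htsjdk-2-20" % "1.0.0"
)
dependencyOverrides += "com.github.samtools" % "htsjdk" % "2.20.1"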

@@ -127,33 +158,37 @@ lazy val python =
"SPARK_HOME" -> (ThisBuild / baseDirectory).value.absolutePath
).!
require(ret == 0, "Python tests failed")
}
},
publish / skip := true
)

// Uncomment the following for publishing to Sonatype.
// See https://www.scala-sbt.org/1.x/docs/Using-Sonatype.html for more detail.
// Publish to Bintray
ThisBuild / description := "Glow: Genomics on Apache Spark"
ThisBuild / homepage := Some(url("http://projectglow.io"))
ThisBuild / scmInfo := Some(
ScmInfo(
url("https://github.com/projectglow/glow"),
"scm:git@github.com:projectglow/glow.git"
)
)
ThisBuild / pomIncludeRepository := { _ =>
false
}
ThisBuild / publishMavenStyle := true

ThisBuild / bintrayOrganization := Some("projectglow")
ThisBuild / bintrayRepository := "glow"

// ThisBuild / description := "Some descripiton about your project."
// ThisBuild / licenses := List("Apache 2" -> new URL("http://www.apache.org/licenses/LICENSE-2.0.txt"))
// ThisBuild / homepage := Some(url("https://github.com/example/project"))
// ThisBuild / scmInfo := Some(
// ScmInfo(
// url("https://github.com/your-account/your-project"),
// "scm:git@github.com:your-account/your-project.git"
// )
// )
// ThisBuild / developers := List(
// Developer(
// id = "Your identifier",
// name = "Your Name",
// email = "your@email",
// url = url("http://your.url")
// )
// )
// ThisBuild / pomIncludeRepository := { _ => false }
// ThisBuild / publishTo := {
// val nexus = "https://oss.sonatype.org/"
// if (isSnapshot.value) Some("snapshots" at nexus + "content/repositories/snapshots")
// else Some("releases" at nexus + "service/local/staging/deploy/maven2")
// }
// ThisBuild / publishMavenStyle := true
import ReleaseTransformations._

releaseProcess := Seq[ReleaseStep](
checkSnapshotDependencies,
inquireVersions,
runTest,
setReleaseVersion,
commitReleaseVersion,
tagRelease,
publishArtifacts,
setNextVersion,
commitNextVersion
)
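
Each entry in releaseProcess is a ReleaseStep, essentially a State => State function run in order, aborting on the first failure. A hedged sketch of adding a custom step — the step and its log message are hypothetical:

import sbtrelease.ReleaseStep

// Hypothetical extra step, not part of this diff: log after publishing.
lazy val announceRelease: ReleaseStep = ReleaseStep(action = { st =>
  st.log.info("Artifacts published; version bumped.")
  st
})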
13 changes: 7 additions & 6 deletions core/src/main/scala/io/projectglow/bgen/BgenSchemaInferrer.scala
@@ -45,19 +45,20 @@ object BgenSchemaInferrer {

val serializableConf = new SerializableConfiguration(spark.sparkContext.hadoopConfiguration)
val ignoreExtension = options.get(BgenFileFormat.IGNORE_EXTENSION_KEY).exists(_.toBoolean)
val bgenFiles =
val bgenPaths =
files.filter { fs =>
fs.getLen > 0 && (fs
.getPath
.toString
.endsWith(BgenFileFormat.BGEN_SUFFIX) || ignoreExtension)
}
}.map(_.getPath.toString)
val hasSampleIds = spark
.sparkContext
.parallelize(bgenFiles)
.map { fs =>
val hadoopFs = fs.getPath.getFileSystem(serializableConf.value)
WithUtils.withCloseable(hadoopFs.open(fs.getPath)) { stream =>
.parallelize(bgenPaths)
.map { path =>
val hPath = new Path(path)
val hadoopFs = hPath.getFileSystem(serializableConf.value)
WithUtils.withCloseable(hadoopFs.open(hPath)) { stream =>
val littleEndianDataInputStream = new LittleEndianDataInputStream(stream)
new BgenHeaderReader(littleEndianDataInputStream)
.readHeader(None)
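
The refactor ships plain path strings to the executors and rebuilds Hadoop Path objects there, a common pattern when the driver-side file-metadata type (here FileStatus) does not serialize cleanly. A minimal sketch of the pattern — files and serializableConf are assumed from the surrounding code:

import org.apache.hadoop.fs.Path

// Send plain strings to executors; reconstruct Hadoop Paths worker-side.
val paths: Seq[String] = files.map(_.getPath.toString)
val nonEmpty = spark.sparkContext.parallelize(paths).map { p =>
  val hadoopPath = new Path(p)
  val fs = hadoopPath.getFileSystem(serializableConf.value)
  fs.getFileStatus(hadoopPath).getLen > 0 // per-file work goes here
}.collect()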
8 changes: 5 additions & 3 deletions core/src/main/scala/io/projectglow/vcf/VCFFileFormat.scala
@@ -457,13 +457,15 @@ private[vcf] object SchemaDelegate {
files: Seq[FileStatus]): (Seq[VCFInfoHeaderLine], Seq[VCFFormatHeaderLine]) = {
val serializableConf = new SerializableConfiguration(spark.sessionState.newHadoopConf())

val filePaths = files.map(_.getPath.toString)
spark
.sparkContext
.parallelize(files.map(_.getPath.toString))
.parallelize(filePaths)
.map { path =>
val (header, _) = VCFFileFormat.createVCFCodec(path, serializableConf.value)

(header.getInfoHeaderLines.asScala.toSeq, header.getFormatHeaderLines.asScala.toSeq)
val infoHeaderLines = header.getInfoHeaderLines.asScala.toSet
val formatHeaderLines = header.getFormatHeaderLines.asScala.toSet
(infoHeaderLines, formatHeaderLines)
}
.collect()
.foldLeft((Seq.empty[VCFInfoHeaderLine], Seq.empty[VCFFormatHeaderLine])) {
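
Switching the per-file collections from Seq to Set deduplicates repeated INFO/FORMAT header lines before the driver-side fold. A simplified sketch of the merge, using plain strings in place of the htsjdk header-line types:

// Simplified: union per-file (info, format) header-line sets on the driver.
val perFile: Seq[(Set[String], Set[String])] =
  Seq((Set("AF"), Set("GT")), (Set("AF", "DP"), Set("GT")))

val (info, format) = perFile.foldLeft((Set.empty[String], Set.empty[String])) {
  case ((i, f), (pi, pf)) => (i ++ pi, f ++ pf)
}
// info == Set("AF", "DP"); format == Set("GT")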
1 change: 1 addition & 0 deletions core/src/test/resources/log4j.properties
@@ -9,3 +9,4 @@ log4j.appender.file.File=unit-tests.log
log4j.logger.akka=ERROR
log4j.logger.Remoting=ERROR
log4j.logger.org.eclipse.jetty=ERROR
log4j.logger.org.apache.hadoop=ERROR
1 change: 0 additions & 1 deletion core/src/test/scala/io/projectglow/sql/GlowBaseTest.scala
@@ -37,7 +37,6 @@ abstract class GlowBaseTest
override protected def sparkConf: SparkConf = {
super
.sparkConf
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.set("spark.driver.maxResultSize", "0")
.set("spark.kryo.registrator", "org.broadinstitute.hellbender.engine.spark.GATKRegistrator")
.set("spark.kryoserializer.buffer.max", "2047m")
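
With the explicit spark.serializer setting removed, the test session falls back to Spark's default JavaSerializer; the remaining Kryo settings apply only on code paths that still use Kryo. An approximation of the resulting test conf, for illustration only:

import org.apache.spark.SparkConf

// Approximation of the effective test configuration after this change.
val conf = new SparkConf()
  .set("spark.driver.maxResultSize", "0")
  .set("spark.kryo.registrator", "org.broadinstitute.hellbender.engine.spark.GATKRegistrator")
  .set("spark.kryoserializer.buffer.max", "2047m")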
2 changes: 2 additions & 0 deletions project/plugins.sbt
@@ -1,5 +1,7 @@
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.10")
addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.11")
addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.9.2")
addSbtPlugin("org.foundweekends" % "sbt-bintray" % "0.5.4")
addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.0.0")
addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0")
addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.1")
2 changes: 2 additions & 0 deletions python/environment.yml
@@ -5,4 +5,6 @@ dependencies:
- pip
- pip:
- pyspark==2.4.2
- setuptools==41.2.0 # Python packaging
- typeguard==2.5.0
- twine==2.0.0 # PyPI publishing
20 changes: 20 additions & 0 deletions python/setup.py
@@ -0,0 +1,20 @@
from setuptools import setup

setup(
name='glowpy',
version='1.0.3',
packages=['db_genomics'],
install_requires=[
'typeguard==2.5.0',
],
author='Glow Project',
description='Glow: Genomics on Apache Spark',
long_description=open('../README.rst').read(),
long_description_content_type='text/x-rst',
license='Apache License 2.0',
classifiers=[
'Intended Audience :: Developers',
'Programming Language :: Python :: 3.7',
],
url='https://github.com/projectglow/glow'
)
1 change: 1 addition & 0 deletions version.sbt
@@ -0,0 +1 @@
version in ThisBuild := "0.1.1-SNAPSHOT"