diff --git a/.gitignore b/.gitignore
index 46d77cff2..83766268e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,12 +1,30 @@
+# Java targets
 target
 core/target
 project/target
-.idea
-*.swp
-*.swo
-adam.log
+
+# Distribution / packaging
+build/
+dist/
+maven-repo/
+*.egg-info/
+
+# Mac
+.DS_Store
+
+# Byte-compiled / optimized / DLL files
 **/__pycache__
 *.pyc
-.DS_Store
+
+# Sphinx documentation
 docs/build
+
+# Editor files
+*.swp
+*.swo
+*.iml
+.idea
+
+# Logs
+adam.log
 unit-tests.log
diff --git a/build.sbt b/build.sbt
index 4ebb2da90..ecdb86c35 100644
--- a/build.sbt
+++ b/build.sbt
@@ -6,9 +6,9 @@
 val sparkVersion = "2.4.3"
 val scalaMajorMinor = "2.11"
 
 ThisBuild / scalaVersion := s"$scalaMajorMinor.12"
-ThisBuild / version := "0.1.0-SNAPSHOT"
 ThisBuild / organization := "io.projectglow"
 ThisBuild / scalastyleConfig := baseDirectory.value / "scalastyle-config.xml"
+ThisBuild / publish / skip := true
 ThisBuild / organizationName := "The Glow Authors"
 ThisBuild / startYear := Some(2019)
@@ -61,48 +61,78 @@ lazy val commonSettings = Seq(
       MergeStrategy.first
   },
   scalacOptions += "-target:jvm-1.8"
-  )
+)
+
+lazy val dependencies = Seq(
+  "org.apache.spark" %% "spark-catalyst" % sparkVersion % "provided",
+  "org.apache.spark" %% "spark-core" % sparkVersion % "provided",
+  "org.apache.spark" %% "spark-mllib" % sparkVersion % "provided",
+  "org.apache.spark" %% "spark-sql" % sparkVersion % "provided",
+  "org.seqdoop" % "hadoop-bam" % "7.9.1",
+  "log4j" % "log4j" % "1.2.17",
+  "org.slf4j" % "slf4j-api" % "1.7.16",
+  "org.slf4j" % "slf4j-log4j12" % "1.7.16",
+  "org.jdbi" % "jdbi" % "2.63.1",
+  "com.typesafe.scala-logging" %% "scala-logging-slf4j" % "2.1.2",
+  // Exclude extraneous GATK dependencies
+  ("org.broadinstitute" % "gatk" % "4.0.11.0")
+    .exclude("biz.k11i", "xgboost-predictor")
+    .exclude("com.esotericsoftware", "kryo")
+    .exclude("com.esotericsoftware", "reflectasm")
+    .exclude("com.github.jsr203hadoop", "jsr203hadoop")
+    .exclude("com.google.cloud", "google-cloud-nio")
+    .exclude("com.google.cloud.bigdataoss", "gcs-connector")
+    .exclude("com.intel", "genomicsdb")
+    .exclude("com.intel.gkl", "gkl")
+    .exclude("com.opencsv", "opencsv")
+    .exclude("commons-io", "commons-io")
+    .exclude("gov.nist.math.jama", "gov.nist.math.jama")
+    .exclude("it.unimi.dsi", "fastutil")
+    .exclude("org.aeonbits.owner", "owner")
+    .exclude("org.apache.commons", "commons-lang3")
+    .exclude("org.apache.commons", "commons-math3")
+    .exclude("org.apache.commons", "commons-collections4")
+    .exclude("org.apache.commons", "commons-vfs2")
+    .exclude("org.apache.hadoop", "hadoop-client")
+    .exclude("org.apache.spark", s"spark-mllib_$scalaMajorMinor")
+    .exclude("org.bdgenomics.adam", s"adam-core-spark2_$scalaMajorMinor")
+    .exclude("org.broadinstitute", "barclay")
+    .exclude("org.broadinstitute", "hdf5-java-bindings")
+    .exclude("org.broadinstitute", "gatk-native-bindings")
+    .exclude("org.broadinstitute", "gatk-bwamem-jni")
+    .exclude("org.broadinstitute", "gatk-fermilite-jni")
+    .exclude("org.jgrapht", "jgrapht-core")
+    .exclude("org.objenesis", "objenesis")
+    .exclude("org.ojalgo", "ojalgo")
+    .exclude("org.ojalgo", "ojalgo-commons-math3")
+    .exclude("org.reflections", "reflections")
+    .exclude("org.seqdoop", "hadoop-bam")
+    .exclude("org.xerial", "sqlite-jdbc"),
+  // Test dependencies
+  "org.scalatest" %% "scalatest" % "3.0.3" % "test",
+  "org.scalacheck" %% "scalacheck" % "1.12.5" % "test",
+  "org.mockito" % "mockito-all" % "1.9.5" % "test",
+  "org.apache.spark" %% "spark-core" % sparkVersion % "test" classifier "tests",
+  "org.apache.spark" %% "spark-catalyst" % sparkVersion % "test" classifier "tests",
+  "org.apache.spark" %% "spark-sql" % sparkVersion % "test" classifier "tests",
+  "org.bdgenomics.adam" %% "adam-apis-spark2" % "0.28.0" % "test",
+  "org.bdgenomics.bdg-formats" % "bdg-formats" % "0.11.3" % "test",
+  "org.xerial" % "sqlite-jdbc" % "3.20.1" % "test"
+).map(_.exclude("com.google.code.findbugs", "jsr305"))
 
 lazy val core = (project in file("core"))
   .settings(
     commonSettings,
     name := "glow",
-    libraryDependencies ++= Seq(
-      "org.apache.spark" %% "spark-catalyst" % sparkVersion % "provided",
-      "org.apache.spark" %% "spark-core" % sparkVersion % "provided",
-      "org.apache.spark" %% "spark-mllib" % sparkVersion % "provided",
-      "org.apache.spark" %% "spark-sql" % sparkVersion % "provided",
-      "com.github.samtools" % "htsjdk" % "2.20.0",
-      "com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.9.9",
-      "org.seqdoop" % "hadoop-bam" % "7.9.1",
-      "log4j" % "log4j" % "1.2.17",
-      "org.slf4j" % "slf4j-api" % "1.7.16",
-      "org.slf4j" % "slf4j-log4j12" % "1.7.16",
-      "org.jdbi" % "jdbi" % "2.63.1",
-      "com.typesafe.scala-logging" %% "scala-logging-slf4j" % "2.1.2",
-      // Exclude extraneous GATK dependencies
-      ("org.broadinstitute" % "gatk" % "4.0.11.0")
-        .exclude("biz.k11i", "xgboost-predictor")
-        .exclude("com.google.cloud.bigdataoss", "gcs-connector")
-        .exclude("com.intel", "genomicsdb")
-        .exclude("org.apache.spark", s"spark-mllib_$scalaMajorMinor")
-        .exclude("org.bdgenomics.adam", s"adam-core-spark2_$scalaMajorMinor")
-        .exclude("com.google.cloud", "google-cloud-nio"),
-      // Test dependencies
-      "org.scalatest" %% "scalatest" % "3.0.3" % "test",
-      "org.scalacheck" %% "scalacheck" % "1.12.5" % "test",
-      "org.mockito" % "mockito-all" % "1.9.5" % "test",
-      "org.apache.spark" %% "spark-core" % sparkVersion % "test" classifier "tests",
-      "org.apache.spark" %% "spark-catalyst" % sparkVersion % "test" classifier "tests",
-      "org.apache.spark" %% "spark-sql" % sparkVersion % "test" classifier "tests",
-      "org.bdgenomics.adam" %% "adam-apis-spark2" % "0.28.0" % "test",
-      "org.bdgenomics.bdg-formats" % "bdg-formats" % "0.11.3" % "test"
-    ),
+    publish / skip := false,
+    bintrayRepository := "glow",
+    libraryDependencies ++= dependencies,
     // Fix versions of libraries that are depended on multiple times
     dependencyOverrides ++= Seq(
       "org.apache.hadoop" % "hadoop-client" % "2.7.3",
       "io.netty" % "netty" % "3.9.9.Final",
-      "io.netty" % "netty-all" % "4.1.17.Final"
+      "io.netty" % "netty-all" % "4.1.17.Final",
+      "com.github.samtools" % "htsjdk" % "2.20.1"
     )
   )
@@ -127,33 +157,37 @@ lazy val python =
       "SPARK_HOME" -> (ThisBuild / baseDirectory).value.absolutePath
     ).!
     require(ret == 0, "Python tests failed")
-  }
+  },
+  publish / skip := true
 )
 
-// Uncomment the following for publishing to Sonatype.
-// See https://www.scala-sbt.org/1.x/docs/Using-Sonatype.html for more detail.
+// Publish to Bintray
+ThisBuild / description := "An open-source toolkit for large-scale genomic analysis"
+ThisBuild / homepage := Some(url("https://projectglow.io"))
+ThisBuild / scmInfo := Some(
+  ScmInfo(
+    url("https://github.com/projectglow/glow"),
+    "scm:git@github.com:projectglow/glow.git"
+  )
+)
+ThisBuild / pomIncludeRepository := { _ =>
+  false
+}
+ThisBuild / publishMavenStyle := true
+
+ThisBuild / bintrayOrganization := Some("projectglow")
+ThisBuild / bintrayRepository := "glow"
 
-// ThisBuild / description := "Some descripiton about your project."
-// ThisBuild / licenses := List("Apache 2" -> new URL("http://www.apache.org/licenses/LICENSE-2.0.txt"))
-// ThisBuild / homepage := Some(url("https://github.com/example/project"))
-// ThisBuild / scmInfo := Some(
-//   ScmInfo(
-//     url("https://github.com/your-account/your-project"),
-//     "scm:git@github.com:your-account/your-project.git"
-//   )
-// )
-// ThisBuild / developers := List(
-//   Developer(
-//     id = "Your identifier",
-//     name = "Your Name",
-//     email = "your@email",
-//     url = url("http://your.url")
-//   )
-// )
-// ThisBuild / pomIncludeRepository := { _ => false }
-// ThisBuild / publishTo := {
-//   val nexus = "https://oss.sonatype.org/"
-//   if (isSnapshot.value) Some("snapshots" at nexus + "content/repositories/snapshots")
-//   else Some("releases" at nexus + "service/local/staging/deploy/maven2")
-// }
-// ThisBuild / publishMavenStyle := true
+import ReleaseTransformations._
+
+releaseProcess := Seq[ReleaseStep](
+  checkSnapshotDependencies,
+  inquireVersions,
+  runTest,
+  setReleaseVersion,
+  commitReleaseVersion,
+  tagRelease,
+  publishArtifacts,
+  setNextVersion,
+  commitNextVersion
+)
diff --git a/core/src/main/scala/io/projectglow/bgen/BgenSchemaInferrer.scala b/core/src/main/scala/io/projectglow/bgen/BgenSchemaInferrer.scala
index f07e2f9c4..bbe9930ab 100644
--- a/core/src/main/scala/io/projectglow/bgen/BgenSchemaInferrer.scala
+++ b/core/src/main/scala/io/projectglow/bgen/BgenSchemaInferrer.scala
@@ -17,7 +17,7 @@
 package io.projectglow.bgen
 
 import com.google.common.io.LittleEndianDataInputStream
-import org.apache.hadoop.fs.FileStatus
+import org.apache.hadoop.fs.{FileStatus, Path}
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.types.StructType
 
@@ -45,19 +45,20 @@
     val serializableConf = new SerializableConfiguration(spark.sparkContext.hadoopConfiguration)
     val ignoreExtension = options.get(BgenFileFormat.IGNORE_EXTENSION_KEY).exists(_.toBoolean)
-    val bgenFiles =
+    val bgenPaths =
       files.filter { fs =>
         fs.getLen > 0 && (fs
           .getPath
           .toString
           .endsWith(BgenFileFormat.BGEN_SUFFIX) || ignoreExtension)
-      }
+      }.map(_.getPath.toString)
 
     val hasSampleIds = spark
       .sparkContext
-      .parallelize(bgenFiles)
-      .map { fs =>
-        val hadoopFs = fs.getPath.getFileSystem(serializableConf.value)
-        WithUtils.withCloseable(hadoopFs.open(fs.getPath)) { stream =>
+      .parallelize(bgenPaths)
+      .map { path =>
+        val hPath = new Path(path)
+        val hadoopFs = hPath.getFileSystem(serializableConf.value)
+        WithUtils.withCloseable(hadoopFs.open(hPath)) { stream =>
           val littleEndianDataInputStream = new LittleEndianDataInputStream(stream)
           new BgenHeaderReader(littleEndianDataInputStream)
             .readHeader(None)
diff --git a/core/src/main/scala/io/projectglow/vcf/VCFFileFormat.scala b/core/src/main/scala/io/projectglow/vcf/VCFFileFormat.scala
index 4b26157c6..8325711d0 100644
--- a/core/src/main/scala/io/projectglow/vcf/VCFFileFormat.scala
+++ b/core/src/main/scala/io/projectglow/vcf/VCFFileFormat.scala
@@ -457,13 +457,15 @@ private[vcf] object SchemaDelegate {
       files: Seq[FileStatus]): (Seq[VCFInfoHeaderLine], Seq[VCFFormatHeaderLine]) = {
 
     val serializableConf = new SerializableConfiguration(spark.sessionState.newHadoopConf())
+    val filePaths = files.map(_.getPath.toString)
     spark
       .sparkContext
-      .parallelize(files.map(_.getPath.toString))
+      .parallelize(filePaths)
       .map { path =>
         val (header, _) = VCFFileFormat.createVCFCodec(path, serializableConf.value)
-
-        (header.getInfoHeaderLines.asScala.toSeq, header.getFormatHeaderLines.asScala.toSeq)
+        val infoHeaderLines = header.getInfoHeaderLines.asScala.toSet
+        val formatHeaderLines = header.getFormatHeaderLines.asScala.toSet
+        (infoHeaderLines, formatHeaderLines)
       }
       .collect()
       .foldLeft((Seq.empty[VCFInfoHeaderLine], Seq.empty[VCFFormatHeaderLine])) {
diff --git a/core/src/test/resources/log4j.properties b/core/src/test/resources/log4j.properties
index 207980419..f1c824f9a 100644
--- a/core/src/test/resources/log4j.properties
+++ b/core/src/test/resources/log4j.properties
@@ -9,3 +9,4 @@ log4j.appender.file.File=unit-tests.log
 log4j.logger.akka=ERROR
 log4j.logger.Remoting=ERROR
 log4j.logger.org.eclipse.jetty=ERROR
+log4j.logger.org.apache.hadoop=WARN
diff --git a/core/src/test/scala/io/projectglow/sql/GlowBaseTest.scala b/core/src/test/scala/io/projectglow/sql/GlowBaseTest.scala
index 1eaa5fe56..c5cdefc18 100644
--- a/core/src/test/scala/io/projectglow/sql/GlowBaseTest.scala
+++ b/core/src/test/scala/io/projectglow/sql/GlowBaseTest.scala
@@ -37,7 +37,6 @@ abstract class GlowBaseTest
   override protected def sparkConf: SparkConf = {
     super
       .sparkConf
-      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .set("spark.driver.maxResultSize", "0")
       .set("spark.kryo.registrator", "org.broadinstitute.hellbender.engine.spark.GATKRegistrator")
       .set("spark.kryoserializer.buffer.max", "2047m")
diff --git a/project/plugins.sbt b/project/plugins.sbt
index cf9434064..d2eba7975 100644
--- a/project/plugins.sbt
+++ b/project/plugins.sbt
@@ -1,5 +1,7 @@
 addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.10")
+addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.11")
 addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.9.2")
+addSbtPlugin("org.foundweekends" % "sbt-bintray" % "0.5.4")
 addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.0.0")
 addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0")
 addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.1")
diff --git a/python/README.rst b/python/README.rst
new file mode 100644
index 000000000..963ca0624
--- /dev/null
+++ b/python/README.rst
@@ -0,0 +1,16 @@
+=======================================================
+An open-source toolkit for large-scale genomic analysis
+=======================================================
+
+|circle-ci|
+
+.. |circle-ci| image:: https://circleci.com/gh/projectglow/glow.svg?style=svg&circle-token=7511f70b2c810a18e88b5c537b0410e82db8617d
+    :target: https://circleci.com/gh/projectglow/glow
+
+Glow is an open-source toolkit for working with genomic data at biobank-scale and beyond. The toolkit is natively built
+on Apache Spark, the leading unified engine for big data processing and machine learning, enabling the scale of the
+cloud for genomics workflows.
+
+`Read the docs`_ to start using Glow.
+
+.. _Read the docs: https://glow.readthedocs.io
diff --git a/python/environment.yml b/python/environment.yml
index 4a59d50c2..669afc851 100644
--- a/python/environment.yml
+++ b/python/environment.yml
@@ -5,4 +5,6 @@ dependencies:
   - pip
   - pip:
     - pyspark==2.4.2
+    - setuptools==41.2.0 # Python packaging
     - typeguard==2.5.0
+    - twine==2.0.0 # Pypi publishing
diff --git a/python/setup.py b/python/setup.py
new file mode 100644
index 000000000..e93134582
--- /dev/null
+++ b/python/setup.py
@@ -0,0 +1,20 @@
+from setuptools import setup
+
+setup(
+    name='glow.py',
+    version='0.1.0',
+    packages=['glow'],
+    install_requires=[
+        'typeguard==2.5.0',
+    ],
+    author='The Glow Authors',
+    description='An open-source toolkit for large-scale genomic analysis',
+    long_description=open('README.rst').read(),
+    long_description_content_type='text/x-rst',
+    license='Apache License 2.0',
+    classifiers=[
+        'Intended Audience :: Developers',
+        'Programming Language :: Python :: 3.7',
+    ],
+    url='https://projectglow.io'
+)
diff --git a/version.sbt b/version.sbt
new file mode 100644
index 000000000..e76544405
--- /dev/null
+++ b/version.sbt
@@ -0,0 +1 @@
+version in ThisBuild := "0.1.0"