diff --git a/.gitignore b/.gitignore
index 46d77cff2..83766268e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,12 +1,30 @@
+# Java targets
 target
 core/target
 project/target
-.idea
-*.swp
-*.swo
-adam.log
+
+# Distribution / packaging
+build/
+dist/
+maven-repo/
+*.egg-info/
+
+# Mac
+.DS_Store
+
+# Byte-compiled / optimized / DLL files
 **/__pycache__
 *.pyc
-.DS_Store
+
+# Sphinx documentation
 docs/build
+
+# Editor files
+*.swp
+*.swo
+*.iml
+.idea
+
+# Logs
+adam.log
 unit-tests.log
diff --git a/README.md b/README.md
deleted file mode 100644
index 759a24ac2..000000000
--- a/README.md
+++ /dev/null
@@ -1,36 +0,0 @@
-[![CircleCI](https://circleci.com/gh/projectglow/glow.svg?style=svg&circle-token=7511f70b2c810a18e88b5c537b0410e82db8617d)](https://circleci.com/gh/projectglow/glow)
-
-# Building and Testing
-This project is built using sbt: https://www.scala-sbt.org/1.0/docs/Setup.html
-
-Start an sbt shell using the `sbt` command.
-
-To compile the main code:
-```
-compile
-```
-
-To run all tests:
-```
-test
-```
-
-To test a specific suite:
-```
-testOnly *VCFDataSourceSuite
-```
-
-If you use IntelliJ, you'll want to set up [scalafmt on save](https://scalameta.org/scalafmt/docs/installation.html).
-
-To test or testOnly in remote debug mode with IntelliJ IDEA set the remote debug configuration in IntelliJ to 'Attach to remote JVM' mode and a specific port number (here the default port number 5005 is used) and then modify the definition of options in groupByHash function in build.sbt to
-```
-val options = ForkOptions().withRunJVMOptions(Vector("-Xmx1024m")).withRunJVMOptions(Vector("-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5005"))
-```
-
-To run Python tests, you must install and activate the conda environment in
-`python/environment.yml`. You can then run tests from sbt:
-```
-python/test
-```
-
-These tests will run with the same Spark classpath as the Scala tests.
diff --git a/README.rst b/README.rst
new file mode 100644
index 000000000..efc307a2a
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,10 @@
+==================================
+Glow: Genomics on Apache Spark SQL
+==================================
+
+|circle-ci|
+
+.. |circle-ci| image:: https://circleci.com/gh/projectglow/glow.svg?style=svg&circle-token=7511f70b2c810a18e88b5c537b0410e82db8617d
+   :target: https://circleci.com/gh/projectglow/glow
+
+Glow is a library to ETL genomics data using Apache Spark SQL.
diff --git a/build.sbt b/build.sbt
index d3fa9b076..c5e28ab95 100644
--- a/build.sbt
+++ b/build.sbt
@@ -6,10 +6,11 @@ val sparkVersion = "2.4.3"
 val scalaMajorMinor = "2.11"
 
 ThisBuild / scalaVersion := s"$scalaMajorMinor.12"
-ThisBuild / version := "0.1.0-SNAPSHOT"
+ThisBuild / version := "0.1.0"
 ThisBuild / organization := "org.projectglow"
 ThisBuild / organizationName := "DB / RGC"
 ThisBuild / scalastyleConfig := baseDirectory.value / "scalastyle-config.xml"
+ThisBuild / publish / skip := true
 
 // Compile Java sources before Scala sources, so Scala code can depend on Java
 // but not vice versa
@@ -30,7 +31,7 @@ def groupByHash(tests: Seq[TestDefinition]) = {
     .map {
       case (i, tests) =>
         val options = ForkOptions()
-          .withRunJVMOptions(Vector("-Xmx1024m"))
+          .withRunJVMOptions(Vector("-Dspark.ui.enabled=false", "-Xmx1024m"))
         new Group(i.toString, tests, SubProcess(options))
     }
     .toSeq
@@ -58,46 +59,76 @@ lazy val commonSettings = Seq(
   scalacOptions += "-target:jvm-1.8"
 )
 
+lazy val dependencies = Seq(
+  "org.apache.spark" %% "spark-catalyst" % sparkVersion % "provided",
+  "org.apache.spark" %% "spark-core" % sparkVersion % "provided",
+  "org.apache.spark" %% "spark-mllib" % sparkVersion % "provided",
+  "org.apache.spark" %% "spark-sql" % sparkVersion % "provided",
+  "org.seqdoop" % "hadoop-bam" % "7.9.1",
+  "log4j" % "log4j" % "1.2.17",
+  "org.slf4j" % "slf4j-api" % "1.7.16",
+  "org.slf4j" % "slf4j-log4j12" % "1.7.16",
+  "org.jdbi" % "jdbi" % "2.63.1",
+  "com.typesafe.scala-logging" %% "scala-logging-slf4j" % "2.1.2",
+  // Exclude extraneous GATK dependencies
+  ("org.broadinstitute" % "gatk" % "4.0.11.0")
+    .exclude("biz.k11i", "xgboost-predictor")
+    .exclude("com.esotericsoftware", "kryo")
+    .exclude("com.esotericsoftware", "reflectasm")
+    .exclude("com.github.jsr203hadoop", "jsr203hadoop")
+    .exclude("com.google.cloud", "google-cloud-nio")
+    .exclude("com.google.cloud.bigdataoss", "gcs-connector")
+    .exclude("com.intel", "genomicsdb")
+    .exclude("com.intel.gkl", "gkl")
+    .exclude("com.opencsv", "opencsv")
+    .exclude("commons-io", "commons-io")
+    .exclude("gov.nist.math.jama", "gov.nist.math.jama")
+    .exclude("it.unimi.dsi", "fastutil")
+    .exclude("org.aeonbits.owner", "owner")
+    .exclude("org.apache.commons", "commons-lang3")
+    .exclude("org.apache.commons", "commons-math3")
+    .exclude("org.apache.commons", "commons-collections4")
+    .exclude("org.apache.commons", "commons-vfs2")
+    .exclude("org.apache.hadoop", "hadoop-client")
+    .exclude("org.apache.spark", s"spark-mllib_$scalaMajorMinor")
+    .exclude("org.bdgenomics.adam", s"adam-core-spark2_$scalaMajorMinor")
+    .exclude("org.broadinstitute", "barclay")
+    .exclude("org.broadinstitute", "hdf5-java-bindings")
+    .exclude("org.broadinstitute", "gatk-native-bindings")
+    .exclude("org.broadinstitute", "gatk-bwamem-jni")
+    .exclude("org.broadinstitute", "gatk-fermilite-jni")
+    .exclude("org.jgrapht", "jgrapht-core")
+    .exclude("org.objenesis", "objenesis")
+    .exclude("org.ojalgo", "ojalgo")
+    .exclude("org.ojalgo", "ojalgo-commons-math3")
+    .exclude("org.reflections", "reflections")
+    .exclude("org.seqdoop", "hadoop-bam")
+    .exclude("org.xerial", "sqlite-jdbc"),
+  // Test dependencies
+  "org.scalatest" %% "scalatest" % "3.0.3" % "test",
+  "org.scalacheck" %% "scalacheck" % "1.12.5" % "test",
+  "org.mockito" % "mockito-all" % "1.9.5" % "test",
+  "org.apache.spark" %% "spark-core" % sparkVersion % "test" classifier "tests",
+  "org.apache.spark" %% "spark-catalyst" % sparkVersion % "test" classifier "tests",
+  "org.apache.spark" %% "spark-sql" % sparkVersion % "test" classifier "tests",
sparkVersion % "test" classifier "tests", + "org.bdgenomics.adam" %% "adam-apis-spark2" % "0.28.0" % "test", + "org.bdgenomics.bdg-formats" % "bdg-formats" % "0.11.3" % "test", + "org.xerial" % "sqlite-jdbc" % "3.20.1" % "test" +).map(_.exclude("com.google.code.findbugs", "jsr305")) + lazy val core = (project in file("core")) .settings( commonSettings, name := "glow", - libraryDependencies ++= Seq( - "org.apache.spark" %% "spark-catalyst" % sparkVersion % "provided", - "org.apache.spark" %% "spark-core" % sparkVersion % "provided", - "org.apache.spark" %% "spark-mllib" % sparkVersion % "provided", - "org.apache.spark" %% "spark-sql" % sparkVersion % "provided", - "com.github.samtools" % "htsjdk" % "2.20.0", - "com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.9.9", - "org.seqdoop" % "hadoop-bam" % "7.9.1", - "log4j" % "log4j" % "1.2.17", - "org.slf4j" % "slf4j-api" % "1.7.16", - "org.slf4j" % "slf4j-log4j12" % "1.7.16", - "org.jdbi" % "jdbi" % "2.63.1", - "com.typesafe.scala-logging" %% "scala-logging-slf4j" % "2.1.2", - // Exclude extraneous GATK dependencies - ("org.broadinstitute" % "gatk" % "4.0.11.0") - .exclude("biz.k11i", "xgboost-predictor") - .exclude("com.google.cloud.bigdataoss", "gcs-connector") - .exclude("com.intel", "genomicsdb") - .exclude("org.apache.spark", s"spark-mllib_$scalaMajorMinor") - .exclude("org.bdgenomics.adam", s"adam-core-spark2_$scalaMajorMinor") - .exclude("com.google.cloud", "google-cloud-nio"), - // Test dependencies - "org.scalatest" %% "scalatest" % "3.0.3" % "test", - "org.scalacheck" %% "scalacheck" % "1.12.5" % "test", - "org.mockito" % "mockito-all" % "1.9.5" % "test", - "org.apache.spark" %% "spark-core" % sparkVersion % "test" classifier "tests", - "org.apache.spark" %% "spark-catalyst" % sparkVersion % "test" classifier "tests", - "org.apache.spark" %% "spark-sql" % sparkVersion % "test" classifier "tests", - "org.bdgenomics.adam" %% "adam-apis-spark2" % "0.28.0" % "test", - "org.bdgenomics.bdg-formats" % "bdg-formats" % "0.11.3" % "test" - ), + publish / skip := false, + bintrayRepository := "glow", + libraryDependencies ++= dependencies, // Fix versions of libraries that are depended on multiple times dependencyOverrides ++= Seq( "org.apache.hadoop" % "hadoop-client" % "2.7.3", "io.netty" % "netty" % "3.9.9.Final", - "io.netty" % "netty-all" % "4.1.17.Final" + "io.netty" % "netty-all" % "4.1.17.Final", + "com.github.samtools" % "htsjdk" % "2.20.1" ) ) @@ -122,33 +153,39 @@ lazy val python = "SPARK_HOME" -> (ThisBuild / baseDirectory).value.absolutePath ).! require(ret == 0, "Python tests failed") - } + }, + publish / skip := true ) -// Uncomment the following for publishing to Sonatype. -// See https://www.scala-sbt.org/1.x/docs/Using-Sonatype.html for more detail. +// Publish to Bintray +ThisBuild / description := "Glow: Genomics on Apache Spark" +ThisBuild / licenses := List( + "Apache-2.0" -> new URL("http://www.apache.org/licenses/LICENSE-2.0.txt")) +ThisBuild / homepage := Some(url("https://github.com/projectglow/glow")) +ThisBuild / scmInfo := Some( + ScmInfo( + url("https://github.com/projectglow/glow"), + "scm:git@github.com:projectglow/glow.git" + ) +) +ThisBuild / pomIncludeRepository := { _ => + false +} +ThisBuild / publishMavenStyle := true + +ThisBuild / bintrayOrganization := Some("projectglow") +ThisBuild / bintrayRepository := "glow" -// ThisBuild / description := "Some descripiton about your project." 
-// ThisBuild / licenses := List("Apache 2" -> new URL("http://www.apache.org/licenses/LICENSE-2.0.txt"))
-// ThisBuild / homepage := Some(url("https://github.com/example/project"))
-// ThisBuild / scmInfo := Some(
-//   ScmInfo(
-//     url("https://github.com/your-account/your-project"),
-//     "scm:git@github.com:your-account/your-project.git"
-//   )
-// )
-// ThisBuild / developers := List(
-//   Developer(
-//     id = "Your identifier",
-//     name = "Your Name",
-//     email = "your@email",
-//     url = url("http://your.url")
-//   )
-// )
-// ThisBuild / pomIncludeRepository := { _ => false }
-// ThisBuild / publishTo := {
-//   val nexus = "https://oss.sonatype.org/"
-//   if (isSnapshot.value) Some("snapshots" at nexus + "content/repositories/snapshots")
-//   else Some("releases" at nexus + "service/local/staging/deploy/maven2")
-// }
-// ThisBuild / publishMavenStyle := true
+import ReleaseTransformations._
+
+releaseProcess := Seq[ReleaseStep](
+  checkSnapshotDependencies,
+  inquireVersions,
+  runTest,
+  setReleaseVersion,
+  commitReleaseVersion,
+  tagRelease,
+  publishArtifacts,
+  setNextVersion,
+  commitNextVersion
+)
diff --git a/core/src/main/scala/io/projectglow/bgen/BgenSchemaInferrer.scala b/core/src/main/scala/io/projectglow/bgen/BgenSchemaInferrer.scala
index 8c6869ad9..10806a6e3 100644
--- a/core/src/main/scala/io/projectglow/bgen/BgenSchemaInferrer.scala
+++ b/core/src/main/scala/io/projectglow/bgen/BgenSchemaInferrer.scala
@@ -29,19 +29,20 @@ object BgenSchemaInferrer {
     val serializableConf = new SerializableConfiguration(spark.sparkContext.hadoopConfiguration)
     val ignoreExtension = options.get(BgenFileFormat.IGNORE_EXTENSION_KEY).exists(_.toBoolean)
-    val bgenFiles =
+    val bgenPaths =
       files.filter { fs =>
         fs.getLen > 0 && (fs
           .getPath
           .toString
          .endsWith(BgenFileFormat.BGEN_SUFFIX) || ignoreExtension)
-      }
+      }.map(_.getPath.toString)
 
     val hasSampleIds = spark
       .sparkContext
-      .parallelize(bgenFiles)
-      .map { fs =>
-        val hadoopFs = fs.getPath.getFileSystem(serializableConf.value)
-        WithUtils.withCloseable(hadoopFs.open(fs.getPath)) { stream =>
+      .parallelize(bgenPaths)
+      .map { path =>
+        val hPath = new Path(path)
+        val hadoopFs = hPath.getFileSystem(serializableConf.value)
+        WithUtils.withCloseable(hadoopFs.open(hPath)) { stream =>
           val littleEndianDataInputStream = new LittleEndianDataInputStream(stream)
           new BgenHeaderReader(littleEndianDataInputStream)
             .readHeader(None)
diff --git a/core/src/main/scala/io/projectglow/vcf/VCFFileFormat.scala b/core/src/main/scala/io/projectglow/vcf/VCFFileFormat.scala
index 2b2d6bb33..7d0deed1a 100644
--- a/core/src/main/scala/io/projectglow/vcf/VCFFileFormat.scala
+++ b/core/src/main/scala/io/projectglow/vcf/VCFFileFormat.scala
@@ -441,13 +441,15 @@ private[vcf] object SchemaDelegate {
       files: Seq[FileStatus]): (Seq[VCFInfoHeaderLine], Seq[VCFFormatHeaderLine]) = {
     val serializableConf = new SerializableConfiguration(spark.sessionState.newHadoopConf())
+    val filePaths = files.map(_.getPath.toString)
 
     spark
       .sparkContext
-      .parallelize(files.map(_.getPath.toString))
+      .parallelize(filePaths)
       .map { path =>
         val (header, _) = VCFFileFormat.createVCFCodec(path, serializableConf.value)
-
-        (header.getInfoHeaderLines.asScala.toSeq, header.getFormatHeaderLines.asScala.toSeq)
+        val infoHeaderLines = header.getInfoHeaderLines.asScala.toSet
+        val formatHeaderLines = header.getFormatHeaderLines.asScala.toSet
+        (infoHeaderLines, formatHeaderLines)
       }
       .collect()
       .foldLeft((Seq.empty[VCFInfoHeaderLine], Seq.empty[VCFFormatHeaderLine])) {
diff --git a/core/src/test/resources/log4j.properties b/core/src/test/resources/log4j.properties
index ca48a2782..4a8840aef 100644
--- a/core/src/test/resources/log4j.properties
+++ b/core/src/test/resources/log4j.properties
@@ -10,3 +10,4 @@ log4j.logger.akka=ERROR
 log4j.logger.Remoting=ERROR
 log4j.logger.org.apache.spark=INFO
 log4j.logger.org.eclipse.jetty=ERROR
+log4j.logger.org.apache.hadoop=ERROR
diff --git a/core/src/test/scala/io/projectglow/sql/GlowBaseTest.scala b/core/src/test/scala/io/projectglow/sql/GlowBaseTest.scala
index dfb108df5..1ccb41a51 100644
--- a/core/src/test/scala/io/projectglow/sql/GlowBaseTest.scala
+++ b/core/src/test/scala/io/projectglow/sql/GlowBaseTest.scala
@@ -21,7 +21,6 @@ abstract class GlowBaseTest
   override protected def sparkConf: SparkConf = {
     super
       .sparkConf
-      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
       .set("spark.driver.maxResultSize", "0")
       .set("spark.kryo.registrator", "org.broadinstitute.hellbender.engine.spark.GATKRegistrator")
       .set("spark.kryoserializer.buffer.max", "2047m")
diff --git a/project/plugins.sbt b/project/plugins.sbt
index e97c33c28..f6a0ebe23 100644
--- a/project/plugins.sbt
+++ b/project/plugins.sbt
@@ -1,5 +1,7 @@
 addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.10")
+addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.11")
 addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.9.2")
+addSbtPlugin("org.foundweekends" % "sbt-bintray" % "0.5.4")
 addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.0.0")
 addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0")
 addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.1")
diff --git a/python/environment.yml b/python/environment.yml
index 4a59d50c2..669afc851 100644
--- a/python/environment.yml
+++ b/python/environment.yml
@@ -5,4 +5,6 @@ dependencies:
   - pip
   - pip:
     - pyspark==2.4.2
+    - setuptools==41.2.0 # Python packaging
     - typeguard==2.5.0
+    - twine==2.0.0 # Pypi publishing
diff --git a/python/setup.py b/python/setup.py
new file mode 100644
index 000000000..d745c5b5e
--- /dev/null
+++ b/python/setup.py
@@ -0,0 +1,20 @@
+from setuptools import setup
+
+setup(
+    name='glowpy',
+    version='1.0.3',
+    packages=['db_genomics'],
+    install_requires=[
+        'typeguard==2.5.0',
+    ],
+    author='Glow Project',
+    description='Glow: Genomics on Apache Spark',
+    long_description=open('../README.rst').read(),
+    long_description_content_type='text/x-rst',
+    license='Apache License 2.0',
+    classifiers=[
+        'Intended Audience :: Developers',
+        'Programming Language :: Python :: 3.7',
+    ],
+    url='https://github.com/projectglow/glow'
+)
diff --git a/version.sbt b/version.sbt
new file mode 100644
index 000000000..1be9a631a
--- /dev/null
+++ b/version.sbt
@@ -0,0 +1 @@
+version in ThisBuild := "0.1.1-SNAPSHOT"